In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils import resample
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from matplotlib import pyplot
import pickle
import csv

In [2]:
def output_values(X_t, Y_data):
    Y_t = []
    for e in X_t['tr_build_id']:
        y_index = list(Y_data['tr_build_id']).index(e)
        if Y_data['tr_status'][y_index] == 'passed':
            Y_t.append(1)
        else:
            Y_t.append(0) 
    return Y_t

In [17]:
def get_pass_streak(y_project):
    p = y_project[0]
    pass_streak = [y_project[0]]
    for i in range(1, len(y_project)):
        if y_project[i] == 1:
            p += 1
        else:
            p = 0
        pass_streak.append(p)
    return num_passes

In [18]:
cloud_controller = pd.read_csv('metrics_data/cloud_controller_ng_metrics.csv')
res_cloud_controller = pd.read_csv('../data/cloud_controller_ng.csv')
y_cloud_controller = output_values(cloud_controller, res_cloud_controller)

In [19]:
geoserver = pd.read_csv('metrics_data/geoserver_metrics.csv')
res_geoserver = pd.read_csv('../data/geoserver.csv')
y_geoserver = output_values(geoserver, res_geoserver)

In [20]:
gradle = pd.read_csv('metrics_data/gradle_metrics copy 2.csv')
res_gradle = pd.read_csv('../data/gradle.csv')
y_gradle = output_values(gradle, res_gradle)

In [21]:
projects = [gradle, cloud_controller, geoserver]

In [22]:
gradle.drop('num_commits', inplace=True, axis=1)
gradle.drop('reviewer_experience', inplace=True, axis=1)
gradle.drop('num_of_reviewers', inplace=True, axis=1)

cloud_controller.drop('num_commits', inplace=True, axis=1)
cloud_controller.drop('reviewer_experience', inplace=True, axis=1)
cloud_controller.drop('num_of_reviewers', inplace=True, axis=1)

geoserver.drop('num_commits', inplace=True, axis=1)
geoserver.drop('reviewer_experience', inplace=True, axis=1)
geoserver.drop('num_of_reviewers', inplace=True, axis=1)

In [23]:
gradle['num_of_passes'] = get_pass_streak(y_gradle)
cloud_controller['num_of_passes'] = get_pass_streak(y_cloud_controller)
geoserver['num_of_passes'] = get_pass_streak(y_geoserver)

In [24]:
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]

In [25]:
param_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf
             }

In [26]:
forest = RandomForestClassifier()
grid_search = GridSearchCV(estimator = forest, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [36]:
#estimating for geoserver
split_index = int(len(y_geoserver)*0.80)
X_train = np.array(geoserver[:split_index])
X_test = np.array(geoserver[split_index:])
y_train = np.array(y_geoserver[:split_index])
y_test = np.array(y_geoserver[split_index:])

In [37]:
print(X_train)

[[11905123        0        0 ...        1        0        0]
 [11905554        0        0 ...        3        0        0]
 [11909050        0        0 ...       10        0        0]
 ...
 [97840373        0        0 ...        8     1262      767]
 [97840774        8        0 ...       16     1262      768]
 [97841159       23        0 ...       35      231      768]]


In [38]:
grid_search.fit(X_train, y_train)
y_pred_test = grid_search.predict(X_test)
accuracy_score(y_test, y_pred_test)

Fitting 3 folds for each of 540 candidates, totalling 1620 fits


0.7177033492822966

In [64]:
queue = y_test[-10:].tolist()
max_queue_length = 10

y_pred_test = []
for index in range(len(X_test)):
    new_build = X_test[index]
    new_build[-1] = queue.count(1)
    new_build = new_build.reshape((1,9))
    predict_result = grid_search.predict(new_build)
    queue.pop(0)
    queue.append(predict_result[0])
    y_pred_test.append(predict_result[0])

In [65]:
accuracy_score(y_test, y_pred_test)

0.7129186602870813

In [66]:
print(y_test)

[1 1 1 1 0 0 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1
 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1
 1 1 1 1 1 1 0 1 1 0 1 1 1 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1
 1 1 1 1 0 1 1 1 1 1 1 1 0 1 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 0 1 1 0 1 0 1 0 1 1
 1 1 1 0 1 0 0 0 0 1 0]


In [67]:
print(y_pred_test)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [68]:
confusion_matrix(y_test, y_pred_test)

array([[  5, 113],
       [  7, 293]])