In [1]:
import pandas as pd
import numpy as np
from numpy import argmax
from numpy import sqrt
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.utils import resample
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from matplotlib import pyplot
from statistics import median
import pickle
import csv
import warnings
import datetime
warnings.filterwarnings("ignore")

In [2]:
# Names of the projects to evaluate; each name is used to build the CSV paths
# read by the loader functions in this notebook.
project_list = [
    'geoserver',
    'gradle',
    'cloud_controller_ng',
    'opal',
    'jruby',
    'cloudify',
    'chef',
    'orbeon-forms',
    'vagrant',
]

In [3]:
def output_values(Y_data):
    """Encode build outcomes as integers.

    Args:
        Y_data: iterable of raw status values.

    Returns:
        A list with 1 for every entry equal to 'passed' and 0 for anything else.
    """
    return [1 if status == 'passed' else 0 for status in Y_data]

In [4]:
def get_pass_streak(y_project):
    """Compute, for each build, the running count of consecutive passes before it.

    Args:
        y_project: sequence of outcomes where 1 means the build passed.

    Returns:
        A list of the same length as ``y_project``. Entry ``i`` (for i >= 1)
        is the running streak accumulated over builds ``0 .. i-1``: it starts
        at ``y_project[0]``, grows by one for each outcome equal to 1 and
        resets to 0 on any other outcome. Entry 0 is ``y_project[0]`` itself.
        An empty input yields an empty list.
    """
    # Nothing to index into: return early instead of raising IndexError.
    if len(y_project) == 0:
        return []

    # NOTE(review): the first entry is seeded with the first build's own
    # outcome rather than 0 -- confirm this is intended.
    streak = y_project[0]
    pass_streak = [streak]
    for i in range(1, len(y_project)):
        # Record the streak as it stood *before* build i, then update it.
        pass_streak.append(streak)
        streak = streak + 1 if y_project[i] == 1 else 0
    return pass_streak

In [5]:
def get_first_failures(df):
    """Drop every failing build that directly follows another failing build.

    A row is discarded when its ``tr_status`` is 0 and the row immediately
    before it (in positional order) also has ``tr_status`` 0. The first row
    is always kept.

    Args:
        df: DataFrame with a ``tr_status`` column.

    Returns:
        A new DataFrame containing only the kept rows. The input frame is
        not modified (no helper column is added to it).
    """
    is_failure = df['tr_status'] == 0
    # Shift by one position so each row sees its predecessor's outcome; the
    # first row has no predecessor and is treated as "previous did not fail".
    previous_failed = is_failure.shift(1, fill_value=False)
    keep = ~(is_failure & previous_failed)
    # Index by position (.values) so a non-unique index cannot break the mask.
    return df[keep.values].copy()

In [6]:
def get_complete_data(p_name):
    """Load a project's metrics and attach each build's start time and outcome.

    Reads ``metrics_data/<p_name>_metrics.csv`` and ``../data/<p_name>.csv``,
    keeps metric rows with ``developer_experience >= 0``, drops the
    ``num_commits``, ``reviewer_experience`` and ``num_of_reviewers`` columns,
    and adds two columns taken from the results file:
    ``gh_build_started_at`` (parsed as datetime) and ``tr_status``
    (encoded by ``output_values``).

    Args:
        p_name: project name used to build both file paths.

    Returns:
        The cleaned metrics DataFrame with the two extra columns.

    Raises:
        ValueError: if a build id in the metrics file has no row in the
            results file.
    """
    # open the metrics file
    filename = 'metrics_data/' + p_name + '_metrics.csv'
    project = pd.read_csv(filename)

    # clean the data & remove correlated columns (non-inplace so we never
    # mutate a filtered slice of the frame returned by read_csv)
    project = project[project['developer_experience'] >= 0]
    project = project.drop(columns=['num_commits', 'reviewer_experience', 'num_of_reviewers'])

    build_ids = project['tr_build_id'].tolist()

    # get results data
    res_file = '../data/' + p_name + '.csv'
    res_project = pd.read_csv(res_file, usecols = ['tr_build_id', 'gh_build_started_at', 'tr_status'])
    res_project['gh_build_started_at'] = pd.to_datetime(res_project['gh_build_started_at'], format='%Y-%m-%d %H:%M:%S')
    matched = res_project[res_project['tr_build_id'].isin(build_ids)]

    # The values below are attached to `project` by position. That is only
    # correct when the results rows come in exactly the same build order as
    # the metrics rows; otherwise look each build up by id.
    if matched['tr_build_id'].tolist() != build_ids:
        lookup = matched.drop_duplicates(subset='tr_build_id').set_index('tr_build_id')
        missing = [b for b in build_ids if b not in lookup.index]
        if missing:
            raise ValueError(
                'No result rows in {} for build ids: {}'.format(res_file, missing[:10])
            )
        matched = lookup.loc[build_ids]

    # append date of build
    project['gh_build_started_at'] = matched['gh_build_started_at'].tolist()

    # add results column to the dataframe
    project['tr_status'] = output_values(matched['tr_status'].tolist())

    return project

In [7]:
def get_start_end_date(project):
    """Return the evaluation bounds derived from the first and last build times.

    Args:
        project: DataFrame with a ``gh_build_started_at`` column.

    Returns:
        A ``(start_date, end_date)`` tuple: the first row's timestamp minus
        one day and the last row's timestamp minus one day.
    """
    one_day = datetime.timedelta(days = 1)
    build_times = project['gh_build_started_at'].tolist()
    first_build, last_build = build_times[0], build_times[-1]
    return first_build - one_day, last_build - one_day

In [8]:
def get_required_data(p_name, build_ids):
    """Return the ``tr_duration`` of the requested builds.

    Reads ``../data/<p_name>.csv`` and looks up the duration of every build
    in ``build_ids``.

    Args:
        p_name: project name used to build the results file path.
        build_ids: sequence of ``tr_build_id`` values.

    Returns:
        A list of durations ordered like ``build_ids``. Build ids that do
        not appear in the results file are skipped.
    """
    res_file = '../data/' + p_name + '.csv'
    res_project = pd.read_csv(res_file, usecols = ['tr_build_id', 'tr_duration'])
    matched = res_project[res_project['tr_build_id'].isin(build_ids)]

    # Fast path: the file already lists the builds in the requested order.
    if matched['tr_build_id'].tolist() == list(build_ids):
        return matched['tr_duration'].tolist()

    # Callers index the result positionally against build_ids, so re-order
    # the durations to follow build_ids rather than the file's row order.
    lookup = matched.drop_duplicates(subset='tr_build_id').set_index('tr_build_id')['tr_duration']
    present = [b for b in build_ids if b in lookup.index]
    return lookup.loc[present].tolist()

In [9]:
def _percent(part, whole):
    """Return ``100 * part / whole``, or NaN when ``whole`` is zero."""
    return 100 * part / whole if whole else float('nan')


def _batch_cost(batch_duration, batch_results):
    """Return ``(builds, duration)`` charged for running one batch.

    One combined build costing the longest member's duration is always
    charged; if any member actually failed (0), every member is charged
    again individually.
    """
    builds = 1
    duration = max(batch_duration)
    if 0 in batch_results:
        builds += len(batch_duration)
        duration += sum(batch_duration)
    return builds, duration


def compute_performance(p_name, test_builds, test_result, pred_result, max_batch_size=4):
    """Print a report for a batching strategy driven by predicted outcomes.

    Builds predicted to fail (0) are grouped into batches of up to
    ``max_batch_size`` and charged via ``_batch_cost``; builds predicted to
    pass (1) are counted as saved. The report also includes the number of
    actual failures that were predicted as failures and a delay measure
    (number of predicted-pass builds seen after a missed failure until the
    next predicted-fail build).

    Args:
        p_name: project name, forwarded to ``get_required_data``.
        test_builds: build ids of the test window.
        test_result: list of actual outcomes (1 = passed, 0 = failed).
        pred_result: predicted outcomes, same order as ``test_result``.
        max_batch_size: maximum number of builds per batch (default 4).

    Returns:
        None. The report is written with ``print``.
    """
    durations = get_required_data(p_name, test_builds)
    actual_duration = sum(durations)
    actual_failures = test_result.count(0)

    total_builds = len(test_builds)
    num_of_builds = 0
    total_duration = 0
    cbf = 0
    saved_builds = 0

    batch_duration = []
    batch_results = []

    for i in range(len(pred_result)):
        if pred_result[i] != 0:
            saved_builds += 1
            continue

        if test_result[i] == 0:
            cbf += 1

        batch_duration.append(durations[i])
        batch_results.append(test_result[i])

        if len(batch_duration) == max_batch_size:
            builds, duration = _batch_cost(batch_duration, batch_results)
            num_of_builds += builds
            total_duration += duration
            # Start a fresh batch; without this reset the same full batch
            # would be charged again for every later predicted failure.
            batch_duration = []
            batch_results = []

    # Charge the trailing, partially filled batch (if any).
    if batch_duration:
        builds, duration = _batch_cost(batch_duration, batch_results)
        num_of_builds += builds
        total_duration += duration

    #Delay computation
    flag = 0
    count = 0
    delay = []
    for i in range(len(pred_result)):
        if flag == 1:
            if pred_result[i] == 1:
                count += 1

            if pred_result[i] == 0:
                delay.append(count)
                count = 0
                flag = 0

        # An actual failure that was predicted to pass starts a delay run.
        if test_result[i] != 1:
            if pred_result[i] == 1:
                flag = 1
    delay.append(count)

    print("===========================================")
    print('The performance of the model is as follows:')
    # NOTE(review): total_duration is the duration charged for the builds that
    # were run, yet it is labelled "Time saved" -- confirm the intended metric.
    print('\t Time saved : {}'.format(total_duration))
    print('\t % Time saved : {}%'.format(_percent(total_duration, actual_duration)))
    print('\t Num. Builds saved : {}'.format(saved_builds))
    print('\t % Builds saved : {}%'.format(_percent(saved_builds, total_builds)))
    print('\t Num. Builds required : {}'.format(num_of_builds))
    print('\t % Builds required : {}%'.format(_percent(num_of_builds, total_builds)))
    print('\t Num. Failed Builds Identified : {}'.format(cbf))
    print('\t % Failed Builds Identified : {}%'.format(_percent(cbf, actual_failures)))
    print('\t Median Delay Induced : {} builds'.format(median(delay)))
    print('\t Total Delay Induced: {} builds'.format(sum(delay)))
    print('\t Total number of builds: {}'.format(total_builds))
    print('\t Total number of failed builds: {}'.format(actual_failures))
    print('\t Total Duration: {}'.format(actual_duration))
    print("===========================================")
    

In [None]:
# Rolling evaluation: for every project, slide a (train, test) time window over
# the build history, tune a random forest on bootstrap samples of the training
# window, then replay the test window one build at a time.
for p_name in project_list:

    print('Processing {}'.format(p_name))

    project = get_complete_data(p_name)
    start_date, end_date = get_start_end_date(project)
    # Latest build timestamp: once a window reaches past it, widening the
    # window cannot bring in any more builds.
    last_build_date = project['gh_build_started_at'].max()

    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
    max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]

    param_grid = {'n_estimators': n_estimators, 'max_depth': max_depth}
    forest = RandomForestClassifier()
    grid_search = GridSearchCV(estimator = forest, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 0)

    phase = 1

    while start_date < end_date:

        train_period = 30
        test_period = 10
        enough_data = True

        # Grow the windows until they hold more than 100 training builds and
        # more than 10 test builds.
        while True:
            # NOTE(review): train_end is one day past test_start, so the two
            # windows overlap by a day -- confirm this is intended.
            train_end = start_date + datetime.timedelta(days = train_period + 1)
            test_start = start_date + datetime.timedelta(days = train_period)
            test_end = test_start + datetime.timedelta(days = test_period)

            #getting data of train & test phase wise
            train_data = project[ (project['gh_build_started_at'] > start_date) & (project['gh_build_started_at'] < train_end)]
            test_data = project[ (project['gh_build_started_at'] > test_start) & (project['gh_build_started_at'] < test_end)]

            #getting 'y' data
            train_result = train_data['tr_status'].tolist()
            test_result = test_data['tr_status'].tolist()

            train_short = len(train_result) <= 100
            test_short = len(test_result) <= 10

            if not train_short and not test_short:
                break

            # A short window that already extends past the last build can
            # never fill up; stop instead of widening it forever.
            if (train_short and train_end > last_build_date) or (test_short and test_end > last_build_date):
                enough_data = False
                break

            if train_short:
                train_period += 20

            if test_short:
                test_period += 20

        if not enough_data:
            print('Not enough remaining builds for {} after {} phase(s); stopping.'.format(p_name, phase - 1))
            break

        #dropping build start time column (non-inplace: these frames are
        #filtered views of `project`)
        train_data = train_data.drop(columns='gh_build_started_at')
        test_data = test_data.drop(columns='gh_build_started_at')

        #add pass_streak to training data:
        train_data['num_of_passes'] = get_pass_streak(train_result)

        best_n_estimators = []
        best_max_depth = []

        # Start below any possible F1 so the first bootstrap sample is always
        # retained, even when every sample scores F1 == 0.
        best_f1 = -1
        best_f1_sample = None
        best_f1_sample_result = None
        best_thresholds = []

        #bootstrap 50 times
        for i in range(50):

            sample_train = resample(train_data, replace=True, n_samples=len(train_data))
            sample_train_result = sample_train['tr_status']

            # Out-of-bag rows: training builds not drawn into this sample.
            build_ids = sample_train['tr_build_id'].tolist()
            sample_test = train_data[~train_data['tr_build_id'].isin(build_ids)]
            sample_test_result = sample_test['tr_status']

            #dropping result column and build ids column
            sample_train = sample_train.drop(columns=['tr_status', 'tr_build_id'])
            sample_test = sample_test.drop(columns=['tr_status', 'tr_build_id'])

            #training
            grid_search.fit(sample_train, sample_train_result)
            sample_pred_vals = grid_search.predict_proba(sample_test)

            # Pick the ROC threshold that maximises the geometric mean of
            # true-positive rate and (1 - false-positive rate).
            pred_vals = sample_pred_vals[:, 1]
            fpr, tpr, t = roc_curve(sample_test_result, pred_vals)
            gmeans = sqrt(tpr * (1-fpr))
            ix = argmax(gmeans)
            bt = t[ix]
            best_thresholds.append(bt)

            #threshold setting
            # NOTE(review): roc_curve is documented as treating
            # score >= threshold as positive; this uses a strict > -- confirm.
            final_pred_result = [1 if p > bt else 0 for p in pred_vals]

            accuracy = accuracy_score(sample_test_result, final_pred_result)
            precision = precision_score(sample_test_result, final_pred_result)
            recall = recall_score(sample_test_result, final_pred_result)
            confusion = confusion_matrix(sample_test_result, final_pred_result)
            auc_score = roc_auc_score(sample_test_result, final_pred_result)
            f1 = f1_score(sample_test_result, final_pred_result)

            if f1 > best_f1:
                best_f1 = f1
                best_f1_sample = sample_train
                best_f1_sample_result = sample_train_result

            print(precision, recall, accuracy, f1, auc_score)
            best_n_estimators.append(grid_search.best_params_['n_estimators'])
            best_max_depth.append(grid_search.best_params_['max_depth'])

        #completed with bootstrapping
        threshold = median(best_thresholds)
        median_n_estimators = median(best_n_estimators)
        median_max_depth = median(best_max_depth)
        #retrain on the best
        forest = RandomForestClassifier(n_estimators=int(median_n_estimators), max_depth=int(median_max_depth))
        forest.fit(best_f1_sample, best_f1_sample_result)

        test_builds = test_data['tr_build_id'].tolist()
        test_data = test_data.drop(columns=['tr_build_id', 'tr_status'])

        # Replay the test window one build at a time. `queue` is the number of
        # consecutive predicted passes so far and is fed to the model as the
        # 'num_of_passes' feature.
        final_pred_result = []
        queue = 0
        for i in range(len(test_data)):
            data = test_data.iloc[i].copy()
            data['num_of_passes'] = queue
            predict = forest.predict_proba([data])
            if predict[0][1] > threshold:
                final_pred_result.append(1)
                queue += 1
            else:
                final_pred_result.append(0)
                queue = 0

        print('Individual testing for {}....'.format(p_name))

        # Score the replayed test window itself, so the numbers printed below
        # are not left over from the last bootstrap iteration.
        accuracy = accuracy_score(test_result, final_pred_result)
        precision = precision_score(test_result, final_pred_result)
        recall = recall_score(test_result, final_pred_result)
        confusion = confusion_matrix(test_result, final_pred_result)
        f1 = f1_score(test_result, final_pred_result)
        # roc_auc_score rejects a single-class y_true; report NaN in that case.
        if len(set(test_result)) > 1:
            auc_score = roc_auc_score(test_result, final_pred_result)
        else:
            auc_score = float('nan')

        print(precision, recall, accuracy, f1, auc_score)
        print(confusion)

        compute_performance(p_name, test_builds, test_result, final_pred_result)

        # The next phase starts where this test window ended.
        start_date = test_end
        phase += 1

    print('\n\n\n\n\n')

Processing geoserver
0.6 0.84 0.660377358490566 0.7000000000000001 0.6699999999999999
0.625 0.6 0.6274509803921569 0.6122448979591836 0.6269230769230769
0.6956521739130435 0.5714285714285714 0.6545454545454545 0.6274509803921569 0.656084656084656
0.6428571428571429 0.72 0.6666666666666666 0.6792452830188679 0.6676923076923078
0.7407407407407407 0.7692307692307693 0.7346938775510204 0.7547169811320754 0.7324414715719063
0.75 0.5 0.673469387755102 0.6 0.67
0.6111111111111112 0.5238095238095238 0.6222222222222222 0.5641025641025642 0.6160714285714284
0.7058823529411765 0.46153846153846156 0.5681818181818182 0.558139534883721 0.5918803418803419
0.76 0.7037037037037037 0.7254901960784313 0.7307692307692308 0.7268518518518519
0.7692307692307693 0.7407407407407407 0.7547169811320755 0.7547169811320754 0.7549857549857549
0.6666666666666666 0.4166666666666667 0.5957446808510638 0.5128205128205129 0.5996376811594203
0.7575757575757576 0.8620689655172413 0.7692307692307693 0.8064516129032258 0.75