In [1]:
import pandas as pd
import numpy as np
from numpy import argmax
from numpy import sqrt
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.utils import resample
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.model_selection import KFold
from matplotlib import pyplot
from statistics import median
import pickle
import csv
import warnings
import datetime
import multiprocess
# Suppress every Python warning for the remainder of the session.
# NOTE(review): this blanket filter also hides deprecation warnings emitted by
# the libraries imported above -- consider narrowing it to specific categories.
warnings.filterwarnings("ignore")

In [2]:
def get_median(data):
    """Return the median of ``data``.

    A sorted copy is used, so the caller's sequence is left untouched.
    For an even number of elements the result is the mean of the two central
    values; for an odd number it is the central value itself.

    Raises:
        IndexError: if ``data`` is empty.
    """
    ordered = sorted(data)
    count = len(ordered)
    middle = count // 2

    if count % 2 == 0:
        # Even length: average the two elements around the centre.
        return (ordered[middle] + ordered[middle - 1]) / 2

    # Odd length: the single central element.
    return ordered[middle]


In [3]:
def get_first_failures(df):
    """Drop every failing build that directly follows another failing build.

    A row is discarded when its ``tr_status`` is 0 and the ``tr_status`` of
    the row immediately before it (in the frame's current order) is also 0.
    The first row is always kept, so only the first build of each run of
    consecutive failures survives.

    Args:
        df: DataFrame with a ``tr_status`` column where 0 marks a failure.

    Returns:
        A new DataFrame holding the kept rows with their original index
        labels. The input frame is not modified (no helper column is added to
        it), and an empty frame is returned unchanged instead of raising.
    """
    failed = (df['tr_status'] == 0).values

    # follows_failure[i] is True when row i fails and row i-1 failed as well.
    follows_failure = np.zeros(len(failed), dtype=bool)
    follows_failure[1:] = failed[1:] & failed[:-1]

    # .copy() hands back an independent frame so later in-place edits by the
    # caller cannot trigger chained-assignment problems.
    return df.loc[~follows_failure].copy()


In [4]:
def output_values(Y_data):
    """Encode build outcomes as integers.

    Each element equal to the string ``'passed'`` becomes 1; every other
    value becomes 0. The result is a new list in the same order as the input.
    """
    return [1 if outcome == 'passed' else 0 for outcome in Y_data]


In [39]:
def get_data(project_path):
    """Load one project's build history and prepare it for evaluation.

    Reads the build id, the four change-size feature columns and the build
    status from the CSV at ``project_path``, converts ``tr_status`` to 1/0
    with ``output_values`` and filters the rows with ``get_first_failures``.
    The number of remaining rows is printed.

    Returns:
        The filtered DataFrame.
    """
    wanted_columns = [
        'tr_build_id',
        'git_num_all_built_commits',
        'git_diff_src_churn',
        'git_diff_test_churn',
        'gh_diff_files_modified',
        'tr_status',
    ]
    builds = pd.read_csv(project_path, usecols=wanted_columns)
    builds['tr_status'] = output_values(builds['tr_status'])
    builds = get_first_failures(builds)
    print('Length of project is: {}'.format(len(builds)))
    return builds


In [44]:
def sbs(p_name):
    """Predict build outcomes for a project with its ten stored models.

    For each split number 1..10 the function loads the pickled train/test
    build ids, selects the matching test rows from the project's data, loads
    that split's pickled model and predicts on the feature columns. Splits
    with no matching test rows are reported with a message and skipped.

    The per-split predictions are stacked (each split keeps its own 0..n-1
    index), written to ``version_results/<project>_sbs_predictions.csv`` and
    returned.

    Args:
        p_name: project CSV file name, e.g. ``'heroku.csv'``.

    Returns:
        DataFrame with columns ``Build_Result`` (prediction) and
        ``Actual_Result`` (true status). If every split was skipped the frame
        has these two columns and no rows.
    """
    project = p_name.split('.')[0]
    result_file = 'version_results/' + project + '_sbs_predictions.csv'
    fileroot = '../../RQ2-Models/' + project + '_models/'

    test_file = get_data('../data/full_data/' + p_name)
    split_frames = []

    for i in range(1, 11):

        # Retrieve the indexes.
        # NOTE(review): pickle.load can execute arbitrary code; only use it on
        # index/model files produced by this project's own pipeline.
        index_path = '../data/project_data_pickles/' + p_name + '_' + str(i) + '_indexes.pkl'
        with open(index_path, 'rb') as save_file:
            # The file holds two consecutive pickles; the training ids come
            # first and have to be read to reach the test ids.
            pickle.load(save_file)
            test_build_ids = pickle.load(save_file)

        # Form the test data for this split.
        split_rows = test_file[test_file['tr_build_id'].isin(test_build_ids)]
        y_test = split_rows['tr_status'].tolist()

        if not y_test:
            print('Not found for {}'.format(i))
            continue

        # Non-mutating drop: avoids editing a slice of test_file in place.
        X_test = split_rows.drop(columns=['tr_status', 'tr_build_id'])

        # Retrieve the model; the context manager closes the file handle.
        model_path = fileroot + 'rq2_' + project + '_' + str(i) + '_best_model.pkl'
        with open(model_path, 'rb') as model_file:
            forest = pickle.load(model_file)

        split_frames.append(
            pd.DataFrame(
                {'Build_Result': forest.predict(X_test), 'Actual_Result': y_test},
                columns=['Build_Result', 'Actual_Result'],
            )
        )

    # One concat at the end replaces the repeated DataFrame.append calls
    # (quadratic, and removed in pandas 2.0).
    if split_frames:
        pframe = pd.concat(split_frames)
    else:
        pframe = pd.DataFrame(columns=['Build_Result', 'Actual_Result'])

    pframe.to_csv(result_file)
    return pframe

In [87]:
def _simulate_sbs(actual_results, pred_results):
    """Replay the build sequence and return one decision per build: 1 = skipped, 0 = built."""
    ci = []
    must_build = False  # True after a built failure, until a pass is observed
    for actual, predicted in zip(actual_results, pred_results):
        if must_build:
            # A failure was already seen: keep building until a real pass.
            ci.append(0)
            must_build = actual != 1
        elif predicted == 1:
            # Prediction state and predicted to pass: skip the build.
            ci.append(1)
        else:
            # Predicted to fail: build it, and keep building if it really failed.
            ci.append(0)
            must_build = (1 - actual) == 1
    return ci


def _failure_delays(ci, actual_results):
    """Return, for every skipped failing build, the distance to the next built build.

    Failures skipped after the last built build are measured against the end
    of the sequence (``len(ci)``).
    """
    delays = []
    pending = []  # indexes of skipped failures not yet followed by a build
    for index, decision in enumerate(ci):
        if decision == 0:
            delays.extend(index - missed for missed in pending)
            pending = []
        elif actual_results[index] == 0:
            pending.append(index)
    delays.extend(len(ci) - missed for missed in pending)
    return delays


def get_results(pframe, project):
    """Evaluate the skip-or-build replay for one project and build result rows.

    The predictions in ``pframe`` are replayed in order: a build predicted to
    pass is skipped; a build predicted to fail is built, and if it actually
    failed every following build is built until one passes. From the replay
    the function derives the percentage of skipped and built builds and the
    delay of each skipped failure, prints these figures, and emits one row
    per (algorithm, batch size) combination taken from the module-level
    ``algorithms`` and ``batch_sizes`` lists. ``'BATCH4'`` is only combined
    with batch size 4 and ``'BATCHSTOP4'`` only with batch sizes >= 4.

    Args:
        pframe: DataFrame with ``Actual_Result`` and ``Build_Result`` columns
            (1 = pass, 0 = fail).
        project: project label copied into every row.

    Returns:
        List of rows ``[project, algorithm, batch_size, saved_builds,
        reqd_builds, number_of_builds, delay_list, median_delay]``. All rows
        share the same ``delay_list`` object. ``median_delay`` is NaN when no
        failing build was skipped. An empty ``pframe`` yields an empty list
        instead of a division by zero.
    """
    if pframe.empty:
        print('No predictions available for {}'.format(project))
        return []

    actual_results = pframe['Actual_Result'].tolist()
    pred_results = pframe['Build_Result'].tolist()

    ci = _simulate_sbs(actual_results, pred_results)
    total_builds = len(ci)
    saved_builds = 100 * ci.count(1) / total_builds
    reqd_builds = 100 * ci.count(0) / total_builds

    delay_list = _failure_delays(ci, actual_results)
    # statistics.median raises on an empty list; the median is undefined then.
    median_delay = median(delay_list) if delay_list else float('nan')

    print('required_builds is {}'.format(reqd_builds))
    print('saved_builds is {}'.format(saved_builds))
    print('length of ci is {}'.format(len(ci)))
    print('delay is {}\n\n'.format(delay_list))

    lines = []
    for alg in algorithms:
        for b in batch_sizes:
            if alg == 'BATCH4' and b != 4:
                continue
            if alg == 'BATCHSTOP4' and b < 4:
                continue
            lines.append([project, alg, b, saved_builds, reqd_builds, len(ci), delay_list, median_delay])

    return lines
    
    
    
    

In [88]:
# Project CSV file names to evaluate, processed in this order.
projects = [
    'heroku.csv',
    'vagrant.csv',
    'opal.csv',
    'cloudify.csv',
    'cloud_controller_ng.csv',
    'rubinius.csv',
    'open-build-service.csv',
    'gradle.csv',
    'sonarqube.csv',
    'loomio.csv',
    'fog.csv',
    'puppet.csv',
    'concerto.csv',
    'sufia.csv',
    'geoserver.csv',
    'orbeon-forms.csv',
    'graylog2-server.csv',
]


In [89]:
# Algorithm labels and batch sizes that get_results() combines into result rows.
# get_results() pairs 'BATCH4' only with batch size 4 and 'BATCHSTOP4' only
# with batch sizes of at least 4; 'BATCHBISECT' is paired with every size.
algorithms = [
    'BATCH4',
    'BATCHSTOP4',
    'BATCHBISECT',
]
batch_sizes = [
    1,
    2,
    4,
    8,
    16,
]

In [90]:
# Run the prediction + replay pipeline for every project and gather the result
# rows. get_results() already emits one row per algorithm / batch-size
# combination, so no further expansion is needed here.
lines = []
for p in projects:
    pframe = sbs(p)
    lines.extend(get_results(pframe, p))
    

Length of project is: 1892
Not found for 4
required_builds is 0.8994708994708994
saved_builds is 99.1005291005291
length of ci is 1890
delay is [102, 99, 96, 74, 72, 66, 58, 37, 28, 19, 17, 165, 162, 160, 155, 153, 125, 114, 93, 89, 70, 58, 39, 30, 16, 14, 11, 261, 259, 247, 243, 222, 212, 197, 174, 167, 135, 128, 125, 123, 116, 112, 103, 86, 76, 71, 53, 51, 30, 22, 12, 139, 136, 133, 127, 125, 108, 102, 93, 91, 77, 73, 69, 67, 53, 49, 47, 41, 37, 32, 11, 295, 293, 284, 279, 265, 246, 223, 215, 202, 195, 154, 143, 131, 98, 90, 88, 73, 66, 64, 79, 56, 34, 32, 17, 11, 3, 4, 66, 64, 59, 46, 84, 13, 3, 3, 113, 16, 220, 192, 166, 55, 42, 19, 5]


Length of project is: 3879
Not found for 1
Not found for 2
required_builds is 0.670621614650503
saved_builds is 99.3293783853495
length of ci is 3877
delay is [53, 40, 275, 233, 204, 46, 65, 30, 305, 101, 86, 40, 218, 207, 136, 131, 105, 47, 5, 69, 57, 45, 135, 83, 72, 55, 27, 171, 144, 133, 58, 52, 47, 36, 25, 19, 828, 822, 817, 811, 799, 790, 781

required_builds is 4.940711462450593
saved_builds is 95.0592885375494
length of ci is 2530
delay is [3, 3, 13, 5, 2, 16, 14, 12, 9, 7, 4, 2, 23, 18, 27, 5, 3, 1, 44, 32, 34, 31, 29, 22, 20, 12, 22, 25, 11, 8, 5, 1, 5, 5, 1, 23, 20, 17, 15, 11, 8, 6, 4, 2, 2, 49, 46, 43, 41, 39, 37, 32, 17, 17, 10, 29, 22, 20, 10, 7, 2, 44, 31, 18, 1, 113, 90, 78, 74, 69, 49, 36, 11, 139, 110, 65, 24, 16, 14, 5, 9, 7, 5, 3, 161, 159, 156, 154, 150, 147, 143, 139, 122, 119, 114, 105, 102, 88, 84, 80, 73, 64, 61, 59, 55, 48, 40, 37, 31, 20, 17, 1, 34, 32, 30, 27, 25, 22, 14, 12, 3, 73, 25, 5, 49, 44, 42, 40, 35, 27, 1, 8, 24, 22, 11, 1, 34, 28, 26, 21, 17, 9, 11, 7, 5, 2, 52, 50, 38, 25, 10, 21, 10, 2, 41, 39, 37, 26, 13, 1, 121, 111, 90, 85, 74, 53, 47, 45, 35, 28, 24, 2, 1, 27, 23, 4, 70, 65, 49, 44, 37, 28, 26, 19, 17, 7, 40, 31, 29, 27, 25, 18, 16, 10, 8, 1, 6, 4, 15, 13, 11, 7, 5, 2, 78, 76, 73, 71, 68, 62, 59, 57, 54, 52, 49, 42, 38, 36, 32, 30, 24, 14, 3, 56, 50, 23, 9, 2, 15, 12, 8, 5, 1, 4, 5, 2,

In [91]:
# Assemble the collected rows into a table; the column order matches the row
# layout built by get_results().
df = pd.DataFrame(
    lines,
    columns=[
        'project',
        'algorithm',
        'batch_size',
        'saved_builds',
        'builds_reqd',
        'testall_size',
        'delay_list',
        'median_delay',
    ],
)


In [92]:
# Show the assembled results table as this cell's output.
df

Unnamed: 0,project,algorithm,batch_size,saved_builds,builds_reqd,testall_size,delay_list,median_delay
0,heroku.csv,BATCH4,4,99.100529,0.899471,1890,"[102, 99, 96, 74, 72, 66, 58, 37, 28, 19, 17, ...",86.0
1,heroku.csv,BATCHSTOP4,4,99.100529,0.899471,1890,"[102, 99, 96, 74, 72, 66, 58, 37, 28, 19, 17, ...",86.0
2,heroku.csv,BATCHSTOP4,8,99.100529,0.899471,1890,"[102, 99, 96, 74, 72, 66, 58, 37, 28, 19, 17, ...",86.0
3,heroku.csv,BATCHSTOP4,16,99.100529,0.899471,1890,"[102, 99, 96, 74, 72, 66, 58, 37, 28, 19, 17, ...",86.0
4,heroku.csv,BATCHBISECT,1,99.100529,0.899471,1890,"[102, 99, 96, 74, 72, 66, 58, 37, 28, 19, 17, ...",86.0
...,...,...,...,...,...,...,...,...
148,graylog2-server.csv,BATCHBISECT,1,99.855977,0.144023,2083,"[82, 76, 27, 12, 1815, 1758, 1739, 1699, 1655,...",949.5
149,graylog2-server.csv,BATCHBISECT,2,99.855977,0.144023,2083,"[82, 76, 27, 12, 1815, 1758, 1739, 1699, 1655,...",949.5
150,graylog2-server.csv,BATCHBISECT,4,99.855977,0.144023,2083,"[82, 76, 27, 12, 1815, 1758, 1739, 1699, 1655,...",949.5
151,graylog2-server.csv,BATCHBISECT,8,99.855977,0.144023,2083,"[82, 76, 27, 12, 1815, 1758, 1739, 1699, 1655,...",949.5


In [93]:
# Persist the results table in the working directory. The row index is written
# as well because to_csv's `index` argument is left at its default.
df.to_csv('version_sbs_results.csv')