In [1]:
import pandas as pd
import numpy as np
from numpy import argmax
from numpy import sqrt
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.utils import resample
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.model_selection import KFold
from matplotlib import pyplot
from statistics import median
import pickle
import csv
import warnings
import datetime
import multiprocessing
from joblib import Parallel, delayed
import os

In [2]:
warnings.filterwarnings("ignore")

In [3]:
# Full set of project files.
project_list = [
    'jruby.csv',
    'metasploit-framework.csv',
    'cloudify.csv',
    'vagrant.csv',
    'rubinius.csv',
    'open-build-service.csv',
    'sonarqube.csv',
    'loomio.csv',
    'fog.csv',
    'opal.csv',
    'cloud_controller_ng.csv',
    'puppet.csv',
    'concerto.csv',
    'sufia.csv',
    'geoserver.csv',
    'orbeon-forms.csv',
    'graylog2-server.csv',
]

# This second assignment replaces the list above: only these projects are
# processed by the cells below.
project_list = [
    'heroku.csv',
    'rails.csv',
    'gradle.csv',
]

In [4]:
# Module-level accumulators updated through `global` by batch_bisect and
# batch_stop_4: number of builds executed and their summed duration.
batch_total = batch_duration = 0

In [5]:
def batch_bisect(batch_results, duration_subbatch):
    """Account for the cost of locating failures in a batch by bisection.

    Every call represents one build of the given batch: it adds 1 to the
    global ``batch_total`` and the duration of the batch's last build to the
    global ``batch_duration``. A batch of more than one build that contains a
    failure (a ``0``) is split in half and both halves are processed the
    same way.

    Parameters
    ----------
    batch_results : list
        Build outcomes of the batch (0 marks a failure).
    duration_subbatch : list
        Durations of the same builds, aligned with ``batch_results``.

    Returns
    -------
    None. The result is reported through the two global accumulators, which
    the caller is expected to reset beforehand.
    """
    global batch_total, batch_duration

    batch_total = batch_total + 1
    batch_duration = batch_duration + duration_subbatch[-1]

    size = len(batch_results)
    # A single build cannot be split further, and a clean batch needs no search.
    if size == 1 or 0 not in batch_results:
        return

    middle = size // 2
    for lo, hi in ((None, middle), (middle, None)):
        batch_bisect(batch_results[lo:hi], duration_subbatch[lo:hi])

In [6]:
def batch_stop_4(batch_results, duration_subbatch):
    """Account for the cost of locating failures with the "stop at 4" strategy.

    Every call represents one build of the given batch: it adds 1 to the
    global ``batch_total`` and the duration of the batch's last build to the
    global ``batch_duration``. If the batch contains a failure (a ``0``):

    * a batch of more than 4 builds is split in half and both halves are
      processed the same way;
    * a batch of 4 builds or fewer is not split any further - each of its
      builds is executed individually, adding one build per member and the
      sum of their durations.

    Parameters
    ----------
    batch_results : list
        Build outcomes of the batch (0 marks a failure).
    duration_subbatch : list
        Durations of the same builds, aligned with ``batch_results``.

    Returns
    -------
    None. The result is reported through the two global accumulators, which
    the caller is expected to reset beforehand.
    """
    global batch_total
    global batch_duration

    # Cost of building the batch as a whole.
    batch_total += 1
    batch_duration += duration_subbatch[-1]

    if 0 not in batch_results:
        return

    if len(batch_results) <= 4:
        # Charge one build per member of the batch. A short batch (the tail of
        # the test window, or the smaller half of an odd split) holds fewer
        # than 4 builds, so a fixed "+4" would disagree with the summed
        # durations charged on the next line.
        batch_total += len(batch_results)
        batch_duration += sum(duration_subbatch)
        return

    half_batch = len(batch_results)//2
    batch_stop_4(batch_results[:half_batch], duration_subbatch[:half_batch])
    batch_stop_4(batch_results[half_batch:], duration_subbatch[half_batch:])

In [7]:
def output_values(Y_data):
    """Encode build statuses as integers.

    Returns a new list with 1 for every entry equal to ``'passed'`` and 0 for
    every other value, in the same order as ``Y_data``.
    """
    return [1 if status == 'passed' else 0 for status in Y_data]

In [8]:
def get_complete_data(p_name, first_failures=True):
    """Load one project's metrics joined with its build durations.

    Parameters
    ----------
    p_name : str
        Project file name such as ``'heroku.csv'``. The part before the first
        dot selects ``project_metrics/<name>_metrics.csv``; the full name
        selects the duration file under ``../../data/full_data/``.
    first_failures : bool, default True
        When true the merged frame is passed through ``get_first_failures``
        before being returned.
        NOTE(review): ``get_first_failures`` is not defined in the visible
        part of this notebook; confirm it exists before relying on the
        default.

    Returns
    -------
    pandas.DataFrame
        The metrics (with the column at position 9 removed) merged with
        ``tr_duration`` on ``tr_build_id``; ``tr_status`` is encoded by
        ``output_values`` (1 for 'passed', 0 otherwise).
    """
    stem = p_name.split('.')[0]

    # Metrics file; the column at position 9 is discarded.
    metrics = pd.read_csv('project_metrics/' + stem + '_metrics.csv')
    metrics = metrics.drop(metrics.columns[9], axis=1)
    known_ids = metrics['tr_build_id'].tolist()

    # Durations, restricted to the builds present in the metrics file.
    durations = pd.read_csv('../../data/full_data/' + p_name, usecols=['tr_build_id', 'tr_duration'])
    durations = durations[durations['tr_build_id'].isin(known_ids)]

    merged = pd.merge(metrics, durations, on='tr_build_id')
    merged['tr_status'] = output_values(merged['tr_status'])

    return get_first_failures(merged) if first_failures else merged

In [9]:
def hybrid_performance(p_name, test_builds, test_result, batchsize, ci):
    """Summarise how a skip/build schedule handled the failures of a test window.

    Parameters
    ----------
    p_name, test_builds, batchsize
        Not used by the computation; kept so existing callers keep working.
    test_result : sequence of int
        Actual build outcomes, 0 for a failed build.
    ci : sequence of int
        Schedule aligned with ``test_result``: 0 means the build was executed,
        1 means it was skipped. Must be at least as long as ``test_result``.

    Returns
    -------
    tuple ``(total_delay, failures_found, failures_not_found, bad_builds)``

    total_delay
        Sum, over every skipped failure, of the number of builds until the
        next executed build (or until the end of the window when no executed
        build follows).
    failures_found / failures_not_found
        Percentage of failed builds that were executed / skipped. Reported as
        100 and 0 when the window contains no failure.
    bad_builds
        Number of skipped builds in the runs that start at a skipped failure
        and last until the next executed build.
    """
    bad_builds = 0
    in_bad_run = False   # True from a skipped failure until the next executed build
    waiting = []         # indexes of skipped failures not yet followed by a build
    num_failed = 0
    num_of_failure_unidentified = 0
    total_delay = 0

    for i in range(len(test_result)):
        failed = test_result[i] == 0
        if failed:
            num_failed += 1

        if ci[i] == 0:
            # An executed build reveals every failure skipped since the last
            # executed build (each exactly once) and ends the bad run.
            total_delay += sum(i - idx for idx in waiting)
            waiting = []
            in_bad_run = False
            continue

        # Skipped build.
        if in_bad_run:
            bad_builds += 1
        elif failed:
            in_bad_run = True
            bad_builds += 1

        if failed:
            waiting.append(i)
            num_of_failure_unidentified += 1

    # Failures never followed by an executed build wait until the window ends.
    final_index = len(test_result)
    total_delay += sum(final_index - idx for idx in waiting)

    if num_failed == 0:
        failures_found = 100
        failures_not_found = 0
    else:
        identified_failures = num_failed - num_of_failure_unidentified
        failures_found = 100*identified_failures/num_failed
        failures_not_found = 100*num_of_failure_unidentified/num_failed

    return (total_delay, failures_found, failures_not_found, bad_builds)

In [10]:
def get_delay_from_ci(ci, y_test, batch_size, pflag=False):
    """Collect the per-build delays implied by a skip/build schedule.

    Parameters
    ----------
    ci : sequence of int
        0 where the build was executed, 1 where it was skipped.
    y_test : sequence of int
        Actual outcomes aligned with ``ci`` (0 marks a failure).
    batch_size : int
        Size of the batches executed builds are grouped into.
    pflag : bool, default False
        Print a trace of the walk when true.

    Returns
    -------
    list
        First one entry per skipped failure: the distance from that failure
        to the next executed build (or to the end of ``ci`` if none follows).
        Then one entry per executed build: a counter that starts at
        ``batch_size - 1``, decreases by one per executed build and wraps
        back to ``batch_size - 1`` after reaching 0.
    """
    skipped_failure_delays = []
    batch_wait_delays = []
    pending = []                 # skipped failures not yet followed by a build
    slot = batch_size - 1

    def flush(now):
        # Resolve the pending failures, most recent first.
        while pending:
            idx = pending.pop()
            skipped_failure_delays.append(now - idx)
            if pflag:
                print('{} {} {}'.format(idx, now, now - idx))

    for pos in range(len(ci)):
        if pflag:
            print(ci[pos], y_test[pos])

        if ci[pos] == 0:
            flush(pos)
            batch_wait_delays.append(slot)
            slot = batch_size - 1 if slot == 0 else slot - 1
        elif ci[pos] == 1 and y_test[pos] == 0:
            pending.append(pos)

    flush(len(ci))

    return skipped_failure_delays + batch_wait_delays

In [11]:
def _predict_pass_score(forest, row):
    """Return the model score for one build that is compared to the threshold.

    When ``predict_proba`` returns a (1, 2) array the second column is used;
    any other shape is passed through as ``proba[0]``.
    NOTE(review): a model fitted on a single class yields a (1, 1) array whose
    only column may belong to either class - confirm against ``classes_``.
    """
    proba = forest.predict_proba([row])
    if proba.shape == (1, 2):
        return proba[0][1]
    return proba[0]


def _simulate_batching(forest, threshold, features, test_result, test_duration, batchsize, culprit_cost):
    """Replay one test window with prediction-based skipping plus batching.

    Builds are visited in order. While the model score is above ``threshold``
    the build is skipped. On the first score at or below the threshold,
    builds are grouped into batches of ``batchsize`` and executed batch after
    batch until a batch contains no failure; prediction then resumes after
    that batch. ``culprit_cost`` (``batch_bisect`` or ``batch_stop_4``) is
    called once per batch and reports the builds and time needed for it
    through the globals ``batch_total`` / ``batch_duration``.

    Returns ``(ci, num_of_builds, actual_duration, batched_duration)`` where
    ``ci[i]`` is 0 if build ``i`` was executed and 1 if it was skipped,
    ``actual_duration`` is the summed duration of the batched builds had they
    been built one by one, and ``batched_duration`` is the duration charged
    by ``culprit_cost``.
    """
    global batch_total
    global batch_duration

    total = len(features)
    ci = []
    num_of_builds = 0
    actual_duration = 0
    batched_duration = 0
    pass_streak = 0
    i = 0

    while i < total:
        # Work on a copy so setting the streak feature never writes back into
        # the feature frame.
        row = features.iloc[i].copy()
        row['num_of_passes'] = pass_streak

        #predicted that build has passed
        if _predict_pass_score(forest, row) > threshold:
            ci.append(1)
            pass_streak += 1
            i += 1
            continue

        # Predicted failure: keep batching until an entire batch passes, so
        # subsequent failures are grouped as well.
        while i < total:
            batch_results = test_result[i:i+batchsize]
            batch_durations = test_duration[i:i+batchsize]

            # The last batch of the window may be shorter than batchsize.
            ci.extend([0]*len(batch_results))
            actual_duration += sum(batch_durations)

            batch_total = 0
            batch_duration = 0
            culprit_cost(batch_results, batch_durations)
            num_of_builds += batch_total
            batched_duration += batch_duration

            i += batchsize
            if 0 not in batch_results:
                break

        #Now that we have found a passing build, we can update pass_streak to 1
        pass_streak = 1

    return ci, num_of_builds, actual_duration, batched_duration


def mlci_process(p_name):
    """Evaluate the batching strategies on sliding windows of one project.

    The project is cut into windows of one fifth of its length that advance
    by half a window. In each window the first 70% is the training range and
    the rest is the test range. The model and threshold for a window are
    loaded from ``dump_data/<project>_models/rq2_<p_name>_<end>_best_model.pkl``;
    windows without such a file are skipped. For every algorithm / batch size
    combination the test range is replayed and one result row is recorded.
    All rows are written to ``<p_name>_sw.csv``.

    Parameters
    ----------
    p_name : str
        Project file name, e.g. ``'heroku.csv'``.

    Returns
    -------
    None. Returns early, without writing the CSV, when a window has an empty
    test range.
    """
    p = p_name.split('.')[0]

    print('Processing {}'.format(p_name))

    result_rows = []

    project = get_complete_data(p_name, first_failures=False)

    #sliding window parameters
    print("Entire Project = {}".format(len(project)))
    window_size = len(project)//5
    # Windows overlap by half. Keep the step at least 1 so that very small
    # projects (window_size < 2) cannot loop forever on the same window.
    step = max(1, int((0.5)*window_size))

    batchsizelist = [1, 2, 4, 8, 16]
    # (name, cost function used to locate failures in a batch, batch sizes it runs at).
    # BATCH4 executes every build of a failing batch individually, which is
    # exactly what batch_stop_4 does for batches of at most 4 builds.
    strategies = [
        ('BATCH4', batch_stop_4, lambda size: size == 4),
        ('BATCHSTOP4', batch_stop_4, lambda size: size >= 4),
        ('BATCHBISECT', batch_bisect, lambda size: True),
    ]

    for start_p in range(0, len(project) - window_size + 1, step):
        end_p = start_p + window_size

        window = project[start_p:end_p]
        print("Start:{} \t End:{} \t Length Window:{}".format(start_p, end_p, len(window)))

        train_start = start_p
        train_end = start_p + int(0.7*len(window))
        test_start = train_end
        test_end = end_p

        print("TrainS: {} \t TrainE: {} \t TestS: {} \t TestE: {}".format(train_start, train_end, test_start, test_end))
        train_data = project[train_start:train_end]
        test_data = project[test_start:test_end]

        print("Length of Train: {} \t Length of Test: {}".format(len(train_data), len(test_data)))

        file_name = 'dump_data/' + p + '_models/'+ 'rq2_' + p_name + '_' + str(end_p) + '_best_model.pkl'
        print(file_name)

        if not os.path.isfile(file_name):
            # Without a model for this window there is nothing to evaluate;
            # never fall through with an undefined or stale model.
            print("file does not exist")
            continue

        # SECURITY: pickle.load can execute arbitrary code stored in the file;
        # only load model dumps produced by a trusted run.
        with open(file_name, 'rb') as dump_file:
            forest = pickle.load(dump_file)
            threshold = pickle.load(dump_file)
        print("unpacked file")

        # NOTE(review): presumably an integer is stored in place of the model
        # when no model could be trained for a window - confirm against the
        # code that writes these dumps.
        if isinstance(forest, (int, np.integer)):
            print("Ending at {}".format(end_p))
            break

        test_result = test_data['tr_status'].tolist()
        test_duration = test_data['tr_duration'].tolist()

        if len(test_result) == 0:
            print("length of test is 0")
            return

        # Model inputs: everything except the id, the label and the duration.
        features = test_data.drop(['tr_build_id', 'tr_status', 'tr_duration'], axis=1)
        total = len(features)

        for alg, culprit_cost, runs_at in strategies:
            for batchsize in batchsizelist:

                print("processing {} {}".format(alg, batchsize))
                if not runs_at(batchsize):
                    continue

                #The variable 'ci' will hold the actual execution process of the current phase
                #If ci[i] == 0, it means that build was made
                #If ci[i] == 1, it means that build was saved
                ci, num_of_builds, project_actual_duration, project_batch_duration = _simulate_batching(
                    forest, threshold, features, test_result, test_duration, batchsize, culprit_cost)

                local_builds_reqd = 100*num_of_builds/total
                delay_ci = get_delay_from_ci(ci, test_result, batchsize, pflag=False)

                # No build executed and no failure skipped: there is no delay
                # to take a median of (statistics.median raises on empty data).
                median_delay = median(delay_ci) if delay_ci else np.nan

                result_rows.append([p_name, test_start, test_end, alg, batchsize, local_builds_reqd, median_delay, delay_ci, project_actual_duration, project_batch_duration, total, ci])
                if len(test_result) != len(ci):
                    print('PROBLEM!!')
                else:
                    print('NO PROBLEM!!')

    print('converting to csv')
    df = pd.DataFrame(result_rows, columns=['project', 'test_start', 'test_end', 'algorithm', 'batch_size', 'builds_reqd', 'median_delay', 'delay_list', 'actual_duration', 'batch_duration', 'testall_size', 'ci'])
    n = p_name + '_sw.csv'
    df.to_csv(n)

In [12]:
# Run the sliding-window evaluation for every project in `project_list`.
# Each call writes its results to '<project file name>_sw.csv'.
for p in project_list:
    mlci_process(p)

Processing heroku.csv
Entire Project = 2083
Start:0 	 End:416 	 Length Window:416
TrainS: 0 	 TrainE: 291 	 TestS: 291 	 TestE: 416
Length of Train: 291 	 Length of Test: 125
dump_data/heroku_models/rq2_heroku.csv_416_best_model.pkl
unpacked file
processing BATCH4 1
processing BATCH4 2
processing BATCH4 4
NO PROBLEM!!
processing BATCH4 8
processing BATCH4 16
processing BATCHSTOP4 1
processing BATCHSTOP4 2
processing BATCHSTOP4 4
NO PROBLEM!!
processing BATCHSTOP4 8
NO PROBLEM!!
processing BATCHSTOP4 16
NO PROBLEM!!
processing BATCHBISECT 1
NO PROBLEM!!
processing BATCHBISECT 2
NO PROBLEM!!
processing BATCHBISECT 4
NO PROBLEM!!
processing BATCHBISECT 8
NO PROBLEM!!
processing BATCHBISECT 16
NO PROBLEM!!
Start:208 	 End:624 	 Length Window:416
TrainS: 208 	 TrainE: 499 	 TestS: 499 	 TestE: 624
Length of Train: 291 	 Length of Test: 125
dump_data/heroku_models/rq2_heroku.csv_624_best_model.pkl
unpacked file
processing BATCH4 1
processing BATCH4 2
processing BATCH4 4
NO PROBLEM!!
processi

NO PROBLEM!!
Start:4239 	 End:7065 	 Length Window:2826
TrainS: 4239 	 TrainE: 6217 	 TestS: 6217 	 TestE: 7065
Length of Train: 1978 	 Length of Test: 848
dump_data/rails_models/rq2_rails.csv_7065_best_model.pkl
unpacked file
processing BATCH4 1
processing BATCH4 2
processing BATCH4 4
NO PROBLEM!!
processing BATCH4 8
processing BATCH4 16
processing BATCHSTOP4 1
processing BATCHSTOP4 2
processing BATCHSTOP4 4
NO PROBLEM!!
processing BATCHSTOP4 8
NO PROBLEM!!
processing BATCHSTOP4 16
NO PROBLEM!!
processing BATCHBISECT 1
NO PROBLEM!!
processing BATCHBISECT 2
NO PROBLEM!!
processing BATCHBISECT 4
NO PROBLEM!!
processing BATCHBISECT 8
NO PROBLEM!!
processing BATCHBISECT 16
NO PROBLEM!!
Start:5652 	 End:8478 	 Length Window:2826
TrainS: 5652 	 TrainE: 7630 	 TestS: 7630 	 TestE: 8478
Length of Train: 1978 	 Length of Test: 848
dump_data/rails_models/rq2_rails.csv_8478_best_model.pkl
unpacked file
processing BATCH4 1
processing BATCH4 2
processing BATCH4 4
NO PROBLEM!!
processing BATCH4 8
p

NO PROBLEM!!
processing BATCH4 8
processing BATCH4 16
processing BATCHSTOP4 1
processing BATCHSTOP4 2
processing BATCHSTOP4 4
NO PROBLEM!!
processing BATCHSTOP4 8
NO PROBLEM!!
processing BATCHSTOP4 16
NO PROBLEM!!
processing BATCHBISECT 1
NO PROBLEM!!
processing BATCHBISECT 2
NO PROBLEM!!
processing BATCHBISECT 4
NO PROBLEM!!
processing BATCHBISECT 8
NO PROBLEM!!
processing BATCHBISECT 16
NO PROBLEM!!
Start:2492 	 End:3205 	 Length Window:713
TrainS: 2492 	 TrainE: 2991 	 TestS: 2991 	 TestE: 3205
Length of Train: 499 	 Length of Test: 214
dump_data/gradle_models/rq2_gradle.csv_3205_best_model.pkl
unpacked file
processing BATCH4 1
processing BATCH4 2
processing BATCH4 4
NO PROBLEM!!
processing BATCH4 8
processing BATCH4 16
processing BATCHSTOP4 1
processing BATCHSTOP4 2
processing BATCHSTOP4 4
NO PROBLEM!!
processing BATCHSTOP4 8
NO PROBLEM!!
processing BATCHSTOP4 16
NO PROBLEM!!
processing BATCHBISECT 1
NO PROBLEM!!
processing BATCHBISECT 2
NO PROBLEM!!
processing BATCHBISECT 4
NO PRO

In [13]:
# NOTE(review): scratch arithmetic - presumably a window offset (356) plus the
# gradle train length (499) to check a train/test boundary; confirm or delete.
356+499

855

In [14]:
# NOTE(review): scratch arithmetic - presumably a window offset (356) plus the
# gradle window length (713) to check a window end; confirm or delete.
356 + 713

1069