In [11]:
import pandas as pd
import numpy as np
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle
import csv

In [12]:
projects = ['gradle', 'cloud_controller_ng', 'geoserver']
data_path = '../data/'
confidence = range(2,21,1)

In [23]:
def get_train_test_data(filename):
    
    csv_file = csv.reader(open(filename, 'r'))
    
    temp_data = []
    final_data = []

    for item in csv_file:
        temp_data.append(item)

    for i in range(len(temp_data[0])):
        temp = []
        for index in range(1, len(temp_data)):
            temp.append(temp_data[index][i])
        final_data.append(temp)

    indices = range(len(final_data[3]))

    #capture the metrics of source churn, test churn, file churn and team size in a list
    src_churn = []
    file_churn = []
    test_churn = []
    team_size = []
    build_result = []
    git_num_all_built_commits = []
    gh_num_commits_on_files_touched = []
    argument = []

    for index in indices:
        src_churn.append(float(final_data[23][index]))
        file_churn.append(float(final_data[27][index]))
        test_churn.append(float(final_data[24][index]))
        team_size.append(float(final_data[14][index]))
        
        if final_data[42][index] == 'passed':
            build_result.append(1)
        else:
            build_result.append(0)

        argument.append([])

    for index in range(len(src_churn)):
        argument[index].append(src_churn[index])
        argument[index].append(team_size[index])
        argument[index].append(file_churn[index])
        argument[index].append(test_churn[index])
    
    return np.array(argument), np.array(build_result)

In [27]:
def sbs(project):
    
    train_file = "../data/" + project + '_train.csv'
    num_feature = 4
    
    X_train, Y_train = get_train_test_data(train_file)
    X_train = X_train.reshape((int(len(X_train)), num_feature))

    rf = RandomForestClassifier()
    predictor = rf.fit(X_train, Y_train)
    return predictor

In [30]:
def get_durations(project):
    csv_file = pd.read_csv(project)
    durations = csv_file['tr_duration'].tolist()
    return durations

In [39]:
for p in projects:
    
    predictor = sbs(p)
    
    #get the test data
    test_file = "../data/" + p + '_test.csv'
    X_test, Y_test = get_train_test_data(test_file)
    Y_duration = get_durations(test_file)
    Y_result = []
    grouped_batch = []
    actual_group_results = []
    group_duration = []
    max_batch_size = 4
    num_feature = 4 
    
    
    project_reqd_builds = []
    project_missed_builds = []
    project_build_duration = []
    project_saved_builds = []
    project_delays = []
    
    print('Processing {}'.format(p))
    for c in confidence:
        
        pass_streak = Y_test[0]
        total_builds = Y_test[0]
        missed_builds = 0
        miss_indexes = []
        build_indexes = []
        delay_durations = []
        
        if pass_streak == 0:
            total_duration = Y_duration[0]
            saved_builds = 0
        else:
            total_duration = 0
            saved_builds = 1
        
        for index in range(1, len(X_test)):
            commit = X_test[index]
            commit = commit.reshape((1, num_feature))
            value = predictor.predict(commit)
            #we're setting a confidence of 'c' builds on SBS, if more than 'c' passes have been suggested in a row, we don't want to trust sbs
            if pass_streak < c :
                if value == 0:
                    pass_streak = 0
                    total_builds += 1
                    total_duration += Y_duration[index]
                    if len(miss_indexes) > 0:
                        if miss_indexes[-1] < index:
                            for l in range(len(miss_indexes)):
                                e = miss_indexes.pop()
                                delay_durations.append(index - e + 1)
                else:
                    pass_streak += 1
                    saved_builds += 1
                    if Y_test[index] == 0:
                        missed_builds += 1
                        miss_indexes.append(index)
            else:
                
                if len(grouped_batch) < max_batch_size:
                    grouped_batch.append(index)
                    actual_group_results.append(Y_test[index])
                    group_duration.append(Y_duration[index])
                
                if len(grouped_batch) == max_batch_size:
                    if len(miss_indexes) > 0:
                        if miss_indexes[-1] < index:
                            for l in range(len(miss_indexes)):
                                e = miss_indexes.pop()
                                delay_durations.append(index - e + 1)
                    
                    total_builds += 1
                    total_duration += max(group_duration)
                    
                    if 0 in actual_group_results:
                        total_builds += max_batch_size
                        total_duration += sum(group_duration)
                        
                    grouped_batch.clear()
                    actual_group_results.clear()
                    group_duration.clear()
                    
                pass_streak = 1
        
        print('\tFor confidence {}:'.format(c))
        print('\t\tTotal builds needed : {}'.format(total_builds))
        print('\t\tTotal number of missed builds : {}'.format(missed_builds))
        print('\t\tTotal number of saved builds : {}'.format(saved_builds))
        print('\t\tTotal duration of builds : {}'.format(total_duration))
        print('\t\tTotal delays: {}'.format(delay_durations))
        
        project_reqd_builds.append(total_builds)
        project_missed_builds.append(missed_builds)
        project_build_duration.append(total_duration)
        project_saved_builds.append(saved_builds)
        project_delays.append(delay_durations)
        
    print(project_reqd_builds)
    print(project_missed_builds)
    print(project_build_duration)
    print(project_saved_builds)
    print(project_delays)

Processing gradle
	For confidence 2:
		Total builds needed : 279
		Total number of missed builds : 52
		Total number of saved builds : 536
		Total duration of builds : 238881
		Total delays: [8, 2, 2, 8, 2, 6, 8, 6, 6, 2, 8, 4, 6, 6, 4, 4, 6, 8, 6, 4, 4, 2, 4, 8, 2, 4, 8, 8, 2, 4, 8, 8, 6, 2, 4, 8, 8, 8, 4, 8, 6, 2, 2, 6, 8, 2, 4, 2, 4, 8]
	For confidence 3:
		Total builds needed : 199
		Total number of missed builds : 65
		Total number of saved builds : 714
		Total duration of builds : 178638
		Total delays: [2, 9, 11, 12, 5, 6, 8, 5, 11, 5, 6, 11, 12, 9, 5, 11, 12, 11, 3, 8, 2, 8, 6, 5, 6, 12, 2, 2, 6, 5, 6, 2, 9, 6, 8, 9, 2, 5, 3, 2, 3, 5, 6, 11, 12, 2, 2, 5, 11, 2, 3, 8, 8, 11, 9, 2, 3, 5, 6, 8, 11]
	For confidence 4:
		Total builds needed : 145
		Total number of missed builds : 77
		Total number of saved builds : 803
		Total duration of builds : 125684
		Total delays: [7, 8, 2, 3, 11, 2, 2, 3, 15, 16, 6, 7, 8, 6, 6, 7, 15, 15, 10, 8, 12, 14, 4, 14, 4, 7, 8, 14, 15, 6, 4, 12, 15, 2

	For confidence 2:
		Total builds needed : 267
		Total number of missed builds : 56
		Total number of saved builds : 360
		Total duration of builds : 667777
		Total delays: [4, 6, 4, 4, 2, 2, 4, 4, 4, 6, 8, 2, 4, 4, 6, 4, 4, 4, 6, 6, 3, 2, 4, 6, 4, 4, 6, 4, 8, 7, 2, 3, 8, 6, 6, 8, 6, 2, 2, 6, 2, 2, 2, 4, 2, 6, 7, 2, 2, 2, 8, 6, 4, 6, 8, 8]
	For confidence 3:
		Total builds needed : 179
		Total number of missed builds : 77
		Total number of saved builds : 476
		Total duration of builds : 452454
		Total delays: [8, 11, 5, 6, 4, 8, 6, 7, 8, 6, 9, 12, 9, 11, 9, 6, 8, 9, 12, 2, 2, 3, 6, 8, 9, 6, 11, 9, 2, 12, 6, 2, 2, 12, 13, 9, 8, 11, 12, 2, 9, 2, 3, 5, 6, 8, 3, 12, 13, 2, 3, 5, 2, 4, 3, 5, 9, 5, 12, 5, 2, 3, 5, 12, 5, 4, 2, 3, 11, 12, 3, 6, 8, 2, 6, 9]
	For confidence 4:
		Total builds needed : 124
		Total number of missed builds : 94
		Total number of saved builds : 534
		Total duration of builds : 316439
		Total delays: [7, 8, 10, 11, 4, 5, 12, 13, 8, 3, 4, 2, 7, 6, 9, 9, 3, 7, 8, 2, 3,

	For confidence 19:
		Total builds needed : 55
		Total number of missed builds : 105
		Total number of saved builds : 664
		Total duration of builds : 134861
		Total delays: [23, 24, 26, 27, 4, 5, 12, 13, 8, 3, 4, 11, 16, 9, 15, 18, 2, 3, 14, 16, 3, 10, 11, 12, 15, 17, 18, 34, 35, 52, 57, 61, 63, 64, 65, 66, 4, 17, 10, 28, 29, 30, 44, 3, 14, 19, 35, 38, 48, 53, 58, 59, 5, 12, 15, 16, 7, 8, 5, 6, 8, 9, 25, 6, 9, 15, 21, 2, 4, 7, 8, 10, 9, 14, 21, 25, 26, 30, 31, 4, 10, 22, 29, 30, 31, 32, 33, 46, 2, 2, 3, 11, 18, 21, 22, 38, 39]
[267, 179, 124, 115, 112, 99, 79, 76, 65, 63, 59, 60, 56, 61, 56, 51, 56, 55]
[56, 77, 94, 92, 93, 94, 99, 101, 105, 105, 106, 104, 104, 105, 105, 104, 107, 105]
[667777, 452454, 316439, 286410, 277089, 249016, 195509, 188176, 157705, 158248, 141901, 148819, 138990, 149213, 134302, 123787, 137517, 134861]
[360, 476, 534, 567, 590, 605, 615, 625, 634, 640, 644, 647, 651, 656, 657, 661, 662, 664]
[[4, 6, 4, 4, 2, 2, 4, 4, 4, 6, 8, 2, 4, 4, 6, 4, 4, 4, 6, 6, 3, 2, 

	For confidence 5:
		Total builds needed : 231
		Total number of missed builds : 107
		Total number of saved builds : 412
		Total duration of builds : 757180
		Total delays: [3, 10, 12, 13, 2, 3, 2, 2, 10, 8, 3, 7, 4, 7, 14, 18, 19, 7, 7, 2, 19, 20, 5, 4, 3, 8, 3, 2, 4, 5, 6, 7, 9, 10, 11, 12, 14, 15, 16, 2, 3, 4, 5, 6, 2, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 16, 17, 18, 19, 2, 3, 4, 5, 6, 2, 3, 4, 6, 7, 8, 9, 2, 3, 4, 5, 7, 8, 9, 10, 12, 13, 14, 15, 16, 2, 3, 4, 2, 4, 7, 11, 2, 6, 3, 3, 4, 2, 3, 2, 2, 3, 5, 2, 3, 3, 2, 3]
	For confidence 6:
		Total builds needed : 217
		Total number of missed builds : 112
		Total number of saved builds : 426
		Total duration of builds : 690841
		Total delays: [4, 2, 9, 11, 12, 2, 3, 2, 5, 16, 24, 6, 10, 11, 16, 21, 25, 8, 5, 6, 2, 2, 24, 2, 4, 3, 2, 4, 2, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 2, 3, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 15, 16, 17