In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pickle
import csv

In [2]:
projects = ['gradle', 'cloud_controller_ng', 'geoserver']
data_path = '../data/'
confidence = range(2,21,1)

In [3]:
def get_train_test_data(filename):
    
    csv_file = csv.reader(open(filename, 'r'))
    
    temp_data = []
    final_data = []

    for item in csv_file:
        temp_data.append(item)

    for i in range(len(temp_data[0])):
        temp = []
        for index in range(1, len(temp_data)):
            temp.append(temp_data[index][i])
        final_data.append(temp)

    indices = range(len(final_data[3]))

    #capture the metrics of source churn, test churn, file churn and team size in a list
    src_churn = []
    file_churn = []
    test_churn = []
    team_size = []
    build_result = []
    git_num_all_built_commits = []
    gh_num_commits_on_files_touched = []
    argument = []

    for index in indices:
        src_churn.append(float(final_data[23][index]))
        file_churn.append(float(final_data[27][index]))
        test_churn.append(float(final_data[24][index]))
        team_size.append(float(final_data[14][index]))
        
        if final_data[42][index] == 'passed':
            build_result.append(1)
        else:
            build_result.append(0)

        argument.append([])

    for index in range(len(src_churn)):
        argument[index].append(src_churn[index])
        argument[index].append(team_size[index])
        argument[index].append(file_churn[index])
        argument[index].append(test_churn[index])
    
    return np.array(argument), np.array(build_result)

In [4]:
def sbs(project):
    
    train_file = "../data/" + project + '_train.csv'
    num_feature = 4
    
    X_train, Y_train = get_train_test_data(train_file)
    X_train = X_train.reshape((int(len(X_train)), num_feature))

    rf = RandomForestClassifier()
    predictor = rf.fit(X_train, Y_train)
    return predictor

In [5]:
def get_durations(project):
    csv_file = pd.read_csv(project)
    durations = csv_file['tr_duration'].tolist()
    return durations

In [6]:
for p in projects:
    
    predictor = sbs(p)
    
    #get the test data
    test_file = "../data/" + p + '_test.csv'
    X_test, Y_test = get_train_test_data(test_file)
    Y_duration = get_durations(test_file)
    Y_result = []
    grouped_batch = []
    actual_group_results = []
    group_duration = []
    max_batch_size = 4
    num_feature = 4 
    print(len(Y_test))
    
    project_reqd_builds = []
    project_missed_builds = []
    project_build_duration = []
    project_saved_builds = []
    project_delays = []
    
    print('Processing {}'.format(p))
    for c in confidence:
        
        pass_streak = Y_test[0]
        total_builds = Y_test[0]
        missed_builds = 0
        miss_indexes = []
        build_indexes = []
        delay_durations = []
        
        if pass_streak == 0:
            total_duration = Y_duration[0]
            saved_builds = 0
        else:
            total_duration = 0
            saved_builds = 1
        
        for index in range(1, len(X_test)):
            commit = X_test[index]
            commit = commit.reshape((1, num_feature))
            value = predictor.predict(commit)
            #we're setting a confidence of 'c' builds on SBS, if more than 'c' passes have been suggested in a row, we don't want to trust sbs
            if pass_streak < c :
                if value == 0:
                    pass_streak = 0
                    total_builds += 1
                    total_duration += Y_duration[index]
                    if len(miss_indexes) > 0:
                        if miss_indexes[-1] < index:
                            for l in range(len(miss_indexes)):
                                e = miss_indexes.pop()
                                delay_durations.append(index - e + 1)
                else:
                    pass_streak += 1
                    saved_builds += 1
                    if Y_test[index] == 0:
                        missed_builds += 1
                        miss_indexes.append(index)
            else:
                
                if len(grouped_batch) < max_batch_size:
                    grouped_batch.append(index)
                    actual_group_results.append(Y_test[index])
                    group_duration.append(Y_duration[index])
                
                if len(grouped_batch) == max_batch_size:
                    if len(miss_indexes) > 0:
                        if miss_indexes[-1] < index:
                            for l in range(len(miss_indexes)):
                                e = miss_indexes.pop()
                                delay_durations.append(index - e + 1)
                    
                    total_builds += 1
                    total_duration += max(group_duration)
                    
                    if 0 in actual_group_results:
                        total_builds += max_batch_size
                        total_duration += sum(group_duration)
                        
                    grouped_batch.clear()
                    actual_group_results.clear()
                    group_duration.clear()
                    
                #pass_streak = 1
        
        print('\tFor confidence {}:'.format(c))
        print('\t\tTotal builds needed : {}'.format(total_builds))
        print('\t\tTotal number of missed builds : {}'.format(missed_builds))
        print('\t\tTotal number of saved builds : {}'.format(saved_builds))
        print('\t\tTotal duration of builds : {}'.format(total_duration))
        print('\t\tTotal delays: {}'.format(delay_durations))
        
        project_reqd_builds.append(total_builds)
        project_missed_builds.append(missed_builds)
        project_build_duration.append(total_duration)
        project_saved_builds.append(saved_builds)
        project_delays.append(delay_durations)
        
    print(project_reqd_builds)
    print(project_missed_builds)
    print(project_build_duration)
    print(project_saved_builds)
    print(project_delays)

1071
Processing gradle
	For confidence 2:
		Total builds needed : 544
		Total number of missed builds : 0
		Total number of saved builds : 2
		Total duration of builds : 477479
		Total delays: []
	For confidence 3:
		Total builds needed : 548
		Total number of missed builds : 0
		Total number of saved builds : 3
		Total duration of builds : 482294
		Total delays: []
	For confidence 4:
		Total builds needed : 540
		Total number of missed builds : 0
		Total number of saved builds : 4
		Total duration of builds : 472759
		Total delays: []
	For confidence 5:
		Total builds needed : 547
		Total number of missed builds : 0
		Total number of saved builds : 5
		Total duration of builds : 483396
		Total delays: []
	For confidence 6:
		Total builds needed : 551
		Total number of missed builds : 0
		Total number of saved builds : 6
		Total duration of builds : 486160
		Total delays: []
	For confidence 7:
		Total builds needed : 551
		Total number of missed builds : 0
		Total number of saved build

	For confidence 3:
		Total builds needed : 488
		Total number of missed builds : 0
		Total number of saved builds : 5
		Total duration of builds : 1929526
		Total delays: []
	For confidence 4:
		Total builds needed : 475
		Total number of missed builds : 1
		Total number of saved builds : 17
		Total duration of builds : 1814192
		Total delays: [2]
	For confidence 5:
		Total builds needed : 475
		Total number of missed builds : 4
		Total number of saved builds : 22
		Total duration of builds : 1818859
		Total delays: [2, 2, 4, 5]
	For confidence 6:
		Total builds needed : 475
		Total number of missed builds : 4
		Total number of saved builds : 23
		Total duration of builds : 1822832
		Total delays: [2, 4, 6, 7]
	For confidence 7:
		Total builds needed : 470
		Total number of missed builds : 4
		Total number of saved builds : 24
		Total duration of builds : 1764611
		Total delays: [2, 7, 9, 10]
	For confidence 8:
		Total builds needed : 470
		Total number of missed builds : 4
		Total num

In [21]:
from github import Github
access_token = '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'
g = Github(access_token)
r = g.get_repo('gradle/gradle')
commit = '3ea5f46d9769cddb4061b726ce74784a30756028'
c = r.get_commit(commit)
name = c.commit.author.name
q = 'fullname:'+name
u = g.search_users(q)
for d in u:
    print(d.login)

breskeby
