In [1]:
import csv
import pandas as pd
import numpy as np
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import pickle
import datetime

In [2]:
def measure_balance(y_result):
    pass_count = 0
    fail_count = 0
    length = len(y_result)
    
    for i in range(length):
        if y_result[i] == 'passed':
            pass_count += 1
        else:
            fail_count += 1
    
    if pass_count == 0:
        return (0, fail_count*100/length)
    
    if fail_count == 0:
        return (pass_count*100/length, 0)
    
    return (pass_count*100/length, fail_count*100/length)

In [3]:
def get_train_test_data(filename):
    
    csv_file = csv.reader(open(filename, 'r'))
    
    temp_data = []
    final_data = []

    for item in csv_file:
        temp_data.append(item)

    for i in range(len(temp_data[0])):
        temp = []
        for index in range(1, len(temp_data)):
            temp.append(temp_data[index][i])
        final_data.append(temp)

    indices = range(len(final_data[3]))

    #capture the metrics of source churn, test churn, file churn and team size in a list
    src_churn = []
    file_churn = []
    test_churn = []
    team_size = []
    time_stamp = []
    build_result = []
    git_num_all_built_commits = []
    gh_num_commits_on_files_touched = []
    argument = []

    for index in indices:
        src_churn.append(float(final_data[23][index]))
        file_churn.append(float(final_data[27][index]))
        test_churn.append(float(final_data[24][index]))
        team_size.append(float(final_data[14][index]))
        time_stamp.append(datetime.strptime(final_data[41][index], "%y-%m-%d %H:%M:%S"))
        
        if final_data[42][index] == 'passed':
            build_result.append(1)
        else:
            build_result.append(0)

        argument.append([])

    for index in range(len(src_churn)):
        argument[index].append(src_churn[index])
        argument[index].append(team_size[index])
        argument[index].append(file_churn[index])
        argument[index].append(test_churn[index])
    
    return np.array(argument), np.array(build_result)

In [4]:
project_list = ['gradle', 'cloud_controller_ng', 'geoserver']

In [29]:
for project in project_list:
    print('Processing {}... \n\n\n\n'.format(project))
    
    best_diff = 100
    X = pd.read_csv('try_data/' + project + '.csv')
    X['gh_build_started_at'] =  pd.to_datetime(X['gh_build_started_at'], format='%Y-%m-%d %H:%M:%S')
    
    start_date = X['gh_build_started_at'].tolist()[0]
    end_date = X['gh_build_started_at'].tolist()[-1]
    
    while start_date < end_date :
        phase_end = start_date + datetime.timedelta(days = 60)
        
        test_data = X.loc[ (X['gh_build_started_at'] > start_date) & (X['gh_build_started_at'] < phase_end)]['tr_status'].tolist()
        train_data = X.loc[ (X['gh_build_started_at'] < start_date) | (X['gh_build_started_at'] > phase_end)]['tr_status'].tolist()
        
        train_ev = measure_balance(train_data)
        test_ev = measure_balance(test_data)
        
        diff = abs(train_ev[0] - test_ev[0])
        if diff < best_diff:
            best_diff = diff
            print(start_date)
            print(phase_end)
            print(diff)
            print(train_ev)
            print(test_ev)
            print('\n')
            
            test_indexes = X.loc[ (X['gh_build_started_at'] > start_date) & (X['gh_build_started_at'] < phase_end)]['tr_build_id'].tolist()
            train_indexes = X.loc[ (X['gh_build_started_at'] < start_date) | (X['gh_build_started_at'] > phase_end)]['tr_build_id'].tolist()
            
            print(len(test_indexes))
            print(len(train_indexes))
            
            filename = project + '_indexes.pkl'
            with open(filename, 'wb') as save_file:
                pickle.dump(train_indexes, save_file)
                pickle.dump(test_indexes, save_file)
            
        
        start_date = phase_end
        

Processing gradle... 




2014-03-05 01:06:23
2014-05-04 01:06:23
3.232217129171204
(93.55234878722752, 6.44765121277249)
(96.78456591639872, 3.215434083601286)


311
3257
2014-05-04 01:06:23
2014-07-03 01:06:23
2.656744939891965
(93.67353028946583, 6.326469710534169)
(96.3302752293578, 3.669724770642202)


218
3351
2014-09-01 01:06:23
2014-10-31 01:06:23
0.7633926395929649
(93.72329937561616, 6.276700624383832)
(94.48669201520913, 5.513307984790875)


526
3043
Processing cloud_controller_ng... 




2013-01-17 20:55:32
2013-03-18 20:55:32
4.001904937475032
(81.08581436077058, 18.914185639229423)
(85.08771929824562, 14.912280701754385)


114
2284
2013-03-18 20:55:32
2013-05-17 20:55:32
3.307703421718827
(81.08945969884854, 18.91054030115146)
(84.39716312056737, 15.602836879432624)


141
2258
2013-05-17 20:55:32
2013-07-16 20:55:32
1.1774846086191815
(81.22251539138082, 18.777484608619172)
(82.4, 17.6)


125
2274
2014-03-13 20:55:32
2014-05-12 20:55:32
0.9678658374708391
(81.347612672913