In [1]:
import csv
import pandas as pd
import numpy as np
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import pickle
import datetime

In [2]:
def measure_balance(y_result):
    pass_count = 0
    fail_count = 0
    length = len(y_result)
    
    for i in range(length):
        if y_result[i] == 'passed':
            pass_count += 1
        else:
            fail_count += 1
    
    if pass_count == 0:
        return (0, fail_count*100/length)
    
    if fail_count == 0:
        return (pass_count*100/length, 0)
    
    return (pass_count*100/length, fail_count*100/length)

In [3]:
def output_values(Y_data):
    Y_t = []
    for e in Y_data:
        if e == 'passed':
            Y_t.append(1)
        else:
            Y_t.append(0) 
    return Y_t


In [4]:
def get_first_failures(df):
    
    results = df['tr_status'].tolist()
    length = len(results)
    verdict = ['keep']
    prev = results[0]
    
    for i in range(1, length):
        if results[i] == 0:
            if prev == 0:
                verdict.append('discard')
                #print(i+1)
            else:
                verdict.append('keep')
        else:
            verdict.append('keep')
        prev = results[i]
    
    df['verdict'] = verdict
    df = df[ df['verdict'] == 'keep' ]
    df.drop('verdict', inplace=True, axis=1)
    return df


In [5]:
project_list = ['rails.csv', 'jruby.csv', 'metasploit-framework.csv', 'cloudify.csv', 'vagrant.csv', 'rubinius.csv', 'open-build-service.csv', 'gradle.csv', 'sonarqube.csv', 'loomio.csv', 'fog.csv', 'opal.csv', 'cloud_controller_ng.csv', 'puppet.csv', 'concerto.csv', 'sufia.csv', 'geoserver.csv', 'orbeon-forms.csv', 'graylog2-server.csv', 'heroku.csv']

In [9]:
for project in project_list:
    print('\n\n\n\nProcessing {}...'.format(project))
    
    best_diff = 100
    X = pd.read_csv('../../data/full_data/' + project)
    X['gh_build_started_at'] =  pd.to_datetime(X['gh_build_started_at'], format='%Y-%m-%d %H:%M:%S')
    
    start_date = X['gh_build_started_at'].tolist()[0]
    end_date = X['gh_build_started_at'].tolist()[-1]
    phase = (end_date - start_date)/10
    i = 1
    while start_date < end_date :
        phase_end = start_date + phase
        
        test_data = X.loc[ (X['gh_build_started_at'] >= start_date) & (X['gh_build_started_at'] < phase_end)]['tr_status'].tolist()
        train_data = X.loc[ (X['gh_build_started_at'] < start_date) | (X['gh_build_started_at'] >= phase_end)]['tr_status'].tolist()
        print(len(test_data), len(train_data))
        test_indexes = X.loc[ (X['gh_build_started_at'] > start_date) & (X['gh_build_started_at'] < phase_end)]['tr_build_id'].tolist()
        train_indexes = X.loc[ (X['gh_build_started_at'] < start_date) | (X['gh_build_started_at'] > phase_end)]['tr_build_id'].tolist()
        
        filename = '../../data/project_data_pickles/' + project + '_' + str(i) + '_indexes.pkl'
        with open(filename, 'wb') as save_file:
            pickle.dump(train_indexes, save_file)
            pickle.dump(test_indexes, save_file)
        i += 1
            
        
        start_date = phase_end





Processing rails.csv...
1501 12632
1443 12690
1368 12765
1443 12690
1163 12970
1894 12239
1483 12650
1198 12935
1682 12451
957 13176




Processing jruby.csv...
541 7734
501 7774
706 7569
518 7757
566 7709
645 7630
1526 6749
1385 6890
766 7509
1120 7155




Processing metasploit-framework.csv...
666 6936
633 6969
670 6932
561 7041
780 6822
1286 6316
1278 6324
669 6933
588 7014
470 7132




Processing cloudify.csv...
697 4118
674 4141
520 4295
421 4394
464 4351
496 4319
601 4214
400 4415
377 4438
164 4651




Processing vagrant.csv...
1 4048
0 4049
359 3690
702 3347
1026 3023
442 3607
363 3686
370 3679
501 3548
284 3765




Processing rubinius.csv...
930 3075
567 3438
434 3571
806 3199
352 3653
152 3853
193 3812
144 3861
187 3818
239 3766




Processing open-build-service.csv...
485 3233
356 3362
338 3380
402 3316
242 3476
240 3478
163 3555
583 3135
417 3301
491 3227




Processing gradle.csv...
401 3168
284 3285
576 2993
695 2874
686 2883
695 2874
171 3398
21 3548
19 3550
20 3549



In [10]:
for project in project_list:
    print('\n\n\n\nProcessing {}...'.format(project))
    
    best_diff = 100
    X = pd.read_csv('../../data/full_data/' + project)
    X['gh_build_started_at'] =  pd.to_datetime(X['gh_build_started_at'], format='%Y-%m-%d %H:%M:%S')
    
    start_date = X['gh_build_started_at'].tolist()[0]
    end_date = X['gh_build_started_at'].tolist()[-1]
    phase = (end_date - start_date)/10
    i = 1
    while start_date < end_date :
        phase_end = start_date + phase
        
        test_data = X.loc[ (X['gh_build_started_at'] >= start_date) & (X['gh_build_started_at'] < phase_end)]['tr_status'].tolist()
        train_data = X.loc[ (X['gh_build_started_at'] < start_date) | (X['gh_build_started_at'] >= phase_end)]['tr_status'].tolist()
        
        test_indexes = X.loc[ (X['gh_build_started_at'] > start_date) & (X['gh_build_started_at'] < phase_end)]['tr_build_id'].tolist()
        train_indexes = X.loc[ (X['gh_build_started_at'] < start_date) | (X['gh_build_started_at'] > phase_end)]['tr_build_id'].tolist()
        print(len(test_indexes), len(train_indexes))
        filename = '../../data/project_data_pickles/' + project + '_' + str(i) + '_indexes.pkl'
        with open(filename, 'wb') as save_file:
            pickle.dump(train_indexes, save_file)
            pickle.dump(test_indexes, save_file)
        i += 1
        
        '''train_ev = measure_balance(train_data)
        test_ev = measure_balance(test_data)
        
        diff = abs(train_ev[0] - test_ev[0])
        if diff < best_diff:
            best_diff = diff
            print(start_date)
            print(phase_end)
            print(diff)
            print(train_ev)
            print(test_ev)
            print('\n')
            
            test_indexes = X.loc[ (X['gh_build_started_at'] > start_date) & (X['gh_build_started_at'] < phase_end)]['tr_build_id'].tolist()
            train_indexes = X.loc[ (X['gh_build_started_at'] < start_date) | (X['gh_build_started_at'] > phase_end)]['tr_build_id'].tolist()
            
            print(len(test_indexes))
            print(len(train_indexes))
            
            filename = project + '_indexes.pkl'
            with open(filename, 'wb') as save_file:
                pickle.dump(train_indexes, save_file)
                pickle.dump(test_indexes, save_file)'''
            
        
        start_date = phase_end
        





Processing rails.csv...
1500 12632
1443 12690
1368 12765
1443 12690
1163 12970
1894 12239
1483 12650
1198 12935
1682 12451
957 13175




Processing jruby.csv...
540 7734
501 7774
706 7569
518 7757
566 7709
645 7630
1526 6749
1385 6890
766 7509
1120 7154




Processing metasploit-framework.csv...
665 6936
633 6969
670 6932
561 7041
780 6822
1286 6316
1278 6324
669 6933
588 7014
470 7131




Processing cloudify.csv...
696 4118
674 4141
520 4295
421 4394
464 4351
496 4319
601 4214
400 4415
377 4438
164 4650




Processing vagrant.csv...
0 4048
0 4049
359 3690
702 3347
1026 3023
442 3607
363 3686
370 3679
501 3548
284 3764




Processing rubinius.csv...
929 3075
567 3438
434 3571
806 3199
352 3653
152 3853
193 3812
144 3861
187 3818
239 3765




Processing open-build-service.csv...
484 3233
356 3362
338 3380
402 3316
242 3476
240 3478
163 3555
583 3135
417 3301
491 3226




Processing gradle.csv...
400 3168
284 3285
576 2993
695 2874
686 2883
695 2874
171 3398
21 3548
19 3550
20 3548



4047