In [87]:
import pandas as pd
from statistics import mean, median
import sys
import pprint
import pickle

In [88]:
# Cache of per-project build outcomes, keyed by project name. It is filled
# lazily by start_result_collection with the 'tr_build_id' / 'tr_status'
# columns of each project's CSV, so every project file is read at most once.
project_ytest_lib = {}

In [89]:
def output_values(Y_data):
    """Encode build outcomes as integers.

    Args:
        Y_data: iterable of outcome labels.

    Returns:
        A list with 1 for every label equal to 'passed' and 0 for any
        other label, in the same order as the input.
    """
    return [1 if label == 'passed' else 0 for label in Y_data]

In [90]:
def get_first_failures(df):
    """Keep only the first failure of every run of consecutive failures.

    A row is discarded when its 'tr_status' equals 0 and the row directly
    before it also has 'tr_status' equal to 0. The first row is always kept.

    Args:
        df: DataFrame with a 'tr_status' column.

    Returns:
        A new DataFrame containing the kept rows with their original index
        and columns. The input frame is left untouched (no helper column is
        added to it).
    """
    results = df['tr_status'].tolist()
    if not results:
        # An empty frame has no first row to seed the comparison with.
        return df.copy()

    # keep[k] is False exactly when row k and row k-1 are both failures.
    keep = [True] + [
        not (cur == 0 and prev == 0)
        for prev, cur in zip(results, results[1:])
    ]
    return df.loc[keep]

In [91]:
def str_to_list(s):
    """Parse the string form of an integer list, e.g. '[1, 0, 1]'.

    Args:
        s: text consisting of a bracketed, comma-separated list of integers.

    Returns:
        The parsed list of ints. The empty list literal '[]' is mapped to
        [0] rather than to an empty list.

    Raises:
        ValueError: if an element between the brackets is not an integer.
    """
    if s == '[]':
        return [0]
    # Split on the bare comma: int() ignores surrounding whitespace, so both
    # '[1, 2]' and '[1,2]' are accepted.
    return [int(item) for item in s[1:-1].split(',')]

In [92]:
def separate_versions(results):
    """Split a results frame into one sub-frame per version number.

    Args:
        results: DataFrame with a 'version' column.

    Returns:
        A list of ten DataFrames; the element at position k holds the rows
        whose 'version' equals k + 1 (versions 1 through 10). Versions with
        no rows yield an empty frame.
    """
    return [results[results['version'] == number] for number in range(1, 11)]

In [93]:
def get_project_delays(ci, y_test, batch_size):
    """Compute, for each non-passing build, how long its detection is delayed.

    The function walks `ci` position by position (presumably 0 marks a build
    that was run and 1 a build that was skipped -- confirm against the
    producer of the 'ci' column):

    * At a position where `ci` is 0, a non-passing build contributes a delay
      of 0, and every pending non-passing position recorded so far is
      flushed with a delay equal to the distance to the current position.
    * At a position where `ci` is 1, a non-passing build is recorded as
      pending.
    * Positions still pending after the last entry are flushed with the
      distance to len(ci).

    Args:
        ci: list of 0/1 flags, one per build.
        y_test: outcome labels aligned with `ci`; anything other than
            'passed' counts as non-passing (see output_values).
        batch_size: accepted for call compatibility; it does not influence
            the returned delays.

    Returns:
        List of integer delays in the order they were emitted. Pending
        positions are flushed most-recent first.
    """
    y_test = output_values(y_test)

    sbs_list = []
    missed = []

    for i, flag in enumerate(ci):
        if flag == 0:
            if y_test[i] == 0:
                sbs_list.append(0)
            # Flush pending positions, most recent first.
            while missed:
                sbs_list.append(i - missed.pop())
        elif flag == 1:
            if y_test[i] == 0:
                missed.append(i)

    # Anything never followed by a 0-flag is measured against the end.
    end = len(ci)
    while missed:
        sbs_list.append(end - missed.pop())

    return sbs_list
    

In [94]:
def _load_test_build_ids(project, version):
    """Return the test-split build ids pickled for one project version."""
    index_file = '../../data/project_data_pickles/' + project + '.csv_' + str(version) + '_indexes.pkl'
    # SECURITY NOTE: pickle.load can execute arbitrary code from the file;
    # only point this at index files produced by this project's own pipeline.
    with open(index_file, 'rb') as infile:
        # The train indexes are stored first; they are read only to advance
        # the stream to the test indexes.
        pickle.load(infile)
        return pickle.load(infile)


def _version_test_statuses(y_test, project, version):
    """Return the 'tr_status' values of the builds in a version's test split."""
    test_indexes = _load_test_build_ids(project, version)
    ver_xtest = y_test[y_test['tr_build_id'].isin(test_indexes)]
    return ver_xtest['tr_status'].tolist()


def start_result_collection(filename):
    """Merge the per-version result rows of every project into one row set.

    For each project in the results CSV, the rows of its earliest non-empty
    version (versions 1-10) are used as the base. Every base row is combined
    with the matching row (same 'algorithm', 'batch_size' and 'confidence')
    of each later version:

    * 'project_reqd_builds', 'project_missed_builds' and
      'project_saved_builds' become averages weighted by 'testall_size';
    * 'testall_size' becomes the sum of the sizes;
    * 'batch_median', 'ci' and 'project_delays' become the concatenation of
      the per-version lists.

    Build outcomes are read from '../../data/full_data/<project>.csv' and
    cached in the module-level `project_ytest_lib`; test-split build ids are
    read from the pickles under '../../data/project_data_pickles/'.

    Args:
        filename: path of the results CSV to process.

    Returns:
        DataFrame with the merged base rows of all projects (original row
        index preserved), or an empty DataFrame when there is nothing to
        merge.
    """
    results = pd.read_csv(filename)

    weighted_columns = (
        'project_reqd_builds',
        'project_missed_builds',
        'project_saved_builds',
    )
    project_frames = []

    for p in set(results['project'].tolist()):

        #getting project data
        p_data = results[results['project'] == p]

        #splitting data into versions
        versions = separate_versions(p_data)

        # The earliest version that has rows is the base frame. Copy it so
        # the row write-back below does not act on a slice of `results`.
        pframe = pd.DataFrame()
        for start in range(0, 10):
            if len(versions[start]) > 0:
                pframe = versions[start].copy()
                break

        if pframe.empty:
            # No rows in versions 1-10: nothing to merge for this project.
            continue

        # Build outcomes are the same for every row of the project, so load
        # them once (and cache them across calls).
        if p in project_ytest_lib:
            y_test = project_ytest_lib[p]
        else:
            test_file = '../../data/full_data/' + p + '.csv'
            y_test = pd.read_csv(test_file, usecols=['tr_build_id', 'tr_status'])
            project_ytest_lib[p] = y_test

        #starting with the first version's project frame
        for x in range(len(pframe)):
            row = pframe.iloc[x].copy()

            alg = row['algorithm']
            b = row['batch_size']
            conf = row['confidence']

            ver_ytest = _version_test_statuses(y_test, p, row['version'])

            final_batch_median = str_to_list(row['batch_median'])
            final_ci = str_to_list(row['ci'])
            final_proj_delays = get_project_delays(final_ci, ver_ytest, b)

            #appending other frames to outer frame
            for i in range(start + 1, 10):
                next_ver = versions[i]

                #extracting corresponding outer row for each version
                new_df = next_ver[
                    (next_ver['algorithm'] == alg)
                    & (next_ver['batch_size'] == b)
                    & (next_ver['confidence'] == conf)
                ]
                if len(new_df) == 0:
                    continue
                new_row = new_df.iloc[0]

                # Size-weighted average of the rate columns; 'testall_size'
                # is updated afterwards because the averages use the old
                # value as the weight.
                total_size = row['testall_size'] + new_row['testall_size']
                for column in weighted_columns:
                    weighted_total = (row[column]*row['testall_size']) + (new_row[column]*new_row['testall_size'])
                    row[column] = weighted_total/total_size
                row['testall_size'] = total_size

                ver_ytest = _version_test_statuses(y_test, p, new_row['version'])

                new_ci = str_to_list(new_row['ci'])
                final_proj_delays.extend(get_project_delays(new_ci, ver_ytest, b))
                final_batch_median.extend(str_to_list(new_row['batch_median']))
                final_ci.extend(new_ci)

                # NOTE(review): these three fields are only written when at
                # least one later version matches; a row without a match
                # keeps its original string values and the delays computed
                # above are dropped. Confirm whether that is intended.
                row['project_delays'] = final_proj_delays
                row['batch_median'] = final_batch_median
                row['ci'] = final_ci

            pframe.iloc[x] = row
        project_frames.append(pframe)

    if not project_frames:
        return pd.DataFrame()
    # DataFrame.append was removed in pandas 2.0; concatenate once instead.
    return pd.concat(project_frames)

In [95]:
# Result files available under file_root; only the first two are combined
# below (see the slice in the loop).
filenames = ['../final_full_results.csv', '../heroku_results.csv','cvh_models.csv', 'lfs.csv', 'results1.csv', 'results2.csv', 'gradle_models.csv', 'rubinius_models.csv']
file_root = '../version_results/'
dfs = []

for file in filenames[:2]:
    dfs.append(start_result_collection(file_root+file))

#combining dfs
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in
# the same order and keeps their original row indexes.
final_df = pd.concat(dfs)

final_df.to_csv('combined_results.csv')

In [96]:
# Keep only the columns whose name does not start with 'Unnamed'
# (presumably index columns left over from earlier CSV round-trips -- confirm).
final_df = final_df.loc[:, ~final_df.columns.str.contains('^Unnamed')]

In [97]:
# Overwrite combined_results.csv with the current contents of final_df.
final_df.to_csv('combined_results.csv')

In [98]:
# `short` is bound to the same object as final_df -- no copy is made, so
# in-place changes made through either name are visible through both.
short = final_df

In [99]:
# Remove the 'ci' column in place. NOTE: `short` refers to the same object as
# final_df, so the column disappears from final_df as well.
short.drop('ci', inplace=True, axis=1)

In [100]:
# Write the frame without the 'ci' column. Going through `short` (currently
# the same object as final_df) keeps this file correct even if `short` is
# later turned into an independent copy.
short.to_csv('short_combined_results.csv')