In [10]:
import pandas as pd
import numpy as np
from numpy import argmax
from numpy import sqrt
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.utils import resample
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.model_selection import KFold
from matplotlib import pyplot
from statistics import median
import pickle
import csv
import warnings
import datetime
import multiprocess
warnings.filterwarnings("ignore")

In [11]:
# CSV file names (one per project) that the batching analysis iterates over.
project_list = [
    'heroku.csv',
    'rails.csv',
    'gradle.csv',
    'jruby.csv',
    'metasploit-framework.csv',
    'cloudify.csv',
    'vagrant.csv',
    'rubinius.csv',
    'open-build-service.csv',
    'sonarqube.csv',
    'loomio.csv',
    'fog.csv',
    'opal.csv',
    'cloud_controller_ng.csv',
    'puppet.csv',
    'concerto.csv',
    'sufia.csv',
    'geoserver.csv',
    'orbeon-forms.csv',
    'graylog2-server.csv',
]

In [12]:
def output_values(Y_data):
    """Encode build statuses as binary outcomes.

    Args:
        Y_data: iterable of status values.

    Returns:
        A new list with 1 for every entry equal to 'passed' and 0 for
        every other entry, in the same order as the input.
    """
    return [1 if status == 'passed' else 0 for status in Y_data]

In [13]:
# Module-level build counter shared through `global` statements:
# batch_bisect and batch_stop_4 increment it, calculate_batches resets it
# to 0 before each batch and reads it afterwards.
batch_total = 0

In [14]:
def batch_bisect(batch_results):
    """Count the builds executed when bisecting one batch.

    Each (sub-)batch that is run adds 1 to the module-level ``batch_total``
    counter. A (sub-)batch that contains a 0 and does not consist of exactly
    one entry is split in half, and both halves are processed the same way.

    Args:
        batch_results: sliceable sequence of outcomes; 0 marks a failure.

    Returns:
        None. The result is accumulated in the global ``batch_total``.
    """
    global batch_total

    # Explicit work stack in place of recursion; the visiting order does not
    # matter because every visited segment contributes exactly one build.
    pending = [batch_results]
    while pending:
        segment = pending.pop()
        batch_total += 1

        if len(segment) != 1 and 0 in segment:
            midpoint = len(segment) // 2
            pending.append(segment[midpoint:])
            pending.append(segment[:midpoint])

In [15]:
def batch_stop_4(batch_results):
    """Count the builds executed by bisection that stops at batches of <= 4.

    Running a (sub-)batch always adds 1 to the module-level ``batch_total``
    counter. What happens next depends on the batch:

    * more than 4 entries and contains a 0: the batch is split in half and
      both halves are processed recursively;
    * 2 to 4 entries and contains a 0: every build in the batch is rerun
      individually, adding ``len(batch_results)`` builds;
    * exactly 1 entry: nothing more is added, because the batch run already
      was the individual run (same convention as ``batch_bisect``).

    The individual-rerun cost is the real batch length rather than a
    hard-coded 4, so a short trailing batch (1-3 builds) is not charged for
    builds that do not exist.

    Args:
        batch_results: sliceable sequence of outcomes; 0 marks a failure.

    Returns:
        None. The result is accumulated in the global ``batch_total``.
    """
    global batch_total

    # Executing the batch itself costs one build.
    batch_total += 1

    size = len(batch_results)
    has_failure = 0 in batch_results

    if size <= 4:
        # Stop bisecting: rerun each build on its own, unless the batch
        # was already a single build.
        if has_failure and size > 1:
            batch_total += size
        return

    if has_failure:
        half_batch = size // 2
        batch_stop_4(batch_results[:half_batch])
        batch_stop_4(batch_results[half_batch:])

In [53]:
def calculate_batches(p):
    """Simulate the batching strategies on one project's build history.

    Reads ``'../data/exact_data/' + p``, converts its ``tr_status`` column to
    1 (``'passed'``) / 0 (anything else) with ``output_values``, and replays
    the builds in consecutive batches for each strategy / batch size pair:

    * ``BATCHBISECT`` - batch sizes 1, 2, 4, 8, 16; cost from ``batch_bisect``.
    * ``BATCH4``      - batch size 4 only; a failing batch is followed by an
      individual rerun of each of its builds.
    * ``BATCHSTOP4``  - batch sizes 8 and 16; cost from ``batch_stop_4``.

    The final batch of a project may be shorter than ``batchsize``. Its
    delays and its rerun cost are computed from its real length, so
    ``delay_list`` always holds exactly one entry per build.

    Args:
        p: CSV file name of the project, relative to ``../data/exact_data/``.

    Returns:
        A list of rows ``[p, algorithm, batchsize, builds_pct, delay_list,
        median_delay, total]`` where ``builds_pct`` is
        ``100 * num_builds / total`` and ``total`` is the number of builds.

    Raises:
        ValueError: if the file contains no builds (the percentage and the
            median would be undefined).
    """
    global batch_total

    batch_total = 0

    data = pd.read_csv('../data/exact_data/' + p)
    y_test = output_values(data['tr_status'].tolist())

    total = len(y_test)
    if total == 0:
        # Fail with a clear message instead of ZeroDivisionError further down.
        raise ValueError('no builds found in ' + p)

    lines = []

    algorithms = ['BATCHBISECT', 'BATCH4', 'BATCHSTOP4']
    batchsizelist = [1, 2, 4, 8, 16]
    for alg in algorithms:
        for batchsize in batchsizelist:

            # BATCH4 is defined for batches of exactly 4; BATCHSTOP4 only
            # differs from it for batches larger than 4.
            if alg == 'BATCH4' and batchsize != 4:
                continue
            if alg == 'BATCHSTOP4' and batchsize <= 4:
                continue

            num_builds = 0
            delay_list = []

            for start in range(0, total, batchsize):
                # Slicing clamps at the end of the list, so the last batch
                # is simply shorter when total is not a multiple of batchsize.
                batch = y_test[start:start + batchsize]
                size = len(batch)

                # Position k (0-based) in the batch is followed by
                # size - 1 - k further builds in the same batch.
                delay_list.extend(range(size - 1, -1, -1))

                if alg == 'BATCH4':
                    num_builds += 1
                    # If any build failed the whole batch fails and every
                    # build in it is rerun individually; a single-build
                    # batch needs no rerun.
                    if size > 1 and 0 in batch:
                        num_builds += size
                else:
                    # The helpers report their cost through the global
                    # counter, so reset it before every batch.
                    batch_total = 0
                    if alg == 'BATCHBISECT':
                        batch_bisect(batch)
                    else:
                        batch_stop_4(batch)
                    num_builds += batch_total

            lines.append([p, alg, batchsize, 100*num_builds/total, delay_list, median(delay_list), total])

    return lines

In [54]:
# One flat list of result rows, project after project in project_list order.
results = [
    row
    for project_file in project_list
    for row in calculate_batches(project_file)
]

In [56]:
# Tabulate the per-project rows; the column order mirrors the row layout
# produced by calculate_batches.
df = pd.DataFrame(
    results,
    columns=[
        'project',
        'algorithm',
        'batch_size',
        'builds_reqd',
        'delay_list',
        'median_delay',
        'testall_size',
    ],
)


In [57]:
# Persist the results table to gr_results.csv in the current working directory.
df.to_csv('gr_results.csv')

In [58]:
# Sanity check: number of project files listed in project_list.
print(len(project_list))

20
