In [1]:
import pandas as pd
import numpy as np
from numpy import argmax
from numpy import sqrt
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.utils import resample
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.model_selection import KFold
from matplotlib import pyplot
from statistics import median
import pickle
import csv
import warnings
import datetime
import multiprocess
warnings.filterwarnings("ignore")

In [2]:
# Project names; each one selects a per-project results CSV read further down.
projects = [
    'heroku',
    'cloud_controller_ng',
    'cloudify',
    'concerto',
    'fog',
    'geoserver',
    'gradle',
    'graylog2-server',
    'jruby',
    'loomio',
    'metasploit-framework',
    'opal',
    'open-build-service',
    'orbeon-forms',
    'puppet',
    'rails',
    'rubinius',
    'sonarqube',
    'sufia',
    'vagrant',
]


In [3]:
# Per-project results, keyed by project name.
sbs_lib = dict()

# NOTE(review): neither of the next two names is referenced in the cells
# visible here; presumably they parameterize a later batching analysis.
batch_sizes = [
    2,
    4,
    8,
    16,
]
batching_algs = [
    'BATCH4',
    'BATCHBISECT',
    'BATCHSTOP4',
]

In [4]:
def get_delay_from_ci(actual_results, ci):
    """Measure how long each skipped failing build stayed undetected.

    Parameters
    ----------
    actual_results : sequence
        Real outcome per build; a value of 0 marks a failing build.
    ci : sequence
        Decision per build; 0 means the build was executed, any other value
        means it was skipped.

    Returns
    -------
    list of int
        One entry per skipped failing build, in build order: the distance in
        builds to the next executed build, or to the end of the sequence
        (``len(ci)``) when no later build was executed.
    """
    total = len(ci)
    delays = []
    # Skipped failures seen since the last executed build; they are all
    # "discovered" together at the next executed build.
    pending = []

    for idx, decision in enumerate(ci):
        if decision == 0:
            delays.extend(idx - missed for missed in pending)
            pending.clear()
        elif actual_results[idx] == 0:
            pending.append(idx)

    # Failures never followed by an executed build are charged up to the end.
    delays.extend(total - missed for missed in pending)
    return delays

In [5]:
def get_sbs_delays(actual_results, pred_results):
    """Replay a build history under the skip-by-prediction strategy.

    The replay alternates between two states:

    * prediction state - a build predicted as 1 is skipped; any other
      prediction triggers a real build, and if that build actually failed
      (actual result 0) the replay switches to the determine state;
    * determine state - every build is executed until one actually passes
      (actual result 1), after which the replay returns to prediction state.

    Parameters
    ----------
    actual_results : sequence
        Real outcome per build, expected to be 0 (fail) or 1 (pass).
    pred_results : sequence
        Predicted outcome per build; 1 means "predicted to pass, skip it".
        Only consulted while in prediction state.

    Returns
    -------
    tuple
        ``(ci, delays)`` where ``ci`` holds one decision per build (0 = built,
        1 = skipped) and ``delays`` is ``get_delay_from_ci(actual_results, ci)``.
    """
    ci = []
    # True while we keep building after an observed failure.
    awaiting_pass = False

    for i, actual in enumerate(actual_results):
        if awaiting_pass:
            # Determine state: always build; leave only once a pass is seen.
            ci.append(0)
            awaiting_pass = actual != 1
        elif pred_results[i] == 1:
            # Prediction state, predicted to pass: skip this build.
            ci.append(1)
        else:
            # Prediction state, predicted to fail: build it, and keep
            # building afterwards if it really failed.
            ci.append(0)
            awaiting_pass = actual == 0

    return ci, get_delay_from_ci(actual_results, ci)

In [6]:
# Replay every project's recorded results and store its summary in sbs_lib.
for p in projects:
    filename = 'final_sbs_results/' + p + '_200_metrics.csv'
    sbs_data = pd.read_csv(filename)

    actual_results = sbs_data['Actual_Result'].tolist()
    pred_results = sbs_data['Build_Result'].tolist()
    ci, sbs_delay = get_sbs_delays(actual_results, pred_results)

    sbs_lib[p] = {
        # A [0] placeholder stands in when the delays sum to zero.
        'delay_list': sbs_delay if sum(sbs_delay) != 0 else [0],
        'testall_size': len(ci),
        # Percentage of builds that were actually executed (ci == 0).
        # NOTE(review): a CSV with no rows would divide by zero here.
        'builds_reqd': 100 * ci.count(0) / len(ci),
    }

In [7]:
# Flatten sbs_lib into rows: name, builds %, median delay, size, raw delays.
create_list = [
    [
        project,
        stats['builds_reqd'],
        median(stats['delay_list']),
        stats['testall_size'],
        stats['delay_list'],
    ]
    for project, stats in sbs_lib.items()
]


In [8]:
# Tabulate the rows; the column order mirrors the row layout of create_list.
df = pd.DataFrame(
    create_list,
    columns=[
        'project',
        'builds_reqd',
        'median_delay',
        'testall_size',
        'delay_list',
    ],
)

In [9]:
# Display the per-project summary table as this cell's output.
df

Unnamed: 0,project,builds_reqd,median_delay,testall_size,delay_list
0,heroku,0.0,29.5,61,"[54, 41, 18, 4]"
1,cloud_controller_ng,5.13834,15.0,253,"[6, 35, 23, 12, 19, 12, 8, 1, 6, 3, 1, 19, 12,..."
2,cloudify,0.409836,80.0,488,"[392, 387, 370, 341, 337, 314, 305, 140, 80, 7..."
3,concerto,1.034483,48.0,290,"[96, 93, 91, 65, 62, 51, 44, 41, 32, 30, 65, 4..."
4,fog,3.418803,16.0,351,"[49, 45, 17, 12, 10, 8, 5, 3, 1, 67, 62, 58, 5..."
5,geoserver,9.52381,8.0,168,"[6, 4, 11, 9, 1, 19, 15, 8, 4, 5, 3, 20, 10, 8..."
6,gradle,1.256732,34.0,557,"[65, 27, 18, 14, 67, 56, 35, 28, 19, 12, 9, 20..."
7,graylog2-server,0.498753,182.0,401,"[12, 48, 285, 266, 226, 182, 21]"
8,jruby,9.443099,9.5,413,"[9, 4, 2, 6, 5, 3, 7, 3, 31, 22, 18, 3, 3, 12,..."
9,loomio,3.902439,37.0,205,"[38, 95, 93, 82, 71, 63, 61, 56, 49, 43, 41, 3..."


In [10]:
# Persist the summary table to the working directory.
# NOTE(review): with to_csv's default index=True the row index is written as
# an unnamed first column; confirm downstream readers expect that before
# switching to index=False.
df.to_csv('single_version_sbs_results.csv')

In [11]:
# Inspect the raw per-project results dictionary (prints every delay list).
sbs_lib

{'heroku': {'delay_list': [54, 41, 18, 4],
  'testall_size': 61,
  'builds_reqd': 0.0},
 'cloud_controller_ng': {'delay_list': [6,
   35,
   23,
   12,
   19,
   12,
   8,
   1,
   6,
   3,
   1,
   19,
   12,
   9,
   7,
   4,
   1,
   42,
   15,
   63,
   54,
   48,
   43,
   39,
   31,
   27,
   24,
   18,
   16,
   12,
   6],
  'testall_size': 253,
  'builds_reqd': 5.138339920948616},
 'cloudify': {'delay_list': [392,
   387,
   370,
   341,
   337,
   314,
   305,
   140,
   80,
   78,
   45,
   35,
   52,
   43,
   38,
   36,
   13],
  'testall_size': 488,
  'builds_reqd': 0.4098360655737705},
 'concerto': {'delay_list': [96,
   93,
   91,
   65,
   62,
   51,
   44,
   41,
   32,
   30,
   65,
   48,
   28,
   1,
   6],
  'testall_size': 290,
  'builds_reqd': 1.0344827586206897},
 'fog': {'delay_list': [49,
   45,
   17,
   12,
   10,
   8,
   5,
   3,
   1,
   67,
   62,
   58,
   55,
   53,
   50,
   48,
   45,
   33,
   8,
   2,
   5,
   56,
   53,
   48,
   38,
   24,
   22,

In [12]:
# Toggle IPython pretty-printing. This is a toggle, not a setter, so
# re-running the cell flips the state back.
%pprint

Pretty printing has been turned OFF


In [13]:
# Show sbs_lib again, this time after the %pprint toggle in the previous cell.
sbs_lib

{'heroku': {'delay_list': [54, 41, 18, 4], 'testall_size': 61, 'builds_reqd': 0.0}, 'cloud_controller_ng': {'delay_list': [6, 35, 23, 12, 19, 12, 8, 1, 6, 3, 1, 19, 12, 9, 7, 4, 1, 42, 15, 63, 54, 48, 43, 39, 31, 27, 24, 18, 16, 12, 6], 'testall_size': 253, 'builds_reqd': 5.138339920948616}, 'cloudify': {'delay_list': [392, 387, 370, 341, 337, 314, 305, 140, 80, 78, 45, 35, 52, 43, 38, 36, 13], 'testall_size': 488, 'builds_reqd': 0.4098360655737705}, 'concerto': {'delay_list': [96, 93, 91, 65, 62, 51, 44, 41, 32, 30, 65, 48, 28, 1, 6], 'testall_size': 290, 'builds_reqd': 1.0344827586206897}, 'fog': {'delay_list': [49, 45, 17, 12, 10, 8, 5, 3, 1, 67, 62, 58, 55, 53, 50, 48, 45, 33, 8, 2, 5, 56, 53, 48, 38, 24, 22, 19, 16, 12, 54, 38, 36, 29, 13, 7, 1, 16, 10, 4, 12, 9, 7, 5, 2, 3, 1], 'testall_size': 351, 'builds_reqd': 3.4188034188034186}, 'geoserver': {'delay_list': [6, 4, 11, 9, 1, 19, 15, 8, 4, 5, 3, 20, 10, 8, 16, 13, 9, 7, 5, 10, 8, 6, 4, 2, 6, 4, 2, 1, 38, 35, 28, 26, 24, 20, 7, 