In [23]:
import pandas as pd
import numpy as np
from numpy import argmax
from numpy import sqrt
import math
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
from sklearn.utils import resample
from sklearn.metrics import roc_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from matplotlib import pyplot
from statistics import median
import pickle
import csv
import warnings
import datetime
import multiprocess
warnings.filterwarnings("ignore")

In [2]:
project_list = ['geoserver', 'gradle', 'cloud_controller_ng', 'opal', 'jruby', 'cloudify', 'chef', 'orbeon-forms', 'vagrant']

In [3]:
def output_values(Y_data):
    Y_t = []
    for e in Y_data:
        if e == 'passed':
            Y_t.append(1)
        else:
            Y_t.append(0) 
    return Y_t

In [4]:
def get_pass_streak(y_project):
    p = y_project[0]
    pass_streak = [y_project[0]]
    for i in range(1, len(y_project)):
        pass_streak.append(p)
        if y_project[i] == 1:
            p += 1
        else:
            p = 0
    return pass_streak

In [5]:
def get_first_failures(df):
    
    results = df['tr_status'].tolist()
    length = len(results)
    verdict = ['keep']
    prev = results[0]
    
    for i in range(1, length):
        if results[i] == 0:
            if prev == 0:
                verdict.append('discard')
                #print(i+1)
            else:
                verdict.append('keep')
        else:
            verdict.append('keep')
        prev = results[i]
    
    df['verdict'] = verdict
    df = df[ df['verdict'] == 'keep' ]
    df.drop('verdict', inplace=True, axis=1)
    return df

In [6]:
def get_complete_data(p_name):
    
    #open the metrics file
    filename = 'metrics_data/' + p_name + '_metrics.csv'
    project = pd.read_csv(filename)
    
    #clean the data & remove correlated columns
    project = project [ project['developer_experience'] >= 0]
    project.drop('num_commits', inplace=True, axis=1)
    project.drop('reviewer_experience', inplace=True, axis=1)
    project.drop('num_of_reviewers', inplace=True, axis=1)
    
    build_ids = project['tr_build_id'].tolist()
    #get results data
    res_file = '../data/' + p_name + '.csv'
    res_project = pd.read_csv(res_file, usecols = ['tr_build_id', 'gh_build_started_at', 'tr_status'])
    res_project['gh_build_started_at'] =  pd.to_datetime(res_project['gh_build_started_at'], format='%Y-%m-%d %H:%M:%S')
    y_project = res_project[res_project['tr_build_id'].isin(build_ids)]['tr_status'].tolist()
    y_project = output_values(y_project)
    
    #append date of build
    project_dates = res_project[res_project['tr_build_id'].isin(build_ids)]['gh_build_started_at'].tolist()
    project['gh_build_started_at'] = project_dates
    
    #add results column to the dataframe
    project['tr_status'] = y_project
    
    return project

In [7]:
def get_start_end_date(project):
    dates = project['gh_build_started_at'].tolist()
    
    start_date = dates[0] - datetime.timedelta(days = 1)
    end_date = dates[-1] - datetime.timedelta(days = 1)
    
    return start_date, end_date

In [8]:
def get_required_data(p_name, build_ids):
    
    res_file = '../data/' + p_name + '.csv'
    res_project = pd.read_csv(res_file, usecols = ['tr_build_id', 'tr_duration'])
    durations = res_project[res_project['tr_build_id'].isin(build_ids)]['tr_duration'].tolist()
    return durations

In [15]:
def compute_performance(p_name, test_builds, test_result, pred_result):
    
    
    
    durations = get_required_data(p_name, test_builds)
    actual_duration = sum(durations)
    actual_failures = test_result.count(0)
    
    total_builds = len(test_builds)
    num_of_builds = 0
    total_duration = 0
    cbf = 0
    saved_builds = 0
    
    batch = []
    batch_duration = []
    actual_results = []
    max_batch_size = 4
    
    for i in range(len(pred_result)):
        if pred_result[i] == 0:
            
            if test_result[i] == 0:
                cbf += 1
                
            if len(batch) < max_batch_size:
                batch.append(pred_result[i])
                batch_duration.append(durations[i])
                actual_results.append(test_result[i])
            
            if len(batch) == max_batch_size:
                num_of_builds += 1
                total_duration += max(batch_duration)
                
                if 0 in actual_results:
                    num_of_builds += 4
                    total_duration += sum(batch_duration)
        else:
            saved_builds += 1
            
    if len(batch) > 0:
        num_of_builds += 1
        total_duration += max(batch_duration)
        
        if 0 in actual_results:
            num_of_builds += len(batch)
            total_duration += sum(batch_duration)
                    
    #Delay computation
    flag = 0
    count = 0
    delay = []
    for i in range(len(pred_result)):
        if flag == 1:
            if pred_result[i] == 1:
                count += 1
            
            if pred_result[i] == 0:
                delay.append(count)
                count = 0
                flag = 0
                
        if test_result[i] != 1:
            if pred_result[i] == 1:
                flag = 1
    delay.append(count)

    
    try:
        
        time_saved = 100*total_duration/actual_duration
        builds_saved = 100*saved_builds/total_builds
        reqd_builds = 100*num_of_builds/total_builds
        failed = 100*cbf/actual_failures
        median_delays = median(delay)
        total_delays = sum(delay)
    
#         print("===========================================")
#         print('The performance of the model is as follows:')
#         print('\t Time Reqd : {}'.format(total_duration))
#         print('\t % Time Reqd : {}%'.format(time_saved))
#         print('\t Num. Builds saved : {}%'.format(saved_builds))
#         print('\t % Builds saved : {}%'.format(builds_saved))
#         print('\t Num. Builds required : {}'.format(num_of_builds))
#         print('\t % Builds required : {}%'.format(reqd_builds))
#         print('\t Num. Failed Builds Identified : {}'.format(cbf))
#         print('\t % Failed Builds Identified : {}%'.format(failed))
#         print('\t Median Delay Induced : {} builds'.format(median_delays))
#         print('\t Total Delay Induced: {} builds'.format(total_delays))
#         print('\t Total number of builds: {}'.format(total_builds))
#         print('\t Total number of failed builds: {}'.format(actual_failures))
#         print('\t Total Duration: {}'.format(actual_duration))
#         print("===========================================")
        
    except:
        
#         print('exception')
        return (0, 0, 0, 0, 0, 0)
    
    return (time_saved, builds_saved, reqd_builds, failed, median_delays, total_delays)
    
    

In [26]:
def bootstrapping(p_name):
    performances = {'time_saved':[], 'builds_saved':[], 'builds_reqd':[], 'failed_builds':[], 'total_delay':[], 'median_delay':[]}
    print('Processing {}'.format(p_name))
    
    project = get_complete_data(p_name)
    start_date, end_date = get_start_end_date(project)
    
    n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
    max_depth = [int(x) for x in np.linspace(10, 110, num = 5)]
    
    param_grid = {'n_estimators': n_estimators, 'max_depth': max_depth}
    forest = RandomForestClassifier()
    grid_search = GridSearchCV(estimator = forest, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 0)
    
    phase = 1

    while start_date < end_date:
        
        train_period = 100
        test_period = 10
        
        while True:
            train_end = start_date + datetime.timedelta(days = train_period + 1)
            test_start = start_date + datetime.timedelta(days = train_period)
            test_end = test_start + datetime.timedelta(days = test_period)

            #getting data of train & test phase wise
            train_data = project[ (project['gh_build_started_at'] > start_date) & (project['gh_build_started_at'] < train_end)]
            test_data = project[ (project['gh_build_started_at'] > test_start) & (project['gh_build_started_at'] < test_end)]

            #getting 'y' data
            train_result = train_data['tr_status'].tolist()
            test_result = test_data['tr_status'].tolist()
            
            if len(train_result) > 100 and len(test_result) > 10 :
                break
            
            if test_end > end_date:
                break
                
            if len(train_result) <= 100:
                train_period += 20
            
            if len(test_result) <= 10:
                test_period += 20
                
            
        
        #dropping build start time column
        train_data.drop('gh_build_started_at', inplace=True, axis=1)
        test_data.drop('gh_build_started_at', inplace=True, axis=1)
        
        #add pass_streak to training data:
        train_data['num_of_passes'] = get_pass_streak(train_result)
        
        best_n_estimators = []
        best_max_depth = []
        
        best_f1 = 0
        best_f1_sample = 0
        best_f1_sample_result = 0
        best_f1_estimator = 0
        best_thresholds = []
        
#         train_result = train_data['tr_status']
        
#         train_data.drop('tr_status', inplace=True, axis=1)
#         train_data.drop('tr_status', inplace=True, axis=1)
        
#         test_data.drop('tr_build_id', inplace=True, axis=1)
#         test_data.drop('tr_build_id', inplace=True, axis=1)
        
#         grid_search.fit(train_data, train_result)
        
        #bootstrap 10 times
        for i in range(2):
            
            file_name = 'rq2_' + p_name + '_' + str(phase) + '_model_' + str(i+1) + '_model.pkl'
            
            while True:
                sample_train = resample(train_data, replace=True, n_samples=len(train_data))
                sample_train_result = sample_train['tr_status']

                build_ids = sample_train['tr_build_id'].tolist()
                sample_test = train_data [~train_data['tr_build_id'].isin(build_ids)] 
                sample_test_result = sample_test['tr_status']
                
                if len(sample_test_result) != 0:
                    break
            
            #dropping result column and build ids column
            sample_train.drop('tr_status', inplace=True, axis=1)
            sample_train.drop('tr_build_id', inplace=True, axis=1)
            sample_test.drop('tr_status', inplace=True, axis=1)
            sample_test.drop('tr_build_id', inplace=True, axis=1)
            
            #training
            grid_search.fit(sample_train, sample_train_result)
            sample_pred_vals = grid_search.predict_proba(sample_test)
            
#             print(sample_pred_vals)
            pred_vals = sample_pred_vals[:, 1]
            fpr, tpr, t = roc_curve(sample_test_result, pred_vals)
            gmeans = sqrt(tpr * (1-fpr))
            ix = argmax(gmeans)
            bt = t[ix]
            best_thresholds.append(bt)
            
            final_pred_result = []
            #threshold setting
            for j in range(len(pred_vals)):
                if pred_vals[j] > bt:
                    final_pred_result.append(1)
                else:
                    final_pred_result.append(0)
            
            try:
                accuracy = accuracy_score(sample_test_result, final_pred_result)
                precision = precision_score(sample_test_result, final_pred_result)
                recall = recall_score(sample_test_result, final_pred_result)
                confusion = confusion_matrix(sample_test_result, final_pred_result)
                auc_score = roc_auc_score(sample_test_result, final_pred_result)
                f1 = f1_score(sample_test_result, final_pred_result)
            except:
                print('')
    
            if f1 > best_f1:
                best_f1 = f1
                best_f1_sample = sample_train
                best_f1_sample_result = sample_train_result
                best_f1_estimator = grid_search.best_estimator_

            print(precision, recall, accuracy, f1, auc_score)
            best_n_estimators.append(grid_search.best_params_['n_estimators'])
            best_max_depth.append(grid_search.best_params_['max_depth'])
        
        #completed with bootstrapping 
        threshold = median(best_thresholds)
        n_estimator = median(best_n_estimators)
        max_depth = median(best_max_depth)
        #retrain on the best 
        forest = RandomForestClassifier(n_estimators=int(n_estimator), max_depth=int(max_depth))
        forest.fit(best_f1_sample, best_f1_sample_result)
        
        test_builds = test_data['tr_build_id'].tolist()
        test_data.drop('tr_build_id', inplace=True, axis=1)
        test_data.drop('tr_status', inplace=True, axis=1)
        
        final_pred_result = []
        queue = 0
        i = 0
        total = len(test_data)
        while i < total :
            data = test_data.iloc[i]
            data['num_of_passes'] = queue
            predict = forest.predict_proba([data])
            if predict[0][1] > threshold:
                final_pred_result.append(1)
                queue += 1
                i+=1
            else:
                final_pred_result.append(0)
                queue = 0
                i += 1
                
                
        print('Individual testing for {}....'.format(p_name))
        
        try:
            accuracy = accuracy_score(test_result, final_pred_result)
            precision = precision_score(test_result, final_pred_result)
            recall = recall_score(test_result, final_pred_result)
            confusion = confusion_matrix(test_result, final_pred_result)
            auc_score = roc_auc_score(test_result, final_pred_result)
            f1 = f1_score(test_result, final_pred_result)

#             print(precision, recall, accuracy, f1, auc_score)
#             print(confusion)
            
        except:
            print('')
        
        batch_performance = compute_performance(p_name, test_builds, test_result, final_pred_result)
        
        performances['time_saved'].append(batch_performance[0])
        performances['builds_saved'].append(batch_performance[1])
        performances['builds_reqd'].append(batch_performance[2])
        performances['failed_builds'].append(batch_performance[3])
        performances['median_delay'].append(batch_performance[4])
        performances['total_delay'].append(batch_performance[5])
        
        start_date = test_end
        phase += 1
    
    #Project Performance:
    
    print("Average Time Saved in {} = {}".format(p_name, sum(performances['time_saved'])/len(performances['time_saved'])))
    print("Average Builds Saved in {} = {}".format(p_name, sum(performances['builds_saved'])/len(performances['builds_saved'])))
    print("Average Builds Reqd in {} = {}".format(p_name, sum(performances['builds_reqd'])/len(performances['builds_reqd'])))
    print("Average Failed Identified in {} = {}".format(p_name, sum(performances['failed_builds'])/len(performances['failed_builds'])))
    print("Average Median Delay in {} = {}".format(p_name, sum(performances['median_delay'])/len(performances['median_delay'])))
    print("Average Total Delay in {} = {}".format(p_name, sum(performances['total_delay'])/len(performances['total_delay'])))
    
    print('\n\n\n\n\n')

In [27]:
jobs = []
for p_name in project_list:
    
    q = multiprocess.Process(target=bootstrapping, args=(p_name,))
    jobs.append(q)
    q.start()

for j in jobs:
    j.join()

Processing geoserver
Processing gradle
Processing cloud_controller_ng
Processing opalProcessing jruby

Processing cloudifyProcessing chef

Processing orbeon-forms
Processing vagrant
0.9047619047619048 0.9743589743589743 0.8863636363636364 0.9382716049382716 0.5871794871794871
0.9736842105263158 0.9736842105263158 0.95 0.9736842105263158 0.736842105263158
0.9310344827586207 0.7941176470588235 0.7567567567567568 0.8571428571428571 0.5637254901960784
0.8846153846153846 0.7796610169491526 0.7361111111111112 0.8288288288288288 0.6590612777053455
0.9318181818181818 0.7454545454545455 0.7638888888888888 0.8282828282828283 0.7844919786096257
0.7777777777777778 0.5526315789473685 0.6805555555555556 0.6461538461538462 0.6880804953560372
0.9222222222222223 0.8383838383838383 0.8067226890756303 0.8783068783068783 0.7441919191919192
0.9791666666666666 0.30128205128205127 0.3167701863354037 0.46078431372549017 0.5506410256410257
0.8636363636363636 0.8636363636363636 0.8392857142857143 0.863636363636

Process Process-18:
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/new_env/lib/python3.9/site-packages/multiprocess/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/homebrew/Caskroom/miniforge/base/envs/new_env/lib/python3.9/site-packages/multiprocess/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/09/2vzp5xpd2ldd4ggk4xzjt1qr0000gn/T/ipykernel_37385/690140083.py", line 102, in bootstrapping
    pred_vals = sample_pred_vals[:, 1]
IndexError: index 1 is out of bounds for axis 1 with size 1


0.8055555555555556 0.43283582089552236 0.47674418604651164 0.5631067961165048 0.5322073841319717
0.7619047619047619 0.6857142857142857 0.6442307692307693 0.7218045112781954 0.6222689075630252
0.19230769230769232 0.8333333333333334 0.6666666666666666 0.3125 0.7416666666666668
0.9679144385026738 0.9526315789473684 0.9253731343283582 0.9602122015915119 0.7035885167464115
0.0 0.0 0.023255813953488372 0.0 0.5


Process Process-16:
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/new_env/lib/python3.9/site-packages/multiprocess/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/homebrew/Caskroom/miniforge/base/envs/new_env/lib/python3.9/site-packages/multiprocess/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/09/2vzp5xpd2ldd4ggk4xzjt1qr0000gn/T/ipykernel_37385/690140083.py", line 143, in bootstrapping
    forest.fit(best_f1_sample, best_f1_sample_result)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/new_env/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 327, in fit
    X, y = self._validate_data(
  File "/opt/homebrew/Caskroom/miniforge/base/envs/new_env/lib/python3.9/site-packages/sklearn/base.py", line 576, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/new_env/lib/python3.9/site-packages/sklearn/utils/valid

0.9446808510638298 0.925 0.8938356164383562 0.9347368421052632 0.8375
0.8888888888888888 0.9411764705882353 0.9285714285714286 0.9142857142857143 0.9305882352941176
Individual testing for orbeon-forms....
The performance of the model is as follows:
	 Time Reqd : 0
	 % Time Reqd : 0.0%
	 Num. Builds saved : 53%
	 % Builds saved : 100.0%
	 Num. Builds required : 0
	 % Builds required : 0.0%
	 Num. Failed Builds Identified : 0
	 % Failed Builds Identified : 0.0%
	 Median Delay Induced : 14 builds
	 Total Delay Induced: 14 builds
	 Total number of builds: 53
	 Total number of failed builds: 3
	 Total Duration: 36434
0.96 0.42857142857142855 0.472 0.5925925925925924 0.6373626373626373
Individual testing for opal....
The performance of the model is as follows:
	 Time Reqd : 5200
	 % Time Reqd : 165.4996817313813%
	 Num. Builds saved : 2%
	 % Builds saved : 8.333333333333334%
	 Num. Builds required : 20
	 % Builds required : 83.33333333333333%
	 Num. Failed Builds Identified : 1
	 % Failed Bu

0.8222222222222222 0.5606060606060606 0.5697674418604651 0.6666666666666666 0.5803030303030303
Individual testing for cloudify....
The performance of the model is as follows:
	 Time Reqd : 99799
	 % Time Reqd : 119.69895052473763%
	 Num. Builds saved : 0%
	 % Builds saved : 0.0%
	 Num. Builds required : 53
	 % Builds required : 96.36363636363636%
	 Num. Failed Builds Identified : 46
	 % Failed Builds Identified : 100.0%
	 Median Delay Induced : 0 builds
	 Total Delay Induced: 0 builds
	 Total number of builds: 55
	 Total number of failed builds: 46
	 Total Duration: 83375
0.6527777777777778 0.7121212121212122 0.6271186440677966 0.6811594202898551 0.6156759906759907
1.0 0.9787234042553191 0.9791666666666666 0.989247311827957 0.9893617021276595
Individual testing for orbeon-forms....

exception
0.34210526315789475 0.5909090909090909 0.6263736263736264 0.43333333333333335 0.6142951251646903
0.9647887323943662 0.6372093023255814 0.6422413793103449 0.7675070028011204 0.6715458276333789
0.90

Process Process-14:
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/new_env/lib/python3.9/site-packages/multiprocess/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/homebrew/Caskroom/miniforge/base/envs/new_env/lib/python3.9/site-packages/multiprocess/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/09/2vzp5xpd2ldd4ggk4xzjt1qr0000gn/T/ipykernel_37385/690140083.py", line 102, in bootstrapping
    pred_vals = sample_pred_vals[:, 1]
IndexError: index 1 is out of bounds for axis 1 with size 1


Individual testing for opal....
The performance of the model is as follows:
	 Time Reqd : 22154
	 % Time Reqd : 104.48028673835125%
	 Num. Builds saved : 0%
	 % Builds saved : 0.0%
	 Num. Builds required : 38
	 % Builds required : 95.0%
	 Num. Failed Builds Identified : 1
	 % Failed Builds Identified : 100.0%
	 Median Delay Induced : 0 builds
	 Total Delay Induced: 0 builds
	 Total number of builds: 40
	 Total number of failed builds: 1
	 Total Duration: 21204
0.74 0.74 0.6486486486486487 0.74 0.5991666666666667
0.9387755102040817 0.6216216216216216 0.6352941176470588 0.7479674796747967 0.6744471744471744
Individual testing for geoserver....
The performance of the model is as follows:
	 Time Reqd : 0
	 % Time Reqd : 0.0%
	 Num. Builds saved : 14%
	 % Builds saved : 100.0%
	 Num. Builds required : 0
	 % Builds required : 0.0%
	 Num. Failed Builds Identified : 0
	 % Failed Builds Identified : 0.0%
	 Median Delay Induced : 5 builds
	 Total Delay Induced: 5 builds
	 Total number of builds:

Process Process-13:
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/new_env/lib/python3.9/site-packages/multiprocess/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/homebrew/Caskroom/miniforge/base/envs/new_env/lib/python3.9/site-packages/multiprocess/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/09/2vzp5xpd2ldd4ggk4xzjt1qr0000gn/T/ipykernel_37385/690140083.py", line 143, in bootstrapping
    forest.fit(best_f1_sample, best_f1_sample_result)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/new_env/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 327, in fit
    X, y = self._validate_data(
  File "/opt/homebrew/Caskroom/miniforge/base/envs/new_env/lib/python3.9/site-packages/sklearn/base.py", line 576, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/new_env/lib/python3.9/site-packages/sklearn/utils/valid

0.9545454545454546 0.2916666666666667 0.35 0.4468085106382979 0.5833333333333334
0.8627450980392157 0.4835164835164835 0.5 0.6197183098591549 0.5358758888170653

0.0 0.0 1.0 0.0 0.5
Individual testing for cloud_controller_ng....
The performance of the model is as follows:
	 Time Reqd : 12680
	 % Time Reqd : 21.680402147522486%
	 Num. Builds saved : 15%
	 % Builds saved : 71.42857142857143%
	 Num. Builds required : 4
	 % Builds required : 19.047619047619047%
	 Num. Failed Builds Identified : 0
	 % Failed Builds Identified : 0.0%
	 Median Delay Induced : 1.0 builds
	 Total Delay Induced: 2 builds
	 Total number of builds: 21
	 Total number of failed builds: 2
	 Total Duration: 58486
Individual testing for geoserver....
The performance of the model is as follows:
	 Time Reqd : 135792
	 % Time Reqd : 208.76302924084493%
	 Num. Builds saved : 4%
	 % Builds saved : 9.523809523809524%
	 Num. Builds required : 36
	 % Builds required : 85.71428571428571%
	 Num. Failed Builds Identified : 32
	 %

Process Process-17:
Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/new_env/lib/python3.9/site-packages/multiprocess/process.py", line 315, in _bootstrap
    self.run()
  File "/opt/homebrew/Caskroom/miniforge/base/envs/new_env/lib/python3.9/site-packages/multiprocess/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/var/folders/09/2vzp5xpd2ldd4ggk4xzjt1qr0000gn/T/ipykernel_37385/690140083.py", line 143, in bootstrapping
    forest.fit(best_f1_sample, best_f1_sample_result)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/new_env/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 327, in fit
    X, y = self._validate_data(
  File "/opt/homebrew/Caskroom/miniforge/base/envs/new_env/lib/python3.9/site-packages/sklearn/base.py", line 576, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/new_env/lib/python3.9/site-packages/sklearn/utils/valid

0.8888888888888888 0.7058823529411765 0.675 0.7868852459016393 0.6029411764705883
0.9032258064516129 0.8 0.8412698412698413 0.8484848484848486 0.8464285714285714
0.9375 0.42857142857142855 0.46153846153846156 0.588235294117647 0.5892857142857143
Individual testing for cloud_controller_ng....

exception
Average Time Saved in cloud_controller_ng = 156.72074122326728
Average Builds Saved in cloud_controller_ng = 29.87286497090419
Average Builds Reqd in cloud_controller_ng = 154.0837883975139
Average Failed Identified in cloud_controller_ng = 50.264550264550266
Average Median Delay in cloud_controller_ng = 0.7222222222222222
Average Total Delay in cloud_controller_ng = 1.7777777777777777






0.8918918918918919 0.8048780487804879 0.8064516129032258 0.8461538461538461 0.8072009291521487
Individual testing for geoserver....
The performance of the model is as follows:
	 Time Reqd : 8477
	 % Time Reqd : 35.18011288180611%
	 Num. Builds saved : 8%
	 % Builds saved : 72.72727272727273%
	 Num. B

In [28]:
for q in jobs:
    q.terminate()