In [29]:
import pandas as pd
import sys
import csv
from statistics import median

In [30]:
def _total_delay(raw):
    """Return the sum of the integers in a stringified list such as '[3, 0, 12]'.

    An empty list ('[]') and a missing (non-string) cell both count as 0.
    Splitting on ',' rather than ', ' accepts lists written with or without a
    space after the comma, because int() ignores surrounding whitespace.
    """
    if not isinstance(raw, str):
        return 0
    inner = raw.strip()[1:-1].strip()
    if not inner:
        return 0
    return sum(int(token) for token in inner.split(','))


def get_best_confidence(data, output_path='best_confidence.csv'):
    """Write, per version/project/algorithm/batch size, the best candidate row to a CSV.

    For every version in 1..10 and every project, algorithm and batch size, the
    candidate rows are compared by the Euclidean distance of
    (project_reqd_builds, total_sbs_delay + total_batch_delay) from the origin.
    The row with the smallest distance wins (first one on ties). If a
    combination has no rows, a placeholder row with zeros is still written.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'version', 'project', 'algorithm',
        'batch_size', 'project_reqd_builds', 'project_missed_builds',
        'project_saved_builds', 'batch_delays' and 'project_delays'
        (the last one holding stringified integer lists).
        NOTE: the frame is modified in place. The four numeric columns are
        coerced with pd.to_numeric, and 'total_sbs_delay' and
        'total_batch_delay' are added. Other cells of this notebook read those
        two columns, so the side effect is kept on purpose.
    output_path : str, optional
        Destination CSV file. Defaults to 'best_confidence.csv' in the current
        working directory.

    Returns
    -------
    None
    """
    # Typecasting: unparseable values become NaN and can never win the distance comparison.
    numeric_columns = (
        'project_reqd_builds',
        'project_missed_builds',
        'project_saved_builds',
        'batch_delays',
    )
    for column in numeric_columns:
        data[column] = pd.to_numeric(data[column], downcast="float", errors='coerce')

    # Step-by-step (sbs) delay and batch delay are kept as separate columns.
    data['total_sbs_delay'] = [_total_delay(raw) for raw in data['project_delays'].tolist()]
    data['total_batch_delay'] = data['batch_delays'].tolist()

    # Sorted so that the row order of the output file is reproducible between runs.
    project_list = sorted(data['project'].dropna().unique())
    batching_alg = sorted(data['algorithm'].dropna().unique())
    batchsizes = [1, 2, 4, 8, 16]

    res_headers = ['version', 'project', 'method', 'algorithm', 'batch_size',
                   'builds_reqd', 'batch_delays', 'sbs_delays', 'confidence']

    # newline='' is what the csv module expects; the with-block guarantees the
    # file is flushed and closed even if a row fails half-way through.
    with open(output_path, 'w', newline='') as result_file:
        res_writer = csv.writer(result_file)
        res_writer.writerow(res_headers)

        for ver in range(1, 11):
            ver_data = data[data['version'] == ver]
            for p in project_list:
                p_name = p + '.csv'
                p_data = ver_data[ver_data['project'] == p]

                for alg in batching_alg:
                    alg_data = p_data[p_data['algorithm'] == alg]

                    for b in batchsizes:
                        # BATCH4 is only evaluated at batch size 4,
                        # BATCHSTOP4 only at batch sizes of 4 and above.
                        if alg == 'BATCH4' and b != 4:
                            continue
                        if alg == 'BATCHSTOP4' and b < 4:
                            continue

                        batch_data = alg_data[alg_data['batch_size'] == b]
                        reqd_list = batch_data['project_reqd_builds'].tolist()
                        delay_list = batch_data['total_sbs_delay'].tolist()
                        batch_delay_list = batch_data['total_batch_delay'].tolist()

                        best = [ver, p_name, 'ssr', alg, b, 0, 0, 0, 0]
                        min_distance = float('inf')

                        for i, reqd in enumerate(reqd_list):
                            total_delay = delay_list[i] + batch_delay_list[i]
                            distance = (reqd ** 2 + total_delay ** 2) ** 0.5
                            if distance < min_distance:
                                min_distance = distance
                                best[5] = reqd
                                best[6] = batch_delay_list[i]
                                best[7] = delay_list[i]
                                # Position within the group, offset by 2; presumably the
                                # confidence values start at 2 -- TODO confirm.
                                best[8] = i + 2

                        res_writer.writerow(best)
    

In [33]:
# Load the results table; the path is relative to the current working directory.
data = pd.read_csv('version_results/cvh_models.csv')
# Writes best_confidence.csv and, as a side effect, adds the 'total_sbs_delay' and
# 'total_batch_delay' columns to `data` in place; later cells read those columns.
get_best_confidence(data)

92

In [24]:
# Create 'cvh_sbs_delay_lists.csv' containing only its header row.
# The with-block closes the handle so the header is flushed to disk, and
# newline='' is the mode the csv module expects for files it writes.
# NOTE(review): no visible cell writes further rows to this file, and the summary
# table is saved later under a different name ('cvh_sbs_delay_list.csv') -- confirm
# whether this file is still needed.
res_headers = ['project', 'method', 'algorithm', 'batch_size', 'median_sbs_delays', 'median_batch_delays', 'confidence']
with open('cvh_sbs_delay_lists.csv', 'w', newline='') as result_file:
    res_writer = csv.writer(result_file)
    res_writer.writerow(res_headers)

86

In [25]:
# Accumulator for the result rows; the selection loop below appends one list per
# (project, algorithm, batch size) combination it evaluates.
lines = []

In [26]:
# Collect, for every project / algorithm / batch size, the raw 'project_delays'
# entry of the candidate row with the smallest Euclidean distance of
# (project_reqd_builds, total_sbs_delay + total_batch_delay) from the origin.
# Unlike get_best_confidence, this pass is not split by version.

# The delay totals are added to `data` by get_best_confidence(data); fail with a
# clear message instead of a bare KeyError deep inside the loop.
_missing_columns = {'total_sbs_delay', 'total_batch_delay'} - set(data.columns)
if _missing_columns:
    raise KeyError(
        'run get_best_confidence(data) first; missing columns: '
        + ', '.join(sorted(_missing_columns))
    )

# Loop parameters are defined here because inside get_best_confidence they are
# local variables; without this the cell only runs on leftover kernel state.
# Sorted so the row order is reproducible between runs.
project_list = sorted(data['project'].dropna().unique())
batching_alg = sorted(data['algorithm'].dropna().unique())
batchsizes = [1, 2, 4, 8, 16]

# Start from an empty list so that re-running this cell does not append duplicates.
lines = []

for p in project_list:
    p_name = p + '.csv'
    p_data = data[data['project'] == p]

    for alg in batching_alg:
        alg_data = p_data[p_data['algorithm'] == alg]

        for b in batchsizes:
            # BATCH4 is only evaluated at batch size 4,
            # BATCHSTOP4 only at batch sizes of 4 and above.
            if alg == 'BATCH4' and b != 4:
                continue
            if alg == 'BATCHSTOP4' and b < 4:
                continue

            batch_data = alg_data[alg_data['batch_size'] == b]

            reqd_list = batch_data['project_reqd_builds'].tolist()
            delay_list = batch_data['total_sbs_delay'].tolist()
            batch_delay_list = batch_data['total_batch_delay'].tolist()
            ssr_delays = batch_data['project_delays'].tolist()

            # Layout: project, method, algorithm, batch_size, sbs_delays, confidence.
            # The zeros stay in place when the combination has no candidate rows.
            best = [p_name, 'ssr', alg, b, 0, 0]
            min_distance = float('inf')

            for i, reqd in enumerate(reqd_list):
                total_delay = delay_list[i] + batch_delay_list[i]
                distance = (reqd ** 2 + total_delay ** 2) ** 0.5
                if distance < min_distance:
                    min_distance = distance
                    best[4] = ssr_delays[i]
                    # Position within the group, offset by 2; presumably the
                    # confidence values start at 2 -- TODO confirm.
                    best[5] = i + 2

            lines.append(best)
            if alg == 'BATCH4':
                print(best)

['cloudify.csv', 'ssr', 'BATCH4', 4, '[]', 189]


In [27]:
# Turn the collected rows into a table; the column names follow the order in
# which each row list was filled.
ssr_delay = pd.DataFrame(
    data=lines,
    columns=[
        'project',
        'method',
        'algorithm',
        'batch_size',
        'sbs_delays',
        'confidence',
    ],
)

In [28]:
# Save the summary table to the current working directory. to_csv is called with
# its defaults, so the DataFrame index is written as an extra first column.
ssr_delay.to_csv('cvh_sbs_delay_list.csv')