In [1]:
import os
from statistics import mean, median

import pandas as pd

In [2]:
# CSV file names (one per project) that the simulation iterates over.
project_list = [
    'rails.csv',
    'gradle.csv',
    'heroku.csv',
    'jruby.csv',
    'metasploit-framework.csv',
    'cloudify.csv',
    'vagrant.csv',
    'rubinius.csv',
    'open-build-service.csv',
    'sonarqube.csv',
    'loomio.csv',
    'fog.csv',
    'opal.csv',
    'cloud_controller_ng.csv',
    'puppet.csv',
    'concerto.csv',
    'sufia.csv',
    'geoserver.csv',
    'orbeon-forms.csv',
    'graylog2-server.csv',
]

In [3]:
# Batching strategies compared by the simulation loop.
algorithms = [
    'BATCH4',
    'BATCHBISECT',
    'BATCHSTOP4',
]

# Candidate batch sizes, tried from largest to smallest.
batch_sizes = [
    16,
    8,
    4,
    2,
    1,
]

In [4]:
def batch_stop_4(batch_results, duration_subbatch):
    """Simulate the BATCHSTOP4 strategy on one batch, accumulating into globals.

    Every call charges one build whose duration is that of the last entry of
    ``duration_subbatch``. If the batch contains a ``0`` result:

    * a batch of four or fewer builds is charged four extra builds plus the
      summed duration of all its builds, and the recursion stops;
    * a larger batch is split in half and each half is simulated recursively.

    Args:
        batch_results: sequence of build outcomes; ``0`` marks a build that
            did not pass.
        duration_subbatch: build durations aligned with ``batch_results``.

    Side effects:
        Adds to the module-level ``batch_total`` and ``batch_duration``.
        Both are only ever incremented here, so the caller must reset them
        to 0 before simulating a new top-level batch.

    Raises:
        IndexError: if ``duration_subbatch`` is empty.
    """
    global batch_total
    global batch_duration

    # Running the batch as a whole costs one build.
    batch_total += 1
    batch_duration += duration_subbatch[-1]

    # A batch with no failing build needs no further work.
    if 0 not in batch_results:
        return

    if len(batch_results) <= 4:
        # NOTE(review): a fixed 4 is charged even when the batch holds fewer
        # than four builds (e.g. a tail batch) - confirm this is intended.
        batch_total += 4
        # Accumulate rather than assign: assigning would throw away the
        # duration already charged for this batch and for earlier sub-batches.
        batch_duration += sum(duration_subbatch)
        return

    half_batch = len(batch_results)//2
    batch_stop_4(batch_results[:half_batch], duration_subbatch[:half_batch])
    batch_stop_4(batch_results[half_batch:], duration_subbatch[half_batch:])

In [5]:
def batch_bisect(batch_results, duration_subbatch):
    """Simulate the BATCHBISECT strategy on one batch, accumulating into globals.

    Each call charges one build whose duration is the last entry of
    ``duration_subbatch``. A batch of more than one build that contains a
    ``0`` result is split in half and both halves are simulated recursively;
    a single-build batch or a batch without a ``0`` ends the recursion.

    Side effects:
        Adds to the module-level ``batch_total`` and ``batch_duration``.
    """
    global batch_total, batch_duration

    batch_total += 1
    batch_duration += duration_subbatch[-1]

    size = len(batch_results)
    if size == 1 or 0 not in batch_results:
        return

    mid = size // 2
    halves = (
        (batch_results[:mid], duration_subbatch[:mid]),
        (batch_results[mid:], duration_subbatch[mid:]),
    )
    for sub_results, sub_durations in halves:
        batch_bisect(sub_results, sub_durations)

In [6]:
def output_values(Y_data):
    """Encode build statuses as integers.

    Returns a new list with ``1`` for every entry equal to ``'passed'`` and
    ``0`` for anything else, in the same order as ``Y_data``.
    """
    return [1 if status == 'passed' else 0 for status in Y_data]

In [7]:
def str_to_list(s):
    """Parse a bracketed, comma-separated string of integers into a list.

    The first and last characters of ``s`` are dropped (they are expected to
    be the brackets) and the remainder is split on commas. Whitespace around
    the items is ignored, so both ``'[1, 2]'`` and ``'[1,2]'`` are accepted.

    Args:
        s: string such as ``'[3, 1, 4]'``.

    Returns:
        The parsed list of ints, or ``[0]`` when nothing but whitespace lies
        between the brackets (e.g. ``'[]'``).

    Raises:
        ValueError: if any item is not a valid integer literal.
    """
    inner = s[1:-1].strip()
    if not inner:
        return [0]
    # int() tolerates surrounding whitespace, so splitting on a bare comma
    # handles every spacing variant.
    return [int(item) for item in inner.split(',')]

In [8]:
def get_complete_data(p_name, data_dir='../../data/full_data/'):
    """Load one project's CSV and encode its ``tr_status`` column as 0/1.

    Args:
        p_name: CSV file name of the project (e.g. ``'rails.csv'``).
        data_dir: directory containing the project CSV files. Defaults to the
            location the notebook has always read from.

    Returns:
        The DataFrame read from the file, with ``tr_status`` replaced by the
        result of ``output_values`` (1 for ``'passed'``, 0 otherwise).
    """
    filename = os.path.join(data_dir, p_name)

    project = pd.read_csv(filename)
    project['tr_status'] = output_values(project['tr_status'])

    return project

In [9]:
# Accumulator for the simulation output: one list per
# (project, window, algorithm, batch size) combination.
result_rows = list()

In [10]:
# Module-level counters that batch_bisect / batch_stop_4 add to:
# number of builds executed and their accumulated duration.
batch_total = batch_duration = 0

In [11]:
# Sliding-window simulation of the batching strategies.
#
# Each project is cut into overlapping windows of one fifth of its rows,
# advancing by half a window. Within a window, the rows after the first 70%
# form the evaluation slice on which every (algorithm, batch size) pair is
# simulated. One row per combination is appended to result_rows:
#   [project, start_p, end_p, algorithm, batch_size, builds required as a
#    percentage of the slice size, per-build delays, median delay,
#    batched duration, unbatched duration, slice size]
for p in project_list:

    project = get_complete_data(p)
    window_size = len(project)//5

    # Fewer than five rows gives an empty window: nothing to evaluate, and the
    # percentage / median below would fail on an empty slice.
    if window_size == 0:
        continue

    # Clamp the step to 1 so a one-row window still advances (int(0.5*1) == 0
    # would otherwise loop forever).
    window_step = max(1, int((0.5)*window_size))

    start_p = 0
    end_p = window_size

    while end_p <= len(project):

        test_start = start_p + int(0.7*window_size)
        test_end = end_p

        test_data = project[test_start:test_end]

        duration = test_data['tr_duration'].tolist()
        test_result = test_data['tr_status'].tolist()

        total = len(test_result)

        for alg in algorithms:
            for batchsize in batch_sizes:

                # BATCH4 is only run with batches of four; BATCHSTOP4 is only
                # run with batch sizes of at least four.
                if alg == 'BATCH4' and batchsize != 4:
                    continue
                if alg == 'BATCHSTOP4' and batchsize < 4:
                    continue

                project_actual_duration = 0
                project_batch_duration = 0
                num_builds = 0
                delay_list = []

                if alg == 'BATCH4':
                    i = 0
                    while i < total:
                        # Slicing clamps at the end of the list, so the tail
                        # batch may hold fewer than four builds.
                        batch = test_result[i:i+4]
                        duration_subbatch = duration[i:i+4]
                        project_actual_duration += sum(duration_subbatch)

                        # Charge the batch run with the duration of the
                        # batch's own last build (not the next batch's first).
                        project_batch_duration += duration_subbatch[-1]

                        delay_list.extend([batchsize-b-1 for b in range(len(batch))])
                        num_builds += 1

                        #if any build has failed in the batch, then whole batch will fail
                        if 0 in batch:
                            num_builds += 4
                            project_batch_duration += sum(duration_subbatch)

                        i += 4

                else:
                    # BATCHBISECT and BATCHSTOP4 differ only in the recursive
                    # helper that processes one batch.
                    simulate_batch = batch_bisect if alg == 'BATCHBISECT' else batch_stop_4

                    i = 0
                    while i < total:
                        batch = test_result[i:i+batchsize]
                        duration_subbatch = duration[i:i+batchsize]
                        project_actual_duration += sum(duration_subbatch)

                        # The helpers only ever add to these globals, so both
                        # must be cleared before every batch.
                        batch_total = 0
                        batch_duration = 0

                        delay_list.extend([batchsize-b-1 for b in range(len(batch))])

                        simulate_batch(batch, duration_subbatch)

                        num_builds += batch_total
                        project_batch_duration += batch_duration

                        i += batchsize

                result_rows.append([p, start_p, end_p, alg, batchsize, 100*num_builds/total, delay_list, median(delay_list), project_batch_duration, project_actual_duration, total])

        start_p = start_p + window_step
        end_p = start_p + window_size
    

In [12]:
# Display how many result rows the simulation loop collected.
len(result_rows)

1620

In [13]:
# Tabulate the collected rows; column order matches the lists appended
# to result_rows by the simulation loop.
df = pd.DataFrame(
    result_rows,
    columns=[
        'project',
        'start_p',
        'end_p',
        'algorithm',
        'batch_size',
        'builds_reqd',
        'delay_list',
        'median_delay',
        'batch_duration',
        'actual_duration',
        'testall_size',
    ],
)

In [14]:
# Persist the results table to a CSV file in the current working directory
# (the DataFrame index is written as the first, unnamed column).
df.to_csv('duration_sw_grouping_results.csv')

In [15]:
# Expected row count: 20 projects x 9 valid (algorithm, batch size) pairs
# (1 for BATCH4, 5 for BATCHBISECT, 3 for BATCHSTOP4) x 9 windows.
# NOTE(review): 9 windows per project is presumed from the window/step sizes - confirm.
20*9*9

1620