In [7]:
import os
import re
import pandas as pd

## Clean logs of attention head mask

In [35]:
# Root folder of the raw experiment logs. The functions in this notebook read
# '<LOGS_PATH>/head_pruning/<task>.txt' and '<LOGS_PATH>/layer_drop/<task>.txt'.
LOGS_PATH = './logs'

def get_start_idx(lines, substring):
    """Return the positions of every entry of ``lines`` that contains ``substring``.

    Positions are 0-based and come back in ascending order; the result is an
    empty list when nothing matches.
    """
    matches = []
    for position, text in enumerate(lines):
        if substring in text:
            matches.append(position)
    return matches

def get_logs(lines):
    """Split ``lines`` into one chunk per line containing 'attention_head_mask'.

    Every chunk starts at a marker line and has the same length: the distance
    between the first two marker lines.

    Parameters
    ----------
    lines : list of str
        Lines of one log file.

    Returns
    -------
    list of list of str
        One chunk per marker line, in file order. Empty when ``lines`` holds no
        marker. With a single marker there is no second marker to measure the
        chunk length against, so that one chunk runs to the end of ``lines``.
    """
    starter_ides = get_start_idx(lines, 'attention_head_mask')
    if not starter_ides:
        return []
    if len(starter_ides) > 1:
        # All chunks are cut to the spacing of the first two markers.
        log_len = starter_ides[1] - starter_ides[0]
    else:
        log_len = len(lines) - starter_ides[0]
    return [lines[starter_idx:(starter_idx + log_len)] for starter_idx in starter_ides]
        
def clean_log(logs):
    '''
    Clean the log of 1 experiment

    Only lines that contain '-INFO:   ' followed by a 'name = value' pair are
    used; every other line is skipped.

    Parameters
    ----------
    logs : iterable of str
        Lines of one experiment log.

    Returns
    -------
    (list of str, list)
        The variable names and, in the same order, the first number found in
        each value (kept as a string), or None when the value has no number.

    Examples
    --------
    >>> clean_log(["00:31:00-INFO:   Batch size = 8"])
    (['Batch size'], ['8'])
    '''
    marker = '-INFO:   '
    # Signed integer or decimal, optionally followed by an exponent so that a
    # value such as 1.5e-05 is not cut down to 1.5.
    number_pattern = re.compile(r'[-+]?(?:\d*\.\d+|\d+)(?:[eE][-+]?\d+)?')

    def clean_one_line(log_line):
        '''
        Clean 1 line of 1 log
        Examples
        --------
        >>> clean_one_line("00:31:00-INFO:   Batch size = 8")
        ('Batch size', '8')
        '''
        if marker not in log_line:
            return None
        payload = log_line.split(marker)[1]
        # partition never raises: a line without ' = ' is skipped, and a line
        # with several ' = ' is cut at the first one.
        variable, separator, raw_value = payload.partition(' = ')
        if not separator:
            return None
        found = number_pattern.search(raw_value)
        return variable, (found.group(0) if found else None)

    variables = []
    values = []
    for log_line in logs:
        result = clean_one_line(log_line)
        if result:
            variable, value = result
            variables.append(variable)
            values.append(value)
    return variables, values

def get_experiment_result(task):
    """Read '<LOGS_PATH>/head_pruning/<task>.txt' and return its experiments as one table.

    For every chunk produced by ``get_logs``:
      * 'experiments' is the text before the first space of the chunk's first line,
      * 'parameter' is the chunk's second line with newlines removed,
      * 'variables' / 'values' come from ``clean_log`` applied to the chunk
        from its fourth line onward.

    The table stays in long format (one row per variable) and the 'task'
    column holds the lower-cased task name.
    """
    with open(f'{LOGS_PATH}/head_pruning/{task}.txt') as f:
        lines = f.readlines()

    def chunk_to_frame(chunk):
        # One small frame per experiment; the scalar columns are broadcast
        # over the variable/value rows.
        experiment = chunk[0].split(' ')[0]
        parameter = chunk[1].replace('\n', '')
        names, numbers = clean_log(chunk[3:])
        return pd.DataFrame({'task': task.lower(),
                             'experiments': experiment,
                             'parameter': parameter,
                             'variables': names,
                             'values': numbers})

    frames = [chunk_to_frame(chunk) for chunk in get_logs(lines)]
    result = pd.concat(frames, axis=0, ignore_index=True)
    result.columns.name = None
    return result

def get_baseline_result(task):
    """Extract the values logged before the first head-mask experiment of ``task``.

    Reads '<LOGS_PATH>/head_pruning/<task>.txt' and cleans the lines from the
    first 'Running evaluation' line up to (not including) the first
    'attention_head_mask' line.

    Parameters
    ----------
    task : str
        File name (without '.txt') of the log to read.

    Returns
    -------
    pandas.DataFrame
        Columns 'task', 'variables' and 'values'. 'task' is lower-cased so it
        matches the 'task' column produced by ``get_experiment_result``.

    Raises
    ------
    ValueError
        If the log has no 'Running evaluation' or no 'attention_head_mask' line.
    """
    with open(f'{LOGS_PATH}/head_pruning/{task}.txt') as f:
        lines = f.readlines()
    evaluation_starts = get_start_idx(lines, 'Running evaluation')
    mask_starts = get_start_idx(lines, 'attention_head_mask')
    if not evaluation_starts or not mask_starts:
        # min() on an empty list would raise the same exception type, but
        # without saying which log file is malformed.
        raise ValueError(
            f"{task}: log needs both a 'Running evaluation' line and an "
            f"'attention_head_mask' line to locate the baseline section"
        )
    header_start = min(evaluation_starts)
    header_end = min(mask_starts)
    variables, values = clean_log(lines[header_start:header_end])

    df = pd.DataFrame({'task': task.lower(),
                       'variables': variables,
                       'values': values})
    return df

In [36]:
# Build the head-pruning tables for every task log and write them out as CSV.
experiment_results = []
baseline_results = []
# Only '*.txt' entries are task logs; any other entry in the folder (for example
# a '.ipynb_checkpoints' directory) would be turned into a path that does not exist.
# Sorting keeps the CSV row order independent of os.listdir's arbitrary order.
head_pruning_tasks = sorted(
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(LOGS_PATH + '/head_pruning')
    if file_name.endswith('.txt')
)
for task in head_pruning_tasks:
    experiment_results.append(get_experiment_result(task))
    baseline_results.append(get_baseline_result(task))
# to_csv does not create missing folders.
os.makedirs('logs_cleaned', exist_ok=True)
pd.concat(experiment_results, axis=0).to_csv('logs_cleaned/head_pruning_experiment_results.csv', index=False)
pd.concat(baseline_results, axis=0).to_csv('logs_cleaned/head_pruning_baseline_results.csv', index=False)

Unnamed: 0,task,experiments,parameter,variables,values
0,sst-2,attention_head_mask,"['1', '1']",Num examples,872
1,sst-2,attention_head_mask,"['1', '1']",Batch size,32
2,sst-2,attention_head_mask,"['1', '1']",Accuracy,0.930045871559633
3,sst-2,attention_head_mask,"['1', '1']",eval_accuracy,0.930045871559633
4,sst-2,attention_head_mask,"['1', '1']",eval_loss,0.23572358821651765
...,...,...,...,...,...
1147,cola,attention_head_mask,"['12', '12']",eval_accuracy,0.8082454458293384
1148,cola,attention_head_mask,"['12', '12']",eval_loss,0.47653821923516015
1149,cola,attention_head_mask,"['12', '12']",global_step,0
1150,cola,attention_head_mask,"['12', '12']",inference_time,9.440559148788452


## Clean logs of layer pruning

In [60]:
def get_logs_2(lines):
    """Cut ``lines`` into the segments lying between consecutive 'EXPERIMENT' lines.

    Each segment starts at one marker line and stops just before the next one.

    NOTE(review): the lines from the last marker to the end of ``lines`` are
    never returned, so N markers yield N-1 segments. Confirm that dropping the
    final experiment is intended.
    """
    start_ides = get_start_idx(lines, 'EXPERIMENT')
    # Pair every marker with its successor; the last marker has none.
    return [lines[begin:end] for begin, end in zip(start_ides, start_ides[1:])]

def get_experiment_result_2(task):
    """Read '<LOGS_PATH>/layer_drop/<task>.txt' and return its experiments as one table.

    For every segment produced by ``get_logs_2``:
      * 'experiments' is the constant 'Remove Layers',
      * 'parameter' is whatever follows ' remove layers ' on the segment's
        first line, with newlines removed,
      * 'variables' / 'values' come from ``clean_log`` applied to the segment
        from its fourth line onward.

    The table stays in long format (one row per variable) and the 'task'
    column holds the lower-cased task name.
    """
    log_file = f'{LOGS_PATH}/layer_drop/{task}.txt'
    with open(log_file) as f:
        lines = f.readlines()

    frames = []
    for segment in get_logs_2(lines):
        parameter = segment[0].split(' remove layers ')[1].replace('\n', '')
        names, numbers = clean_log(segment[3:])
        frames.append(pd.DataFrame({
            'task': task.lower(),
            'experiments': 'Remove Layers',
            'parameter': parameter,
            'variables': names,
            'values': numbers,
        }))

    result = pd.concat(frames, axis=0, ignore_index=True)
    result.columns.name = None
    return result

# Build the layer-drop table for every task log and write it out as CSV.
# Baseline extraction is not run for the layer-drop logs.
experiment_results_2 = []
# Only '*.txt' entries are task logs; any other entry in the folder (for example
# a '.ipynb_checkpoints' directory) would be turned into a path that does not exist.
# Sorting keeps the CSV row order independent of os.listdir's arbitrary order.
layer_drop_tasks = sorted(
    os.path.splitext(file_name)[0]
    for file_name in os.listdir(LOGS_PATH + '/layer_drop')
    if file_name.endswith('.txt')
)
for task in layer_drop_tasks:
    experiment_results_2.append(get_experiment_result_2(task))
# to_csv does not create missing folders.
os.makedirs('logs_cleaned', exist_ok=True)
# Write the layer-drop frames collected above; the head-pruning list
# `experiment_results` belongs to the other CSV files.
pd.concat(experiment_results_2, axis=0).to_csv('logs_cleaned/layer_drop_results.csv', index=False)


#get_experiment_result_2('CoLA')
# for line in log:
#     print(line)
#print(logs)
# variables, values = clean_logs(log[3:])
#     print(line)
#LOGS_PATH = './logs'