# Result logs cleaning

In [6]:
import os
import re
import pandas as pd
import numpy as np
import json

In [7]:
pd.set_option('display.max_columns', 50)  # or 1000
pd.set_option('display.max_rows', 50)  # or 1000
pd.set_option('display.max_colwidth', 300)  # or 199

## Initial Cleaning

In [4]:
def get_start_idx(lines, substring):
    '''
    Obtain the beginning of the cleaning
    '''
    return [line_idx for line_idx, line in enumerate(lines) if substring in line]

### Clean logs of attention head mask

In [5]:
LOGS_PATH = './logs'

def get_sliced_logs(lines):
    '''
    Slice the entire logs of multiple experiemnts into seperate experiements.
    '''
    
    # Step 1: Find the starting point.
    starter_ides = get_start_idx(lines, 'attention_head_mask')
    res = pd.DataFrame({'starter_ides': starter_ides})
    
    # Step 2: Base on all the start point, find the range
    res['ender_ides'] = res.apply(lambda x: x.shift(-1))
    res.iloc[-1, -1] = len(lines) # Upper bound of last row is the length!
    res['ender_ides'] = res['ender_ides'].astype('int')

    # Step 3: Obtain the corresponding lines base on the ranges
    sliced_lines = []
    for index, row in res.iterrows():
        starter_idx = row['starter_ides']
        ender_idx = row['ender_ides']
        sliced_lines.append(lines[starter_idx:ender_idx])
    return sliced_lines
        
def clean_log(logs):
    '''
    Clean the log of 1 experiment
    '''
    def clean_one_line(log_line):
        '''
        Clean 1 line of 1 log
        Examples
        --------
        >>> clean_one_line("00:31:00-INFO:   Batch size = 8")
        ('Batch size', '8')
        '''
        if '-INFO:   ' in log_line:
            result = log_line.split('-INFO:   ')
            if result:
                variable, value = result[1].split(' = ')
                value = re.findall(r'[-+]?(?:\d*\.\d+|\d+)', value)
                if value:
                    value = value[0]
                else:
                    value = None
                return variable, value
        
    variables = []
    values = []
    for log_line in logs:
        result = clean_one_line(log_line)
        if result:
            variable, value = result
            variables.append(variable)
            values.append(value)
    return variables, values

def get_train_time(header_logs):
    '''
    Examples
    --------
    >>> get_train_time("tot time 416.2169461250305 =========")
    416.2169461250305
    '''
    for header_log in header_logs:
        #print(header_log)
        if 'training time======' in header_log or (' =========' in header_log and 'tot ' in header_log) or ' training time======' in header_log:
            train_time = re.search(r'[-+]?(?:\d*\.\d+|\d+)', header_log).group()
            return train_time
    return None

def get_inference_time(header_logs):
    '''
    Examples
    --------
    >>> get_inference_time("evaluation time 0.7100062370300293")
    0.7100062370300293
    '''
    for header_log in header_logs:
        #print(header_log)
        if 'evaluation time' in header_log:
            train_time = re.search(r'[-+]?(?:\d*\.\d+|\d+)', header_log).group()
            return train_time
    return None
        
def get_experiment_result(task):
    with open(f'{LOGS_PATH}/head_pruning/{task}.txt') as f:
        lines = f.readlines()
    dfs = []
    for log in get_sliced_logs(lines):
        experiment = log[0].split(' ')[0]
        parameters = eval(log[1].replace('\n', ''))
        variables, values = clean_log(log[2:])
        # if task == 'rte':
        #     print(task, parameters, variables, values)
        df = pd.DataFrame({'task': task.lower(),
                           'experiments': experiment,
                           'drop_head_at_layer': int(parameters[0]),
                           'drop_head': int(parameters[1]),
                           'variables': variables,
                           'values': values})
        dfs.append(df)
    result = pd.concat(dfs, axis=0, ignore_index=True)
    result.columns.name = None
    result.loc[result['variables'].isin(['acc', 'eval_accuracy']), 'variables'] = 'accuracy'
    return result

def get_baseline_result(task):
    with open(f'{LOGS_PATH}/head_pruning/{task}.txt') as f:
        lines = f.readlines()
    header_start = min(get_start_idx(lines, 'Running evaluation'))
    header_end = min(get_start_idx(lines, 'attention_head_mask'))
    variables, values = clean_log(lines[header_start:header_end])
    train_time = get_train_time(lines[:header_start])
    if train_time:
        variables.append('train_time')
        values.append(train_time)
    inference_time = get_inference_time(lines)
    if inference_time:
        variables.append('inference_time')
        values.append(inference_time)
            
    result = pd.DataFrame({'task': task,
                           'variables': variables,
                           'values': values})
    result.loc[result['variables'].isin(['acc', 'eval_accuracy']), 'variables'] = 'accuracy'
    return result

In [6]:
experiment_results = []
baseline_results = []
for task in [task.replace('.txt', '') for task in os.listdir(LOGS_PATH + '/head_pruning') if '.ipynb_checkpoints' not in task]:
    baseline_results.append(get_baseline_result(task))
    experiment_results.append(get_experiment_result(task))
    
pd.concat(experiment_results, axis=0).to_csv('logs_cleaned/head_pruning_experiment_results.csv', index=False)
pd.concat(experiment_results, axis=0).to_pickle('logs_cleaned/head_pruning_experiment_results.pickle')
pd.concat(baseline_results, axis=0).to_csv('logs_cleaned/baseline_results.csv', index=False)
pd.concat(baseline_results, axis=0).to_pickle('logs_cleaned/baseline_results.pickle')

### Clean logs of layer pruning

In [7]:
def get_logs_2(lines):
    logs = []
    start_ides = get_start_idx(lines, 'EXPERIMENT')
    for log_idx, strat_idx in enumerate(start_ides):
        if log_idx != len(start_ides) - 1:
            end_idx = start_ides[log_idx+1]
            log = lines[strat_idx:end_idx]
        else:
            log = lines[strat_idx:]
        logs.append(log)
    return logs

def get_experiment_result_2(task):
    with open(f'{LOGS_PATH}/layer_drop/{task}.txt') as f:
        lines = f.readlines()
    dfs = []
    for log in get_logs_2(lines):
        experiment = 'Remove Layers'
        parameter = log[0].split(' remove layers ')[1].replace('\n', '')
        variables, values = clean_log(log[3:])
        train_time = get_train_time(log)
        if train_time:
            variables.append('train_time')
            values.append(train_time)
        inference_time = get_inference_time(log)
        if inference_time:
            variables.append('inference_time')
            values.append(inference_time)
        df = pd.DataFrame({'task': task.lower(),
                           'experiments': experiment,
                           'parameter': parameter,
                           'variables': variables,
                           'values': values})
        dfs.append(df)
    result = pd.concat(dfs, axis=0, ignore_index=True)
    result.columns.name = None
    result.loc[result['variables'].isin(['acc', 'eval_accuracy']), 'variables'] = 'accuracy'
    return result

experiment_results_2 = []
for task in [task.replace('.txt', '') for task in os.listdir(LOGS_PATH + '/layer_drop') if '.ipynb_checkpoints' not in task]:
    experiment_results_2.append(get_experiment_result_2(task))
pd.concat(experiment_results_2, axis=0).to_csv('logs_cleaned/layer_drop_results.csv', index=False)
pd.concat(experiment_results_2, axis=0).to_pickle('logs_cleaned/layer_drop_results.pickle')

## Result re-formatting for heads pruning

[GLUE](https://openreview.net/pdf?id=rJ4km2R5t7https://openreview.net/pdf?id=rJ4km2R5t7)

<img width = "50%" src="https://cdn.mathpix.com/snip/images/pS3Kb2-_3rym-Zd4LhhdPZkqIs7-K1cMmMekf7QQ2HE.original.fullsize.png" />

- Note that at [BERT](https://arxiv.org/abs/1810.04805https://arxiv.org/abs/1810.04805), F1 scores are reported for QQP and MRPC.

    <img width = "50%" src="https://cdn.mathpix.com/snip/images/TyBsRFSkPxAnklR4GijMblC8w8kcwXuTcAIVCqfaPdA.original.fullsize.png" />

In [17]:
head_prune = pd.read_pickle('logs_cleaned/head_pruning_experiment_results.pickle')

### Core scores of every experiments

In [18]:
benchmark_mapper = pd.DataFrame(
    {'task': ['sst-2', 'rte', 'mrpc', 'wnli', 'sts-b', 'cola'],
     'benchmark': ['accuracy', 'accuracy', 'F-1 score', 'accuracy', 'spearmanr', "Matthew's correlation"]}
)

In [19]:
head_prune_core_benchmark = head_prune.merge(benchmark_mapper, how='inner', on='task') \
    .query('variables == benchmark') \
    .drop(columns=['experiments', 'variables'])
head_prune_core_benchmark.to_csv('logs_cleaned/head_prune_core_benchmark.csv', index=False)

### GLUE scores

#### Baseline

In [20]:
baseline = pd.read_pickle('logs_cleaned/baseline_results.pickle')
baseline_core_benchmark = baseline.merge(benchmark_mapper, how='inner', 
               left_on=['task', 'variables'],
               right_on=['task', 'benchmark']) \
    .drop(columns=['variables']) \
    .rename(columns={'values': 'baseline'})
baseline_core_benchmark['baseline'] = baseline_core_benchmark['baseline'].astype('double')
baseline_core_benchmark.to_csv('logs_cleaned/baseline_core_benchmark.csv', index=False)

#### Experiment

In [21]:
head_prune_core_benchmark['values'] = head_prune_core_benchmark['values'].astype('double')

In [22]:
# Average
res = head_prune_core_benchmark \
    .merge(baseline_core_benchmark, how='left', on=['task', 'benchmark']) \
    .rename(columns={'values':'scores'}) \
    .assign(score_diff = lambda df: (df.scores - df.baseline) / df.baseline) \
    .groupby(["drop_head_at_layer", "drop_head"], as_index=False) \
    .agg(avg_glue = ('score_diff', 'mean')) 

res = res.pivot_table(index=['drop_head_at_layer'],
                values=['avg_glue'],
                columns=['drop_head'])

#res.applymap(lambda row: str(round(row* 100, 2)) + '%')
res

Unnamed: 0_level_0,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue
drop_head,1,2,3,4,5,6,7,8,9,10,11,12
drop_head_at_layer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,-0.005379,-0.0065,-0.005898,-0.009108,-0.008952,-0.014626,-0.009983,-0.007087,-0.008186,-0.009156,-0.009768,-0.002781
2,-0.005869,-0.002797,-0.007298,-0.007884,-0.005798,-0.00414,-0.009235,-0.015561,-0.014483,-0.014159,-0.012032,-0.018996
3,-0.020481,-0.021481,-0.023172,-0.024453,-0.025173,-0.029516,-0.062614,-0.06253,-0.057031,-0.053495,-0.077953,-0.070548
4,-0.096419,-0.098221,-0.093843,-0.100176,-0.100281,-0.1029,-0.09923,-0.098148,-0.092852,-0.082303,-0.092527,-0.09142
5,-0.086843,-0.09308,-0.098205,-0.151903,-0.145662,-0.15199,-0.151336,-0.153239,-0.156442,-0.17308,-0.170316,-0.175261
6,-0.186006,-0.179811,-0.185567,-0.182454,-0.187551,-0.184524,-0.184981,-0.186574,-0.189523,-0.187186,-0.184743,-0.180867
7,-0.175022,-0.17516,-0.182925,-0.182463,-0.178944,-0.176547,-0.181029,-0.179775,-0.168324,-0.170164,-0.169337,-0.171992
8,-0.17913,-0.177025,-0.181518,-0.182744,-0.186339,-0.179484,-0.178311,-0.182333,-0.170191,-0.175672,-0.172345,-0.169897
9,-0.169829,-0.166242,-0.172903,-0.172677,-0.175056,-0.178014,-0.174588,-0.167839,-0.142535,-0.136854,-0.140835,-0.137587
10,-0.135881,-0.142173,-0.137087,-0.134012,-0.129973,-0.121409,-0.129586,-0.129693,-0.163039,-0.159994,-0.149896,-0.153944


In [23]:
# By task
def get_task_result(task, df = head_prune_core_benchmark):
    
    task_df = df.copy()[df['task'] == task]
    res = task_df \
        .merge(baseline_core_benchmark, how='left', on=['task', 'benchmark']) \
        .rename(columns={'values':'scores'}) \
        .assign(score_diff = lambda df: (df.scores - df.baseline) / df.baseline) \
        .groupby(["drop_head_at_layer", "drop_head"], as_index=False) \
        .agg(avg_glue = ('score_diff', 'mean')) 

    res = res.pivot_table(index=['drop_head_at_layer'],
                    values=['avg_glue'],
                    columns=['drop_head'])

    #res = res.applymap(lambda row: str(round(row* 100, 2)) + '%')
    
    return res

In [24]:
head_prune_core_benchmark

Unnamed: 0,task,drop_head_at_layer,drop_head,values,benchmark
3,sst-2,1,1,0.930046,accuracy
11,sst-2,1,2,0.928899,accuracy
19,sst-2,1,3,0.930046,accuracy
27,sst-2,1,4,0.927752,accuracy
35,sst-2,1,5,0.930046,accuracy
...,...,...,...,...,...
5002,cola,12,8,0.525887,Matthew's correlation
5010,cola,12,9,0.523282,Matthew's correlation
5018,cola,12,10,0.515291,Matthew's correlation
5026,cola,12,11,0.523282,Matthew's correlation


In [25]:
head_prune_core_benchmark.drop_duplicates(subset='task')

Unnamed: 0,task,drop_head_at_layer,drop_head,values,benchmark
3,sst-2,1,1,0.930046,accuracy
1154,rte,1,1,0.689531,accuracy
1586,mrpc,1,1,0.895575,F-1 score
2738,wnli,1,1,0.56338,accuracy
3172,sts-b,1,1,0.882948,spearmanr
3890,cola,1,1,0.56009,Matthew's correlation


In [26]:
get_task_result('mrpc')

Unnamed: 0_level_0,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue
drop_head,1,2,3,4,5,6,7,8,9,10,11,12
drop_head_at_layer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,-0.003937,-0.002171,-0.001767,0.001773,0.005338,-0.004765,0.000789,0.002163,0.004833,0.000396,-0.005697,-0.003121
2,0.00138,0.000789,0.002551,0.000396,0.002551,0.0,0.006467,-0.002022,0.002382,0.002163,0.005075,0.002655
3,0.002288,0.00302,0.004332,0.002655,0.002655,0.001354,-0.035154,0.003606,0.002655,0.005657,0.002655,0.007936
4,-0.017121,0.002551,0.004448,-0.000571,-0.000178,0.001951,0.006189,0.007995,0.007638,0.008296,-0.010038,0.002334
5,0.002713,-0.001917,-0.001673,-0.015748,0.003464,0.001951,0.002334,0.005555,-0.000178,-0.002456,-0.002315,-0.000739
6,-0.005377,-0.004971,0.002123,-0.011804,-0.000351,0.001366,0.000599,-0.003249,-0.019612,-0.004167,-0.007098,-0.002851
7,-0.002851,-0.00365,-0.006686,-0.001129,-0.008396,-0.004567,-0.001522,-0.002063,-0.005377,-0.007512,-0.001129,-0.006686
8,-0.013523,-0.005377,-0.003249,-0.004567,-0.008396,-0.000178,-0.008812,-0.004567,-0.002851,-0.008812,-0.004567,-0.003249
9,-0.003249,-0.008812,-0.003249,-0.004971,-0.004971,-0.005377,-0.007098,-0.017947,-0.004167,-0.004567,0.001746,-0.005377
10,-0.005377,-0.007098,-0.005786,-0.005377,-0.007098,-0.004567,-0.005377,-0.007098,-0.00365,-0.005377,-0.001522,-0.002456


In [27]:
get_task_result('sst-2')

Unnamed: 0_level_0,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue
drop_head,1,2,3,4,5,6,7,8,9,10,11,12
drop_head_at_layer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,0.001235,0.0,0.001235,-0.001235,0.001235,0.0,-0.001235,-0.001235,-0.002469,-0.001235,0.003704,0.001235
2,0.002469,0.002469,0.001235,0.001235,0.0,0.0,0.002469,0.001235,0.0,0.001235,0.002469,0.0
3,-0.003704,0.0,0.0,0.0,0.0,-0.001235,0.001235,0.0,0.0,0.001235,0.0,0.001235
4,0.002469,0.001235,0.001235,0.002469,0.001235,0.001235,0.002469,0.002469,0.001235,-0.002469,0.002469,0.002469
5,0.002469,0.001235,0.0,0.001235,0.0,0.001235,0.002469,0.003704,0.0,0.001235,0.001235,0.0
6,-0.002469,0.0,0.0,-0.003704,-0.003704,-0.002469,-0.003704,-0.001235,0.0,-0.006173,-0.004938,0.001235
7,-0.001235,0.001235,0.0,0.001235,0.0,-0.001235,0.002469,0.002469,0.0,0.0,0.001235,0.002469
8,-0.002469,-0.003704,0.0,-0.001235,0.001235,0.0,0.0,0.002469,0.001235,0.002469,0.0,0.001235
9,0.0,-0.003704,0.0,0.0,0.001235,0.001235,0.002469,0.0,-0.001235,-0.004938,-0.002469,0.001235
10,0.0,0.0,0.001235,0.001235,0.002469,-0.001235,0.001235,0.0,0.001235,0.001235,-0.001235,0.001235


In [28]:
get_task_result('cola')

Unnamed: 0_level_0,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue
drop_head,1,2,3,4,5,6,7,8,9,10,11,12
drop_head_at_layer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,-0.008843,0.004685,-0.009002,-0.009002,0.000291,-0.022753,0.004848,0.0,0.013643,-0.009002,-0.031801,-0.004433
2,-0.018288,-0.004433,-0.027007,-0.022597,-0.031957,-0.009002,-0.013579,-0.013421,-0.004561,-0.004561,-0.026789,-0.013705
3,-0.004657,-0.01386,-0.027476,-0.00913,-0.013799,-0.018288,-0.00472,-0.027352,-0.018288,-0.022753,-0.013705,-0.013705
4,-0.013421,-0.027476,-9.7e-05,-0.027567,-0.00913,-0.022408,-0.022753,-0.023029,-0.022408,-0.031957,-0.018005,-0.018381
5,0.004555,-0.02297,-0.027625,-0.004273,-9.7e-05,-0.027567,0.0,-0.004749,-0.02297,-0.03219,-0.018381,-0.046108
6,-0.055319,-0.027589,-0.041467,-0.013886,-0.027384,-0.041404,-0.027589,-0.046108,-0.027505,-0.036713,-0.046108,-0.046087
7,-0.036228,-0.036862,-0.069207,-0.074049,-0.055101,-0.027567,-0.060039,-0.046087,-0.036848,-0.03684,-0.022914,-0.041481
8,-0.036848,-0.046032,-0.055387,-0.046032,-0.064684,-0.046087,-0.046087,-0.064701,-0.050723,-0.064537,-0.055387,-0.059971
9,-0.059971,-0.055319,-0.055103,-0.059971,-0.054936,-0.05473,-0.054936,-0.064632,-0.086584,-0.050743,-0.055312,-0.050676
10,-0.050728,-0.055319,-0.069211,-0.068913,-0.055319,-0.050676,-0.060039,-0.055387,-0.064541,-0.059971,-0.06278,-0.055371


In [29]:
get_task_result('wnli')

Unnamed: 0_level_0,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue
drop_head,1,2,3,4,5,6,7,8,9,10,11,12
drop_head_at_layer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
get_task_result('sts-b')

Unnamed: 0_level_0,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue
drop_head,1,2,3,4,5,6,7,8,9,10,11,12
drop_head_at_layer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,-0.000216,-0.00049,-0.000215,-3.1e-05,-0.004166,-0.003828,-0.002761,-0.002426,-0.003583,-0.00407,-0.004303,-0.005239
2,-0.00539,-0.005349,-0.005183,-0.005827,-0.005381,-0.005582,-0.009744,-0.012493,-0.012927,-0.017127,-0.022177,-0.020872
3,-0.019377,-0.020613,-0.033839,-0.047937,-0.052715,-0.061494,-0.244739,-0.228354,-0.234247,-0.217931,-0.348976,-0.326445
4,-0.458135,-0.473326,-0.466077,-0.48308,-0.491051,-0.500744,-0.488975,-0.484016,-0.456396,-0.431789,-0.447538,-0.473404
5,-0.464128,-0.452775,-0.472755,-0.800327,-0.790162,-0.790125,-0.799996,-0.800866,-0.797552,-0.825582,-0.828074,-0.814976
6,-0.822104,-0.825796,-0.858675,-0.865329,-0.873357,-0.880021,-0.884321,-0.89962,-0.905403,-0.891448,-0.865699,-0.852882
7,-0.845716,-0.85271,-0.857552,-0.856732,-0.85632,-0.851557,-0.852722,-0.83297,-0.788229,-0.786889,-0.78296,-0.770872
8,-0.760404,-0.760883,-0.779187,-0.793349,-0.789779,-0.763974,-0.753429,-0.760534,-0.732909,-0.721611,-0.70232,-0.680476
9,-0.678829,-0.693722,-0.727782,-0.714708,-0.735256,-0.747672,-0.76232,-0.719328,-0.486303,-0.468565,-0.481284,-0.468137
10,-0.466875,-0.477804,-0.446199,-0.459225,-0.453223,-0.405312,-0.410771,-0.418238,-0.557432,-0.557392,-0.52615,-0.543993


In [31]:
get_task_result('rte')

Unnamed: 0_level_0,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue,avg_glue
drop_head,1,2,3,4,5,6,7,8,9,10,11,12
drop_head_at_layer,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
1,-0.020513,-0.041026,-0.025641,-0.046154,-0.05641,-0.05641,-0.061538,-0.041026,-0.061538,-0.041026,-0.020513,-0.005128
2,-0.015385,-0.010256,-0.015385,-0.020513,0.0,-0.010256,-0.041026,-0.066667,-0.071795,-0.066667,-0.030769,-0.082051
3,-0.097436,-0.097436,-0.082051,-0.092308,-0.087179,-0.097436,-0.092308,-0.123077,-0.092308,-0.087179,-0.107692,-0.092308
4,-0.092308,-0.092308,-0.102564,-0.092308,-0.102564,-0.097436,-0.092308,-0.092308,-0.087179,-0.035897,-0.082051,-0.061538
5,-0.066667,-0.082051,-0.087179,-0.092308,-0.087179,-0.097436,-0.112821,-0.123077,-0.117949,-0.179487,-0.174359,-0.189744
6,-0.230769,-0.220513,-0.215385,-0.2,-0.220513,-0.184615,-0.194872,-0.169231,-0.184615,-0.184615,-0.184615,-0.184615
7,-0.164103,-0.158974,-0.164103,-0.164103,-0.153846,-0.174359,-0.174359,-0.2,-0.179487,-0.189744,-0.210256,-0.215385
8,-0.261538,-0.246154,-0.251282,-0.251282,-0.25641,-0.266667,-0.261538,-0.266667,-0.235897,-0.261538,-0.271795,-0.276923
9,-0.276923,-0.235897,-0.251282,-0.25641,-0.25641,-0.261538,-0.225641,-0.205128,-0.276923,-0.292308,-0.307692,-0.302564
10,-0.292308,-0.312821,-0.302564,-0.271795,-0.266667,-0.266667,-0.302564,-0.297436,-0.353846,-0.338462,-0.307692,-0.323077


## Results re-formmating for layer droping

In [32]:
benchmark_mapper = pd.DataFrame(
    {'task': ['sst-2', 'rte', 'mrpc', 'wnli', 'sts-b', 'cola'],
     'benchmark': ['accuracy', 'accuracy', 'F-1 score', 'accuracy', 'spearmanr', "Matthew's correlation"]}
)
layer_drop = pd.read_pickle('logs_cleaned/layer_drop_results.pickle')

In [33]:
# Clean the mixed parameters column
layer_drop['strategy'] = layer_drop['parameter'].apply(lambda x: re.search(r'\(([^()]+)\)', x).group(1))
layer_drop['n_layer_drop'] = layer_drop['strategy'].apply(lambda x: re.search(r'\d', x).group()).astype('int')
layer_drop['strategy'] = layer_drop['strategy'].apply(lambda x: re.sub(r' \d', '', x).replace('drop bottom', 'bottom drop').title())
layer_drop['layer_drop'] = layer_drop['parameter'].apply(lambda x: re.search(r'([^\(]+)', x).group(1))
layer_drop['values'] = layer_drop['values'].astype('double')
layer_drop = layer_drop.drop(columns = 'parameter')

### Core scores of every experiments

In [34]:
layer_drop_core_benchmark = layer_drop.merge(benchmark_mapper, how='inner', on='task') \
    .query('variables == benchmark') \
    .drop(columns=['experiments', 'variables'])

In [35]:
layer_drop_core_benchmark.groupby('task').size()

task
cola     9
mrpc     9
rte      9
sst-2    9
sts-b    9
wnli     9
dtype: int64

In [36]:
layer_drop_core_benchmark = layer_drop.merge(benchmark_mapper, how='inner', on='task') \
    .query('variables == benchmark') \
    .drop(columns=['experiments', 'variables'])

In [37]:
layer_drop_core_benchmark.head()

Unnamed: 0,task,values,strategy,n_layer_drop,layer_drop,benchmark
5,sst-2,0.923165,Top Drop,2,1011,accuracy
16,sst-2,0.911697,Top Drop,4,891011,accuracy
27,sst-2,0.904817,Top Drop,6,67891011,accuracy
38,sst-2,0.922018,Bottom Drop,2,1,accuracy
49,sst-2,0.905963,Bottom Drop,4,123,accuracy


In [38]:
layer_drop_core_benchmark.to_csv('logs_cleaned/layer_drop_core_benchmark.csv', index=False)

### Time

In [39]:
EPOCHS_mapper = pd.DataFrame(
    {'task': ['sst-2', 'rte', 'mrpc', 'wnli', 'sts-b', 'cola'],
     'n_epoch': [3, 10, 3, 2, 3, 3]}
)

In [40]:
TARGET_COL = ['train_time', 'inference_time', 'Num examples']
baseline_time = baseline. \
    query("variables in @TARGET_COL"). \
    pivot_table(index=['task'],
                values=['values'],
                columns=['variables']).reset_index(col_level=0)
baseline_time.columns = [col[0] for col in baseline_time.columns[:-len(TARGET_COL)]] + [col[1] for col in baseline_time.columns[-len(TARGET_COL):]]
baseline_time = baseline_time.merge(EPOCHS_mapper, on="task")
baseline_time['train_time_per_epoch_baseline'] = baseline_time['train_time'] / baseline_time['n_epoch']
baseline_time['inference_latency_baseline'] = baseline_time['inference_time'] / baseline_time['Num examples']
baseline_time['inference_throughput_baseline'] = baseline_time['Num examples'] / baseline_time['inference_time']
baseline_time = baseline_time[['task', 'train_time_per_epoch_baseline', 'inference_latency_baseline', 'inference_throughput_baseline']]
baseline_time

Unnamed: 0,task,train_time_per_epoch_baseline,inference_latency_baseline,inference_throughput_baseline
0,cola,138.738982,0.008906,112.284841
1,mrpc,55.606937,0.008895,112.42266
2,rte,30.705949,0.004794,208.603726
3,sst-2,1092.251655,0.00888,112.612348
4,sts-b,71.349972,0.004943,202.293243
5,wnli,7.977739,0.005144,194.409047


In [41]:
TARGET_COL = ['train_time', 'inference_time', 'Num examples']
layer_drop_time = layer_drop. \
    query("variables in @TARGET_COL"). \
    pivot_table(index=['task', 'experiments', 'strategy', 'n_layer_drop', 'layer_drop'],
                values=['values'],
                columns=['variables']).reset_index(col_level=0)#. \
layer_drop_time.columns = [col[0] for col in layer_drop_time.columns[:-len(TARGET_COL)]] + [col[1] for col in layer_drop_time.columns[-len(TARGET_COL):]]
layer_drop_time = layer_drop_time.merge(EPOCHS_mapper, on="task")
layer_drop_time['train_time_per_epoch'] = layer_drop_time['train_time'] / layer_drop_time['n_epoch']
layer_drop_time['inference_latency'] = layer_drop_time['inference_time'] / layer_drop_time['Num examples']
layer_drop_time['inference_throughput'] = layer_drop_time['Num examples'] / layer_drop_time['inference_time']
layer_drop_time = layer_drop_time.merge(baseline_time, on='task', how='inner')

for benchmark in ['train_time_per_epoch', 'inference_latency', 'inference_throughput']:
    layer_drop_time['diff_' + benchmark] = layer_drop_time[benchmark] - layer_drop_time[benchmark + '_baseline']
    layer_drop_time['percentage_diff_' + benchmark] = layer_drop_time['diff_' + benchmark] / layer_drop_time[benchmark]
layer_drop_time.head(5)

Unnamed: 0,task,experiments,strategy,n_layer_drop,layer_drop,Num examples,inference_time,train_time,n_epoch,train_time_per_epoch,inference_latency,inference_throughput,train_time_per_epoch_baseline,inference_latency_baseline,inference_throughput_baseline,diff_train_time_per_epoch,percentage_diff_train_time_per_epoch,diff_inference_latency,percentage_diff_inference_latency,diff_inference_throughput,percentage_diff_inference_throughput
0,cola,Remove Layers,Bottom Drop,2,1,1043.0,9.267023,326.441859,3,108.813953,0.008885,112.549625,138.738982,0.008906,112.284841,-29.925029,-0.275011,-2.1e-05,-0.002358,0.264784,0.002353
1,cola,Remove Layers,Bottom Drop,4,123,1043.0,9.261514,262.275043,3,87.425014,0.00888,112.616577,138.738982,0.008906,112.284841,-51.313968,-0.586948,-2.6e-05,-0.002954,0.331736,0.002946
2,cola,Remove Layers,Bottom Drop,6,12345,1043.0,9.268895,198.31521,3,66.10507,0.008887,112.526899,138.738982,0.008906,112.284841,-72.633912,-1.098765,-1.9e-05,-0.002156,0.242058,0.002151
3,cola,Remove Layers,Symmetric Drop,2,56,1043.0,9.273199,326.141196,3,108.713732,0.008891,112.474666,138.738982,0.008906,112.284841,-30.02525,-0.276186,-1.5e-05,-0.001691,0.189825,0.001688
4,cola,Remove Layers,Symmetric Drop,4,4567,1043.0,9.271496,262.70725,3,87.569083,0.008889,112.495329,138.738982,0.008906,112.284841,-51.169899,-0.584337,-1.7e-05,-0.001875,0.210488,0.001871



#### Layers 对 speedup的影响

In [42]:
sts_b_base_latency = 7.414978265762329 / 1500

In [43]:
sts_b_drop2_latency = (9.799771785736084 + 9.799771785736084 + 10.391596794128418) / 4500

In [44]:
sts_b_drop2_latency / sts_b_base_latency

1.348223343376953

In [59]:
def get_speedup(df):
    df_output = df[['task', 'n_layer_drop']].iloc[0]
    df_output['Fine-tuning speedup'] = sum(df['train_time_per_epoch_baseline']) / sum(df['train_time_per_epoch'])
    df_output['Fine-tuning speedup'] = '%.3f' % df_output['Fine-tuning speedup']  + 'x'
    df_output['Inference time save'] = sum(df['inference_latency_baseline'] - df['inference_latency']) / sum(df['inference_latency_baseline'])
    #df_output['Inference time save(%)'] = 100 * df_output['Inference time save(%)']
    return df_output
layer_drop_time.groupby(['task', 'n_layer_drop'], as_index=False).apply(get_speedup).reset_index(drop=True)

Unnamed: 0,task,n_layer_drop,Fine-tuning speedup,Inference time save
0,cola,2,1.275x,0.001905
1,cola,4,1.584x,0.002427
2,cola,6,2.097x,0.002035
3,mrpc,2,1.187x,0.001843
4,mrpc,4,1.473x,-0.00085
5,mrpc,6,1.950x,-0.000129
6,rte,2,1.179x,0.162885
7,rte,4,1.459x,0.27868
8,rte,6,1.912x,0.45677
9,sst-2,2,1.272x,0.001071


In [60]:
layer_drop_time.groupby(['task', 'n_layer_drop'], as_index=False).apply(get_speedup).reset_index(drop=True). \
    query('task in ["rte", "wnli"]')

Unnamed: 0,task,n_layer_drop,Fine-tuning speedup,Inference time save
6,rte,2,1.179x,0.162885
7,rte,4,1.459x,0.27868
8,rte,6,1.912x,0.45677
15,wnli,2,1.186x,0.180482
16,wnli,4,1.489x,0.3393
17,wnli,6,1.965x,0.460209
