## Predict GLUE performances with probing
Do probing results predict GLUE task performance?  
This notebook merges the GLUE classification results and the probing results into a single table, so that OLS analysis can be run on it later.

In [1]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf 
import numpy as np
from pathlib import Path 

In [2]:
# Read the probing results and keep only the rows whose config is "Full".
probing_df = (
    pd.read_csv("../reports/probing_1200_per_class.csv")
    .loc[lambda frame: frame["config"] == "Full"]
)
print(probing_df.shape)
probing_df.head()

(2275, 14)


Unnamed: 0,LM,layer,task,config,model,train_acc,train_loss,val_acc,val_loss,test_acc,test_loss,rs,train_size_per_class,nclasses
0,embeddings_albert_base_v2,0,bigram_shift,Full,DecisionTree,0.5,0.693147,0.5,0.693147,0.5,0.693147,6560.2,1200.0,2.0
3,embeddings_albert_base_v2,0,coordination_inversion,Full,DecisionTree,0.5,0.693147,0.5,0.693147,0.5,0.693147,6560.2,1200.0,2.0
6,embeddings_albert_base_v2,0,obj_number,Full,DecisionTree,0.5,0.693147,0.5,0.693147,0.5,0.693147,6560.2,1200.0,2.0
9,embeddings_albert_base_v2,0,odd_man_out,Full,DecisionTree,0.500833,0.693146,0.5,0.693149,0.5,0.693149,6560.2,1200.0,2.0
12,embeddings_albert_base_v2,0,past_present,Full,DecisionTree,0.5,0.693147,0.5,0.693147,0.5,0.693147,6560.2,1200.0,2.0


In [4]:
# Read the GLUE classification results. For now only the raw fine-tuning runs are
# analysed, so every row whose task name contains "processed" is dropped.
glue_df = (
    pd.read_csv("../reports/glue_classification_results.csv")
    .loc[lambda frame: ~frame["task"].str.contains("processed")]
)
print(glue_df.shape)
glue_df.head()

(150, 5)


Unnamed: 0,task,LM,init_lr,slurm_id,dev_acc
0,rte,embeddings_roberta_base,1e-05,5236891,0.7726
1,rte,embeddings_roberta_base_corr_500,1e-05,5215525,0.7148
2,rte,embeddings_roberta_base_corr_1000,1e-05,5258357,0.704
3,rte,embeddings_roberta_base_corr_2000,1e-05,5258397,0.6859
4,rte,embeddings_roberta_base_corr_4000,1e-05,5258437,0.5848


In [5]:
# Sanity check: number of non-null entries in every column, per GLUE task.
glue_df.groupby("task").count()

Unnamed: 0_level_0,LM,init_lr,slurm_id,dev_acc
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
cola,25,25,25,25
mrpc,25,25,25,25
qnli,25,25,25,25
qqp,25,25,25,25
rte,25,25,25,25
sst2,25,25,25,25


In [7]:
# Compile a dataframe for linear regression
def prepare_data(probing_df, glue_df):
    """Merge GLUE results and probing results into one wide table, one row per LM.

    Parameters
    ----------
    probing_df : pd.DataFrame
        Probing results. The columns ``LM``, ``task``, ``layer`` and ``test_acc``
        are read.
    glue_df : pd.DataFrame
        GLUE classification results. The columns ``LM``, ``task`` and ``dev_acc``
        are read.

    Returns
    -------
    pd.DataFrame
        One row per distinct ``glue_df["LM"]`` value, in order of first appearance.
        Columns, in order of first appearance: ``LM``; one column per GLUE task
        holding that LM's ``dev_acc`` (regression targets); one column named
        ``"{task}_layer_{layer}"`` per probing row holding its ``test_acc``
        (regression features). A value that is absent for an LM is NaN.

    Raises
    ------
    ValueError
        If one LM has more than one row for the same GLUE task or for the same
        probing (task, layer) pair, because the cell value would be ambiguous.
    """
    lms = glue_df["LM"].drop_duplicates().tolist()

    # Ordered set of output columns (dict keys keep insertion order).
    columns = {"LM": None}
    records = []

    def store(record, key, value):
        # Every (LM, column) cell may be filled exactly once.
        if key in record:
            raise ValueError(
                "Duplicate entry for column {!r} of LM {!r}".format(key, record["LM"])
            )
        record[key] = value
        columns.setdefault(key, None)

    for lm in lms:
        # Build the row for this LM as a dict so that every value stays attached
        # to its LM even when another LM lacks that task or feature.
        record = {"LM": lm}

        # Classification
        # Each GLUE task contributes one target column.
        glue_rows = glue_df[glue_df["LM"] == lm]
        for task, dev_acc in zip(glue_rows["task"], glue_rows["dev_acc"]):
            store(record, task, dev_acc)

        # Probing
        # Each (probing task, layer) pair contributes one feature column.
        probing_rows = probing_df[probing_df["LM"] == lm]
        for task, layer, test_acc in zip(
            probing_rows["task"], probing_rows["layer"], probing_rows["test_acc"]
        ):
            store(record, "{}_layer_{}".format(task, layer), test_acc)

        records.append(record)

    # Keys missing from a record become NaN; passing `columns` pins the column
    # order and keeps the "LM" column present even when there are no records.
    return pd.DataFrame(records, columns=list(columns))

# Build the regression table (one row per LM: GLUE dev accuracies plus
# per-layer probing test accuracies) and display it.
linreg_data = prepare_data(probing_df, glue_df)
linreg_data

Unnamed: 0,LM,rte,cola,qnli,mrpc,sst2,qqp,bigram_shift_layer_0,coordination_inversion_layer_0,obj_number_layer_0,...,past_present_layer_11,subj_number_layer_11,tree_depth_layer_11,bigram_shift_layer_12,coordination_inversion_layer_12,obj_number_layer_12,odd_man_out_layer_12,past_present_layer_12,subj_number_layer_12,tree_depth_layer_12
0,embeddings_roberta_base,0.7726,0.8437,0.9251,0.8995,0.9438,0.9143,0.5,0.5,0.5,...,0.857333,0.793333,0.283714,0.850667,0.654333,0.787333,0.619667,0.879667,0.815333,0.295238
1,embeddings_roberta_base_corr_500,0.7148,0.8322,0.9213,0.875,0.9415,0.9164,0.5,0.5,0.5,...,0.865333,0.704333,0.250381,0.685667,0.546667,0.813,0.582667,0.868,0.806,0.294857
2,embeddings_roberta_base_corr_1000,0.704,0.838,0.9213,0.8824,0.9392,0.916,0.5,0.5,0.5,...,0.819,0.721333,0.259905,0.605667,0.547,0.81,0.564333,0.864333,0.814667,0.297714
3,embeddings_roberta_base_corr_2000,0.6859,0.8341,0.9185,0.8824,0.9415,0.9166,0.5,0.5,0.5,...,0.864333,0.747667,0.27619,0.579,0.539333,0.81,0.560667,0.870333,0.812,0.301143
4,embeddings_roberta_base_corr_4000,0.5848,0.8399,0.9209,0.8873,0.945,0.9165,0.5,0.5,0.5,...,0.864333,0.785667,0.28419,0.559333,0.540667,0.804333,0.570667,0.865333,0.810333,0.302667
5,embeddings_roberta_base_corr_6000,0.6679,0.8293,0.9229,0.8554,0.9392,0.9169,0.5,0.5,0.5,...,0.867667,0.803333,0.287619,0.552333,0.535667,0.813333,0.571333,0.865333,0.81,0.29781
6,embeddings_xlm_roberta_base,0.6534,0.7833,0.9068,0.8211,0.9289,0.9093,0.5,0.5,0.5,...,0.757667,0.745667,0.267143,0.763333,0.565333,0.819,0.558667,0.854667,0.817333,0.309429
7,embeddings_xlm_roberta_base_corr_500,0.5993,0.7881,0.8997,0.8897,0.9255,0.7834,0.5,0.5,0.5,...,0.773333,0.746667,0.260952,0.605333,0.504,0.794,0.522,0.826667,0.8,0.284952
8,embeddings_xlm_roberta_base_corr_1000,0.5776,0.7939,0.9023,0.8284,0.9255,0.7875,0.5,0.5,0.5,...,0.774,0.783667,0.262762,0.558333,0.517333,0.804667,0.525333,0.837333,0.803667,0.293143
9,embeddings_xlm_roberta_base_corr_2000,0.5884,0.7919,0.8964,0.8088,0.9151,0.7821,0.5,0.5,0.5,...,0.793667,0.777333,0.260857,0.534,0.495667,0.813,0.507333,0.844,0.811,0.282571


In [8]:
# Save the merged table; index=False keeps the row index out of the CSV.
linreg_data.to_csv("../reports/task1_predict_task_performance.csv", index=False)

In [9]:
# Regress the RTE dev accuracy on the bigram_shift probing accuracy of layers 0-12.
y = linreg_data["rte"]
#X = linreg_data.drop(columns=["LM", "rte", "cola", "qnli", "mrpc", "sst2", "qqp"])
X = linreg_data[[f"bigram_shift_layer_{i}" for i in range(13)]]
# sm.OLS does not add an intercept on its own. add_constant (default
# has_constant="skip") returns X unchanged when it already contains a constant
# column and otherwise adds a "const" column, so the fit is never silently
# forced through the origin when the feature set changes.
# NOTE(review): the displayed bigram_shift_layer_0 values are all 0.5, so that
# column presumably already acts as the intercept here - confirm on the full table.
X = sm.add_constant(X)
model = sm.OLS(y, X)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,rte,R-squared:,0.689
Model:,OLS,Adj. R-squared:,0.377
Method:,Least Squares,F-statistic:,2.211
Date:,"Sat, 22 Jan 2022",Prob (F-statistic):,0.0918
Time:,21:11:16,Log-Likelihood:,48.09
No. Observations:,25,AIC:,-70.18
Df Residuals:,12,BIC:,-54.34
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
bigram_shift_layer_0,3.0911,2.831,1.092,0.296,-3.078,9.260
bigram_shift_layer_1,-2.2583,2.711,-0.833,0.421,-8.164,3.648
bigram_shift_layer_2,-0.1363,0.962,-0.142,0.890,-2.232,1.960
bigram_shift_layer_3,-0.1716,1.278,-0.134,0.895,-2.956,2.613
bigram_shift_layer_4,-0.3510,0.998,-0.352,0.731,-2.525,1.823
bigram_shift_layer_5,1.7382,1.244,1.397,0.188,-0.973,4.449
bigram_shift_layer_6,-1.0122,1.546,-0.655,0.525,-4.381,2.357
bigram_shift_layer_7,-0.1937,2.354,-0.082,0.936,-5.323,4.935
bigram_shift_layer_8,0.6034,2.019,0.299,0.770,-3.796,5.003

0,1,2,3
Omnibus:,0.927,Durbin-Watson:,2.338
Prob(Omnibus):,0.629,Jarque-Bera (JB):,0.374
Skew:,0.297,Prob(JB):,0.83
Kurtosis:,3.07,Cond. No.,889.0


In [11]:
# p-value of every regressor in the fitted model, untruncated.
results.pvalues

bigram_shift_layer_0     0.296370
bigram_shift_layer_1     0.421051
bigram_shift_layer_2     0.889690
bigram_shift_layer_3     0.895450
bigram_shift_layer_4     0.731088
bigram_shift_layer_5     0.187712
bigram_shift_layer_6     0.525022
bigram_shift_layer_7     0.935777
bigram_shift_layer_8     0.770182
bigram_shift_layer_9     0.478428
bigram_shift_layer_10    0.635115
bigram_shift_layer_11    0.950519
bigram_shift_layer_12    0.937407
dtype: float64