# Analyze probing results
There are two steps:  
1. For each probing configuration {LM, layer, task, config}, export exactly one probing result (one row), which later serves as a single feature.  
2. Merge with the GLUE classification results dataframe. Prepare a csv for classification / regression / etc.

## 1.1 Best probe
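For each configuration, average the results over the random seeds and keep the probe model with the highest validation accuracy. Export the selection as a CSV file under `../reports/`.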

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

In [4]:
def analyze_results(report_dir):
    """Load every probing report found in ``report_dir`` into one dataframe.

    Each directory entry whose name starts with ``"report"`` is read with
    ``pd.read_csv`` and the resulting frames are stacked row-wise.  The
    ``task`` column of the reports is then split on underscores: the last
    piece is parsed as the integer layer, the second-to-last piece is
    discarded, and the remaining pieces are re-joined into the task name
    (a value such as ``"bigram_shift_layer_3"`` would become task
    ``"bigram_shift"`` and layer ``3``).

    Args:
        report_dir: Directory (str or ``Path``) holding the ``report*`` files.

    Returns:
        The concatenated dataframe in which the raw ``task`` column has been
        replaced by a cleaned ``task`` column and a new integer ``layer``
        column; both are placed after all other columns.

    Raises:
        ValueError: If ``report_dir`` contains no ``report*`` entry, or if a
            ``task`` value does not end in an integer.
    """
    report_dir = Path(report_dir)
    dfs = []
    for p in report_dir.iterdir():
        if not p.name.startswith("report"):
            continue
        report_df = pd.read_csv(p)
        print("   ", p.name, report_df.shape)
        dfs.append(report_df)
    if not dfs:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError(f"No report files found in {report_dir}")
    df = pd.concat(dfs, axis=0)

    # pop() removes the raw column; re-adding it afterwards keeps "task" and
    # "layer" as the last two columns.  Plain lists are assigned positionally,
    # so the duplicated index produced by the concat is not a problem.
    pieces = [raw_task.split("_") for raw_task in df.pop("task")]
    df["task"] = ["_".join(parts[:-2]) for parts in pieces]
    df["layer"] = [int(parts[-1]) for parts in pieces]
    return df


# Load the probing reports of every LM; each sub-directory of the results
# folder is treated as one LM and its name is used as the LM identifier.
all_model_best_results = []
for p in Path("../reports/probing_results_1200_per_class").iterdir():
    if not p.is_dir():
        continue
    print(p.name)
    df_best_results = analyze_results(p)
    # A scalar is broadcast to every row, so no explicit list is needed.
    df_best_results["LM"] = p.name
    all_model_best_results.append(df_best_results)
df_all = pd.concat(all_model_best_results, axis=0)
# There are 25 LM x 13 layers x 7 (probing) tasks x 3 configs x 7 models x 5 rs
# For each {LM, layer, task, config}, average by rs, and take the best model.
df_avg = df_all.groupby(["LM", "layer", "task", "config", "model"], as_index=False).mean()
df_best_ids = df_avg.groupby(["LM", "layer", "task", "config"], as_index=False)["val_acc"].idxmax()["val_acc"]
# idxmax() yields index *labels* of df_avg, so the rows are selected with
# .loc; positional .iloc only matched while df_avg kept a default RangeIndex.
df_best = df_avg.loc[df_best_ids]

embeddings_xlnet_base_cased
    report_tree_depth.csv (1365, 12)
    report_subj_number.csv (1365, 12)
    report_obj_number.csv (1365, 12)
    report_past_present.csv (1365, 12)
    report_odd_man_out.csv (1365, 12)
    report_coordination_inversion.csv (1365, 12)
    report_bigram_shift.csv (1365, 12)
embeddings_roberta_base_corr_6000
    report_tree_depth.csv (1365, 12)
    report_subj_number.csv (1365, 12)
    report_obj_number.csv (1365, 12)
    report_past_present.csv (1365, 12)
    report_odd_man_out.csv (1365, 12)
    report_coordination_inversion.csv (1365, 12)
    report_bigram_shift.csv (1365, 12)
embeddings_microsoft_deberta_base_corr_1000
    report_tree_depth.csv (1365, 12)
    report_subj_number.csv (1365, 12)
    report_obj_number.csv (1365, 12)
    report_past_present.csv (1365, 12)
    report_odd_man_out.csv (1365, 12)
    report_coordination_inversion.csv (1365, 12)
    report_bigram_shift.csv (1365, 12)
embeddings_roberta_base_corr_2000
    report_tree_depth.csv (13

In [7]:
# Sanity check: expect one row per {LM, layer, task, config} combination.
df_best.shape

(6825, 14)

In [8]:
# Persist the selected best-probe rows; the dataframe index is not written.
df_best.to_csv("../reports/probing_1200_per_class_best.csv", index=False)

## 1.2: Specify a probe
Alternatively, fix the probe model instead of picking the best one per configuration: export one CSV per probe model.

In [7]:
# Write one CSV per distinct probe model, each holding that model's rows of df_avg.
probe_names = df_avg["model"].unique()
for probe in probe_names:
    is_current_probe = df_avg["model"] == probe
    df_tmp = df_avg.loc[is_current_probe]
    df_tmp.to_csv(f"../reports/probing_1200_per_class_{probe}.csv", index=False)

## 2. Prepare data for linear regression
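Compile a dataframe with one row per LM: the GLUE dev accuracies are the regression targets and the per-layer probing test accuracies are the features.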

In [None]:
# Probing results: keep only the rows of the "Full" config.
# NOTE(review): no cell above writes "probing_1200_per_class.csv" (they write
# "..._best.csv" and "..._{probe}.csv") -- confirm which export is meant here.
probing_df = pd.read_csv("../reports/probing_1200_per_class.csv")
full_config_mask = probing_df["config"].eq("Full")
probing_df = probing_df.loc[full_config_mask]
print(probing_df.shape)
probing_df.head()

# GLUE results: drop every task whose name contains "processed".
glue_df = pd.read_csv("../reports/glue_classification_results.csv")
processed_mask = glue_df["task"].str.contains("processed")
glue_df = glue_df.loc[~processed_mask]  # Only handle the raw fine-tuning classification results for now.
print(glue_df.shape)
glue_df.head()

def _add_value(record, key, value, lm):
    """Store ``value`` under ``key`` in ``record``, refusing to overwrite an entry."""
    if key in record:
        raise ValueError(f"Duplicate value for {key!r} of LM {lm!r}")
    record[key] = value


def prepare_data(probing_df, glue_df):
    """Build one row per LM with GLUE targets and probing features.

    The LMs (and their order) are taken from ``glue_df``; LMs that only occur
    in ``probing_df`` are ignored.  For every LM the row contains

    * one column per GLUE task, named after ``task`` and holding ``dev_acc``;
    * one column per probing row, named ``"{task}_layer_{layer}"`` and
      holding ``test_acc``.

    Each LM is assembled as its own record, so a value that is missing for
    one LM shows up as NaN in that LM's row instead of shifting the values of
    the other LMs.

    Args:
        probing_df: Dataframe with the columns ``LM``, ``task``, ``layer``
            and ``test_acc``.
        glue_df: Dataframe with the columns ``LM``, ``task`` and ``dev_acc``.

    Returns:
        A dataframe whose first column is ``LM``, followed by the target and
        feature columns in order of first appearance.

    Raises:
        ValueError: If an LM has more than one value for the same column.
    """
    lms = glue_df["LM"].drop_duplicates().tolist()
    if not lms:
        # Keep the "LM" column even when there is nothing to compile.
        return pd.DataFrame({"LM": lms})

    records = []
    for lm in lms:
        record = {"LM": lm}

        # Classification targets: one dev accuracy per GLUE task.
        glue_rows = glue_df[glue_df["LM"] == lm]
        for task, dev_acc in zip(glue_rows["task"], glue_rows["dev_acc"]):
            _add_value(record, task, dev_acc, lm)

        # Probing features: one test accuracy per (task, layer) pair.
        probe_rows = probing_df[probing_df["LM"] == lm]
        for task, layer, test_acc in zip(
            probe_rows["task"], probe_rows["layer"], probe_rows["test_acc"]
        ):
            _add_value(record, "{}_layer_{}".format(task, layer), test_acc, lm)

        records.append(record)

    return pd.DataFrame(records)

# Compile the per-LM table from the probing and GLUE dataframes and export it
# as CSV; the dataframe index is not written.
linreg_data = prepare_data(probing_df, glue_df)
linreg_data.to_csv("../reports/task1_predict_task_performance.csv", index=False)