## Distinguish foundation models

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [2]:
df = pd.read_csv("../reports/task1_predict_task_performance.csv")
df["label"] = ["roberta"]*6 + ["xlm"]*6 + ["albert"]*6 + ["deberta"]*6 + ["xlnet"]
df.head()

Unnamed: 0,LM,rte,cola,qnli,mrpc,sst2,qqp,bigram_shift_layer_0,coordination_inversion_layer_0,obj_number_layer_0,...,subj_number_layer_11,tree_depth_layer_11,bigram_shift_layer_12,coordination_inversion_layer_12,obj_number_layer_12,odd_man_out_layer_12,past_present_layer_12,subj_number_layer_12,tree_depth_layer_12,label
0,embeddings_roberta_base,0.7726,0.8437,0.9251,0.8995,0.9438,0.9143,0.5,0.5,0.5,...,0.793333,0.283714,0.850667,0.654333,0.787333,0.619667,0.879667,0.815333,0.295238,roberta
1,embeddings_roberta_base_corr_500,0.7148,0.8322,0.9213,0.875,0.9415,0.9164,0.5,0.5,0.5,...,0.704333,0.250381,0.685667,0.546667,0.813,0.582667,0.868,0.806,0.294857,roberta
2,embeddings_roberta_base_corr_1000,0.704,0.838,0.9213,0.8824,0.9392,0.916,0.5,0.5,0.5,...,0.721333,0.259905,0.605667,0.547,0.81,0.564333,0.864333,0.814667,0.297714,roberta
3,embeddings_roberta_base_corr_2000,0.6859,0.8341,0.9185,0.8824,0.9415,0.9166,0.5,0.5,0.5,...,0.747667,0.27619,0.579,0.539333,0.81,0.560667,0.870333,0.812,0.301143,roberta
4,embeddings_roberta_base_corr_4000,0.5848,0.8399,0.9209,0.8873,0.945,0.9165,0.5,0.5,0.5,...,0.785667,0.28419,0.559333,0.540667,0.804333,0.570667,0.865333,0.810333,0.302667,roberta


In [8]:
def distinguish_models_cv(df_, features, verbose=False):
    np.random.seed(42)
    df = df_[features + ["label"]]
    ctrl_features = np.random.normal(0, 0.1, size=(len(df), len(features)))
    ctrl_df = pd.DataFrame({
        features[j]: ctrl_features[:,j] for j in range(len(features))
    })
    ctrl_df["label"] = df_["label"]

    kfold = KFold(n_splits=5, shuffle=True)
    trainaccs, devaccs = [], []
    ctrl_trainaccs, ctrl_devaccs = [], []
    for train_idx, val_idx in kfold.split(df):
        df_tr = df.loc[train_idx]
        df_val = df.loc[val_idx]
        # Model
        model = LogisticRegression()
        model.fit(df_tr[features], df_tr["label"])
        preds = model.predict(df_tr[features])
        trainaccs.append(accuracy_score(df_tr["label"], preds))
        preds = model.predict(df_val[features])
        devacc = accuracy_score(df_val["label"], preds)
        devaccs.append(devacc)

        # Control
        df_tr = ctrl_df.loc[train_idx]
        df_val = ctrl_df.loc[val_idx]
        model = LogisticRegression()
        model.fit(df_tr[features], df_tr["label"])
        preds = model.predict(df_tr[features])
        ctrl_trainaccs.append(accuracy_score(df_tr["label"], preds))
        preds = model.predict(df_val[features])
        ctrl_devacc = accuracy_score(df_val["label"], preds)
        ctrl_devaccs.append(ctrl_devacc)

    ctrl_devacc_mean = np.mean(ctrl_devaccs)
    devacc_mean = np.mean(devaccs)
    improvement = devacc_mean - ctrl_devacc_mean
        
    if verbose:
        print("Train acc: mean {:.4f} std {:.4f}; dev acc: mean {:.4f} std {:.4f}".format(
            np.mean(trainaccs), np.std(trainaccs), np.mean(devaccs), np.std(devaccs)
        ))
        print("Control setting: dev acc: mean {:.4f} std {:.4f}".format(
            np.mean(ctrl_devaccs), np.std(ctrl_devaccs)
        ))
        print("Dev acc improvement: {:.4f}".format(improvement))
    return np.mean(devaccs), np.std(devaccs), improvement
    
distinguish_models_cv(df, [
    "bigram_shift_layer_5", 
    "coordination_inversion_layer_6",
    "obj_number_layer_1",
    "odd_man_out_layer_5",
    "past_present_layer_1",
    "subj_number_layer_1",
    "tree_depth_layer_1"
], verbose=True)

Train acc: mean 0.4700 std 0.1631; dev acc: mean 0.0800 std 0.0980
Control setting: dev acc: mean 0.0000 std 0.0000
Dev acc improvement: 0.0800


(0.08, 0.09797958971132713, 0.08)

In [17]:
def find_3_best_features(df):
    probing_tasks = ["bigram_shift", "coordination_inversion", "obj_number",
                    "odd_man_out", "past_present", "subj_number", "tree_depth"]
    all_features = [f"{pt}_layer_{layer}" 
                    for layer in range(1, 13) 
                    for pt in probing_tasks]
    best_mean_acc = None
    best_feats = []
    max_improvement = None
    max_improvement_feats = []
    
    all_improvements = []
    for i in tqdm(range(len(all_features)-2)):
        for j in range(i+1, len(all_features)-1):
            for k in range(j+1, len(all_features)):
                feats = [all_features[i], all_features[j], all_features[k]]
                mean_acc, _, improvement = distinguish_models_cv(df, feats, verbose=False)
                
                if best_mean_acc is None or mean_acc > best_mean_acc:
                    best_mean_acc = mean_acc
                    best_feats = feats
                if max_improvement is None or improvement > max_improvement:
                    max_improvement = improvement
                    max_improvement_feats = feats
                all_improvements.append(improvement)
    return best_mean_acc, best_feats, max_improvement, max_improvement_feats, all_improvements

best_mean_acc, best_feats, max_improvement, max_improvement_feats, all_improvements = find_3_best_features(df)
print(best_mean_acc, best_feats, max_improvement, max_improvement_feats)

100%|██████████████████████████████████████████████████████| 82/82 [44:14<00:00, 32.38s/it]

0.08 ['coordination_inversion_layer_1', 'bigram_shift_layer_2', 'obj_number_layer_12'] 0.08 ['coordination_inversion_layer_1', 'bigram_shift_layer_2', 'obj_number_layer_12']





In [20]:
np.mean(all_improvements), np.std(all_improvements), np.max(all_improvements)

(0.002748835061500357, 0.010903529715299653, 0.08)

In [19]:
from scipy.stats import ttest_1samp
ttest_1samp(all_improvements, popmean=0, alternative='greater')

Ttest_1sampResult(statistic=77.81965146882206, pvalue=0.0)

In [27]:
# What is the accuracy of a trivial classifier that always output "roberta"?
def test_trivial_classifier():
    kfold = KFold(n_splits=5, shuffle=True)
    accs = []
    for train_idx, val_idx in kfold.split(df):
        df_val = df.loc[val_idx]
        labels = df_val["label"]
        trivial_preds = ["roberta"] * len(labels)
        accs.append(accuracy_score(labels, trivial_preds))
    return np.mean(accs), np.std(accs)

test_trivial_classifier()

(0.24000000000000005, 0.14966629547095767)

The trivial classifier has expected accuracy that is much higher than the max accuracy (0.08), showing that the probing features can't really distinguish the originating foundation models.