# Evaluating models on datasets

In [2]:
from pathlib import Path

from tqdm.notebook import tqdm
import evaluate
import pandas as pd
import transformers
from evaluate import evaluator
from omegaconf import OmegaConf
from repsim.nlp import get_dataset, get_model
from bert_finetune import ShortcutAdder
import os


2024-04-03 15:07:52.488619: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:

# Evaluate every fine-tuned model matching `model_pattern` on the chosen split
# at each shortcut rate, reusing results already stored in `shortcut_evals.csv`.

# Force synchronous CUDA kernel launches so device-side errors are reported at
# the call that triggered them.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

model_dirs = [
    Path("/root/similaritybench/experiments/models/nlp/shortcut"),
]

model_pattern = "sst2*"
split = "validation"
shortcut_rates = [0, 0.25, 0.5, 0.75, 1.0]
device = 0  # CUDA device index; -1 selects the CPU

metric = evaluate.load("accuracy")
task_evaluator = evaluator("text-classification")

columns = ["model", "dataset", "sc_rate", "acc"]
records = []

# Previously computed results act as a cache: (model, dataset, sc_rate)
# combinations already present are skipped below.
csv_path = Path("shortcut_evals.csv")
if csv_path.exists():
    df = pd.read_csv(csv_path, index_col=0)
else:
    df = pd.DataFrame(columns=columns)

try:
    for model_dir in model_dirs:
        # Materialise the glob so tqdm knows the total and the order is deterministic.
        for model_path in tqdm(sorted(model_dir.glob(model_pattern))):
            print(model_path)
            cfg = OmegaConf.load(model_path / "config.yaml")

            for shortcut_rate in shortcut_rates:
                already_done = (
                    (df["model"] == model_path.name)
                    & (df["dataset"] == cfg.dataset.path)
                    & (df["sc_rate"] == shortcut_rate)
                )
                if already_done.any():
                    print("result already exists, skipping")
                    continue

                print(shortcut_rate)
                dataset = get_dataset(cfg.dataset.path, cfg.dataset.name)
                shortcutter = ShortcutAdder(
                    num_labels=cfg.dataset.finetuning.num_labels,
                    p=shortcut_rate,
                    seed=cfg.shortcut_seed,
                    feature_column=cfg.dataset.feature_column[0],
                    label_column=cfg.dataset.target_column,
                )
                dataset = dataset.map(shortcutter)
                feature_column = shortcutter.new_feature_column
                # The shortcut tokens must be known to the tokenizer, and the
                # embedding matrix resized to match the enlarged vocabulary.
                tokenizer = transformers.AutoTokenizer.from_pretrained(
                    cfg.model.kwargs.tokenizer_name,
                    additional_special_tokens=shortcutter.new_tokens,
                )
                model = get_model(str(model_path))
                model.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=64)
                model = model.to(f"cuda:{device}" if device != -1 else "cpu")
                pipe = transformers.pipeline(
                    "text-classification",
                    model=model,
                    tokenizer=tokenizer,
                    device=device,
                    max_length=128,
                )

                results = task_evaluator.compute(
                    model_or_pipeline=pipe,
                    data=dataset[split],
                    metric=metric,
                    label_mapping={"LABEL_0": 0, "LABEL_1": 1},
                    input_column=feature_column,
                )

                records.append((model_path.name, cfg.dataset.path, shortcut_rate, results["accuracy"]))
finally:
    # Merge the new results into the cached ones instead of replacing them, and
    # do so even when the loop is interrupted so finished evaluations are kept.
    if records:
        new_results = pd.DataFrame.from_records(records, columns=columns)
        df = new_results if df.empty else pd.concat([df, new_results], ignore_index=True)


0it [00:00, ?it/s]

/root/similaritybench/experiments/models/nlp/shortcut/sst2_pre2_ft2_scrate0558
0
0.25
0.5
0.75
1.0
/root/similaritybench/experiments/models/nlp/shortcut/sst2_pre5_ft5_scrate0668
0


KeyboardInterrupt: 

In [None]:

# Persist the evaluation results; the DataFrame index is written as the first CSV column.
df.to_csv("shortcut_evals.csv")

## Analysis: Are models with different training setups distinguishable by their performance?

In [None]:
# The CSV was written with its index as the first column; read it back as the
# index so it does not show up as a spurious "Unnamed: 0" data column.
df = pd.read_csv("shortcut_evals.csv", index_col=0)

In [None]:
def scrate(s: str) -> float:
    """Decode the training shortcut rate from a model directory name.

    The last ``_``-separated field of the name is expected to look like
    ``scrate<digits>``, where ``<digits>`` is the rate with its decimal point
    removed (``"0"`` -> 0.0, ``"025"`` -> 0.25, ``"10"`` -> 1.0,
    ``"0558"`` -> 0.558). The first digit is the integer part and the
    remaining digits are the fractional part.

    Args:
        s: Model name, e.g. ``"sst2_pre2_ft2_scrate0558"``.

    Returns:
        The shortcut rate as a float.

    Raises:
        KeyError: If the suffix after ``scrate`` is empty or not purely digits.
    """
    digits = s.split("_")[-1].replace("scrate", "")
    if not (digits.isascii() and digits.isdigit()):
        # Same exception type the former lookup table raised for unknown suffixes.
        raise KeyError(digits)
    return float(f"{digits[0]}.{digits[1:] or '0'}")

def seed(s: str):
    """Return the integer seed encoded in the second ``_``-separated field of a model name.

    The field is expected to look like ``pre<N>``; the ``pre`` marker is
    stripped and the remainder parsed as an int.
    """
    fields = s.split("_")
    seed_digits = fields[1].replace("pre", "")
    return int(seed_digits)

# Annotate every evaluation row with the evaluated split and with the shortcut
# rate and seed decoded from the model's name. `assign` returns a new frame,
# leaving `df` untouched.
split = "validation"
model_names = df["model"]
clean_df = df.assign(
    split=split,
    model_sc_rate=model_names.map(scrate),
    seed=model_names.map(seed),
)
clean_df


In [None]:
# Mean accuracy for every (training shortcut rate, evaluation shortcut rate) pair.
clean_df.groupby(["model_sc_rate", "sc_rate"])["acc"].mean()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

data = clean_df

# Continuous colormap: each model's training shortcut rate is mapped directly
# to a colour, so the marker edge encodes `model_sc_rate`.
cmap = sns.color_palette("crest", as_cmap=True)
# Hollow markers keep overlapping points distinguishable.
kws = {"s": 40, "facecolor": "none", "linewidth": 1}

fig, ax = plt.subplots()
sns.scatterplot(
    data=data,
    x="sc_rate",
    y="acc",
    edgecolor=data["model_sc_rate"].map(cmap),
    ax=ax,
    **kws,
)
# The edge colours are set manually, so seaborn cannot build a legend; create
# one from empty proxy scatters, one per training shortcut rate.
handles, labels = zip(
    *[(ax.scatter([], [], ec=cmap(key), **kws), key) for key in sorted(data["model_sc_rate"].unique())]
)
ax.legend(handles, labels, title="model_sc_rate")
ax.set(
    xlabel="dataset shortcut rate (sc_rate)",
    ylabel="accuracy",
    title="Accuracy per model vs. dataset shortcut rate",
)

fig, ax = plt.subplots()
sns.boxplot(data=data, hue="model_sc_rate", y="acc", x="sc_rate", ax=ax)
ax.set(
    xlabel="dataset shortcut rate (sc_rate)",
    ylabel="accuracy",
    title="Accuracy distribution by training and dataset shortcut rate",
);

Is there a statistically significant difference in average accuracy between models trained with different shortcut rates?

In [None]:
# Display the annotated results table that the significance tests below operate on.
clean_df

In [None]:
import scipy.stats
import itertools

# For every pair of training shortcut rates and every dataset shortcut rate,
# run a permutation t-test on the accuracies of the two groups of models.
data = clean_df
selecting_feature = "model_sc_rate"  # shortcut rate the model was trained with
ds_selecting_feature = "sc_rate"  # shortcut rate of the evaluation dataset
cols = ["rate1", "rate2", "ds_rate", "acc1-mean", "acc2-mean", "pval"]
records = []
for aug1, aug2 in itertools.combinations(sorted(data[selecting_feature].unique()), r=2):
    # Iterate over the *dataset* rates: looping over the model rates here would
    # select empty groups whenever the two sets of rates differ.
    for ds_strength in sorted(data[ds_selecting_feature].unique()):
        x = data.loc[(data[selecting_feature] == aug1) & (data[ds_selecting_feature] == ds_strength), "acc"]
        y = data.loc[(data[selecting_feature] == aug2) & (data[ds_selecting_feature] == ds_strength), "acc"]
        # Fixed random_state keeps the permutation p-values reproducible across runs.
        pvalue = scipy.stats.ttest_ind(x, y, permutations=10000, random_state=0).pvalue
        records.append((aug1, aug2, ds_strength, x.mean(), y.mean(), pvalue))

pvals = pd.DataFrame.from_records(records, columns=cols)

# Comparisons on the dataset without any shortcut.
pvals[pvals.ds_rate == 0]

In [None]:
# Fraction of all comparisons that are significant at the 5% level.
significant = pvals[pvals.pval < 0.05]
print(len(significant)/len(pvals))

# Filter before sorting so the boolean mask is aligned with the frame it is
# applied to (masking the sorted frame with an unsorted mask makes pandas
# reindex the mask and emit a warning).
significant[significant.ds_rate == 0].sort_values(by=["rate1", "rate2", "ds_rate"])