## Baseline Runner

Runs the baseline for each of the proposed methods:

| Classifier | Approach              | Coverage (% identified) | Validity (% identified and leads to flip) | Median Size |
|------------|-----------------------|--------------------------|--------------------------------------------|-------------|
| Logistic   | Class Exclusion       |                          |                                            |             |
| Logistic   | Fast                  |                          |                                            |             |
| Logistic   | Slow                  |                          |                                            |             |
| Logistic   | Fast + CE fallback    |                          |                                            |             |
| Logistic   | Slow + CE fallback    |                          |                                            |             |
| KNN        | Class Exclusion       |                          |                                            |             |
| KNN        | Greedy                |                          |                                            |             |
| KNN        | Greedy + CE fallback  |                          |                                            |             |
| SVM        | Class Exclusion       |                          |                                            |             |
| SVM        | Greedy                |                          |                                            |             |
| SVM        | Greedy + CE fallback  |                          |                                            |             |
| DT         | Class Exclusion       |                          |                                            |             |
| DT         | Greedy                |                          |                                            |             |
| DT         | Greedy + CE fallback  |                          |                                            |             |
| LMeans     | Class Exclusion       |                          |                                            |             |
| LMeans     | Greedy                |                          |                                            |             |
| LMeans     | Greedy + CE fallback  |                          |                                            |             |

Note that running class exclusion against a random-classifier baseline does not make sense:
if the random classifier is deterministic (fixed seed), its predictions do not depend on the
training data, so no removal would be valid. Otherwise, validity is stochastic.


In [4]:
%load_ext autoreload
%autoreload 2

DATASET_NAME = "esnli"
LABEL_SPACE = ["entailment", "neutral", "contradiction"]
MODEL_NAME = "deberta_large"
SEED = 42
POOLER = "mean_with_attention"
LAYER = 24

from MinimalSubsetToFlipPredictions.baselines.SimpleBaseline import FindMinimalSubsetSimpleBaseline


ce_finder = FindMinimalSubsetSimpleBaseline()

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
## Load Embeddings
import numpy as np
from datasets import DatasetDict, concatenate_datasets
from utils.io import (
    load_dataset_from_hf,
    load_embeddings,
    load_labels_at_split,
)


def _load_split_embeddings(split):
    """Load the embeddings of one split using the notebook-level configuration.

    Only `split` varies between calls; dataset, model, seed, pooler and layer
    come from the constants defined in the configuration cell.
    """
    return load_embeddings(
        dataset=DATASET_NAME,
        model=MODEL_NAME,
        seed=SEED,
        split=split,
        pooler=POOLER,
        layer=LAYER,
    )


train_embeddings, eval_embeddings, test_embeddings = [
    _load_split_embeddings(split_name) for split_name in ("train", "eval", "test")
]

# Train and eval embeddings are stacked vertically, train first, so they can be
# passed together as the training pool in later cells.
train_eval_embeddings = np.vstack((train_embeddings, eval_embeddings))

## Load Datasets and Labels
dataset = load_dataset_from_hf(dataset=DATASET_NAME)
train_labels, eval_labels, test_labels = [
    load_labels_at_split(dataset, split_name) for split_name in ("train", "eval", "test")
]

# Same train-then-eval ordering as train_eval_embeddings above.
train_eval_labels = np.concatenate((train_labels, eval_labels))

# Dataset-level counterpart of the merge: "train" holds train+eval, "test" is unchanged.
train_eval_dataset = concatenate_datasets([dataset["train"], dataset["eval"]])
dataset_dict = DatasetDict({"train": train_eval_dataset, "test": dataset["test"]})

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/samsoup/.cache/huggingface/token
Login successful




In [8]:
## Run Logistic with Class Exclusion
import pickle

# NOTE(review): RandomClassifier is imported but not used in this cell.
from classifiers import RandomClassifier
from utils.io import load_wrapperbox

# Wrapper box to load; the same string is embedded in the output file name.
wrapper_name = "LogisticRegression"

# Keyword arguments handed to load_wrapperbox for this run.
wrapperbox_kwargs = {
    "dataset": DATASET_NAME,
    "model": MODEL_NAME,
    "seed": SEED,
    "pooler": POOLER,
    "wrapperbox": wrapper_name,
}
clf = load_wrapperbox(**wrapperbox_kwargs)

# The finder receives the combined train+eval embeddings/labels as its
# training data and the test embeddings as the points to examine.
subsets = ce_finder.find_minimal_subset(
    clf=clf,
    train_embeddings=train_eval_embeddings,
    test_embeddings=test_embeddings,
    train_labels=train_eval_labels,
)

# Pickle the finder's result to a file in the current working directory.
output_file = f"{DATASET_NAME}_{MODEL_NAME}_{wrapper_name}_baseline.pickle"
with open(output_file, "wb") as handle:
    pickle.dump(subsets, handle)

In [9]:
## Run KNN with Class Exclusion
import pickle

# NOTE(review): RandomClassifier is imported but not used in this cell.
from classifiers import RandomClassifier
from utils.io import load_wrapperbox

# Wrapper box to load; the same string is embedded in the output file name.
wrapper_name = "KNN"

# Keyword arguments handed to load_wrapperbox for this run.
wrapperbox_kwargs = {
    "dataset": DATASET_NAME,
    "model": MODEL_NAME,
    "seed": SEED,
    "pooler": POOLER,
    "wrapperbox": wrapper_name,
}
clf = load_wrapperbox(**wrapperbox_kwargs)

# The finder receives the combined train+eval embeddings/labels as its
# training data and the test embeddings as the points to examine.
subsets = ce_finder.find_minimal_subset(
    clf=clf,
    train_embeddings=train_eval_embeddings,
    test_embeddings=test_embeddings,
    train_labels=train_eval_labels,
)

# Pickle the finder's result to a file in the current working directory.
output_file = f"{DATASET_NAME}_{MODEL_NAME}_{wrapper_name}_baseline.pickle"
with open(output_file, "wb") as handle:
    pickle.dump(subsets, handle)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:
## Run SVM with Class Exclusion
import pickle

# NOTE(review): RandomClassifier is imported but not used in this cell.
from classifiers import RandomClassifier
from utils.io import load_wrapperbox

# Wrapper box to load; the same string is embedded in the output file name.
wrapper_name = "SVM"

# Keyword arguments handed to load_wrapperbox for this run.
wrapperbox_kwargs = {
    "dataset": DATASET_NAME,
    "model": MODEL_NAME,
    "seed": SEED,
    "pooler": POOLER,
    "wrapperbox": wrapper_name,
}
clf = load_wrapperbox(**wrapperbox_kwargs)

# The finder receives the combined train+eval embeddings/labels as its
# training data and the test embeddings as the points to examine.
subsets = ce_finder.find_minimal_subset(
    clf=clf,
    train_embeddings=train_eval_embeddings,
    test_embeddings=test_embeddings,
    train_labels=train_eval_labels,
)

# Pickle the finder's result to a file in the current working directory.
output_file = f"{DATASET_NAME}_{MODEL_NAME}_{wrapper_name}_baseline.pickle"
with open(output_file, "wb") as handle:
    pickle.dump(subsets, handle)

In [17]:
## Run LGBM with Class Exclusion
# NOTE(review): the summary table at the top of the notebook lists "DT" rather
# than "LGBM" -- confirm whether they refer to the same wrapper box.
import pickle

# NOTE(review): RandomClassifier is imported but not used in this cell.
from classifiers import RandomClassifier
from utils.io import load_wrapperbox

# Wrapper box to load; the same string is embedded in the output file name.
wrapper_name = "LGBM"

# Keyword arguments handed to load_wrapperbox for this run.
wrapperbox_kwargs = {
    "dataset": DATASET_NAME,
    "model": MODEL_NAME,
    "seed": SEED,
    "pooler": POOLER,
    "wrapperbox": wrapper_name,
}
clf = load_wrapperbox(**wrapperbox_kwargs)

# The finder receives the combined train+eval embeddings/labels as its
# training data and the test embeddings as the points to examine.
subsets = ce_finder.find_minimal_subset(
    clf=clf,
    train_embeddings=train_eval_embeddings,
    test_embeddings=test_embeddings,
    train_labels=train_eval_labels,
)

# Pickle the finder's result to a file in the current working directory.
output_file = f"{DATASET_NAME}_{MODEL_NAME}_{wrapper_name}_baseline.pickle"
with open(output_file, "wb") as handle:
    pickle.dump(subsets, handle)

[1 0 2 ... 2 0 1]
[     2      4      7 ... 559197 559198 559200]
[     0      3      8 ... 559190 559191 559202]
[     1      5      6 ... 559195 559199 559201]


In [16]:
# Bare expression: shows whatever object `clf` currently refers to as this cell's output.
clf

In [10]:
## Run LMeans with Class Exclusion
import pickle

# NOTE(review): RandomClassifier is imported but not used in this cell.
from classifiers import RandomClassifier
from utils.io import load_wrapperbox

# Wrapper box to load; the same string is embedded in the output file name.
wrapper_name = "LMeans"

# Keyword arguments handed to load_wrapperbox for this run.
wrapperbox_kwargs = {
    "dataset": DATASET_NAME,
    "model": MODEL_NAME,
    "seed": SEED,
    "pooler": POOLER,
    "wrapperbox": wrapper_name,
}
clf = load_wrapperbox(**wrapperbox_kwargs)

# The finder receives the combined train+eval embeddings/labels as its
# training data and the test embeddings as the points to examine.
subsets = ce_finder.find_minimal_subset(
    clf=clf,
    train_embeddings=train_eval_embeddings,
    test_embeddings=test_embeddings,
    train_labels=train_eval_labels,
)

# Pickle the finder's result to a file in the current working directory.
output_file = f"{DATASET_NAME}_{MODEL_NAME}_{wrapper_name}_baseline.pickle"
with open(output_file, "wb") as handle:
    pickle.dump(subsets, handle)