## Evaluate Approaches

This notebook is intended to evaluate the subset approaches for esnli, where the
output is a dataframe that looks something like:

| Classifier | Approach              | Coverage (% identified) | Validity (% identified and leads to flip) | Median Size |
|------------|-----------------------|--------------------------|--------------------------------------------|-------------|
| Random     | Class Exclusion       | x                        | x                                          | x           |
| Logistic   | Fast                  |                          |                                            |             |
| Logistic   | Slow                  |                          |                                            |             |
| Logistic   | Fast + CE fallback    |                          |                                            |             |
| Logistic   | Slow + CE fallback    |                          |                                            |             |
| KNN        | Greedy                |                          |                                            |             |
| KNN        | Greedy + CE fallback  |                          |                                            |             |
| SVM        | Greedy                |                          |                                            |             |
| SVM        | Greedy + CE fallback  |                          |                                            |             |
| DT         | Greedy                |                          |                                            |             |
| DT         | Greedy + CE fallback  |                          |                                            |             |
| LMeans     | Greedy                |                          |                                            |             |
| LMeans     | Greedy + CE fallback  |                          |                                            |             |


In [2]:
%load_ext autoreload
%autoreload 2

# TODO: Think about plotting subset sizes against predicted probability? (confidence)

DATASET_NAME = "esnli"
LABEL_SPACE = ["entailment", "neutral", "contradiction"]
MODEL_NAME = "deberta_large"
SEED = 42
POOLER = "mean_with_attention"
LAYER = 24

In [3]:
## Load Embeddings
from utils.io import (
    load_dataset_from_hf,
    load_labels_at_split,
    load_embeddings,
    load_wrapperbox
)
import numpy as np


train_embeddings = load_embeddings(
    dataset=DATASET_NAME,
    model=MODEL_NAME,
    seed=SEED,
    split="train",
    pooler=POOLER,
    layer=LAYER
)

eval_embeddings = load_embeddings(
    dataset=DATASET_NAME,
    model=MODEL_NAME,
    seed=SEED,
    split="eval",
    pooler=POOLER,
    layer=LAYER
)

test_embeddings = load_embeddings(
    dataset=DATASET_NAME,
    model=MODEL_NAME,
    seed=SEED,
    split="test",
    pooler=POOLER,
    layer=LAYER
)

train_eval_embeddings = np.vstack([train_embeddings, eval_embeddings])


## Load Classifiers
knn_clf = load_wrapperbox(
    dataset=DATASET_NAME,
    model=MODEL_NAME,
    seed=SEED,
    pooler=POOLER,
    wrapperbox="KNN"
)

svm_clf = load_wrapperbox(
    dataset=DATASET_NAME,
    model=MODEL_NAME,
    seed=SEED,
    pooler=POOLER,
    wrapperbox="SVM",
)

dt_clf = load_wrapperbox(
    dataset=DATASET_NAME,
    model=MODEL_NAME,
    seed=SEED,
    pooler=POOLER,
    wrapperbox="DecisionTree",
)

lmeans_clf = load_wrapperbox(
    dataset=DATASET_NAME,
    model=MODEL_NAME,
    seed=SEED,
    pooler=POOLER,
    wrapperbox="LMeans",
)

## Load Datasets and Labels
dataset = load_dataset_from_hf(dataset=DATASET_NAME)
train_labels = load_labels_at_split(dataset, "train")
eval_labels = load_labels_at_split(dataset, "eval")
train_eval_labels = np.concatenate([train_labels, eval_labels])
test_labels = load_labels_at_split(dataset, "test")

from datasets import DatasetDict, concatenate_datasets
train_eval_dataset = concatenate_datasets([dataset["train"], dataset["eval"]])
dataset_dict = DatasetDict(
    {"train": train_eval_dataset, "test": dataset["test"]}
)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/samsoup/.cache/huggingface/token
Login successful




In [6]:
## Do metrics for Yang fast

import pickle
from sklearn.linear_model import LogisticRegression

from MinimalSubsetToFlipPredictions.evaluate import evaluate_predictions


l2 = 500
logit_clf = LogisticRegression(penalty="l2", C= 1 / l2)
logit_clf.fit(train_eval_embeddings, train_eval_labels)

filename = "esnli_deberta_large_yang2023_alg1.pickle"
with open(filename, 'rb') as handle:
    yang_flip_list = pickle.load(handle)

# filter flip list to num zero entry
ex_indices = [i for i, l in enumerate(yang_flip_list) if l is not None]

is_yang_valid = evaluate_predictions(
    clf=logit_clf,
    flip_list=yang_flip_list,
    train_embeddings=train_eval_embeddings,
    train_labels=train_eval_labels,
    test_embeddings=test_embeddings,
    ex_indices_to_check=ex_indices,
)

print(f"Of the {len(ex_indices)} proposed subsets, only {np.sum(is_yang_valid)} is valid")
acc = np.sum(is_yang_valid)/len(test_labels) * 100
print(f"Validity: {acc:.2f}%")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
0it [00:00, ?it/s]

Of the 0 proposed subsets, only 0.0 is valid
Validity: 0.00%





In [26]:
## Do metrics for Yang slow

import pickle
from sklearn.linear_model import LogisticRegression
from MinimalSubsetToFlipPredictions.evaluate import evaluate_predictions


filename = "esnli_deberta_large_yang2023_alg2.pickle"
with open(filename, 'rb') as handle:
    yang_flip_list = pickle.load(handle)

ex_indices = []
# filter flip list to num zero entry
for i, l in enumerate(yang_flip_list):
    if l is not None and len(l) > 0:
        ex_indices.append(i)

num_examples = []
# compute some basic statistics
for i, l in enumerate(yang_flip_list):
    if l is None or len(l) == 0:
        continue
    # compute the length of indices
    num_examples.append(len(l))

# is_yang_valid = evaluate_predictions(
#     clf=logit_clf,
#     flip_list=yang_flip_list,
#     train_embeddings=train_eval_embeddings,
#     train_labels=train_eval_labels,
#     test_embeddings=test_embeddings,
#     ex_indices_to_check=ex_indices,
# )

# print(f"Of the {len(ex_indices)} proposed subsets, only {np.sum(is_yang_valid)} is valid")
# acc = np.sum(is_yang_valid)/len(test_labels) * 100
# print(f"Validity: {acc:.2f}%")

In [17]:
## Do metrics for KNN

from MinimalSubsetToFlipPredictions.evaluate import evaluate_predictions
from utils.io import load_pickle

wrapper_name = "KNN"
flip_list = load_pickle(f"{DATASET_NAME}_{MODEL_NAME}_{wrapper_name}.pickle")

is_valid = evaluate_predictions(
    clf=knn_clf,
    flip_list=flip_list,
    train_embeddings=train_eval_embeddings,
    train_labels=train_eval_labels,
    test_embeddings=test_embeddings,
    ex_indices_to_check=np.arange(10)
    # ex_indices_to_check=np.arange(test_labels.size),
)

is_valid[:10]

100%|██████████| 10/10 [00:14<00:00,  1.46s/it]


[True, True, True, True, True, True, True, True, True, True]

In [18]:
## Do metrics for LGBM

from MinimalSubsetToFlipPredictions.evaluate import evaluate_predictions
from utils.io import load_pickle

wrapper_name = "LGBM"
flip_list = load_pickle(f"{DATASET_NAME}_{MODEL_NAME}_{wrapper_name}.pickle")

is_valid = evaluate_predictions(
    clf=knn_clf,
    flip_list=flip_list,
    train_embeddings=train_eval_embeddings,
    train_labels=train_eval_labels,
    test_embeddings=test_embeddings,
    ex_indices_to_check=[5735]
    # ex_indices_to_check=np.arange(test_labels.size),
)

is_valid[:10]

  0%|          | 0/1 [00:00<?, ?it/s]

[72536, 230614, 103956, 64841, 16724, 201575, 164188, 439221, 169344, 299596, 8777, 241855, 397320, 209813, 113382, 84475, 232543, 368968, 173396, 441193, 511308, 44068, 482544, 234189, 359350, 410141, 157371, 345300, 501813, 361190, 395784, 139783, 250858, 148216, 84424, 370972, 124868, 170613, 62384, 94938, 480421, 315753, 529729, 452918, 202507, 339696, 167020, 441154, 421958, 214062, 511259, 152333, 181376, 88650, 54664, 472289, 186096, 49681, 285494, 495408, 255065, 217264, 221109, 483082, 203954, 459368, 173009, 13615, 268443, 549111, 390735, 351403, 231018, 131294, 70262, 297309, 453964, 44892, 385807, 440283, 296417, 432782, 195111, 378602, 186732, 536779, 311775, 344964, 113103, 254598, 120823, 227367, 70985, 208595, 226254, 354649, 47640, 441740, 109582, 255479, 551670, 536295, 507450, 436234, 439802, 273583, 13454, 269826, 519126, 54842, 511253, 530310, 461770, 546343, 326955, 461283, 533095, 78197, 153521, 184801, 386960, 206871, 297721, 111553, 71061, 505919, 399915, 22182

100%|██████████| 1/1 [00:01<00:00,  1.36s/it]

0 0





[False]

[72536,
 230614,
 103956,
 64841,
 16724,
 201575,
 164188,
 439221,
 169344,
 299596,
 8777,
 241855,
 397320,
 209813,
 113382,
 84475,
 232543,
 368968,
 173396,
 441193,
 511308,
 44068,
 482544,
 234189,
 359350,
 410141,
 157371,
 345300,
 501813,
 361190,
 395784,
 139783,
 250858,
 148216,
 84424,
 370972,
 124868,
 170613,
 62384,
 94938,
 480421,
 315753,
 529729,
 452918,
 202507,
 339696,
 167020,
 441154,
 421958,
 214062,
 511259,
 152333,
 181376,
 88650,
 54664,
 472289,
 186096,
 49681,
 285494,
 495408,
 255065,
 217264,
 221109,
 483082,
 203954,
 459368,
 173009,
 13615,
 268443,
 549111,
 390735,
 351403,
 231018,
 131294,
 70262,
 297309,
 453964,
 44892,
 385807,
 440283,
 296417,
 432782,
 195111,
 378602,
 186732,
 536779,
 311775,
 344964,
 113103,
 254598,
 120823,
 227367,
 70985,
 208595,
 226254,
 354649,
 47640,
 441740,
 109582,
 255479,
 551670,
 536295,
 507450,
 436234,
 439802,
 273583,
 13454,
 269826,
 519126,
 54842,
 511253,
 530310,
 461770,
 54

In [3]:
## Do metrics for LMeans

from MinimalSubsetToFlipPredictions.evaluate import evaluate_predictions
from utils.io import load_pickle

wrapper_name = "LMeans"
flip_list = load_pickle(f"{DATASET_NAME}_{MODEL_NAME}_{wrapper_name}.pickle")

is_valid = evaluate_predictions(
    clf=knn_clf,
    flip_list=flip_list,
    train_embeddings=train_eval_embeddings,
    train_labels=train_eval_labels,
    test_embeddings=test_embeddings,
    ex_indices_to_check=np.arange(test_labels.size),
)

: 