## Evaluate Approaches

This notebook evaluates the subset approaches on e-SNLI (`esnli`). The final
output is a dataframe that looks something like:

| Classifier | Approach              | Coverage (% identified) | Validity (% identified and leads to flip) | Median Size |
|------------|-----------------------|--------------------------|--------------------------------------------|-------------|
| Random     | Class Exclusion       | x                        | x                                          | x           |
| Logistic   | Fast                  |                          |                                            |             |
| Logistic   | Slow                  |                          |                                            |             |
| Logistic   | Fast + CE fallback    |                          |                                            |             |
| Logistic   | Slow + CE fallback    |                          |                                            |             |
| KNN        | Greedy                |                          |                                            |             |
| KNN        | Greedy + CE fallback  |                          |                                            |             |
| SVM        | Greedy                |                          |                                            |             |
| SVM        | Greedy + CE fallback  |                          |                                            |             |
| DT         | Greedy                |                          |                                            |             |
| DT         | Greedy + CE fallback  |                          |                                            |             |
| LMeans     | Greedy                |                          |                                            |             |
| LMeans     | Greedy + CE fallback  |                          |                                            |             |


In [3]:
%load_ext autoreload
%autoreload 2

# TODO: Think about plotting subset sizes against predicted probability? (confidence)

# Dataset identifier; passed as `dataset=` to the embedding, classifier and dataset loaders below.
DATASET_NAME = "esnli"
# Presumably the class names of the task. NOTE(review): not referenced in the cells
# visible here -- confirm it is still needed.
LABEL_SPACE = ["entailment", "neutral", "contradiction"]
# Encoder identifier; passed as `model=` when loading saved embeddings and classifiers.
MODEL_NAME = "deberta-large"
# Seed tag; passed as `seed=` to select which saved embeddings / classifiers to load.
SEED = 42
# Pooling tag; passed as `pooler=` when loading saved embeddings and classifiers.
POOLER = "mean_with_attention"
# Layer tag; passed as `layer=` when loading saved embeddings.
LAYER = 24

In [4]:
## Load Embeddings
from data.embeddings import load_saved_embeddings
import numpy as np

# One loader call per split (train -> eval -> test), all sharing the
# notebook-wide configuration constants; only `split` varies.
train_embeddings, eval_embeddings, test_embeddings = (
    load_saved_embeddings(
        dataset=DATASET_NAME,
        model=MODEL_NAME,
        seed=SEED,
        split=split_name,
        pooler=POOLER,
        layer=LAYER,
    )
    for split_name in ("train", "eval", "test")
)

# Train rows first, eval rows appended after them.
train_eval_embeddings = np.vstack((train_embeddings, eval_embeddings))


## Load Classifiers
from data.models import load_saved_wrapperbox_model

# One loader call per wrapper-box classifier, in the order
# KNN -> SVM -> DecisionTree -> LMeans; only `wrapperbox` varies.
knn_clf, svm_clf, dt_clf, lmeans_clf = (
    load_saved_wrapperbox_model(
        dataset=DATASET_NAME,
        model=MODEL_NAME,
        seed=SEED,
        pooler=POOLER,
        wrapperbox=wrapperbox_name,
    )
    for wrapperbox_name in ("KNN", "SVM", "DecisionTree", "LMeans")
)

## Load Datasets and Labels
from data.datasets import load_dataset_from_hf, load_labels_at_split
import numpy as np
from datasets import DatasetDict, concatenate_datasets

dataset = load_dataset_from_hf(dataset=DATASET_NAME)

# Labels for every split, then the train+eval concatenation (train first).
train_labels, eval_labels, test_labels = (
    load_labels_at_split(dataset, split_name)
    for split_name in ("train", "eval", "test")
)
train_eval_labels = np.concatenate((train_labels, eval_labels))

# Dataset view in which "train" is train followed by eval; test is unchanged.
train_eval_dataset = concatenate_datasets([dataset["train"], dataset["eval"]])
dataset_dict = DatasetDict({
    "train": train_eval_dataset,
    "test": dataset["test"],
})

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/samsoup/.cache/huggingface/token
Login successful




In [5]:
## Evaluation functions

from typing import List
import numpy as np
from sklearn import clone
from sklearn.base import BaseEstimator
from tqdm import tqdm


def retrain_and_evaluate_validity(
    clf: BaseEstimator,
    train_embeddings: np.ndarray,
    train_labels: np.ndarray,
    x_test: np.ndarray,
    indices_to_exclude: np.ndarray
):
    """Check whether removing a training subset flips the prediction for one test point.

    A fresh clone of ``clf`` is fitted on the training data with the rows in
    ``indices_to_exclude`` removed, and its prediction for ``x_test`` is
    compared with the prediction of the already-fitted ``clf``.

    Args:
        clf: Fitted estimator whose prediction on ``x_test`` is the baseline.
        train_embeddings: Training features; rows are indexed by
            ``indices_to_exclude``.
        train_labels: Training labels aligned with ``train_embeddings``.
        x_test: A single test example; reshaped to one row before predicting.
        indices_to_exclude: Row indices of the training examples to drop.
            ``None`` or an empty collection means "no subset was proposed".

    Returns:
        Tuple ``(old_pred, new_pred, is_valid)`` where ``is_valid`` is truthy
        only when the retrained prediction differs from the original one.
        When no indices are given the classifier is not retrained and
        ``(old_pred, old_pred, False)`` is returned.
    """
    query = x_test.reshape(1, -1)
    old_pred = clf.predict(query)[0]

    # Nothing to remove: an empty subset cannot be credited with a flip, and
    # retraining on the unchanged data would only waste a full fit. Guarding
    # `None` also matters for correctness: `mask[None] = False` treats None as
    # numpy's newaxis and would clear the *entire* mask, leaving no training
    # data for `fit`.
    if indices_to_exclude is None or np.size(indices_to_exclude) == 0:
        return old_pred, old_pred, False

    train_mask = np.ones(train_embeddings.shape[0], dtype=bool)
    train_mask[indices_to_exclude] = False

    # clone() copies the hyper-parameters only, so the new model is fitted
    # from scratch on the reduced training set.
    new_clf = clone(clf)
    new_clf.fit(train_embeddings[train_mask], train_labels[train_mask])
    new_pred = new_clf.predict(query)[0]

    # this subset is valid only if new prediction does not equal old prediction
    return old_pred, new_pred, new_pred != old_pred

def evaluate_predictions(
    clf: BaseEstimator,
    flip_list: List[List[int]],
    train_embeddings: np.ndarray,
    train_labels: np.ndarray,
    test_embeddings: np.ndarray,
    ex_indices_to_check: List[int],
):
    """Run the retrain-and-compare validity check for a set of test examples.

    For every index in ``ex_indices_to_check`` the subset stored at
    ``flip_list[index]`` is removed from the training data and the prediction
    for ``test_embeddings[index]`` is re-evaluated via
    ``retrain_and_evaluate_validity``.

    Returns:
        A list with one validity flag per checked index, in the same order as
        ``ex_indices_to_check``.
    """
    # Only the third element (the validity flag) of each result is kept.
    return [
        retrain_and_evaluate_validity(
            clf=clf,
            train_embeddings=train_embeddings,
            train_labels=train_labels,
            x_test=test_embeddings[example_idx],
            indices_to_exclude=flip_list[example_idx],
        )[2]
        for example_idx in tqdm(ex_indices_to_check)
    ]

In [6]:
## Do metrics for Yang fast

import pickle
from sklearn.linear_model import LogisticRegression


# L2-regularised logistic regression. scikit-learn's `C` is the inverse of the
# regularisation strength, hence C = 1 / l2.
# NOTE(review): the recorded run of this cell hit the solver's iteration limit
# (ConvergenceWarning); confirm whether `max_iter` should be raised to match
# the model the flip lists were computed with.
l2 = 500
logit_clf = LogisticRegression(penalty="l2", C=1 / l2)
logit_clf.fit(train_eval_embeddings, train_eval_labels)

# NOTE: pickle.load can execute arbitrary code -- only load files produced by
# this project.
filename = "esnli_deberta_large_yang2023_alg1.pickle"
with open(filename, 'rb') as handle:
    yang_flip_list = pickle.load(handle)

# Keep only test examples with a non-empty proposed subset. An empty list
# removes nothing, so it can never flip the prediction and would only cost a
# full retrain; this matches the filter used for the alg2 flip list.
ex_indices = [
    i for i, l in enumerate(yang_flip_list) if l is not None and len(l) > 0
]

is_yang_valid = evaluate_predictions(
    clf=logit_clf,
    flip_list=yang_flip_list,
    train_embeddings=train_eval_embeddings,
    train_labels=train_eval_labels,
    test_embeddings=test_embeddings,
    ex_indices_to_check=ex_indices,
)

print(f"Of the {len(ex_indices)} proposed subsets, only {np.sum(is_yang_valid)} is valid")
# Validity is reported relative to the whole test set, not only to the
# examples for which a subset was proposed.
acc = np.sum(is_yang_valid)/len(test_labels) * 100
print(f"Validity: {acc:.2f}%")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
0it [00:00, ?it/s]

Of the 0 proposed subsets, only 0.0 is valid
Validity: 0.00%





In [7]:
# Indices of test examples with a non-empty proposed subset. Empty lists are
# dropped as well as None so the saved indices follow the same rule as the
# alg2 filter (`l is not None and len(l) > 0`).
ex_indices = [
    i for i, l in enumerate(yang_flip_list) if l is not None and len(l) > 0
]

In [10]:
# Persist the current `ex_indices` with pickle so the list can be reloaded later.
# The file name ties it to the yang2023 alg1 flip list. `pickle` is not imported
# in this cell; it relies on an earlier cell having imported it.
output_file_path = 'ex_indices_to_check_yang2023_alg1.pickle'
with open(output_file_path, 'wb') as output_file:
    pickle.dump(ex_indices, output_file)

In [9]:
# Number of test-example indices kept by the filter that built `ex_indices`.
len(ex_indices)

8394

In [2]:
# Preview only the first few entries rather than rendering the whole flip list
# (one entry per test example). Requires the cell that loads `yang_flip_list`
# to have been run first.
yang_flip_list[:5]

NameError: name 'yang_flip_list' is not defined

In [26]:
## Do metrics for Yang slow

import pickle
from sklearn.linear_model import LogisticRegression


# Classifier fitting is disabled in this cell:
# l2 = 500
# logit_clf = LogisticRegression(penalty="l2", C= 1 / l2)
# logit_clf.fit(train_eval_embeddings, train_eval_labels)

filename = "esnli_deberta_large_yang2023_alg2.pickle"
with open(filename, 'rb') as handle:
    yang_flip_list = pickle.load(handle)

# Single pass over the flip list: for every entry that is neither None nor
# empty, record its position (ex_indices) and its subset size (num_examples).
ex_indices = []
num_examples = []
for i, l in enumerate(yang_flip_list):
    if l is None or len(l) == 0:
        continue
    ex_indices.append(i)
    num_examples.append(len(l))

# Validity evaluation is disabled in this cell:
# is_yang_valid = evaluate_predictions(
#     clf=logit_clf,
#     flip_list=yang_flip_list,
#     train_embeddings=train_eval_embeddings,
#     train_labels=train_eval_labels,
#     test_embeddings=test_embeddings,
#     ex_indices_to_check=ex_indices,
# )

# print(f"Of the {len(ex_indices)} proposed subsets, only {np.sum(is_yang_valid)} is valid")
# acc = np.sum(is_yang_valid)/len(test_labels) * 100
# print(f"Validity: {acc:.2f}%")

In [29]:
# Sanity check: subset sizes and kept indices should have the same length;
# the third value is the total number of entries in the flip list.
len(num_examples), len(ex_indices), len(yang_flip_list)

(7734, 7734, 9824)

In [12]:
# NOTE(review): the recorded output of this cell (9824) equals len(yang_flip_list)
# and comes from an earlier execution (In [12]); later runs of the same check
# report 7734. Re-run or drop this cell so the saved output is not misleading.
len(ex_indices)

9824

In [38]:
# Persist the current `ex_indices` with pickle; the file name ties it to the
# yang2023 alg2 flip list.
output_file_path = 'ex_indices_to_check_yang2023_alg2.pickle'
with open(output_file_path, 'wb') as output_file:
    pickle.dump(ex_indices, output_file)

In [37]:
# Number of indices kept for the alg2 flip list (non-None, non-empty entries).
len(ex_indices)

7734

In [40]:
# Reload a previously saved index list and display its length.
# NOTE(review): this reads the *alg1* index file but binds the result to
# `alg2_indices`; the recorded length (8394) matches the alg1 count shown
# earlier, not the 7734 alg2 indices. Confirm whether the path or the variable
# name is the intended one. The handle is opened for reading despite being
# named `output_file`.
output_file_path = 'ex_indices_to_check_yang2023_alg1.pickle'
with open(output_file_path, 'rb') as output_file:
    alg2_indices = pickle.load(output_file)

len(alg2_indices)

8394