This notebook investigates a simple way of finding valid subsets, i.e. subsets
of training examples whose removal leads to a prediction flip: remove all
candidate example-based explanations (the examples used in inference) at once,
and count how many times the resulting subset is valid.

In [2]:
## Constants for control
# Shared experiment settings. The data-loading cell below passes these as
# keyword arguments to load_saved_embeddings (and DATASET to
# load_dataset_from_hf), so changing a value here changes what gets loaded.

DATASET = "esnli"  # forwarded as dataset=
MODEL = "deberta-large"  # forwarded as model=
SEED = 42  # forwarded as seed=
POOLER = "mean_with_attention"  # forwarded as pooler=
LAYER = 24  # forwarded as layer=

In [3]:
## Load Data
import numpy as np
from datasets import DatasetDict, concatenate_datasets

from data.datasets import load_dataset_from_hf, load_labels_at_split
from data.embeddings import load_saved_embeddings

# Saved embeddings: the three splits are loaded with identical settings and
# differ only in the `split` argument, so fetch them in one pass.
train_embeddings, eval_embeddings, test_embeddings = (
    load_saved_embeddings(
        dataset=DATASET,
        model=MODEL,
        seed=SEED,
        split=split_name,
        pooler=POOLER,
        layer=LAYER,
    )
    for split_name in ("train", "eval", "test")
)

# Labels for every split, read from the Hugging Face dataset.
dataset = load_dataset_from_hf(dataset=DATASET)
train_labels = load_labels_at_split(dataset, "train")
eval_labels = load_labels_at_split(dataset, "eval")
test_labels = load_labels_at_split(dataset, "test")

# Train and eval are pooled (train rows first, then eval rows) so that the
# embedding matrix and the label vector stay aligned row-for-row.
train_eval_embeddings = np.vstack([train_embeddings, eval_embeddings])
train_eval_labels = np.concatenate([train_labels, eval_labels])

# Same pooling at the dataset level: "train" holds train+eval, "test" is kept
# as-is.
train_eval_dataset = concatenate_datasets([dataset["train"], dataset["eval"]])
dataset_dict = DatasetDict(
    {"train": train_eval_dataset, "test": dataset["test"]}
)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/samsoup/.cache/huggingface/token
Login successful


In [4]:
# KNN: how many times are the K nearest neighbors a valid subset?
#
# For each test example, the K nearest neighbors are the candidate
# example-based explanation. Removing them promotes the next K neighbors
# (ranks K .. 2K-1), so the subset is "valid" when the majority label of that
# next window differs from the original prediction.
from data.models import load_saved_wrapperbox_model
from utils.inference import find_majority_batched

# Use the shared control constants rather than repeating their literal values,
# so this cell stays in sync with the embeddings loaded above.
knn_clf = load_saved_wrapperbox_model(
    dataset=DATASET,
    model=MODEL,
    seed=SEED,
    pooler=POOLER,
    wrapperbox="KNN",
)

K = knn_clf.n_neighbors

predictions = knn_clf.predict(test_embeddings)

# Only ranks 0 .. 2K-1 are inspected below, so request just those instead of
# ranking the entire training set for every test row (which materialises an
# n_test x n_train index matrix). Capped at the number of available labels.
# NOTE(review): assumes the classifier was fit on the stacked train+eval
# embeddings, so that neighbor indices address train_eval_labels - confirm.
num_candidates = min(2 * K, len(train_eval_labels))
neigh_indices = knn_clf.kneighbors(
    X=test_embeddings,
    n_neighbors=num_candidates,
    return_distance=False,
)
neigh_labels = train_eval_labels[neigh_indices]

# Removing the K neighbors is the same as sliding the window down by K and
# checking the majority of the next K, to see if the prediction has flipped.
# NOTE(review): this matches the classifier's own vote only if it uses
# uniform (unweighted) voting - confirm.
next_window = neigh_labels[:, K : 2 * K]
majority_current = find_majority_batched(next_window)
changed_majority = majority_current != predictions
num_changed = np.sum(changed_majority)

# Print a summary of changed_majority. The denominator is the number of test
# predictions (one flip decision per test example), not the training-set size.
print(f"{num_changed} of {predictions.size} has changed prediction")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


: 

In [None]:
# Boolean mask from the KNN cell above: True where the majority label of the
# next K neighbors (ranks K .. 2K-1) differs from the classifier's prediction.
# Presumably one entry per test example - confirm against find_majority_batched.
changed_majority