## Evaluate Approaches

This notebook verifies whether the identified subsets are valid.

An empty (or `None`) subset is automatically invalid.

In [1]:
%load_ext autoreload
%autoreload 2


from typing import List
import numpy as np


# Experiment configuration shared by the cells below.
DATASET_NAME = "esnli"  # passed as `dataset=` to the loaders; also prefixes result filenames
LABEL_SPACE = ["entailment", "neutral", "contradiction"]  # NOTE(review): not referenced by any visible cell
MODEL_NAME = "deberta_large"  # passed as `model=` to the loaders; also part of result filenames
SEED = 42  # passed as `seed=` to load_embeddings / load_wrapperbox
POOLER = "mean_with_attention"  # passed as `pooler=` to load_embeddings / load_wrapperbox
LAYER = 24  # passed as `layer=` to load_embeddings

# Of the non-empty subsets, count how many are actually valid.
def summarize_subset_results(flip_list: List[List[int]], is_valid: List[bool]):
    """Summarize coverage and validity of the identified subsets.

    An entry of ``flip_list`` counts as an *identified* subset only when it is
    neither ``None`` nor empty. Empty/``None`` entries are never counted as
    valid, regardless of the corresponding ``is_valid`` flag.

    Args:
        flip_list: One entry per example; each entry is a sequence (its
            length is the subset size), an empty sequence, or ``None``.
        is_valid: Flags aligned by position with ``flip_list``; only the
            positions of identified subsets are read.

    Returns:
        A dict with three keys:
        ``"Coverage"`` - identified / total, as a percentage rounded to 2 dp;
        ``"Validity"`` - valid / total, as a percentage rounded to 2 dp;
        ``"Median Valid Subset Sizes"`` - median length of the valid subsets
        as a plain float (``nan`` when there are no valid subsets).
        Both percentages are ``0.0`` when ``flip_list`` is empty.
    """
    total = len(flip_list)
    valid = 0
    identified_subset = 0
    subset_sizes = []
    for i, subset in enumerate(flip_list):
        if subset is None or len(subset) == 0:
            continue
        identified_subset += 1
        if is_valid[i]:
            valid += 1
            subset_sizes.append(len(subset))

    def _percent(count: int) -> float:
        # Guard against an empty flip_list, which would otherwise divide by zero.
        return round(count / total * 100, 2) if total else 0.0

    # np.median on an empty array warns and yields nan; return nan directly
    # instead. float() keeps the value a plain Python float for pprint/json.
    median_size = float(np.median(subset_sizes)) if subset_sizes else float("nan")

    metrics = {
        "Coverage": _percent(identified_subset),
        "Validity": _percent(valid),
        "Median Valid Subset Sizes": median_size,
    }
    print(f"Identified {identified_subset}/{total} subsets")
    print(f"{valid}/{identified_subset} identified subsets are valid")
    print(f"Overall, validity is {valid}/{total}, or {metrics['Validity']}%")
    print(f"Median Valid Subset Sizes is {metrics['Median Valid Subset Sizes']}")

    return metrics

In [2]:
## Load Embeddings
from utils.io import (
    load_dataset_from_hf,
    load_labels_at_split,
    load_embeddings,
    load_wrapperbox
)
import numpy as np


# Load the embeddings of every split with identical settings; only `split`
# differs between the three calls (train, then eval, then test).
train_embeddings, eval_embeddings, test_embeddings = (
    load_embeddings(
        dataset=DATASET_NAME,
        model=MODEL_NAME,
        seed=SEED,
        split=split_name,
        pooler=POOLER,
        layer=LAYER,
    )
    for split_name in ("train", "eval", "test")
)

# Train rows come first, followed by the eval rows.
train_eval_embeddings = np.vstack((train_embeddings, eval_embeddings))


## Load Datasets and Labels
from datasets import DatasetDict, concatenate_datasets

dataset = load_dataset_from_hf(dataset=DATASET_NAME)

# Per-split labels; train and eval are joined train-first, matching the
# row order used for train_eval_embeddings.
train_labels = load_labels_at_split(dataset, "train")
eval_labels = load_labels_at_split(dataset, "eval")
test_labels = load_labels_at_split(dataset, "test")
train_eval_labels = np.concatenate((train_labels, eval_labels))

# Two-split view of the data: the merged train+eval set is exposed as "train".
train_eval_dataset = concatenate_datasets([dataset["train"], dataset["eval"]])
dataset_dict = DatasetDict({
    "train": train_eval_dataset,
    "test": dataset["test"],
})

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /home/samsoup/.cache/huggingface/token
Login successful




In [3]:
## Do metrics for Yang fast

from MinimalSubsetToFlipPredictions.evaluate import evaluate_predictions
import pickle

wrapper_name = "LogisticRegression"
clf = load_wrapperbox(
    dataset=DATASET_NAME,
    model=MODEL_NAME,
    seed=SEED,
    pooler=POOLER,
    wrapperbox=wrapper_name
)

# Build the path from the config constants (same pattern as the
# "using their output files" cell) so that changing DATASET_NAME or
# MODEL_NAME does not silently keep reading the esnli/deberta_large results.
filename = f"{DATASET_NAME}_{MODEL_NAME}_yang_fast.pickle"
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(filename, 'rb') as handle:
    flip_list = pickle.load(handle)

# Check every example that has an entry in flip_list.
# NOTE(review): the recorded run of this cell was interrupted at 0/9824
# after several minutes, so expect this call to be slow.
ex_indices_to_check = np.arange(len(flip_list))
is_valid = evaluate_predictions(
    clf=clf,
    flip_list=flip_list,
    train_embeddings=train_eval_embeddings,
    train_labels=train_eval_labels,
    test_embeddings=test_embeddings,
    ex_indices_to_check=ex_indices_to_check,
)

metrics = summarize_subset_results(
    flip_list=flip_list,
    is_valid=is_valid
)

metrics

  0%|          | 0/9824 [06:43<?, ?it/s]


KeyboardInterrupt: 

In [2]:
## Do metrics for Yang fast, using their output files 

from pprint import pprint
from MinimalSubsetToFlipPredictions.evaluate.evaluate import (
    evaluate_by_prediction_probas, compute_subset_metrics
)
import pickle

name = "yang_fast"

# Subsets identified for each example.
flip_list_path = f"{DATASET_NAME}_{MODEL_NAME}_{name}.pickle"
with open(flip_list_path, 'rb') as pickle_file:
    flip_list = pickle.load(pickle_file)

# Saved prediction arrays for this method.
# NOTE(review): presumably "old" = before and "new" = after removing the
# subset; confirm against the script that wrote these files.
new_predictions_path = f"{DATASET_NAME}_{MODEL_NAME}_{name}_new_predictions.npy"
new_predictions = np.load(new_predictions_path, allow_pickle=True)

old_predictions_path = f"{DATASET_NAME}_{MODEL_NAME}_{name}_old_predictions.npy"
old_predictions = np.load(old_predictions_path, allow_pickle=True)

# Validity is derived from the stored predictions rather than by re-running
# evaluate_predictions.
is_valid = evaluate_by_prediction_probas(
    new_predictions=new_predictions, old_predictions=old_predictions
)

metrics = compute_subset_metrics(flip_list=flip_list, is_valid=is_valid)

pprint(metrics)

38/8825 identified subsets are valid
Overall validity is 38/9824, or 0.39%
Precision validity is 38/8825, or 0.43%
Identified 8825/9824 subsets.
Coverage: 89.83%
Median Valid Subset Sizes is 76446.5, out of 38 valid subsets
{'Coverage': 89.83,
 'Median Size': 76446.5,
 'Overall Validity': 0.39,
 'Precision Validity': 0.43}


In [4]:
## Do metrics for Yang slow

from MinimalSubsetToFlipPredictions.evaluate import evaluate_predictions
import pickle

wrapper_name = "LogisticRegression"
clf = load_wrapperbox(
    dataset=DATASET_NAME,
    model=MODEL_NAME,
    seed=SEED,
    pooler=POOLER,
    wrapperbox=wrapper_name
)

# Build the path from the config constants (same pattern as the
# "using their output files" cell) so that changing DATASET_NAME or
# MODEL_NAME does not silently keep reading the esnli/deberta_large results.
filename = f"{DATASET_NAME}_{MODEL_NAME}_yang_slow.pickle"
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(filename, 'rb') as handle:
    flip_list = pickle.load(handle)

# Check every example that has an entry in flip_list.
# NOTE(review): the recorded run of this cell reached only 2/9824 after
# ~35 minutes before being interrupted, so expect this call to be very slow.
ex_indices_to_check = np.arange(len(flip_list))
is_valid = evaluate_predictions(
    clf=clf,
    flip_list=flip_list,
    train_embeddings=train_eval_embeddings,
    train_labels=train_eval_labels,
    test_embeddings=test_embeddings,
    ex_indices_to_check=ex_indices_to_check,
)

metrics = summarize_subset_results(
    flip_list=flip_list,
    is_valid=is_valid
)

metrics

  0%|          | 2/9824 [35:30<2906:45:39, 1065.40s/it]


KeyboardInterrupt: 

In [3]:
## Do metrics for Yang slow, using their output files

from pprint import pprint
from MinimalSubsetToFlipPredictions.evaluate.evaluate import (
    evaluate_by_prediction_probas, compute_subset_metrics
)
import pickle

name = "yang_slow"

# Subsets identified for each example.
flip_list_path = f"{DATASET_NAME}_{MODEL_NAME}_{name}.pickle"
with open(flip_list_path, 'rb') as pickle_file:
    flip_list = pickle.load(pickle_file)

# Saved prediction arrays for this method.
# NOTE(review): presumably "old" = before and "new" = after removing the
# subset; confirm against the script that wrote these files.
new_predictions_path = f"{DATASET_NAME}_{MODEL_NAME}_{name}_new_predictions.npy"
new_predictions = np.load(new_predictions_path, allow_pickle=True)

old_predictions_path = f"{DATASET_NAME}_{MODEL_NAME}_{name}_old_predictions.npy"
old_predictions = np.load(old_predictions_path, allow_pickle=True)

# Validity is derived from the stored predictions rather than by re-running
# evaluate_predictions.
is_valid = evaluate_by_prediction_probas(
    new_predictions=new_predictions, old_predictions=old_predictions
)

metrics = compute_subset_metrics(flip_list=flip_list, is_valid=is_valid)

pprint(metrics)

11/8825 identified subsets are valid
Overall validity is 11/9824, or 0.11%
Precision validity is 11/8825, or 0.12%
Identified 8825/9824 subsets.
Coverage: 89.83%
Median Valid Subset Sizes is 2.0, out of 11 valid subsets
{'Coverage': 89.83,
 'Median Size': 2.0,
 'Overall Validity': 0.11,
 'Precision Validity': 0.12}


In [3]:
## Do metrics for KNN

import json
import pickle
# Import the function, not the module: with a bare `import pprint`, the
# later `pprint(metrics)` call raises "TypeError: 'module' object is not callable".
from pprint import pprint

from MinimalSubsetToFlipPredictions.evaluate import evaluate_predictions

wrapper_name = "KNN"
clf = load_wrapperbox(
    dataset=DATASET_NAME,
    model=MODEL_NAME,
    seed=SEED,
    pooler=POOLER,
    wrapperbox=wrapper_name
)

# Use the same config-derived prefix as the metrics file written below.
filename = f"{DATASET_NAME}_{MODEL_NAME}_{wrapper_name}.pickle"
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(filename, 'rb') as handle:
    flip_list = pickle.load(handle)

# With total == len(flip_list) the slice below keeps everything; lower
# `total` to evaluate only the first `total` examples.
total = len(flip_list)
ex_indices_to_check = np.arange(total)
flip_list = flip_list[:total]

is_valid = evaluate_predictions(
    clf=clf,
    flip_list=flip_list,
    train_embeddings=train_eval_embeddings,
    train_labels=train_eval_labels,
    test_embeddings=test_embeddings,
    ex_indices_to_check=ex_indices_to_check,
)

metrics = summarize_subset_results(
    flip_list=flip_list,
    is_valid=is_valid
)

pprint(metrics)

# Path to the output JSON file
output_file_path = f'{DATASET_NAME}_{MODEL_NAME}_{wrapper_name}_greedy_metrics.json'

# Save the dictionary to a JSON file
with open(output_file_path, 'w') as json_file:
    json.dump(metrics, json_file, indent=4)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
  1%|          | 90/9824 [02:56<5:18:50,  1.97s/it]


KeyboardInterrupt: 

In [5]:
from MinimalSubsetToFlipPredictions.evaluate.evaluate import compute_valid_subset_sizes


# Flag every entry as valid so that no subset is excluded on validity grounds.
# NOTE(review): how the helper treats None/empty entries is not visible here.
all_valid_flags = [True] * len(flip_list)
all_subset_sizes = compute_valid_subset_sizes(
    flip_list=flip_list,
    is_valid=all_valid_flags,
)
np.median(all_subset_sizes)

77.5

In [24]:
## Do metrics for LGBM

# Import everything this cell uses so it does not depend on names (`json`,
# `pprint`) that happen to be left in the kernel by earlier cells.
import json
import pickle
from pprint import pprint

from MinimalSubsetToFlipPredictions.evaluate import evaluate_predictions

wrapper_name = "LGBM"
clf = load_wrapperbox(
    dataset=DATASET_NAME,
    model=MODEL_NAME,
    seed=SEED,
    pooler=POOLER,
    wrapperbox=wrapper_name
)

# Use the same config-derived prefix as the metrics file written below.
filename = f"{DATASET_NAME}_{MODEL_NAME}_{wrapper_name}.pickle"
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(filename, 'rb') as handle:
    flip_list = pickle.load(handle)

# With total == len(flip_list) the slice below keeps everything; lower
# `total` to evaluate only the first `total` examples.
total = len(flip_list)
ex_indices_to_check = np.arange(total)
flip_list = flip_list[:total]

is_valid = evaluate_predictions(
    clf=clf,
    flip_list=flip_list,
    train_embeddings=train_eval_embeddings,
    train_labels=train_eval_labels,
    test_embeddings=test_embeddings,
    ex_indices_to_check=ex_indices_to_check,
)

metrics = summarize_subset_results(
    flip_list=flip_list,
    is_valid=is_valid
)

pprint(metrics)

# Path to the output JSON file
output_file_path = f'{DATASET_NAME}_{MODEL_NAME}_{wrapper_name}_greedy_metrics.json'

# Save the dictionary to a JSON file
with open(output_file_path, 'w') as json_file:
    json.dump(metrics, json_file, indent=4)

  0%|          | 0/9824 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.470616 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 558761, number of used features: 1024
[LightGBM] [Info] Start training from score -1.097114
[LightGBM] [Info] Start training from score -1.101229
[LightGBM] [Info] Start training from score -1.097500


  0%|          | 21/9824 [00:16<2:05:12,  1.30it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.408087 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 558978, number of used features: 1024
[LightGBM] [Info] Start training from score -1.096966
[LightGBM] [Info] Start training from score -1.100971
[LightGBM] [Info] Start training from score -1.097904


  1%|          | 67/9824 [00:31<1:11:51,  2.26it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.371525 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 558013, number of used features: 1024
[LightGBM] [Info] Start training from score -1.097330
[LightGBM] [Info] Start training from score -1.101106
[LightGBM] [Info] Start training from score -1.097405


  1%|          | 92/9824 [00:47<1:23:33,  1.94it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.462219 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 558470, number of used features: 1024
[LightGBM] [Info] Start training from score -1.097059
[LightGBM] [Info] Start training from score -1.101580
[LightGBM] [Info] Start training from score -1.097204


  2%|▏         | 217/9824 [01:03<37:36,  4.26it/s] 

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.322925 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 558947, number of used features: 1024
[LightGBM] [Info] Start training from score -1.096321
[LightGBM] [Info] Start training from score -1.100690
[LightGBM] [Info] Start training from score -1.098831


  3%|▎         | 272/9824 [01:19<39:47,  4.00it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.388774 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 559202, number of used features: 1024
[LightGBM] [Info] Start training from score -1.096777
[LightGBM] [Info] Start training from score -1.100786
[LightGBM] [Info] Start training from score -1.098278


  3%|▎         | 280/9824 [01:36<58:03,  2.74it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.449948 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 557606, number of used features: 1024
[LightGBM] [Info] Start training from score -1.094048
[LightGBM] [Info] Start training from score -1.101142
[LightGBM] [Info] Start training from score -1.100662


  3%|▎         | 304/9824 [01:52<1:07:51,  2.34it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.456064 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 557377, number of used features: 1024
[LightGBM] [Info] Start training from score -1.097189
[LightGBM] [Info] Start training from score -1.103102
[LightGBM] [Info] Start training from score -1.095562


  3%|▎         | 307/9824 [02:08<1:35:38,  1.66it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.578366 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 559200, number of used features: 1024
[LightGBM] [Info] Start training from score -1.096785
[LightGBM] [Info] Start training from score -1.100782
[LightGBM] [Info] Start training from score -1.098274


  4%|▎         | 348/9824 [02:25<1:23:39,  1.89it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.679754 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 558931, number of used features: 1024
[LightGBM] [Info] Start training from score -1.097053
[LightGBM] [Info] Start training from score -1.100693
[LightGBM] [Info] Start training from score -1.098094


  4%|▍         | 381/9824 [02:42<1:22:29,  1.91it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 2.424240 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 261120
[LightGBM] [Info] Number of data points in the train set: 558860, number of used features: 1024
[LightGBM] [Info] Start training from score -1.096192
[LightGBM] [Info] Start training from score -1.100959
[LightGBM] [Info] Start training from score -1.098691


  4%|▍         | 391/9824 [03:10<1:16:33,  2.05it/s]


KeyboardInterrupt: 

In [None]:
## Do metrics for LMeans

# Import everything this cell uses so it does not depend on names (`json`,
# `pprint`) that happen to be left in the kernel by earlier cells.
import json
import pickle
from pprint import pprint

from MinimalSubsetToFlipPredictions.evaluate import evaluate_predictions
from MinimalSubsetToFlipPredictions.evaluate.evaluate import compute_valid_subset_sizes

wrapper_name = "LMeans"
clf = load_wrapperbox(
    dataset=DATASET_NAME,
    model=MODEL_NAME,
    seed=SEED,
    pooler=POOLER,
    wrapperbox=wrapper_name
)

# Use the same config-derived prefix as the output files written below.
filename = f"{DATASET_NAME}_{MODEL_NAME}_{wrapper_name}.pickle"
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open(filename, 'rb') as handle:
    flip_list = pickle.load(handle)

# With total == len(flip_list) the slice below keeps everything; lower
# `total` to evaluate only the first `total` examples.
total = len(flip_list)
ex_indices_to_check = np.arange(total)
flip_list = flip_list[:total]

is_valid = evaluate_predictions(
    clf=clf,
    flip_list=flip_list,
    train_embeddings=train_eval_embeddings,
    train_labels=train_eval_labels,
    test_embeddings=test_embeddings,
    ex_indices_to_check=ex_indices_to_check,
)

metrics = summarize_subset_results(
    flip_list=flip_list,
    is_valid=is_valid
)

pprint(metrics)

# Path to the output JSON file
output_file_path = f'{DATASET_NAME}_{MODEL_NAME}_{wrapper_name}_greedy_metrics.json'

# Save the dictionary to a JSON file
with open(output_file_path, 'w') as json_file:
    json.dump(metrics, json_file, indent=4)

# For LMeans, also compute valid subset sizes
valid_subset_sizes = compute_valid_subset_sizes(
    flip_list, is_valid
)
# pickle.dump returns None, so its result is not bound to a name.
with open(f"{DATASET_NAME}_{MODEL_NAME}_{wrapper_name}_valid_subset_sizes.pickle", 'wb') as pickle_file:
    pickle.dump(valid_subset_sizes, pickle_file)

FileNotFoundError: [Errno 2] No such file or directory: 'esnli_deberta_large_LMeans.part_1.pickle'