This notebook is intended to evaluate the subset approaches for esnli for 
three metrics:

1. Coverage: how many subsets/total test inputs identified
2. Validity: how many valid subsets / total test inputs identified
3. Median of Valid Subset Sizes


| Classifier | Approach              | Coverage (% identified) | Validity (% identified and leads to flip) | Median Size |
|------------|-----------------------|--------------------------|--------------------------------------------|-------------|
| Random     | Class Exclusion       | x                        | x                                          | x           |
| Logistic   | Fast                  |                          |                                            |             |
| Logistic   | Slow                  |                          |                                            |             |
| Logistic   | Fast + CE fallback    |                          |                                            |             |
| Logistic   | Slow + CE fallback    |                          |                                            |             |
| KNN        | Greedy                |                          |                                            |             |
| KNN        | Greedy + CE fallback  |                          |                                            |             |
| SVM        | Greedy                |                          |                                            |             |
| SVM        | Greedy + CE fallback  |                          |                                            |             |
| DT         | Greedy                |                          |                                            |             |
| DT         | Greedy + CE fallback  |                          |                                            |             |
| LMeans     | Greedy                |                          |                                            |             |
| LMeans     | Greedy + CE fallback  |                          |                                            |             |

In [1]:
%load_ext autoreload
%autoreload 2

DATASET_NAME = "esnli"
LABEL_SPACE = ["entailment", "neutral", "contradiction"]
MODEL_NAME = "deberta_large"
SEED = 42
POOLER = "mean_with_attention"
LAYER = 24

import pandas as pd

def create_classifier_df(do_CE=False):
    # Create a list of classifiers and their base approaches
    classifiers = ["Logistic", "KNN", "SVM", "DT", "LMeans"]
    base_approaches = {
        "Logistic": ["Yang Fast", "Yang Slow"],
        "KNN": ["Greedy"],
        "SVM": ["Greedy"],
        "DT": ["Greedy"],
        "LMeans": ["Greedy"]
    }

    # Initialize the columns
    columns = ["Classifier", "Approach", "Coverage", "Overall Validity", "Precision Validity", "Median Size"]

    # Create the data list
    data = []

    # Populate the data list
    for classifier in classifiers:
        for approach in base_approaches[classifier]:
            data.append([classifier, approach] + [""] * (len(columns) - 2))
            
            # Append CE fallback approach if do_CE is True
            if do_CE:
                ce_fallback_approach = f"{approach} + CE fallback"
                data.append([classifier, approach] + [""] * (len(columns) - 2))

    # Create the DataFrame
    df = pd.DataFrame(data, columns=columns)
    return df

df = create_classifier_df(do_CE=False)
df

Unnamed: 0,Classifier,Approach,Coverage,Overall Validity,Precision Validity,Median Size
0,Logistic,Yang Fast,,,,
1,Logistic,Yang Slow,,,,
2,KNN,Greedy,,,,
3,SVM,Greedy,,,,
4,DT,Greedy,,,,
5,LMeans,Greedy,,,,


In [2]:
## Compute metrics for Yang Fast

# Load flip list and valid
from utils.io import load_pickle
from MinimalSubsetToFlipPredictions.evaluate import compute_subset_metrics


name = "yang2023_fast"
flip_list_filename = f"{DATASET_NAME}_{MODEL_NAME}_{name}.pickle"
flip_list = load_pickle(flip_list_filename)

is_valid_filename = f"{DATASET_NAME}_{MODEL_NAME}_{name}_is_valid.pickle"
is_valid = load_pickle(is_valid_filename)

metrics = compute_subset_metrics(flip_list, is_valid)

# Record metrics to df
row = (df['Classifier'] == "Logistic") & (df['Approach'] == "Yang Fast")
for m in metrics:
    df.loc[row, m] = metrics[m]

df

59/8394 identified subsets are valid
Overall validity is 59/9824, or 0.6%
Precision validity is 59/8394, or 0.7%
Identified 8394/9824 subsets.
Coverage: 85.44%
Median Valid Subset Sizes is 76.0, out of 59 valid subsets


Unnamed: 0,Classifier,Approach,Coverage,Overall Validity,Precision Validity,Median Size
0,Logistic,Yang Fast,85.44,0.6,0.7,76.0
1,Logistic,Yang Slow,,,,
2,KNN,Greedy,,,,
3,SVM,Greedy,,,,
4,DT,Greedy,,,,
5,LMeans,Greedy,,,,


In [3]:
## Compute metrics for Yang Slow

# Load flip list and valid
from utils.io import load_pickle
from MinimalSubsetToFlipPredictions.evaluate import compute_subset_metrics


name = "yang2023_slow"
flip_list_filename = f"{DATASET_NAME}_{MODEL_NAME}_{name}.pickle"
flip_list = load_pickle(flip_list_filename)

is_valid_filename = f"{DATASET_NAME}_{MODEL_NAME}_{name}_is_valid.pickle"
is_valid = load_pickle(is_valid_filename)

metrics = compute_subset_metrics(flip_list, is_valid)

# Record metrics to df
row = (df['Classifier'] == "Logistic") & (df['Approach'] == "Yang Slow")
for m in metrics:
    df.loc[row, m] = metrics[m]

df

36/7734 identified subsets are valid
Overall validity is 36/9824, or 0.37%
Precision validity is 36/7734, or 0.47%
Identified 7734/9824 subsets.
Coverage: 78.73%
Median Valid Subset Sizes is 57.0, out of 36 valid subsets


Unnamed: 0,Classifier,Approach,Coverage,Overall Validity,Precision Validity,Median Size
0,Logistic,Yang Fast,85.44,0.6,0.7,76.0
1,Logistic,Yang Slow,78.73,0.37,0.47,57.0
2,KNN,Greedy,,,,
3,SVM,Greedy,,,,
4,DT,Greedy,,,,
5,LMeans,Greedy,,,,


In [4]:
## Compute metrics for KNN

# Load flip list and valid
from utils.io import load_pickle
from MinimalSubsetToFlipPredictions.evaluate import compute_subset_metrics


name = "KNN"
flip_list_filename = f"{DATASET_NAME}_{MODEL_NAME}_{name}.pickle"
flip_list = load_pickle(flip_list_filename)

is_valid_filename = f"{DATASET_NAME}_{MODEL_NAME}_{name}_is_valid.pickle"
is_valid = load_pickle(is_valid_filename)

metrics = compute_subset_metrics(flip_list, is_valid)

# Record metrics to df
row = (df['Classifier'] == "KNN") & (df['Approach'] == "Greedy")
for m in metrics:
    df.loc[row, m] = metrics[m]

df

9822/9824 identified subsets are valid
Overall validity is 9822/9824, or 99.98%
Precision validity is 9822/9824, or 99.98%
Identified 9824/9824 subsets.
Coverage: 100.0%
Median Valid Subset Sizes is 2779.5, out of 9822 valid subsets


Unnamed: 0,Classifier,Approach,Coverage,Overall Validity,Precision Validity,Median Size
0,Logistic,Yang Fast,85.44,0.6,0.7,76.0
1,Logistic,Yang Slow,78.73,0.37,0.47,57.0
2,KNN,Greedy,100.0,99.98,99.98,2779.5
3,SVM,Greedy,,,,
4,DT,Greedy,,,,
5,LMeans,Greedy,,,,


In [5]:
## Compute metrics for LGBM

# Load flip list and valid
from utils.io import load_pickle
from MinimalSubsetToFlipPredictions.evaluate import compute_subset_metrics


name = "LGBM"
flip_list_filename = f"{DATASET_NAME}_{MODEL_NAME}_{name}.pickle"
flip_list = load_pickle(flip_list_filename)

is_valid_filename = f"{DATASET_NAME}_{MODEL_NAME}_{name}_is_valid.pickle"
is_valid = load_pickle(is_valid_filename)

metrics = compute_subset_metrics(flip_list, is_valid)

# Record metrics to df
row = (df['Classifier'] == "DT") & (df['Approach'] == "Greedy")
for m in metrics:
    df.loc[row, m] = metrics[m]

df

317/317 identified subsets are valid
Overall validity is 317/9824, or 3.23%
Precision validity is 317/317, or 100.0%
Identified 317/9824 subsets.
Coverage: 3.23%
Median Valid Subset Sizes is 89.0, out of 317 valid subsets


Unnamed: 0,Classifier,Approach,Coverage,Overall Validity,Precision Validity,Median Size
0,Logistic,Yang Fast,85.44,0.6,0.7,76.0
1,Logistic,Yang Slow,78.73,0.37,0.47,57.0
2,KNN,Greedy,100.0,99.98,99.98,2779.5
3,SVM,Greedy,,,,
4,DT,Greedy,3.23,3.23,100.0,89.0
5,LMeans,Greedy,,,,


In [6]:
## Compute metrics for LMeans

# Load flip list and valid
from utils.io import load_pickle
from MinimalSubsetToFlipPredictions.evaluate import compute_subset_metrics


# name = "LMeans"
# flip_list_filename = f"{DATASET_NAME}_{MODEL_NAME}_{name}.pickle"
# flip_list = load_pickle(flip_list_filename)

# is_valid_filename = f"{DATASET_NAME}_{MODEL_NAME}_{name}_is_valid.pickle"
# is_valid = load_pickle(is_valid_filename)

# metrics = compute_subset_metrics(flip_list, is_valid)

# # Record metrics to df
# row = (df['Classifier'] == "LMeans") & (df['Approach'] == "Greedy")
# for m in metrics:
#     df.loc[row, m] = metrics[m]

# df

# from MinimalSubsetToFlipPredictions.evaluate import compute_coverage


# compute_coverage(flip_list)

In [9]:
df

Unnamed: 0,Classifier,Approach,Coverage,Overall Validity,Precision Validity,Median Size
0,Logistic,Yang Fast,85.44,0.6,0.7,76.0
1,Logistic,Yang Slow,78.73,0.37,0.47,57.0
2,KNN,Greedy,100.0,99.98,99.98,2779.5
3,SVM,Greedy,,,,
4,DT,Greedy,3.23,3.23,100.0,89.0
5,LMeans,Greedy,,,,


In [8]:
print(df.to_latex(index=False))

\begin{tabular}{llllll}
\toprule
Classifier & Approach & Coverage & Overall Validity & Precision Validity & Median Size \\
\midrule
Logistic & Yang Fast & 85.440000 & 0.600000 & 0.700000 & 76.000000 \\
Logistic & Yang Slow & 78.730000 & 0.370000 & 0.470000 & 57.000000 \\
KNN & Greedy & 100.000000 & 99.980000 & 99.980000 & 2779.500000 \\
SVM & Greedy &  &  &  &  \\
DT & Greedy & 3.230000 & 3.230000 & 100.000000 & 89.000000 \\
LMeans & Greedy &  &  &  &  \\
\bottomrule
\end{tabular}

