This notebook evaluates the subset approaches for esnli on
three metrics:

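1. Coverage: number of test inputs for which a subset was identified / total test inputs
2. Validity: number of valid subsets (identified and leading to a flip) / total test inputs; the code also reports precision validity, i.e. valid subsets / identified subsets
3. Median Size: median size of the valid subsets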


| Classifier | Approach              | Coverage (% identified) | Validity (% identified and leads to flip) | Median Size |
|------------|-----------------------|--------------------------|--------------------------------------------|-------------|
| Random     | Class Exclusion       | x                        | x                                          | x           |
| Logistic   | Fast                  |                          |                                            |             |
| Logistic   | Slow                  |                          |                                            |             |
| Logistic   | Fast + CE fallback    |                          |                                            |             |
| Logistic   | Slow + CE fallback    |                          |                                            |             |
| KNN        | Greedy                |                          |                                            |             |
| KNN        | Greedy + CE fallback  |                          |                                            |             |
| SVM        | Greedy                |                          |                                            |             |
| SVM        | Greedy + CE fallback  |                          |                                            |             |
| DT         | Greedy                |                          |                                            |             |
| DT         | Greedy + CE fallback  |                          |                                            |             |
| LMeans     | Greedy                |                          |                                            |             |
| LMeans     | Greedy + CE fallback  |                          |                                            |             |

In [1]:
# Reload imported project modules automatically before each cell runs, so
# edits to the helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Experiment configuration. DATASET_NAME and MODEL_NAME are interpolated into
# the pickle / JSON filenames used by the cells below.
DATASET_NAME = "toxigen"
# NOTE(review): "contradiction" looks like a leftover from an NLI label space
# rather than a toxigen label — confirm before relying on LABEL_SPACE.
LABEL_SPACE = ["non-toxic", "toxic", "contradiction"]
MODEL_NAME = "deberta_large"
# SEED, POOLER and LAYER are not referenced by the cells visible in this notebook.
SEED = 42
POOLER = "mean_with_attention"
LAYER = 24

import pandas as pd

def create_classifier_df(do_CE=False):
    """Build an empty results table with one row per (classifier, approach).

    Parameters
    ----------
    do_CE : bool, default False
        When True, every base approach is followed by an additional row whose
        approach is labelled ``"<approach> + CE fallback"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Classifier``, ``Approach``, ``Coverage``,
        ``Overall Validity``, ``Precision Validity`` and ``Median Size``.
        All metric cells are initialised to the empty string.
    """
    # Classifiers that get rows in the table, in display order.
    classifiers = ["Logistic", "KNN", "SVM", "DT", "LMeans"]
    base_approaches = {
        # "Oracle" is not listed in `classifiers`, so no row is produced for it.
        "Oracle": ["Naive"],
        "Logistic": ["Yang Fast", "Yang Slow"],
        "KNN": ["Greedy"],
        "SVM": ["Greedy"],
        "DT": ["Greedy"],
        "LMeans": ["Greedy"]
    }

    columns = ["Classifier", "Approach", "Coverage", "Overall Validity", "Precision Validity", "Median Size"]
    # Everything after "Classifier" and "Approach" is a metric placeholder.
    n_metric_columns = len(columns) - 2

    data = []
    for classifier in classifiers:
        for approach in base_approaches[classifier]:
            data.append([classifier, approach] + [""] * n_metric_columns)

            if do_CE:
                # Fix: label the extra row with the fallback name. Previously
                # the name was computed but the base approach was appended
                # again, producing indistinguishable duplicate rows.
                ce_fallback_approach = f"{approach} + CE fallback"
                data.append([classifier, ce_fallback_approach] + [""] * n_metric_columns)

    return pd.DataFrame(data, columns=columns)

# Load metrics directly from json: saves computation
from utils.io import load_json
from tabulate import tabulate


def populate_df_from_disk():
    """Assemble the results table from metrics JSON files via ``load_json``.

    For each (classifier, approach) pair a filename is derived from the
    module-level ``DATASET_NAME`` and ``MODEL_NAME``:

    * "Yang Fast" / "Yang Slow" -> ``<dataset>_<model>_yang_<fast|slow>_metrics.json``
    * "Greedy"                  -> ``<dataset>_<model>_<classifier>_greedy_metrics.json``

    NOTE(review): the Yang Slow cell in this notebook writes its metrics to
    ``<dataset>_<model>_yang2023_slow_metrics.json``, which differs from the
    ``yang_slow`` name read here — confirm which file this is meant to load.

    Returns
    -------
    pandas.DataFrame
        One row per pair with columns ``Classifier``, ``Approach``,
        ``Coverage``, ``Overall Validity``, ``Precision Validity`` and
        ``Median Size``.
    """
    # Insertion order defines row order. SVM is left out for now.
    approaches_by_classifier = {
        "Logistic": ["Yang Fast", "Yang Slow"],
        "KNN": ["Greedy"],
        "LGBM": ["Greedy"],
        "LMeans": ["Greedy"],
    }
    metric_columns = ["Coverage", "Overall Validity", "Precision Validity", "Median Size"]

    rows = []
    for clf_name, approaches in approaches_by_classifier.items():
        for approach in approaches:
            if "Yang" in approach:
                # Drop the leading "Yang " and lowercase the rest: "Yang Fast" -> "fast".
                variant = approach[len("Yang "):].lower()
                metric_filename = f"{DATASET_NAME}_{MODEL_NAME}_yang_{variant}_metrics.json"
            elif approach == "Greedy":
                metric_filename = f"{DATASET_NAME}_{MODEL_NAME}_{clf_name}_greedy_metrics.json"

            metrics = load_json(metric_filename)
            rows.append([clf_name, approach] + [metrics[column] for column in metric_columns])

    return pd.DataFrame(rows, columns=["Classifier", "Approach"] + metric_columns)

# Empty template table; the per-approach cells below fill it in row by row.
df = create_classifier_df(do_CE=False)
# Table rebuilt from the metrics JSON files read by populate_df_from_disk.
populated_df = populate_df_from_disk()
# Print both tables as psql-style text grids, without the index column.
print(tabulate(df, headers='keys', tablefmt='psql', showindex=False))
print(tabulate(populated_df, headers='keys', tablefmt='psql', showindex=False))

+--------------+------------+------------+--------------------+----------------------+---------------+
| Classifier   | Approach   | Coverage   | Overall Validity   | Precision Validity   | Median Size   |
|--------------+------------+------------+--------------------+----------------------+---------------|
| Logistic     | Yang Fast  |            |                    |                      |               |
| Logistic     | Yang Slow  |            |                    |                      |               |
| KNN          | Greedy     |            |                    |                      |               |
| SVM          | Greedy     |            |                    |                      |               |
| DT           | Greedy     |            |                    |                      |               |
| LMeans       | Greedy     |            |                    |                      |               |
+--------------+------------+------------+--------------------+----------

In [2]:
## Compute metrics for an oracle baseline

# Load flip list and valid
from utils.io import load_pickle
from MinimalSubsetToFlipPredictions.evaluate.evaluate import compute_subset_metrics
import numpy as np

name = "Oracle"
# Unlike the per-classifier cells, the oracle filenames do not include MODEL_NAME.
flip_list_filename = f"{DATASET_NAME}_{name}_baseline.pickle"
flip_list = load_pickle(flip_list_filename)

# No is_valid pickle is loaded for the oracle: every entry is marked valid.
is_valid = np.full((len(flip_list)), True)

metrics = compute_subset_metrics(flip_list, is_valid)
# Record metrics to disk
import json
with open(f"{DATASET_NAME}_{name}_metrics.json", "w") as handle:
    json.dump(metrics, handle, indent=4)

# Bare last expression so the notebook displays the metrics.
metrics

940/940 identified subsets are valid
Overall validity is 940/940, or 100.0%
Precision validity is 940/940, or 100.0%
Identified 940/940 subsets.
Coverage: 100.0%
Median Valid Subset Sizes is 5283.0, out of 940 valid subsets


{'Coverage': 100.0,
 'Overall Validity': 100.0,
 'Precision Validity': 100.0,
 'Median Size': 5283.0}

In [4]:
## Compute metrics for Yang Fast

# Load flip list and valid
# (load_pickle and compute_subset_metrics are imported in the Oracle cell above.)
# Fix: this cell previously neither set `name` nor loaded a flip list, so it
# silently reused the Oracle cell's `name` and `flip_list`.
# NOTE(review): "yang2023_fast" is inferred from the "yang2023_slow" name used
# by the Yang Slow cell — confirm it matches the pickle files on disk.
name = "yang2023_fast"
flip_list_filename = f"{DATASET_NAME}_{MODEL_NAME}_{name}.pickle"
flip_list = load_pickle(flip_list_filename)

is_valid_filename = f"{DATASET_NAME}_{MODEL_NAME}_{name}_is_valid.pickle"
is_valid = load_pickle(is_valid_filename)

metrics = compute_subset_metrics(flip_list, is_valid)
# Record metrics to disk
import json
with open(f"{DATASET_NAME}_{MODEL_NAME}_{name}_metrics.json", "w") as handle:
    json.dump(metrics, handle, indent=4)

# Record metrics to df: the keys of `metrics` are used as column names.
row = (df['Classifier'] == "Logistic") & (df['Approach'] == "Yang Fast")
for m in metrics:
    df.loc[row, m] = metrics[m]

# Bare last expression so the notebook displays the updated table.
df

59/8394 identified subsets are valid
Overall validity is 59/9824, or 0.6%
Precision validity is 59/8394, or 0.7%
Identified 8394/9824 subsets.
Coverage: 85.44%
Median Valid Subset Sizes is 76.0, out of 59 valid subsets


Unnamed: 0,Classifier,Approach,Coverage,Overall Validity,Precision Validity,Median Size
0,Logistic,Yang Fast,85.44,0.6,0.7,76.0
1,Logistic,Yang Slow,78.73,0.37,0.47,57.0
2,KNN,Greedy,,,,
3,SVM,Greedy,,,,
4,DT,Greedy,,,,
5,LMeans,Greedy,,,,


In [5]:
## Compute metrics for Yang Slow

# Load flip list and valid
from utils.io import load_pickle
from MinimalSubsetToFlipPredictions.evaluate.evaluate import compute_subset_metrics


# `name` is the filename stem shared by the flip-list, is_valid and metrics files.
name = "yang2023_slow"
flip_list_filename = f"{DATASET_NAME}_{MODEL_NAME}_{name}.pickle"
flip_list = load_pickle(flip_list_filename)

is_valid_filename = f"{DATASET_NAME}_{MODEL_NAME}_{name}_is_valid.pickle"
is_valid = load_pickle(is_valid_filename)

metrics = compute_subset_metrics(flip_list, is_valid)
# Record metrics to disk
import json
with open(f"{DATASET_NAME}_{MODEL_NAME}_{name}_metrics.json", "w") as handle:
    json.dump(metrics, handle, indent=4)

# Record metrics to df: the keys of `metrics` are used as column names.
row = (df['Classifier'] == "Logistic") & (df['Approach'] == "Yang Slow")
for m in metrics:
    df.loc[row, m] = metrics[m]

# Bare last expression so the notebook displays the updated table.
df

36/7734 identified subsets are valid
Overall validity is 36/9824, or 0.37%
Precision validity is 36/7734, or 0.47%
Identified 7734/9824 subsets.
Coverage: 78.73%
Median Valid Subset Sizes is 57.0, out of 36 valid subsets


Unnamed: 0,Classifier,Approach,Coverage,Overall Validity,Precision Validity,Median Size
0,Logistic,Yang Fast,85.44,0.6,0.7,76.0
1,Logistic,Yang Slow,78.73,0.37,0.47,57.0
2,KNN,Greedy,,,,
3,SVM,Greedy,,,,
4,DT,Greedy,,,,
5,LMeans,Greedy,,,,


In [6]:
## Compute metrics for KNN

# Load flip list and valid
from utils.io import load_pickle
from MinimalSubsetToFlipPredictions.evaluate.evaluate import compute_subset_metrics


# `name` is the filename stem shared by the flip-list, is_valid and metrics files.
name = "KNN"
flip_list_filename = f"{DATASET_NAME}_{MODEL_NAME}_{name}.pickle"
flip_list = load_pickle(flip_list_filename)

is_valid_filename = f"{DATASET_NAME}_{MODEL_NAME}_{name}_is_valid.pickle"
is_valid = load_pickle(is_valid_filename)

metrics = compute_subset_metrics(flip_list, is_valid)
# Record metrics to disk; the "_greedy_metrics.json" suffix is what
# populate_df_from_disk reads for "Greedy" approaches.
import json
with open(f"{DATASET_NAME}_{MODEL_NAME}_{name}_greedy_metrics.json", "w") as handle:
    json.dump(metrics, handle, indent=4)

# Record metrics to df: the keys of `metrics` are used as column names.
row = (df['Classifier'] == "KNN") & (df['Approach'] == "Greedy")
for m in metrics:
    df.loc[row, m] = metrics[m]

# Bare last expression so the notebook displays the updated table.
df

9822/9824 identified subsets are valid
Overall validity is 9822/9824, or 99.98%
Precision validity is 9822/9824, or 99.98%
Identified 9824/9824 subsets.
Coverage: 100.0%
Median Valid Subset Sizes is 2779.5, out of 9822 valid subsets


Unnamed: 0,Classifier,Approach,Coverage,Overall Validity,Precision Validity,Median Size
0,Logistic,Yang Fast,85.44,0.6,0.7,76.0
1,Logistic,Yang Slow,78.73,0.37,0.47,57.0
2,KNN,Greedy,100.0,99.98,99.98,2779.5
3,SVM,Greedy,,,,
4,DT,Greedy,,,,
5,LMeans,Greedy,,,,


In [7]:
## Compute metrics for LGBM

# Load flip list and valid
from utils.io import load_pickle
from MinimalSubsetToFlipPredictions.evaluate.evaluate import compute_subset_metrics


# `name` is the filename stem shared by the flip-list, is_valid and metrics files.
name = "LGBM"
flip_list_filename = f"{DATASET_NAME}_{MODEL_NAME}_{name}.pickle"
flip_list = load_pickle(flip_list_filename)

is_valid_filename = f"{DATASET_NAME}_{MODEL_NAME}_{name}_is_valid.pickle"
is_valid = load_pickle(is_valid_filename)

metrics = compute_subset_metrics(flip_list, is_valid)
# Record metrics to disk; the "_greedy_metrics.json" suffix is what
# populate_df_from_disk reads for "Greedy" approaches.
import json
with open(f"{DATASET_NAME}_{MODEL_NAME}_{name}_greedy_metrics.json", "w") as handle:
    json.dump(metrics, handle, indent=4)

# Record metrics to df
# NOTE(review): the template table built by create_classifier_df has a "DT"
# row but no "LGBM" row, so the LGBM metrics are recorded under "DT", while
# populate_df_from_disk labels the same classifier "LGBM" — confirm the
# intended label and align both places together.
row = (df['Classifier'] == "DT") & (df['Approach'] == "Greedy")
for m in metrics:
    df.loc[row, m] = metrics[m]

# Bare last expression so the notebook displays the updated table.
df

317/317 identified subsets are valid
Overall validity is 317/9824, or 3.23%
Precision validity is 317/317, or 100.0%
Identified 317/9824 subsets.
Coverage: 3.23%
Median Valid Subset Sizes is 89.0, out of 317 valid subsets


Unnamed: 0,Classifier,Approach,Coverage,Overall Validity,Precision Validity,Median Size
0,Logistic,Yang Fast,85.44,0.6,0.7,76.0
1,Logistic,Yang Slow,78.73,0.37,0.47,57.0
2,KNN,Greedy,100.0,99.98,99.98,2779.5
3,SVM,Greedy,,,,
4,DT,Greedy,3.23,3.23,100.0,89.0
5,LMeans,Greedy,,,,


In [2]:
## Compute metrics for LMeans

# Load flip list and valid
from utils.io import load_pickle
from MinimalSubsetToFlipPredictions.evaluate.evaluate import compute_subset_metrics


# `name` is the filename stem shared by the flip-list, is_valid and metrics files.
name = "LMeans"
flip_list_filename = f"{DATASET_NAME}_{MODEL_NAME}_{name}.pickle"
flip_list = load_pickle(flip_list_filename)

is_valid_filename = f"{DATASET_NAME}_{MODEL_NAME}_{name}_is_valid.pickle"
is_valid = load_pickle(is_valid_filename)

metrics = compute_subset_metrics(flip_list, is_valid)

# Record metrics to disk; the "_greedy_metrics.json" suffix is what
# populate_df_from_disk reads for "Greedy" approaches.
import json
with open(f"{DATASET_NAME}_{MODEL_NAME}_{name}_greedy_metrics.json", "w") as handle:
    json.dump(metrics, handle, indent=4)

# Record metrics to df: the keys of `metrics` are used as column names.
row = (df['Classifier'] == "LMeans") & (df['Approach'] == "Greedy")
for m in metrics:
    df.loc[row, m] = metrics[m]

# Bare last expression so the notebook displays the updated table.
df

9824/9824 identified subsets are valid
Overall validity is 9824/9824, or 100.0%
Precision validity is 9824/9824, or 100.0%
Identified 9824/9824 subsets.
Coverage: 100.0%
Median Valid Subset Sizes is 140523.0, out of 9824 valid subsets


Unnamed: 0,Classifier,Approach,Coverage,Overall Validity,Precision Validity,Median Size
0,Logistic,Yang Fast,,,,
1,Logistic,Yang Slow,,,,
2,KNN,Greedy,,,,
3,SVM,Greedy,,,,
4,DT,Greedy,,,,
5,LMeans,Greedy,100.0,100.0,100.0,140523.0


In [3]:
# Display the table that populate_df_from_disk built from the metrics JSON files.
populated_df

Unnamed: 0,Classifier,Approach,Coverage,Overall Validity,Precision Validity,Median Size
0,Logistic,Yang Fast,99.04,0.53,0.54,2430.0
1,Logistic,Yang Slow,5.11,0.21,4.17,90.5
2,KNN,Greedy,100.0,100.0,100.0,6144.0
3,LGBM,Greedy,13.4,13.4,100.0,4.0
4,LMeans,Greedy,100.0,100.0,100.0,6339.0


In [4]:
# Emit the populated table as a LaTeX tabular (index column omitted).
# NOTE(review): floats are rendered with six decimals by default; consider
# passing a float_format if a shorter representation is wanted.
print(populated_df.to_latex(index=False))

\begin{tabular}{llrrrr}
\toprule
Classifier & Approach & Coverage & Overall Validity & Precision Validity & Median Size \\
\midrule
Logistic & Yang Fast & 99.040000 & 0.530000 & 0.540000 & 2430.000000 \\
Logistic & Yang Slow & 5.110000 & 0.210000 & 4.170000 & 90.500000 \\
KNN & Greedy & 100.000000 & 100.000000 & 100.000000 & 6144.000000 \\
LGBM & Greedy & 13.400000 & 13.400000 & 100.000000 & 4.000000 \\
LMeans & Greedy & 100.000000 & 100.000000 & 100.000000 & 6339.000000 \\
\bottomrule
\end{tabular}

