# Evaluation notebook

We have divided this notebook into the following parts:

1. Load **matrix**: We load a CSV file with the matrix concerning the model to evaluate (e.g., validation, calibration or test set data).
2. Load **preds**: We load a CSV file with the predictions concerning the model to evaluate.
3. **Tokenize** the matrix for evaluation purposes: We apply tokenization (e.g., _spacy_ or _whitespace_) before evaluating the results.
4. **Compute metrics** (dubbed evaluations): We compute the specified evaluation metrics.
5. **Dump metrics**: After computing the evaluations, dump their results in the disk.

**Note**: We assume that all of these files will have a set of index columns through which we can jointly align them.


In [1]:
OUTPUT_DIR = "../outputs"

MODEL_NAME = "allenai/unifiedqa-t5-small"
#model_name = "t5-small"


# name of the dataset to preprocess
DATASET_NAME, SPLIT_NAME = "squad", "validation"
# DATASET_NAME, SPLIT_NAME = "newsqa", "dev"
# DATASET_NAME, SPLIT_NAME = ('squadshifts', 'new_wiki'), "test"
# DATASET_NAME, SPLIT_NAME = ('squadshifts', 'nyt'), "test"
# DATASET_NAME, SPLIT_NAME = ('squadshifts', 'amazon'), "test"
# DATASET_NAME, SPLIT_NAME = ('squadshifts', 'reddit'), "test"
# DATASET_NAME, SPLIT_NAME = "narrativeqa", "test_5k_sample_seed_2022"


IS_LOCAL_FS_DATASET = True \
    if (DATASET_NAME in ("newsqa", ) or SPLIT_NAME in ("test_5k_sample_seed_2022",)) \
    else False

if isinstance(DATASET_NAME, tuple):
    NORMALIZED_DATASET_NAME = "".join(DATASET_NAME)
else:
    NORMALIZED_DATASET_NAME = DATASET_NAME

BASE_FILENAME = f"{NORMALIZED_DATASET_NAME}_{SPLIT_NAME}"

ROOT_DIR = f"{OUTPUT_DIR}/results/{NORMALIZED_DATASET_NAME}/{SPLIT_NAME}"


MATRIX_DIR = f"{ROOT_DIR}/matrix"
MATRIX_FILEPATH = f"{MATRIX_DIR}/{BASE_FILENAME}_preprocessed.csv"

PREDS_DIR = f"{ROOT_DIR}/preds"
PREDS_FILEPATH = f"{PREDS_DIR}/{BASE_FILENAME}" + f"_{NORMALIZED_DATASET_NAME}_{SPLIT_NAME}.csv.gz"
PREDS_SCORES_FILEPATH = f"{PREDS_DIR}/{BASE_FILENAME}" + f"_{NORMALIZED_DATASET_NAME}_{SPLIT_NAME}_scores.csv.gz"

# ----------------------------------------------------------------------------------
# Outputs
# ----------------------------------------------------------------------------------
EVALS_DIR = f"{ROOT_DIR}/evals"
!mkdir -p {EVALS_DIR}

# Tokenizer
TOKENIZER = "default"
# TOKENIZER = "spacy"
TOKENIZER_FILEPATH = f"{EVALS_DIR}/{BASE_FILENAME}_evals_{TOKENIZER}_config.yml"

# Instance-wise metrics for each prediction
EVALS_FILEPATH = f"{EVALS_DIR}/{BASE_FILENAME}_evals_{TOKENIZER}.csv.gz"

# Dataset-wise metrics avg over all predictions (it will include calibration and correlation metrics)
EVALS_GLOBAL_FILEPATH = f"{ROOT_DIR}/evals/{BASE_FILENAME}_{TOKENIZER}"
CORR_METRICS_SUFFIX = "correlation_metrics.csv"
CALIB_METRICS_SUFFIX = "calib_metrics.csv"
PERF_METRICS_SUFFIX = "perf_metrics.csv"

# Arguments used to read the files from disk
csv_kwargs = {
    "compression": "gzip",
    # "encoding": "utf-8",
}

# ----------------------------------------
## Columns names
# ----------------------------------------
ID_COLS = ["example_id", "answer_id"]

UNIQUE_ID_COL = ID_COLS[0]
NON_UNIQUE_ID_COL = ID_COLS[1]
print("Using", UNIQUE_ID_COL, "as the unique column to de-duplicate the data")

Using example_id as the unique column to de-duplicate the data


## Load Data and Preds

We expect the data matrix to be a matrix of instances described by the `ID_COLUMNS` specified above but also by the following columns (along with some others that won't be used in this notebook such as the Xs): 

- `TARGET_LABEL`: the golden text of the example. It should not contain any model-specific preprocessing.
- `TARGET_MULTI_LABELS`: the multiple annotations that could be provided to that example (e.g., in a QA setting we can have multiple possible answers for the same context question pair.


We expect the corresponding **predictions** to be described by the `ID_COLUMNS` but also by the following columns:
- `TARGET_PRED_LABEL`: the predicted text.

In [2]:
import pandas as pd
import numpy as np
import yaml

In [3]:
TARGET_LABEL = "labels"
TARGET_MULTI_LABELS = "multi_way_labels"

TARGET_PRED_LABEL = "preds"

In [4]:
# matrix = pd.read_csv(MATRIX_FILEPATH, **csv_kwargs, converters={TARGET_MULTI_LABELS: eval}).set_index(ID_COLS)
matrix = pd.read_csv(MATRIX_FILEPATH).set_index(ID_COLS)
print("Loaded", len(matrix), "datapoints from", MATRIX_FILEPATH)

preds = pd.read_csv(PREDS_FILEPATH, **csv_kwargs).set_index(ID_COLS)
print("Loaded", len(preds), "predictions from", PREDS_FILEPATH)

assert len(preds) <= len(matrix), "More preds than datapoints: len(preds) > len(matrix)"

Loaded 18015 datapoints from ../outputs/results/squad/validation/matrix/squad_validation_preprocessed.csv
Loaded 10570 predictions from ../outputs/results/squad/validation/preds/squad_validation_squad_validation.csv.gz


### Complement predictions w/ multiple normalization schemes

In [5]:
preds_raw_scores = preds.preds_raw_scores.apply(eval)

# Compute arithm proba
preds["score_proba_arithm"] = preds_raw_scores.apply(np.mean)
preds["score_proba_std"] = preds_raw_scores.apply(np.std)


from scipy.stats.mstats import gmean, hmean
preds["score_proba_geom"] = preds_raw_scores.apply(gmean)
preds["score_proba_hmean"] = preds_raw_scores.apply(hmean)

preds.to_csv(PREDS_SCORES_FILEPATH, **{"compression": "gzip", "header": True, "encoding": "utf-8",})

In [6]:
preds

Unnamed: 0_level_0,Unnamed: 1_level_0,preds,preds_id,preds_raw_int,preds_raw_str,preds_raw_count,truncated,score_proba,preds_raw_scores,score_proba_arithm,score_proba_std,score_proba_geom,score_proba_hmean
example_id,answer_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
56be4db0acb8001400a502ec,d811f352fec69c6b04229bea6c53e8b1,Carolina Panthers,25b8b0375191e4af224a5c59beab4ce1,"[5089, 21149, 7, 1]","['▁Carolina', '▁Panther', 's']",3,1,0.804312,"[0.8083785176277161, 0.9999995231628418, 0.999...",0.950837,0.082273,0.947013,0.942929
56be4db0acb8001400a502ed,19c3b94be895352c1141244c2d436f7e,Carolina Panthers,25b8b0375191e4af224a5c59beab4ce1,"[5089, 21149, 7, 1]","['▁Carolina', '▁Panther', 's']",3,1,0.990322,"[0.9952126741409302, 0.9999997615814209, 0.999...",0.997575,0.002403,0.997572,0.997569
56be4db0acb8001400a502ee,8e3cdc8e6a6146c500b344d1db8f53ee,San Francisco Bay Area,d845ec7044fdb92db8dc7dee95f06b4b,"[1051, 5901, 2474, 5690, 1]","['▁San', '▁Francisco', '▁Bay', '▁Area']",4,1,0.150021,"[0.3066118657588959, 0.9989548921585083, 0.965...",0.755446,0.291388,0.684275,0.604799
56be4db0acb8001400a502ef,31fde941280b986bd50133d2a4dd077d,Carolina Panthers,25b8b0375191e4af224a5c59beab4ce1,"[5089, 21149, 7, 1]","['▁Carolina', '▁Panther', 's']",3,1,0.910507,"[0.9142481684684753, 0.9999992847442627, 0.999...",0.977539,0.036578,0.976834,0.976109
56be4db0acb8001400a502f0,6135253b6dcda73931362164a1371b78,gold,c762f5cc7760f52c37a1ad52ab2d693a,"[2045, 1]",['▁gold'],1,1,0.979656,"[0.9989109039306641, 0.980724573135376]",0.989818,0.009093,0.989776,0.989734
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5737aafd1c456719005744fb,a6994acc1ae88f7fa866ec28cf4823d0,the kilogram-force,f66a3d972d46c3f320f09dc795ced357,"[8, 23332, 18, 10880, 1]","['▁the', '▁kilogram', '-', 'force']",4,1,0.378249,"[0.6792029142379761, 0.8805549740791321, 0.999...",0.838396,0.155831,0.823295,0.807914
5737aafd1c456719005744fc,82573e294f5cc900b2f9e13df2929a66,kilopond,17476b8026606f0a99b0b099675683ba,"[3, 157, 173, 32, 7290, 1]","['▁', 'k', 'il', 'o', 'pond']",5,1,0.755473,"[0.7666891813278198, 0.9989207983016968, 0.999...",0.958674,0.085989,0.954340,0.949497
5737aafd1c456719005744fd,2726fa3db542e4e0d3bbadf2b5d94afd,the metric slug,965fd244b9c1171d38bf08d7a415b691,"[8, 3, 7959, 3, 7, 8076, 1]","['▁the', '▁', 'metric', '▁', 's', 'lug']",6,1,0.418631,"[0.4673277735710144, 0.9274631142616272, 0.999...",0.908655,0.181909,0.883031,0.848143
5737aafd1c456719005744fe,ac557fb4ecec58ec64f4ca1e7c197d5a,metric slug,1189f472a1f1a450ee62108b31c88920,"[3, 7959, 3, 7, 8076, 1]","['▁', 'metric', '▁', 's', 'lug']",5,1,0.424001,"[0.5206716060638428, 0.8498675227165222, 0.974...",0.888051,0.172257,0.866751,0.840284


### Get overall DATA

In [7]:
DATA = matrix.join(preds.droplevel(NON_UNIQUE_ID_COL), how="left")
# Dropping nan columns for now 
# (loose examination suggests that this is due to model outputting unknown (with >< characters))
DATA = DATA[~DATA["preds"].isna()]
print("After dropping `NaN` predictions, final dataset has", len(DATA), "examples")

After dropping `NaN` predictions, final dataset has 18005 examples


In [8]:
DATA.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,title,context,question,labels,multi_way_labels,preds,preds_id,preds_raw_int,preds_raw_str,preds_raw_count,truncated,score_proba,preds_raw_scores,score_proba_arithm,score_proba_std,score_proba_geom,score_proba_hmean
example_id,answer_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
56be4db0acb8001400a502ec,d811f352fec69c6b04229bea6c53e8b1,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team represented the AFC at Super Bo...,Denver Broncos,['Denver Broncos'],Carolina Panthers,25b8b0375191e4af224a5c59beab4ce1,"[5089, 21149, 7, 1]","['▁Carolina', '▁Panther', 's']",3,1,0.804312,"[0.8083785176277161, 0.9999995231628418, 0.999...",0.950837,0.082273,0.947013,0.942929
56be4db0acb8001400a502ed,19c3b94be895352c1141244c2d436f7e,Super_Bowl_50,Super Bowl 50 was an American football game to...,Which NFL team represented the NFC at Super Bo...,Carolina Panthers,['Carolina Panthers'],Carolina Panthers,25b8b0375191e4af224a5c59beab4ce1,"[5089, 21149, 7, 1]","['▁Carolina', '▁Panther', 's']",3,1,0.990322,"[0.9952126741409302, 0.9999997615814209, 0.999...",0.997575,0.002403,0.997572,0.997569
56be4db0acb8001400a502ee,8e3cdc8e6a6146c500b344d1db8f53ee,Super_Bowl_50,Super Bowl 50 was an American football game to...,Where did Super Bowl 50 take place?,Levi's Stadium,"[""Levi's Stadium"", ""Levi's Stadium in the San ...",San Francisco Bay Area,d845ec7044fdb92db8dc7dee95f06b4b,"[1051, 5901, 2474, 5690, 1]","['▁San', '▁Francisco', '▁Bay', '▁Area']",4,1,0.150021,"[0.3066118657588959, 0.9989548921585083, 0.965...",0.755446,0.291388,0.684275,0.604799
56be4db0acb8001400a502ee,ac838f0529224befb95e3caf06e91ea8,Super_Bowl_50,Super Bowl 50 was an American football game to...,Where did Super Bowl 50 take place?,Levi's Stadium in the San Francisco Bay Area a...,"[""Levi's Stadium"", ""Levi's Stadium in the San ...",San Francisco Bay Area,d845ec7044fdb92db8dc7dee95f06b4b,"[1051, 5901, 2474, 5690, 1]","['▁San', '▁Francisco', '▁Bay', '▁Area']",4,1,0.150021,"[0.3066118657588959, 0.9989548921585083, 0.965...",0.755446,0.291388,0.684275,0.604799
56be4db0acb8001400a502ee,694680d52e7d8e756171efa32acdada8,Super_Bowl_50,Super Bowl 50 was an American football game to...,Where did Super Bowl 50 take place?,"Santa Clara, California","[""Levi's Stadium"", ""Levi's Stadium in the San ...",San Francisco Bay Area,d845ec7044fdb92db8dc7dee95f06b4b,"[1051, 5901, 2474, 5690, 1]","['▁San', '▁Francisco', '▁Bay', '▁Area']",4,1,0.150021,"[0.3066118657588959, 0.9989548921585083, 0.965...",0.755446,0.291388,0.684275,0.604799


## Tokenize 

At the moment, we provide two different tokenizations:

- `default`: uses punctuation, lowercase, determinants normalization, followed by whitespace and single quotes normalization. This method closely follows the evaluation strategies in the HuggingFace repository for QA.
- `spacy`: uses `spacy` framework for tokenization.

We apply this tokenization to the specified columns: `TARGET_LABEL`, `TARGET_MULTI_LABELS`, and `TARGET_PRED_LABEL`, placing their resulting tokenized versions on columns w/ the same name but with a `_token` suffix.


### Apply tokenization

In [10]:
import tokenizer as t

print("Applying tokenizer", TOKENIZER)
if TOKENIZER == "spacy":
    tokenizer_classpath = t.spacy_tokenizer
elif TOKENIZER == "default":
    tokenizer_classpath = t.default_tokenizer
else:
    raise ValueError(f"Unrecognized tokenizer value: {TOKENIZER}")

tokenizer_params = {
    "tokens": True
} 

for _col in (TARGET_LABEL, TARGET_PRED_LABEL, TARGET_MULTI_LABELS):
    print("Applying tokenization to col", _col)
    try:
        DATA[f"{_col}{t.TOKENIZATION_SUFFIX}"] = DATA[_col].apply(eval).apply(tokenizer_classpath, **tokenizer_params)
        print("Eval", _col)
    except:
        DATA[f"{_col}{t.TOKENIZATION_SUFFIX}"] = DATA[_col].apply(tokenizer_classpath, **tokenizer_params)

        
with open(TOKENIZER_FILEPATH, "w") as f:
    yaml.safe_dump({
        "tokenizer_classpath": tokenizer_classpath.__name__,
        "tokenizer_params": tokenizer_params,
    }, f)

Applying tokenizer default
Applying tokenization to col labels
Applying tokenization to col preds
Applying tokenization to col multi_way_labels
Eval multi_way_labels


## Compute metrics

We'll resort to HuggingFace's `datasets` builtin [metrics](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Metric) library. This provides more flexibility and is also less cumbersome to maintain code. 

Unfortunately, it means that it's not as efficient, since we will be iterating the whole dataset `M` times, where `M` is the number of metrics to compute. One might compute these metrics in parallel. We resort to a _pipeline_ implementation which should be easily adapted for multithreading to benefit from parallelism.

In order to use standard standard metrics based in word overlap (e.g., `precision`, `recall` and `f1-score`) we need to create our own methods. We'll use the implementation available in [`datasets/squad_metrics.py`](https://github.com/huggingface/transformers/blob/master/src/transformers/data/metrics/squad_metrics.py).

---

Our evaluation pipeline supports the following metrics:

- **performance metrics**: evaluate performance metrics like `precision`, `recall`, `bleu`, `rougeL`, among others.
- **correlation metrics**: evaluate the correlation between specified pairs of columns. Correlation metrics include `pearsonr`, `spearman`, and `kendalltau`.
- **calibration metrics**: evaluate calibration metrics like `equal_width_ece`, `log_loss`, `brier_score`, among others.

In [11]:
import metrics as m


### Performance metrics

Based on the names of the metrics specified by the user, we'll have to delegate the appropriate methods. Since different metrics require different types of inputs, we also provide the option for the user to specify which columns to use for applying a given metric.

These metrics will be computed at an instance level (per each example in the dataset).

In [22]:
_DATA = DATA.loc[["5737aafd1c456719005744fd"]]

In [23]:
perf_metric = m.PerformanceMetrics(target_label=TARGET_LABEL, pred_label=TARGET_PRED_LABEL, token_suffix=t.TOKENIZATION_SUFFIX, target_multi_label=TARGET_MULTI_LABELS)
perf_results = perf_metric.compute(_DATA)
perf_results.head()

[nltk_data] Downloading package wordnet to /home/kat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0_level_0,Unnamed: 1_level_0,exact_match,first_error_position,precision,recall,f1_score,csi,rouge1,rouge2,rougeL,rougeLsum,...,bleu,brevity_penalty,length_ratio,translation_length,reference_length,bleu_1,bleu_2,bleu_3,bleu_4,metric_type
example_id,answer_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
5737aafd1c456719005744fd,2726fa3db542e4e0d3bbadf2b5d94afd,1,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,2.0,2,1,1.0,1.0,0.0,0.0,performance
5737aafd1c456719005744fd,c6b6f34dfde1af7b43e194a6a4125a40,0,0.0,0.5,1.0,0.666667,0.5,0.666667,0.0,0.666667,0.666667,...,0.0,1.0,2.0,2,1,1.0,1.0,0.0,0.0,performance
5737aafd1c456719005744fd,d50016fee3ff04db84ae4ef89d858317,1,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,1.0,2.0,2,1,1.0,1.0,0.0,0.0,performance


In [34]:
perf_results[["rouge1", "rouge2", "rougeL"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,rouge1,rouge2,rougeL
example_id,answer_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
5737aafd1c456719005744fd,2726fa3db542e4e0d3bbadf2b5d94afd,1.0,1.0,1.0
5737aafd1c456719005744fd,c6b6f34dfde1af7b43e194a6a4125a40,0.666667,0.0,0.666667
5737aafd1c456719005744fd,d50016fee3ff04db84ae4ef89d858317,1.0,1.0,1.0


In [28]:
_DATA[[TARGET_LABEL, TARGET_PRED_LABEL, f"{TARGET_PRED_LABEL}{t.TOKENIZATION_SUFFIX}", TARGET_MULTI_LABELS, f"{TARGET_MULTI_LABELS}{t.TOKENIZATION_SUFFIX}"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,labels,preds,preds_token,multi_way_labels,multi_way_labels_token
example_id,answer_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5737aafd1c456719005744fd,2726fa3db542e4e0d3bbadf2b5d94afd,metric slug,the metric slug,"[metric, slug]","['metric slug', 'slug', 'the metric slug']","[[metric, slug], [slug], [metric, slug]]"
5737aafd1c456719005744fd,c6b6f34dfde1af7b43e194a6a4125a40,slug,the metric slug,"[metric, slug]","['metric slug', 'slug', 'the metric slug']","[[metric, slug], [slug], [metric, slug]]"
5737aafd1c456719005744fd,d50016fee3ff04db84ae4ef89d858317,the metric slug,the metric slug,"[metric, slug]","['metric slug', 'slug', 'the metric slug']","[[metric, slug], [slug], [metric, slug]]"


array(['the metric slug', 'the metric slug', 'the metric slug'],
      dtype=object)

In [40]:
import datasets

rouge = datasets.load_metric("rouge", keep_in_memory=True, seed=42)
rouge.compute(predictions=[_DATA["preds"].values[0]], references=[_DATA["multi_way_labels_token"].values[0]])

{'rouge1': AggregateScore(low=Score(precision=0.6666666666666666, recall=0.4, fmeasure=0.5), mid=Score(precision=0.6666666666666666, recall=0.4, fmeasure=0.5), high=Score(precision=0.6666666666666666, recall=0.4, fmeasure=0.5)),
 'rouge2': AggregateScore(low=Score(precision=0.5, recall=0.25, fmeasure=0.3333333333333333), mid=Score(precision=0.5, recall=0.25, fmeasure=0.3333333333333333), high=Score(precision=0.5, recall=0.25, fmeasure=0.3333333333333333)),
 'rougeL': AggregateScore(low=Score(precision=0.6666666666666666, recall=0.4, fmeasure=0.5), mid=Score(precision=0.6666666666666666, recall=0.4, fmeasure=0.5), high=Score(precision=0.6666666666666666, recall=0.4, fmeasure=0.5)),
 'rougeLsum': AggregateScore(low=Score(precision=0.6666666666666666, recall=0.4, fmeasure=0.5), mid=Score(precision=0.6666666666666666, recall=0.4, fmeasure=0.5), high=Score(precision=0.6666666666666666, recall=0.4, fmeasure=0.5))}

#### Dump instance-wise metrics to filepath

In [39]:
# perf_results = perf_results.join(DATA, how="left")
perf_results.to_csv(EVALS_FILEPATH, **csv_kwargs)

## Global metrics (or Dataset-wise metrics)


These metrics include **correlation** and **calibration** metrics, as well as the mean values for the **performance metrics** we computed before.

In [12]:
SCORE_COLS = [
    "score_proba",
    ## Add other normalization scores. We will assume these columns
    ## are normalized between [0, 1]. Consider renormalizing prior
    ## using this script
    "score_proba_arithm",
    "score_proba_geom",
    "score_proba_hmean",
    "score_proba_std",
    # ""...
]

# Validation of the scores range
for col in SCORE_COLS:
    assert 0 <= min(DATA[col]), f"{col} col is less than 0"
    assert max(DATA[col]) <= 1, f"{col} col is greater than 1"


GLOBAL_METRICS = DATA[SCORE_COLS].copy()
GLOBAL_METRICS = GLOBAL_METRICS.join(perf_results, how="left")

### Filter the duplicate ones

When dealing with multi-way annotations one might have different golden annotations for the same example pair. Therefore, we're going to drop the duplicates as it is standard practice, keeping only the example with highest achieving metric values. 


In the past, we've been using the columns `exact_match` and `f1_score` to sort the performance metrics descending and then use drop_duplicates, while keeping the first instance. This guarantees we only keep the highest achieving `f1_score`s. Consider changing the `REFERENCE_METRICS` below to adopt a different sorting process. 

**Note**: Be mindful when using multiple metrics, since this code is not directly supporting metrics with opposite senses and, in fact, is assuming that **higher values of REFERENCE METRICS are better**.

In [13]:
REFERENCE_METRICS = ["exact_match", "f1_score"] 

# -----------------------------------------------------------------
GLOBAL_METRICS = GLOBAL_METRICS.reset_index()
print("Before de-duplication of data:", len(GLOBAL_METRICS))

_temp = GLOBAL_METRICS.sort_values(REFERENCE_METRICS, ascending=False)
GLOBAL_METRICS_UNIQUE = GLOBAL_METRICS[~_temp.duplicated(UNIQUE_ID_COL)].set_index(ID_COLS)

print("After de-duplication of data:", len(GLOBAL_METRICS_UNIQUE))
# -----------------------------------------------------------------
GLOBAL_METRICS_UNIQUE

Before de-duplication of data: 18005
After de-duplication of data: 10565


  GLOBAL_METRICS_UNIQUE = GLOBAL_METRICS[~_temp.duplicated(UNIQUE_ID_COL)].set_index(ID_COLS)


Unnamed: 0_level_0,Unnamed: 1_level_0,score_proba,score_proba_arithm,score_proba_geom,score_proba_hmean,score_proba_std,exact_match,first_error_position,precision,recall,f1_score,...,bleu,brevity_penalty,length_ratio,translation_length,reference_length,bleu_1,bleu_2,bleu_3,bleu_4,metric_type
example_id,answer_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
56be4db0acb8001400a502ec,d811f352fec69c6b04229bea6c53e8b1,0.804312,0.950837,0.947013,0.942929,0.082273,0,0.0,0.0,0.000000,0.000000,...,0.0,1.0,1.0,2,2,0.0,0.0,0.0,0.0,performance
56be4db0acb8001400a502ed,19c3b94be895352c1141244c2d436f7e,0.990322,0.997575,0.997572,0.997569,0.002403,1,,1.0,1.000000,1.000000,...,0.0,1.0,1.0,2,2,1.0,1.0,0.0,0.0,performance
56be4db0acb8001400a502ee,ac838f0529224befb95e3caf06e91ea8,0.150021,0.755446,0.684275,0.604799,0.291388,0,0.0,1.0,0.363636,0.533333,...,1.0,1.0,2.0,4,2,1.0,1.0,1.0,1.0,performance
56be4db0acb8001400a502ef,31fde941280b986bd50133d2a4dd077d,0.910507,0.977539,0.976834,0.976109,0.036578,0,0.0,0.0,0.000000,0.000000,...,0.0,1.0,1.0,2,2,0.0,0.0,0.0,0.0,performance
56be4db0acb8001400a502f0,6135253b6dcda73931362164a1371b78,0.979656,0.989818,0.989776,0.989734,0.009093,1,,1.0,1.000000,1.000000,...,0.0,1.0,1.0,1,1,1.0,0.0,0.0,0.0,performance
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5737aafd1c456719005744fb,a6994acc1ae88f7fa866ec28cf4823d0,0.378249,0.838396,0.823295,0.807914,0.155831,1,,1.0,1.000000,1.000000,...,0.0,1.0,1.0,1,1,1.0,0.0,0.0,0.0,performance
5737aafd1c456719005744fc,82573e294f5cc900b2f9e13df2929a66,0.755473,0.958674,0.954340,0.949497,0.085989,1,,1.0,1.000000,1.000000,...,0.0,1.0,1.0,1,1,1.0,0.0,0.0,0.0,performance
5737aafd1c456719005744fd,2726fa3db542e4e0d3bbadf2b5d94afd,0.418631,0.908655,0.883031,0.848143,0.181909,1,,1.0,1.000000,1.000000,...,0.0,1.0,2.0,2,1,1.0,1.0,0.0,0.0,performance
5737aafd1c456719005744fe,ac557fb4ecec58ec64f4ca1e7c197d5a,0.424001,0.888051,0.866751,0.840284,0.172257,0,0.0,0.0,0.000000,0.000000,...,0.0,1.0,2.0,2,1,0.0,0.0,0.0,0.0,performance


### Correlation metrics

In [14]:
corr_metrics = []
for score_col in SCORE_COLS:
    score_col_results = m.CorrelationMetric(score_col).compute(GLOBAL_METRICS_UNIQUE, GLOBAL_METRICS_UNIQUE.columns)
    score_col_results = score_col_results.dropna()
    corr_metrics.append(score_col_results)
    
corr_metrics = pd.concat(corr_metrics, axis=0)
corr_metrics



Unnamed: 0,x,y,pearsonr,pearsonr_pvalue,spearmanr,spearmanr_pvalue,kendall_tau,kendall_tau_pvalue,metric_type
0,score_proba_arithm,score_proba,0.723120,0.000000e+00,0.850048,0.000000e+00,0.671886,0.000000e+00,correlation
1,score_proba_geom,score_proba,0.756899,0.000000e+00,0.874668,0.000000e+00,0.701782,0.000000e+00,correlation
2,score_proba_hmean,score_proba,0.786405,0.000000e+00,0.895335,0.000000e+00,0.726468,0.000000e+00,correlation
3,score_proba_std,score_proba,-0.889323,0.000000e+00,-0.930760,0.000000e+00,-0.782157,0.000000e+00,correlation
4,exact_match,score_proba,0.434459,0.000000e+00,0.411516,0.000000e+00,0.336017,0.000000e+00,correlation
...,...,...,...,...,...,...,...,...,...
18,reference_length,score_proba_std,0.115825,6.979006e-33,0.174825,2.801794e-73,0.132207,8.646511e-73,correlation
19,bleu_1,score_proba_std,-0.312209,1.429874e-237,-0.313039,6.826644e-239,-0.248589,3.903245e-231,correlation
20,bleu_2,score_proba_std,0.013741,1.578652e-01,0.066730,6.612518e-12,0.049783,1.190520e-10,correlation
21,bleu_3,score_proba_std,0.062614,1.183710e-10,0.117687,6.697984e-34,0.093166,9.233801e-33,correlation


In [15]:
corr_metrics.to_csv(f"{EVALS_GLOBAL_FILEPATH}_{CORR_METRICS_SUFFIX}", index=False)

In [16]:
corr_metrics[corr_metrics.x == "f1_score"]

Unnamed: 0,x,y,pearsonr,pearsonr_pvalue,spearmanr,spearmanr_pvalue,kendall_tau,kendall_tau_pvalue,metric_type
7,f1_score,score_proba,0.360378,1.74e-321,0.401713,0.0,0.312725,0.0,correlation
7,f1_score,score_proba_arithm,0.316356,3.264127e-244,0.311242,4.878549e-236,0.24479,1.531571e-230,correlation
7,f1_score,score_proba_geom,0.332939,7.646036e-272,0.32273,1.2584509999999999e-254,0.253834,9.872542000000001e-248,correlation
7,f1_score,score_proba_hmean,0.346234,2.844618e-295,0.332775,1.460001e-271,0.261659,4.162069e-263,correlation
7,f1_score,score_proba_std,-0.322909,6.350618000000001e-255,-0.340807,1.437291e-285,-0.267664,3.185876e-275,correlation


### Calibration metrics


Amongst the calibration metrics, we have `expected calibration error (ECE)`, `brier score`, `AUC` which quantify the absolute and relative calibrations measures. 


In [17]:
CALIB_METRICS = ["exact_match", "f1_score", "precision", "recall"]

In [18]:
calib_metrics = []
for calib_metric in CALIB_METRICS:
    calib_results = m.CalibrationMetrics(calib_metric).compute(GLOBAL_METRICS_UNIQUE, SCORE_COLS)
    calib_results = calib_results.dropna()
    
    calib_metrics.append(calib_results)
    
calib_metrics = pd.concat(calib_metrics, axis=0)
calib_metrics

Unnamed: 0,x,y,mse,mae,ce_avg,ce_std,ECE_eq_width,ECE_eq_width_max,ECE_eq_freq,ECE_eq_freq_max,hyperparams,metric_type
0,exact_match,score_proba,0.164245,0.318825,-0.084104,0.396449,0.093259,0.009955,0.093833,0.019168,"{'n_bins': 20, 'frac': 0.1}",calibration
1,exact_match,score_proba_arithm,0.201942,0.26998,0.168796,0.416473,0.168796,0.057395,0.168797,0.024001,"{'n_bins': 20, 'frac': 0.1}",calibration
2,exact_match,score_proba_geom,0.195799,0.270818,0.157563,0.413489,0.157563,0.047342,0.157563,0.022978,"{'n_bins': 20, 'frac': 0.1}",calibration
3,exact_match,score_proba_hmean,0.189224,0.271537,0.144353,0.410349,0.144353,0.03697,0.144353,0.019659,"{'n_bins': 20, 'frac': 0.1}",calibration
4,exact_match,score_proba_std,0.644517,0.725426,-0.654346,0.465133,0.655215,0.326846,0.65473,0.095624,"{'n_bins': 20, 'frac': 0.1}",calibration
0,f1_score,score_proba,0.141903,0.292764,-0.177878,0.332058,0.180668,0.018007,0.180956,0.039471,"{'n_bins': 20, 'frac': 0.1}",calibration
1,f1_score,score_proba_arithm,0.098455,0.177282,0.075023,0.304675,0.075023,0.024001,0.075023,0.011642,"{'n_bins': 20, 'frac': 0.1}",calibration
2,f1_score,score_proba_geom,0.095493,0.178924,0.063789,0.302363,0.063789,0.019115,0.063789,0.008196,"{'n_bins': 20, 'frac': 0.1}",calibration
3,f1_score,score_proba_hmean,0.092858,0.181151,0.050579,0.300499,0.050639,0.01718,0.050579,0.00708,"{'n_bins': 20, 'frac': 0.1}",calibration
4,f1_score,score_proba_std,0.684115,0.778313,-0.74812,0.35275,0.748875,0.337707,0.748428,0.096733,"{'n_bins': 20, 'frac': 0.1}",calibration


In [19]:
calib_metrics.describe().to_csv(f"{EVALS_GLOBAL_FILEPATH}_{CALIB_METRICS_SUFFIX}", index=False)

In [20]:
calib_metrics[calib_metrics.x == "f1_score"]

Unnamed: 0,x,y,mse,mae,ce_avg,ce_std,ECE_eq_width,ECE_eq_width_max,ECE_eq_freq,ECE_eq_freq_max,hyperparams,metric_type
0,f1_score,score_proba,0.141903,0.292764,-0.177878,0.332058,0.180668,0.018007,0.180956,0.039471,"{'n_bins': 20, 'frac': 0.1}",calibration
1,f1_score,score_proba_arithm,0.098455,0.177282,0.075023,0.304675,0.075023,0.024001,0.075023,0.011642,"{'n_bins': 20, 'frac': 0.1}",calibration
2,f1_score,score_proba_geom,0.095493,0.178924,0.063789,0.302363,0.063789,0.019115,0.063789,0.008196,"{'n_bins': 20, 'frac': 0.1}",calibration
3,f1_score,score_proba_hmean,0.092858,0.181151,0.050579,0.300499,0.050639,0.01718,0.050579,0.00708,"{'n_bins': 20, 'frac': 0.1}",calibration
4,f1_score,score_proba_std,0.684115,0.778313,-0.74812,0.35275,0.748875,0.337707,0.748428,0.096733,"{'n_bins': 20, 'frac': 0.1}",calibration


### Performance metrics (dataset wise)

In [21]:
global_perf = (
    pd.DataFrame(GLOBAL_METRICS_UNIQUE.mean(), columns=["metric_avg"]),
    pd.DataFrame(GLOBAL_METRICS_UNIQUE.std(), columns=["metric_std"]),
)
# GLOBAL_METRICS_UNIQUE[~GLOBAL_METRICS_UNIQUE.first_error_position.isna()].mean()
global_perf = pd.concat(global_perf, axis=1)
global_perf

  pd.DataFrame(GLOBAL_METRICS_UNIQUE.mean(), columns=["metric_avg"]),
  pd.DataFrame(GLOBAL_METRICS_UNIQUE.std(), columns=["metric_std"]),


Unnamed: 0,metric_avg,metric_std
score_proba,0.666298,0.26033
score_proba_arithm,0.919199,0.078963
score_proba_geom,0.907965,0.09245
score_proba_hmean,0.894755,0.108885
score_proba_std,0.096057,0.076951
exact_match,0.750402,0.432801
first_error_position,0.517634,1.391829
precision,0.847037,0.32693
recall,0.865899,0.317797
f1_score,0.844176,0.320319


In [22]:
global_perf.describe().to_csv(f"{EVALS_GLOBAL_FILEPATH}_{PERF_METRICS_SUFFIX}", index=False)

In [23]:
!ls {EVALS_DIR}

squad_validation_default_calib_metrics.csv
squad_validation_default_correlation_metrics.csv
squad_validation_default_perf_metrics.csv
squad_validation_evals_default_config.yml
squad_validation_evals_default.csv.gz
