In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import seaborn as sns

from collections import defaultdict
from scipy.stats import pearsonr, spearmanr, kendalltau
from glob import glob

In [16]:
def _get_instance_level_correlation(data: pd.DataFrame, metrics: list, target_col: str, corr_method: callable) -> dict:
    name = corr_method.__name__
    print("Computing", name, "with", target_col, "col")
    
    # Pseudo algorithm
    # 1. Iterate over each doc_id
    # 2. Compute the correlation between different metric values for each doc_id and the human values
    # 3. Avg correlation coefficients in the end
    instance_level_corrs = defaultdict(list)

    for iid in data["bartscore_doc_id"].unique():
        for m in metrics:
            instance = data[data["bartscore_doc_id"] == iid]
            corr, p_val = corr_method(instance[m], instance[target_col])
            instance_level_corrs[m].append(corr)
           
    # Compute the avg (#TODO - handle p_val)
    instance_level_corrs_avg = {metric: np.mean(corr_data) for metric, corr_data in instance_level_corrs.items()}
    return instance_level_corrs_avg


def compute_instance_level_correlations(data, metrics, target_col, dataset_name, output_dir, to_persist=True, **_):
    """Compute Pearson, Spearman and Kendall instance-level correlations for ``metrics``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame forwarded to ``_get_instance_level_correlation``.
    metrics : list
        Metric column names; they become the index of the returned frame.
    target_col : str
        Reference column the metrics are correlated with.
    dataset_name : str
        Prefix of the CSV file name written when ``to_persist`` is true.
    output_dir : str
        Directory for the CSV file. An empty string means the current working
        directory.
    to_persist : bool, default True
        Whether to write ``<dataset_name>_instance_corrs.csv`` to ``output_dir``.
    **_ :
        Extra keyword arguments are accepted and ignored.

    Returns
    -------
    pd.DataFrame
        One row per metric, one column per correlation method
        (``pearsonr``, ``spearmanr``, ``kendalltau``).
    """
    correlations = pd.DataFrame({
        corr_method.__name__: _get_instance_level_correlation(data, metrics, target_col, corr_method)
        for corr_method in (pearsonr, spearmanr, kendalltau)
    })

    if to_persist:
        # os.makedirs("") raises FileNotFoundError, so only create a directory
        # when one was actually requested.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        # os.path.join keeps an empty output_dir relative to the working
        # directory instead of producing a path rooted at "/".
        csv_path = os.path.join(output_dir, f"{dataset_name}_instance_corrs.csv")
        correlations.reset_index().to_csv(csv_path, index=False)

    return correlations


def _get_system_level_correlation(data, metrics, target_col, systems, corr_method: callable) -> dict:
    # pseudo code
    # for each system
    # compute the mean score attributed by a metric m to the outputs of each system.
    # compute the mean score attributed by a target_col to the outputs of each system.
    # compute correlation
    system_level_correlation = defaultdict(list)
    for sys in systems:
        data_sys = data[data["sys_name"] == sys]
        # ^Note: since we're computing the mean, we dont need to ensure the ordering

        for m in metrics + [target_col]:
            mean_sys = data_sys[m].mean()
            system_level_correlation[m].append(mean_sys)

    # Compute the correlation now
    correlations = {}
    for m in metrics:
        corr, p_val = corr_method(system_level_correlation[m], system_level_correlation[target_col])

        correlations[m] = round(corr, 4)

    return correlations


def compute_system_level_correlations(data, metrics, target_col, dataset_name, systems, output_dir, to_persist=True, **_):
    """Compute Pearson, Spearman and Kendall system-level correlations for ``metrics``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame forwarded to ``_get_system_level_correlation``.
    metrics : list
        Metric column names; they become the index of the returned frame.
    target_col : str
        Reference column the metrics are correlated with.
    dataset_name : str
        Prefix of the CSV file name written when ``to_persist`` is true.
    systems : iterable
        ``sys_name`` values to aggregate over.
    output_dir : str
        Directory for the CSV file. An empty string means the current working
        directory.
    to_persist : bool, default True
        Whether to write ``<dataset_name>_system_corrs.csv`` to ``output_dir``.
    **_ :
        Extra keyword arguments are accepted and ignored.

    Returns
    -------
    pd.DataFrame
        One row per metric, one column per correlation method
        (``pearsonr``, ``spearmanr``, ``kendalltau``).
    """
    correlations = pd.DataFrame({
        corr_method.__name__: _get_system_level_correlation(data, metrics, target_col, systems, corr_method)
        for corr_method in (pearsonr, spearmanr, kendalltau)
    })

    if to_persist:
        # os.makedirs("") raises FileNotFoundError, so only create a directory
        # when one was actually requested.
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        # os.path.join keeps an empty output_dir relative to the working
        # directory instead of producing a path rooted at "/".
        csv_path = os.path.join(output_dir, f"{dataset_name}_system_corrs.csv")
        correlations.reset_index().to_csv(csv_path, index=False)

    return correlations

In [17]:
# Dataset configuration: column holding the reference scores, dataset label, and
# the directory the per-split CSV files are read from.
TARGET_COL = "target_rescaled"
DATASET_NAME = "realsumm"
DATASET_DIR = "../datasets/summ_data/REALSumm/quantile/regression"

# Read the four splits in a fixed order; the unpacking at the bottom relies on it.
dfs = []
for split in ("all", "train", "dev", "test"):
    df = pd.read_csv(f"{DATASET_DIR}/{split}.rescaled_1_50.csv")
    for col in ("bert_score_p", "bert_score_r", "bert_score_f"):
        if col in df.columns:
            # Drop the literal "tensor", then the first and last character, and parse
            # the remainder as float (presumably the cells look like "tensor(0.1234)" —
            # TODO confirm). Assumes every cell is a string.
            df[col] = df[col].apply(lambda s: s.replace("tensor", "")[1:-1]).apply(float)

    dfs.append(df)
    
all_df, train_df, dev_df, test_df = dfs

In [4]:
# Columns of the dataset frames that hold automated metric scores.
METRICS = [
    # BERTScore
    'bert_score_p','bert_score_r','bert_score_f',
    'mover_score',
    # PRISM
    'prism_ref_hypo','prism_hypo_ref','prism_avg','prism_src_hypo',
    # ROUGE
    'rouge1_r','rouge1_p','rouge1_f',
    'rouge2_r','rouge2_p','rouge2_f',
    'rougel_r','rougel_p','rougel_f',
    # BARTScore
    'bart_score_cnn_ref_hypo_en', 'bart_score_cnn_ref_hypo_de',
    'bart_score_cnn_hypo_ref_en','bart_score_cnn_hypo_ref_de',
    'bart_score_cnn_avg_f_en','bart_score_cnn_avg_f_de',
    'bart_score_cnn_harm_f_en','bart_score_cnn_harm_f_de',
    'bart_score_src_hypo','bart_score_hypo_ref','bart_score_ref_hypo','bart_score_avg_f','bart_score_harm_f',
    'bart_score_cnn_src_hypo','bart_score_cnn_hypo_ref','bart_score_cnn_ref_hypo','bart_score_cnn_avg_f','bart_score_cnn_harm_f',
    'bart_score_para_src_hypo','bart_score_para_hypo_ref','bart_score_para_ref_hypo','bart_score_para_avg_f','bart_score_para_harm_f',
]
# NOTE(review): the system list comes from the "all" split while the correlations
# below are computed on dev_df — confirm every system also occurs in the dev split.
systems = sorted(all_df["sys_name"].unique())

# NOTE(review): these two lists are not used anywhere in this cell.
instance_baseline_corrs = []
system_baseline_corrs = []

print("-" * 80)
print("Computing correlations for AUTOMATED METRICS")
print("-" * 80)

# Instance-level correlations of every automated metric with TARGET_COL on the
# dev split; nothing is written to disk (to_persist=False).
instance_corr_metrics = compute_instance_level_correlations(
    dev_df,
    metrics=METRICS,
    target_col=TARGET_COL,
    dataset_name=f"{DATASET_NAME.lower()}",
    output_dir="",
    to_persist=False,
)

# System-level counterpart, using the same metrics and target column.
system_corr_metrics = compute_system_level_correlations(
    dev_df, 
    metrics=METRICS,
    target_col=TARGET_COL,
    dataset_name=f"{DATASET_NAME.lower()}",
    output_dir="",
    systems=systems,
    to_persist=False
)

--------------------------------------------------------------------------------
Computing correlations for AUTOMATED METRICS
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col


In [5]:
def extract_template(name):
    """Return the text after the last underscore of ``name`` (all of ``name`` if it has none)."""
    # A single right-split leaves the trailing segment as the last list element.
    return name.rsplit("_", 1)[-1]
    

def extract_basename(name):
    """Return the part of ``name`` that precedes the first ``"_template"``.

    Raises ``ValueError`` when ``name`` does not contain ``"_template"``.
    """
    head, marker, _tail = name.partition("_template")
    if not marker:
        raise ValueError("substring not found")
    return head


def compute_metrics(df: pd.DataFrame) -> dict:
    """Summarise how well the ``prediction`` column matches the ``label`` column.

    Every prediction is converted to ``str`` and stripped. It counts as correct
    when the resulting text equals ``str(label)``. Error statistics are computed
    only over predictions made exclusively of digits; the error is
    ``label - prediction``.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns ``prediction`` and ``label``.

    Returns
    -------
    dict
        ``accuracy``, ``digits_count``, ``digits_pct``, ``err_len``, and the
        mean / standard deviation of the signed error (``err_*``), absolute
        error (``mae_*``) and squared error (``mse_*``). Ratios are NaN for an
        empty frame, and the error statistics are NaN when no prediction is
        made of digits.
    """
    def _agg(stat, values):
        # Avoid numpy's empty-slice warnings: report NaN explicitly instead.
        return float(stat(values)) if values else float("nan")

    total = len(df)
    num_correct = 0
    num_digits = 0
    errs, mae, mse = [], [], []
    for p, t in zip(df["prediction"], df["label"]):
        # label is already a number; the prediction may be text or a number,
        # so normalise it to stripped text first.
        p = str(p).strip()

        num_correct += (p == str(t))
        is_digit = p.isdigit()
        num_digits += is_digit

        if not is_digit:
            continue

        err = float(t) - float(p)
        errs.append(err)
        mae.append(np.abs(err))
        mse.append(err * err)

    metrics = {}
    metrics["accuracy"] = num_correct / total if total else float("nan")
    metrics["digits_count"] = num_digits
    metrics["digits_pct"] = num_digits / total if total else float("nan")

    metrics["err_len"] = len(errs)
    metrics["err_avg"] = _agg(np.mean, errs)
    metrics["mae_avg"] = _agg(np.mean, mae)
    metrics["mse_avg"] = _agg(np.mean, mse)

    metrics["err_std"] = _agg(np.std, errs)
    metrics["mae_std"] = _agg(np.std, mae)
    metrics["mse_std"] = _agg(np.std, mse)

    return metrics

In [6]:
# Directory holding one sub-directory per evaluated experiment.
output_dir = f"../t-few-master/experiments_balanced/realsumm_reg/evals"
# One dev_pred.txt per experiment directory, in sorted path order.
dev_files = sorted(glob(os.path.join(output_dir, "*", "dev_pred.txt")))
print(f"Found {len(dev_files)} experiments!")
print()

def extract_name(path):
    """Return the experiment name encoded in the parent directory of ``path``.

    Given a path such as
    ``./t-few-master/exp_out/realsumm/t03b_realsumm_baseline_ft_train/<something>``,
    the parent directory name (``t03b_realsumm_baseline_ft_train``) is taken and
    everything from the first occurrence of ``"baseline"`` onwards is returned
    (``baseline_ft_train``).

    Raises
    ------
    ValueError
        If the parent directory name does not contain ``"baseline"``.
    """
    # 1. Extract the parent dir. os.path is used instead of splitting on "/" so
    #    the result is right for whatever separator os.path.join/glob produced.
    exp_name = os.path.basename(os.path.dirname(path))

    # 2. Keep all parts including baseline and afterwards
    index_baseline = exp_name.find("baseline")
    if index_baseline < 0:
        raise ValueError(
            f"expected 'baseline' in the experiment directory name, got {exp_name!r} (path: {path!r})"
        )
    return exp_name[index_baseline:]


# Raw prediction files keyed by experiment name. Experiments that map to the
# same name overwrite each other in this dict.
dev_data = {extract_name(path): pd.read_csv(path) for path in dev_files}

# Join every prediction file with the dev split so that dataset columns such as
# bartscore_doc_id are available, which lets the correlation helpers be reused.
metrics_reg = []
dev_baselines = {}
for baseline, data in dev_data.items():
    # No `how` is given, so pandas' default inner join applies: prediction rows
    # whose idx has no matching dev_df["index"] are dropped. Clashing dev_df
    # columns get the "_orig" suffix; prediction-file columns keep their names.
    data = data.merge(dev_df, left_on="idx", right_on="index", suffixes=(None, "_orig"))
    # Sanity check: the label stored with the predictions is the rescaled target.
    assert (data["label"] == data["target_rescaled"]).all()
    
    m = compute_metrics(data)
    m["baseline"] = baseline
    
    metrics_reg.append(m)
    
    data["is_digit_prediction"] = data["prediction"].apply(lambda p: str(p).strip().isdigit())
    dev_baselines[baseline] = data
    
    # Flip the sign of both log-score columns. The frame stored in dev_baselines
    # is this same object, so it sees the flipped values too.
    # NOTE(review): the reason for the sign flip is not documented — confirm.
    data["log.pred_score"] = data["log.pred_score"].apply(lambda s: -1 * s)
    data["log.label"] = data["log.label"].apply(lambda s: -1 * s)

# One row of regression metrics per experiment, plus the template number and
# base name parsed from the experiment name.
metrics_reg = pd.DataFrame(metrics_reg)
metrics_reg["eval_template"] = metrics_reg["baseline"].apply(extract_template)
metrics_reg["basename"] = metrics_reg["baseline"].apply(extract_basename)
metrics_reg.tail()

Found 47 experiments!



Unnamed: 0,accuracy,digits_count,digits_pct,err_len,err_avg,mae_avg,mse_avg,err_std,mae_std,mse_std,baseline,eval_template,basename
42,0.025,720,1.0,720,14.276389,17.598611,449.8375,15.685095,11.837499,503.915133,baseline_template_4,4,baseline
43,0.015278,720,1.0,720,15.875,18.202778,468.797222,14.723505,11.724167,514.675265,baseline_template_5,5,baseline
44,0.019444,720,1.0,720,6.468056,15.568056,372.320833,18.179249,11.399846,473.990244,baseline_template_6,6,baseline
45,0.031944,720,1.0,720,9.520833,16.518056,406.231944,17.764731,11.549276,477.06983,baseline_template_7,7,baseline
46,0.025,720,1.0,720,8.945833,16.195833,397.065278,17.805543,11.608629,482.684918,baseline_template_8,8,baseline


In [7]:
dataset_name = "realsumm"
# target_col = "litepyramid_recall"
target_col = "target_rescaled"
split_baselines = dev_baselines
# Build the system list from the frames evaluated below instead of from `data`,
# which at this point is only a leftover loop variable of an earlier cell.
systems = sorted(set().union(*(frame["sys_name"].unique() for frame in split_baselines.values())))

instance_baseline_corrs = []
system_baseline_corrs = []

for baseline, data in split_baselines.items():
    # Only baselines whose predictions are all digit strings are evaluated.
    if data["is_digit_prediction"].all():

        print("\n" * 4)
        print("-" * 80)
        print("Computing correlations for", baseline)
        print("-" * 80)

        instance_corr = compute_instance_level_correlations(
            data,
            metrics=["prediction"],
            target_col=target_col,
            dataset_name=f"{dataset_name.lower()}_{baseline}",
            output_dir="",
            to_persist=False,
        )

        system_corr = compute_system_level_correlations(
            data,
            metrics=["prediction"],
            target_col=target_col,
            dataset_name=f"{dataset_name.lower()}_{baseline}",
            output_dir="",
            systems=systems,
            to_persist=False
        )

        # Tag each one-row result with the baseline it belongs to.
        instance_corr["index"] = baseline
        system_corr["index"] = baseline

        instance_baseline_corrs.append(instance_corr)
        system_baseline_corrs.append(system_corr)

# Stack the per-baseline rows; pd.concat raises ValueError if no baseline qualified.
instance_baseline_corrs = pd.concat(instance_baseline_corrs)
system_baseline_corrs = pd.concat(system_baseline_corrs)

instance_baseline_corrs["eval_template"] = instance_baseline_corrs["index"].apply(extract_template)
instance_baseline_corrs["basename"] = instance_baseline_corrs["index"].apply(extract_basename)

system_baseline_corrs["eval_template"] = system_baseline_corrs["index"].apply(extract_template)
system_baseline_corrs["basename"] = system_baseline_corrs["index"].apply(extract_basename)






--------------------------------------------------------------------------------
Computing correlations for baseline__t5_3b_pretrain_10ksteps_no_ia3_template_0
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline__t5_3b_pretrain_ia3_without_ul_and_ln_template_0
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_ia3_pretrained100k_ft_train_ia3_template_0
-------------------------------------------------------------------



Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_ia3_pretrained100k_ft_train_ia3_template_2
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_ia3_pretrained100k_ft_train_ia3_template_3
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_ia3_pretrained100k_ft_tra



Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_ia3_pretrained100k_ft_train_ia3_template_5
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_ia3_pretrained100k_ft_train_ia3_template_6
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_ia3_pretrained100k_ft_train_ia3_template_7
---------------------------








--------------------------------------------------------------------------------
Computing correlations for baseline_ia3_pretrained100k_ft_train_ia3_template_8
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_ia3_pretrained100k_template_0
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_ia3_pretrained100k_template_1
--------------------------------------------------------------------------------
Computing pearsonr



Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_ia3_pretrained100k_template_3
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_ia3_pretrained100k_template_4
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_ia3_pretrained100k_template_5
------------------------------------------------------------------



Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_no_ckpt_ft_train_ia3_template_1
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_no_ckpt_ft_train_ia3_template_2
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_no_ckpt_ft_train_ia3_template_3
------------------------------------------------------------








--------------------------------------------------------------------------------
Computing correlations for baseline_no_ckpt_ft_train_ia3_template_4
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_no_ckpt_ft_train_ia3_template_5
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_no_ckpt_ft_train_ia3_template_6
--------------------------------------------------------------------------------
Computing pearsonr with t



Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_no_ckpt_ft_train_ia3_template_8
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_no_ckpt_ft_train_no_ia3_template_0
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_no_ckpt_ft_train_no_ia3_template_1
------------------------------------------------------








--------------------------------------------------------------------------------
Computing correlations for baseline_no_ckpt_ft_train_no_ia3_template_2
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_no_ckpt_ft_train_no_ia3_template_3
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_no_ckpt_ft_train_no_ia3_template_4
--------------------------------------------------------------------------------
Computing pearso



Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_no_ckpt_ft_train_no_ia3_template_6
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_no_ckpt_ft_train_no_ia3_template_7
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_no_ckpt_ft_train_no_ia3_template_8
---------------------------------------------------








--------------------------------------------------------------------------------
Computing correlations for baseline_template_0
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_template_1
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_template_2
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col



Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_template_4
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_template_5
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled col
Computing spearmanr with target_rescaled col
Computing kendalltau with target_rescaled col





--------------------------------------------------------------------------------
Computing correlations for baseline_template_6
--------------------------------------------------------------------------------
Computing pearsonr with target_rescaled co



In [8]:
# Baselines ranked by instance-level Kendall tau (sort_values puts NaN rows last by default).
instance_baseline_corrs.sort_values("kendalltau", ascending=False).head()

Unnamed: 0,pearsonr,spearmanr,kendalltau,index,eval_template,basename
prediction,0.338668,0.349662,0.29334,baseline__t5_3b_pretrain_10ksteps_no_ia3_templ...,0,baseline__t5_3b_pretrain_10ksteps_no_ia3
prediction,0.285011,0.265968,0.22961,baseline__t5_3b_pretrain_ia3_without_ul_and_ln...,0,baseline__t5_3b_pretrain_ia3_without_ul_and_ln
prediction,,,,baseline_ia3_pretrained100k_ft_train_ia3_templ...,0,baseline_ia3_pretrained100k_ft_train_ia3
prediction,,,,baseline_ia3_pretrained100k_ft_train_ia3_templ...,1,baseline_ia3_pretrained100k_ft_train_ia3
prediction,,,,baseline_ia3_pretrained100k_ft_train_ia3_templ...,2,baseline_ia3_pretrained100k_ft_train_ia3


In [9]:
# Baselines ranked by system-level Kendall tau.
system_baseline_corrs.sort_values("kendalltau", ascending=False).head()

Unnamed: 0,pearsonr,spearmanr,kendalltau,index,eval_template,basename
prediction,0.7389,0.692,0.5236,baseline__t5_3b_pretrain_10ksteps_no_ia3_templ...,0,baseline__t5_3b_pretrain_10ksteps_no_ia3
prediction,0.6036,0.6144,0.4689,baseline_no_ckpt_ft_train_ia3_template_3,3,baseline_no_ckpt_ft_train_ia3
prediction,0.7041,0.6342,0.4655,baseline_template_4,4,baseline
prediction,0.6237,0.6657,0.4644,baseline_no_ckpt_ft_train_ia3_template_4,4,baseline_no_ckpt_ft_train_ia3
prediction,0.7033,0.6359,0.4582,baseline__t5_3b_pretrain_ia3_without_ul_and_ln...,0,baseline__t5_3b_pretrain_ia3_without_ul_and_ln


## Easy examples 

TODO LIST
- [ ] Find examples that have a wide spread of scores (higher standard deviation)
- [ ] Discern whether T5 (fully trained) is capable of assigning different scores to the different outputs. Are these contradicting? How difficult is the task?

In [10]:
# Merged dev predictions of the single baseline inspected in the cells below.
t5_dev_preds = split_baselines["baseline__t5_3b_pretrain_10ksteps_no_ia3_template_0"]
t5_dev_preds.head()

Unnamed: 0,idx,label,prediction,log.pred_score,log.label,num_truncated,top5_unconstrained,top5_constrained,current_epoch,index,...,bart_score_cnn_avg_f_en,bart_score_cnn_avg_f_de,bart_score_cnn_harm_f_en,bart_score_cnn_harm_f_de,target,bin,label_orig,discretization_type,target_rescaled,is_digit_prediction
0,0,19,34,-0.754195,-7.379195,0,"{'tokens': ['▁34', '▁22', '▁32', '▁26', '▁23']...","{'tokens': ['▁34', '▁22', '▁32', '▁26', '▁23']...",0,0,...,-3.142573,-3.338079,-1.56831,-1.667924,36,"(30.62, 42.86]",1,5-quantile,19,True
1,1,15,22,-0.492056,-21.132681,0,"{'tokens': ['▁22', '▁35', '▁34', '▁32', '▁23']...","{'tokens': ['▁22', '▁35', '▁34', '▁32', '▁23']...",0,1,...,-3.299922,-3.506089,-1.584143,-1.704337,29,"(-0.01, 30.62]",0,5-quantile,15,True
2,2,40,34,-0.020732,-23.083233,0,"{'tokens': ['▁34', '▁28', '▁38', '▁29', '▁17']...","{'tokens': ['▁34', '▁28', '▁38', '▁29', '▁17']...",0,2,...,-2.587052,-2.798319,-1.276288,-1.391062,79,"(62.5, 100.0]",4,5-quantile,40,True
3,3,22,34,-0.004167,-20.347918,0,"{'tokens': ['▁34', '▁28', '▁17', '▁35', '▁23']...","{'tokens': ['▁34', '▁28', '▁17', '▁35', '▁23']...",0,3,...,-2.879158,-3.100711,-1.435252,-1.547302,43,"(30.62, 42.86]",1,5-quantile,22,True
4,4,8,17,-0.486572,-29.486572,0,"{'tokens': ['▁17', '▁26', '▁22', '▁16', '▁19']...","{'tokens': ['▁17', '▁26', '▁22', '▁16', '▁19']...",0,4,...,-3.287043,-3.518617,-1.643086,-1.758646,14,"(-0.01, 30.62]",0,5-quantile,8,True


In [11]:
# bartscore_doc_id uniquely identifies each document id 
scores_human_std = t5_dev_preds.groupby("bartscore_doc_id").std()["target_rescaled"].sort_values()
low_human_std_top5 = scores_human_std.head().index.tolist()
high_human_std_top5 = scores_human_std.tail().index.tolist()
print("Less ambiguous Bartscore doc ids:", low_human_std_top5)
print("More ambiguous Bartscore doc ids:", high_human_std_top5)

# bartscore_doc_id uniquely identifies each document id 
scores_model_std = t5_dev_preds.groupby("bartscore_doc_id").std()["prediction"].sort_values()
low_model_std_top5 = scores_model_std.head().index.tolist()
high_model_std_top5 = scores_model_std.tail().index.tolist()

print("Less ambiguous Bartscore doc ids:", low_model_std_top5)
print("More ambiguous Bartscore doc ids:", high_model_std_top5)

# We can observe that in general the variability in model is minimal anti-correlated with human's variability
t5_dev_preds.groupby("bartscore_doc_id").std()[["target_rescaled", "prediction"]].corr(method="kendall")

Less ambiguous Bartscore doc ids: [94, 68, 91, 42, 48]
More ambiguous Bartscore doc ids: [63, 26, 27, 45, 96]
Less ambiguous Bartscore doc ids: [88, 22, 35, 42, 26]
More ambiguous Bartscore doc ids: [16, 6, 91, 48, 68]


Unnamed: 0,target_rescaled,prediction
target_rescaled,1.0,-0.177011
prediction,-0.177011,1.0


Let's check this for the other metrics as a proxy measure of their quality. As shown below, the metrics whose per-document standard deviation correlates best with that of the human scores are:
- PRISM (avg);
- ROUGE-L (recall variant)
- BartScore CNN (hypothesis -> ref)

This is interesting because these top-ranked metrics are, for the most part, not the ones that achieve the highest summary-level correlation coefficients (average instance-wise correlation). The ones achieving the highest instance-level results are: 
- rouge1_r
- rougel_r
- bart_score_para_hypo_ref
- bart_score_cnn_hypo_ref_de
- bert_score_r	

Note, however, that the STD tells us nothing about the ranking itself; it only gives an idea of how spread out the predicted values are. In particular, the fact that PRISM correlates better in terms of the STD means that it "agrees more often on the uncertainty" of the examples. The highest value is about 0.23, which does not indicate a very strong correlation.

In [12]:
# Kendall correlation between the per-document std of the human scores and of every
# automated metric, ranked by agreement with the human spread. The columns are
# selected before .std() so that the string columns are never aggregated
# (doing so raises on pandas >= 2.0).
(
    t5_dev_preds
    .groupby("bartscore_doc_id")[["target_rescaled"] + METRICS]
    .std()
    .corr(method="kendall")
    .sort_values("target_rescaled", ascending=False)
    .head()
)

Unnamed: 0,target_rescaled,bert_score_p,bert_score_r,bert_score_f,mover_score,prism_ref_hypo,prism_hypo_ref,prism_avg,prism_src_hypo,rouge1_r,...,bart_score_cnn_src_hypo,bart_score_cnn_hypo_ref,bart_score_cnn_ref_hypo,bart_score_cnn_avg_f,bart_score_cnn_harm_f,bart_score_para_src_hypo,bart_score_para_hypo_ref,bart_score_para_ref_hypo,bart_score_para_avg_f,bart_score_para_harm_f
target_rescaled,1.0,-0.08046,-0.066667,-0.057471,0.071264,0.103448,0.154023,0.227586,0.025287,0.163218,...,-0.094253,0.190805,0.052874,0.121839,0.177011,-0.025287,0.154023,0.006897,0.172414,0.154023
prism_avg,0.227586,0.287356,0.42069,0.43908,0.448276,0.452874,0.632184,1.0,0.034483,0.209195,...,0.043678,0.310345,0.255172,0.397701,0.37931,-0.025287,0.393103,0.356322,0.512644,0.512644
rougel_r,0.218391,0.085057,0.282759,0.126437,0.521839,-0.016092,0.43908,0.301149,0.117241,0.770115,...,-0.131034,0.448276,-0.075862,0.075862,0.14023,-0.052874,0.42069,-0.048276,0.154023,0.163218
bart_score_cnn_hypo_ref,0.190805,0.177011,0.328736,0.245977,0.402299,-0.071264,0.512644,0.310345,-0.029885,0.429885,...,-0.094253,1.0,-0.112644,0.250575,0.370115,-0.126437,0.54023,-0.002299,0.282759,0.310345
bart_score_cnn_hypo_ref_en,0.186207,0.172414,0.342529,0.241379,0.406897,-0.085057,0.517241,0.296552,-0.025287,0.443678,...,-0.098851,0.986207,-0.126437,0.236782,0.356322,-0.121839,0.554023,-0.006897,0.278161,0.296552


In [13]:
# All prediction rows of the documents listed in high_human_std_top5.
t5_dev_preds[t5_dev_preds.bartscore_doc_id.isin(high_human_std_top5)]

Unnamed: 0,idx,label,prediction,log.pred_score,log.label,num_truncated,top5_unconstrained,top5_constrained,current_epoch,index,...,bart_score_cnn_avg_f_en,bart_score_cnn_avg_f_de,bart_score_cnn_harm_f_en,bart_score_cnn_harm_f_de,target,bin,label_orig,discretization_type,target_rescaled,is_digit_prediction
144,144,43,28,-0.000044,-29.101606,0,"{'tokens': ['▁28', '▁23', '▁34', '▁35', '▁30']...","{'tokens': ['▁28', '▁23', '▁34', '▁35', '▁30']...",0,144,...,-2.828418,-2.950616,-1.354778,-1.428213,85,"(62.5, 100.0]",4,5-quantile,43,True
145,145,43,34,-0.028685,-20.220091,0,"{'tokens': ['▁34', '▁30', '▁17', '▁35', '▁28']...","{'tokens': ['▁34', '▁30', '▁17', '▁35', '▁28']...",0,145,...,-2.580979,-2.769707,-1.289284,-1.383457,85,"(62.5, 100.0]",4,5-quantile,43,True
146,146,31,17,-0.000226,-23.343977,0,"{'tokens': ['▁17', '▁23', '▁16', '▁28', '▁15']...","{'tokens': ['▁17', '▁23', '▁16', '▁28', '▁15']...",0,146,...,-2.447359,-2.576313,-1.218783,-1.284153,62,"(50.0, 62.5]",3,5-quantile,31,True
147,147,46,34,-0.166756,-18.076912,0,"{'tokens': ['▁34', '▁28', '▁35', '▁30', '▁23']...","{'tokens': ['▁34', '▁28', '▁35', '▁30', '▁23']...",0,147,...,-2.387646,-2.544167,-1.188734,-1.267161,92,"(62.5, 100.0]",4,5-quantile,46,True
148,148,35,34,-0.067036,-6.426411,0,"{'tokens': ['▁34', '▁28', '▁30', '▁26', '▁27']...","{'tokens': ['▁34', '▁28', '▁30', '▁26', '▁27']...",0,148,...,-2.508901,-2.692063,-1.245347,-1.338491,69,"(62.5, 100.0]",4,5-quantile,35,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,595,22,23,-0.009002,-4.759002,0,"{'tokens': ['▁23', '▁22', '▁28', '▁30', '▁17']...","{'tokens': ['▁23', '▁22', '▁28', '▁30', '▁17']...",0,595,...,-2.978144,-3.151444,-1.477682,-1.570288,43,"(30.62, 42.86]",1,5-quantile,22,True
596,596,36,23,-0.009002,-28.009003,0,"{'tokens': ['▁23', '▁22', '▁28', '▁30', '▁17']...","{'tokens': ['▁23', '▁22', '▁28', '▁30', '▁17']...",0,596,...,-2.978145,-3.151444,-1.477682,-1.570288,71,"(62.5, 100.0]",4,5-quantile,36,True
597,597,22,23,-0.435380,-5.060380,0,"{'tokens': ['▁23', '▁28', '▁22', '▁30', '▁29']...","{'tokens': ['▁23', '▁28', '▁22', '▁30', '▁29']...",0,597,...,-2.989371,-3.161259,-1.482682,-1.574505,43,"(30.62, 42.86]",1,5-quantile,22,True
598,598,36,1,-0.032059,-41.938309,0,"{'tokens': ['▁1', '▁6', '▁11', '▁16', '▁12'], ...","{'tokens': ['▁1', '▁6', '▁11', '▁16', '▁12'], ...",0,598,...,-3.663926,-3.838926,-1.798924,-1.894550,71,"(62.5, 100.0]",4,5-quantile,36,True


### Analysis 1: High Human STD 

Check out the top 5 examples with the highest discrepancy between scores according to humans. These should, in general, be fairly easy to distinguish.

In [14]:
# Display the ids of the documents selected for this analysis.
high_human_std_top5

[63, 26, 27, 45, 96]