<a href="https://colab.research.google.com/github/jeanlucjackson/w266_final_project/blob/main/code/sandboxes/BB/analysis/table_metrics_by_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Table for Metrics by Dataset

### Imports/Setup

In [1]:
from os import listdir
from os.path import isfile, join

import csv
import json
import pprint

import pandas as pd
from tqdm import tqdm

In [2]:
# This cell will authenticate you and mount your Drive in the Colab.
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
evaluation_root = "/content/drive/MyDrive/w266 NLP Final Project/Evaluation/"
filename = join(evaluation_root, "evaluation_database.json")

# Read JSON into dataframe
evaluation_df = pd.read_json(filename)

Set up ability to remove models with inference hyperparameters

In [4]:
# Apply this mask to remove all samples from non default inference sets
default_hyperparameter_mask = [sample['defaults'] for sample in evaluation_df.hyperparameter]

### Cleanup dataframe

In [5]:
# Clean up evaluation db df to only include columns needed for text analysis
clean_eval_df = evaluation_df[["nickname", "base_model", "trained_on", "tested_on", "hyperparameter", "target", "prediction", "bleu", "rougeL", "meteor", "bertscore-f1", "use"]]

clean_eval_df

Unnamed: 0,nickname,base_model,trained_on,tested_on,hyperparameter,target,prediction,bleu,rougeL,meteor,bertscore-f1,use
0,bart_nq_nq,bart,nq,nq,{'defaults': True},what was the real name of saudi arabia,what is the name of the kingdom of saudi arabia,0.000000,0.666667,0.623306,0.915830,0.794086
1,bart_nq_nq,bart,nq,nq,{'defaults': True},whats the most liked picture on instagram 2018,what is the most liked picture on instagram,0.680375,0.750000,0.864796,0.932259,0.884909
2,bart_nq_nq,bart,nq,nq,{'defaults': True},where does the movie proof of life take place,where does alice go in the new movie,0.000000,0.470588,0.354635,0.835192,0.517867
3,bart_nq_nq,bart,nq,nq,{'defaults': True},where is net profit on the balance sheet,where does net profit come from in a financial...,0.000000,0.333333,0.311653,0.856130,0.728012
4,bart_nq_nq,bart,nq,nq,{'defaults': True},when was fingerprinting first used by the police,when was fingerprint technology first used in ...,0.000000,0.588235,0.694444,0.876353,0.704206
...,...,...,...,...,...,...,...,...,...,...,...,...
286285,T5_amalgam_triviaqa,T5,amalgam,triviaqa,{'defaults': True},Which ex British daily newspaper was first pub...,What is the name of the city of Manchester?,0.000000,0.093023,0.058309,0.679618,0.322748
286286,T5_amalgam_triviaqa,T5,amalgam,triviaqa,{'defaults': True},Which cartoon character lived in Bunkerton Cas...,What was the name of Lord Marmaduke of Bunkerton?,0.000000,0.125000,0.121951,0.794428,0.515345
286287,T5_amalgam_triviaqa,T5,amalgam,triviaqa,{'defaults': True},In which prison was 'Amy' born in the novel 'L...,What is the name of the prison in Charles Dick...,0.000000,0.416667,0.248227,0.821790,0.593840
286288,T5_amalgam_triviaqa,T5,amalgam,triviaqa,{'defaults': True},The Sign Of Four was a detective story written...,Who wrote the novels 'The Sign of Four' and 'A...,0.000000,0.400000,0.381426,0.796076,0.585255


### Find means for all models

In [6]:
# Find the overall averages for each prediction set and metric
eval_means_df = evaluation_df.groupby(["nickname"]).mean()

# Reset index so able to groupby and sort below to find top scoring prediction sets for each metric
eval_means_df = eval_means_df.reset_index("nickname")

eval_means_df

Unnamed: 0,nickname,bleu,rouge1,rouge2,rougeL,rougeLsum,meteor,bertscore-precision,bertscore-recall,bertscore-f1,use,bleurt
0,T5_amalgam_nq,0.169746,0.551111,0.331145,0.527758,0.527758,0.497871,0.862055,0.854624,0.858036,0.719863,-0.329087
1,T5_amalgam_quac,0.053671,0.252468,0.122504,0.249039,0.249039,0.262385,0.778761,0.778353,0.77806,0.378001,-0.865799
2,T5_amalgam_squad,0.142885,0.502334,0.287635,0.470772,0.470772,0.469798,0.873664,0.861667,0.867048,0.656399,-0.290063
3,T5_amalgam_triviaqa,0.06639,0.424649,0.208584,0.35913,0.35913,0.337926,0.841297,0.806558,0.822766,0.601577,-0.668766
4,T5_nq_nq,0.172979,0.551682,0.331158,0.527675,0.527675,0.497261,0.864974,0.854585,0.859476,0.718928,-0.326454
5,T5_nq_quac,0.001471,0.155335,0.03538,0.1519,0.1519,0.111625,0.692863,0.701502,0.696885,0.287737,-1.357303
6,T5_nq_squad,0.017446,0.373221,0.160708,0.346182,0.346182,0.27346,0.806934,0.777579,0.791693,0.554671,-0.727935
7,T5_nq_triviaqa,0.00745,0.318294,0.123126,0.278408,0.278408,0.189823,0.795502,0.740678,0.766649,0.518441,-1.038274
8,T5_quac_nq,0.0,0.070207,0.007646,0.068525,0.068525,0.040712,0.667213,0.658704,0.662784,0.180669,-1.526871
9,T5_quac_quac,0.023456,0.175371,0.067426,0.173453,0.173453,0.189914,0.759456,0.753469,0.756046,0.303081,-1.124621


### Create BART and T5 dataframes

In [7]:
# Create BART dataframe
bart_df = evaluation_df[evaluation_df["base_model"] == "bart"]

# Create T5 dataframe
t5_df = evaluation_df[evaluation_df["base_model"] == "T5"]

## Metrics by Model

In [8]:
df = clean_eval_df[default_hyperparameter_mask]
df = df.groupby(["base_model"]).mean().sort_values(by="use", ascending=False)

df

Unnamed: 0_level_0,bleu,rougeL,meteor,bertscore-f1,use
base_model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
bart,0.051795,0.314182,0.293177,0.797674,0.510399
T5,0.045889,0.276984,0.264854,0.781927,0.455826


In [9]:
# Convert to Latex
print(df.to_latex(index=False, header=True))

\begin{tabular}{rrrrr}
\toprule
    bleu &   rougeL &   meteor &  bertscore-f1 &      use \\
0.051795 & 0.314182 & 0.293177 &      0.797674 & 0.510399 \\
\midrule
0.045889 & 0.276984 & 0.264854 &      0.781927 & 0.455826 \\
\bottomrule
\end{tabular}



## Metrics by Data Tested On

In [10]:
# Set number of decimal places for scores in pandas df
pd.set_option('precision', 4)

### NQ

In [11]:
# Tested on NQ
nq_df = evaluation_df[(evaluation_df['tested_on'] == 'nq') & (evaluation_df['base_model'] == 'bart')].groupby(['base_model', 'trained_on', 'tested_on']).mean().sort_values(by=['bertscore-f1'], ascending=False)

nq_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,bleu,rouge1,rouge2,rougeL,rougeLsum,meteor,bertscore-precision,bertscore-recall,bertscore-f1,use,bleurt
base_model,trained_on,tested_on,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
bart,nq,nq,0.2031,0.5764,0.357,0.5504,0.5504,0.5288,0.8706,0.8646,0.8674,0.7367,-0.2507
bart,amalgam,nq,0.1967,0.5694,0.3531,0.545,0.545,0.5235,0.8665,0.8626,0.8643,0.7293,-0.2672
bart,squad,nq,0.0118,0.4288,0.2126,0.4106,0.4106,0.3553,0.7929,0.8051,0.7986,0.6303,-0.6329
bart,triviaqa,nq,0.0114,0.4027,0.1972,0.3658,0.3658,0.3676,0.7769,0.8161,0.7956,0.623,-0.7363
bart,quac,nq,0.005,0.3342,0.1369,0.3261,0.3261,0.2244,0.7599,0.742,0.7505,0.5166,-0.9909


In [12]:
# Convert to Latex
print(nq_df.to_latex(index=False)) 

\begin{tabular}{rrrrrrrrrrr}
\toprule
  bleu &  rouge1 &  rouge2 &  rougeL &  rougeLsum &  meteor &  bertscore-precision &  bertscore-recall &  bertscore-f1 &    use &  bleurt \\
0.2031 &  0.5764 &  0.3570 &  0.5504 &     0.5504 &  0.5288 &               0.8706 &            0.8646 &        0.8674 & 0.7367 & -0.2507 \\
\midrule
0.1967 &  0.5694 &  0.3531 &  0.5450 &     0.5450 &  0.5235 &               0.8665 &            0.8626 &        0.8643 & 0.7293 & -0.2672 \\
0.0118 &  0.4288 &  0.2126 &  0.4106 &     0.4106 &  0.3553 &               0.7929 &            0.8051 &        0.7986 & 0.6303 & -0.6329 \\
0.0114 &  0.4027 &  0.1972 &  0.3658 &     0.3658 &  0.3676 &               0.7769 &            0.8161 &        0.7956 & 0.6230 & -0.7363 \\
0.0050 &  0.3342 &  0.1369 &  0.3261 &     0.3261 &  0.2244 &               0.7599 &            0.7420 &        0.7505 & 0.5166 & -0.9909 \\
\bottomrule
\end{tabular}



### SQuAD

In [13]:
# Tested on SQuAD
squad_df = evaluation_df[evaluation_df['tested_on'] == 'squad'].groupby(['base_model', 'trained_on', 'tested_on']).mean().sort_values(by=['bertscore-f1'], ascending=False)

squad_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,bleu,rouge1,rouge2,rougeL,rougeLsum,meteor,bertscore-precision,bertscore-recall,bertscore-f1,use,bleurt
base_model,trained_on,tested_on,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
bart,squad,squad,0.1512,0.5034,0.2903,0.4692,0.4692,0.4753,0.8752,0.8651,0.8695,0.657,-0.267
T5,squad,squad,0.1473,0.4978,0.2864,0.4677,0.4677,0.4677,0.8755,0.8633,0.8688,0.6507,-0.2897
T5,amalgam,squad,0.1429,0.5023,0.2876,0.4708,0.4708,0.4698,0.8737,0.8617,0.867,0.6564,-0.2901
bart,amalgam,squad,0.1358,0.4933,0.2793,0.4586,0.4586,0.4601,0.869,0.8579,0.8628,0.6505,-0.3081
bart,triviaqa,squad,0.047,0.3508,0.1419,0.3106,0.3106,0.3379,0.8098,0.8249,0.8166,0.5351,-0.6453
T5,triviaqa,squad,0.0426,0.3363,0.136,0.2806,0.2806,0.328,0.803,0.8163,0.8089,0.4998,-0.8056
bart,quac,squad,0.0277,0.3062,0.1133,0.2909,0.2909,0.2456,0.8332,0.7759,0.8029,0.4456,-0.8318
T5,nq,squad,0.0174,0.3732,0.1607,0.3462,0.3462,0.2735,0.8069,0.7776,0.7917,0.5547,-0.7279
bart,nq,squad,0.0144,0.349,0.139,0.3226,0.3226,0.252,0.7975,0.7711,0.7838,0.5407,-0.7647
T5,quac,squad,0.0004,0.0995,0.0086,0.0967,0.0967,0.0976,0.737,0.6936,0.7142,0.1662,-1.4403


In [14]:
# Convert to Latex
print(squad_df.to_latex(index=False)) 

\begin{tabular}{rrrrrrrrrrr}
\toprule
  bleu &  rouge1 &  rouge2 &  rougeL &  rougeLsum &  meteor &  bertscore-precision &  bertscore-recall &  bertscore-f1 &    use &  bleurt \\
0.1512 &  0.5034 &  0.2903 &  0.4692 &     0.4692 &  0.4753 &               0.8752 &            0.8651 &        0.8695 & 0.6570 & -0.2670 \\
\midrule
0.1473 &  0.4978 &  0.2864 &  0.4677 &     0.4677 &  0.4677 &               0.8755 &            0.8633 &        0.8688 & 0.6507 & -0.2897 \\
0.1429 &  0.5023 &  0.2876 &  0.4708 &     0.4708 &  0.4698 &               0.8737 &            0.8617 &        0.8670 & 0.6564 & -0.2901 \\
0.1358 &  0.4933 &  0.2793 &  0.4586 &     0.4586 &  0.4601 &               0.8690 &            0.8579 &        0.8628 & 0.6505 & -0.3081 \\
0.0470 &  0.3508 &  0.1419 &  0.3106 &     0.3106 &  0.3379 &               0.8098 &            0.8249 &        0.8166 & 0.5351 & -0.6453 \\
0.0426 &  0.3363 &  0.1360 &  0.2806 &     0.2806 &  0.3280 &               0.8030 &            0.8163 &   

### TriviaQA

In [15]:
# Tested on TriviaQA
triviaqa_df = evaluation_df[evaluation_df['tested_on'] == 'triviaqa'].groupby(['base_model', 'trained_on', 'tested_on']).mean().sort_values(by=['bertscore-f1'], ascending=False)

triviaqa_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,bleu,rouge1,rouge2,rougeL,rougeLsum,meteor,bertscore-precision,bertscore-recall,bertscore-f1,use,bleurt
base_model,trained_on,tested_on,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
bart,amalgam,triviaqa,0.077973,0.4357,0.2214,0.3687,0.3687,0.3546,0.8438,0.8113,0.8264,0.6092,-0.6266
bart,triviaqa,triviaqa,0.075994,0.434,0.2187,0.3678,0.3678,0.351,0.8444,0.8098,0.8259,0.6078,-0.6315
T5,amalgam,triviaqa,0.06639,0.4246,0.2086,0.3591,0.3591,0.3379,0.8413,0.8066,0.8228,0.6016,-0.6688
T5,triviaqa,triviaqa,0.055784,0.4053,0.1884,0.337,0.337,0.3231,0.8331,0.8016,0.8163,0.5914,-0.7305
T5,squad,triviaqa,0.022352,0.3041,0.1118,0.2655,0.2655,0.2219,0.816,0.7647,0.7888,0.495,-0.9132
bart,squad,triviaqa,0.021929,0.2978,0.1101,0.2603,0.2603,0.2191,0.8152,0.7642,0.7881,0.4921,-0.9088
bart,nq,triviaqa,0.0079975,0.3319,0.1285,0.2864,0.2864,0.2008,0.7999,0.7446,0.7708,0.5343,-1.0174
T5,nq,triviaqa,0.0074502,0.3183,0.1231,0.2784,0.2784,0.1898,0.7955,0.7407,0.7666,0.5184,-1.0383
bart,quac,triviaqa,0.006868,0.2188,0.0626,0.1978,0.1978,0.1509,0.8,0.7167,0.7555,0.3968,-1.1692
T5,quac,triviaqa,5.7e-06,0.0459,0.0025,0.0432,0.0432,0.0567,0.7028,0.6522,0.6762,0.1192,-1.5522


In [16]:
# Convert to Latex
print(triviaqa_df.to_latex(index=False)) 

\begin{tabular}{rrrrrrrrrrr}
\toprule
      bleu &  rouge1 &  rouge2 &  rougeL &  rougeLsum &  meteor &  bertscore-precision &  bertscore-recall &  bertscore-f1 &    use &  bleurt \\
7.7973e-02 &  0.4357 &  0.2214 &  0.3687 &     0.3687 &  0.3546 &               0.8438 &            0.8113 &        0.8264 & 0.6092 & -0.6266 \\
\midrule
7.5994e-02 &  0.4340 &  0.2187 &  0.3678 &     0.3678 &  0.3510 &               0.8444 &            0.8098 &        0.8259 & 0.6078 & -0.6315 \\
6.6390e-02 &  0.4246 &  0.2086 &  0.3591 &     0.3591 &  0.3379 &               0.8413 &            0.8066 &        0.8228 & 0.6016 & -0.6688 \\
5.5784e-02 &  0.4053 &  0.1884 &  0.3370 &     0.3370 &  0.3231 &               0.8331 &            0.8016 &        0.8163 & 0.5914 & -0.7305 \\
2.2352e-02 &  0.3041 &  0.1118 &  0.2655 &     0.2655 &  0.2219 &               0.8160 &            0.7647 &        0.7888 & 0.4950 & -0.9132 \\
2.1929e-02 &  0.2978 &  0.1101 &  0.2603 &     0.2603 &  0.2191 &               0.8

### QuAC

In [17]:
# Tested on QuAC
quac_df = evaluation_df[evaluation_df['tested_on'] == 'quac'].groupby(['base_model', 'trained_on', 'tested_on']).mean().sort_values(by=['bertscore-f1'], ascending=False)

quac_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,bleu,rouge1,rouge2,rougeL,rougeLsum,meteor,bertscore-precision,bertscore-recall,bertscore-f1,use,bleurt
base_model,trained_on,tested_on,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
bart,quac,quac,0.0572,0.2867,0.1404,0.2827,0.2827,0.2881,0.7964,0.7902,0.7928,0.4188,-0.8082
T5,amalgam,quac,0.0537,0.2525,0.1225,0.249,0.249,0.2624,0.7788,0.7784,0.7781,0.378,-0.8658
bart,amalgam,quac,0.0274,0.2381,0.0883,0.2337,0.2337,0.2502,0.7585,0.7842,0.7705,0.3638,-1.0444
T5,quac,quac,0.0235,0.1754,0.0674,0.1735,0.1735,0.1899,0.7595,0.7535,0.756,0.3031,-1.1246
bart,squad,quac,0.0071,0.1893,0.0466,0.1842,0.1842,0.2059,0.7368,0.7733,0.754,0.3185,-1.2148
T5,squad,quac,0.0071,0.1908,0.0451,0.1852,0.1852,0.208,0.7352,0.7744,0.7537,0.3205,-1.1997
T5,triviaqa,quac,0.0017,0.1198,0.0241,0.1106,0.1106,0.1606,0.69,0.7442,0.7156,0.2446,-1.3839
bart,triviaqa,quac,0.0015,0.1218,0.0255,0.1136,0.1136,0.163,0.6813,0.7409,0.7093,0.2432,-1.3734
bart,nq,quac,0.0011,0.153,0.0346,0.1496,0.1496,0.1126,0.6925,0.7024,0.6972,0.2866,-1.3413
T5,nq,quac,0.0015,0.1553,0.0354,0.1519,0.1519,0.1116,0.6929,0.7015,0.6969,0.2877,-1.3573


In [18]:
# Convert to Latex
print(quac_df.to_latex(index=False)) 

\begin{tabular}{rrrrrrrrrrr}
\toprule
  bleu &  rouge1 &  rouge2 &  rougeL &  rougeLsum &  meteor &  bertscore-precision &  bertscore-recall &  bertscore-f1 &    use &  bleurt \\
0.0572 &  0.2867 &  0.1404 &  0.2827 &     0.2827 &  0.2881 &               0.7964 &            0.7902 &        0.7928 & 0.4188 & -0.8082 \\
\midrule
0.0537 &  0.2525 &  0.1225 &  0.2490 &     0.2490 &  0.2624 &               0.7788 &            0.7784 &        0.7781 & 0.3780 & -0.8658 \\
0.0274 &  0.2381 &  0.0883 &  0.2337 &     0.2337 &  0.2502 &               0.7585 &            0.7842 &        0.7705 & 0.3638 & -1.0444 \\
0.0235 &  0.1754 &  0.0674 &  0.1735 &     0.1735 &  0.1899 &               0.7595 &            0.7535 &        0.7560 & 0.3031 & -1.1246 \\
0.0071 &  0.1893 &  0.0466 &  0.1842 &     0.1842 &  0.2059 &               0.7368 &            0.7733 &        0.7540 & 0.3185 & -1.2148 \\
0.0071 &  0.1908 &  0.0451 &  0.1852 &     0.1852 &  0.2080 &               0.7352 &            0.7744 &   

### Format code to PEP 8 Standards

### Steps

*   Install:

In [None]:
!pip install black[jupyter]

*   To format your code run:

In [None]:
!black /content/drive/MyDrive/'Colab Notebooks'/table_metrics_by_dataset.ipynb

[1mAll done! ✨ 🍰 ✨[0m
[34m1 file [0mleft unchanged.



*   Don't save your notebook, hit F5 (Command + r) to refresh the page
*   Voila!
*   Now save!