<a href="https://colab.research.google.com/github/jeanlucjackson/w266_final_project/blob/main/code/BB/table_metrics_by_dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Table for Metrics by Dataset

### Imports/Setup

In [None]:
from os import listdir
from os.path import isfile, join

import csv
import json
import pprint

import pandas as pd
from tqdm import tqdm

In [None]:
# Authorize access to Google Drive and attach it to the Colab filesystem.
from google.colab import drive

drive.mount(mountpoint="/content/drive")

Mounted at /content/drive


In [None]:
# Folder on the mounted Drive that holds the evaluation artifacts.
evaluation_root = "/content/drive/MyDrive/w266 NLP Final Project/Evaluation/"
# Full path of the JSON evaluation database inside that folder.
filename = join(evaluation_root, "evaluation_database.json")

# Load the JSON database into a dataframe.
evaluation_df = pd.read_json(path_or_buf=filename)

### Cleanup dataframe

In [None]:
# Average every numeric metric over all rows that share a prediction-set nickname.
# numeric_only=True keeps the non-numeric columns (e.g. base_model, trained_on,
# tested_on) out of the aggregation; recent pandas versions no longer drop them
# silently and raise a TypeError instead.
# The nickname is then moved back from the index into a regular column so the
# frame can be grouped and sorted on it to find the top-scoring prediction sets.
eval_means_df = (
    evaluation_df.groupby("nickname")
    .mean(numeric_only=True)
    .reset_index()
)

eval_means_df

Unnamed: 0,nickname,bleu,rouge1,rouge2,rougeL,rougeLsum,meteor,bertscore-precision,bertscore-recall,bertscore-f1,bleurt,use
0,T5_amalgam_nq,0.16134,0.545908,0.321837,0.520167,0.520167,0.494038,0.860089,0.853318,0.85637,-0.35621,0.714194
1,T5_amalgam_quac,0.029039,0.257288,0.102606,0.252199,0.252199,0.255084,0.788942,0.782265,0.785061,-0.935537,0.39845
2,T5_amalgam_squad,0.125223,0.487697,0.268768,0.455174,0.455174,0.453043,0.869129,0.85625,0.862051,-0.326483,0.646733
3,T5_amalgam_triviaqa,0.058694,0.412592,0.194635,0.346813,0.346813,0.327001,0.83709,0.803558,0.819109,-0.683983,0.593142
4,T5_nq_nq,0.161035,0.543956,0.321336,0.519087,0.519087,0.491609,0.861622,0.852308,0.856634,-0.368653,0.71261
5,T5_nq_quac,0.001004,0.153919,0.033906,0.150299,0.150299,0.111316,0.692656,0.700757,0.696398,-1.360866,0.285393
6,T5_nq_squad,0.014082,0.357617,0.144095,0.330075,0.330075,0.259321,0.800772,0.772858,0.786268,-0.762601,0.539696
7,T5_nq_triviaqa,0.007319,0.313972,0.116665,0.273116,0.273116,0.186009,0.791732,0.738775,0.76386,-1.062357,0.512451
8,T5_quac_nq,0.000151,0.149113,0.023918,0.144824,0.144824,0.076108,0.695247,0.681317,0.688033,-1.507222,0.288375
9,T5_quac_quac,0.002599,0.172027,0.041067,0.171033,0.171033,0.165326,0.771336,0.749962,0.760098,-1.370463,0.32111


## Metrics by Data Tested On

In [None]:
# Show scores with 4 decimal places in pandas output.
# The fully qualified key is used because the bare "precision" pattern is
# ambiguous on newer pandas releases (it also matches "styler.format.precision"),
# in which case set_option raises an OptionError.
pd.set_option("display.precision", 4)

### NQ

In [None]:
# Tested on NQ: average each numeric metric per
# (base_model, trained_on, tested_on) combination and rank the combinations by
# BERTScore F1, best first.
# NOTE(review): unlike the SQuAD/TriviaQA/QuAC sections, this filter keeps only
# the BART rows — confirm that leaving T5 out here is intentional.
# numeric_only=True is needed because the frame also carries non-numeric columns
# (e.g. nickname); recent pandas versions raise a TypeError instead of dropping them.
nq_df = (
    evaluation_df[
        (evaluation_df["tested_on"] == "nq") & (evaluation_df["base_model"] == "bart")
    ]
    .groupby(["base_model", "trained_on", "tested_on"])
    .mean(numeric_only=True)
    .sort_values(by="bertscore-f1", ascending=False)
)

nq_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,bleu,rouge1,rouge2,rougeL,rougeLsum,meteor,bertscore-precision,bertscore-recall,bertscore-f1,bleurt,use
base_model,trained_on,tested_on,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
bart,nq,nq,0.2035,0.5759,0.357,0.5499,0.5499,0.528,0.8705,0.8644,0.8672,-0.2524,0.7364
bart,amalgam,nq,0.1962,0.5698,0.3532,0.5451,0.5451,0.524,0.8664,0.8627,0.8643,-0.2673,0.7294
bart,squad,nq,0.0118,0.4289,0.2126,0.4107,0.4107,0.3552,0.7929,0.8051,0.7986,-0.6335,0.6304
bart,triviaqa,nq,0.0113,0.4037,0.1982,0.3665,0.3665,0.3683,0.7772,0.8163,0.7959,-0.7319,0.6243
bart,quac,nq,0.005,0.3354,0.1381,0.3273,0.3273,0.2257,0.7605,0.7424,0.7511,-0.9865,0.5175


In [None]:
# Convert to LaTeX.
# reset_index() turns the (base_model, trained_on, tested_on) row labels into
# ordinary columns so every row of the printed table is identifiable. Exporting
# the grouped frame directly with index=False dropped those labels and, as the
# recorded output shows, placed \midrule after the first data row instead of
# directly after the header.
# float_format pins 4 decimals independently of the global display option.
print(nq_df.reset_index().to_latex(index=False, float_format="%.4f"))

\begin{tabular}{rrrrrrrrrrr}
\toprule
  bleu &  rouge1 &  rouge2 &  rougeL &  rougeLsum &  meteor &  bertscore-precision &  bertscore-recall &  bertscore-f1 &  bleurt &    use \\
0.2035 &  0.5759 &  0.3570 &  0.5499 &     0.5499 &  0.5280 &               0.8705 &            0.8644 &        0.8672 & -0.2524 & 0.7364 \\
\midrule
0.1962 &  0.5698 &  0.3532 &  0.5451 &     0.5451 &  0.5240 &               0.8664 &            0.8627 &        0.8643 & -0.2673 & 0.7294 \\
0.0118 &  0.4289 &  0.2126 &  0.4107 &     0.4107 &  0.3552 &               0.7929 &            0.8051 &        0.7986 & -0.6335 & 0.6304 \\
0.0113 &  0.4037 &  0.1982 &  0.3665 &     0.3665 &  0.3683 &               0.7772 &            0.8163 &        0.7959 & -0.7319 & 0.6243 \\
0.0050 &  0.3354 &  0.1381 &  0.3273 &     0.3273 &  0.2257 &               0.7605 &            0.7424 &        0.7511 & -0.9865 & 0.5175 \\
\bottomrule
\end{tabular}



### SQuAD

In [None]:
# Tested on SQuAD: average each numeric metric per
# (base_model, trained_on, tested_on) combination and rank the combinations by
# BERTScore F1, best first.
# numeric_only=True is needed because the frame also carries non-numeric columns
# (e.g. nickname); recent pandas versions raise a TypeError instead of dropping them.
squad_df = (
    evaluation_df[evaluation_df["tested_on"] == "squad"]
    .groupby(["base_model", "trained_on", "tested_on"])
    .mean(numeric_only=True)
    .sort_values(by="bertscore-f1", ascending=False)
)

squad_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,bleu,rouge1,rouge2,rougeL,rougeLsum,meteor,bertscore-precision,bertscore-recall,bertscore-f1,bleurt,use
base_model,trained_on,tested_on,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
bart,squad,squad,0.1515,0.5036,0.2905,0.4692,0.4692,0.4756,0.8752,0.8651,0.8696,-0.2657,0.657
T5,squad,squad,0.1323,0.4842,0.2684,0.4521,0.4521,0.4555,0.8714,0.8598,0.8649,-0.3187,0.6422
bart,amalgam,squad,0.1362,0.4935,0.2798,0.4588,0.4588,0.4605,0.869,0.858,0.8629,-0.3088,0.6504
T5,amalgam,squad,0.1252,0.4877,0.2688,0.4552,0.4552,0.453,0.8691,0.8563,0.8621,-0.3265,0.6467
bart,triviaqa,squad,0.0479,0.3519,0.1425,0.3114,0.3114,0.3387,0.81,0.8253,0.8169,-0.6442,0.5354
bart,quac,squad,0.0279,0.3053,0.1131,0.2902,0.2902,0.2453,0.8329,0.776,0.8028,-0.8311,0.4456
T5,triviaqa,squad,0.0285,0.2938,0.1059,0.2426,0.2426,0.2894,0.7839,0.7999,0.791,-0.8792,0.4581
T5,nq,squad,0.0141,0.3576,0.1441,0.3301,0.3301,0.2593,0.8008,0.7729,0.7863,-0.7626,0.5397
bart,nq,squad,0.0144,0.3492,0.1393,0.3228,0.3228,0.2522,0.7976,0.7713,0.7839,-0.7636,0.5407
T5,quac,squad,0.0009,0.18,0.0253,0.1734,0.1734,0.1343,0.7693,0.7168,0.7417,-1.355,0.2571


In [None]:
# Convert to LaTeX.
# reset_index() turns the (base_model, trained_on, tested_on) row labels into
# ordinary columns so every row of the printed table is identifiable. Exporting
# the grouped frame directly with index=False dropped those labels and, as the
# recorded output shows, placed \midrule after the first data row instead of
# directly after the header.
# float_format pins 4 decimals independently of the global display option.
print(squad_df.reset_index().to_latex(index=False, float_format="%.4f"))

\begin{tabular}{rrrrrrrrrrr}
\toprule
  bleu &  rouge1 &  rouge2 &  rougeL &  rougeLsum &  meteor &  bertscore-precision &  bertscore-recall &  bertscore-f1 &  bleurt &    use \\
0.1515 &  0.5036 &  0.2905 &  0.4692 &     0.4692 &  0.4756 &               0.8752 &            0.8651 &        0.8696 & -0.2657 & 0.6570 \\
\midrule
0.1323 &  0.4842 &  0.2684 &  0.4521 &     0.4521 &  0.4555 &               0.8714 &            0.8598 &        0.8649 & -0.3187 & 0.6422 \\
0.1362 &  0.4935 &  0.2798 &  0.4588 &     0.4588 &  0.4605 &               0.8690 &            0.8580 &        0.8629 & -0.3088 & 0.6504 \\
0.1252 &  0.4877 &  0.2688 &  0.4552 &     0.4552 &  0.4530 &               0.8691 &            0.8563 &        0.8621 & -0.3265 & 0.6467 \\
0.0479 &  0.3519 &  0.1425 &  0.3114 &     0.3114 &  0.3387 &               0.8100 &            0.8253 &        0.8169 & -0.6442 & 0.5354 \\
0.0279 &  0.3053 &  0.1131 &  0.2902 &     0.2902 &  0.2453 &               0.8329 &            0.7760 &   

### TriviaQA

In [None]:
# Tested on TriviaQA: average each numeric metric per
# (base_model, trained_on, tested_on) combination and rank the combinations by
# BERTScore F1, best first.
# numeric_only=True is needed because the frame also carries non-numeric columns
# (e.g. nickname); recent pandas versions raise a TypeError instead of dropping them.
triviaqa_df = (
    evaluation_df[evaluation_df["tested_on"] == "triviaqa"]
    .groupby(["base_model", "trained_on", "tested_on"])
    .mean(numeric_only=True)
    .sort_values(by="bertscore-f1", ascending=False)
)

triviaqa_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,bleu,rouge1,rouge2,rougeL,rougeLsum,meteor,bertscore-precision,bertscore-recall,bertscore-f1,bleurt,use
base_model,trained_on,tested_on,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
bart,amalgam,triviaqa,0.0778,0.4355,0.2212,0.3684,0.3684,0.3541,0.8439,0.8112,0.8264,-0.6272,0.6088
bart,triviaqa,triviaqa,0.0766,0.4344,0.2193,0.3682,0.3682,0.3513,0.8444,0.8098,0.8259,-0.6317,0.608
T5,amalgam,triviaqa,0.0587,0.4126,0.1946,0.3468,0.3468,0.327,0.8371,0.8036,0.8191,-0.684,0.5931
T5,triviaqa,triviaqa,0.0456,0.378,0.1613,0.3153,0.3153,0.2995,0.8257,0.7966,0.8101,-0.7718,0.5797
T5,squad,triviaqa,0.0225,0.3098,0.1093,0.2667,0.2667,0.2255,0.8141,0.7652,0.7881,-0.9154,0.5026
bart,squad,triviaqa,0.0219,0.2975,0.11,0.2602,0.2602,0.2189,0.8151,0.7641,0.788,-0.9097,0.4917
bart,nq,triviaqa,0.0079,0.3317,0.1283,0.2862,0.2862,0.2008,0.7999,0.7446,0.7708,-1.0172,0.5345
T5,nq,triviaqa,0.0073,0.314,0.1167,0.2731,0.2731,0.186,0.7917,0.7388,0.7639,-1.0624,0.5125
bart,quac,triviaqa,0.007,0.218,0.0625,0.1971,0.1971,0.1505,0.7998,0.7166,0.7553,-1.1698,0.3959
T5,quac,triviaqa,0.0002,0.1248,0.0128,0.1162,0.1162,0.0889,0.7418,0.6726,0.705,-1.522,0.2256


In [None]:
# Convert to LaTeX.
# reset_index() turns the (base_model, trained_on, tested_on) row labels into
# ordinary columns so every row of the printed table is identifiable. Exporting
# the grouped frame directly with index=False dropped those labels and, as the
# recorded output shows, placed \midrule after the first data row instead of
# directly after the header.
# float_format pins 4 decimals independently of the global display option.
print(triviaqa_df.reset_index().to_latex(index=False, float_format="%.4f"))

\begin{tabular}{rrrrrrrrrrr}
\toprule
  bleu &  rouge1 &  rouge2 &  rougeL &  rougeLsum &  meteor &  bertscore-precision &  bertscore-recall &  bertscore-f1 &  bleurt &    use \\
0.0778 &  0.4355 &  0.2212 &  0.3684 &     0.3684 &  0.3541 &               0.8439 &            0.8112 &        0.8264 & -0.6272 & 0.6088 \\
\midrule
0.0766 &  0.4344 &  0.2193 &  0.3682 &     0.3682 &  0.3513 &               0.8444 &            0.8098 &        0.8259 & -0.6317 & 0.6080 \\
0.0587 &  0.4126 &  0.1946 &  0.3468 &     0.3468 &  0.3270 &               0.8371 &            0.8036 &        0.8191 & -0.6840 & 0.5931 \\
0.0456 &  0.3780 &  0.1613 &  0.3153 &     0.3153 &  0.2995 &               0.8257 &            0.7966 &        0.8101 & -0.7718 & 0.5797 \\
0.0225 &  0.3098 &  0.1093 &  0.2667 &     0.2667 &  0.2255 &               0.8141 &            0.7652 &        0.7881 & -0.9154 & 0.5026 \\
0.0219 &  0.2975 &  0.1100 &  0.2602 &     0.2602 &  0.2189 &               0.8151 &            0.7641 &   

### QuAC

In [None]:
# Tested on QuAC: average each numeric metric per
# (base_model, trained_on, tested_on) combination and rank the combinations by
# BERTScore F1, best first.
# numeric_only=True is needed because the frame also carries non-numeric columns
# (e.g. nickname); recent pandas versions raise a TypeError instead of dropping them.
quac_df = (
    evaluation_df[evaluation_df["tested_on"] == "quac"]
    .groupby(["base_model", "trained_on", "tested_on"])
    .mean(numeric_only=True)
    .sort_values(by="bertscore-f1", ascending=False)
)

quac_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,bleu,rouge1,rouge2,rougeL,rougeLsum,meteor,bertscore-precision,bertscore-recall,bertscore-f1,bleurt,use
base_model,trained_on,tested_on,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
bart,quac,quac,0.0582,0.2859,0.1407,0.282,0.282,0.2877,0.7958,0.79,0.7924,-0.8071,0.4174
T5,amalgam,quac,0.029,0.2573,0.1026,0.2522,0.2522,0.2551,0.7889,0.7823,0.7851,-0.9355,0.3984
bart,amalgam,quac,0.0281,0.2372,0.0881,0.2328,0.2328,0.2493,0.7582,0.7837,0.7701,-1.0411,0.3628
T5,quac,quac,0.0026,0.172,0.0411,0.171,0.171,0.1653,0.7713,0.75,0.7601,-1.3705,0.3211
T5,squad,quac,0.0062,0.1898,0.0451,0.1837,0.1837,0.2096,0.7355,0.7751,0.7543,-1.1584,0.3232
bart,squad,quac,0.0073,0.1887,0.0463,0.1836,0.1836,0.2053,0.7364,0.7731,0.7537,-1.216,0.3181
bart,triviaqa,quac,0.0015,0.1216,0.0254,0.1134,0.1134,0.1631,0.6813,0.7408,0.7093,-1.3748,0.2428
T5,triviaqa,quac,0.001,0.1099,0.0193,0.0989,0.0989,0.1497,0.6817,0.7362,0.7073,-1.3861,0.2271
bart,nq,quac,0.0011,0.153,0.0347,0.1497,0.1497,0.1127,0.6924,0.7024,0.6971,-1.3418,0.2867
T5,nq,quac,0.001,0.1539,0.0339,0.1503,0.1503,0.1113,0.6927,0.7008,0.6964,-1.3609,0.2854


In [None]:
# Convert to LaTeX.
# reset_index() turns the (base_model, trained_on, tested_on) row labels into
# ordinary columns so every row of the printed table is identifiable. Exporting
# the grouped frame directly with index=False dropped those labels and, as the
# recorded output shows, placed \midrule after the first data row instead of
# directly after the header.
# float_format pins 4 decimals independently of the global display option.
print(quac_df.reset_index().to_latex(index=False, float_format="%.4f"))

\begin{tabular}{rrrrrrrrrrr}
\toprule
  bleu &  rouge1 &  rouge2 &  rougeL &  rougeLsum &  meteor &  bertscore-precision &  bertscore-recall &  bertscore-f1 &  bleurt &    use \\
0.0582 &  0.2859 &  0.1407 &  0.2820 &     0.2820 &  0.2877 &               0.7958 &            0.7900 &        0.7924 & -0.8071 & 0.4174 \\
\midrule
0.0290 &  0.2573 &  0.1026 &  0.2522 &     0.2522 &  0.2551 &               0.7889 &            0.7823 &        0.7851 & -0.9355 & 0.3984 \\
0.0281 &  0.2372 &  0.0881 &  0.2328 &     0.2328 &  0.2493 &               0.7582 &            0.7837 &        0.7701 & -1.0411 & 0.3628 \\
0.0026 &  0.1720 &  0.0411 &  0.1710 &     0.1710 &  0.1653 &               0.7713 &            0.7500 &        0.7601 & -1.3705 & 0.3211 \\
0.0062 &  0.1898 &  0.0451 &  0.1837 &     0.1837 &  0.2096 &               0.7355 &            0.7751 &        0.7543 & -1.1584 & 0.3232 \\
0.0073 &  0.1887 &  0.0463 &  0.1836 &     0.1836 &  0.2053 &               0.7364 &            0.7731 &   

### Format code to PEP 8 Standards

### Steps

*   Install:

In [None]:
!pip install black[jupyter]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting black[jupyter]
  Downloading black-22.10.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 9.3 MB/s 
[?25hCollecting click>=8.0.0
  Downloading click-8.1.3-py3-none-any.whl (96 kB)
[K     |████████████████████████████████| 96 kB 4.9 MB/s 
[?25hCollecting pathspec>=0.9.0
  Downloading pathspec-0.10.2-py3-none-any.whl (28 kB)
Collecting typed-ast>=1.4.2
  Downloading typed_ast-1.5.4-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (843 kB)
[K     |████████████████████████████████| 843 kB 41.3 MB/s 
Collecting mypy-extensions>=0.4.3
  Downloading mypy_extensions-0.4.3-py2.py3-none-any.whl (4.5 kB)
Collecting platformdirs>=2
  Downloading platformdirs-2.5.4-py3-none-any.whl (14 kB)
Collecting tokenize-rt>=3.2.0
  Downloading tokenize_rt-5.0.0-py2.py3-none-any.whl

*   To format your code run:

In [None]:
!black /content/drive/MyDrive/'Colab Notebooks'/prediction_text_analysis.ipynb


*   Don't save your notebook yet; press F5 (Command + R on macOS) to refresh the page
*   Voila!
*   Now save!