<a href="https://colab.research.google.com/github/jeanlucjackson/w266_final_project/blob/main/code/BB/results_text_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Results: Text Evaluation

### Imports/Setup

In [None]:
from os import listdir
from os.path import isfile, join

import csv
import json
import pprint

import pandas as pd
from tqdm import tqdm

In [None]:
# Colab-only step: authenticates you and mounts your Google Drive in the runtime.
from google.colab import drive

drive_mount_point = "/content/drive"
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Folder on the mounted Drive that holds the evaluation artifacts
evaluation_root = "/content/drive/MyDrive/w266 NLP Final Project/Evaluation/"

# Full path of the evaluation database file
filename = join(evaluation_root, "evaluation_database.json")

# Load the JSON database into a dataframe
evaluation_df = pd.read_json(path_or_buf=filename)

### Clean up evaluation dataframe to only include columns needed for text analysis

In [None]:
# Columns needed for the text analysis: run identifiers, the target/prediction
# text pair, and the metric scores
text_analysis_columns = [
    "nickname",
    "base_model",
    "trained_on",
    "tested_on",
    "hyperparameter",
    "target",
    "prediction",
    "bleu",
    "rougeL",
    "meteor",
    "bertscore-f1",
    "use",
]

# Narrow the evaluation dataframe down to just those columns
clean_eval_df = evaluation_df[text_analysis_columns]

clean_eval_df

### Find the overall averages for each prediction set and metric

In [None]:
# Average every numeric metric column per prediction set (one row per nickname).
# numeric_only=True is required because the frame also carries non-numeric columns
# (e.g. target, prediction, hyperparameter): pandas >= 2.0 raises a TypeError on
# them, whereas older versions silently dropped them from the mean.
eval_means_df = clean_eval_df.groupby("nickname").mean(numeric_only=True)

# Turn the nickname index back into a regular column so the result can be
# grouped and sorted later to find the top scoring prediction sets per metric
eval_means_df = eval_means_df.reset_index()

eval_means_df

### Create BART and T5 dataframes to focus on

In [None]:
# Keep only the rows whose base model is BART
is_bart = clean_eval_df["base_model"] == "bart"
bart_df = clean_eval_df[is_bart]

In [None]:
# Keep only the rows whose base model is T5
is_t5 = clean_eval_df["base_model"] == "T5"
t5_df = clean_eval_df[is_t5]

## Model Evaluation

### USE Scores 0.85-0.90

In [None]:
# BART predictions from the bart_nq_nq run whose USE score lies in [0.85, 0.87],
# listed from highest to lowest USE score
use_in_range = bart_df["use"].between(0.85, 0.87)
is_bart_nq_nq = bart_df["nickname"] == "bart_nq_nq"

bart_df_range = bart_df[use_in_range & is_bart_nq_nq].sort_values(by="use", ascending=False)

bart_df_range

Unnamed: 0,nickname,base_model,trained_on,tested_on,hyperparameter,target,prediction,bleu,rougeL,meteor,bertscore-f1,use
172893,bart_nq_nq,bart,nq,nq,{'defaults': True},who was the main character in their eyes were ...,who was their eyes were watching god based on,0.411055,0.700000,0.640590,0.878411,0.869590
172956,bart_nq_nq,bart,nq,nq,{'defaults': True},when did jack mccoy join law and order,when did john mccoy first appear on law and order,0.000000,0.666667,0.685976,0.919406,0.869536
172437,bart_nq_nq,bart,nq,nq,{'defaults': True},where did they film woody the woodpecker movie,where was the movie woody woodpecker filmed,0.000000,0.400000,0.379747,0.902069,0.869150
172655,bart_nq_nq,bart,nq,nq,{'defaults': True},who wrote the book my country my life,who is the author of my country my life,0.330316,0.705882,0.694444,0.921862,0.867857
172761,bart_nq_nq,bart,nq,nq,{'defaults': True},what does the keys to the city mean,what is the meaning of the key to the city,0.000000,0.555556,0.820060,0.899178,0.867763
...,...,...,...,...,...,...,...,...,...,...,...,...
172179,bart_nq_nq,bart,nq,nq,{'defaults': True},when do willow and tara get back together,when does tara and buffy get back together,0.000000,0.625000,0.638889,0.936549,0.852325
173716,bart_nq_nq,bart,nq,nq,{'defaults': True},when did holland become involved in world war 2,when did the dutch enter world war 2,0.000000,0.588235,0.543820,0.905561,0.851055
173422,bart_nq_nq,bart,nq,nq,{'defaults': True},where is the new stadium being built in las vegas,where is the raiders stadium being built in la...,0.658037,0.900000,0.895062,0.960545,0.851036
172354,bart_nq_nq,bart,nq,nq,{'defaults': True},hart of dixie season 4 how many episodes,how many episodes of hart of dixie are there,0.000000,0.352941,0.727023,0.878181,0.850455


In [None]:
# Look up the bart_nq_nq prediction for one specific target question
target_question = "who wrote the book my country my life"
row_mask = (bart_df["target"] == target_question) & (bart_df["nickname"] == "bart_nq_nq")

# Keep only the text pair and the two scores being compared
spec_pred_scores_df = bart_df.loc[row_mask, ["target", "prediction", "rougeL", "use"]]

spec_pred_scores_df

Unnamed: 0,target,prediction,rougeL,use
172655,who wrote the book my country my life,who is the author of my country my life,0.705882,0.867857


In [None]:
# Render the selected rows as a LaTeX table (index omitted, header row kept)
latex_table = spec_pred_scores_df.to_latex(index=False, header=True)
print(latex_table)

\begin{tabular}{llrr}
\toprule
                               target &                              prediction &   rougeL &      use \\
\midrule
who wrote the book my country my life & who is the author of my country my life & 0.705882 & 0.867857 \\
\bottomrule
\end{tabular}



In [None]:
# T5 predictions from the T5_nq_nq run whose USE score lies in [0.85, 0.86],
# listed from highest to lowest USE score
use_in_range = t5_df["use"].between(0.85, 0.86)
is_t5_nq_nq = t5_df["nickname"] == "T5_nq_nq"

t5_df_range = t5_df[use_in_range & is_t5_nq_nq].sort_values(by="use", ascending=False)

t5_df_range

Unnamed: 0,nickname,base_model,trained_on,tested_on,hyperparameter,target,prediction,bleu,rougeL,meteor,bertscore-f1,use
31,T5_nq_nq,T5,nq,nq,{'defaults': True},what kind of book is where's waldo,what kind of book is where's wally,0.809107,0.875,0.873724,0.992737,0.859866
1275,T5_nq_nq,T5,nq,nq,{'defaults': True},who sang with eternal on i wanna be the only one,who sings i wanna be the only one,0.485987,0.736842,0.678419,0.920693,0.859697
1577,T5_nq_nq,T5,nq,nq,{'defaults': True},who was the original host of what's my line,who is the host of what's my line,0.496264,0.842105,0.786774,0.952402,0.859557
1378,T5_nq_nq,T5,nq,nq,{'defaults': True},who was it that described the structure of dna,who discovered the structure of dna,0.32588,0.666667,0.556322,0.903958,0.8595
1678,T5_nq_nq,T5,nq,nq,{'defaults': True},where is urinary bladder located in human body,where does the bladder sit in the human body,0.0,0.588235,0.459259,0.867073,0.859204
1857,T5_nq_nq,T5,nq,nq,{'defaults': True},when is the opening ceremonies of the olympics...,when is the opening ceremony of the winter oly...,0.368894,0.777778,0.881944,0.919863,0.858611
1962,T5_nq_nq,T5,nq,nq,{'defaults': True},where was the louisiana purchase signed in 1803,where was the louisiana purchase treaty signed,0.5578,0.8,0.745429,0.903282,0.858369
1887,T5_nq_nq,T5,nq,nq,{'defaults': True},who started the guinness book of world records,who is the founder of the guinness book of rec...,0.290715,0.666667,0.685976,0.90942,0.858329
1450,T5_nq_nq,T5,nq,nq,{'defaults': True},where do the secretory cells of endocrine glan...,where do exocrine glands secrete their products,0.276104,0.666667,0.555556,0.940807,0.857914
1741,T5_nq_nq,T5,nq,nq,{'defaults': True},when did the first pokemon game come out,when did pokemon first come out on the game boy,0.0,0.555556,0.769817,0.889377,0.85761


In [None]:
# Look up the T5_nq_nq prediction for the same target question examined for BART
target_question = "who wrote the book my country my life"
row_mask = (t5_df["target"] == target_question) & (t5_df["nickname"] == "T5_nq_nq")

# Keep only the text pair and the two scores being compared
spec_pred_scores_df = t5_df.loc[row_mask, ["target", "prediction", "rougeL", "use"]]

spec_pred_scores_df

Unnamed: 0,target,prediction,rougeL,use
881,who wrote the book my country my life,who wrote my country my life in 2008,0.75,0.794154


### Word and Character Repetition

In [None]:
# Rows of the bart_amalgam_triviaqa_hyp run for one specific target question
target_question = "What was the Elephant Man's real name?"
row_mask = (bart_df["target"] == target_question) & (bart_df["nickname"] == "bart_amalgam_triviaqa_hyp")

# Keep only the text pair and the two scores being compared
spec_pred_scores_df = bart_df.loc[row_mask, ["target", "prediction", "rougeL", "use"]]

# Display only the first matching row; spec_pred_scores_df itself keeps every match
spec_pred_scores_df.head(1)

Unnamed: 0,target,prediction,rougeL,use
417006,What was the Elephant Man's real name?,What was John Merrick's nickname?,0.428571,0.504569


In [None]:
# Render the selected rows as a LaTeX table (index omitted, header row kept)
latex_table = spec_pred_scores_df.to_latex(index=False, header=True)
print(latex_table)

\begin{tabular}{llrr}
\toprule
                                target &                                         prediction &   rougeL &      use \\
\midrule
What was the Elephant Man's real name? &                  What was John Merrick's nickname? & 0.428571 & 0.504569 \\
What was the Elephant Man's real name? & Which 1980 film, directed by David Lynch, starr... & 0.000000 & 0.202451 \\
\bottomrule
\end{tabular}



In [None]:
# Rows of the T5_amalgam_triviaqa run for one specific target question
target_question = "What was the Elephant Man's real name?"
row_mask = (t5_df["target"] == target_question) & (t5_df["nickname"] == "T5_amalgam_triviaqa")

# Keep only the text pair and the two scores being compared
spec_pred_scores_df = t5_df.loc[row_mask, ["target", "prediction", "rougeL", "use"]]

spec_pred_scores_df

Unnamed: 0,target,prediction,rougeL,use
259324,What was the Elephant Man's real name?,What was the name of the man who was exhibited...,0.363636,0.481661
265713,What was the Elephant Man's real name?,"""What 1980 film, directed by David Lynch, feat...",0.095238,0.082935


In [None]:
# Render the selected rows as a LaTeX table (index omitted, header row kept)
latex_table = spec_pred_scores_df.to_latex(index=False, header=True)
print(latex_table)

\begin{tabular}{llrr}
\toprule
                                target &                                         prediction &   rougeL &      use \\
\midrule
What was the Elephant Man's real name? & What was the name of the man who was exhibited ... & 0.363636 & 0.481661 \\
What was the Elephant Man's real name? & "What 1980 film, directed by David Lynch, featu... & 0.095238 & 0.082935 \\
\bottomrule
\end{tabular}



### Metric Analysis

BART: High USE Score but Low RougeL Score

In [None]:
# BART predictions with a high USE score (0.85 to 0.90) but a low ROUGE-L score (<= 0.2),
# ordered from highest to lowest BERTScore F1
high_use = bart_df["use"].between(0.85, 0.90)
low_rouge = bart_df["rougeL"] <= 0.2

bart_hi_lo_df = bart_df[high_use & low_rouge].sort_values(by="bertscore-f1", ascending=False)

bart_hi_lo_df

Unnamed: 0,nickname,base_model,trained_on,tested_on,hyperparameter,target,prediction,bleu,rougeL,meteor,bertscore-f1,use
217016,bart_triviaqa_squad,bart,triviaqa,squad,{'defaults': True},What parts of plants have chloroplasts?,Chloroplasts are found in which part of a plant?,0.0,0.133333,0.509589,0.906873,0.869844
377485,bart_triviaqa_squad_hyp,bart,triviaqa,squad,"{'defaults': False, 'max_length': 50, 'beams':...",What parts of plants have chloroplasts?,Chloroplasts are found in which part of a plant?,0.0,0.133333,0.509589,0.906873,0.869844
280410,bart_amalgam_triviaqa,bart,amalgam,triviaqa,{'defaults': True},Electric Christmas tree lights were first used...,In which year was the first electrically power...,0.0,0.190476,0.432065,0.871232,0.850297
417687,bart_amalgam_triviaqa_hyp,bart,amalgam,triviaqa,"{'defaults': False, 'max_length': 50, 'beams':...",Electric Christmas tree lights were first used...,In which year was the first electrically power...,0.0,0.190476,0.432065,0.871232,0.850296
128090,bart_squad_squad,bart,squad,squad,{'defaults': True},Where did Tesla look for investors prior to WWI?,What type of investors did Tesla seek before W...,0.0,0.2,0.309436,0.870404,0.873923
385016,bart_squad_squad_hyp,bart,squad,squad,"{'defaults': False, 'max_length': 50, 'beams':...",Where did Tesla look for investors prior to WWI?,What type of investors did Tesla seek before W...,0.0,0.2,0.309436,0.870404,0.873923
395586,bart_amalgam_squad_hyp,bart,amalgam,squad,"{'defaults': False, 'max_length': 50, 'beams':...",Where did Tesla look for investors prior to WWI?,What type of investors did Tesla seek before W...,0.0,0.2,0.309436,0.870404,0.873923
414029,bart_amalgam_triviaqa_hyp,bart,amalgam,triviaqa,"{'defaults': False, 'max_length': 50, 'beams':...",What country has slums known as favelas?,A favela is a slum in which country?,0.0,0.133333,0.246914,0.868003,0.856116


In [None]:
# Rows of the bart_triviaqa_triviaqa_hyp run for the favelas target question
target_question = "What country has slums known as favelas?"
row_mask = (bart_df["target"] == target_question) & (bart_df["nickname"] == "bart_triviaqa_triviaqa_hyp")
spec_pred_scores_df = bart_df[row_mask]

# Display only the columns of interest; spec_pred_scores_df keeps every column
spec_pred_scores_df.loc[:, ["nickname", "target", "prediction", "rougeL", "use"]]

Unnamed: 0,nickname,target,prediction,rougeL,use
321084,bart_triviaqa_triviaqa_hyp,What country has slums known as favelas?,In which country would you find a favela?,0.133333,0.725211


In [None]:
# Rows of the bart_amalgam_triviaqa run for the favelas target question
target_question = "What country has slums known as favelas?"
row_mask = (bart_df["target"] == target_question) & (bart_df["nickname"] == "bart_amalgam_triviaqa")
spec_pred_scores_df = bart_df[row_mask]

# Display only the columns of interest; spec_pred_scores_df keeps every column
spec_pred_scores_df.loc[:, ["nickname", "target", "prediction", "rougeL", "use"]]

Unnamed: 0,nickname,target,prediction,rougeL,use
276752,bart_amalgam_triviaqa,What country has slums known as favelas?,In which South American country would you find...,0.117647,0.691679


In [None]:
# Rows of the bart_amalgam_triviaqa_hyp run for the favelas target question
target_question = "What country has slums known as favelas?"
row_mask = (bart_df["target"] == target_question) & (bart_df["nickname"] == "bart_amalgam_triviaqa_hyp")
spec_pred_scores_df = bart_df[row_mask]

# Display only the columns of interest; spec_pred_scores_df keeps every column
spec_pred_scores_df.loc[:, ["nickname", "target", "prediction", "rougeL", "use"]]

Unnamed: 0,nickname,target,prediction,rougeL,use
414029,bart_amalgam_triviaqa_hyp,What country has slums known as favelas?,A favela is a slum in which country?,0.133333,0.856116


BART: Specific Example of High USE and Low ROUGEL

In [None]:
# Rows of the bart_amalgam_triviaqa_hyp run for the favelas target question
target_question = "What country has slums known as favelas?"
row_mask = (bart_df["target"] == target_question) & (bart_df["nickname"] == "bart_amalgam_triviaqa_hyp")

# Keep the model/dataset identifiers alongside the text pair and the two scores
example_columns = ["base_model", "trained_on", "tested_on", "target", "prediction", "rougeL", "use"]
spec_pred_scores_df = bart_df.loc[row_mask, example_columns]

spec_pred_scores_df

Unnamed: 0,base_model,trained_on,tested_on,target,prediction,rougeL,use
414029,bart,amalgam,triviaqa,What country has slums known as favelas?,A favela is a slum in which country?,0.133333,0.856116


In [None]:
# Render the selected rows as a LaTeX table (index omitted, header row kept)
latex_table = spec_pred_scores_df.to_latex(index=False, header=True)
print(latex_table)

\begin{tabular}{lllllrr}
\toprule
base\_model & trained\_on & tested\_on &                                   target &                           prediction &   rougeL &      use \\
\midrule
      bart &    amalgam &  triviaqa & What country has slums known as favelas? & A favela is a slum in which country? & 0.133333 & 0.856116 \\
\bottomrule
\end{tabular}



## Format code to PEP 8 Standards

### Steps

*   Install:

In [None]:
!pip install black[jupyter]


*   To format your code run:

In [None]:
# Runs black on a notebook file stored on the mounted Drive.
# NOTE(review): the path names prediction_text_analysis.ipynb — confirm it points at the notebook you intend to format.
!black /content/drive/MyDrive/'Colab Notebooks'/prediction_text_analysis.ipynb


*   Don't save your notebook; press F5 (Command + R on macOS) to refresh the page
*   Voila!
*   Now save!