In [1]:
import re
import nltk
import numpy as np
import pandas as pd 
import transformers
from tqdm import tqdm
import spacy
import string
from nltk.tokenize import sent_tokenize, word_tokenize
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem.porter import *
from nltk.stem.snowball import *
from datasets import list_metrics, load_metric
from collections import Counter
from transformers import GPT2Tokenizer, GPT2TokenizerFast
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
data = pd.read_csv("./Data/output_test_curie_full_ppl.csv")
data.head()

Unnamed: 0,Issue,Actual_Solution,Pred_Solution,Response_PPL,Total_PPL
0,mrh aft scissors has side to side play ...,"removed, inspected, and reshimmed aft scissor...",inspected mrh aft scissor iaw 150-300. play is...,1.338561,1.715868
1,black mrh blade lower tip cap screws worn. car...,black mrh blade lower tip cap screws removed ...,removed and replaced black mrh blade lower tip...,1.156229,1.715868
2,blue mrb damper leaking out of limits. ...,removed and replaced blue main rotor head dam...,removed and replaced blue main rotor damper ia...,1.621602,1.715868
3,left hand nose landing gear tire worn. \n\n###...,replaced left nose landing gear tire iaw sss ...,replaced lh nlg tire iaw sss 3240. ataf apaf a...,1.236525,1.715868
4,scir change: 16pmgg9. yellow blade tip cap s...,removed and replaced yellow bladetip cap scre...,removed and replased all tip cap screws iaw 15...,2.087995,1.715868


In [3]:
acro_dict = {
    "mrh": "main rotor head",
    "nlg": "nose landing gear",
    "tr": "tail rotor",
    "aff": "area fod free",
    "hyd": "hydraulic",
    "oz": "ounces",
    "tr": "tail rotor ",
    "r&r": "removed and replaced",
    "mlg": "main landing gear",
    "lh": "left hand",
    "rh": "right hand",
    "quad": "quadrant",
    "tq": "torque",
    "lgcu": "landing gear control unit",
    "150-300": "num_main",
    "3240": "main_num",
}

rev_dict = {"num_main":"150-300",
           "main_num": "3240"}

def replace_acronym(text):
    for acr, full in acro_dict.items():
        text = text.replace(acr.lower(), full)
    return text

def remove_eos(text, eos_token=" <EOS>"):
    t2 = text.replace(eos_token, "")
    return t2

def replace_num_grp(text):
    t3 = re.sub('[\d]+', '#num', text)
    return t3

def revert_val(text1):
    for acr, full in rev_dict.items():
        text1 = text1.replace(acr, full)
    return text1

def process_data(text):
    """
    Processes data: includes conversion from short form to full form and replacing numbers except a few top occuring ones
    """
    t1 = replace_acronym(text)
    t2 = remove_eos(t1)
    t3 = replace_num_grp(t2)
    t4 = revert_val(t3)
    return t4

def get_gpt2_tokenized_text(text, tokenizer):
    """
    Tokenize and create a list of tokenized words as per GPT-2 tokenizer 
    """
    tokens1   = tokenizer(text)['input_ids']
    list_toks = [tokenizer.decode(x).strip().lower() for x in tokens1]
    return list_toks

def load_support_spacy():
    "Loads all supporting requirements for punctuations, stopwords, stemmer"
    
    nlp = spacy.load("en_core_web_sm")
    # Create our list of punchuation marks
    punctuations = string.punctuation
     # Create our list of stop words
    stop_words = spacy.lang.en.stop_words.STOP_WORDS
    # Load Porter
    porterStemmer = PorterStemmer()
    
    return nlp, punctuations, stop_words, porterStemmer

def spacy_tokenizer(sent, nlp, punctuations, stop_words, porterStemmer, stem = False):
    """This function will accept a sentence as input and process the sentence into tokens, performing stemming, 
    lowercasing, removing stop words and punctuations."""
    
    doc = nlp(sent)
    mytokens = [token.text.lower() for token in doc if token.text not in stop_words and token.text not in punctuations]    
    if stem:
        # Perform stemming to get root words 
        mytokens = [porterStemmer.stem(word) for word in mytokens]

    return mytokens

### PROCESS DATA

In [4]:
data['Upd_Actual_Solution'] = data['Actual_Solution'].apply(lambda x: process_data(x))
data['Upd_Pred_Solution']   = data['Pred_Solution'].apply(lambda x: process_data(x))

In [27]:
nlp, punctuations, stop_words, porterStemmer = load_support_spacy()
pred_tokens = [spacy_tokenizer(sent, nlp, punctuations, stop_words, porterStemmer, stem=True) for sent in tqdm(data['Upd_Pred_Solution'])]
act_tokens  = [[spacy_tokenizer(sent, nlp, punctuations, stop_words, porterStemmer, stem=True)] for sent in tqdm(data['Upd_Actual_Solution'])]

100%|█████████████████████████████████████████████████████████████████████████████| 3320/3320 [00:24<00:00, 135.40it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 3320/3320 [00:25<00:00, 132.31it/s]


In [28]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=4)
bleu_score

{'bleu': 0.2905331425479142,
 'precisions': [0.593419740777667,
  0.3600687696217671,
  0.2455088878401762,
  0.17156846473029044],
 'brevity_penalty': 0.9432626748188677,
 'length_ratio': 0.9448130155697003,
 'translation_length': 70210,
 'reference_length': 74311}

In [29]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=3)
bleu_score

{'bleu': 0.3531052737941756,
 'precisions': [0.593419740777667, 0.3600687696217671, 0.2455088878401762],
 'brevity_penalty': 0.9432626748188677,
 'length_ratio': 0.9448130155697003,
 'translation_length': 70210,
 'reference_length': 74311}

In [30]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=2)
bleu_score

{'bleu': 0.4360199609948481,
 'precisions': [0.593419740777667, 0.3600687696217671],
 'brevity_penalty': 0.9432626748188677,
 'length_ratio': 0.9448130155697003,
 'translation_length': 70210,
 'reference_length': 74311}

In [31]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=1)
bleu_score

{'bleu': 0.5597506919762613,
 'precisions': [0.593419740777667],
 'brevity_penalty': 0.9432626748188677,
 'length_ratio': 0.9448130155697003,
 'translation_length': 70210,
 'reference_length': 74311}

In [32]:
sbleu         = load_metric('sacrebleu')
sbleu_score   = sbleu.compute(predictions=data['Pred_Solution'].tolist(), references=[[x] for x in data['Actual_Solution']])
sbleu_score

{'score': 23.642324852109972,
 'counts': [50018, 28500, 18700, 12743],
 'totals': [95625, 92305, 88985, 85665],
 'precisions': [52.30640522875817,
  30.87590054709929,
  21.01477777153453,
  14.87538668067472],
 'bp': 0.8869482621085094,
 'sys_len': 95625,
 'ref_len': 107097}

In [33]:
rouge   = load_metric('rouge')
rouge   = rouge.compute(predictions=data['Upd_Pred_Solution'].tolist(), references=[[x] for x in data['Upd_Actual_Solution']])
rouge

{'rouge1': AggregateScore(low=Score(precision=0.5941932345005622, recall=0.5832325193353943, fmeasure=0.5697916436086047), mid=Score(precision=0.6020652308667536, recall=0.5913538654647492, fmeasure=0.5773395798425655), high=Score(precision=0.6102303735716907, recall=0.5991253772969031, fmeasure=0.5841931021992998)),
 'rouge2': AggregateScore(low=Score(precision=0.3923231297694795, recall=0.38449693123011286, fmeasure=0.3762251449404273), mid=Score(precision=0.400561292696072, recall=0.3931981345004616, fmeasure=0.384381882447152), high=Score(precision=0.40909435282565887, recall=0.4011351538239484, fmeasure=0.3921502186977758)),
 'rougeL': AggregateScore(low=Score(precision=0.5294247524074076, recall=0.5204112435605912, fmeasure=0.5083175344316421), mid=Score(precision=0.5377844997465213, recall=0.5285399064603791, fmeasure=0.515955154695161), high=Score(precision=0.5457391139895541, recall=0.5363425625393664, fmeasure=0.523528685053943)),
 'rougeLsum': AggregateScore(low=Score(precis

# EVALUATE MEAN RESPONSE PERPLEXITY

In [21]:
data['Response_PPL'].mean()

1.9041122266641177

# METRICS FOR WORD TOKENIZER ON UNPROCESSED DATA

In [22]:
nlp, punctuations, stop_words, porterStemmer = load_support_spacy()
pred_tokens = [spacy_tokenizer(sent, nlp, punctuations, stop_words, porterStemmer, stem=False) for sent in tqdm(data['Pred_Solution'])]
act_tokens  = [[spacy_tokenizer(sent, nlp, punctuations, stop_words, porterStemmer, stem=False)] for sent in tqdm(data['Actual_Solution'])]

100%|█████████████████████████████████████████████████████████████████████████████| 3320/3320 [00:22<00:00, 148.54it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 3320/3320 [00:22<00:00, 145.40it/s]


In [23]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=4)
bleu_score

{'bleu': 0.20517612194314272,
 'precisions': [0.521076260206695,
  0.2784420692061277,
  0.1719006327630654,
  0.11218050976638662],
 'brevity_penalty': 0.8921113417914803,
 'length_ratio': 0.8975336686678566,
 'translation_length': 66378,
 'reference_length': 73956}

In [24]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=3)
bleu_score

{'bleu': 0.26064952184373114,
 'precisions': [0.521076260206695, 0.2784420692061277, 0.1719006327630654],
 'brevity_penalty': 0.8921113417914803,
 'length_ratio': 0.8975336686678566,
 'translation_length': 66378,
 'reference_length': 73956}

In [25]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=2)
bleu_score

{'bleu': 0.3398107633203335,
 'precisions': [0.521076260206695, 0.2784420692061277],
 'brevity_penalty': 0.8921113417914803,
 'length_ratio': 0.8975336686678566,
 'translation_length': 66378,
 'reference_length': 73956}

In [26]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=1)
bleu_score

{'bleu': 0.46485804166868117,
 'precisions': [0.521076260206695],
 'brevity_penalty': 0.8921113417914803,
 'length_ratio': 0.8975336686678566,
 'translation_length': 66378,
 'reference_length': 73956}

## EVALUATION FOR N-SHOT SETTINGS

In [38]:
nsdata = pd.read_csv("./Data/API_Comparisons.csv")

Unnamed: 0,prompt,actual_completion,finetune_completion,nshot_completion_curie,nshot_completion_davinci,Comment
0,mrh aft scissors has side to side play ...,"removed, inspected, and reshimmed aft scissor...",inspected mrh aft scissor iaw 150-300. play is...,inspeted mrh aft scissors side to side play ia...,replaced aft scissors iaw 150-300. checks goo...,
1,black mrh blade lower tip cap screws worn. car...,black mrh blade lower tip cap screws removed ...,removed and replaced black mrh blade lower tip...,removed and replaced black mrh blade lower tip...,removed and replaced black blade lower tip cap...,
2,blue mrb damper leaking out of limits. ...,removed and replaced blue main rotor head dam...,removed and replaced blue main rotor damper ia...,inspected and replaced blue mrb damper iaw 150...,removed and replaced blue mrb damper iaw 150-3...,
3,left hand nose landing gear tire worn. \n\n###...,replaced left nose landing gear tire iaw sss ...,replaced lh nlg tire iaw sss 3240. ataf apaf a...,iaw s/n 2120 removed and replaced left hand no...,removed and replaced left hand nose landing ge...,
4,scir change: 16pmgg9. yellow blade tip cap s...,removed and replaced yellow bladetip cap scre...,removed and replased all tip cap screws iaw 15...,replaced screws iaw a1-h60ra-140-300. all chec...,replaced yellow blade tip cap screws iaw a1-h6...,


In [39]:
nsdata['Upd_Finetune']      = nsdata['finetune_completion'].apply(lambda x: process_data(x))
nsdata['Upd_NShot_Curie']   = nsdata['nshot_completion_curie'].apply(lambda x: process_data(x))
nsdata['Upd_NShot_Davinci'] = nsdata['nshot_completion_davinci'].apply(lambda x: process_data(x))
nsdata['Upd_Actual']        = nsdata['actual_completion'].apply(lambda x: process_data(x))

In [40]:
rouge   = load_metric('rouge')
rouge   = rouge.compute(predictions=nsdata['Upd_Finetune'].tolist(), references=[[x] for x in nsdata['Upd_Actual']])
rouge

{'rouge1': AggregateScore(low=Score(precision=0.5596612734752757, recall=0.48957065261166266, fmeasure=0.5042537855440952), mid=Score(precision=0.6538003607794429, recall=0.6021880074274288, fmeasure=0.6023432561721545), high=Score(precision=0.7502726022394897, recall=0.7113032944473935, fmeasure=0.7036463174742328)),
 'rouge2': AggregateScore(low=Score(precision=0.35353197037137374, recall=0.31189652844713506, fmeasure=0.3182310410510946), mid=Score(precision=0.4586681247628679, recall=0.4190763039206067, fmeasure=0.42374923010905213), high=Score(precision=0.5786169715846395, recall=0.552809710791453, fmeasure=0.549162418690499)),
 'rougeL': AggregateScore(low=Score(precision=0.47035174834187093, recall=0.4148356490598334, fmeasure=0.42916171557913335), mid=Score(precision=0.5782386734305771, recall=0.523951477957332, fmeasure=0.5332462851019608), high=Score(precision=0.6811627791850565, recall=0.6533938790615401, fmeasure=0.648514029849493)),
 'rougeLsum': AggregateScore(low=Score(pr

In [41]:
rouge   = load_metric('rouge')
rouge   = rouge.compute(predictions=nsdata['Upd_NShot_Curie'].tolist(), references=[[x] for x in nsdata['Upd_Actual']])
rouge

{'rouge1': AggregateScore(low=Score(precision=0.3727761920095415, recall=0.33187928220407736, fmeasure=0.3428473353168135), mid=Score(precision=0.45302617484069096, recall=0.4149446177937377, fmeasure=0.40525603857513415), high=Score(precision=0.5266320113578178, recall=0.49914848082086083, fmeasure=0.45865839804286856)),
 'rouge2': AggregateScore(low=Score(precision=0.19672337242401802, recall=0.17291969498149773, fmeasure=0.17850236802854452), mid=Score(precision=0.2504350316192491, recall=0.22823338350723493, fmeasure=0.22224153617693249), high=Score(precision=0.311061475451906, recall=0.29316945208894973, fmeasure=0.2711426933103656)),
 'rougeL': AggregateScore(low=Score(precision=0.3458436714237952, recall=0.30343301192714517, fmeasure=0.3101075605139638), mid=Score(precision=0.41222663063875525, recall=0.3796201733190164, fmeasure=0.3701150436567302), high=Score(precision=0.4819075577200576, recall=0.465647164931456, fmeasure=0.4238865920302943)),
 'rougeLsum': AggregateScore(low

In [42]:
rouge   = load_metric('rouge')
rouge   = rouge.compute(predictions=nsdata['Upd_NShot_Davinci'].tolist(), references=[[x] for x in nsdata['Upd_Actual']])
rouge

{'rouge1': AggregateScore(low=Score(precision=0.3317821012544803, recall=0.24998039772727274, fmeasure=0.2793258171764496), mid=Score(precision=0.4507794868530163, recall=0.3618035019688044, fmeasure=0.38785760025205906), high=Score(precision=0.5466599818455842, recall=0.4780296548658058, fmeasure=0.4832542712371364)),
 'rouge2': AggregateScore(low=Score(precision=0.1719538665924215, recall=0.1366530350861225, fmeasure=0.1499098816953766), mid=Score(precision=0.2586128257541046, recall=0.21494735301892237, fmeasure=0.22664884973796823), high=Score(precision=0.34854815477972434, recall=0.29945255183221803, fmeasure=0.310942182768551)),
 'rougeL': AggregateScore(low=Score(precision=0.3006985047092376, recall=0.22776132986625844, fmeasure=0.2530169168284688), mid=Score(precision=0.41258742351365696, recall=0.33315649844513606, fmeasure=0.3561021864691335), high=Score(precision=0.5162586816514712, recall=0.4425359887049226, fmeasure=0.4578308279015947)),
 'rougeLsum': AggregateScore(low=Sc

In [44]:
nlp, punctuations, stop_words, porterStemmer = load_support_spacy()
ftune_tokens = [spacy_tokenizer(sent, nlp, punctuations, stop_words, porterStemmer, stem=False) for sent in tqdm(nsdata['Upd_Finetune'])]
act_tokens  = [[spacy_tokenizer(sent, nlp, punctuations, stop_words, porterStemmer, stem=False)] for sent in tqdm(nsdata['Upd_Actual'])]
curie_tokens = [spacy_tokenizer(sent, nlp, punctuations, stop_words, porterStemmer, stem=False) for sent in tqdm(nsdata['Upd_NShot_Curie'])]
davinci_tokens = [spacy_tokenizer(sent, nlp, punctuations, stop_words, porterStemmer, stem=False) for sent in tqdm(nsdata['Upd_NShot_Davinci'])]

100%|█████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 124.19it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 125.76it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 137.89it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 169.48it/s]


In [52]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_ngram_ft    = []
bleu_ngram_curie = []
bleu_ngram_dvnci = []
for i in range(1,5):
    bleu_ngram_ft.append(bleu.compute(predictions=ftune_tokens, references=act_tokens,max_order=i)['bleu'])
    bleu_ngram_curie.append(bleu.compute(predictions=curie_tokens, references=act_tokens,max_order=i)['bleu'])
    bleu_ngram_dvnci.append(bleu.compute(predictions=davinci_tokens, references=act_tokens,max_order=i)['bleu'])
print("Bleu Scores for 20 instance from 1 to 4 gram for Finetuned model: ")
print(bleu_ngram_ft)
print("Bleu Scores for 20 instance model from 1 to 4 gram for Curie model: ")
print(bleu_ngram_curie)
print("Bleu Scores for 20 instance model from 1 to 4 gram for Davinci model: ")
print(bleu_ngram_dvnci)

Bleu Scores for 20 instance from 1 to 4 gram for Finetuned model: 
[0.4976118507175784, 0.4024628993333813, 0.3280804514054867, 0.27252460116453414]
Bleu Scores for 20 instance model from 1 to 4 gram for Curie model: 
[0.32772594547320916, 0.24616958147111095, 0.18751735954917328, 0.143130473864383]
Bleu Scores for 20 instance model from 1 to 4 gram for Davinci model: 
[0.2504854584234485, 0.18523252387068123, 0.14253999618930616, 0.1098713100240581]


# EVALUATION ON RAW DATASET PREDICTION FOR GPT-2 Tokenization

In [None]:
tokenizer   = GPT2Tokenizer.from_pretrained("gpt2")
pred_tokens = [get_gpt2_tokenized_text(i[:-1], tokenizer) for i in tqdm(data['Pred_Solution'])]
act_tokens  = [[get_gpt2_tokenized_text(i[:-1], tokenizer)] for i in tqdm(data['Actual_Solution'])]

### BLEU SCORE COMPUTATION

In [None]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=4)
bleu_score

In [None]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=3)
bleu_score

In [None]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=2)
bleu_score

In [None]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=1)
bleu_score

### SACREBLEU SCORE COMPARISON

In [None]:
sbleu         = load_metric('sacrebleu')
sbleu_score   = sbleu.compute(predictions=data['Pred_Solution'].tolist(), references=[[x] for x in data['Actual_Solution']])
sbleu_score

In [None]:
sbleu         = load_metric('sacrebleu')
sbleu_score   = sbleu.compute(predictions=pred_tokens, references=act_tokens)
sbleu_score

### ROUGE SCORE EVALUATION

In [None]:
rouge   = load_metric('rouge')
rouge   = rouge.compute(predictions=data['Pred_Solution'].tolist(), references=[[x] for x in data['Actual_Solution']])
rouge

In [None]:
list_metrics()

In [None]:
# BLEURT METRIC EVALUATION

In [None]:
# bleurt   = load_metric('bleurt')
# bleurt   = bleurt.compute(predictions=data['Pred_Solution'].tolist(), references=[[x] for x in data['Actual_Solution']])
# bleurt

In [None]:
# ! pip install git+https://github.com/google-research/bleurt.git
# !pip install rouge_score

In [None]:
# Get TFIDF Scores for each word-token
corpus     = data['Actual_Solution'].tolist()
vectorizer = TfidfVectorizer(max_df=5000)
X          = vectorizer.fit_transform(corpus)
map_tfidf  = dict(zip(vectorizer.get_feature_names(), X.toarray()[0]))

key_li = []
val_li = []
for k,v in map_tfidf.items():
    if v>= 0.01:
        key_li.append(k)
        val_li.append(v)  

In [None]:
tokenized_li = [nltk.word_tokenize(x) for x in data['Actual_Solution'].tolist()]
tokenized = [i for sub in tokenized_li for i in sub]
sorted(Counter(tokenized).items(), key=lambda x: x[1], reverse= True)

# EVALUATION ON PROCESSED DATA

In [None]:
tokenizer   = GPT2Tokenizer.from_pretrained("gpt2")
pred_tokens = [get_gpt2_tokenized_text(i[:-1], tokenizer) for i in tqdm(data['Upd_Pred_Solution'])]
act_tokens  = [[get_gpt2_tokenized_text(i[:-1], tokenizer)] for i in tqdm(data['Upd_Actual_Solution'])]

In [None]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=4)
bleu_score

In [None]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=3)
bleu_score      

In [None]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=2)
bleu_score      

In [None]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=1)
bleu_score 

In [None]:
sbleu         = load_metric('sacrebleu')
sbleu_score   = sbleu.compute(predictions=data['Upd_Pred_Solution'].tolist(), references=[[x] for x in data['Upd_Actual_Solution']])
sbleu_score

In [None]:
sbleu         = load_metric('sacrebleu')
sbleu_score   = sbleu.compute(predictions=pred_tokens, references=act_tokens)
sbleu_score

In [None]:
rouge   = load_metric('rouge')
rouge   = rouge.compute(predictions=data['Upd_Pred_Solution'].tolist(), references=[[x] for x in data['Upd_Actual_Solution']])
rouge

# EVALUATION USING WORD TOKENS 

In [None]:
# !pip install nltk
# !pip install spacy==2.3.5
# !pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
# ! pip install pyresparser

In [None]:
# !pip uninstall spacy -y