In [1]:
import re
import nltk
import numpy as np
import pandas as pd 
import transformers
from tqdm import tqdm
import spacy
import string
from nltk.tokenize import sent_tokenize, word_tokenize
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.stem.porter import *
from nltk.stem.snowball import *
from datasets import list_metrics, load_metric
from collections import Counter
from transformers import GPT2Tokenizer, GPT2TokenizerFast
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
data = pd.read_csv("./Data/output_test_curie_full_ppl.csv")
data.head()

Unnamed: 0,Issue,Actual_Solution,Pred_Solution,Response_PPL,Total_PPL
0,mrh aft scissors has side to side play ...,"removed, inspected, and reshimmed aft scissor...",inspected mrh aft scissor iaw 150-300. play is...,1.338561,1.715868
1,black mrh blade lower tip cap screws worn. car...,black mrh blade lower tip cap screws removed ...,removed and replaced black mrh blade lower tip...,1.156229,1.715868
2,blue mrb damper leaking out of limits. ...,removed and replaced blue main rotor head dam...,removed and replaced blue main rotor damper ia...,1.621602,1.715868
3,left hand nose landing gear tire worn. \n\n###...,replaced left nose landing gear tire iaw sss ...,replaced lh nlg tire iaw sss 3240. ataf apaf a...,1.236525,1.715868
4,scir change: 16pmgg9. yellow blade tip cap s...,removed and replaced yellow bladetip cap scre...,removed and replased all tip cap screws iaw 15...,2.087995,1.715868


In [13]:
acro_dict = {
    "mrh": "main rotor head",
    "nlg": "nose landing gear",
    "tr": "tail rotor",
    "aff": "area fod free",
    "hyd": "hydraulic",
    "oz": "ounces",
    "tr": "tail rotor ",
    "r&r": "removed and replaced",
    "mlg": "main landing gear",
    "lh": "left hand",
    "rh": "right hand",
    "quad": "quadrant",
    "tq": "torque",
    "lgcu": "landing gear control unit",
    "150-300": "num_main",
    "3240": "main_num",
}

rev_dict = {"num_main":"150-300",
           "main_num": "3240"}

def replace_acronym(text):
    for acr, full in acro_dict.items():
        text = text.replace(acr.lower(), full)
    return text

def remove_eos(text, eos_token=" <EOS>"):
    t2 = text.replace(eos_token, "")
    return t2

def replace_num_grp(text):
    t3 = re.sub('[\d]+', '#num', text)
    return t3

def revert_val(text1):
    for acr, full in rev_dict.items():
        text1 = text1.replace(acr, full)
    return text1

def process_data(text):
    """
    Processes data: includes conversion from short form to full form and replacing numbers except a few top occuring ones
    """
    t1 = replace_acronym(text)
    t2 = remove_eos(t1)
    t3 = replace_num_grp(t2)
    t4 = revert_val(t3)
    return t4

def get_gpt2_tokenized_text(text, tokenizer):
    """
    Tokenize and create a list of tokenized words as per GPT-2 tokenizer 
    """
    tokens1   = tokenizer(text)['input_ids']
    list_toks = [tokenizer.decode(x).strip().lower() for x in tokens1]
    return list_toks

def spacy_tokenizer(sent, stem = False):
    """This function will accept a sentence as input and process the sentence into tokens, performing lemmatization, 
    lowercasing, removing stop words and punctuations."""
    
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(sent)
    
    # Create our list of punchuation marks
    punctuations = string.punctuation
    
    # Create our list of stop words
    stop_words = spacy.lang.en.stop_words.STOP_WORDS
    
    
    mytokens = [token.text for token in doc]
    # Removing stop words
    mytokens = [word.lower() for word in mytokens if word not in stop_words and word not in punctuations]
    
    if stem:
        # Perform stemming to get root words 
        porterStemmer = PorterStemmer()
        mytokens = [porterStemmer.stem(word) for word in mytokens]

    return mytokens

### PROCESS DATA

In [6]:
data['Upd_Actual_Solution'] = data['Actual_Solution'].apply(lambda x: process_data(x))
data['Upd_Pred_Solution']   = data['Pred_Solution'].apply(lambda x: process_data(x))

In [None]:
pred_tokens = [spacy_tokenizer(sent, stem=False) for sent in tqdm(data['Upd_Pred_Solution'])]
act_tokens  = [[spacy_tokenizer(sent, stem=False)] for sent in tqdm(data['Upd_Actual_Solution'])]


  0%|                                                                                         | 0/3320 [00:00<?, ?it/s][A
  0%|                                                                                 | 1/3320 [00:00<21:01,  2.63it/s][A
  0%|                                                                                 | 2/3320 [00:00<24:10,  2.29it/s][A
  0%|                                                                                 | 3/3320 [00:01<22:40,  2.44it/s][A
  0%|                                                                                 | 4/3320 [00:01<21:58,  2.52it/s][A
  0%|                                                                                 | 5/3320 [00:02<21:47,  2.53it/s][A
  0%|▏                                                                                | 6/3320 [00:02<21:29,  2.57it/s][A
  0%|▏                                                                                | 7/3320 [00:02<21:29,  2.57it/s][A
  0%|▏         

In [11]:
data['Upd_Pred_Solution'][0]

'inspected main rotor head aft scissor iaw 150-300. play is within limits. max limit is .#num. area secure and fod free. pema: #numfksa#num.'

# EVALUATION ON RAW DATASET PREDICTION

In [5]:
tokenizer   = GPT2Tokenizer.from_pretrained("gpt2")
pred_tokens = [get_gpt2_tokenized_text(i[:-1], tokenizer) for i in tqdm(data['Pred_Solution'])]
act_tokens  = [[get_gpt2_tokenized_text(i[:-1], tokenizer)] for i in tqdm(data['Actual_Solution'])]

100%|█████████████████████████████████████████████████████████████████████████████| 3320/3320 [00:07<00:00, 462.04it/s]
100%|████████████████████████████████████████████████████████████████████████████| 3320/3320 [00:03<00:00, 1050.29it/s]


### BLEU SCORE COMPUTATION

In [6]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=4)
bleu_score

{'bleu': 0.29267535519823756,
 'precisions': [0.5397948990664817,
  0.3629951925554549,
  0.265140252507339,
  0.2018561664512234],
 'brevity_penalty': 0.9145855175097924,
 'length_ratio': 0.9180339765792512,
 'translation_length': 139151,
 'reference_length': 151575}

In [7]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=3)
bleu_score

{'bleu': 0.34126579280458197,
 'precisions': [0.5397948990664817, 0.3629951925554549, 0.265140252507339],
 'brevity_penalty': 0.9145855175097924,
 'length_ratio': 0.9180339765792512,
 'translation_length': 139151,
 'reference_length': 151575}

In [8]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=2)
bleu_score

{'bleu': 0.4048453401632763,
 'precisions': [0.5397948990664817, 0.3629951925554549],
 'brevity_penalty': 0.9145855175097924,
 'length_ratio': 0.9180339765792512,
 'translation_length': 139151,
 'reference_length': 151575}

In [9]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=1)
bleu_score

{'bleu': 0.49368859711186436,
 'precisions': [0.5397948990664817],
 'brevity_penalty': 0.9145855175097924,
 'length_ratio': 0.9180339765792512,
 'translation_length': 139151,
 'reference_length': 151575}

### SACREBLEU SCORE COMPARISON

In [10]:
sbleu         = load_metric('sacrebleu')
sbleu_score   = sbleu.compute(predictions=data['Pred_Solution'].tolist(), references=[[x] for x in data['Actual_Solution']])
sbleu_score

{'score': 23.642324852109972,
 'counts': [50018, 28500, 18700, 12743],
 'totals': [95625, 92305, 88985, 85665],
 'precisions': [52.30640522875817,
  30.87590054709929,
  21.01477777153453,
  14.87538668067472],
 'bp': 0.8869482621085094,
 'sys_len': 95625,
 'ref_len': 107097}

In [11]:
sbleu         = load_metric('sacrebleu')
sbleu_score   = sbleu.compute(predictions=pred_tokens, references=act_tokens)
sbleu_score

{'score': 46.65712339786926,
 'counts': [226711, 173581, 142038, 116934],
 'totals': [312698, 309378, 306058, 302738],
 'precisions': [72.50158299701309,
  56.106445836484816,
  46.408850610015094,
  38.62547813621019],
 'bp': 0.8978612468906232,
 'sys_len': 312698,
 'ref_len': 346388}

### ROUGE SCORE EVALUATION

In [12]:
rouge   = load_metric('rouge')
rouge   = rouge.compute(predictions=data['Pred_Solution'].tolist(), references=[[x] for x in data['Actual_Solution']])
rouge

{'rouge1': AggregateScore(low=Score(precision=0.5407562935891993, recall=0.5020586295257934, fmeasure=0.5047617486216732), mid=Score(precision=0.5484841392187603, recall=0.5097293965166099, fmeasure=0.5123071390748343), high=Score(precision=0.5571247916950236, recall=0.516815112455666, fmeasure=0.5194525538636169)),
 'rouge2': AggregateScore(low=Score(precision=0.3375369233205327, recall=0.31014886900474264, fmeasure=0.3136708215220749), mid=Score(precision=0.3454919254991401, recall=0.31749396269741603, fmeasure=0.3207688874767536), high=Score(precision=0.35373340501154094, recall=0.324668818180614, fmeasure=0.328088476223464)),
 'rougeL': AggregateScore(low=Score(precision=0.48825431632971256, recall=0.45354504615286373, fmeasure=0.4561370911079002), mid=Score(precision=0.4963574872878247, recall=0.46082322550647536, fmeasure=0.4634842252276374), high=Score(precision=0.5051409249320887, recall=0.46789665192651975, fmeasure=0.47130567524309847)),
 'rougeLsum': AggregateScore(low=Score

In [13]:
list_metrics()

['accuracy',
 'bertscore',
 'bleu',
 'bleurt',
 'cer',
 'chrf',
 'code_eval',
 'comet',
 'competition_math',
 'coval',
 'cuad',
 'f1',
 'gleu',
 'glue',
 'google_bleu',
 'indic_glue',
 'matthews_correlation',
 'mauve',
 'meteor',
 'pearsonr',
 'precision',
 'recall',
 'rouge',
 'sacrebleu',
 'sari',
 'seqeval',
 'spearmanr',
 'squad',
 'squad_v2',
 'super_glue',
 'ter',
 'wer',
 'wiki_split',
 'xnli']

In [14]:
# BLEURT METRIC EVALUATION

In [15]:
# bleurt   = load_metric('bleurt')
# bleurt   = bleurt.compute(predictions=data['Pred_Solution'].tolist(), references=[[x] for x in data['Actual_Solution']])
# bleurt

In [16]:
# ! pip install git+https://github.com/google-research/bleurt.git
# !pip install rouge_score

In [17]:
# Get TFIDF Scores for each word-token
corpus     = data['Actual_Solution'].tolist()
vectorizer = TfidfVectorizer(max_df=5000)
X          = vectorizer.fit_transform(corpus)
map_tfidf  = dict(zip(vectorizer.get_feature_names(), X.toarray()[0]))

key_li = []
val_li = []
for k,v in map_tfidf.items():
    if v>= 0.01:
        key_li.append(k)
        val_li.append(v)  

In [18]:
tokenized_li = [nltk.word_tokenize(x) for x in data['Actual_Solution'].tolist()]
tokenized = [i for sub in tokenized_li for i in sub]
sorted(Counter(tokenized).items(), key=lambda x: x[1], reverse= True)

[('.', 7654),
 ('and', 4807),
 ('<', 3320),
 ('EOS', 3320),
 ('>', 3320),
 ('iaw', 3105),
 ('area', 2607),
 ('fod', 2600),
 ('free', 2348),
 (':', 2271),
 ('secure', 2082),
 (',', 1504),
 ('to', 1369),
 ('of', 1291),
 ('removed', 1258),
 ('at', 1217),
 ('replaced', 1177),
 ('time', 1133),
 ('pema', 1039),
 ('inspection', 781),
 ('150-300.', 750),
 ('rotor', 748),
 ('serviced', 727),
 ('for', 667),
 ('mrh', 645),
 ('checks', 592),
 ('yellow', 582),
 ('refer', 571),
 ('good', 555),
 ('is', 524),
 ('a1-h60ra-150-300', 508),
 ('accumulator', 503),
 ('all', 469),
 ('psi', 462),
 ('*', 460),
 ('main', 455),
 ('ataf', 454),
 ('blade', 449),
 ('green', 443),
 ('s/n', 414),
 ('blue', 413),
 ('gai-000', 409),
 ('red', 407),
 ('aff', 401),
 ('black', 400),
 ('apaf', 398),
 ('on', 380),
 ('sss', 376),
 ('tail', 358),
 ('mcn', 348),
 ('check', 344),
 ('the', 341),
 ('jcn', 338),
 ('degrees', 337),
 ('clean', 331),
 ('a/c', 322),
 ('oat', 316),
 ('hours', 302),
 ('pcr', 299),
 ('nitrogen', 271),
 ('

# EVALUATION ON PROCESSED DATA

In [19]:
tokenizer   = GPT2Tokenizer.from_pretrained("gpt2")
pred_tokens = [get_gpt2_tokenized_text(i[:-1], tokenizer) for i in tqdm(data['Upd_Pred_Solution'])]
act_tokens  = [[get_gpt2_tokenized_text(i[:-1], tokenizer)] for i in tqdm(data['Upd_Actual_Solution'])]

100%|████████████████████████████████████████████████████████████████████████████| 3320/3320 [00:02<00:00, 1470.31it/s]
100%|████████████████████████████████████████████████████████████████████████████| 3320/3320 [00:01<00:00, 1662.46it/s]


In [20]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=4)
bleu_score

{'bleu': 0.37542769436933604,
 'precisions': [0.594894210893579,
  0.43215376953414186,
  0.3182988448776727,
  0.24276837670663493],
 'brevity_penalty': 1.0,
 'length_ratio': 1.00372002669433,
 'translation_length': 151906,
 'reference_length': 151343}

In [21]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=3)
bleu_score      

{'bleu': 0.4341478989434973,
 'precisions': [0.594894210893579, 0.43215376953414186, 0.3182988448776727],
 'brevity_penalty': 1.0,
 'length_ratio': 1.00372002669433,
 'translation_length': 151906,
 'reference_length': 151343}

In [22]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=2)
bleu_score      

{'bleu': 0.507036266663144,
 'precisions': [0.594894210893579, 0.43215376953414186],
 'brevity_penalty': 1.0,
 'length_ratio': 1.00372002669433,
 'translation_length': 151906,
 'reference_length': 151343}

In [23]:
metrics_list = list_metrics()
bleu         = load_metric('bleu')
bleu_score   = bleu.compute(predictions=pred_tokens, references=act_tokens,max_order=1)
bleu_score 

{'bleu': 0.594894210893579,
 'precisions': [0.594894210893579],
 'brevity_penalty': 1.0,
 'length_ratio': 1.00372002669433,
 'translation_length': 151906,
 'reference_length': 151343}

In [24]:
sbleu         = load_metric('sacrebleu')
sbleu_score   = sbleu.compute(predictions=data['Upd_Pred_Solution'].tolist(), references=[[x] for x in data['Upd_Actual_Solution']])
sbleu_score

{'score': 32.93534821143222,
 'counts': [67973, 43777, 30273, 21038],
 'totals': [115807, 112487, 109167, 105847],
 'precisions': [58.695070246185466,
  38.917386009049935,
  27.73090769188491,
  19.875858550549378],
 'bp': 0.9832261975610447,
 'sys_len': 115807,
 'ref_len': 117766}

In [25]:
sbleu         = load_metric('sacrebleu')
sbleu_score   = sbleu.compute(predictions=pred_tokens, references=act_tokens)
sbleu_score

{'score': 58.019225562232165,
 'counts': [261628, 221129, 191022, 164543],
 'totals': [360658, 357338, 354018, 350698],
 'precisions': [72.54185405564274,
  61.882307507178076,
  53.958273308136874,
  46.918716388459586],
 'bp': 0.9992682724348892,
 'sys_len': 360658,
 'ref_len': 360922}

In [26]:
rouge   = load_metric('rouge')
rouge   = rouge.compute(predictions=data['Upd_Pred_Solution'].tolist(), references=[[x] for x in data['Upd_Actual_Solution']])
rouge

{'rouge1': AggregateScore(low=Score(precision=0.5947270238048222, recall=0.583125780800635, fmeasure=0.5702848245107484), mid=Score(precision=0.6022840740678459, recall=0.5914159837924859, fmeasure=0.577416837622177), high=Score(precision=0.6103506986741225, recall=0.5997470976011195, fmeasure=0.5847768683837642)),
 'rouge2': AggregateScore(low=Score(precision=0.39203650738951984, recall=0.3842542696444089, fmeasure=0.37622761678685146), mid=Score(precision=0.40030264330778276, recall=0.39282972086050727, fmeasure=0.3841673834451809), high=Score(precision=0.4086599048752092, recall=0.40182017479620796, fmeasure=0.39206712192407767)),
 'rougeL': AggregateScore(low=Score(precision=0.5297581482409971, recall=0.5204637672867531, fmeasure=0.5084703277452172), mid=Score(precision=0.5376420092455716, recall=0.5284248305904186, fmeasure=0.5158369704487537), high=Score(precision=0.546321835957302, recall=0.5361824770284803, fmeasure=0.5238319737612823)),
 'rougeLsum': AggregateScore(low=Score(p

# Evaluate Mean Perplexity

In [27]:
data['Response_PPL'].mean()

1.9041122266641177

# EVALUATION USING WORD TOKENS 

In [5]:
# !pip install nltk
# !pip install spacy==2.3.5
# !pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz
# ! pip install pyresparser

In [7]:
# !pip uninstall spacy -y