In [None]:
# Uncomment to install the dependencies into the running kernel's environment:
# %pip install bert_score
# %pip install ipywidgets

# LOAD LIBRARIES

In [None]:
import pandas as pd
from bert_score import score
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from nltk.translate.bleu_score import corpus_bleu
import nltk
nltk.download('wordnet')
from functions import *

# BERTScore Metric

In [None]:
"""
=== Why to Use BertScore?

In order to evaluate predictions on the basis of their semantic meaning.
In most cases Meteor is unable to handle the semantic variation between different texts that have the same meaning.

Good Examples:
[

'updated the password':'changed the credentials', bert_score:0.61, meteor_score:0.11
'modified firewall settings':'updated firewall rules', bert_score:0.76, meteor_score:0.11
# 'Categorized website':'Categorized URL', bert_score:0.70, meteor_score:0.17
# 'Added space to drive':'Increased drive capacity', bert_score:0.60, meteor_score:0.12
]


=== Why to Use Meteor?

In order to evaluate predictions on the basis of syntax and synonyms.
In some cases bert_score gives a high value even if the prediction is not correct.

Good Examples:
[

'Categorized the website':'Tested website access', bert_score:0.40, meteor_score:0.16
'modified firewall settings':'changed user account', bert_score:0.39, meteor_score:0.0
'Categorized website':'updated account details', bert_score:0.32, meteor_score:0.0
'Added space to drive':'Increased system RAM', bert_score:0.34, meteor_score:0.12
]

=== Why are we not using simple cosine similarity for evaluation?

This is because we want our predictions to be evaluated in a particular sequence
with respect to the actual labels. There might be a case where a particular word
is used at different locations in the two phrases; we want to make sure that the
meaning of that word is matched one to one against the other phrase.
The following examples yield high cosine similarity even though the phrases are not the same.

 - Categorized website
 - Deleted the categorized URL
 
cosine similarity using spacy: 0.68
Bert Score: 0.25
Meteor Score: 0.22

 ------
 - 'Added space to drive'
 - 'Deleted space from drive'

cosine similarity using spacy: 0.86
Bert Score: 0.52
Meteor Score: 0.25
"""




In [None]:
# Reference (ground-truth) phrase and the predicted phrase scored by the
# metric cells below. Each list holds a single example.
actual_list = ["Added space to drive"]
predicted_list = ["Deleted space from drive"]

In [None]:
# Similarity between the first reference phrase and the first prediction.
# NOTE(review): `nlp` is not defined in the visible cells — presumably a spaCy
# pipeline exposed by the wildcard import from `functions`; confirm.
reference_doc = nlp(actual_list[0])
prediction_doc = nlp(predicted_list[0])
reference_doc.similarity(prediction_doc)

In [None]:
# BERTScore of the predictions (candidates) against the actual labels
# (references). Usage reference: https://pypi.org/project/bert-score/
score(
    cands=predicted_list,
    refs=actual_list,
    lang='en',
    model_type='bert-base-uncased',
    rescale_with_baseline=True,
)

In [None]:
# METEOR score of the first prediction against the reference phrase(s),
# rounded to 4 decimal places.
# Current NLTK releases require pre-tokenized input: the references must be an
# iterable of token lists and the hypothesis a token list. Passing raw strings
# raises a TypeError, so split on whitespace first (the same tokenization the
# BLEU cell in this notebook uses).
reference_tokens = [reference.split() for reference in actual_list]
hypothesis_tokens = predicted_list[0].split()
round(meteor_score(reference_tokens, hypothesis_tokens), 4)

In [None]:
# Sentence-level BLEU for one whitespace-tokenized hypothesis against a single
# whitespace-tokenized reference, rounded to 4 decimal places.
bleu_references = ['categorized the site'.split()]
bleu_hypothesis = 'site has been categorized'.split()
round(sentence_bleu(bleu_references, bleu_hypothesis, auto_reweigh=True), 4)