# Preparations

This notebook presents examples of how to use the BBMetric library to evaluate our chatbots.

In [None]:
# Various imports to load the metric library, the model, the tokenizer, and the characters data
from metrics import BBMetric
from transformers import TFAutoModelForCausalLM, AutoTokenizer
import os
from data_utils import character_dict, source_dict, random_state
from transformers import AdamWeightDecay
import os

# Detect whether the notebook runs inside Google Colaboratory. Only a failed
# import means "not in Colab": any other exception is a genuine error and is
# left to propagate instead of being silently swallowed.
try:
    import google.colab  # noqa: F401 -- imported only to probe the environment
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    # Mount Google Drive and point base_folder at the project folder stored there
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
    # Install the extra packages, in the same order as before
    for pip_command in (
        "pip install datasets",
        "pip install transformers",
        "pip install rouge_score",
        "pip install -U sentence-transformers",
    ):
        os.system(pip_command)
else:
    # Outside Colab the project root is taken to be the parent of the working directory
    base_folder = '..'

# Folder holding the per-character data
out_folder = os.path.join(base_folder, 'Data', 'Characters')
# Get Barney character folder, as an example; exist_ok=True makes the creation
# idempotent and avoids the check-then-create race of a separate exists() test
barney_folder = os.path.join(out_folder, 'Barney')
os.makedirs(barney_folder, exist_ok=True)

# Create some basic sentences to feed to the metrics
sentences_basic = ["Hi!", "How are you?", "I hate you."]
sentences_basic_2 = ["Hello!", "How are you doing?", "I think this is good."]
sentences_vader = ["Come to the dark side!", "I will kill you!", "Luke, I am your father."]
sentences_barney = ["Did you get the suit?", "Legendary!", "I like girls."]

Below, a model is loaded from the `microsoft/DialoGPT-small` checkpoint together with its corresponding tokenizer, and the tokenizer's padding token is set to `#`.

In [None]:
# Load the default DialoGPT model with its tokenizer, and compile the model
dialogpt_checkpoint = 'microsoft/DialoGPT-small'
hf_cache_dir = os.path.join(os.getcwd(), "cache")

# Tokenizer of the same checkpoint, with '#' set as its padding token
tokenizer = AutoTokenizer.from_pretrained(dialogpt_checkpoint, cache_dir=hf_cache_dir)
tokenizer.pad_token = '#'

# Model weights, cached in the same local folder as the tokenizer files
model = TFAutoModelForCausalLM.from_pretrained(dialogpt_checkpoint, cache_dir=hf_cache_dir)
model.compile(optimizer=AdamWeightDecay(learning_rate=2e-5))

In the following cell the fine-tuned Barney model is loaded and the Barney dataset is preprocessed. The HuggingFace dataset is then transformed into a TensorFlow one, ready to be fed to the model.

In [None]:
# Functions to load a dataset and prepare it, used for perplexity
from datasets import load_dataset, DatasetDict
from transformers import DataCollatorForLanguageModeling
from data_utils import load_df, construct_conv, preprocess_function

# Batch size of the TensorFlow test set, used for perplexity
batch_size = 8

# Locate the Barney fine-tuned DialoGPT checkpoint, then load and compile it
barney_checkpoint_path = os.path.join(barney_folder, character_dict['Barney']['checkpoint_folder'])
model_barney = TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=barney_checkpoint_path)
model_barney.compile(optimizer=AdamWeightDecay(learning_rate=2e-5))

# Collator with masked language modeling disabled (mlm=False), returning TensorFlow tensors
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer, return_tensors='tf')

# Load the Barney dataset and apply preprocess_function example by example
barney_hg = load_df('Barney', base_folder)
tokenized_barney_hg = barney_hg.map(preprocess_function, batched=False)

# Turn the "test" split of the HuggingFace dataset into a TensorFlow dataset,
# unshuffled and batched, ready to be fed to the model
tf_dataset_options = dict(
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)
barney_test_set = tokenized_barney_hg["test"].to_tf_dataset(**tf_dataset_options)

# Running Metrics

`BBMetric.metrics_list` shows the list of all the available metrics.

In [None]:
# Display the available metrics list: as the last expression of the cell,
# the value of `BBMetric.metrics_list` is rendered as the cell output
BBMetric.metrics_list

`BBMetric.load_metric(metric_name)` loads the metric named `metric_name`, together with the model or algorithm that computes it. It returns a `metric` object that is ready to be computed by invoking `metric.compute`. Some metrics (such as the human ones and the semantic classifier) require training, in which case a method `metric.train` is provided.

In [None]:
# BLEU between the basic sentences (predictions) and their paraphrases (references)
metric = BBMetric.load_metric("bleu")

bleu_result = metric.compute(predictions=sentences_basic, references=sentences_basic_2)
print(bleu_result)

In [None]:
# Rouge-L between the basic sentences (predictions) and their paraphrases (references)
metric = BBMetric.load_metric("rouge l")

rouge_l_result = metric.compute(predictions=sentences_basic, references=sentences_basic_2)
print(rouge_l_result)

In [None]:
# Distinct computed over a single set of sentences
metric = BBMetric.load_metric("distinct")

# ngram_size is optional, defaults to 3
distinct_result = metric.compute(sentences=sentences_basic, ngram_size=2)
# Bare last expression, so the notebook renders the result as the cell output
distinct_result

In [None]:
# Emotion labeling computed over a single set of sentences
metric = BBMetric.load_metric("emotion")

emotion_result = metric.compute(sentences=sentences_basic)
print(emotion_result)

In [None]:
# Semantic Similarity (similar to BERTScore) between two sets of sentences
metric = BBMetric.load_metric("semantic similarity")

similarity_result = metric.compute(sentences_a=sentences_basic, sentences_b=sentences_basic_2)
print(similarity_result)

In [None]:
# Semantic Answer Similarity between predictions and references
metric = BBMetric.load_metric("semantic answer similarity")

answer_similarity_result = metric.compute(predictions=sentences_basic, references=sentences_basic_2)
print(answer_similarity_result)

In [None]:
# Semantic Classifier on a set of sentences
# Note: This Metric is not autonomous from project folder structure and datatypes! #
metric = BBMetric.load_metric("semantic classifier")

# Optional training arguments, spelled out explicitly:
# n_shuffles is optional, defaults to 10
# from_saved_embeddings is optional, defaults to True
# shutdown_at_end is optional, defaults to False
optional_train_args = dict(n_shuffles=10, from_saved_embeddings=True, shutdown_at_end=False)
metric.train(character='Barney', character_dict=character_dict, source_dict=source_dict,
             random_state=random_state, base_folder=base_folder, **optional_train_args)

# Run the Barney semantic classifier on each set of sentences in turn
for sentence_set in (sentences_basic, sentences_vader, sentences_barney):
    print(metric.compute(character='Barney', character_dict=character_dict, base_folder=base_folder,
                         sentences=sentence_set))

In [None]:
# Perplexity of the Barney model on the encoded test set (taken from one of our datasets)
metric = BBMetric.load_metric("perplexity")

perplexity_result = metric.compute(model=model_barney, encoded_test_set=barney_test_set)
print(perplexity_result)

In [None]:
# Human - coherence metric
metric = BBMetric.load_metric("human - coherence")

# CSV file of the human scores. It is rooted at base_folder, like the rest of
# the notebook's data paths, instead of the current working directory, and is
# built once so that train and compute are guaranteed to use the same file.
coherence_csv = os.path.join(base_folder, "Data", "Characters", "Default", "humancoherence.csv")

# Ask a human to perform evaluation
# length is optional, defaults to 5
metric.train(model=model, tokenizer=tokenizer, filepath=coherence_csv, length=5)

# Print score averages
metric.compute(filepath=coherence_csv)

In [None]:
# Human - consistency metric
metric = BBMetric.load_metric("human - consistency")

# CSV file of the human scores. It is rooted at base_folder, like the rest of
# the notebook's data paths, instead of the current working directory, and is
# built once so that train and compute are guaranteed to use the same file.
consistency_csv = os.path.join(base_folder, "Data", "Characters", "Default", "humanconsistency.csv")

# Ask a human to perform evaluation
metric.train(model=model, tokenizer=tokenizer, filepath=consistency_csv)

# Print score averages
metric.compute(filepath=consistency_csv)

In [None]:
# Human - style metric
metric = BBMetric.load_metric("human - style")

# CSV file of the human scores. It is rooted at base_folder rather than a
# hard-coded '..', so the same path also resolves when base_folder points
# elsewhere (e.g. the mounted Drive folder in Colab).
style_csv = os.path.join(base_folder, "Data", "Characters", "Default", "humanstyle.csv")

# Ask a human to perform evaluation, using the Barney sample sentences as questions.
# The list is named `sentences_barney`; the previous `barney_sentences` was never
# defined and raised a NameError.
metric.train(model=model, tokenizer=tokenizer, filepath=style_csv,
             questions=sentences_barney)

# Print score averages
metric.compute(filepath=style_csv)