# Loading

In [1]:
from Data.data_dicts import character_dict, source_dict, random_state

# Base pretrained checkpoint: used further down for the tokenizer and for the default model.
model_name = 'microsoft/DialoGPT-small'
# Character whose dataset, checkpoint and cached predictions are used in the cells below.
character = 'Barney' # 'Barney' | 'Sheldon' | 'Harry' | 'Fry' | 'Vader' | 'Joey' | 'Phoebe' | 'Bender' | 'Default'
# Second character; its fine-tuned checkpoint is loaded alongside the first one.
character_2 = 'Sheldon'

In [2]:
# Mount google drive (when running on Colab) and make sure every working folder exists.
import os

# Colab is detected by attempting the import. Only a failed import means "not on Colab";
# any other exception is a real problem and must not be silently swallowed.
try:
    import google.colab
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
    # Install the packages this notebook needs on the Colab runtime.
    for pip_arguments in ("datasets", "transformers", "rouge_score", "-U sentence-transformers"):
        os.system("pip install " + pip_arguments)
else:
    base_folder = os.getcwd()

characters_root = os.path.join(base_folder, 'Data', 'Characters')

# Input and output folders currently point at the same per-character directory;
# both names are kept because later cells reference them separately.
in_folder = os.path.join(characters_root, character)
out_folder = os.path.join(characters_root, character)

in_folder_2 = os.path.join(characters_root, character_2)
out_folder_2 = os.path.join(characters_root, character_2)

in_folder_def = os.path.join(characters_root, 'Default')
out_folder_def = os.path.join(characters_root, 'Default')

metrics_folder = os.path.join(base_folder, 'Metrics')

# exist_ok=True makes the creation idempotent and removes the check-then-create race.
for required_folder in (in_folder, out_folder,
                        in_folder_2, out_folder_2,
                        in_folder_def, out_folder_def,
                        metrics_folder):
    os.makedirs(required_folder, exist_ok=True)

In [3]:
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
import json
import numpy as np
import time
import scipy as sp

In [4]:
def save_as_json(filepath, filename, data):
    """Write `data` as 4-space-indented JSON to `<filepath>/<filename>.json`.

    The directory `filepath` is created first when it does not exist.
    An existing file with the same name is overwritten.
    """
    directory_missing = not os.path.exists(filepath)
    if directory_missing:
        os.makedirs(filepath, exist_ok=True)
    target_path = os.path.join(filepath, filename + ".json")
    with open(target_path, 'w') as handle:
        serialized = json.dumps(data, indent=4)
        handle.write(serialized)

def load_from_json(filepath, filename):
    """Read `<filepath>/<filename>.json` and return the parsed content.

    Returns an empty dict when that file does not exist.
    """
    source_path = os.path.join(filepath, filename + '.json')
    if os.path.exists(source_path):
        with open(source_path, 'r') as handle:
            return json.load(handle)
    return dict()

In [5]:
from datasets import load_dataset, DatasetDict

def load_df(character):
    """Load a character's CSV and split it into train / test / val.

    The file read is `<base_folder>/Data/Characters/<character>/<character>.csv`.
    Both splits are seeded with the notebook-level `random_state`.

    Returns:
        DatasetDict with keys 'train', 'test' and 'val'.
    """
    csv_path = os.path.join(base_folder, "Data", "Characters", character, character+'.csv')
    cache_path = os.path.join(base_folder, "cache")

    loaded = load_dataset('csv', data_files=csv_path, cache_dir=cache_path)

    # First cut: 15% of the rows are held out from training.
    train_vs_holdout = loaded['train'].train_test_split(test_size=0.15, seed=random_state)
    # Second cut: 33% of the held-out rows become validation, the remainder is the test set
    # (overall roughly 85% train / 10% test / 5% validation).
    test_vs_val = train_vs_holdout['test'].train_test_split(test_size=0.33, seed=random_state)

    return DatasetDict({
        'train': train_vs_holdout['train'],
        'test': test_vs_val['train'],
        'val': test_vs_val['test']
    })

In [6]:
def construct_conv(row, tokenizer):
    """Flatten one dataset row into a single fixed-length token sequence.

    The row's values are taken in reverse order, tokenized, each followed by the
    tokenizer's EOS token, and concatenated. The result is then padded with the
    id of '#' (attention 0) or truncated so that it is exactly MAX_LENGTH tokens
    long; a truncated sequence always ends with EOS.

    Args:
        row: mapping whose values are the utterance strings of one example.
        tokenizer: callable returning 'input_ids' / 'attention_mask' lists for a
            list of strings, and exposing `encode` and `eos_token_id`.

    Returns:
        The tokenizer output with flat 'input_ids', 'attention_mask' and 'labels'
        entries, each of length MAX_LENGTH ('labels' is a copy of 'input_ids').
    """
    MAX_LENGTH = 512
    utterances = list(reversed(list(row.values())))
    model_inputs = tokenizer(utterances)
    tokenizer_pad_token_id = tokenizer.encode('#')[0]

    # Concatenate all utterances, closing each one with EOS (which is attended to).
    input_ids = []
    attention_mask = []
    for ids, mask in zip(model_inputs['input_ids'], model_inputs['attention_mask']):
        input_ids.extend(ids)
        input_ids.append(tokenizer.eos_token_id)
        attention_mask.extend(mask)
        attention_mask.append(1)

    missing = MAX_LENGTH - len(input_ids)
    if missing > 0:
        input_ids += [tokenizer_pad_token_id] * missing
        attention_mask += [0] * missing
    elif missing < 0:
        # Keep exactly MAX_LENGTH tokens. The previous slice ([:MAX_LENGTH-1]) produced
        # 511-token examples, which did not match the 512-token padded ones.
        input_ids = input_ids[:MAX_LENGTH]
        input_ids[-1] = tokenizer.eos_token_id
        attention_mask = attention_mask[:MAX_LENGTH]
        attention_mask[-1] = 1

    model_inputs['input_ids'] = input_ids
    model_inputs['attention_mask'] = attention_mask
    # Copy so that later in-place edits of one list cannot silently change the other.
    model_inputs["labels"] = list(input_ids)
    return model_inputs

def preprocess_function(examples):
    """Build model inputs for `examples` using the notebook-level `tokenizer`.

    Sets '#' as the tokenizer's pad token, then delegates to `construct_conv`.
    """
    tokenizer.pad_token = '#'
    return construct_conv(examples, tokenizer)

In [7]:
# Set the HF_DATASETS_CACHE environment variable to the project's cache folder,
# then load the train / test / val splits for `character`.
os.environ["HF_DATASETS_CACHE"] = os.path.join(base_folder, "cache")
character_hg = load_df(character)

Using custom data configuration default-bf922d7ec0ca33b8
Reusing dataset csv (C:\Users\david\Documents\unibo\natural_language_processing\project\BarneyBot\cache\csv\default-bf922d7ec0ca33b8\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at C:\Users\david\Documents\unibo\natural_language_processing\project\BarneyBot\cache\csv\default-bf922d7ec0ca33b8\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-fc24b00ef0a2bb87.arrow and C:\Users\david\Documents\unibo\natural_language_processing\project\BarneyBot\cache\csv\default-bf922d7ec0ca33b8\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-762e65cc2da69210.arrow
Loading cached split indices for dataset at C:\Users\david\Documents\unibo\natural_language_processing\project\BarneyBot\cache\csv\default-bf922d7ec0ca33b8\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-9584f7c57efdb89c.arrow and C:\Users\david\Documents\unibo\natural_language_processing\project\BarneyBot\cache\csv\default-bf922d7ec0ca33b8\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-ba54ddf4f2459593.arrow


In [8]:
# Checkpoint directories of the two characters, built from each character's
# 'checkpoint_folder' entry in `character_dict`.
checkpoint_folder = os.path.join(out_folder, character_dict[character]['checkpoint_folder'])
checkpoint_folder_2 = os.path.join(out_folder_2, character_dict[character_2]['checkpoint_folder'])

In [9]:
from transformers import TFAutoModelForCausalLM, AutoTokenizer

# Tokenizer of the base checkpoint `model_name`; '#' is set as its padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=os.path.join(base_folder, "cache"))
tokenizer.pad_token = '#'

# `model` / `model_2` are loaded from the two character checkpoint folders;
# `model_def` is the base checkpoint `model_name` loaded as-is.
model = TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=checkpoint_folder)
model_2 = TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=checkpoint_folder_2)
model_def = TFAutoModelForCausalLM.from_pretrained(model_name, cache_dir=os.path.join(base_folder, "cache"))

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at C:\Users\david\Documents\unibo\natural_language_processing\project\BarneyBot\Data\Characters\Barney\barney_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at C:\Users\david\Documents\unibo\natural_language_processing\project\BarneyBot\Data\Characters\Sheldon\sheldon_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from 

# Metrics Preparation

In [10]:
# The 'context' column of the test split: the prompts passed to get_predictions_cached.
sample_questions = character_hg['test']['context']

In [11]:
# Decoding settings read by get_predictions_cached:
# `n_beams` for the "Beam Search" method, `top_k` / `top_p` for the "Sampling" method.
n_beams = 3
top_k = 50
top_p = 0.92

def get_predictions_cached(sample_questions, model, filename, generation_method, override_predictions=False):
    """Return generated token ids for every question, using a JSON file as cache.

    Args:
        sample_questions: iterable of context strings.
        model: object exposing `generate(...)`; it is called once per question.
        filename: name of the cache file inside the notebook-level `in_folder`.
        generation_method: "Greedy", "Beam Search" or "Sampling".
        override_predictions: when True, an existing cache file is ignored and
            the predictions are regenerated and written again.

    Returns:
        A list with one list of token ids per question: the generated sequence
        with the prompt tokens removed.

    Raises:
        ValueError: when predictions must be generated and `generation_method`
            is not one of the supported names.
    """
    prediction_path = os.path.join(in_folder, filename)
    if os.path.exists(prediction_path) and not override_predictions:
        print("Loading predictions from stored file")
        with open(prediction_path, 'r') as file:
            predictions = json.load(file)
        print("Loaded predictions from stored file")
        return predictions

    # Extra `generate` arguments per decoding strategy. The beam count must be passed
    # as `num_beams`; the former `n_beams=` keyword is not a `generate` parameter, so
    # beam search was never actually requested. Cache files written before this fix
    # should be regenerated with override_predictions=True.
    generation_kwargs_by_method = {
        "Greedy": {},
        "Beam Search": {"num_beams": n_beams},
        "Sampling": {"do_sample": True, "top_k": top_k, "top_p": top_p},
    }
    if generation_method not in generation_kwargs_by_method:
        # Previously an unknown method hit an unbound (or stale) `generated_answer`.
        raise ValueError(
            "Unknown generation_method {!r}; expected one of {}".format(
                generation_method, sorted(generation_kwargs_by_method)))
    generation_kwargs = generation_kwargs_by_method[generation_method]

    print("Creating predictions")
    predictions = []
    for question in tqdm(sample_questions):
        tokenized_question = tokenizer.encode(question + tokenizer.eos_token, return_tensors='tf')
        # Allow up to 128 newly generated tokens on top of the prompt.
        max_length = 128 + tokenized_question.shape[1]
        generated_answer = model.generate(tokenized_question,
                                          pad_token_id=tokenizer.eos_token_id,
                                          max_length=max_length,
                                          **generation_kwargs)[0].numpy().tolist()
        # Drop the prompt tokens so only the answer is stored.
        predictions.append(generated_answer[len(tokenized_question[0]):])

    # Save predictions as a JSON file
    output_string = json.dumps(predictions)
    with open(prediction_path, 'w') as file:
        file.write(output_string)

    return predictions

In [12]:
# One prediction set per decoding method for `model`; each call uses its own cache file,
# named from the character's 'prediction_filename' entry plus a method suffix.
predictions_greedy = get_predictions_cached(sample_questions, model,
                                            character_dict[character]['prediction_filename'] + '_greedy.json',
                                            "Greedy")
predictions_nbeams = get_predictions_cached(sample_questions, model,
                                            character_dict[character]['prediction_filename'] + '_nbeams.json',
                                            "Beam Search")
predictions_sampling = get_predictions_cached(sample_questions, model,
                                              character_dict[character]['prediction_filename'] + '_sampling.json',
                                              "Sampling")

Loading predictions from stored file
Loaded predictions from stored file
Loading predictions from stored file
Loaded predictions from stored file
Loading predictions from stored file
Loaded predictions from stored file


In [13]:
def get_dataframe_for_metrics(data_test, predictions_greedy, predictions_nbeams, predictions_sampling):
    """Collect contexts, labels and predictions of a test set into one DataFrame.

    Columns:
        ctx / ctx_tk: context text and its token ids (EOS appended).
        lbl / lbl_tk: response text and token ids, only when `data_test` has a
            'response' feature.
        prd_<method> / prd_<method>_tk: decoded text and token ids for each
            non-empty prediction list (greedy, nbeams, sampling).

    Predictions are matched to samples by position.
    """
    has_labels = 'response' in data_test.features
    # Keep only the prediction sets that were actually provided (truthy).
    active_predictions = [
        (tag, token_lists)
        for tag, token_lists in (('greedy', predictions_greedy),
                                 ('nbeams', predictions_nbeams),
                                 ('sampling', predictions_sampling))
        if token_lists
    ]

    columns = {'ctx': [], 'ctx_tk': []}
    if has_labels:
        columns['lbl'] = []
        columns['lbl_tk'] = []
    for tag, _ in active_predictions:
        columns['prd_' + tag] = []
        columns['prd_' + tag + '_tk'] = []

    for position, sample in enumerate(tqdm(data_test)):
        # encode the context sentence, add the eos_token and convert the tensor to nested lists
        context_tokens = tokenizer.encode(sample['context'] + tokenizer.eos_token,
                                          return_tensors='tf').numpy().tolist()
        columns['ctx_tk'].append(context_tokens)
        columns['ctx'].append(sample['context'])
        if has_labels:
            label_tokens = tokenizer.encode(sample['response'] + tokenizer.eos_token,
                                            return_tensors='tf').numpy().tolist()
            columns['lbl'].append(sample['response'])
            columns['lbl_tk'].append(label_tokens)
        for tag, token_lists in active_predictions:
            predicted_tokens = token_lists[position]
            predicted_text = tokenizer.decode(predicted_tokens, skip_special_tokens=True)
            columns['prd_' + tag].append(predicted_text)
            columns['prd_' + tag + '_tk'].append(predicted_tokens)

    return pd.DataFrame(data=columns)

In [14]:
# Build the evaluation table for the test split from the three prediction sets, then display it.
df_char = get_dataframe_for_metrics(character_hg['test'], predictions_greedy, predictions_nbeams, predictions_sampling)
df_char

100%|██████████████████████████████████████████████████████████████████████████████| 522/522 [00:00<00:00, 1614.63it/s]


Unnamed: 0,ctx,ctx_tk,lbl,lbl_tk,prd_greedy,prd_greedy_tk,prd_nbeams,prd_nbeams_tk,prd_sampling,prd_sampling_tk
0,"I know, it's two years of my life I'm never ge...","[[40, 760, 11, 340, 338, 734, 812, 286, 616, 1...",Daddy's home.,"[[48280, 338, 1363, 13, 50256]]","Oh, God!","[5812, 11, 1793, 0, 50256]","Oh, God!","[5812, 11, 1793, 0, 50256]","Oh, of course.","[5812, 11, 286, 1781, 13, 50256]"
1,Wh-Where'd you get a meatball...,"[[1199, 12, 8496, 1549, 345, 651, 257, 6174, 1...",I don't have much time!,"[[40, 836, 470, 423, 881, 640, 0, 50256]]",I don't know. I just saw a meatball sub.,"[40, 836, 470, 760, 13, 314, 655, 2497, 257, 6...",I don't know. I just saw a meatball sub.,"[40, 836, 470, 760, 13, 314, 655, 2497, 257, 6...",You don't remember?,"[1639, 836, 470, 3505, 30, 50256]"
2,"Okay, what is so urgent that you called me and...","[[16454, 11, 644, 318, 523, 18039, 326, 345, 1...",I could tell you knew something was up with me...,"[[40, 714, 1560, 345, 2993, 1223, 373, 510, 35...","I'm sorry, I don't follow you.","[40, 1101, 7926, 11, 314, 836, 470, 1061, 345,...","I'm sorry, I don't follow you.","[40, 1101, 7926, 11, 314, 836, 470, 1061, 345,...",All right. It's time to start?,"[3237, 826, 13, 632, 338, 640, 284, 923, 30, 5..."
3,How much?,"[[2437, 881, 30, 50256]]",A little.,"[[32, 1310, 13, 50256]]",I have not decided. I want to get married in a...,"[40, 423, 407, 3066, 13, 314, 765, 284, 651, 6...",I have not decided. I want to get married in a...,"[40, 423, 407, 3066, 13, 314, 765, 284, 651, 6...","He said he had like 3,000 in suits.","[1544, 531, 339, 550, 588, 513, 11, 830, 287, ..."
4,You're being super nice. It's... freaking me o...,"[[1639, 821, 852, 2208, 3621, 13, 632, 338, 98...","I'm being Barney, and I think tonight's going ...","[[40, 1101, 852, 41921, 11, 290, 314, 892, 997...",I'm not gross. I'm just... gross.,"[40, 1101, 407, 10319, 13, 314, 1101, 655, 986...",I'm not gross. I'm just... gross.,"[40, 1101, 407, 10319, 13, 314, 1101, 655, 986...",Buckminster Fuller?,"[33, 1347, 18462, 31863, 30, 50256]"
...,...,...,...,...,...,...,...,...,...,...
517,"Okay, I want to lay down some ground rules for...","[[16454, 11, 314, 765, 284, 3830, 866, 617, 23...","Well, well, well. How rich. You make me promis...","[[5779, 11, 880, 11, 880, 13, 1374, 5527, 13, ...","Oh, I know. I just want to be as awesome as sh...","[5812, 11, 314, 760, 13, 314, 655, 765, 284, 3...","Oh, I know. I just want to be as awesome as sh...","[5812, 11, 314, 760, 13, 314, 655, 765, 284, 3...","No, I'll be right over.","[2949, 11, 314, 1183, 307, 826, 625, 13, 50256]"
518,It looks to be a... sacred... spa.,"[[1026, 3073, 284, 307, 257, 986, 13626, 986, ...",Owl. How do we go? We will do what? Jump?,"[[46, 40989, 13, 1374, 466, 356, 467, 30, 775,...","I'm sorry, I don't follow you.","[40, 1101, 7926, 11, 314, 836, 470, 1061, 345,...","I'm sorry, I don't follow you.","[40, 1101, 7926, 11, 314, 836, 470, 1061, 345,...",It's called a spa.,"[1026, 338, 1444, 257, 41900, 13, 50256]"
519,That's putting it a bit strongly.,"[[2504, 338, 5137, 340, 257, 1643, 7634, 13, 5...",A bit strongly. She's not my girlfriend.,"[[32, 1643, 7634, 13, 1375, 338, 407, 616, 110...",I'm not going to put it in a little strong.,"[40, 1101, 407, 1016, 284, 1234, 340, 287, 257...",I'm not going to put it in a little strong.,"[40, 1101, 407, 1016, 284, 1234, 340, 287, 257...",And here comes the fun part.,"[1870, 994, 2058, 262, 1257, 636, 13, 50256]"
520,I do.,"[[40, 466, 13, 50256]]",I'm gonna head out to a reggae concert. I'm a ...,"[[40, 1101, 8066, 1182, 503, 284, 257, 842, 25...",You're a good man.,"[1639, 821, 257, 922, 582, 13, 50256]",You're a good man.,"[1639, 821, 257, 922, 582, 13, 50256]",Hey.,"[10814, 13, 50256]"


# Metrics For Character 1

In [15]:
def ccl_sim(ctx_lbl, ctx_cht, lbl_cht):
    """Combine three pairwise scores into one context-chatbot-label score.

    The result is the mean of two squared terms: how close the context-label and
    context-chatbot scores are to each other (1 - |difference|), and the
    label-chatbot score itself.
    """
    agreement = 1 - abs(ctx_lbl - ctx_cht)
    return (agreement**2 + lbl_cht**2) / 2

In [26]:
from Lib.BBMetrics import BBMetric

def compute_sample_metrics(context_sentence, label_response, chatbot_response, verbose=True,
                           label_chatbot_symmetry=False):
    """Compute every per-sample metric for one (context, label, chatbot) triple.

    Args:
        context_sentence: the context text.
        label_response: the reference response.
        chatbot_response: the generated response.
        verbose: when True, print the sample and every score.
        label_chatbot_symmetry: when True, the two responses are named
            'chatbota' / 'chatbotb' instead of 'label' / 'chatbot' in the
            metadata and in the printed output.

    Returns:
        dict with:
          * 'metadata': {'ordering': [...]} naming the four entries of each list;
          * 'semantic similarity', 'bleu', 'rouge l', 'semantic answer similarity':
            [context-vs-label, context-vs-chatbot, label-vs-chatbot, ccl_sim of the three];
          * 'distinct': [context, label, chatbot, ccl_sim of the three];
          * 'emotion': [context result, label result, chatbot result,
            Pearson correlation between the label and chatbot emotion scores].
    """
    # Imported here from the public location: `sp.stats.stats` is a deprecated namespace,
    # and `import scipy as sp` alone does not ensure that the `stats` subpackage is loaded.
    from scipy.stats import pearsonr

    scores = {}
    lbl_text = 'label' if not label_chatbot_symmetry else 'chatbota'
    cht_text = 'chatbot' if not label_chatbot_symmetry else 'chatbotb'

    scores['metadata'] = {}
    scores['metadata']['ordering'] = ['context-'+lbl_text,
                                      'context-'+cht_text,
                                      cht_text+'-'+lbl_text,
                                      'ccl']

    if verbose:
        # prints the sample
        print('* context:', context_sentence) 
        print('* ' + lbl_text  + ':  ', label_response)
        print('* ' + cht_text  + ':', chatbot_response) 
    # 1) computes metrics for semantic similarity
    metric = BBMetric.load_metric("semantic similarity")
    scores['semantic similarity'] = [metric.compute(sentences_a=context_sentence,
                                                      sentences_b=label_response)['score']]
    scores['semantic similarity'].append(metric.compute(sentences_a=context_sentence,
                                                      sentences_b=chatbot_response)['score'])
    scores['semantic similarity'].append(metric.compute(sentences_a=label_response,
                                                      sentences_b=chatbot_response)['score'])
    scores['semantic similarity'].append(ccl_sim(scores['semantic similarity'][0],
                                                 scores['semantic similarity'][1],
                                                 scores['semantic similarity'][2]))
    if verbose:
        print('=== SEMANTIC SIMILARITY ===')
        print('context-'+lbl_text+' similarity:   ', scores['semantic similarity'][0])
        print('context-'+cht_text+' similarity: ', scores['semantic similarity'][1])
        print(cht_text+'-'+lbl_text+' similarity:   ', scores['semantic similarity'][2])
        print('ccl-sim similarity:            ', scores['semantic similarity'][3])
    # 2) computes metrics for bleu
    metric = BBMetric.load_metric("bleu")
    scores['bleu'] = [metric.compute(predictions=label_response, references=context_sentence)['score']]
    scores['bleu'].append(metric.compute(predictions=chatbot_response, references=context_sentence)['score'])
    scores['bleu'].append(metric.compute(predictions=chatbot_response, references=label_response)['score'])
    scores['bleu'].append(ccl_sim(scores['bleu'][0],
                                  scores['bleu'][1],
                                  scores['bleu'][2]))
    if verbose:
        print('===         BLEU         ===')
        print('context-to-'+lbl_text+' bleu:      ', scores['bleu'][0])
        print('context-to-'+cht_text+' bleu:    ', scores['bleu'][1])
        print(lbl_text+'-to-'+cht_text+' bleu:      ', scores['bleu'][2])
        print('ccl-sim bleu:            ', scores['bleu'][3])
    # 3) computes metrics for rouge-L
    metric = BBMetric.load_metric("rouge l")
    scores['rouge l'] = [metric.compute(predictions=label_response, references=context_sentence)['score']]
    scores['rouge l'].append(metric.compute(predictions=chatbot_response, references=context_sentence)['score'])
    scores['rouge l'].append(metric.compute(predictions=chatbot_response, references=label_response)['score'])
    scores['rouge l'].append(ccl_sim(scores['rouge l'][0],
                                     scores['rouge l'][1],
                                     scores['rouge l'][2]))
    if verbose:
        print('===        ROUGE-L       ===')
        print('context-to-'+lbl_text+' rouge:     ', scores['rouge l'][0])
        print('context-to-'+cht_text+' rouge:   ', scores['rouge l'][1])
        print(lbl_text+'-to-'+cht_text+' rouge:     ', scores['rouge l'][2])
        print('ccl-sim rouge:            ', scores['rouge l'][3])
    # 4) computes metrics for distinct (one score per sentence, not per pair)
    metric = BBMetric.load_metric("distinct")
    scores['distinct'] = [metric.compute(sentences=context_sentence)['score']]
    scores['distinct'].append(metric.compute(sentences=label_response)['score'])
    scores['distinct'].append(metric.compute(sentences=chatbot_response)['score'])
    scores['distinct'].append(ccl_sim(scores['distinct'][0],
                                      scores['distinct'][1],
                                      scores['distinct'][2]))
    if verbose:
        print('===       DISTINCT      ===')
        print('context distinct:          ', scores['distinct'][0])
        print(lbl_text+' distinct:          ', scores['distinct'][1])
        print(cht_text+' distinct:          ', scores['distinct'][2])
        print('ccl-sim distinct:          ', scores['distinct'][3])
    # 5) computes sas metric
    metric = BBMetric.load_metric("semantic answer similarity")
    scores['semantic answer similarity'] = [metric.compute(predictions=context_sentence,
                                                    references=label_response)['score']]
    scores['semantic answer similarity'].append(metric.compute(predictions=context_sentence,
                                                        references=chatbot_response)['score'])
    scores['semantic answer similarity'].append(metric.compute(predictions=label_response,
                                                        references=chatbot_response)['score'])
    scores['semantic answer similarity'].append(ccl_sim(scores['semantic answer similarity'][0],
                                                        scores['semantic answer similarity'][1],
                                                        scores['semantic answer similarity'][2]))
    if verbose:
        print('===         SAS         ===')
        print('context-'+lbl_text+' sas:          ', scores['semantic answer similarity'][0])
        print('context-'+cht_text+' sas:        ', scores['semantic answer similarity'][1])
        print(lbl_text+'-'+cht_text+' sas:          ', scores['semantic answer similarity'][2])
        print('ccl-sim sas:               ', scores['semantic answer similarity'][3])
    # 6) computes emotion metric; the full result (labels and scores) is kept for each sentence
    metric = BBMetric.load_metric("emotion")
    scores['emotion'] = [metric.compute(sentences=context_sentence)]
    scores['emotion'].append(metric.compute(sentences=label_response))
    scores['emotion'].append(metric.compute(sentences=chatbot_response))
    # Correlation between the label's and the chatbot's emotion score vectors.
    scores['emotion'].append(pearsonr(scores['emotion'][1]['score'],
                                      scores['emotion'][2]['score'])[0])
    if verbose:
        print('===       EMOTION       ===')
        print('context emotions:            \n', list(zip(scores['emotion'][0]['label'], scores['emotion'][0]['score'])))
        print(lbl_text+' emotions:              \n', list(zip(scores['emotion'][1]['label'], scores['emotion'][1]['score'])))
        print(cht_text+' emotions:            \n', list(zip(scores['emotion'][2]['label'], scores['emotion'][2]['score'])))
        print(lbl_text+'-'+cht_text+'emotion corr:  \n', scores['emotion'][3])
    return scores

In [36]:
def compute_set_metrics(model, tokenizer, context_sentences, label_responses, chatbot_responses, character, verbose=True,
                        classifier_n_sentences=50, include_sentences=False, label_chatbot_symmetry=False):
    scores = {}
    
    lbl_text = 'label' if not label_chatbot_symmetry else 'chatbota'
    cht_text = 'chatbot' if not label_chatbot_symmetry else 'chatbotb'
    
    scores['metadata'] = {}
    scores['metadata']['ordering'] = ['context-'+lbl_text,
                                      'context-'+cht_text,
                                      cht_text+'-'+lbl_text,
                                      'ccl']
    
    # 0) computes metrics for perplexity
    metric = BBMetric.load_metric("semantic similarity")
    scores['semantic similarity'] = [metric.compute(sentences_a=context_sentences,
                                            sentences_b=label_responses)]
    scores['semantic similarity'].append(metric.compute(sentences_a=context_sentences,
                                            sentences_b=chatbot_responses)),
    scores['semantic similarity'].append(metric.compute(sentences_a=label_responses,
                                              sentences_b=chatbot_responses))
    scores['semantic similarity'].append(ccl_sim(scores['semantic similarity'][0]['score'],
                                                 scores['semantic similarity'][1]['score'],
                                                 scores['semantic similarity'][2]['score']))
    if verbose:
        print('=== SEMANTIC SIMILARITY ===')
        print('context-'+lbl_text+' similarity:   ', scores['semantic similarity'][0])
        print('context-'+cht_text+' similarity: ', scores['semantic similarity'][1])
        print(cht_text+'-'+lbl_text+' similarity:   ', scores['semantic similarity'][2])
        print('ccl-sim similarity:            ', scores['semantic similarity'][3])
    # 1) computes metrics for perplexity
    if not label_chatbot_symmetry:
        metric = BBMetric.load_metric("perplexity")
        scores['perplexity'] = metric.compute(model=model, tokenizer=tokenizer, sentences=chatbot_responses)['score_concat']
        if verbose:
            print('===       PERPLEXITY     ===')
            print('chatbot perplexity:         ', scores['perplexity'])
    elif verbose:
        print("Symmetric mode, skipping Perplexity.")
    # 2) computes metrics for bleu
    metric = BBMetric.load_metric("bleu")
    scores['bleu'] = [metric.compute(predictions=label_responses, references=context_sentences)]
    scores['bleu'].append(metric.compute(predictions=chatbot_responses, references=context_sentences))
    scores['bleu'].append(metric.compute(predictions=chatbot_responses, references=label_responses))
    scores['bleu'].append(ccl_sim(scores['bleu'][0]['score'],
                                  scores['bleu'][1]['score'],
                                  scores['bleu'][2]['score']))
    if verbose:
        print('===         BLEU         ===')
        print('context-to-'+lbl_text+' bleu:      ', scores['bleu'][0])
        print('context-to-'+cht_text+' bleu:    ', scores['bleu'][1])
        print(lbl_text+'-to-'+cht_text+' bleu:      ', scores['bleu'][2])
        print('ccl-sim bleu:            ', scores['bleu'][3])
    # 3) computes metrics for rouge-L
    metric = BBMetric.load_metric("rouge l")
    scores['rouge l'] = [metric.compute(predictions=label_responses, references=context_sentences)]
    scores['rouge l'].append(metric.compute(predictions=chatbot_responses, references=context_sentences))
    scores['rouge l'].append(metric.compute(predictions=chatbot_responses, references=label_responses))
    scores['rouge l'].append(ccl_sim(scores['rouge l'][0]['score'],
                                     scores['rouge l'][1]['score'],
                                     scores['rouge l'][2]['score']))
    if verbose:
        print('===        ROUGE-L       ===')
        print('context-to-'+lbl_text+' rouge:     ', scores['rouge l'][0])
        print('context-to-'+cht_text+' rouge:   ', scores['rouge l'][1])
        print(lbl_text+'-to-'+cht_text+' rouge:     ', scores['rouge l'][2])
        print('ccl-sim rouge:            ', scores['rouge l'][3])
    # 4) computes metrics for distinct
    metric = BBMetric.load_metric("distinct")
    scores['distinct'] = [metric.compute(sentences=context_sentences)]
    scores['distinct'].append(metric.compute(sentences=label_responses))
    scores['distinct'].append(metric.compute(sentences=chatbot_responses))
    scores['distinct'].append(ccl_sim(scores['distinct'][0]['score'],
                                      scores['distinct'][1]['score'],
                                      scores['distinct'][2]['score']))
    if verbose:
        print('===       DISTINCT      ===')
        print('context distinct:          ', scores['distinct'][0])
        print(lbl_text+' distinct:          ', scores['distinct'][1])
        print(cht_text+' distinct:          ', scores['distinct'][2])
        print('ccl-sim distinct:          ', scores['distinct'][3])
    # 6) computes emotion metric
    metric = BBMetric.load_metric("emotion")
    scores['emotion'] = [metric.compute(sentences=context_sentence)]
    scores['emotion'].append(metric.compute(sentences=label_response))
    scores['emotion'].append(metric.compute(sentences=chatbot_response))
    scores['emotion'].append(sp.stats.stats.pearsonr(scores['emotion'][1]['score'],
                                                     scores['emotion'][2]['score'])[0])
    if verbose:
        print('===       EMOTION       ===')
        print('context emotions:            \n', list(zip(scores['emotion'][0]['label'], scores['emotion'][0]['score'])))
        print(lbl_text+' emotions:              \n', list(zip(scores['emotion'][1]['label'], scores['emotion'][1]['score'])))
        print(cht_text+' emotions:            \n', list(zip(scores['emotion'][2]['label'], scores['emotion'][2]['score'])))
        print(lbl_text+'-'+cht_text+'emotion corr:  \n', scores['emotion'][3])
    # 8) computes sas metric
    metric = BBMetric.load_metric("semantic answer similarity")
    scores['semantic answer similarity'] = [metric.compute(predictions=context_sentences,
                                                    references=label_responses)]
    scores['semantic answer similarity'].append(metric.compute(predictions=context_sentences,
                                                        references=chatbot_responses))
    scores['semantic answer similarity'].append(metric.compute(predictions=label_responses,
                                                        references=chatbot_responses))
    scores['semantic answer similarity'].append(ccl_sim(scores['semantic answer similarity'][0]['score'],
                                                        scores['semantic answer similarity'][1]['score'],
                                                        scores['semantic answer similarity'][2]['score']))
    if verbose:
        print('===         SAS         ===')
        print('context-'+lbl_text+' sas:          ', scores['semantic answer similarity'][0])
        print('context-'+cht_text+' sas:        ', scores['semantic answer similarity'][1])
        print(lbl_text+'-'+cht_text+' sas:          ', scores['semantic answer similarity'][2])
        print('ccl-sim sas:               ', scores['semantic answer similarity'][3])
    # 9) computes metrics for semantic classifier
    metric = BBMetric.load_metric("semantic classifier")
    start_time = time.time()
    scores['semantic classifier'] = [metric.compute(character=character, character_dict=character_dict, 
                                                   base_folder=base_folder, sentences=label_responses,
                                                   n_sentences=classifier_n_sentences)]
    scores['semantic classifier'].append(metric.compute(character=character, character_dict=character_dict, 
                                                   base_folder=base_folder, sentences=chatbot_responses,
                                                   n_sentences=classifier_n_sentences))
    end_time = time.time()
    if verbose:
        print('=== SEMANTIC CLASSIFIER ===')
        print('sem-classifier '+lbl_text+':                ', scores['semantic classifier'][0])
        print('sem-classifier '+cht_text+':                  ', scores['semantic classifier'][1])
        print('time elapsed computing semantic classifier:  {:.2f} s'.format(end_time - start_time))
    if not label_chatbot_symmetry and os.path.exists(os.path.join(os.getcwd(), "Data", "Characters", character, "humancoherence.csv")):
        scores['human'] = {}
        metric = BBMetric.load_metric("human - coherence")
        scores['human']['coherence'] = metric.compute(filepath=os.path.join(os.getcwd(), "Data", "Characters",
                                                                            character, "humancoherence.csv"))
        metric = BBMetric.load_metric("human - style")
        scores['human']['style'] = metric.compute(filepath=os.path.join(os.getcwd(), "Data", "Characters",
                                                                        character, "humanstyle.csv"))
        metric = BBMetric.load_metric("human - consistency")
        scores['human']['consistency'] = metric.compute(filepath=os.path.join(os.getcwd(), "Data", "Characters",
                                                                              character, "humanconsistency.csv"))
        if verbose:
            print('===    HUMAN METRICS    ===')
            print('coherence:                 ', scores['human']['coherence'])
            print('consistency:               ', scores['human']['consistency'])
            print('style:                     ', scores['human']['style'])
    elif verbose:
        print("Symmetric mode, skipping Human metrics.")
    if include_sentences:
        sentences_df = {}
        sentences_df['context'] = context_sentences
        sentences_df[lbl_text] = label_responses
        sentences_df[cht_text] = chatbot_responses
        scores['sentences'] = sentences_df
        if verbose:
            print('===      SENTENCES      ===')
            for i in range(len(context_sentences)):
                print("* context: ", context_sentences[i])
                print("* " + lbl_text + ":", label_responses[i])
                print("* " + cht_text + ":", chatbot_responses[i])
                print()
    elif verbose:
        print("Skipping sentence outputting.")
    return scores

In [28]:
# Print the per-sample metrics for the first row of the test dataframe.
for i in range(1):
    print(f"##### Sample {i + 1} #####")
    # Context, `prd_greedy` prediction and `lbl` response stored at index `i`.
    context_sentence, chatbot_response, label_response = (
        df_char[column][i] for column in ('ctx', 'prd_greedy', 'lbl')
    )
    compute_sample_metrics(context_sentence, label_response, chatbot_response)
    print()

##### Sample 1 #####
* context: I know, it's two years of my life I'm never getting back. A little part of me just wants to jump the bones of the next guy I see.
* label:   Daddy's home.
* chatbot: Oh, God!
=== SEMANTIC SIMILARITY ===
context-label similarity:    0.009414163418114185
context-chatbot similarity:  0.01723679155111313
chatbot-label similarity:    0.008852469734847546
ccl-sim similarity:             0.4922471517326578
===         BLEU         ===
context-to-label bleu:       0.0
context-to-chatbot bleu:     0.0
label-to-chatbot bleu:       0.0
ccl-sim bleu:             0.5
===        ROUGE-L       ===
context-to-label rouge:      0.0588235294117647
context-to-chatbot rouge:    0.0
label-to-chatbot rouge:      0.0
ccl-sim rouge:             0.4429065743944637
===       DISTINCT      ===
context distinct:           0.20930232558139536
label distinct:           0.0
chatbot distinct:           0.0
ccl-sim distinct:           0.3126014061654948
===         SAS         ===
conte

In [31]:
# Window used when slicing the test dataframe for subset evaluation:
# `i` is the slice start, `set_size` the number of rows taken from there.
i = 30
set_size = 10

In [37]:
print(f"##### Set (Size {set_size}) #####")
# Take the slice i:i+set_size of each column as a plain Python list.
context_sentences, chatbot_responses, label_responses = (
    list(df_char[column][i:i+set_size]) for column in ('ctx', 'prd_greedy', 'lbl')
)
# Kept as the final expression so the notebook displays the returned scores.
compute_set_metrics(
    model, tokenizer,
    context_sentences, label_responses, chatbot_responses,
    character,
)

##### Set (Size 10) #####
=== SEMANTIC SIMILARITY ===
context-label similarity:    {'score': 0.013566510751843452, 'std': 0.004443929065018892}
context-chatbot similarity:  {'score': 0.11592505127191544, 'std': 0.17320799827575684}
chatbot-label similarity:    {'score': 0.056996725499629974, 'std': 0.07998812943696976}
ccl-sim similarity:             0.4045044082474677


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 10.80it/s]


===       PERPLEXITY     ===
chatbot perplexity:          57.42257428571708
===         BLEU         ===
context-to-label bleu:       {'score': 0.0, 'std': 0.0}
context-to-chatbot bleu:     {'score': 0.0, 'std': 0.0}
label-to-chatbot bleu:       {'score': 0.0, 'std': 0.0}
ccl-sim bleu:             0.5
===        ROUGE-L       ===
context-to-label rouge:      {'score': 0.031536954087346025, 'std': 0.04899248320162975}
context-to-chatbot rouge:    {'score': 0.09306625577812018, 'std': 0.16891792560453}
label-to-chatbot rouge:      {'score': 0.02930093089867526, 'std': 0.04120352281931709}
ccl-sim rouge:             0.4407928980682674
===       DISTINCT      ===
context distinct:           {'score': 0.09560764577892737, 'std': 0.0692566770759311}
label distinct:           {'score': 0.13910765029661293, 'std': 0.04298315932942512}
chatbot distinct:           {'score': 0.1097739725969051, 'std': 0.05256244507214358}
ccl-sim distinct:           0.4634712832086868
===       EMOTION       ===


{'metadata': {'ordering': ['context-label',
   'context-chatbot',
   'chatbot-label',
   'ccl']},
 'semantic similarity': [{'score': 0.013566510751843452,
   'std': 0.004443929065018892},
  {'score': 0.11592505127191544, 'std': 0.17320799827575684},
  {'score': 0.056996725499629974, 'std': 0.07998812943696976},
  0.4045044082474677],
 'perplexity': 57.42257428571708,
 'bleu': [{'score': 0.0, 'std': 0.0},
  {'score': 0.0, 'std': 0.0},
  {'score': 0.0, 'std': 0.0},
  0.5],
 'rouge l': [{'score': 0.031536954087346025, 'std': 0.04899248320162975},
  {'score': 0.09306625577812018, 'std': 0.16891792560453},
  {'score': 0.02930093089867526, 'std': 0.04120352281931709},
  0.4407928980682674],
 'distinct': [{'score': 0.09560764577892737, 'std': 0.0692566770759311},
  {'score': 0.13910765029661293, 'std': 0.04298315932942512},
  {'score': 0.1097739725969051, 'std': 0.05256244507214358},
  0.4634712832086868],
 'emotion': [{'score': [0.006270758341997862,
    0.015778280794620514,
    0.000853337

In [None]:
print("##### Full Test Set #####")
# Whole columns of the test dataframe, materialised as Python lists.
context_sentences, chatbot_responses, label_responses = (
    list(df_char[column]) for column in ('ctx', 'prd_greedy', 'lbl')
)
# Labels are passed before the chatbot responses, matching the other calls in this notebook.
scores = compute_set_metrics(
    model, tokenizer,
    context_sentences, label_responses, chatbot_responses,
    character,
    classifier_n_sentences=75,
)

In [None]:
# Store the full-test-set scores in the metrics folder as '<character>_base_metrics'.
save_as_json(metrics_folder, f'{character}_base_metrics', scores)

# Metrics Between Character 1 & Character 2

In [None]:
def get_predictions_small(sample_questions, model, generation_method):
    """Generate a reply for every question and return the replies as token ids.

    Each question is suffixed with the tokenizer's EOS token, encoded as a
    TensorFlow tensor and passed to `model.generate`. The prompt tokens are
    stripped from the generated sequence, so every returned entry holds only
    the newly generated token ids (they are not decoded to text here).

    The notebook-level globals `tokenizer`, `n_beams`, `top_k` and `top_p`
    are read at call time.

    Args:
        sample_questions: iterable of question strings.
        model: object exposing `generate(...)`; it is called with the encoded question.
        generation_method: one of "Greedy", "Beam Search" or "Sampling".

    Returns:
        A list with one list of token ids per question, in input order.

    Raises:
        ValueError: if `generation_method` is not one of the supported names.
    """
    supported_methods = ("Greedy", "Beam Search", "Sampling")
    # Fail early: an unknown method would otherwise leave `generated_answer`
    # unbound on the first question, or silently reuse the previous answer.
    if generation_method not in supported_methods:
        raise ValueError("Unknown generation method " + repr(generation_method) +
                         ", expected one of " + str(supported_methods))
    print("Creating predictions")
    predictions = list()
    for x in tqdm(sample_questions):
        tokenized_question = tokenizer.encode(x + tokenizer.eos_token, return_tensors='tf')
        prompt_length = tokenized_question.shape[1]
        # `max_length` counts the prompt too, so this allows up to 128 new tokens.
        generation_kwargs = {'pad_token_id': tokenizer.eos_token_id,
                             'max_length': 128 + prompt_length}
        if generation_method == "Beam Search":
            # The keyword understood by `generate` is `num_beams` (not `n_beams`).
            generation_kwargs['num_beams'] = n_beams
        elif generation_method == "Sampling":
            generation_kwargs.update(do_sample=True, top_k=top_k, top_p=top_p)
        generated_answer = model.generate(tokenized_question,
                                          **generation_kwargs)[0].numpy().tolist()
        # Drop the echoed prompt, keeping only the generated continuation.
        predictions.append(generated_answer[prompt_length:])
    return predictions

In [None]:
# Load the common-contexts CSV with the 'csv' builder, caching under <base_folder>/cache.
# NOTE(review): `load_dataset` is presumably the Hugging Face `datasets` function — confirm the import.
df_common = load_dataset(
    'csv',
    cache_dir=os.path.join(base_folder, "cache"),
    data_files=os.path.join(base_folder, 'Data', 'common_dataset.csv'),
)

In [None]:
# Bare expression: the notebook renders the loaded dataset object as this cell's output.
df_common

In [None]:
# Replies of `model` to the common contexts, generated with the "Sampling" method.
predictions_1_sampling = get_predictions_small(
    sample_questions=df_common['train']['context'],
    model=model,
    generation_method="Sampling",
)

In [None]:
# Replies of `model_2` to the same common contexts, generated with the "Sampling" method.
predictions_2_sampling = get_predictions_small(
    sample_questions=df_common['train']['context'],
    model=model_2,
    generation_method="Sampling",
)

In [None]:
# Build one metrics dataframe per model from the common dataset and that
# model's sampled predictions.
# NOTE(review): the two `None` arguments fill parameters of `get_dataframe_for_metrics`
# whose meaning is not visible in this cell — confirm against its definition.
df_common_char_1 = get_dataframe_for_metrics(df_common['train'], None, None, predictions_1_sampling)
df_common_char_2 = get_dataframe_for_metrics(df_common['train'], None, None, predictions_2_sampling)

In [None]:
print(f"##### {character}  Vs. {character_2} #####")
# Contexts and first-model replies come from the first dataframe,
# the second model's replies from the second one.
context_sentences, chatbot_responses = (
    list(df_common_char_1[column]) for column in ('ctx', 'prd_sampling')
)
chatbot_2_responses = list(df_common_char_2['prd_sampling'])
# Symmetric comparison of two generated reply sets: no model, tokenizer or
# character is supplied, and the compared sentences are included in the result.
scores = compute_set_metrics(
    None, None,
    context_sentences, chatbot_responses, chatbot_2_responses,
    None,
    include_sentences=True,
    label_chatbot_symmetry=True,
)

In [None]:
# Store the two-character comparison scores as '<character>_vs_<character_2>_metrics'.
save_as_json(metrics_folder, f'{character}_vs_{character_2}_metrics', scores)

# Metrics Between Different Sampling Methods

In [None]:
# Fresh container: one entry per pair of decoding strategies compared below.
scores = {}
print("##### Greedy vs. N-Beams #####")
context_sentences, greedy_responses, nbeams_responses = (
    list(df_char[column]) for column in ('ctx', 'prd_greedy', 'prd_nbeams')
)
# Symmetric comparison of two generated reply sets; no model or tokenizer is passed.
scores['greedy_vs_nbeams'] = compute_set_metrics(
    None, None,
    context_sentences, greedy_responses, nbeams_responses,
    character,
    classifier_n_sentences=75,
    label_chatbot_symmetry=True,
)

In [None]:
print("##### Greedy vs. Sampling #####")
context_sentences, greedy_responses, sampling_responses = (
    list(df_char[column]) for column in ('ctx', 'prd_greedy', 'prd_sampling')
)
# Symmetric comparison of two generated reply sets; no model or tokenizer is passed.
scores['greedy_vs_sampling'] = compute_set_metrics(
    None, None,
    context_sentences, greedy_responses, sampling_responses,
    character,
    classifier_n_sentences=75,
    label_chatbot_symmetry=True,
)

In [None]:
print("##### N-Beams vs. Sampling #####")
context_sentences, nbeams_responses, sampling_responses = (
    list(df_char[column]) for column in ('ctx', 'prd_nbeams', 'prd_sampling')
)
# Symmetric comparison of two generated reply sets; no model or tokenizer is passed.
scores['nbeams_vs_sampling'] = compute_set_metrics(
    None, None,
    context_sentences, nbeams_responses, sampling_responses,
    character,
    classifier_n_sentences=75,
    label_chatbot_symmetry=True,
)

In [None]:
# Store the decoding-strategy comparison scores as '<character>_sampling_comparison_metrics'.
save_as_json(metrics_folder, f'{character}_sampling_comparison_metrics', scores)

# Metrics Between Non-Finetuned And Character

In [None]:
# Sampled replies of the default model, cached in a JSON file under the 'Default' folder.
# The double underscore in the file name is kept on purpose so the same
# cache file path is used as before.
predictions_def_sampling = get_predictions_cached(
    sample_questions,
    model_def,
    os.path.join(in_folder_def, f'from_{character}_df__sampling.json'),
    "Sampling",
)

In [None]:
# Metrics dataframe for the default model, built from the character's test split
# and the default model's sampled predictions.
# NOTE(review): the two `None` arguments fill parameters of `get_dataframe_for_metrics`
# whose meaning is not visible in this cell — confirm against its definition.
df_char_def = get_dataframe_for_metrics(character_hg['test'], None, None, predictions_def_sampling)

In [None]:
# Print the per-sample metrics comparing default and character replies for the first row.
for i in range(1):
    print(f"##### Sample {i + 1} #####")
    context_sentence, character_response = (
        df_char[column][i] for column in ('ctx', 'prd_sampling')
    )
    default_response = df_char_def['prd_sampling'][i]
    # The default reply is passed second and the character reply third.
    compute_sample_metrics(
        context_sentence,
        default_response,
        character_response,
        label_chatbot_symmetry=True,
    )
    print()

In [None]:
# Slice start (`i`) and length (`set_size`) of the subset compared in this cell.
i = 30
set_size = 50
print(f"##### Set (Size {set_size}) #####")
context_sentences, character_responses = (
    list(df_char[column][i:i+set_size]) for column in ('ctx', 'prd_sampling')
)
default_responses = list(df_char_def['prd_sampling'][i:i+set_size])
# Kept as the final expression so the notebook displays the returned scores.
compute_set_metrics(
    None, None,
    context_sentences, default_responses, character_responses,
    character,
    label_chatbot_symmetry=True,
)

In [None]:
# Compare default-model and character-model sampled replies over the whole test dataframe.
context_sentences   = list(df_char['ctx'])
character_responses = list(df_char['prd_sampling'])
default_responses   = list(df_char_def['prd_sampling'])
# The header reports the number of rows actually evaluated rather than `set_size`,
# which only describes the subset cell and is unrelated to the full test set.
print("##### Full Test Set (Size " + str(len(context_sentences)) + ") #####")
scores = compute_set_metrics(None, None,
                             context_sentences,
                             default_responses,
                             character_responses,
                             character,
                             classifier_n_sentences=75,
                             label_chatbot_symmetry=True)

In [None]:
# Store the default-vs-character scores as '<character>_vs_nonfinetuned_metrics'.
save_as_json(metrics_folder, f'{character}_vs_nonfinetuned_metrics', scores)