# Loading

In [1]:
from Data.data_dicts import character_dict, source_dict, random_state

# Hugging Face checkpoint name used for the tokenizer and the non-finetuned baseline model.
model_name = 'microsoft/DialoGPT-small'
# Primary character whose fine-tuned model is evaluated in this notebook.
character = 'Barney' # 'Barney' | 'Sheldon' | 'Harry' | 'Fry' | 'Vader' | 'Joey' | 'Phoebe' | 'Bender' | 'Default'
# Second character, used for the character-vs-character comparison.
character_2 = 'Sheldon'

In [2]:
# Mount Google Drive when running on Colab; otherwise work from the current directory.
import os

try:
    import google.colab  # imported only to detect whether we are running on Colab
    IN_COLAB = True
except ImportError:
    # Only a missing package means "not on Colab"; any other error should surface.
    IN_COLAB = False

if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive', force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
    os.system("pip install datasets")
    os.system("pip install transformers")
    os.system("pip install rouge_score")
    os.system("pip install -U sentence-transformers")
else:
    base_folder = os.getcwd()


def _ensure_character_folder(name):
    """Return `<base_folder>/Data/Characters/<name>`, creating it if it does not exist."""
    folder = os.path.join(base_folder, 'Data', 'Characters', name)
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(folder, exist_ok=True)
    return folder


# Input and output folders resolve to the same per-character directory;
# both names are kept because later cells reference each of them.
in_folder = _ensure_character_folder(character)
out_folder = _ensure_character_folder(character)

in_folder_2 = _ensure_character_folder(character_2)
out_folder_2 = _ensure_character_folder(character_2)

in_folder_def = _ensure_character_folder('Default')
out_folder_def = _ensure_character_folder('Default')

In [3]:
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
import json
import numpy as np
import time
import scipy as sp

In [4]:
from datasets import load_dataset, DatasetDict

def load_df(character):
    """Load a character's CSV and split it into train / test / val.

    The CSV is read from `<base_folder>/Data/Characters/<character>/<character>.csv`
    and both splits are seeded with `random_state`.

    Returns:
        A `DatasetDict` with the keys 'train', 'test' and 'val'.
    """
    cache_path = os.path.join(base_folder, "cache")
    csv_path = os.path.join(base_folder, "Data", "Characters", character, character+'.csv')
    full_dataset = load_dataset('csv', data_files=csv_path, cache_dir=cache_path)

    # 85% train / 10% test / 5% validation:
    # hold out 15% first, then split that hold-out roughly 2:1 into test and val
    holdout_split = full_dataset['train'].train_test_split(test_size=0.15, seed=random_state)
    test_val_split = holdout_split['test'].train_test_split(test_size=0.33, seed=random_state)

    splits = {
        'train': holdout_split['train'],
        'test': test_val_split['train'],
        'val': test_val_split['test'],
    }
    return DatasetDict(splits)

In [5]:
def construct_conv(row, tokenizer, max_length=512):
    """Flatten one dataset row into a single fixed-length token sequence.

    The row's values are tokenized in reverse column order, each utterance is
    terminated with the tokenizer's EOS token, and the concatenation is padded
    or truncated to exactly `max_length` tokens.

    Args:
        row: mapping whose values are the utterances of one conversation.
            # NOTE(review): the reversal presumably restores chronological order
            # (oldest context first, response last) -- confirm against the CSV layout.
        tokenizer: callable returning 'input_ids' and 'attention_mask' for a list
            of strings, and exposing `encode` and `eos_token_id`.
        max_length: exact length of the returned sequences (default 512).

    Returns:
        The tokenizer output with flat 'input_ids', 'attention_mask' and 'labels'
        entries, each of length `max_length`.
    """
    utterances = list(reversed(list(row.values())))
    model_inputs = tokenizer(utterances)
    pad_token_id = tokenizer.encode('#')[0]
    eos_token_id = tokenizer.eos_token_id

    # Concatenate all utterances, closing each one with an attended EOS token.
    input_ids = []
    attention_mask = []
    for ids, mask in zip(model_inputs['input_ids'], model_inputs['attention_mask']):
        input_ids.extend(ids)
        input_ids.append(eos_token_id)
        attention_mask.extend(mask)
        attention_mask.append(1)

    if len(input_ids) > max_length:
        # Truncate to exactly max_length (the previous code cut to max_length - 1,
        # producing sequences one token shorter than the padded ones) and make
        # sure the sequence still ends with EOS.
        input_ids = input_ids[:max_length]
        input_ids[-1] = eos_token_id
        attention_mask = attention_mask[:max_length]
        attention_mask[-1] = 1
    else:
        padding = max_length - len(input_ids)
        input_ids += [pad_token_id] * padding
        attention_mask += [0] * padding

    model_inputs['input_ids'] = input_ids
    model_inputs['attention_mask'] = attention_mask
    # Copy so that later edits to the labels cannot silently alter the inputs.
    model_inputs["labels"] = list(input_ids)
    return model_inputs

def preprocess_function(examples):
    """Turn one dataset row into model inputs using the notebook-level `tokenizer`.

    The tokenizer's pad token is set to '#' before the row is handed to
    `construct_conv`.
    """
    tokenizer.pad_token = '#'
    return construct_conv(examples, tokenizer)

In [6]:
# Point the Hugging Face datasets cache at the project-local cache folder.
os.environ["HF_DATASETS_CACHE"] = os.path.join(base_folder, "cache")
# Train / test / val splits for the primary character (see load_df).
character_hg = load_df(character)

Using custom data configuration default-0c3b759885fd8d5f
Reusing dataset csv (D:\University\Esami da Superare\Natural Language Processing\BarneyBot\BarneyBot\cache\csv\default-0c3b759885fd8d5f\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at D:\University\Esami da Superare\Natural Language Processing\BarneyBot\BarneyBot\cache\csv\default-0c3b759885fd8d5f\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-c79122a57f55334a.arrow and D:\University\Esami da Superare\Natural Language Processing\BarneyBot\BarneyBot\cache\csv\default-0c3b759885fd8d5f\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-4faecb1c641ab4b9.arrow
Loading cached split indices for dataset at D:\University\Esami da Superare\Natural Language Processing\BarneyBot\BarneyBot\cache\csv\default-0c3b759885fd8d5f\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-7b672756e3cf7bb2.arrow and D:\University\Esami da Superare\Natural Language Processing\BarneyBot\BarneyBot\cache\csv\default-0c3b759885fd8d5f\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-bb39fb19125a9103.arrow


In [7]:
# Locations of the fine-tuned checkpoints; the folder name for each character comes from character_dict.
checkpoint_folder = os.path.join(out_folder, character_dict[character]['checkpoint_folder'])
checkpoint_folder_2 = os.path.join(out_folder_2, character_dict[character_2]['checkpoint_folder'])

In [8]:
from transformers import TFAutoModelForCausalLM, AutoTokenizer

# Tokenizer of the base checkpoint; '#' is used as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=os.path.join(base_folder, "cache"))
tokenizer.pad_token = '#'

# Fine-tuned models of the two characters, loaded from their checkpoint folders.
model = TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=checkpoint_folder)
model_2 = TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=checkpoint_folder_2)
# Non-finetuned baseline, loaded directly from `model_name`.
model_def = TFAutoModelForCausalLM.from_pretrained(model_name, cache_dir=os.path.join(base_folder, "cache"))

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at D:\University\Esami da Superare\Natural Language Processing\BarneyBot\BarneyBot\Data\Characters\Barney\barney_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


# Metrics Preparation

In [9]:
# Contexts of the test split; these are the prompts fed to the models for generation.
sample_questions = character_hg['test']['context']

In [10]:
# Decoding hyper-parameters shared by the prediction helpers.
n_beams = 3   # number of beams for "Beam Search"
top_k = 50    # top-k cutoff for "Sampling"
top_p = 0.92  # nucleus (top-p) cutoff for "Sampling"

def get_predictions_cached(sample_questions, model, filepath, generation_method, override_predictions=False):
    """Generate one response per question, caching the token ids as JSON.

    Args:
        sample_questions: iterable of context strings used as prompts.
        model: causal language model exposing `generate`.
        filepath: JSON file used as the cache for the predictions.
        generation_method: "Greedy", "Beam Search" or "Sampling".
        override_predictions: when True, regenerate even if `filepath` exists.

    Returns:
        A list with, for each question, the generated token ids (prompt removed).

    Raises:
        ValueError: if predictions must be generated and `generation_method`
            is not one of the supported names.

    NOTE(review): beam search used to be requested with `n_beams=`, which is not
    a `generate()` option (the option is `num_beams`). Cached '*_nbeams.json'
    files produced before this fix should be regenerated with
    `override_predictions=True`.
    """
    # The cache check must look at the same file that is read and written below.
    if os.path.exists(filepath) and not override_predictions:
        print("Loading predictions from stored file")
        with open(filepath, 'r') as file:
            predictions = json.load(file)
        print("Loaded predictions from stored file")
        return predictions

    # Extra generate() arguments for each decoding strategy.
    generation_kwargs = {
        "Greedy": {},
        "Beam Search": {"num_beams": n_beams},
        "Sampling": {"do_sample": True, "top_k": top_k, "top_p": top_p},
    }
    if generation_method not in generation_kwargs:
        raise ValueError("Unknown generation method: " + repr(generation_method))
    extra_kwargs = generation_kwargs[generation_method]

    print("Creating predictions")
    predictions = list()
    for x in tqdm(sample_questions):
        tokenized_question = tokenizer.encode(x + tokenizer.eos_token, return_tensors='tf')
        # Allow up to 128 newly generated tokens on top of the prompt.
        max_length = 128 + tokenized_question.shape[1]
        generated_answer = model.generate(tokenized_question,
                                          pad_token_id=tokenizer.eos_token_id,
                                          max_length=max_length,
                                          **extra_kwargs)[0].numpy().tolist()
        # Keep only the tokens generated after the prompt.
        predictions.append(generated_answer[len(tokenized_question[0]):])

    # Save predictions as a JSON file
    with open(filepath, 'w') as file:
        json.dump(predictions, file)

    return predictions

In [11]:
# Predictions of the primary character's model on the test contexts, one cached
# JSON file per decoding strategy (file prefix comes from character_dict).
predictions_greedy = get_predictions_cached(sample_questions, model,
                                            os.path.join(in_folder, 
                                                         character_dict[character]['prediction_filename'] + '_greedy.json'),
                                            "Greedy")
predictions_nbeams = get_predictions_cached(sample_questions, model,
                                            os.path.join(in_folder, 
                                                         character_dict[character]['prediction_filename'] + '_nbeams.json'),
                                            "Beam Search")
predictions_sampling = get_predictions_cached(sample_questions, model,
                                              os.path.join(in_folder,
                                                           character_dict[character]['prediction_filename'] + '_sampling.json'),
                                              "Sampling")

Loading predictions from stored file
Loaded predictions from stored file
Loading predictions from stored file
Loaded predictions from stored file
Loading predictions from stored file
Loaded predictions from stored file


In [12]:
def _has_response_column(data_test):
    """Tell whether the rows of `data_test` carry a 'response' field."""
    column_names = getattr(data_test, 'column_names', None)
    if column_names is not None:
        return 'response' in column_names
    # Fallback for plain sequences of dict rows.
    try:
        return len(data_test) > 0 and 'response' in data_test[0]
    except (TypeError, KeyError, IndexError):
        return False


def get_dataframe_for_metrics(data_test, predictions_greedy, predictions_nbeams, predictions_sampling):
    """Collect contexts, labels and predictions (text and token ids) in one DataFrame.

    Args:
        data_test: iterable of rows with a 'context' field and, optionally, a
            'response' field (e.g. a dataset split).
        predictions_greedy / predictions_nbeams / predictions_sampling: lists of
            generated token ids aligned with `data_test`, or None/empty to omit
            the corresponding columns.

    Returns:
        A pandas DataFrame with the columns 'ctx', 'ctx_tk', then 'lbl'/'lbl_tk'
        when labels are available, then 'prd_<method>'/'prd_<method>_tk' for each
        prediction list that was provided.
    """
    # The column check must be done on the input data: `df` below is a plain
    # dict and has no `.columns` attribute.
    has_labels = _has_response_column(data_test)

    # Only the prediction lists that were actually provided get columns.
    candidate_predictions = (
        ('prd_greedy', predictions_greedy),
        ('prd_nbeams', predictions_nbeams),
        ('prd_sampling', predictions_sampling),
    )
    prediction_sets = [(name, preds) for name, preds in candidate_predictions if preds]

    df = {'ctx': [], 'ctx_tk': []}
    if has_labels:
        df['lbl'] = []
        df['lbl_tk'] = []
    for name, _ in prediction_sets:
        df[name] = []
        df[name + '_tk'] = []

    for i, sample in enumerate(tqdm(data_test)):
        # encode the context and label sentences, add the eos_token and return a tensor
        ctx = sample['context']
        ctx_tk = tokenizer.encode(ctx + tokenizer.eos_token, return_tensors='tf').numpy().tolist()
        df['ctx_tk'].append(ctx_tk)
        df['ctx'].append(ctx)
        if has_labels:
            lbl = sample['response']
            lbl_tk = tokenizer.encode(lbl + tokenizer.eos_token, return_tensors='tf').numpy().tolist()
            df['lbl'].append(lbl)
            df['lbl_tk'].append(lbl_tk)
        for name, preds in prediction_sets:
            prd_tk = preds[i]
            df[name].append(tokenizer.decode(prd_tk, skip_special_tokens=True))
            df[name + '_tk'].append(prd_tk)
    return pd.DataFrame(data=df)

In [13]:
# Contexts, labels and the three sets of predictions of the primary character on its test split.
df_char = get_dataframe_for_metrics(character_hg['test'], predictions_greedy, predictions_nbeams, predictions_sampling)
df_char

100%|███████████████████████████████████████████████████████████████████████████████| 522/522 [00:00<00:00, 626.65it/s]


Unnamed: 0,ctx,prd_greedy,prd_nbeams,prd_sampling,lbl,ctx_tk,prd_greedy_tk,prd_nbeams_tk,prd_sampling_tk,lbl_tk
0,"I know, it's two years of my life I'm never ge...","Oh, God!","Oh, God!","Oh, of course.",Daddy's home.,"[[40, 760, 11, 340, 338, 734, 812, 286, 616, 1...","[5812, 11, 1793, 0, 50256]","[5812, 11, 1793, 0, 50256]","[5812, 11, 286, 1781, 13, 50256]","[[48280, 338, 1363, 13, 50256]]"
1,Wh-Where'd you get a meatball...,I don't know. I just saw a meatball sub.,I don't know. I just saw a meatball sub.,You don't remember?,I don't have much time!,"[[1199, 12, 8496, 1549, 345, 651, 257, 6174, 1...","[40, 836, 470, 760, 13, 314, 655, 2497, 257, 6...","[40, 836, 470, 760, 13, 314, 655, 2497, 257, 6...","[1639, 836, 470, 3505, 30, 50256]","[[40, 836, 470, 423, 881, 640, 0, 50256]]"
2,"Okay, what is so urgent that you called me and...","I'm sorry, I don't follow you.","I'm sorry, I don't follow you.",All right. It's time to start?,I could tell you knew something was up with me...,"[[16454, 11, 644, 318, 523, 18039, 326, 345, 1...","[40, 1101, 7926, 11, 314, 836, 470, 1061, 345,...","[40, 1101, 7926, 11, 314, 836, 470, 1061, 345,...","[3237, 826, 13, 632, 338, 640, 284, 923, 30, 5...","[[40, 714, 1560, 345, 2993, 1223, 373, 510, 35..."
3,How much?,I have not decided. I want to get married in a...,I have not decided. I want to get married in a...,"He said he had like 3,000 in suits.",A little.,"[[2437, 881, 30, 50256]]","[40, 423, 407, 3066, 13, 314, 765, 284, 651, 6...","[40, 423, 407, 3066, 13, 314, 765, 284, 651, 6...","[1544, 531, 339, 550, 588, 513, 11, 830, 287, ...","[[32, 1310, 13, 50256]]"
4,You're being super nice. It's... freaking me o...,I'm not gross. I'm just... gross.,I'm not gross. I'm just... gross.,Buckminster Fuller?,"I'm being Barney, and I think tonight's going ...","[[1639, 821, 852, 2208, 3621, 13, 632, 338, 98...","[40, 1101, 407, 10319, 13, 314, 1101, 655, 986...","[40, 1101, 407, 10319, 13, 314, 1101, 655, 986...","[33, 1347, 18462, 31863, 30, 50256]","[[40, 1101, 852, 41921, 11, 290, 314, 892, 997..."
...,...,...,...,...,...,...,...,...,...,...
517,"Okay, I want to lay down some ground rules for...","Oh, I know. I just want to be as awesome as sh...","Oh, I know. I just want to be as awesome as sh...","No, I'll be right over.","Well, well, well. How rich. You make me promis...","[[16454, 11, 314, 765, 284, 3830, 866, 617, 23...","[5812, 11, 314, 760, 13, 314, 655, 765, 284, 3...","[5812, 11, 314, 760, 13, 314, 655, 765, 284, 3...","[2949, 11, 314, 1183, 307, 826, 625, 13, 50256]","[[5779, 11, 880, 11, 880, 13, 1374, 5527, 13, ..."
518,It looks to be a... sacred... spa.,"I'm sorry, I don't follow you.","I'm sorry, I don't follow you.",It's called a spa.,Owl. How do we go? We will do what? Jump?,"[[1026, 3073, 284, 307, 257, 986, 13626, 986, ...","[40, 1101, 7926, 11, 314, 836, 470, 1061, 345,...","[40, 1101, 7926, 11, 314, 836, 470, 1061, 345,...","[1026, 338, 1444, 257, 41900, 13, 50256]","[[46, 40989, 13, 1374, 466, 356, 467, 30, 775,..."
519,That's putting it a bit strongly.,I'm not going to put it in a little strong.,I'm not going to put it in a little strong.,And here comes the fun part.,A bit strongly. She's not my girlfriend.,"[[2504, 338, 5137, 340, 257, 1643, 7634, 13, 5...","[40, 1101, 407, 1016, 284, 1234, 340, 287, 257...","[40, 1101, 407, 1016, 284, 1234, 340, 287, 257...","[1870, 994, 2058, 262, 1257, 636, 13, 50256]","[[32, 1643, 7634, 13, 1375, 338, 407, 616, 110..."
520,I do.,You're a good man.,You're a good man.,Hey.,I'm gonna head out to a reggae concert. I'm a ...,"[[40, 466, 13, 50256]]","[1639, 821, 257, 922, 582, 13, 50256]","[1639, 821, 257, 922, 582, 13, 50256]","[10814, 13, 50256]","[[40, 1101, 8066, 1182, 503, 284, 257, 842, 25..."


# Metrics For Character 1

In [14]:
from Lib.BBMetrics import BBMetric

def compute_sample_metrics(context_sentence, label_response, chatbot_response, verbose=True, w=(1,1,1)):
    """Compute (and optionally print) the metric suite for a single sample.

    Args:
        context_sentence: the prompt given to the chatbot.
        label_response: the reference response.
        chatbot_response: the response generated by the chatbot.
        verbose: when True, print the sentences and every score.
        w: three weights for the weighted mean of the pairwise scores, in the
            order in which the three pairwise scores of each metric are computed.

    Returns:
        dict keyed by metric name. 'semantic similarity', 'rouge l' and
        'semantic answer similarity' hold the three pairwise scores followed by
        their weighted mean; 'emotion' holds the metric outputs for context,
        label and chatbot; 'bleu' and 'distinct' hold a single score.
    """
    # Imported explicitly: `import scipy` does not guarantee that the `stats`
    # submodule is loaded, and the `scipy.stats.stats` namespace is deprecated.
    from scipy.stats import pearsonr

    def weighted_mean(values):
        return sum(np.array(values) * np.array(w)) / sum(w)

    scores = {}
    if verbose:
        # prints the sentences
        print('* context:', context_sentence) 
        print('* label:  ', label_response)
        print('* chatbot:', chatbot_response) 
    # 1) computes metrics for semantic similarity
    metric = BBMetric.load_metric("semantic similarity")
    scores['semantic similarity'] = [metric.compute(sentences_a=context_sentence,
                                                      sentences_b=label_response)['score']]
    scores['semantic similarity'].append(metric.compute(sentences_a=context_sentence,
                                                      sentences_b=chatbot_response)['score'])
    scores['semantic similarity'].append(metric.compute(sentences_a=label_response,
                                                      sentences_b=chatbot_response)['score'])
    scores['semantic similarity'].append(weighted_mean(scores['semantic similarity']))
    if verbose:
        print('=== SEMANTIC SIMILARITY ===')
        print('context-label similarity:   ', scores['semantic similarity'][0])
        print('context-chatbot similarity: ', scores['semantic similarity'][1])
        print('label-chatbot similarity:   ', scores['semantic similarity'][2])
        print('> Merged Metrics')
        print('  semantic similarity mean:     ',  scores['semantic similarity'][3])
    # 2) computes metrics for bleu
    metric = BBMetric.load_metric("bleu")
    scores['bleu'] = metric.compute(predictions=chatbot_response, references=label_response)['score']
    if verbose:
        print('===        BLEU         ===')
        print('bleu:                       ', scores['bleu'])
    # 3) computes metrics for rouge-L
    metric = BBMetric.load_metric("rouge l")
    scores['rouge l'] = [metric.compute(predictions=context_sentence, references=label_response)['score']]
    scores['rouge l'].append(metric.compute(predictions=context_sentence, references=chatbot_response)['score'])
    scores['rouge l'].append(metric.compute(predictions=chatbot_response, references=label_response)['score'])
    scores['rouge l'].append(weighted_mean(scores['rouge l']))
    if verbose:
        print('===       ROUGE-L       ===')
        print('context-label rouge:        ', scores['rouge l'][0])
        print('context-chatbot rouge:      ', scores['rouge l'][1])
        print('label-chatbot rouge:        ', scores['rouge l'][2])
        print('> Merged Metrics')
        print('  rouge mean:                 ', scores['rouge l'][3] )
    # 4) computes sas metric
    metric = BBMetric.load_metric("semantic answer similarity")
    scores['semantic answer similarity'] = [metric.compute(predictions=context_sentence,
                                                    references=label_response)['score']]
    scores['semantic answer similarity'].append(metric.compute(predictions=context_sentence,
                                                        references=chatbot_response)['score'])
    scores['semantic answer similarity'].append(metric.compute(predictions=label_response,
                                                        references=chatbot_response)['score'])
    scores['semantic answer similarity'].append(weighted_mean(scores['semantic answer similarity']))
    if verbose:
        print('===         SAS         ===')
        print('context-label sas:          ', scores['semantic answer similarity'][0])
        print('context-chatbot sas:        ', scores['semantic answer similarity'][1])
        print('label-chatbot sas:          ', scores['semantic answer similarity'][2])
        print('> Merged Metrics')
        print('  sas mean:                   ',  scores['semantic answer similarity'][3])
    # 5) computes emotion metric
    metric = BBMetric.load_metric("emotion")
    scores['emotion'] = [metric.compute(sentences=context_sentence)]
    scores['emotion'].append(metric.compute(sentences=label_response))
    scores['emotion'].append(metric.compute(sentences=chatbot_response))
    if verbose:
        print('===       EMOTION       ===')
        print('context emotions:            \n', list(zip(scores['emotion'][0]['label'], scores['emotion'][0]['score'])))
        print('label emotions:              \n', list(zip(scores['emotion'][1]['label'], scores['emotion'][1]['score'])))
        print('chatbot emotions:            \n', list(zip(scores['emotion'][2]['label'], scores['emotion'][2]['score'])))
        print('label-chatbot emotion corr:  \n', pearsonr(scores['emotion'][1]['score'],
                                                          scores['emotion'][2]['score']))

    # 6) computes metrics for distinct
    metric = BBMetric.load_metric("distinct")
    scores['distinct'] = metric.compute(sentences=chatbot_response)['score']
    if verbose:
        print('===       DISTINCT      ===')
        print('distinct:                   ', scores['distinct'])

    # Previously the scores were only printed; return them so they can be reused.
    return scores

In [15]:
def compute_set_metrics(model, tokenizer, context_sentences, label_responses, chatbot_responses, verbose=True, w=(1,1,1),
                        classifier_n_sentences=50, include_qualitative=False):
    """Compute (and optionally print) the metric suite over a set of samples.

    Args:
        model: model handed to the perplexity metric.
        tokenizer: tokenizer handed to the perplexity metric.
        context_sentences: list of prompts.
        label_responses: list of reference responses.
        chatbot_responses: list of generated responses.
        verbose: when True, print every score.
        w: three weights for the weighted mean of the pairwise scores.
        classifier_n_sentences: `n_sentences` passed to the semantic classifier.
        include_qualitative: placeholder flag; currently has no effect.

    Returns:
        dict keyed by metric name with the computed scores.

    Note:
        The semantic classifier is evaluated against the notebook-level
        `character`, `character_dict` and `base_folder`.
    """
    # Imported explicitly: `import scipy` does not guarantee that the `stats`
    # submodule is loaded, and the `scipy.stats.stats` namespace is deprecated.
    from scipy.stats import pearsonr

    def weighted_mean(values):
        return sum(np.array(values) * np.array(w)) / sum(w)

    scores = {}
    
    # 1) computes metrics for semantic similarity
    metric = BBMetric.load_metric("semantic similarity")
    scores['semantic similarity'] = [metric.compute(sentences_a=context_sentences,
                                                      sentences_b=label_responses)['score']]
    scores['semantic similarity'].append(metric.compute(sentences_a=context_sentences,
                                                      sentences_b=chatbot_responses)['score'])
    scores['semantic similarity'].append(metric.compute(sentences_a=label_responses,
                                                      sentences_b=chatbot_responses)['score'])
    scores['semantic similarity'].append(weighted_mean(scores['semantic similarity']))
    if verbose:
        print('=== SEMANTIC SIMILARITY ===')
        print('semantic similarity:        ', scores['semantic similarity'])
    # 2) computes metrics for perplexity
    metric = BBMetric.load_metric("perplexity")
    if verbose:
        # header printed first because the metric shows its own progress bar
        print('===       PERPLEXITY     ===')
    scores['perplexity'] = metric.compute(model=model, tokenizer=tokenizer, sentences=chatbot_responses)['score_concat']
    if verbose:
        print('perplexity:                 ', scores['perplexity'])
    # 3) computes metrics for bleu
    metric = BBMetric.load_metric("bleu")
    scores['bleu'] = metric.compute(predictions=chatbot_responses, references=label_responses)['score']
    if verbose:
        print('===         BLEU         ===')
        print('bleu:                       ', scores['bleu'])
    # 4) computes metrics for rouge-L
    metric = BBMetric.load_metric("rouge l")
    scores['rouge l'] = metric.compute(predictions=chatbot_responses, references=label_responses)['score']
    if verbose:
        print('===        ROUGE-L       ===')
        print('rouge:                      ', scores['rouge l'])
    # 5) computes metrics for distinct
    metric = BBMetric.load_metric("distinct")
    scores['distinct'] = metric.compute(sentences=chatbot_responses)['score']
    if verbose:
        print('===        DISTINCT      ===')
        print('distinct:                   ', scores['distinct'])
    # 6) computes emotion metric
    metric = BBMetric.load_metric("emotion")
    scores['emotion'] = [metric.compute(sentences=chatbot_responses)]
    scores['emotion'].append(metric.compute(sentences=label_responses))
    if verbose:
        print('===        EMOTION       ===')
        print('chatbot emotions:            \n', list(zip(scores['emotion'][0]['label'], scores['emotion'][0]['score'])))
        print('label emotions:              \n', list(zip(scores['emotion'][1]['label'], scores['emotion'][1]['score'])))
        print('label-chatbot emotion corr:  \n', pearsonr(scores['emotion'][0]['score'],
                                                          scores['emotion'][1]['score']))
    # 7) computes sas metric
    metric = BBMetric.load_metric("semantic answer similarity")
    scores['semantic answer similarity'] = [metric.compute(predictions=context_sentences,
                                                    references=label_responses)['score']]
    scores['semantic answer similarity'].append(metric.compute(predictions=context_sentences,
                                                        references=chatbot_responses)['score'])
    scores['semantic answer similarity'].append(metric.compute(predictions=label_responses,
                                                        references=chatbot_responses)['score'])
    scores['semantic answer similarity'].append(weighted_mean(scores['semantic answer similarity']))
    if verbose:
        print('===         SAS         ===')
        print('context-label sas:          ', scores['semantic answer similarity'][0])
        print('context-chatbot sas:        ', scores['semantic answer similarity'][1])
        print('label-chatbot sas:          ', scores['semantic answer similarity'][2])
        print('> Merged Metrics')
        print('sas-mean:                   ',  scores['semantic answer similarity'][3])
    # 8) computes metrics for semantic classifier (timed, chatbot first, then labels)
    metric = BBMetric.load_metric("semantic classifier")
    start_time = time.time()
    scores['semantic classifier'] = [metric.compute(character=character, character_dict=character_dict, 
                                                   base_folder=base_folder, sentences=chatbot_responses,
                                                   n_sentences=classifier_n_sentences)]
    scores['semantic classifier'].append(metric.compute(character=character, character_dict=character_dict, 
                                                   base_folder=base_folder, sentences=label_responses,
                                                   n_sentences=classifier_n_sentences))
    end_time = time.time()
    if verbose:
        print('=== SEMANTIC CLASSIFIER ===')
        print('semantic classifier chatbot:                ', scores['semantic classifier'][0])
        print('semantic classifier label:                  ', scores['semantic classifier'][1])
        print('time elapsed computing semantic classifier:  {:.2f} s'.format(end_time - start_time))
        
    if include_qualitative:
        # Do stuff with human metrics and print sentences
        pass

    # Previously the scores were only printed; return them so they can be reused.
    return scores

In [16]:
# Per-sample metrics for the first test row: greedy prediction vs. label.
for i in range(1):
    print("##### Sample " + str(i+1) + " #####")
    context_sentence = df_char['ctx'][i]
    chatbot_response = df_char['prd_greedy'][i]
    label_response   = df_char['lbl'][i]
    compute_sample_metrics(context_sentence, label_response, chatbot_response)
    print()

##### Sample 1 #####
* context: I know, it's two years of my life I'm never getting back. A little part of me just wants to jump the bones of the next guy I see.
* label:   Daddy's home.
* chatbot: Oh, God!
=== SEMANTIC SIMILARITY ===
context-label similarity:    0.009417437
context-chatbot similarity:  0.017228587
label-chatbot similarity:    0.008852758
> Merged Metrics
script-similarity-mean:      0.011832927353680134
===        BLEU         ===
bleu:                        0.0
===       ROUGE-L       ===
context-label rouge:         0.0588235294117647
context-chatbot rouge:       0.0
label-chatbot rouge:         0.0
> Merged Metrics
rouge mean:                  0.019607843137254898
===         SAS         ===
context-label sas:           0.1468764
context-chatbot sas:         0.18231258
label-chatbot sas:           0.21477439
> Merged Metrics
sas-mean:                    0.18132111926873526
===       EMOTION       ===
context emotions:            
 [('sadness', 0.006270761135965586

In [17]:
# Set-level metrics on a slice of `set_size` test rows starting at row `i` (greedy predictions vs. labels).
set_size = 50
i = 30
print("##### Set (Size " + str(set_size) + ") #####")
context_sentences = list(df_char['ctx'][i:i+set_size])
chatbot_responses = list(df_char['prd_greedy'][i:i+set_size])
label_responses   = list(df_char['lbl'][i:i+set_size])
compute_set_metrics(model, tokenizer,
                    context_sentences, label_responses, chatbot_responses)

##### Set (Size 50) #####
=== SEMANTIC SIMILARITY ===
semantic similarity:         [0.07489894, 0.20145756, 0.08608877, 0.12081509083509445]
===       PERPLEXITY     ===


100%|██████████████████████████████████████████████████████████████████████████████████| 11/11 [00:42<00:00,  3.85s/it]


perplexity:                  247.60796924454095
===         BLEU         ===
bleu:                        0.0
===        ROUGE-L       ===
rouge:                       0.07438013490229321
===        DISTINCT      ===
distinct:                    0.11439184252887444
===        EMOTION       ===
chatbot emotions:            
 [('sadness', 0.3434286758565577), ('joy', 0.2214338393410435), ('love', 0.006544853251543827), ('anger', 0.33637171503651186), ('fear', 0.08191320944606559), ('surprise', 0.010307701138954144)]
label emotions:              
 [('sadness', 0.13711809609201736), ('joy', 0.3487064220244065), ('love', 0.007899202467233407), ('anger', 0.40897555726842255), ('fear', 0.09460336329939309), ('surprise', 0.0026973685980192386)]


KeyboardInterrupt: 

In [None]:
# Set-level metrics on the whole test split (greedy predictions vs. labels).
# The banner reports the real number of rows rather than `set_size`, which
# still holds the size of the slice used in the previous cell.
print("##### Full Test Set (Size " + str(len(df_char)) + ") #####")
context_sentences = list(df_char['ctx'])
chatbot_responses = list(df_char['prd_greedy'])
label_responses   = list(df_char['lbl'])
compute_set_metrics(model, tokenizer,
                    context_sentences, 
                    label_responses, 
                    chatbot_responses,
                    classifier_n_sentences=100)

# Metrics Between Character 1 & Character 2

In [None]:
def get_predictions_small(sample_questions, model, generation_method):
    """Generate one response per question, without any on-disk caching.

    Args:
        sample_questions: iterable of context strings used as prompts.
        model: causal language model exposing `generate`.
        generation_method: "Greedy", "Beam Search" or "Sampling".

    Returns:
        A list with, for each question, the generated token ids (prompt removed).

    Raises:
        ValueError: if `generation_method` is not one of the supported names.
    """
    # Fail early instead of hitting an unbound `generated_answer` inside the loop.
    if generation_method not in ("Greedy", "Beam Search", "Sampling"):
        raise ValueError("Unknown generation method: " + repr(generation_method))

    # Extra generate() arguments for the selected decoding strategy.
    if generation_method == "Beam Search":
        # generate() names this option `num_beams`; `n_beams` is not a generation option
        extra_kwargs = {"num_beams": n_beams}
    elif generation_method == "Sampling":
        extra_kwargs = {"do_sample": True, "top_k": top_k, "top_p": top_p}
    else:
        extra_kwargs = {}

    print("Creating predictions")
    predictions = list()
    for x in tqdm(sample_questions):
        tokenized_question = tokenizer.encode(x + tokenizer.eos_token, return_tensors='tf')
        # Allow up to 128 newly generated tokens on top of the prompt.
        max_length = 128 + tokenized_question.shape[1]
        generated_answer = model.generate(tokenized_question,
                                          pad_token_id=tokenizer.eos_token_id,
                                          max_length=max_length,
                                          **extra_kwargs)[0].numpy().tolist()
        # Keep only the tokens generated after the prompt.
        predictions.append(generated_answer[len(tokenized_question[0]):])
    return predictions

In [None]:
# Loading a bare CSV yields a DatasetDict holding a single 'train' split (the
# same layout load_df relies on). Select that split so that column access such
# as common_df['context'] and row-by-row iteration work on the data itself.
common_df = load_dataset('csv',
                         data_files=os.path.join(base_folder, 'Data', 'common_dataset.csv'), 
                         cache_dir=os.path.join(base_folder, "cache"))['train']

In [None]:
# Responses of the first character's model to the common contexts, one list per decoding strategy.
predictions_1_greedy = get_predictions_small(common_df['context'], model, "Greedy")
predictions_1_nbeams = get_predictions_small(common_df['context'], model, "Beam Search")
predictions_1_sampling = get_predictions_small(common_df['context'], model, "Sampling")

In [None]:
# Responses of the second character's model to the same common contexts.
predictions_2_greedy = get_predictions_small(common_df['context'], model_2, "Greedy")
predictions_2_nbeams = get_predictions_small(common_df['context'], model_2, "Beam Search")
predictions_2_sampling = get_predictions_small(common_df['context'], model_2, "Sampling")

In [None]:
# One metrics dataframe per character, both built from the common contexts.
df_char_1 = get_dataframe_for_metrics(common_df, predictions_1_greedy, predictions_1_nbeams, predictions_1_sampling)
df_char_2 = get_dataframe_for_metrics(common_df, predictions_2_greedy, predictions_2_nbeams, predictions_2_sampling)

In [None]:
# Character 1 vs. character 2 on the common contexts (sampling predictions).
# The second character's responses are passed in the `label_responses` slot.
print("##### " + character + " Vs. " + character_2 + " #####")
# Contexts are taken from df_char_1; `df_for_metrics` is not defined in this notebook.
context_sentences   = list(df_char_1['ctx'])
chatbot_responses   = list(df_char_1['prd_sampling'])
chatbot_2_responses = list(df_char_2['prd_sampling'])
compute_set_metrics(model, tokenizer,
                    context_sentences, chatbot_2_responses, chatbot_responses, include_qualitative=True)

# Metrics Between Different Sampling Methods

In [None]:
# Greedy vs. beam search on the whole test split: greedy responses go in the
# `label_responses` slot and beam-search responses in the `chatbot_responses` slot.
print("##### Greedy vs. N-Beams #####")
context_sentences = list(df_char['ctx'])
greedy_responses  = list(df_char['prd_greedy'])
nbeams_responses  = list(df_char['prd_nbeams'])
compute_set_metrics(model, tokenizer,
                    context_sentences,
                    greedy_responses,
                    nbeams_responses,
                    classifier_n_sentences=100)

In [None]:
# Greedy vs. sampling on the whole test split: greedy responses go in the
# `label_responses` slot and sampled responses in the `chatbot_responses` slot.
print("##### Greedy vs. Sampling #####")
context_sentences   = list(df_char['ctx'])
greedy_responses    = list(df_char['prd_greedy'])
sampling_responses  = list(df_char['prd_sampling'])
compute_set_metrics(model, tokenizer,
                    context_sentences,
                    greedy_responses,
                    sampling_responses,
                    classifier_n_sentences=100)

In [None]:
# Beam search vs. sampling on the whole test split: beam-search responses go in
# the `label_responses` slot and sampled responses in the `chatbot_responses` slot.
print("##### N-Beams vs. Sampling #####")
context_sentences   = list(df_char['ctx'])
nbeams_responses    = list(df_char['prd_nbeams'])
sampling_responses  = list(df_char['prd_sampling'])
compute_set_metrics(model, tokenizer,
                    context_sentences,
                    nbeams_responses,
                    sampling_responses,
                    classifier_n_sentences=100)

# Metrics Between Non-Finetuned And Character

In [None]:
# Sampling predictions of the non-finetuned model on the primary character's
# test contexts, cached inside the 'Default' character folder.
# NOTE(review): the resulting file name contains a double underscore
# ('from_<character>_df__sampling.json'); left unchanged so existing cache files still match.
predictions_def_sampling = get_predictions_cached(sample_questions, model_def,
                                                  os.path.join(in_folder_def, 'from_' + character + '_df_' + '_sampling.json'),
                                                  "Sampling")

In [None]:
# Metrics dataframe for the non-finetuned model; only sampling predictions are provided.
df_char_def = get_dataframe_for_metrics(character_hg['test'], None, None, predictions_def_sampling)

In [None]:
# Per-sample metrics for the first test row: the non-finetuned model's response
# is passed as the label and the character model's response as the chatbot output.
for i in range(1):
    print("##### Sample " + str(i+1) + " #####")
    context_sentence   = df_char['ctx'][i]
    character_response = df_char['prd_sampling'][i]
    default_response   = df_char_def['prd_sampling'][i]
    compute_sample_metrics(context_sentence, default_response, character_response)
    print()

In [None]:
# Set-level metrics on a slice of `set_size` test rows starting at row `i`:
# non-finetuned responses in the label slot, character responses in the chatbot slot.
set_size = 50
i = 30
print("##### Set (Size " + str(set_size) + ") #####")
context_sentences   = list(df_char['ctx'][i:i+set_size])
character_responses = list(df_char['prd_sampling'][i:i+set_size])
default_responses   = list(df_char_def['prd_sampling'][i:i+set_size])
compute_set_metrics(model, tokenizer,
                    context_sentences, default_responses, character_responses)

In [None]:
# Set-level metrics on the whole test split: non-finetuned responses in the
# label slot, character responses in the chatbot slot.
# The banner reports the real number of rows rather than `set_size`, which
# still holds the size of the slice used in the previous cell.
print("##### Full Test Set (Size " + str(len(df_char)) + ") #####")
context_sentences   = list(df_char['ctx'])
character_responses = list(df_char['prd_sampling'])
default_responses   = list(df_char_def['prd_sampling'])
compute_set_metrics(model, tokenizer,
                    context_sentences, 
                    default_responses, 
                    character_responses,
                    classifier_n_sentences=100)

# Metrics Visualizations

In [None]:
# TODO