# Loading

In [1]:
from Data.data_dicts import character_dict, source_dict, random_state

# Base pretrained checkpoint; it is used for the tokenizer and for the un-fine-tuned model_def.
model_name = 'microsoft/DialoGPT-small'
# Primary character under evaluation (selects data folder, checkpoint and prediction files).
character = 'Vader' # 'Barney' | 'Sheldon' | 'Harry' | 'Fry' | 'Vader' | 'Joey' | 'Phoebe' | 'Bender' | 'Default'
# Second character, whose fine-tuned checkpoint is loaded as model_2.
character_2 = 'Harry'
# When True, cached prediction files are ignored and predictions are regenerated.
override_predictions = False

In [2]:
# Mount Google Drive when running on Colab; otherwise work from the current directory.
import os
try:
    import google.colab  # only importable inside a Colab runtime
    IN_COLAB = True
except ImportError:
    # Catch only the "not on Colab" case; any other failure should surface.
    IN_COLAB = False
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
    os.system("pip install datasets")
    os.system("pip install transformers")
    os.system("pip install rouge_score")
    os.system("pip install -U sentence-transformers")
else:
    base_folder = os.getcwd()

# Input/output folders of the primary character (both point to the same directory).
# exist_ok=True makes each call idempotent and avoids the exists()/makedirs() race.
in_folder = os.path.join(base_folder, 'Data', 'Characters', character)
os.makedirs(in_folder, exist_ok=True)
out_folder = os.path.join(base_folder, 'Data', 'Characters', character)
os.makedirs(out_folder, exist_ok=True)

# Input/output folders of the second character.
in_folder_2 = os.path.join(base_folder, 'Data', 'Characters', character_2)
os.makedirs(in_folder_2, exist_ok=True)
out_folder_2 = os.path.join(base_folder, 'Data', 'Characters', character_2)
os.makedirs(out_folder_2, exist_ok=True)

# Input/output folders of the 'Default' character.
in_folder_def = os.path.join(base_folder, 'Data', 'Characters', 'Default')
os.makedirs(in_folder_def, exist_ok=True)
out_folder_def = os.path.join(base_folder, 'Data', 'Characters', 'Default')
os.makedirs(out_folder_def, exist_ok=True)

# Folder for metric results.
metrics_folder = os.path.join(base_folder, 'Metrics')
os.makedirs(metrics_folder, exist_ok=True)

In [3]:
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
import json
import numpy as np
import time
import scipy as sp

In [4]:
def save_as_json(filepath, filename, data):
    """Write `data` as 4-space-indented JSON to `<filepath>/<filename>.json`.

    The directory `filepath` is created first when it does not exist yet.
    """
    directory_missing = not os.path.exists(filepath)
    if directory_missing:
        os.makedirs(filepath, exist_ok=True)
    target_path = os.path.join(filepath, filename + ".json")
    with open(target_path, 'w') as handle:
        serialized = json.dumps(data, indent=4)
        handle.write(serialized)

def load_from_json(filepath, filename):
    """Read `<filepath>/<filename>.json` and return the parsed content.

    Returns an empty dict when the file does not exist.
    """
    json_path = os.path.join(filepath, filename + '.json')
    if os.path.exists(json_path):
        with open(json_path, 'r') as handle:
            return json.load(handle)
    return dict()

In [5]:
from datasets import load_dataset, DatasetDict

def load_df(character):
    """Load `<base_folder>/Data/Characters/<character>/<character>.csv` and split it.

    Returns a DatasetDict with the keys 'train', 'test' and 'val'. Both splits
    are seeded with `random_state`.
    """
    csv_path = os.path.join(base_folder, "Data", "Characters", character, character+'.csv')
    raw_dataset = load_dataset('csv',
                               data_files=csv_path,
                               cache_dir=os.path.join(base_folder, "cache"))

    # 85% train / 10% test / 5% validation:
    # first hold out 15%, then give a third of that hold-out to validation.
    train_vs_holdout = raw_dataset['train'].train_test_split(test_size=0.15, seed=random_state)
    test_vs_val = train_vs_holdout['test'].train_test_split(test_size=0.33, seed=random_state)

    return DatasetDict({
        'train': train_vs_holdout['train'],
        'test': test_vs_val['train'],
        'val': test_vs_val['test']
    })

In [6]:
def construct_conv(row, tokenizer, max_length=512):
    """Turn one dataset row into a fixed-length causal-LM example.

    The row's values are reversed, tokenized one by one, each terminated with
    the EOS token, and concatenated into a single sequence. The sequence is
    then right-padded (pad id = first token id of '#', attention 0) or
    truncated so that it is exactly `max_length` tokens long; a truncated
    sequence always ends with EOS.

    Args:
        row: mapping whose values are the utterance strings of one example.
            Presumably the columns are stored newest-first, which is why the
            values are reversed -- confirm against the CSV layout.
        tokenizer: callable returning a mapping with 'input_ids' and
            'attention_mask' (one list per input string); must also provide
            `encode` and `eos_token_id`.
        max_length: target sequence length (>= 1). Defaults to 512.

    Returns:
        The tokenizer output with flat 'input_ids', 'attention_mask' and
        'labels' lists, each of length `max_length`.
    """
    utterances = list(reversed(list(row.values())))
    model_inputs = tokenizer(utterances)
    pad_token_id = tokenizer.encode('#')[0]
    eos_token_id = tokenizer.eos_token_id

    # Concatenate all utterances, closing each one with an attended EOS token.
    input_ids = []
    attention_mask = []
    for ids, mask in zip(model_inputs['input_ids'], model_inputs['attention_mask']):
        input_ids.extend(ids)
        input_ids.append(eos_token_id)
        attention_mask.extend(mask)
        attention_mask.append(1)

    missing = max_length - len(input_ids)
    if missing > 0:
        input_ids += [pad_token_id] * missing
        attention_mask += [0] * missing
    elif missing < 0:
        # Keep exactly max_length tokens (the previous code kept max_length - 1,
        # producing sequences one token shorter than the padded ones).
        input_ids = input_ids[:max_length]
        input_ids[-1] = eos_token_id
        attention_mask = attention_mask[:max_length]
        attention_mask[-1] = 1

    model_inputs['input_ids'] = input_ids
    model_inputs['attention_mask'] = attention_mask
    # Independent copy so that later edits to the labels cannot alter input_ids.
    model_inputs["labels"] = list(input_ids)
    return model_inputs

def preprocess_function(examples):
    """Tokenize one example with the notebook-level `tokenizer` via construct_conv.

    The tokenizer's pad token is (re)set to '#' before the example is encoded.
    """
    tokenizer.pad_token = '#'
    return construct_conv(examples, tokenizer)

In [7]:
# Point the Hugging Face datasets cache at the project folder.
# NOTE(review): `datasets` was already imported in an earlier cell, so this variable is
# probably read too late to have any effect -- confirm. load_df passes cache_dir explicitly anyway.
os.environ["HF_DATASETS_CACHE"] = os.path.join(base_folder, "cache")
# Train/test/val splits for the primary character.
character_hg = load_df(character)

Using custom data configuration default-8fc66de038de764b
Reusing dataset csv (C:\Users\david\Documents\unibo\natural_language_processing\project\BarneyBot\cache\csv\default-8fc66de038de764b\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at C:\Users\david\Documents\unibo\natural_language_processing\project\BarneyBot\cache\csv\default-8fc66de038de764b\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-fdf6a751d9e305ad.arrow and C:\Users\david\Documents\unibo\natural_language_processing\project\BarneyBot\cache\csv\default-8fc66de038de764b\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-fd4f962239c0d8a3.arrow
Loading cached split indices for dataset at C:\Users\david\Documents\unibo\natural_language_processing\project\BarneyBot\cache\csv\default-8fc66de038de764b\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-724da1b5b03e5c90.arrow and C:\Users\david\Documents\unibo\natural_language_processing\project\BarneyBot\cache\csv\default-8fc66de038de764b\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-6b695a18a6bce864.arrow


In [8]:
# Locations of the fine-tuned checkpoints; the sub-folder name of each character
# is looked up in character_dict under 'checkpoint_folder'.
checkpoint_folder = os.path.join(out_folder, character_dict[character]['checkpoint_folder'])
checkpoint_folder_2 = os.path.join(out_folder_2, character_dict[character_2]['checkpoint_folder'])

In [9]:
from transformers import TFAutoModelForCausalLM, AutoTokenizer, AdamWeightDecay

# Tokenizer of the base model, shared by all three models; '#' is used as the pad token.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=os.path.join(base_folder, "cache"))
tokenizer.pad_token = '#'

# Fine-tuned model of the primary character.
# compile() gets no loss argument, so the model's internal loss computation is used.
model = TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=checkpoint_folder)
model.compile(optimizer=AdamWeightDecay(learning_rate=2e-5))

# Fine-tuned model of the second character.
model_2 = TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=checkpoint_folder_2)
model_2.compile(optimizer=AdamWeightDecay(learning_rate=2e-5))

# Base pretrained model loaded directly from model_name (no character checkpoint).
model_def = TFAutoModelForCausalLM.from_pretrained(model_name, cache_dir=os.path.join(base_folder, "cache"))
model_def.compile(optimizer=AdamWeightDecay(learning_rate=2e-5))

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at C:\Users\david\Documents\unibo\natural_language_processing\project\BarneyBot\Data\Characters\Vader\vader_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as keys in the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.
All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized fr

In [10]:
from transformers import DataCollatorForLanguageModeling
from transformers import AdamWeightDecay

# Batch size of the encoded test set.
batch_size = 8

# mlm=False: collate for causal language modeling (no masked-token objective).
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer, return_tensors='tf')

# Tokenize every split one example at a time (batched=False) with preprocess_function.
tokenized_character_hg = character_hg.map(preprocess_function, batched=False)

# Test split as a tf.data dataset, kept in its original order (shuffle=False).
encoded_test_set = tokenized_character_hg["test"].to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

Loading cached processed dataset at C:\Users\david\Documents\unibo\natural_language_processing\project\BarneyBot\cache\csv\default-8fc66de038de764b\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-fd9a309bbef55052.arrow
Loading cached processed dataset at C:\Users\david\Documents\unibo\natural_language_processing\project\BarneyBot\cache\csv\default-8fc66de038de764b\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-2d6e279951cff576.arrow
Loading cached processed dataset at C:\Users\david\Documents\unibo\natural_language_processing\project\BarneyBot\cache\csv\default-8fc66de038de764b\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-ce7aa8f599523585.arrow


# Metrics Preparation

In [11]:
# 'context/0' column of the test split; these strings are the prompts fed to the models.
sample_questions = character_hg['test']['context/0']

In [12]:
# Decoding hyper-parameters used by get_predictions_cached.
n_beams = 3   # number of beams for "Beam Search"
top_k = 50    # top-k cutoff for "Sampling"
top_p = 0.92  # nucleus (top-p) cutoff for "Sampling"

def get_predictions_cached(sample_questions, model, filename, generation_method, override_predictions=False):
    """Return one generated answer (list of token ids) per question, cached on disk.

    If `<in_folder>/<filename>` exists and `override_predictions` is False the
    stored predictions are returned as-is. Otherwise an answer is generated for
    every question with the requested decoding strategy and the result is
    written to that file as JSON.

    Args:
        sample_questions: iterable of prompt strings.
        model: model exposing `generate(...)`.
        filename: name of the cache file inside `in_folder`.
        generation_method: "Greedy", "Beam Search" or "Sampling".
        override_predictions: regenerate even if a cache file exists. Cache
            files written before a change to the decoding code are only
            refreshed when this is True.

    Returns:
        List of token-id lists holding only the generated part (the prompt
        tokens are stripped).

    Raises:
        ValueError: predictions must be generated and `generation_method` is
            not one of the supported values.
        AssertionError: a generated answer has fewer than two tokens.
    """
    prediction_path = os.path.join(in_folder, filename)
    if os.path.exists(prediction_path) and not override_predictions:
        print("Loading predictions from stored file")
        with open(prediction_path, 'r') as file:
            predictions = json.load(file)
        print("Loaded predictions from stored file")
        return predictions

    if generation_method not in ("Greedy", "Beam Search", "Sampling"):
        # Previously an unknown method silently reused the previous answer
        # (or failed with a NameError on the first question).
        raise ValueError("Unknown generation_method: " + repr(generation_method))

    max_sampling_attempts = 100

    print("Creating predictions")
    predictions = list()
    for x in tqdm(sample_questions):
        tokenized_question = tokenizer.encode(x + tokenizer.eos_token, return_tensors='tf')
        prompt_length = len(tokenized_question[0])
        # Allow up to 128 new tokens on top of the prompt.
        max_length = 128 + tokenized_question.shape[1]
        if generation_method == "Greedy":
            generated = model.generate(tokenized_question,
                                       pad_token_id=tokenizer.eos_token_id,
                                       max_length=max_length)[0].numpy().tolist()
            answer = generated[prompt_length:]
        elif generation_method == "Beam Search":
            # The keyword is `num_beams`; the former `n_beams` is not a generate()
            # argument, so beam search was never actually enabled.
            generated = model.generate(tokenized_question,
                                       pad_token_id=tokenizer.eos_token_id,
                                       max_length=max_length,
                                       num_beams=n_beams)[0].numpy().tolist()
            answer = generated[prompt_length:]
        else:  # "Sampling"
            # Resample until the answer holds more than one token.
            for _ in range(max_sampling_attempts):
                generated = model.generate(tokenized_question,
                                           pad_token_id=tokenizer.eos_token_id,
                                           max_length=max_length,
                                           do_sample=True, top_k=top_k, top_p=top_p)[0].numpy().tolist()
                answer = generated[prompt_length:]
                if len(answer) > 1:
                    break
            else:
                # Every attempt was (near-)empty: fall back to a fixed reply.
                answer = tokenizer.encode('hi') + [tokenizer.eos_token_id]

        predictions.append(answer)

    # Save predictions as a JSON file
    with open(prediction_path, 'w') as file:
        file.write(json.dumps(predictions))

    # NOTE(review): this check runs after the file is written, so a failing run
    # still leaves a cache file behind that the next call will load unchecked.
    assert all([len(p)>1 for p in predictions])

    return predictions

In [13]:
# Predictions of the primary character's model for each decoding strategy.
# Each strategy has its own cache file: <prediction_filename>_<strategy>.json.
predictions_greedy = get_predictions_cached(sample_questions, model,
                                            character_dict[character]['prediction_filename'] + '_greedy.json',
                                            "Greedy",
                                            override_predictions=override_predictions)
predictions_nbeams = get_predictions_cached(sample_questions, model,
                                            character_dict[character]['prediction_filename'] + '_nbeams.json',
                                            "Beam Search",
                                            override_predictions=override_predictions)
predictions_sampling = get_predictions_cached(sample_questions, model,
                                              character_dict[character]['prediction_filename'] + '_sampling.json',
                                              "Sampling",
                                              override_predictions=override_predictions)

Loading predictions from stored file
Loaded predictions from stored file
Loading predictions from stored file
Loaded predictions from stored file
Loading predictions from stored file
Loaded predictions from stored file


In [14]:
def get_dataframe_for_metrics(data_test, predictions_greedy, predictions_nbeams, predictions_sampling):
    """Collect contexts, labels and predictions of a test split in one DataFrame.

    Columns, in order: 'ctx'/'ctx_tk' (context text and its token ids),
    'lbl'/'lbl_tk' (only when the split has a 'response' feature) and, for every
    non-empty prediction list, 'prd_<method>'/'prd_<method>_tk' (decoded text
    and token ids). Prediction lists are indexed by the sample's position.
    """
    with_labels = 'response' in data_test.features

    # Only non-empty prediction lists get columns; the order fixes the column order.
    prediction_sets = [
        (column, predicted)
        for column, predicted in (('prd_greedy', predictions_greedy),
                                  ('prd_nbeams', predictions_nbeams),
                                  ('prd_sampling', predictions_sampling))
        if predicted
    ]

    columns = {'ctx':[], 'ctx_tk':[]}
    if with_labels:
        columns['lbl'] = []
        columns['lbl_tk'] = []
    for column, _ in prediction_sets:
        columns[column] = []
        columns[column + '_tk'] = []

    for position, sample in enumerate(tqdm(data_test)):
        # encode the context and label sentences, add the eos_token and return a tensor
        context = sample['context/0']
        columns['ctx_tk'].append(
            tokenizer.encode(context + tokenizer.eos_token, return_tensors='tf').numpy().tolist())
        columns['ctx'].append(context)
        if with_labels:
            label = sample['response']
            label_tokens = tokenizer.encode(label + tokenizer.eos_token, return_tensors='tf').numpy().tolist()
            columns['lbl'].append(label)
            columns['lbl_tk'].append(label_tokens)
        for column, predicted in prediction_sets:
            predicted_tokens = predicted[position]
            columns[column].append(tokenizer.decode(predicted_tokens, skip_special_tokens=True))
            columns[column + '_tk'].append(predicted_tokens)

    return pd.DataFrame(data=columns)

In [15]:
# One row per test sample: context, label and the three kinds of predictions.
df_char = get_dataframe_for_metrics(character_hg['test'], predictions_greedy, predictions_nbeams, predictions_sampling)
# Display the frame as the cell output.
df_char

100%|████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 1141.83it/s]


Unnamed: 0,ctx,ctx_tk,lbl,lbl_tk,prd_greedy,prd_greedy_tk,prd_nbeams,prd_nbeams_tk,prd_sampling,prd_sampling_tk
0,I will not fight you.,"[[40, 481, 407, 1907, 345, 13, 50256]]",Give yourself to the dark side. It is the only...,"[[23318, 3511, 284, 262, 3223, 1735, 13, 632, ...",I will not fight you.,"[40, 481, 407, 1907, 345, 13, 50256]",I will not fight you.,"[40, 481, 407, 1907, 345, 13, 50256]","Prepare your ships, Imperial Master.","[37534, 533, 534, 7937, 11, 11773, 5599, 13, 5..."
1,Unlock one-five-seven and nine.Release charges.,"[[3118, 5354, 530, 12, 13261, 12, 26548, 290, ...",Did you find any droids?,"[[11633, 345, 1064, 597, 3102, 2340, 30, 50256]]","Three, four,five,seven,seven,seven,seven,seven...","[12510, 11, 1440, 11, 13261, 11, 26548, 11, 26...","Three, four,five,seven,seven,seven,seven,seven...","[12510, 11, 1440, 11, 13261, 11, 26548, 11, 26...","... a disturbance in theForce, a disturbance i...","[986, 257, 30497, 287, 262, 10292, 11, 257, 30..."
2,"Lord Vader, what about Leia and theWookiee?","[[22438, 27403, 11, 644, 546, 41212, 290, 262,...",They must never again leave thiscity.,"[[2990, 1276, 1239, 757, 2666, 428, 19205, 13,...",I don't know.,"[40, 836, 470, 760, 13, 50256]",I don't know.,"[40, 836, 470, 760, 13, 50256]",They're dead.,"[2990, 821, 2636, 13, 50256]"
3,No!,"[[2949, 0, 50256]]",All to easy. Perhaps you are not as strong as ...,"[[3237, 284, 2562, 13, 8673, 345, 389, 407, 35...",No!No!No!No!No!No!No!No!No!No!No!No!No!No!No!N...,"[2949, 0, 2949, 0, 2949, 0, 2949, 0, 2949, 0, ...",No!No!No!No!No!No!No!No!No!No!No!No!No!No!No!N...,"[2949, 0, 2949, 0, 2949, 0, 2949, 0, 2949, 0, ...","Oh, my Lord, why, sir!No, sir, no!Only me!","[5812, 11, 616, 4453, 11, 1521, 11, 15967, 0, ..."
4,Give yourself to the dark side. It is the only...,"[[23318, 3511, 284, 262, 3223, 1735, 13, 632, ...",Sister! So...you have a twinsister. Your feeli...,"[[50, 1694, 0, 1406, 986, 5832, 423, 257, 2034...",I have found my way.,"[40, 423, 1043, 616, 835, 13, 50256]",I have found my way.,"[40, 423, 1043, 616, 835, 13, 50256]",I've completed the training. You can't leave t...,"[40, 1053, 5668, 262, 3047, 13, 921, 460, 470,..."
5,Open the blast doors! Open theblast doors!,"[[11505, 262, 11975, 8215, 0, 4946, 262, 39806...","I've been waiting for you, Obi-Wan. We meet ag...","[[40, 1053, 587, 4953, 329, 345, 11, 46662, 12...",Open theblast doors!,"[11505, 262, 39806, 8215, 0, 50256]",Open theblast doors!,"[11505, 262, 39806, 8215, 0, 50256]","We're trying to land on target, don't get crazy.","[1135, 821, 2111, 284, 1956, 319, 2496, 11, 83..."
6,"Strange, that I have not. I wonder if your fee...","[[38114, 11, 326, 314, 423, 407, 13, 314, 4240...","They are clear, my Master.","[[2990, 389, 1598, 11, 616, 5599, 13, 50256]]",I have not felt this way since I've been here.,"[40, 423, 407, 2936, 428, 835, 1201, 314, 1053...",I have not felt this way since I've been here.,"[40, 423, 407, 2936, 428, 835, 1201, 314, 1053...","You know what? I'll make it clear. Lord Vader,...","[1639, 760, 644, 30, 314, 1183, 787, 340, 1598..."
7,Vader's targeting computer swings around into ...,"[[53, 5067, 338, 10822, 3644, 26728, 1088, 656...",I have you now.,"[[40, 423, 345, 783, 13, 50256]]",Vader adjusts his control stick and adjusts hi...,"[53, 5067, 46094, 465, 1630, 4859, 290, 46094,...",Vader adjusts his control stick and adjusts hi...,"[53, 5067, 46094, 465, 1630, 4859, 290, 46094,...",The Emperor stands on the platform as the lase...,"[464, 10851, 6296, 319, 262, 3859, 355, 262, 1..."
8,The Emperor's coming here?,"[[464, 10851, 338, 2406, 994, 30, 50256]]","That is correct, Commander. And heis most disp...","[[2504, 318, 3376, 11, 13353, 13, 843, 339, 27...",The Emperor's coming here!,"[464, 10851, 338, 2406, 994, 0, 50256]",The Emperor's coming here!,"[464, 10851, 338, 2406, 994, 0, 50256]",The Senate will not be denied.,"[464, 3845, 481, 407, 307, 6699, 13, 50256]"
9,The princess! Put all sections onalert!,"[[464, 21752, 0, 5930, 477, 9004, 319, 44598, ...",Obi-Wan is here. The Force is withhim.,"[[5944, 72, 12, 45681, 318, 994, 13, 383, 5221...",The princess is here!,"[464, 21752, 318, 994, 0, 50256]",The princess is here!,"[464, 21752, 318, 994, 0, 50256]",It was the last known location of the Rebellio...,"[1026, 373, 262, 938, 1900, 4067, 286, 262, 34..."


# Metrics For Character 1

In [16]:
def ccl_sim(ctx_lbl, ctx_cht, lbl_cht):
    """Combine three pairwise scores into a single context/chatbot/label score.

    The result is the mean of two squared terms: how close the context-label
    and context-chatbot scores are to each other (1 - |difference|), and the
    label-chatbot score itself.
    """
    context_agreement = 1 - abs(ctx_lbl - ctx_cht)
    squared_terms = context_agreement**2 + lbl_cht**2
    return squared_terms / 2

In [17]:
from Lib.BBMetrics import BBMetric

def compute_set_metrics(model, model_2, character, character_2, test_set_name,
                        context_sentences, label_responses, chatbot_responses, encoded_test_set,
                        classifier_n_sentences=50, label_chatbot_symmetry=False,
                        include_qualitative_sentences=False, verbose=True):
    """Compute all BBMetric scores for one set of sentences.

    Args:
        model: model used for the perplexity metric.
        model_2: second model; only used for perplexity in symmetric mode.
        character: character name, used for metadata, the semantic classifier
            and the lookup of the human-evaluation files.
        character_2: second character name; only used for metadata in symmetric mode.
        test_set_name: stored under scores['metadata']['dataset name'].
        context_sentences: list of context strings.
        label_responses: list of reference responses (in symmetric mode: the
            responses of the first chatbot).
        chatbot_responses: list of chatbot responses (in symmetric mode: the
            responses of the second chatbot).
        encoded_test_set: encoded dataset for perplexity; None skips perplexity.
        classifier_n_sentences: `n_sentences` passed to the semantic classifier.
        label_chatbot_symmetry: compare two chatbots ('chatbota'/'chatbotb')
            instead of label vs. chatbot; human metrics are skipped in this mode.
        include_qualitative_sentences: also store the raw sentences in the result.
        verbose: print every score as it is computed.

    Returns:
        dict with one entry per metric plus scores['metadata'], which records
        the ordering of the values stored for each metric.
    """
    # Imported locally: `import scipy as sp` alone does not guarantee that the
    # `scipy.stats` submodule is loaded, and `scipy.stats.stats` is a deprecated alias.
    from scipy import stats as scipy_stats

    scores = {}
    
    lbl_text = 'label' if not label_chatbot_symmetry else 'chatbota'
    cht_text = 'chatbot' if not label_chatbot_symmetry else 'chatbotb'
    
    scores['metadata'] = {}
    scores['metadata']['dataset name'] = test_set_name
    scores['metadata']['names'] = {
        'context':'context'
    }
    if label_chatbot_symmetry:
        scores['metadata']['names'][lbl_text] = character
        scores['metadata']['names'][cht_text] = character_2
    else:
        scores['metadata']['names'][lbl_text] = 'label'
        scores['metadata']['names'][cht_text] = character
    
    # 0) computes metrics for semantic similarity
    metric = BBMetric.load_metric("semantic similarity")
    scores['semantic similarity'] = [metric.compute(sentences_a=context_sentences,
                                            sentences_b=label_responses)]
    scores['semantic similarity'].append(metric.compute(sentences_a=context_sentences,
                                            sentences_b=chatbot_responses))
    scores['semantic similarity'].append(metric.compute(sentences_a=label_responses,
                                              sentences_b=chatbot_responses))
    scores['semantic similarity'].append(ccl_sim(scores['semantic similarity'][0]['score'],
                                                 scores['semantic similarity'][1]['score'],
                                                 scores['semantic similarity'][2]['score']))
    scores['metadata']['semantic similarity'] = {
        'ordering': ['context-'+lbl_text, 'context-'+cht_text, cht_text+'-'+lbl_text, 'ccl']
    }
    if verbose:
        print('=== SEMANTIC SIMILARITY ===')
        print('context-'+lbl_text+' similarity:   ', scores['semantic similarity'][0])
        print('context-'+cht_text+' similarity: ', scores['semantic similarity'][1])
        print(cht_text+'-'+lbl_text+' similarity:   ', scores['semantic similarity'][2])
        print('ccl-sim similarity:            ', scores['semantic similarity'][3])
    # 1) computes metrics for perplexity
    if encoded_test_set is not None:
        metric = BBMetric.load_metric("perplexity")
        if not label_chatbot_symmetry:
            scores['perplexity'] = metric.compute(model=model, encoded_test_set=encoded_test_set)['score']
            scores['metadata']['perplexity'] = {
                'ordering': cht_text
            }
        else:
            scores['perplexity'] = [metric.compute(model=model, encoded_test_set=encoded_test_set)['score']]
            scores['perplexity'].append(metric.compute(model=model_2, encoded_test_set=encoded_test_set)['score'])
            scores['metadata']['perplexity'] = {
                'ordering': [lbl_text, cht_text]
            }
        if verbose:
            print('===       PERPLEXITY     ===')
            if label_chatbot_symmetry:
                print(lbl_text + ' perplexity:         ', scores['perplexity'][0])
                print(cht_text + ' perplexity:         ', scores['perplexity'][1])
            else:
                print(cht_text + ' perplexity:         ', scores['perplexity'])
    elif verbose:
        print("encoded_test_set not provided, skipping Perplexity.")
    # 2) computes metrics for bleu
    metric = BBMetric.load_metric("bleu")
    scores['bleu'] = [metric.compute(predictions=label_responses, references=context_sentences)]
    scores['bleu'].append(metric.compute(predictions=chatbot_responses, references=context_sentences))
    scores['bleu'].append(metric.compute(predictions=chatbot_responses, references=label_responses))
    scores['bleu'].append(ccl_sim(scores['bleu'][0]['score'],
                                  scores['bleu'][1]['score'],
                                  scores['bleu'][2]['score']))
    scores['metadata']['bleu'] = {
        'ordering': ['context-'+lbl_text, 'context-'+cht_text, cht_text+'-'+lbl_text, 'ccl']
    }
    if verbose:
        print('===         BLEU         ===')
        print('context-to-'+lbl_text+' bleu:      ', scores['bleu'][0])
        print('context-to-'+cht_text+' bleu:    ', scores['bleu'][1])
        print(lbl_text+'-to-'+cht_text+' bleu:      ', scores['bleu'][2])
        print('ccl-sim bleu:            ', scores['bleu'][3])
    # 3) computes metrics for rouge-L
    metric = BBMetric.load_metric("rouge l")
    scores['rouge l'] = [metric.compute(predictions=label_responses, references=context_sentences)]
    scores['rouge l'].append(metric.compute(predictions=chatbot_responses, references=context_sentences))
    scores['rouge l'].append(metric.compute(predictions=chatbot_responses, references=label_responses))
    scores['rouge l'].append(ccl_sim(scores['rouge l'][0]['score'],
                                     scores['rouge l'][1]['score'],
                                     scores['rouge l'][2]['score']))
    scores['metadata']['rouge l'] = {
        'ordering': ['context-'+lbl_text, 'context-'+cht_text, cht_text+'-'+lbl_text, 'ccl']
    }
    if verbose:
        print('===        ROUGE-L       ===')
        print('context-to-'+lbl_text+' rouge:     ', scores['rouge l'][0])
        print('context-to-'+cht_text+' rouge:   ', scores['rouge l'][1])
        print(lbl_text+'-to-'+cht_text+' rouge:     ', scores['rouge l'][2])
        print('ccl-sim rouge:            ', scores['rouge l'][3])
    # 4) computes metrics for distinct
    metric = BBMetric.load_metric("distinct")
    scores['distinct'] = [metric.compute(sentences=context_sentences)]
    scores['distinct'].append(metric.compute(sentences=label_responses))
    scores['distinct'].append(metric.compute(sentences=chatbot_responses))
    scores['metadata']['distinct'] = {
        'ordering': ['context', lbl_text, cht_text]
    }
    if verbose:
        print('===       DISTINCT      ===')
        print('context distinct:          ', scores['distinct'][0])
        print(lbl_text+' distinct:          ', scores['distinct'][1])
        print(cht_text+' distinct:          ', scores['distinct'][2])
        
    # 6) computes emotion metric
    metric = BBMetric.load_metric("emotion")
    scores['emotion'] = [metric.compute(sentences=context_sentences)]
    scores['emotion'].append(metric.compute(sentences=label_responses))
    scores['emotion'].append(metric.compute(sentences=chatbot_responses))
    # Pearson correlation between the label and chatbot emotion scores.
    scores['emotion'].append(scipy_stats.pearsonr(scores['emotion'][1]['score'],
                                                  scores['emotion'][2]['score'])[0])
    # NOTE(review): the first three values stored above are the emotions of the
    # context, the label and the chatbot, but the labels below describe pairs.
    # Left unchanged because consumers of the saved metadata may match on these
    # strings -- confirm and correct together with them.
    scores['metadata']['emotion'] = {
        'ordering': ['context-'+lbl_text, 'context-'+cht_text, cht_text+'-'+lbl_text, cht_text+'-'+lbl_text+' correlation']
    }
    if verbose:
        print('===       EMOTION       ===')
        print('context emotions:            \n', list(zip(scores['emotion'][0]['label'], scores['emotion'][0]['score'])))
        print(lbl_text+' emotions:              \n', list(zip(scores['emotion'][1]['label'], scores['emotion'][1]['score'])))
        print(cht_text+' emotions:            \n', list(zip(scores['emotion'][2]['label'], scores['emotion'][2]['score'])))
        print(lbl_text+'-'+cht_text+'emotion corr:  \n', scores['emotion'][3])
    # 8) computes sas metric
    metric = BBMetric.load_metric("semantic answer similarity")
    scores['semantic answer similarity'] = [metric.compute(predictions=context_sentences,
                                                    references=label_responses)]
    scores['semantic answer similarity'].append(metric.compute(predictions=context_sentences,
                                                        references=chatbot_responses))
    scores['semantic answer similarity'].append(metric.compute(predictions=label_responses,
                                                        references=chatbot_responses))
    scores['semantic answer similarity'].append(ccl_sim(scores['semantic answer similarity'][0]['score'],
                                                        scores['semantic answer similarity'][1]['score'],
                                                        scores['semantic answer similarity'][2]['score']))
    scores['metadata']['semantic answer similarity'] = {
        'ordering': ['context-'+lbl_text, 'context-'+cht_text, cht_text+'-'+lbl_text, 'ccl']
    }
    if verbose:
        print('===         SAS         ===')
        print('context-'+lbl_text+' sas:          ', scores['semantic answer similarity'][0])
        print('context-'+cht_text+' sas:        ', scores['semantic answer similarity'][1])
        print(lbl_text+'-'+cht_text+' sas:          ', scores['semantic answer similarity'][2])
        print('ccl-sim sas:               ', scores['semantic answer similarity'][3])
    # 9) computes metrics for semantic classifier
    metric = BBMetric.load_metric("semantic classifier")
    start_time = time.time()
    scores['semantic classifier'] = [metric.compute(character=character, character_dict=character_dict, 
                                                   base_folder=base_folder, sentences=label_responses,
                                                   n_sentences=classifier_n_sentences)]
    scores['semantic classifier'].append(metric.compute(character=character, character_dict=character_dict, 
                                                   base_folder=base_folder, sentences=chatbot_responses,
                                                   n_sentences=classifier_n_sentences))
    end_time = time.time()
    scores['metadata']['semantic classifier'] = {
        'ordering': [lbl_text, cht_text]
    }
    if verbose:
        print('=== SEMANTIC CLASSIFIER ===')
        print('sem-classifier '+lbl_text+':                ', scores['semantic classifier'][0])
        print('sem-classifier '+cht_text+':                  ', scores['semantic classifier'][1])
        print('time elapsed computing semantic classifier:  {:.2f} s'.format(end_time - start_time))
    # Human metrics: only in label-vs-chatbot mode and only when the character
    # has human-evaluation files. They are looked up under base_folder (like all
    # other project data) rather than the current working directory, which is a
    # different location when running on Colab.
    human_folder = None
    if not label_chatbot_symmetry:
        human_folder = os.path.join(base_folder, "Data", "Characters", character)
    if human_folder is not None and os.path.exists(os.path.join(human_folder, "humancoherence.csv")):
        scores['human'] = {}
        metric = BBMetric.load_metric("human - coherence")
        scores['human']['coherence'] = metric.compute(filepath=os.path.join(human_folder, "humancoherence.csv"))
        metric = BBMetric.load_metric("human - style")
        scores['human']['style'] = metric.compute(filepath=os.path.join(human_folder, "humanstyle.csv"))
        metric = BBMetric.load_metric("human - consistency")
        scores['human']['consistency'] = metric.compute(filepath=os.path.join(human_folder, "humanconsistency.csv"))
        scores['metadata']['human'] = {
            'ordering': {
                'coherence': cht_text,
                'consistency': cht_text,
                'style': cht_text
            }
        }
        if verbose:
            print('===    HUMAN METRICS    ===')
            print('coherence:                 ', scores['human']['coherence'])
            print('consistency:               ', scores['human']['consistency'])
            print('style:                     ', scores['human']['style'])
    elif verbose:
        if label_chatbot_symmetry:
            print("Symmetric mode, skipping Human metrics.")
        else:
            # Previously reported as "Symmetric mode" even though the files were just missing.
            print("Human evaluation files not found, skipping Human metrics.")
    if include_qualitative_sentences:
        sentences_df = {}
        sentences_df['context'] = context_sentences
        sentences_df[lbl_text] = label_responses
        sentences_df[cht_text] = chatbot_responses
        scores['sentences'] = sentences_df
        if verbose:
            print('===      SENTENCES      ===')
            for i in range(len(context_sentences)):
                print("* context: ", context_sentences[i])
                print("* " + lbl_text + ":", label_responses[i])
                print("* " + cht_text + ":", chatbot_responses[i])
                print()
    elif verbose:
        print("Skipping sentence outputting.")
    return scores

In [18]:
"""
set_size = 10
i = 30
print("##### Set (Size " + str(set_size) + ") #####")
context_sentences = list(df_char['ctx'][i:i+set_size])
chatbot_responses = list(df_char['prd_greedy'][i:i+set_size])
label_responses   = list(df_char['lbl'][i:i+set_size])
compute_set_metrics(model, None,
                    context_sentences, label_responses, chatbot_responses, character, encoded_test_set)
"""

'\nset_size = 10\ni = 30\nprint("##### Set (Size " + str(set_size) + ") #####")\ncontext_sentences = list(df_char[\'ctx\'][i:i+set_size])\nchatbot_responses = list(df_char[\'prd_greedy\'][i:i+set_size])\nlabel_responses   = list(df_char[\'lbl\'][i:i+set_size])\ncompute_set_metrics(model, None,\n                    context_sentences, label_responses, chatbot_responses, character, encoded_test_set)\n'

In [19]:
print("##### Full Test Set #####")
context_sentences = list(df_char['ctx'])
chatbot_responses = list(df_char['prd_greedy'])
label_responses   = list(df_char['lbl'])
scores = compute_set_metrics(model, None,
                             character, None, character + " dataset",
                             context_sentences, label_responses, chatbot_responses, encoded_test_set,
                             classifier_n_sentences=75)

##### Full Test Set #####
=== SEMANTIC SIMILARITY ===
context-label similarity:    {'score': 0.2628566026687622, 'std': 0.12330655008554459}
context-chatbot similarity:  {'score': 0.5519678592681885, 'std': 0.2759278416633606}
chatbot-label similarity:    {'score': 0.2198188602924347, 'std': 0.15027783811092377}
ccl-sim similarity:             0.27684156841695584


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:05<00:00,  2.75s/it]


===       PERPLEXITY     ===
chatbot perplexity:          20.615766981008772
===         BLEU         ===
context-to-label bleu:       {'score': 0.0}
context-to-chatbot bleu:     {'score': 0.0752434444398826}
label-to-chatbot bleu:       {'score': 0.0}
ccl-sim bleu:             0.4275873435257062
===        ROUGE-L       ===
context-to-label rouge:      {'score': 0.07925918737060042, 'std': 0.07479345524211507}
context-to-chatbot rouge:    {'score': 0.2783069336058467, 'std': 0.32252310998200084}
label-to-chatbot rouge:      {'score': 0.05853002070393374, 'std': 0.0998893999786502}
ccl-sim rouge:             0.3224751380672207
===       DISTINCT      ===
context distinct:           {'score': -1.8850444702591331, 'std': 0.04780861208267612}
label distinct:           {'score': -1.860529383989936, 'std': 0.02676100873863563}
chatbot distinct:           {'score': -1.9147244626716424, 'std': 0.061163652368743125}
===       EMOTION       ===
context emotions:            
 [('sadness', 0.0377

In [20]:
# Persist the full-test-set scores as "<character>_base_metrics" in the metrics folder.
save_as_json(metrics_folder, f'{character}_base_metrics', scores)

# Metrics Between Different Sampling Methods

In [21]:
# Start from an empty container; each sampling comparison below adds one entry to it.
scores = dict()

In [22]:
# Controls the split workflow of this section: when True, each sampling comparison is
# saved to its own JSON file right after it is computed, and those files are later
# reloaded, merged into `scores` and removed.
split = True

In [23]:
print("##### Greedy vs. N-Beams #####")
context_sentences = list(df_char['ctx'])
greedy_responses  = list(df_char['prd_greedy'])
nbeams_responses  = list(df_char['prd_nbeams'])
scores['greedy_vs_nbeams'] = compute_set_metrics(None, None,
                                                 character, character, character + " dataset",
                                                 context_sentences,
                                                 greedy_responses,
                                                 nbeams_responses,
                                                 None,
                                                 classifier_n_sentences=75, label_chatbot_symmetry=True)

##### Greedy vs. N-Beams #####
=== SEMANTIC SIMILARITY ===
context-chatbota similarity:    {'score': 0.5519678592681885, 'std': 0.2759278416633606}
context-chatbotb similarity:  {'score': 0.5519678592681885, 'std': 0.2759278416633606}
chatbotb-chatbota similarity:    {'score': 1.0, 'std': 1.8066697293761536e-07}
ccl-sim similarity:             1.0
encoded_test_set not provided, skipping Perplexity.
===         BLEU         ===
context-to-chatbota bleu:       {'score': 0.0752434444398826}
context-to-chatbotb bleu:     {'score': 0.0752434444398826}
chatbota-to-chatbotb bleu:       {'score': 1.0}
ccl-sim bleu:             1.0
===        ROUGE-L       ===
context-to-chatbota rouge:      {'score': 0.2783069336058467, 'std': 0.32252310998200084}
context-to-chatbotb rouge:    {'score': 0.2783069336058467, 'std': 0.32252310998200084}
chatbota-to-chatbotb rouge:      {'score': 1.0, 'std': 0.0}
ccl-sim rouge:             1.0
===       DISTINCT      ===
context distinct:           {'score': -1.88

In [24]:
# In split mode, write this comparison to its own JSON file as soon as it is computed.
# `split` is a plain boolean flag, so test its truthiness instead of comparing to True.
if split:
    save_as_json(metrics_folder, character+'_greedy_vs_nbeams_metrics', scores['greedy_vs_nbeams'])

In [25]:
print("##### Greedy vs. Sampling #####")
context_sentences   = list(df_char['ctx'])
greedy_responses    = list(df_char['prd_greedy'])
sampling_responses  = list(df_char['prd_sampling'])
scores['greedy_vs_sampling'] = compute_set_metrics(None, None,
                                                   character, character, character + " dataset",
                                                   context_sentences,
                                                   greedy_responses,
                                                   sampling_responses,
                                                   None,
                                                   classifier_n_sentences=75, label_chatbot_symmetry=True)

##### Greedy vs. Sampling #####
=== SEMANTIC SIMILARITY ===
context-chatbota similarity:    {'score': 0.5519678592681885, 'std': 0.2759278416633606}
context-chatbotb similarity:  {'score': 0.3545387387275696, 'std': 0.21952050924301147}
chatbotb-chatbota similarity:    {'score': 0.3163492679595947, 'std': 0.1475856751203537}
ccl-sim similarity:             0.37209843794738795
encoded_test_set not provided, skipping Perplexity.
===         BLEU         ===
context-to-chatbota bleu:       {'score': 0.0752434444398826}
context-to-chatbotb bleu:     {'score': 0.0}
chatbota-to-chatbotb bleu:       {'score': 0.0}
ccl-sim bleu:             0.4275873435257062
===        ROUGE-L       ===
context-to-chatbota rouge:      {'score': 0.2783069336058467, 'std': 0.32252310998200084}
context-to-chatbotb rouge:    {'score': 0.14650645744573537, 'std': 0.23855587781933377}
chatbota-to-chatbotb rouge:      {'score': 0.06841869026172095, 'std': 0.09385208013151991}
ccl-sim rouge:             0.37922576518

In [26]:
# In split mode, write this comparison to its own JSON file as soon as it is computed.
# `split` is a plain boolean flag, so test its truthiness instead of comparing to True.
if split:
    save_as_json(metrics_folder, character+'_greedy_vs_sampling_metrics', scores['greedy_vs_sampling'])

In [27]:
print("##### N-Beams vs. Sampling #####")
context_sentences   = list(df_char['ctx'])
nbeams_responses    = list(df_char['prd_nbeams'])
sampling_responses  = list(df_char['prd_sampling'])
scores['nbeams_vs_sampling'] = compute_set_metrics(None, None,
                                                   character, character, character + " dataset",
                                                   context_sentences,
                                                   nbeams_responses,
                                                   sampling_responses,
                                                   None,
                                                   classifier_n_sentences=75, label_chatbot_symmetry=True)

##### N-Beams vs. Sampling #####
=== SEMANTIC SIMILARITY ===
context-chatbota similarity:    {'score': 0.5519678592681885, 'std': 0.2759278416633606}
context-chatbotb similarity:  {'score': 0.3545387387275696, 'std': 0.21952050924301147}
chatbotb-chatbota similarity:    {'score': 0.3163492679595947, 'std': 0.1475856751203537}
ccl-sim similarity:             0.37209843794738795
encoded_test_set not provided, skipping Perplexity.
===         BLEU         ===
context-to-chatbota bleu:       {'score': 0.0752434444398826}
context-to-chatbotb bleu:     {'score': 0.0}
chatbota-to-chatbotb bleu:       {'score': 0.0}
ccl-sim bleu:             0.4275873435257062
===        ROUGE-L       ===
context-to-chatbota rouge:      {'score': 0.2783069336058467, 'std': 0.32252310998200084}
context-to-chatbotb rouge:    {'score': 0.14650645744573537, 'std': 0.23855587781933377}
chatbota-to-chatbotb rouge:      {'score': 0.06841869026172095, 'std': 0.09385208013151991}
ccl-sim rouge:             0.3792257651

In [28]:
# In split mode, write this comparison to its own JSON file as soon as it is computed.
# `split` is a plain boolean flag, so test its truthiness instead of comparing to True.
if split:
    save_as_json(metrics_folder, character+'_nbeams_vs_sampling_metrics', scores['nbeams_vs_sampling'])

In [29]:
# In split mode, rebuild `scores` from the per-comparison JSON files and then delete
# those files (the merged dict is what gets saved afterwards).
if split:
    comparison_names = ('greedy_vs_nbeams', 'greedy_vs_sampling', 'nbeams_vs_sampling')

    # Load every file before removing any, so that a failed load leaves all the
    # per-comparison files (and the current `scores`) untouched.
    scores = {
        comparison: load_from_json(
            filepath=metrics_folder,
            filename=character + '_' + comparison + '_metrics'
        )
        for comparison in comparison_names
    }

    for comparison in comparison_names:
        os.remove(os.path.join(
            metrics_folder,
            character + '_' + comparison + '_metrics.json'
        ))

In [30]:
# Persist all sampling-method comparisons together as "<character>_sampling_comparison_metrics".
save_as_json(metrics_folder, f'{character}_sampling_comparison_metrics', scores)

# Metrics Between Character vs Non-Finetuned

In [31]:
# Sampling predictions of `model_def` on the sample questions, read from / written to a
# per-character cache file inside the Default character folder.
predictions_def_sampling = get_predictions_cached(
    sample_questions,
    model_def,
    os.path.join(in_folder_def, 'from_' + character + '_df_' + '_sampling.json'),
    "Sampling",
    override_predictions=override_predictions,
)

Loading predictions from stored file
Loaded predictions from stored file


In [32]:
# Build the metrics frame for the default model; only sampling predictions are supplied,
# the two other prediction slots are left as None.
df_char_def = get_dataframe_for_metrics(
    character_hg['test'],
    None,
    None,
    predictions_def_sampling,
)

100%|████████████████████████████████████████████████████████████████████████████████| 16/16 [00:00<00:00, 1776.07it/s]


In [33]:
"""
for i in range(1):
    print("##### Sample " + str(i+1) + " #####")
    context_sentence   = df_char['ctx'][i]
    character_response = df_char['prd_sampling'][i]
    default_response   = df_char_def['prd_sampling'][i]
    compute_sample_metrics(context_sentence, default_response, character_response, label_chatbot_symmetry=True)
    print()
"""

'\nfor i in range(1):\n    print("##### Sample " + str(i+1) + " #####")\n    context_sentence   = df_char[\'ctx\'][i]\n    character_response = df_char[\'prd_sampling\'][i]\n    default_response   = df_char_def[\'prd_sampling\'][i]\n    compute_sample_metrics(context_sentence, default_response, character_response, label_chatbot_symmetry=True)\n    print()\n'

In [34]:
"""
set_size = 50
i = 30
print("##### Set (Size " + str(set_size) + ") #####")
context_sentences   = list(df_char['ctx'][i:i+set_size])
character_responses = list(df_char['prd_sampling'][i:i+set_size])
default_responses   = list(df_char_def['prd_sampling'][i:i+set_size])
compute_set_metrics(None, None,
                    context_sentences, default_responses, character_responses, character, label_chatbot_symmetry=True)
"""

'\nset_size = 50\ni = 30\nprint("##### Set (Size " + str(set_size) + ") #####")\ncontext_sentences   = list(df_char[\'ctx\'][i:i+set_size])\ncharacter_responses = list(df_char[\'prd_sampling\'][i:i+set_size])\ndefault_responses   = list(df_char_def[\'prd_sampling\'][i:i+set_size])\ncompute_set_metrics(None, None,\n                    context_sentences, default_responses, character_responses, character, label_chatbot_symmetry=True)\n'

In [35]:
print("##### Full Test Set #####")
context_sentences   = list(df_char['ctx'])
character_responses = list(df_char['prd_sampling'])
default_responses   = list(df_char_def['prd_sampling'])
scores = compute_set_metrics(model, model_def, character, 'Default', character + " dataset",
                             context_sentences, 
                             character_responses, 
                             default_responses,
                             encoded_test_set,
                             classifier_n_sentences=75,
                             label_chatbot_symmetry=True)

##### Full Test Set #####
=== SEMANTIC SIMILARITY ===
context-chatbota similarity:    {'score': 0.3545387387275696, 'std': 0.21952050924301147}
context-chatbotb similarity:  {'score': 0.33069825172424316, 'std': 0.18615958094596863}
chatbotb-chatbota similarity:    {'score': 0.13831575214862823, 'std': 0.07600176334381104}
ccl-sim similarity:             0.48600932105317185


100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  4.78it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:03<00:00,  1.57s/it]


===       PERPLEXITY     ===
chatbota perplexity:          20.615766981008772
chatbotb perplexity:          618.6860775727047
===         BLEU         ===
context-to-chatbota bleu:       {'score': 0.0}
context-to-chatbotb bleu:     {'score': 0.0}
chatbota-to-chatbotb bleu:       {'score': 0.0}
ccl-sim bleu:             0.5
===        ROUGE-L       ===
context-to-chatbota rouge:      {'score': 0.14650645744573537, 'std': 0.23855587781933377}
context-to-chatbotb rouge:    {'score': 0.11526831253756564, 'std': 0.1510622681644724}
chatbota-to-chatbotb rouge:      {'score': 0.04968081435472739, 'std': 0.0746656093146303}
ccl-sim rouge:             0.4704838575979566
===       DISTINCT      ===
context distinct:           {'score': -1.8850444702591331, 'std': 0.04780861208267612}
chatbota distinct:           {'score': -1.8858537217912217, 'std': 0.05431507639059038}
chatbotb distinct:           {'score': -1.84711077546383, 'std': 0.04474986771806541}
===       EMOTION       ===
context emoti

In [36]:
# Persist the character-vs-default scores as "<character>_vs_nonfinetuned_metrics".
save_as_json(metrics_folder, f'{character}_vs_nonfinetuned_metrics', scores)

# Metrics Between Character 1 & Character 2

In [37]:
def get_predictions_small(sample_questions, model, generation_method, max_sampling_attempts=100):
    """Generate one reply per question with `model`, without any caching.

    Each question is encoded with the notebook-level `tokenizer` (EOS appended), fed to
    `model.generate`, and the prompt tokens are stripped from the output so that every
    prediction contains only the newly generated token ids.

    Args:
        sample_questions: iterable of question strings.
        model: object exposing `generate`; its outputs are read through `.numpy()`
            (presumably a TF causal language model - TODO confirm).
        generation_method: one of "Greedy", "Beam Search" or "Sampling". Beam search
            reads the notebook-level `n_beams`; sampling reads `top_k` and `top_p`.
        max_sampling_attempts: how many times sampling is retried for a question
            before giving up and using the fixed fallback reply 'hi' + EOS.

    Returns:
        A list with one list of token ids per question.

    Raises:
        ValueError: if `generation_method` is not one of the supported names.
        AssertionError: if a reply contains fewer than two tokens.
    """
    supported_methods = ("Greedy", "Beam Search", "Sampling")
    if generation_method not in supported_methods:
        # Previously an unknown method fell through and reused an unbound/stale answer.
        raise ValueError("Unknown generation_method %r, expected one of %r"
                         % (generation_method, supported_methods))

    def _generate_reply(tokenized_question, **extra_kwargs):
        # Run one generation and keep only the tokens that follow the prompt.
        prompt_length = tokenized_question.shape[1]
        generated = model.generate(tokenized_question,
                                   pad_token_id=tokenizer.eos_token_id,
                                   max_length=128 + prompt_length,
                                   **extra_kwargs)[0].numpy().tolist()
        return generated[prompt_length:]

    print("Creating predictions")
    predictions = list()
    for x in tqdm(sample_questions):
        tokenized_question = tokenizer.encode(x + tokenizer.eos_token, return_tensors='tf')
        if generation_method == "Greedy":
            answer = _generate_reply(tokenized_question)
        elif generation_method == "Beam Search":
            # `generate` takes the beam width as `num_beams` (it was passed as `n_beams`).
            answer = _generate_reply(tokenized_question, num_beams=n_beams)
        else:
            # Sampling may return an (almost) empty reply: retry a bounded number of times.
            for _ in range(max_sampling_attempts):
                answer = _generate_reply(tokenized_question,
                                         do_sample=True, top_k=top_k, top_p=top_p)
                if len(answer) > 1:
                    break
            else:
                # Every attempt was too short: fall back to a fixed minimal reply.
                answer = tokenizer.encode('hi') + [tokenizer.eos_token_id]

        # Only the newest reply needs checking; earlier ones were validated when added.
        assert len(answer) > 1, "generated reply has fewer than two tokens"
        predictions.append(answer)

    return predictions

In [48]:
# Load the common CSV dataset and drop its 'source' column.
df_common = load_dataset(
    'csv',
    data_files=os.path.join(base_folder, 'Data', 'common_dataset.csv'),
    cache_dir=os.path.join(base_folder, "cache"),
).remove_columns(['source'])

# Tokenize the 'train' split one example at a time.
tokenized_common_hg = df_common['train'].map(preprocess_function, batched=False)

# Wrap the tokenized split as an unshuffled, batched TensorFlow dataset.
encoded_common_set = tokenized_common_hg.to_tf_dataset(
    columns=["input_ids", "attention_mask", "labels"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

Using custom data configuration default-2caa0e9478f457ce


Downloading and preparing dataset csv/default to C:\Users\david\Documents\unibo\natural_language_processing\project\BarneyBot\cache\csv\default-2caa0e9478f457ce\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to C:\Users\david\Documents\unibo\natural_language_processing\project\BarneyBot\cache\csv\default-2caa0e9478f457ce\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

0ex [00:00, ?ex/s]

In [49]:
# Display the loaded dataset (its splits, feature names and row count).
df_common

DatasetDict({
    train: Dataset({
        features: ['label', 'context/0'],
        num_rows: 35
    })
})

In [50]:
# Display the encoded dataset object (its repr lists the element spec).
encoded_common_set

<PrefetchDataset element_spec={'input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'labels': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}>

In [51]:
# Sampled replies of the first character's model to the common-dataset contexts.
predictions_1_sampling = get_predictions_small(
    sample_questions=df_common['train']['context/0'],
    model=model,
    generation_method="Sampling",
)

Creating predictions


100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [00:46<00:00,  1.33s/it]


In [42]:
# Sampled replies of the second character's model to the same common-dataset contexts.
predictions_2_sampling = get_predictions_small(
    sample_questions=df_common['train']['context/0'],
    model=model_2,
    generation_method="Sampling",
)

Creating predictions


100%|██████████████████████████████████████████████████████████████████████████████████| 35/35 [00:26<00:00,  1.31it/s]


In [43]:
# Build one metrics frame per character from the common dataset; only the sampling
# predictions are supplied, the two other prediction slots are left as None.
df_common_char_1, df_common_char_2 = (
    get_dataframe_for_metrics(df_common['train'], None, None, sampled_predictions)
    for sampled_predictions in (predictions_1_sampling, predictions_2_sampling)
)

100%|████████████████████████████████████████████████████████████████████████████████| 35/35 [00:00<00:00, 3496.67it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 35/35 [00:00<00:00, 3496.92it/s]


In [44]:
print("##### " + character + "  Vs. " + character_2 + " #####")
context_sentences   = list(df_common_char_1['ctx'])
chatbot_responses   = list(df_common_char_1['prd_sampling'])
chatbot_2_responses = list(df_common_char_2['prd_sampling'])
scores = compute_set_metrics(model, model_2, character, character_2, "common small dataset",
                             context_sentences, chatbot_responses, chatbot_2_responses, encoded_common_set,
                             include_qualitative_sentences=True, label_chatbot_symmetry=True)

##### Vader  Vs. Harry #####
=== SEMANTIC SIMILARITY ===
context-chatbota similarity:    {'score': 0.22928862273693085, 'std': 0.19413906335830688}
context-chatbotb similarity:  {'score': 0.26278918981552124, 'std': 0.13143767416477203}
chatbotb-chatbota similarity:    {'score': 0.265611469745636, 'std': 0.13768655061721802}
ccl-sim similarity:             0.5023353033489216


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00,  5.40it/s]
100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.32it/s]


===       PERPLEXITY     ===
chatbota perplexity:          57.95086184069064
chatbotb perplexity:          43.05399126504585
===         BLEU         ===
context-to-chatbota bleu:       {'score': 0.0}
context-to-chatbotb bleu:     {'score': 0.0}
chatbota-to-chatbotb bleu:       {'score': 0.0}
ccl-sim bleu:             0.5
===        ROUGE-L       ===
context-to-chatbota rouge:      {'score': 0.07366340634319406, 'std': 0.17719224809401155}
context-to-chatbotb rouge:    {'score': 0.054179676999004736, 'std': 0.08987718399234672}
chatbota-to-chatbotb rouge:      {'score': 0.06389015467962837, 'std': 0.09993900923630875}
ccl-sim rouge:             0.48274705444288296
===       DISTINCT      ===
context distinct:           {'score': -1.890156719517119, 'std': 0.061888269489336664}
chatbota distinct:           {'score': -1.8925175597818058, 'std': 0.05512533105673859}
chatbotb distinct:           {'score': -1.9070860012552835, 'std': 0.06746374618886293}
===       EMOTION       ===
context 

In [45]:
# Persist the two-character comparison as "<character>_vs_<character_2>_metrics".
save_as_json(metrics_folder, f'{character}_vs_{character_2}_metrics', scores)