# Loading

In [1]:
from Data.data_dicts import character_dict, source_dict, random_state

model_name = 'microsoft/DialoGPT-small'
character = 'Barney' # 'Barney' | 'Sheldon' | 'Harry' | 'Fry' | 'Vader' | 'Joey' | 'Phoebe' | 'Bender' | Default'
override_predictions = False

In [2]:
# Mount google drive
import os
try:
    import google.colab
    IN_COLAB = True
except:
    IN_COLAB = False
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
    os.system("pip install datasets")
    os.system("pip install transformers")
    os.system("pip install rouge_score")
    os.system("pip install -U sentence-transformers")
else:
    base_folder = os.getcwd()
    
in_folder = os.path.join(base_folder, 'Data', 'Characters', character)
if not os.path.exists(in_folder):
    os.makedirs(in_folder)
out_folder = os.path.join(base_folder, 'Data', 'Characters', character)
if not os.path.exists(out_folder):
    os.makedirs(out_folder)

In [3]:
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
import json

In [4]:
from datasets import load_dataset, DatasetDict

def load_df(character):
    os.environ["HF_DATASETS_CACHE"] = os.path.join(base_folder, "cache")

    dataset_path = os.path.join(base_folder, "Data", "Characters", character, character+'.csv')
    
    character_hg = load_dataset('csv', 
                                data_files=dataset_path, 
                                cache_dir=os.path.join(base_folder, "cache"))
    
    # 85% train / 10% test / 5% validation
    train_test_hg = character_hg['train'].train_test_split(test_size=0.15, seed=random_state)
    test_val = train_test_hg['test'].train_test_split(test_size=0.33, seed=random_state)
    
    
    character_hg = DatasetDict({
        'train': train_test_hg['train'],
        'test': test_val['train'],
        'val': test_val['test']
    })
    
    return character_hg

In [5]:
def construct_conv(row, tokenizer):
    MAX_LENGTH = 512
    row = list(reversed(list(row.values())))
    model_inputs = tokenizer(row)
    tokenizer_pad_token_id = tokenizer.encode('#')[0]
    for i in range(len(model_inputs['input_ids'])):
        model_inputs['input_ids'][i].append(tokenizer.eos_token_id)
        model_inputs['attention_mask'][i].append(1)
    model_inputs['input_ids'] = [item for sublist in model_inputs['input_ids'] for item in sublist]
    model_inputs['attention_mask'] = [item for sublist in model_inputs['attention_mask'] for item in sublist]
    if MAX_LENGTH > len(model_inputs['input_ids']):
        model_inputs['input_ids'] += [tokenizer_pad_token_id] * (MAX_LENGTH - len(model_inputs['input_ids']))
        model_inputs['attention_mask'] += [0] * (MAX_LENGTH - len(model_inputs['attention_mask']))
    elif MAX_LENGTH < len(model_inputs['input_ids']):
        model_inputs['input_ids'] = model_inputs['input_ids'][:MAX_LENGTH-1]
        model_inputs['input_ids'][-1] = tokenizer.eos_token_id
        model_inputs['attention_mask'] = model_inputs['attention_mask'][:MAX_LENGTH-1]
        model_inputs['attention_mask'][-1] = 1
    model_inputs["labels"] = model_inputs["input_ids"]
    return model_inputs

def preprocess_function(examples):
    tokenizer.pad_token = '#'
    model_inputs = construct_conv(examples, tokenizer)
    return model_inputs

In [6]:
character_hg = load_df(character)

Using custom data configuration default-1e2fb1391f2def74
Reusing dataset csv (D:\University\Esami da Superare\Natural Language Processing\BarneyBot\BarneyBot\cache\csv\default-1e2fb1391f2def74\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached split indices for dataset at D:\University\Esami da Superare\Natural Language Processing\BarneyBot\BarneyBot\cache\csv\default-1e2fb1391f2def74\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-e7caca9d101d9d1a.arrow and D:\University\Esami da Superare\Natural Language Processing\BarneyBot\BarneyBot\cache\csv\default-1e2fb1391f2def74\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-5bc4c5fa6633ec9b.arrow
Loading cached split indices for dataset at D:\University\Esami da Superare\Natural Language Processing\BarneyBot\BarneyBot\cache\csv\default-1e2fb1391f2def74\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-752558deb048626d.arrow and D:\University\Esami da Superare\Natural Language Processing\BarneyBot\BarneyBot\cache\csv\default-1e2fb1391f2def74\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519\cache-fc53bacd7ff03c4e.arrow


In [7]:
checkpoint_folder = os.path.join(out_folder, character_dict[character]['checkpoint_folder'])

In [8]:
from transformers import TFAutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=os.path.join(base_folder, "cache"))
model = TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=checkpoint_folder)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at D:\University\Esami da Superare\Natural Language Processing\BarneyBot\BarneyBot\Data\Characters\Barney\barney_model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


# Metrics Preparation

In [9]:
filename = character_dict[character]['prediction_filename'] + '.json'
prediction_path = os.path.join(in_folder, filename)
if os.path.exists(prediction_path) and not override_predictions:
    print("Loading predictions from stored file")
    with open(os.path.join(in_folder, filename), 'r') as file:
        json_string = file.read()
    predictions = json.loads(json_string)
    print("Loaded predictions from stored file")

else:
    print("Creating predictions")
    predictions = list()
    for x in tqdm(sample_questions):
        tokenized_question = tokenizer.encode(x + tokenizer.eos_token, return_tensors='tf')
        max_length = 128 + tokenized_question.shape[1]
        generated_answer = model.generate(tokenized_question,
                                   pad_token_id=tokenizer.eos_token_id, max_length=max_length)[0].numpy().tolist()
        predictions.append(generated_answer[len(tokenized_question[0]):])

    # Save predictions as a JSON file
    output_string = json.dumps(predictions)
    with open(os.path.join(in_folder, filename), 'w') as file:
        file.write(output_string)

Loading predictions from stored file
Loaded predictions from stored file


In [14]:
def getDataframeForMetrics(data_test, predictions):
    i = 0
    df = {'context':[], 'prediction':[], 'label':[],
          'context_tokenized':[], 'prediction_tokenized':[], 'label_tokenized':[]}
    for sample in tqdm(data_test):
        # encode the context and label sentences, add the eos_token and return a tensor
        ctx_tk = tokenizer.encode(sample['context'] + tokenizer.eos_token, return_tensors='tf').numpy().tolist()
        lbl_tk = tokenizer.encode(sample['response'] + tokenizer.eos_token, return_tensors='tf').numpy().tolist()
        ctb_tk = predictions[i]
        # decode the chatbot response's tokens
        ctx = sample['context']
        lbl = sample['response']
        ctb = tokenizer.decode(ctb_tk, skip_special_tokens=True)
        # add them in the chat
        df['context'].append(ctx)
        df['prediction'].append(ctb)
        df['label'].append(lbl)
        df['context_tokenized'].append(ctx_tk)
        df['prediction_tokenized'].append(ctb_tk)
        df['label_tokenized'].append(lbl_tk)
        i += 1
    return pd.DataFrame(data=df)

In [16]:
df_for_metrics = getDataframeForMetrics(character_hg['test'], predictions)
df_for_metrics

100%|██████████████████████████████████████████████████████████████████████████████| 522/522 [00:00<00:00, 1050.31it/s]


Unnamed: 0,context,prediction,label,context_tokenized,prediction_tokenized,label_tokenized
0,"I know, it's two years of my life I'm never ge...","Oh, God!",Daddy's home.,"[[40, 760, 11, 340, 338, 734, 812, 286, 616, 1...","[5812, 11, 1793, 0, 50256]","[[48280, 338, 1363, 13, 50256]]"
1,Wh-Where'd you get a meatball...,I don't know. I just saw a meatball sub.,I don't have much time!,"[[1199, 12, 8496, 1549, 345, 651, 257, 6174, 1...","[40, 836, 470, 760, 13, 314, 655, 2497, 257, 6...","[[40, 836, 470, 423, 881, 640, 0, 50256]]"
2,"Okay, what is so urgent that you called me and...","I'm sorry, I don't follow you.",I could tell you knew something was up with me...,"[[16454, 11, 644, 318, 523, 18039, 326, 345, 1...","[40, 1101, 7926, 11, 314, 836, 470, 1061, 345,...","[[40, 714, 1560, 345, 2993, 1223, 373, 510, 35..."
3,How much?,I have not decided. I want to get married in a...,A little.,"[[2437, 881, 30, 50256]]","[40, 423, 407, 3066, 13, 314, 765, 284, 651, 6...","[[32, 1310, 13, 50256]]"
4,You're being super nice. It's... freaking me o...,I'm not gross. I'm just... gross.,"I'm being Barney, and I think tonight's going ...","[[1639, 821, 852, 2208, 3621, 13, 632, 338, 98...","[40, 1101, 407, 10319, 13, 314, 1101, 655, 986...","[[40, 1101, 852, 41921, 11, 290, 314, 892, 997..."
...,...,...,...,...,...,...
517,"Okay, I want to lay down some ground rules for...","Oh, I know. I just want to be as awesome as sh...","Well, well, well. How rich. You make me promis...","[[16454, 11, 314, 765, 284, 3830, 866, 617, 23...","[5812, 11, 314, 760, 13, 314, 655, 765, 284, 3...","[[5779, 11, 880, 11, 880, 13, 1374, 5527, 13, ..."
518,It looks to be a... sacred... spa.,"I'm sorry, I don't follow you.",Owl. How do we go? We will do what? Jump?,"[[1026, 3073, 284, 307, 257, 986, 13626, 986, ...","[40, 1101, 7926, 11, 314, 836, 470, 1061, 345,...","[[46, 40989, 13, 1374, 466, 356, 467, 30, 775,..."
519,That's putting it a bit strongly.,I'm not going to put it in a little strong.,A bit strongly. She's not my girlfriend.,"[[2504, 338, 5137, 340, 257, 1643, 7634, 13, 5...","[40, 1101, 407, 1016, 284, 1234, 340, 287, 257...","[[32, 1643, 7634, 13, 1375, 338, 407, 616, 110..."
520,I do.,You're a good man.,I'm gonna head out to a reggae concert. I'm a ...,"[[40, 466, 13, 50256]]","[1639, 821, 257, 922, 582, 13, 50256]","[[40, 1101, 8066, 1182, 503, 284, 257, 842, 25..."


# Metrics

In [26]:
from Lib.BBMetrics import BBMetric
from statistics import harmonic_mean

def compute_sample_metrics(context_sentence, label_response, chatbot_response, verbose=True):
    scores = {}
    if verbose:
        # prints the sentences
        print('* context:', context_sentence) 
        print('* label:  ', label_response)
        print('* chatbot:', chatbot_response) 
    # 1) computes metrics for semantic similarity
    metric = BBMetric.load_metric("semantic similarity")
    scores['semantic similarity'] = [metric.compute(sentences_a=context_sentence,
                                                      sentences_b=label_response)['score']]
    scores['semantic similarity'].append(metric.compute(sentences_a=context_sentence,
                                                      sentences_b=chatbot_response)['score'])
    scores['semantic similarity'].append(metric.compute(sentences_a=label_response,
                                                      sentences_b=chatbot_response)['score'])
    ss_scores = scores['semantic similarity']
    if verbose:
        print('=== SEMANTIC SIMILARITY ===')
        print('context-label similarity:   ', scores['semantic similarity'][0])
        print('context-chatbot similarity: ', scores['semantic similarity'][1])
        print('label-chatbot similarity:   ', scores['semantic similarity'][2])
        print('> Merged Metrics')
        print('  script-similarity-quo:    ', (ss_scores[1] - ss_scores[0] + ss_scores[2] + 1) / 3)
        print('  script-similarity-abs:    ', abs(ss_scores[1]-ss_scores[2]))
        print('  script-similarity-hm:     ', harmonic_mean((ss_scores[0], ss_scores[1], ss_scores[2])))
        print('  script-similarity-sq:     ', ((ss_scores[1] - ss_scores[2])**2 + ss_scores[0])/2)
    # 2) computes metrics for bleu
    metric = BBMetric.load_metric("bleu")
    scores['bleu'] = metric.compute(predictions=chatbot_response, references=label_response)['score']
    if verbose:
        print('===        BLEU         ===')
        print('bleu:                       ', scores['bleu'])
    # 3) computes metrics for rouge-L
    metric = BBMetric.load_metric("rouge l")
    scores['rouge l'] = [metric.compute(predictions=context_sentence, references=label_response)['score']]
    scores['rouge l'].append(metric.compute(predictions=context_sentence, references=chatbot_response)['score'])
    scores['rouge l'].append(metric.compute(predictions=chatbot_response, references=label_response)['score'])
    if verbose:
        print('===       ROUGE-L       ===')
        print('context-label rouge:        ', scores['rouge l'][0])
        print('context-chatbot rouge:      ', scores['rouge l'][1])
        print('label-chatbot rouge:        ', scores['rouge l'][2])
    # 4) computes sas metric
    metric = BBMetric.load_metric("semantic answer similarity")
    scores['semantic similarity'] = [metric.compute(predictions=context_sentence,
                                                    references=label_response)['score']]
    scores['semantic similarity'].append(metric.compute(predictions=context_sentence,
                                                        references=chatbot_response)['score'])
    scores['semantic similarity'].append(metric.compute(predictions=label_response,
                                                        references=chatbot_response)['score'])
    if verbose:
        print('===         SAS         ===')
        print('context-label sas:          ', scores['semantic similarity'][0])
        print('context-chatbot sas:        ', scores['semantic similarity'][1])
        print('label-chatbot sas:          ', scores['semantic similarity'][2])
    # 5) computes emotion metric
    metric = BBMetric.load_metric("emotion")
    scores['emotion'] = [metric.compute(sentences=context_sentence)]
    scores['emotion'].append(metric.compute(sentences=label_response))
    scores['emotion'].append(metric.compute(sentences=chatbot_response))
    if verbose:
        print('===       EMOTION       ===')
        print('context emotions:            \n', list(zip(scores['emotion'][0]['label'], scores['emotion'][0]['score'])))
        print('label emotions:              \n', list(zip(scores['emotion'][1]['label'], scores['emotion'][1]['score'])))
        print('chatbot emotions:            \n', list(zip(scores['emotion'][2]['label'], scores['emotion'][2]['score'])))
    # 6) computes metrics for distinct
    metric = BBMetric.load_metric("distinct")
    scores['distinct'] = metric.compute(sentences=chatbot_response)['score']
    if verbose:
        print('===       DISTINCT      ===')
        print('distinct:                   ', scores['distinct'])

In [27]:
def compute_set_metrics(model, tokenizer, context_sentences, label_responses, chatbot_responses, verbose=True):
    scores = {}
    # 1) computes metrics for perplexity
    metric = BBMetric.load_metric("perplexity")
    if verbose:
        print('===      PERPLEXITY     ===')
    scores['perplexity'] = metric.compute(model=model, tokenizer=tokenizer, sentences=chatbot_responses)['score_concat']
    if verbose:
        print('perplexity:                 ', scores['perplexity'])
    # 2) computes metrics for bleu
    metric = BBMetric.load_metric("bleu")
    scores['bleu'] = metric.compute(predictions=chatbot_responses, references=label_responses)['score']
    if verbose:
        print('===        BLEU         ===')
        print('bleu:                       ', scores['bleu'])
    # 3) computes metrics for rouge-L
    metric = BBMetric.load_metric("rouge l")
    scores['rouge l'] = metric.compute(predictions=chatbot_responses, references=label_responses)['score']
    if verbose:
        print('===       ROUGE-L       ===')
        print('rouge:                      ', scores['rouge l'])
    # 4) computes metrics for distinct
    metric = BBMetric.load_metric("distinct")
    scores['distinct'] = metric.compute(sentences=chatbot_responses)['score']
    if verbose:
        print('===       DISTINCT      ===')
        print('distinct:                   ', scores['distinct'])

In [28]:
for i in range(1):
    print("##### Sample " + str(i+1) + " #####")
    context_sentence = df_for_metrics['context'][i]
    chatbot_response = df_for_metrics['prediction'][i]
    label_response   = df_for_metrics['label'][i]
    compute_sample_metrics(context_sentence, label_response, chatbot_response)
    print()

##### Sample 1 #####
* context: I know, it's two years of my life I'm never getting back. A little part of me just wants to jump the bones of the next guy I see.
* label:   Daddy's home.
* chatbot: Oh, God!
=== SEMANTIC SIMILARITY ===
context-label similarity:    0.009417437
context-chatbot similarity:  0.017228587
label-chatbot similarity:    0.008852758
> Merged Metrics
  script-similarity-quo:     0.3388879696528117
  script-similarity-abs:     0.008375828
  script-similarity-hm:      0.010822971059354132
  script-similarity-sq:      0.004743795757117874
===        BLEU         ===
bleu:                        0.0
===       ROUGE-L       ===
context-label rouge:         0.0588235294117647
context-chatbot rouge:       0.0
label-chatbot rouge:         0.0
===         SAS         ===
context-label sas:           0.1468764
context-chatbot sas:         0.18231258
label-chatbot sas:           0.21477439
===       EMOTION       ===


KeyError: 'label'

In [24]:
set_size = 20
print("##### Set (Size " + str(set_size) + ") #####")
context_sentences = list(df_for_metrics['context'][i:i+20])
chatbot_responses = list(df_for_metrics['prediction'][i:i+20])
label_responses   = list(df_for_metrics['label'][i:i+20])
print(chatbot_responses)
compute_set_metrics(model, tokenizer,
                    context_sentences, label_responses, chatbot_responses)

##### Set (Size 20) #####
['Oh, God!', "I don't know. I just saw a meatball sub.", "I'm sorry, I don't follow you.", 'I have not decided. I want to get married in a couple years.', "I'm not gross. I'm just... gross.", "I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry.", "I'm sorry.", "You're a good man.", "I'm sorry.", "I'm not kidding.", "I'm not going to be able to sleep. I'm going to be sick.", "I'm not sure I like this new look.", 'What?', "You're not allowed to enter the Vatican.", "I'm not sure.", "Oh, wait, wait. That's Professor Mosby's office.", 'I love you.', "I'm gonna crush it!", 'Hey.', "I'm sorry, I can't believe you're still trying to push me."]
===      PERPLEXITY     ===


100%|████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:21<00:00,  3.66s/it]


perplexity:                  53.72093300853435
===        BLEU         ===
bleu:                        0.0
===       ROUGE-L       ===
rouge:                       0.10481168913255934
===       DISTINCT      ===
distinct:                    0.0902332163840651


In [25]:
print("##### Full Test Set (Size " + str(set_size) + ") #####")
compute_set_metrics(model, tokenizer,
                    list(df_test['context']),
                    list(df_test['label']),
                    list(df_test['prediction']))

##### Full Test Set (Size 20) #####


Token indices sequence length is longer than the specified maximum sequence length for this model (7597 > 1024). Running this sequence through the model will result in indexing errors


===      PERPLEXITY     ===


 41%|█████████████████████████████████▎                                               | 49/119 [03:52<05:31,  4.73s/it]


KeyboardInterrupt: 