In [72]:
# File-naming table, one entry per supported character:
#   df_filename         -> name of the character's CSV file
#   prediction_filename -> base name of the stored predictions file
#   checkpoint_folder   -> base name of the model checkpoint folder
# Every name is derived from the character name (lower-cased for the last two).
character_dict = {
    name: {
        'df_filename': name + '.csv',
        'prediction_filename': name.lower() + '_prediction',
        'checkpoint_folder': name.lower() + '_model',
    }
    for name in ('Barney', 'Sheldon', 'Harry', 'Fry', 'Vader', 'Phoebe', 'Joey')
}

In [73]:
# --- Run configuration: every switch the rest of the notebook reads ---
model_name = 'microsoft/DialoGPT-small'  # pretrained model/tokenizer identifier passed to from_pretrained
character = 'Barney' # 'Barney' | 'Sheldon' | 'Harry' | 'Fry' | 'Vader' | 'Joey' | 'Phoebe' | 'Default'
from_saved_weights = True  # load a stored checkpoint instead of the base pretrained weights
from_n_epochs = 'last'  # 'last' | int : checkpoint to start from, identified by its number of trained epochs
do_chat = False  # run the interactive chat cells
do_fine_tuning = False  # call model.fit on the character dataset
override_predictions = False  # regenerate predictions even when a stored predictions file exists
batch_size = 8  # batch size of the tf.data train/test sets
epochs = 10  # epochs to train in this run (reset to 0 later when no fine-tuning is done)
using_cuda = False  # when True, the GPU is released through numba before the metrics section
using_dataset = True  # load and tokenize the character CSV
compute_metrics = True  # run the prediction and metrics cells

shutdown_at_end = False # 'h' -> hibernate | 's' -> shutdown | False -> do nothing at the end

In [74]:
# The 'Default' character is the plain pretrained model: no dataset, no stored
# checkpoint, no training and no metrics.
if character == 'Default':
    do_fine_tuning = from_saved_weights = using_dataset = compute_metrics = False
    from_n_epochs = 0

In [75]:
# Mount google drive (only when running on Colab) and resolve the working folders
import os
try:
    import google.colab  # imported only to detect whether we are running on Colab
    IN_COLAB = True
except ImportError:
    # A missing package is the only thing that means "not on Colab"; a bare
    # except would also swallow KeyboardInterrupt and genuine import failures.
    IN_COLAB = False
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
    os.system("pip install datasets")
    os.system("pip install transformers")
    os.system("pip install rouge_score")
    os.system("pip install -U sentence-transformers")
else:
    base_folder = os.getcwd()

# NOTE(review): these folders live under 'Data/Characters/<character>', while
# load_df reads the CSV from 'Datasets/Characters/<character>' -- confirm which
# of the two folder names is the current one.
in_folder = os.path.join(base_folder, 'Data', 'Characters', character)
# exist_ok avoids the check-then-create race of `if not exists: makedirs`
os.makedirs(in_folder, exist_ok=True)
out_folder = os.path.join(base_folder, 'Data', 'Characters', character)
os.makedirs(out_folder, exist_ok=True)

In [76]:
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
import json

In [77]:
from datasets import load_dataset

def load_df(character, seed=42):
    """Load a character's CSV as a Hugging Face dataset split into train and test.

    Args:
        character: character name; the file read is
            ``<base_folder>/Datasets/Characters/<character>/<character>.csv``.
        seed: seed forwarded to ``train_test_split``. The notebook stores
            predictions on disk and later pairs them by index with the test
            split, so the split has to be the same on every run.

    Returns:
        The result of ``train_test_split`` (90% train / 10% test), or ``None``
        when the global ``using_dataset`` flag is off.
    """
    if not using_dataset:
        return None
    cache_folder = os.path.join(base_folder, "cache")
    os.environ["HF_DATASETS_CACHE"] = cache_folder

    dataset_path = os.path.join(base_folder, "Datasets", "Characters", character, character+'.csv')

    character_hg = load_dataset('csv',
                                data_files=dataset_path,
                                cache_dir=cache_folder)
    # Seeded so that the test split does not change between kernel sessions
    character_hg = character_hg["train"].train_test_split(test_size=0.1, seed=seed)
    return character_hg

# Bot Loading & Fine-Tuning

In [78]:
# Prefix shared by this character's checkpoint folders; the number of trained
# epochs is appended to it to form the actual folder name.
checkpoint_folder_template = character_dict[character]['checkpoint_folder'] + '_'
checkpoint_folder_template

'barney_model_'

In [79]:
# Show what is currently stored in the character's output folder
os.listdir(out_folder)

['.ipynb_checkpoints',
 'Barney.csv',
 'barney_model_10',
 'barney_prediction_10.json',
 'barney_prediction_20.json',
 'Barney_preprocessing.ipynb',
 'HIMYM_preprocessed.csv']

In [80]:
# Number of epochs the loaded weights were already trained for (0 = base model)
n_start_epochs = 0

if from_saved_weights:
    try:
        if from_n_epochs == 'last':
            # Keep only directories named exactly "<template><digits>" and pair each
            # with its epoch count, so the newest one is chosen numerically
            # (a plain string sort would rank '..._5' after '..._10').
            checkpoint_names = [
                (int(d[len(checkpoint_folder_template):]), d)
                for d in os.listdir(out_folder)
                if d.startswith(checkpoint_folder_template)
                and d[len(checkpoint_folder_template):].isdecimal()
                and os.path.isdir(os.path.join(out_folder, d))
            ]
            # max() raises ValueError when no checkpoint folder was found
            checkpoint_folder = os.path.join(out_folder, max(checkpoint_names)[1])
        else:
            checkpoint_folder = os.path.join(out_folder, checkpoint_folder_template + str(from_n_epochs))

        # Fail here, with the fallback below, rather than later inside from_pretrained
        if not os.path.isdir(checkpoint_folder):
            raise FileNotFoundError(checkpoint_folder)

        n_epochs_idx = checkpoint_folder.rfind('_') + 1
        n_start_epochs = int(checkpoint_folder[n_epochs_idx:])
    except (OSError, ValueError):
        # No usable checkpoint: fall back to the base pretrained weights
        print(checkpoint_folder_template+'*', 'not found in out folder\n',
             'from_saved_weights set to False')
        from_saved_weights = False

print('n_start_epochs:', n_start_epochs)

n_start_epochs: 10


In [81]:
# Check whether a stored checkpoint is going to be used (the previous cell may have reset this flag)
from_saved_weights

True

In [82]:
from transformers import TFAutoModelForCausalLM, AutoTokenizer

# The tokenizer always comes from the base model; the weights come either from
# the selected checkpoint folder or from the base model.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=os.path.join(base_folder, "cache"))
model = (
    TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=checkpoint_folder)
    if from_saved_weights
    else TFAutoModelForCausalLM.from_pretrained(model_name, cache_dir=os.path.join(base_folder, "cache"))
)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at D:\University\Esami da Superare\Natural Language Processing\BarneyBot\BarneyBot\Datasets\Characters\Barney\barney_model_10.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [83]:
# Let's chat for n lines
# type 'exit' to end chat
if do_chat:
    step = 0
    while True:
        user_answ = input(">> User:")
        if user_answ == 'exit':
            # 'exit' is a command, not a message: stop before sending it to the model
            break
        # encode the new user input, add the eos_token and return a tensor
        new_user_input_ids = tokenizer.encode(user_answ + tokenizer.eos_token, return_tensors='tf')
        # append the new user input tokens to the chat history (no history on the first turn)
        bot_input_ids = tf.concat([chat_history_ids, new_user_input_ids], axis=-1) if step > 0 else new_user_input_ids
        step += 1
        # generate a response while limiting the current answer to 128 tokens
        max_length = 128 + bot_input_ids.shape[1]
        chat_history_ids = model.generate(bot_input_ids,
                                          max_length=max_length,
                                          pad_token_id=tokenizer.eos_token_id,
                                          do_sample = False)
        # pretty print the last output tokens from the bot (everything after the input)
        print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

In [84]:
def construct_conv(row, tokenizer, max_length=512):
    """Flatten one dataset row into a single fixed-length training example.

    The row's values are taken in reverse order, each one is tokenized and
    terminated with the EOS token, and the pieces are concatenated. The result
    is then made exactly ``max_length`` tokens long: shorter sequences are
    right-padded with the id of ``'#'`` (attention 0), longer ones are cut and
    their last token is forced to EOS.

    Args:
        row: mapping whose values are the texts of one example.
        tokenizer: object that is callable on a list of strings (returning
            ``input_ids`` and ``attention_mask`` lists) and that provides
            ``encode`` and ``eos_token_id``.
        max_length: length of the produced sequences (defaults to 512).

    Returns:
        The tokenizer output with flat ``input_ids`` and ``attention_mask``
        lists of length ``max_length``, plus ``labels`` (a copy of ``input_ids``).
    """
    turns = list(reversed(list(row.values())))
    model_inputs = tokenizer(turns)
    pad_token_id = tokenizer.encode('#')[0]

    # Concatenate every turn, each followed by an attended EOS token
    input_ids, attention_mask = [], []
    for turn_ids, turn_mask in zip(model_inputs['input_ids'], model_inputs['attention_mask']):
        input_ids.extend(turn_ids)
        input_ids.append(tokenizer.eos_token_id)
        attention_mask.extend(turn_mask)
        attention_mask.append(1)

    n_missing = max_length - len(input_ids)
    if n_missing > 0:
        input_ids += [pad_token_id] * n_missing
        attention_mask += [0] * n_missing
    elif n_missing < 0:
        # Cut to exactly max_length (not max_length - 1) so that truncated and
        # padded examples have the same length
        input_ids = input_ids[:max_length]
        input_ids[-1] = tokenizer.eos_token_id
        attention_mask = attention_mask[:max_length]
        attention_mask[-1] = 1

    model_inputs['input_ids'] = input_ids
    model_inputs['attention_mask'] = attention_mask
    # Separate list: editing the labels must not silently edit the inputs
    model_inputs["labels"] = list(input_ids)
    return model_inputs

def preprocess_function(examples):
    """Tokenize one dataset row with the notebook's global tokenizer.

    Sets '#' as the tokenizer's pad token, then delegates to ``construct_conv``
    and returns its result.
    """
    tokenizer.pad_token = '#'
    return construct_conv(examples, tokenizer)

In [85]:
# Load the train/test split of the character dataset and tokenize it
if using_dataset:
    character_hg = load_df(character)
    # batched=False: preprocess_function receives one row at a time
    tokenized_character_hg = character_hg.map(preprocess_function, batched=False)
    print(tokenized_character_hg)

Using custom data configuration default-8bedd3069c26fcac
Reusing dataset csv (D:\University\Esami da Superare\Natural Language Processing\BarneyBot\BarneyBot\cache\csv\default-8bedd3069c26fcac\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

DatasetDict({
    train: Dataset({
        features: ['response', 'context', 'context/0', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4674
    })
    test: Dataset({
        features: ['response', 'context', 'context/0', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 520
    })
})


In [86]:
from transformers import DataCollatorForLanguageModeling

# Collator used to assemble batches for the tf.data sets; mlm=False turns off
# masked-language-model masking, return_tensors='tf' yields TensorFlow tensors.
if using_dataset:
    data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer, return_tensors='tf')

In [87]:
# Turn both tokenized splits into batched tf.data sets; only the training split is shuffled
if using_dataset:
    tf_train_set, tf_test_set = (
        tokenized_character_hg[split_name].to_tf_dataset(
            columns=["input_ids", "attention_mask", "labels"],
            shuffle=shuffle_split,
            batch_size=batch_size,
            collate_fn=data_collator,
        )
        for split_name, shuffle_split in (("train", True), ("test", False))
    )

In [88]:
# List the GPUs visible to TensorFlow (an empty list means none was found)
physical_devices = tf.config.list_physical_devices('GPU')
physical_devices

[]

In [89]:
from transformers import AdamWeightDecay

# Compile the model whenever a dataset is in use; train it only on request
if using_dataset:
    model.compile(optimizer=AdamWeightDecay(learning_rate=2e-5))
if do_fine_tuning and using_dataset:
    model.fit(
        x=tf_train_set,
        validation_data=tf_test_set,
        epochs=epochs
    )
else:
    if do_fine_tuning:
        # Without a dataset there is no tf_train_set/tf_test_set and the model
        # was not compiled, so training cannot run: say so instead of crashing.
        print('do_fine_tuning ignored: using_dataset is False, no training data available')
    # Nothing was trained in this run: the epoch count of the weights is unchanged
    epochs = 0

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as keys in the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.


In [92]:
# Total training of the current weights: epochs of the loaded checkpoint plus
# the epochs trained in this run (0 when no fine-tuning was done)
n_tot_epochs = n_start_epochs + epochs
print("Number of epochs for model:", n_tot_epochs)

Number of epochs for model: 10


In [93]:
# Folder where the current weights are saved: "<template><total epochs>" inside the output folder
checkpoint_folder = os.path.join(out_folder, checkpoint_folder_template + str(n_tot_epochs))

In [94]:
# Store the current weights in the checkpoint folder.
# NOTE(review): this runs whenever using_dataset is set, even if no fine-tuning
# happened in this run -- confirm that re-saving unchanged weights is intended.
if using_dataset:
    model.save_pretrained(save_directory=checkpoint_folder)

In [95]:
# Let's chat for n lines
# type 'exit' to end chat
if do_chat:
    step = 0
    while True:
        user_answ = input(">> User:")
        if user_answ == 'exit':
            # 'exit' is a command, not a message: stop before sending it to the model
            break
        # encode the new user input, add the eos_token and return a tensor
        new_user_input_ids = tokenizer.encode(user_answ + tokenizer.eos_token, return_tensors='tf')
        # append the new user input tokens to the chat history (no history on the first turn)
        bot_input_ids = tf.concat([chat_history_ids, new_user_input_ids], axis=-1) if step > 0 else new_user_input_ids
        step += 1
        # generate a response while limiting the current answer to 128 tokens
        max_length = 128 + bot_input_ids.shape[1]
        chat_history_ids = model.generate(bot_input_ids,
                                          max_length=max_length,
                                          pad_token_id=tokenizer.eos_token_id,
                                          do_sample = False)
        # pretty print the last output tokens from the bot (everything after the input)
        print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

# Metrics

In [96]:
# Test-split columns used for evaluation: 'response' holds the reference
# answers, 'context' the inputs the bot is asked to answer.
if compute_metrics:
    sample_references = tokenized_character_hg["test"]['response']
    sample_questions = tokenized_character_hg["test"]['context']

In [98]:
# Predictions for the test contexts, stored per epoch count as
# "<prediction_filename>_<n_tot_epochs>.json" and reused unless overridden.
if compute_metrics:
    filename = character_dict[character]['prediction_filename'] + '_' + str(n_tot_epochs) + '.json'
    prediction_path = os.path.join(in_folder, filename)
    if override_predictions or not os.path.exists(prediction_path):
        print("Creating predictions")
        predictions = []
        for x in tqdm(sample_questions):
            tokenized_question = tokenizer.encode(x + tokenizer.eos_token, return_tensors='tf')
            # allow at most 128 generated tokens after the question
            max_length = 128 + tokenized_question.shape[1]
            generated_answer = model.generate(
                tokenized_question,
                pad_token_id=tokenizer.eos_token_id,
                max_length=max_length,
            )[0].numpy().tolist()
            # keep only the token ids that follow the question
            predictions.append(generated_answer[len(tokenized_question[0]):])

        # Save predictions as a JSON file
        with open(prediction_path, 'w') as file:
            file.write(json.dumps(predictions))
    else:
        print("Loading predictions from stored file")
        with open(prediction_path, 'r') as file:
            predictions = json.loads(file.read())
        print("Loaded predictions from stored file")

Loading predictions from stored file
Loaded predictions from stored file


In [99]:
# Tokenize the reference answers (each followed by EOS) and show the first example
if compute_metrics:
    labels = [
        tokenizer.encode(reference + tokenizer.eos_token, return_tensors='tf').numpy().tolist()
        for reference in tqdm(sample_references)
    ]
    print("Example sample")
    print("Context:\t", tokenized_character_hg["test"]['context'][0])
    print("Bot Answer:\t", tokenizer.decode(predictions[0], skip_special_tokens=True))
    # each label is a one-row nested list, hence the extra [0]
    print("Label:\t\t", tokenizer.decode(labels[0][0], skip_special_tokens=True))

100%|██████████████████████████████████████████████████████████████████████████████| 520/520 [00:00<00:00, 2624.83it/s]

Example sample
Context:	 I do. I The guy naked!
Bot Answer:	 No, I'm sorry, I'm just... I'm sorry.
Label:		 It's great! Why did you change your mind?





In [100]:
# Free GPU memory: when CUDA is in use, select device 0 through numba and call cuda.close()
if using_cuda:
    from numba import cuda
    cuda.select_device(0)
    cuda.close()

## Test metrics

In [101]:
from lib.BBMetrics import BBMetric
from statistics import harmonic_mean

def compute_sample_metrics(context_sentence, label_response, chatbot_response, verbose=True):
    """Compute the per-sample metrics for one (context, label, chatbot answer) triple.

    Pairwise metrics are evaluated on the pairs context-label, context-chatbot
    and label-chatbot, in this order.

    Args:
        context_sentence: the sentence the bot had to answer.
        label_response: the reference answer.
        chatbot_response: the answer produced by the bot.
        verbose: when True, print the sentences and every score.

    Returns:
        dict with the keys 'semantic similarity', 'bleu', 'rouge l',
        'semantic answer similarity', 'emotion' and 'distinct'.
    """
    scores = {}
    if verbose:
        # prints the sentences
        print('* context:', context_sentence)
        print('* label:  ', label_response)
        print('* chatbot:', chatbot_response)
    # 1) computes metrics for semantic similarity
    metric = BBMetric.load_metric("semantic similarity")
    scores['semantic similarity'] = metric.compute(sentences_a=[context_sentence, context_sentence, label_response],
                                                   sentences_b=[label_response, chatbot_response, chatbot_response])['scores']
    ss_scores = scores['semantic similarity']
    if verbose:
        try:
            ss_harmonic_mean = harmonic_mean((ss_scores[0], ss_scores[1], ss_scores[2]))
        except ValueError:
            # statistics.StatisticsError (a ValueError) is raised for negative
            # values; report "undefined" instead of aborting the evaluation
            ss_harmonic_mean = float('nan')
        print('=== SEMANTIC SIMILARITY ===')
        print('context-label similarity:   ', ss_scores[0])
        print('context-chatbot similarity: ', ss_scores[1])
        print('label-chatbot similarity:   ', ss_scores[2])
        print('> Merged Metrics')
        print('  script-similarity-quo:    ', (ss_scores[1] - ss_scores[0] + ss_scores[2] + 1) / 3)
        print('  script-similarity-abs:    ', abs(ss_scores[1]-ss_scores[2]))
        print('  script-similarity-hm:     ', ss_harmonic_mean)
        print('  script-similarity-sq:     ', ((ss_scores[1] - ss_scores[2])**2 + ss_scores[0])/2)
    # 2) computes metrics for bleu
    metric = BBMetric.load_metric("bleu")
    scores['bleu'] = metric.compute(predictions=chatbot_response, references=label_response)['score']
    if verbose:
        print('===        BLEU         ===')
        print('bleu:                       ', scores['bleu'])
    # 3) computes metrics for rouge-L
    metric = BBMetric.load_metric("rouge l")
    scores['rouge l'] = [metric.compute(predictions=context_sentence, references=label_response)['score']]
    scores['rouge l'].append(metric.compute(predictions=context_sentence, references=chatbot_response)['score'])
    scores['rouge l'].append(metric.compute(predictions=chatbot_response, references=label_response)['score'])
    if verbose:
        print('===       ROUGE-L       ===')
        print('context-label rouge:        ', scores['rouge l'][0])
        print('context-chatbot rouge:      ', scores['rouge l'][1])
        print('label-chatbot rouge:        ', scores['rouge l'][2])
    # 4) computes sas metric (stored under its own key so that it does not
    #    overwrite the semantic similarity scores computed in step 1)
    metric = BBMetric.load_metric("semantic answer similarity")
    scores['semantic answer similarity'] = metric.compute(predictions=[context_sentence, context_sentence, label_response],
                                                          references=[label_response, chatbot_response, chatbot_response])['scores']
    if verbose:
        print('===         SAS         ===')
        print('context-label sas:          ', scores['semantic answer similarity'][0])
        print('context-chatbot sas:        ', scores['semantic answer similarity'][1])
        print('label-chatbot sas:          ', scores['semantic answer similarity'][2])
    # 5) computes emotion metric
    metric = BBMetric.load_metric("emotion")
    emotions = metric.compute(sentences=[context_sentence, label_response, chatbot_response])
    # one (score, label) pair per sentence: context, label, chatbot
    scores['emotion'] = list(zip(emotions['scores'], emotions['labels']))
    if verbose:
        print('===       EMOTION       ===')
        print('context emotion:            ', scores['emotion'][0][1])
        print('label emotion:              ', scores['emotion'][1][1])
        print('chatbot emotion:            ', scores['emotion'][2][1])
    # 6) computes metrics for distinct
    metric = BBMetric.load_metric("distinct")
    scores['distinct'] = metric.compute(sentences=chatbot_response)['score']
    if verbose:
        print('===       DISTINCT      ===')
        print('distinct:                   ', scores['distinct'])
    # hand the collected scores back so that callers can use them, not only read the printout
    return scores

In [109]:
def compute_set_metrics(model, tokenizer, context_sentences, label_responses, chatbot_responses, verbose=True):
    """Compute the metrics that are evaluated over a whole set of samples.

    Args:
        model: model passed to the perplexity metric.
        tokenizer: tokenizer passed to the perplexity metric.
        context_sentences: the contexts of the set (only their count is used here).
        label_responses: the reference answers.
        chatbot_responses: the answers produced by the bot.
        verbose: when True, print the set size and every score.

    Returns:
        dict with the keys 'perplexity', 'bleu', 'rouge l' and 'distinct'.
    """
    scores = {}
    if verbose:
        # prints the sentences size
        print('* set size:', len(context_sentences))
    # 1) computes metrics for perplexity
    metric = BBMetric.load_metric("perplexity")
    if verbose:
        # header printed before the computation, which is the slow step
        print('===      PERPLEXITY     ===')
    scores['perplexity'] = metric.compute(model=model, tokenizer=tokenizer, sentences=chatbot_responses)['score']
    if verbose:
        print('perplexity:                 ', scores['perplexity'])
    # 2) computes metrics for bleu
    metric = BBMetric.load_metric("bleu")
    scores['bleu'] = metric.compute(predictions=chatbot_responses, references=label_responses)['score']
    if verbose:
        print('===        BLEU         ===')
        print('bleu:                       ', scores['bleu'])
    # 3) computes metrics for rouge-L
    metric = BBMetric.load_metric("rouge l")
    scores['rouge l'] = metric.compute(predictions=chatbot_responses, references=label_responses)['score']
    if verbose:
        print('===       ROUGE-L       ===')
        print('rouge:                      ', scores['rouge l'])
    # 4) computes metrics for distinct
    metric = BBMetric.load_metric("distinct")
    scores['distinct'] = metric.compute(sentences=chatbot_responses)['score']
    if verbose:
        print('===       DISTINCT      ===')
        print('distinct:                   ', scores['distinct'])
    # hand the collected scores back so that callers can use them, not only read the printout
    return scores

In [104]:
# Per-sample metrics for the first test example(s); raise the range to inspect more
if compute_metrics:
    for i in range(1):
        print("##### Set " + str(i+1) + " #####")
        context_sentence = str(tokenized_character_hg["test"]['context'][i])
        # predictions and labels hold token ids: decode them back to text
        chatbot_response = str(tokenizer.decode(predictions[i], skip_special_tokens=True))
        label_response   = str(tokenizer.decode(labels[i][0], skip_special_tokens=True))
        compute_sample_metrics(context_sentence, label_response, chatbot_response)
        print()

##### Set 1 #####
* context: I do. I The guy naked!
* label:   It's great! Why did you change your mind?
* chatbot: No, I'm sorry, I'm just... I'm sorry.
=== SEMANTIC SIMILARITY ===
context-label similarity:    0.009795298
context-chatbot similarity:  0.020441735
label-chatbot similarity:    0.01217396
> Merged Metrics
  script-similarity-quo:     0.34094013273715973
  script-similarity-abs:     0.008267775
  script-similarity-hm:      0.012867149825662717
  script-similarity-sq:      0.0049318269874715215
===        BLEU         ===
bleu:                        0.0
===       ROUGE-L       ===
context-label rouge:         0.0
context-chatbot rouge:       0.25
label-chatbot rouge:         0.0
===         SAS         ===
context-label sas:           0.18286088
context-chatbot sas:         0.283809
label-chatbot sas:           0.209447
===       EMOTION       ===
context emotion:             anger
label emotion:               joy
chatbot emotion:             sadness



In [111]:
# Set-level metrics over the first `set_size` test examples
if compute_metrics:
    set_size = 20  # assumes the test split and the predictions hold at least this many items
    print("##### Overall #####")
    context_sentences = [str(tokenized_character_hg["test"]['context'][i]) for i in range(set_size)]
    # predictions and labels hold token ids: decode them back to text
    chatbot_responses = [str(tokenizer.decode(predictions[i], skip_special_tokens=True)) for i in range(set_size)]
    label_responses   = [str(tokenizer.decode(labels[i][0], skip_special_tokens=True)) for i in range(set_size)]
    print(chatbot_responses)
    compute_set_metrics(model, tokenizer,
                        context_sentences, label_responses, chatbot_responses)

##### Overall #####
["No, I'm sorry, I'm just... I'm sorry.", "You're not going to believe this.", "I'm not sure.", "I'm not sure.", "What's the matter?", "I can't get married like this!", "I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry, I'm sorry,", "You're not going to be able to sleep with her.", "I'm sorry, I'm not sure I like the song.", "I'm sorry, I don't know.", "No, I'm fine.", "I'm sorry, I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorry. I'm sorr

 43%|████████████████████████████████████                                                | 3/7 [00:03<00:05,  1.32s/it]


KeyboardInterrupt: 

In [36]:
# Optionally power the machine down once the notebook is done: runs
# "shutdown /<flag>" with the flag stored in shutdown_at_end ('h' or 's').
# NOTE(review): the slash-style flag looks Windows-specific -- confirm before using elsewhere.
if shutdown_at_end:
    os.system('shutdown /' + shutdown_at_end)