In [1]:
# Detect Google Colab, mount Google Drive there, and prepare the in/out folders
import os
try:
    import google.colab  # imported only to detect the Colab runtime
    IN_COLAB = True
except ImportError:
    # Only a missing module means "not on Colab"; any other error should surface
    IN_COLAB = False
if IN_COLAB:
    from google.colab import drive
    drive.mount('/content/drive',force_remount=True)
    base_folder = '/content/drive/My Drive/unibo/NLP_project/BarneyBot'
    # Install the packages the notebook needs on a Colab runtime
    for pip_args in ("datasets", "transformers", "rouge_score", "-U sentence-transformers"):
        os.system("pip install " + pip_args)
else:
    base_folder = os.getcwd()

# Folder the cached predictions are read from / written to
in_folder = os.path.join(base_folder, "in")
os.makedirs(in_folder, exist_ok=True)
# Folder the model checkpoints are saved to
out_folder = os.path.join(base_folder, "out")
os.makedirs(out_folder, exist_ok=True)

In [2]:
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
import json

In [3]:
# File and folder names used for each supported character, keyed by character name
character_dict = {
    name: {
        'df_filename': name + '.csv',
        'prediction_filename': name.lower() + '_prediction.json',
        'checkpoint_folder': name.lower() + '_model',
    }
    for name in ('Barney', 'Sheldon')
}

In [4]:
# --- What to train ---
character = 'Barney' # 'Barney' | 'Sheldon'
model_name = 'microsoft/DialoGPT-small'

# --- Training hyper-parameters ---
epochs = 10
batch_size = 8

# --- Switches for the optional stages of the notebook ---
from_saved_weights = True      # resume from a checkpoint found in the out folder
do_fine_tuning = True          # run model.fit
do_chat = False                # run the interactive chat loops
override_predictions = False   # regenerate predictions even when a cached file exists

In [5]:
from datasets import load_dataset

def load_df(character, test_size=0.1, seed=42):
    """Load a character's CSV dataset and split it into train and test sets.

    The CSV is read from ``<base_folder>/Datasets/Characters/<character>/<character>.csv``.

    Args:
        character: character name; selects both the folder and the CSV file.
        test_size: fraction of rows placed in the test split.
        seed: seed for the split. A fixed seed keeps the test split identical
            across runs, so predictions cached on disk still line up with the
            references they are later compared against.

    Returns:
        The result of ``train_test_split`` on the loaded "train" split.
    """
    cache_folder = os.path.join(base_folder, "cache")
    os.environ["HF_DATASETS_CACHE"] = cache_folder

    dataset_path = os.path.join(base_folder, "Datasets", "Characters", character, character+'.csv')

    character_hg = load_dataset('csv',
                                data_files=dataset_path,
                                cache_dir=cache_folder)
    return character_hg["train"].train_test_split(test_size=test_size, seed=seed)

# Bot Loading & Fine-Tuning

In [7]:
# Checkpoint folders are named "<checkpoint_folder>_<n_epochs>"; this is the shared prefix
checkpoint_folder_template = f"{character_dict[character]['checkpoint_folder']}_"
checkpoint_folder_template

'barney_model_'

In [8]:
# Show which entries currently exist in the out folder
os.listdir(out_folder)

['barney_model_10', 'sheldon_model_10']

In [9]:
# Find the checkpoint with the highest epoch count for the selected character.
# n_start_epochs stays 0 (and from_saved_weights is switched off) when none exists.
n_start_epochs = 0
if from_saved_weights:
    try:
        out_folder_entries = os.listdir(out_folder)
    except OSError:
        # An unreadable/missing out folder is treated like "no checkpoint found"
        out_folder_entries = []

    # Map "<template><epochs>" folder names to their integer epoch count.
    # Comparing integers avoids the lexicographic trap where "_9" sorts after "_10".
    epochs_by_checkpoint = {}
    for entry in out_folder_entries:
        epoch_suffix = entry[len(checkpoint_folder_template):]
        if entry.startswith(checkpoint_folder_template) and epoch_suffix.isdigit():
            epochs_by_checkpoint[entry] = int(epoch_suffix)

    if epochs_by_checkpoint:
        latest_checkpoint = max(epochs_by_checkpoint, key=epochs_by_checkpoint.get)
        checkpoint_folder = os.path.join(out_folder, latest_checkpoint)
        n_start_epochs = epochs_by_checkpoint[latest_checkpoint]
    else:
        print(checkpoint_folder_template+'*', 'not found in out folder\n',
             'from_saved_weights set to False')
        from_saved_weights = False

n_start_epochs

10

In [10]:
# Display the flag again: the checkpoint lookup sets it to False when nothing is found
from_saved_weights

True

In [11]:
from transformers import TFAutoModelForCausalLM, AutoTokenizer

# Downloads from the hub are cached inside the project folder
hf_cache_dir = os.path.join(base_folder, "cache")

tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=hf_cache_dir)
if not from_saved_weights:
    # Start from the pretrained model
    model = TFAutoModelForCausalLM.from_pretrained(model_name, cache_dir=hf_cache_dir)
else:
    # Resume from the checkpoint folder selected earlier
    model = TFAutoModelForCausalLM.from_pretrained(checkpoint_folder)

All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at C:\Users\david\Documents\unibo\natural_language_processing\project\BarneyBot\out\barney_model_10.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [12]:
# Chat with the loaded model for a fixed number of turns (only when do_chat is True)
if do_chat:
    n_chat_turns = 10      # number of user/bot exchanges
    max_new_tokens = 128   # token budget for each generated answer
    for step in range(n_chat_turns):
        # encode the new user input, add the eos_token and return a tensor
        new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='tf')
        # append the new user input tokens to the chat history (there is no history on the first turn)
        bot_input_ids = tf.concat([chat_history_ids, new_user_input_ids], axis=-1) if step > 0 else new_user_input_ids
        # generate a response, limiting the current answer to max_new_tokens tokens
        # on top of whatever the running history already occupies
        max_length = max_new_tokens + bot_input_ids.shape[1]
        chat_history_ids = model.generate(bot_input_ids, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
        # pretty print the last output tokens from the bot (everything after the input)
        print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

In [13]:
def construct_conv(row, tokenizer, max_length=512):
    """Flatten one dataset row into a single fixed-length training sequence.

    The row's values are taken in reverse column order, tokenized, each
    terminated with the EOS token, and concatenated. The result is then
    right-padded with the id of '#' (attention mask 0) or truncated, so that
    it is exactly ``max_length`` tokens long. A truncated sequence always
    ends with the EOS token.

    Args:
        row: mapping of column name -> text for one example.
        tokenizer: callable tokenizer that also exposes ``encode`` and
            ``eos_token_id``.
        max_length: target sequence length; defaults to the previously
            hard-coded 512.

    Returns:
        The tokenizer output with flat ``input_ids`` and ``attention_mask``
        lists, plus ``labels`` holding a copy of ``input_ids``.
    """
    utterances = list(reversed(list(row.values())))
    model_inputs = tokenizer(utterances)
    pad_token_id = tokenizer.encode('#')[0]

    # Concatenate all utterances, closing each one with an attended EOS token
    input_ids = []
    attention_mask = []
    for ids, mask in zip(model_inputs['input_ids'], model_inputs['attention_mask']):
        input_ids.extend(ids)
        input_ids.append(tokenizer.eos_token_id)
        attention_mask.extend(mask)
        attention_mask.append(1)

    missing = max_length - len(input_ids)
    if missing > 0:
        # Too short: pad on the right; padded positions are not attended
        input_ids += [pad_token_id] * missing
        attention_mask += [0] * missing
    elif missing < 0:
        # Too long: keep exactly max_length tokens (slicing to max_length - 1
        # would leave truncated rows one token shorter than padded ones)
        input_ids = input_ids[:max_length]
        input_ids[-1] = tokenizer.eos_token_id
        attention_mask = attention_mask[:max_length]
        attention_mask[-1] = 1

    model_inputs['input_ids'] = input_ids
    model_inputs['attention_mask'] = attention_mask
    # Copy so that later edits to labels cannot silently change input_ids
    model_inputs["labels"] = list(input_ids)
    return model_inputs

def preprocess_function(examples):
    """Turn one dataset example into model inputs via ``construct_conv``.

    Also sets the module-level tokenizer's pad token to '#', the same
    character ``construct_conv`` uses for padding.
    """
    tokenizer.pad_token = '#'
    return construct_conv(examples, tokenizer)

In [14]:
# Load the train/test split for the selected character
character_hg = load_df(character)
# Tokenize one example at a time: preprocess_function expects a single row, not a batch
tokenized_character_hg = character_hg.map(preprocess_function, batched=False)

Using custom data configuration default-9133cea5b318789a
Reusing dataset csv (C:\Users\david\Documents\unibo\natural_language_processing\project\BarneyBot\cache\csv\default-9133cea5b318789a\0.0.0\433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

0ex [00:00, ?ex/s]

0ex [00:00, ?ex/s]

In [15]:
from transformers import DataCollatorForLanguageModeling

# Set the pad token explicitly before building the collator. Otherwise it is
# only set as a side effect of preprocess_function, which may not run at all
# (for example when the mapped dataset is served from the datasets cache).
# '#' matches the padding character used by construct_conv.
tokenizer.pad_token = '#'

# Causal language modelling (mlm=False), returning TensorFlow tensors
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer, return_tensors='tf')

print(tokenized_character_hg)

DatasetDict({
    train: Dataset({
        features: ['response', 'context', 'context/0', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 4465
    })
    test: Dataset({
        features: ['response', 'context', 'context/0', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 497
    })
})


In [16]:
def _as_tf_dataset(split_name, shuffle):
    """Convert one split of the tokenized dataset into a batched tf.data dataset."""
    return tokenized_character_hg[split_name].to_tf_dataset(
        columns=["input_ids", "attention_mask", "labels"],
        shuffle=shuffle,
        batch_size=batch_size,
        collate_fn=data_collator,
    )

# Only the training data is shuffled; the test data keeps its order
tf_train_set = _as_tf_dataset("train", shuffle=True)
tf_test_set = _as_tf_dataset("test", shuffle=False)

In [17]:
# List the GPUs TensorFlow can see (an empty list means none were found)
physical_devices = tf.config.list_physical_devices('GPU')
physical_devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [18]:
from transformers import AdamWeightDecay

# No loss is passed to compile(), so the model's internal loss computation is used
optimizer = AdamWeightDecay(learning_rate=2e-5)
model.compile(optimizer=optimizer)

# The model is always compiled; training itself is optional
if do_fine_tuning:
    model.fit(tf_train_set, validation_data=tf_test_set, epochs=epochs)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as keys in the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.




In [19]:
# Total epochs the current weights have been trained for. The epochs of this
# run only count when fine-tuning actually ran; otherwise the checkpoint saved
# below would be labelled with epochs that were never trained.
n_tot_epochs = n_start_epochs + (epochs if do_fine_tuning else 0)
n_tot_epochs

11

In [20]:
# Destination folder for the weights: "<out_folder>/<template><total epochs>"
checkpoint_folder = os.path.join(out_folder, f"{checkpoint_folder_template}{n_tot_epochs}")

In [21]:
# Save the model into the checkpoint folder computed above
model.save_pretrained(save_directory=checkpoint_folder)

In [22]:
# Let's chat for 3 lines
if do_chat:
    reply_token_budget = 64   # token budget for each generated answer
    for step in range(3):
        # encode the new user input, add the eos_token and return a tensor
        new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='tf')
        # append the new user input tokens to the chat history (there is no history on the first turn)
        bot_input_ids = tf.concat([chat_history_ids, new_user_input_ids], axis=-1) if step > 0 else new_user_input_ids
        # max_length counts the prompt too, so add the budget to the current input
        # length; a fixed total of 64 would leave no room once the history grows
        max_length = bot_input_ids.shape[-1] + reply_token_budget
        chat_history_ids = model.generate(bot_input_ids, max_length=max_length, pad_token_id=tokenizer.eos_token_id)
        # pretty print the last output tokens from the bot (everything after the input)
        print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

# Metrics

In [None]:
# Reference answers and their contexts, both taken from the test split
sample_references, sample_questions = (
    tokenized_character_hg["test"][column] for column in ('response', 'context')
)

In [None]:
# Name the cache "<stem>_<epochs><ext>" so the file keeps its extension
prediction_basename = character_dict[character]['prediction_filename']
prediction_stem, prediction_ext = os.path.splitext(prediction_basename)
filename = prediction_stem + '_' + str(n_tot_epochs) + prediction_ext
prediction_path = os.path.join(in_folder, filename)
# Earlier runs appended the epoch count after the extension ("<name>_<epochs>");
# keep reading those files so that existing caches are not silently ignored.
legacy_prediction_path = os.path.join(in_folder, prediction_basename + '_' + str(n_tot_epochs))
cached_prediction_path = next(
    (path for path in (prediction_path, legacy_prediction_path) if os.path.exists(path)),
    None,
)

if cached_prediction_path is not None and not override_predictions:
    # Reuse the predictions stored by a previous run
    with open(cached_prediction_path, 'r') as file:
        predictions = json.load(file)

else:
    predictions = list()
    for x in tqdm(sample_questions):
        tokenized_question = tokenizer.encode(x + tokenizer.eos_token, return_tensors='tf')
        # allow up to 128 generated tokens on top of the question itself
        max_length = 128 + tokenized_question.shape[1]
        generated_answer = model.generate(tokenized_question,
                                   pad_token_id=tokenizer.eos_token_id, max_length=max_length)[0].numpy().tolist()
        # keep only the generated tokens, dropping the echoed question
        predictions.append(generated_answer[len(tokenized_question[0]):])

    # Save predictions as a JSON file
    with open(prediction_path, 'w') as file:
        json.dump(predictions, file)

In [None]:
# Token ids of every reference answer (EOS appended); each entry keeps the
# leading batch dimension produced by return_tensors='tf'
labels = [
    tokenizer.encode(reference + tokenizer.eos_token, return_tensors='tf').numpy().tolist()
    for reference in tqdm(sample_references)
]

In [None]:
# Show the first test example: its context, the decoded bot prediction and the decoded reference
print("Context:\t", tokenized_character_hg["test"]['context'][0])
print("Bot Answer:\t", tokenizer.decode(predictions[0], skip_special_tokens=True))
# labels[0] is a one-element list, hence the extra [0]
print("Label:\t\t", tokenizer.decode(labels[0][0], skip_special_tokens=True))

# BLEU

In [None]:
import datasets
# Load the BLEU metric
# NOTE(review): load_metric is reportedly deprecated in newer `datasets` releases - confirm against the installed version
bleu_metric = datasets.load_metric('bleu')

In [None]:
# Print the metric's own description of the inputs it expects
print(bleu_metric.inputs_description)

# Sentence similarity

In [None]:
from sentence_transformers import CrossEncoder
# Cross-encoder used in the metrics loop to score pairs of sentences
model_ss = CrossEncoder('cross-encoder/stsb-roberta-large')

In [None]:
from statistics import harmonic_mean

## ROUGE-S

In [None]:
# Load the ROUGE metric
# NOTE(review): load_metric is reportedly deprecated in newer `datasets` releases - confirm against the installed version
rouge_metric = datasets.load_metric("rouge")

In [None]:
# Print the metric's own description of the inputs it expects
print(rouge_metric.inputs_description)

## Test metrics

In [None]:
# Compute every metric on the first few test examples and print them side by side.
# The context column is read once, instead of three times per iteration.
test_contexts = tokenized_character_hg["test"]['context']
# Never index past the shortest of the three sequences
n_metric_examples = min(3, len(test_contexts), len(predictions), len(labels))
for i in range(n_metric_examples):
    context_sentence = str(test_contexts[i])
    chatbot_response = str(tokenizer.decode(predictions[i], skip_special_tokens=True))
    label_response   = str(tokenizer.decode(labels[i][0], skip_special_tokens=True))
    # prints the sentences
    print('* context:', context_sentence) 
    print('* label:  ', label_response)
    print('* chatbot:', chatbot_response) 
    # 1) computes metrics for semantic similarity
    print('=== SEMANTIC SIMILARITY ===')
    # pair order: (context, label), (context, chatbot), (label, chatbot)
    similarity = model_ss.predict([(context_sentence, label_response),
                                   (context_sentence, chatbot_response), 
                                   (label_response, chatbot_response)])
    print('context-label similarity:  ', similarity[0])
    print('context-chatbot similarity:', similarity[1])
    print('label-chatbot   similarity:', similarity[2])
    print('---   Merged Metrics    ---')
    print('     script-similarity-quo:', (similarity[1] - similarity[0] + similarity[2] + 1) / 3)
    print('     script-similarity-abs:', abs(similarity[1]-similarity[2]))
    print('     script-similarity-hm: ', harmonic_mean((similarity[0], similarity[1], similarity[2])))
    print('#'*11)
    # 2) computes metrics for bleu (on token ids; labels[i] is the list of references)
    print('===        BLEU         ===')
    bleu_metric.add_batch(predictions=[predictions[i]], references=[labels[i]])
    bleu_scores = bleu_metric.compute()
    print('bleu:                      ', bleu_scores['bleu'])
    # 3) computes metrics for rouge (on the decoded sentences)
    # NOTE: the value printed is the ROUGE-1 F-measure, although the banner says ROUGE-S
    print('===       ROUGE-S       ===')
    rouge_metric.add_batch(predictions=[chatbot_response], references=[label_response])
    rouge_scores = rouge_metric.compute()
    print('rouge:                     ', rouge_scores["rouge1"].mid.fmeasure)
    print('#'*11)
    print()

In [None]:
#os.system('shutdown /h')