# Data Preparation, Library Loading & Parameters

In [1]:
### Run environment setup
import os
import lib.BBSetup as BBSetup

# Select the runtime: Google Colab when `google.colab` can be imported,
# the local Anaconda environment otherwise.
try:
    from google.colab import drive
except ImportError:
    # Only a missing `google.colab` package means "not running on Colab".
    # Any other failure is left to propagate instead of being silently swallowed.
    BBSetup.anaconda_setup(base_folder=r"E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBot",
                           env_name="barneybot")
else:
    # Kept out of the `try` body so an error raised by the Colab setup itself
    # is reported rather than masked by a fallback to the local setup.
    BBSetup.colab_setup(mount_folder=r"/content/drive/My Drive/unibo/NLP_project/BarneyBot")

### Define folders
# The base folder comes from the setup module; character data lives under <base>/Data/Characters
base_folder = BBSetup.BASE_FOLDER
characters_path = os.path.join(base_folder, 'Data', 'Characters')
out_folder = BBSetup.set_folder(characters_path)

pip install -r "E:\University\Esami da Superare\Natural Language Processing\BarneyBotGit\BarneyBotGit\requirements.txt"


In [None]:
# Import character dictionaries, useful to map a character to its data, and a fixed random seed
from lib.BBData import character_dict, source_dict, random_state, model_name
from lib.BBDataLoad import load_char_df, dialogpt_preprocess_function

# Training args
# `character` selects the entry of character_dict and the dataset to load;
# the value 'Default' disables fine-tuning, saved weights and the dataset (see the next cell).
character = 'Phoebe' # 'Barney' | 'Sheldon' | 'Harry' | 'Fry' | 'Vader' | 'Joey' | 'Phoebe' | 'Bender' | 'Default'
# True -> load the model from the character's checkpoint folder, False -> load the base `model_name` weights
from_saved_weights = True
# True -> run the interactive chat loops (before and after training)
do_chat = False
# True -> run model.fit on the training set
do_fine_tuning = False
# NOTE(review): not referenced in the visible part of this notebook - confirm where it is consumed
override_predictions = False
# Batch size used when building the TensorFlow datasets
batch_size = 8
# Maximum number of training epochs; the early stopping callback may end training sooner
epochs = 1000
# NOTE(review): not referenced in the visible part of this notebook - confirm where it is consumed
using_cuda = True
# True -> load and tokenize the character dataset, build the tf datasets and compile the model
using_dataset = True
# NOTE(review): not referenced in the visible part of this notebook - confirm where it is consumed
shutdown_at_end = 'h' # 'h' -> hibernate | 's' -> shutdown | False -> do nothing at the end

In [None]:
# With the 'Default' character nothing is fine-tuned: use the default DialoGPT weights,
# skip training and do not load any dataset
if character == 'Default':
    do_fine_tuning = from_saved_weights = using_dataset = False

In [None]:
# Imports to deal with dataset and chatbot loading
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
import json

In [None]:
# HuggingFace dataset class, and loading function
from datasets import load_dataset, DatasetDict

# Bot Loading & Fine-Tuning

In [None]:
# Resolve the folder holding the tuned DialoGPT weights of the selected character:
# <out_folder>/<character>/<checkpoint folder name from character_dict>
character_checkpoint_name = character_dict[character]['checkpoint_folder']
checkpoint_folder = os.path.join(out_folder, character, character_checkpoint_name)

In [None]:
# Import (tf) model and tokenizer structures from HuggingFace
from transformers import TFAutoModelForCausalLM, AutoTokenizer

# HuggingFace downloads are cached under <base_folder>/cache
hf_cache_dir = os.path.join(base_folder, "cache")

# Load the tokenizer and use '#' as its padding token (per the original note, this symbol
# is removed from all datasets during preprocessing)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=hf_cache_dir)
tokenizer.pad_token = '#'

# Load the model: default weights unless the saved character checkpoint is requested
if not from_saved_weights:
    model = TFAutoModelForCausalLM.from_pretrained(model_name, cache_dir=hf_cache_dir)
else:
    model = TFAutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=checkpoint_folder)

In [None]:
# Let's chat!
# Interactive loop with the currently loaded model; only runs when `do_chat` is set.
if do_chat:
    # Initialize chat variables
    user_answ = ''
    step = 0
    while True:
        # Get the user prompt
        user_answ = input(">> User:")
        # Typing 'exit' ends the chat right away: the word is not sent to the model
        # and no reply is generated for it
        if user_answ == 'exit':
            break
        # Encode the user input, adding the eos_token
        new_user_input_ids = tokenizer.encode(user_answ + tokenizer.eos_token, return_tensors='tf')
        # Append the user input tokens to the chat history (there is no history on the first turn),
        # then increase the step count
        bot_input_ids = tf.concat([chat_history_ids, new_user_input_ids], axis=-1) if step > 0 else new_user_input_ids
        step += 1
        # Generate a response from the current chat history, while limiting the current answer to 128 tokens.
        # We use greedy selection for next token
        max_length = 128 + bot_input_ids.shape[1]
        chat_history_ids = model.generate(bot_input_ids,
                                          max_length=max_length,
                                          pad_token_id=tokenizer.eos_token_id,
                                          do_sample = False)
        # Pretty print only the newly generated tokens, i.e. everything after the input
        print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

In [None]:
# When a dataset is requested, load the character's data and tokenize it one row at a time
# with the imported DialoGPT preprocessing function
if using_dataset:
    character_hg = load_char_df(character, base_folder)
    tokenized_character_hg = character_hg.map(
        lambda example: dialogpt_preprocess_function(example, tokenizer),
        batched=False,
    )
    print(tokenized_character_hg)

In [None]:
# Import the appropriate datacollator from HuggingFace
# A datacollator applies preprocessing and batching to its sentences
from transformers import DataCollatorForLanguageModeling

# Build the collator for our dataset: masked language modeling is disabled (mlm=False)
# and batches are returned as TensorFlow tensors
if using_dataset:
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
        return_tensors='tf',
    )

In [None]:
# Build the TensorFlow datasets for the train, test and validation splits
if using_dataset:
    def _split_to_tf_dataset(split_name, shuffle):
        """Convert one split of the tokenized dataset into a batched, collated tf dataset."""
        return tokenized_character_hg[split_name].to_tf_dataset(
            columns=["input_ids", "attention_mask", "labels"],
            shuffle=shuffle,
            batch_size=batch_size,
            collate_fn=data_collator,
        )

    # Only the training split is shuffled
    tf_train_set = _split_to_tf_dataset("train", shuffle=True)
    tf_test_set = _split_to_tf_dataset("test", shuffle=False)
    tf_val_set = _split_to_tf_dataset("val", shuffle=False)

In [None]:
# Query TensorFlow for the available GPU devices and display them as the cell output
physical_devices = tf.config.list_physical_devices(device_type='GPU')
physical_devices

In [None]:
# Imports for tensorflow optimizer method and early stopping callback
from transformers import AdamWeightDecay
from tensorflow.keras import callbacks

# Early stopping: monitor the validation loss (lower is better), allow 6 epochs without
# improvement and restore the best weights seen when training stops
earlystop_callback = callbacks.EarlyStopping(
    monitor="val_loss",
    mode="min",
    min_delta=0,
    patience=6,
    baseline=None,
    restore_best_weights=True,
    verbose=0,
)

# Compile the model and finetune it
# No loss is passed to compile - presumably the model's built-in loss is used (TODO confirm).
if using_dataset:
    model.compile(optimizer=AdamWeightDecay(learning_rate=2e-5))

# Fine-tuning needs the tf datasets and the compiled model, which only exist when
# `using_dataset` is set; this is the same guard used when saving the weights afterwards.
if using_dataset and do_fine_tuning:
    # `batch_size` is intentionally not passed: the datasets are already batched,
    # and Keras requires it to be left unset for dataset inputs.
    model.fit(
        x=tf_train_set,
        validation_data=tf_val_set,
        epochs=epochs,
        callbacks=[earlystop_callback],
    )
else:
    # No training was performed
    epochs = 0

In [None]:
# Persist the model weights to the character's checkpoint folder, but only when
# fine-tuning on a dataset was enabled
if do_fine_tuning and using_dataset:
    model.save_pretrained(checkpoint_folder)

In [None]:
# We chat once again, this time with the model as it stands after the training cells
if do_chat:
    # Start from an empty conversation
    user_answ = ''
    step = 0
    while True:
        # Get the user prompt
        user_answ = input(">> User:")
        # Typing 'exit' ends the chat right away: the word is not sent to the model
        # and no reply is generated for it
        if user_answ == 'exit':
            break
        # Encode the user input, adding the eos_token
        new_user_input_ids = tokenizer.encode(user_answ + tokenizer.eos_token, return_tensors='tf')
        # Append the user input to the chat history (there is no history on the first turn)
        bot_input_ids = tf.concat([chat_history_ids, new_user_input_ids], axis=-1) if step > 0 else new_user_input_ids
        step += 1
        # Limit the current answer to 128 tokens on top of the input; greedy decoding
        max_length = 128 + bot_input_ids.shape[1]
        chat_history_ids = model.generate(bot_input_ids,
                                          max_length=max_length,
                                          pad_token_id=tokenizer.eos_token_id,
                                          do_sample = False)
        # Pretty print only the newly generated tokens, i.e. everything after the input
        print("DialoGPT: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))