# ECE1724 Project - Gollum Chatbot

This chatbot attempts to impersonate the character Gollum/Smeagol from Lord of the Rings. Originally I was planning on using data from the books, but then switched to the movie scripts, as it was easier to extract lines from them. The dataset is from https://www.kaggle.com/datasets/paultimothymooney/lord-of-the-rings-data. Some inspiration, such as the usage of Microsoft's DialoGPT model, was taken from a Rick and Morty bot that I saw online but can no longer find.

## Imports

In [1]:
#standard imports
import glob
import logging
import os
import pickle
import random
import re
import shutil
from pathlib import Path
from typing import Dict, List, Tuple

#other normal
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm, trange

#pytorch stuff
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from torch.utils.tensorboard import SummaryWriter


#transformers imports
from transformers import (AdamW, AutoConfig, AutoModelForCausalLM, AutoTokenizer,
                          MODEL_WITH_LM_HEAD_MAPPING, PreTrainedModel,
                          PreTrainedTokenizer, WEIGHTS_NAME,
                          get_linear_schedule_with_warmup)

## Initial Configuration

The arguments below were found online, and some, such as the learning rate, were varied to see how the results changed. Most of the arguments are self-explanatory.

In [20]:
#module-level logger, plus the keys of MODEL_WITH_LM_HEAD_MAPPING and their model_type values
logger = logging.getLogger(__name__)

MODEL_CONFIG_CLASSES = [*MODEL_WITH_LM_HEAD_MAPPING.keys()]
MODEL_TYPES = tuple([config_cls.model_type for config_cls in MODEL_CONFIG_CLASSES])

#arguments class for configuration
class Args:
    """Plain container holding every training/evaluation setting used in this notebook.

    All values are set in ``__init__``; other code adds further attributes later
    (``main`` assigns ``device`` and ``n_gpu``). ``__repr__`` lists every attribute so
    that logging the object shows the actual settings instead of a memory address.
    """

    def __init__(self):
        #where models are written and which pretrained weights/config/tokenizer to start from
        self.output_dir = 'output-small'
        self.model_type = 'gpt2'
        self.model_name_or_path = 'microsoft/DialoGPT-small'
        self.config_name = self.model_name_or_path
        self.tokenizer_name = self.model_name_or_path
        self.cache_dir = 'cached'
        self.block_size = 512
        #which phases main() runs
        self.do_train = True
        self.do_eval = True
        self.evaluate_during_training = False
        #batching and optimisation
        self.train_batch_size = 1
        self.eval_batch_size = 1
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-5
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 5
        #a value > 0 replaces the epoch-based number of scheduler steps
        self.max_steps = -1
        self.warmup_steps = 0
        #logging and checkpointing intervals, counted in optimizer steps
        self.logging_steps = 1000
        self.save_steps = 3500
        #None (or a value <= 0) disables the deletion of older checkpoints
        self.save_total_limit = None
        self.eval_all_checkpoints = False
        self.overwrite_output_dir = True
        self.overwrite_cache = True
        #resume from the newest checkpoint found in output_dir
        self.should_continue = False
        self.seed = 42
        #-1 or 0 marks the process that logs, writes TensorBoard data and saves checkpoints
        self.local_rank = -1

    def __repr__(self):
        """Return ``Args(name=value, ...)`` covering every attribute currently set."""
        settings = ", ".join(f"{name}={value!r}" for name, value in vars(self).items())
        return f"{type(self).__name__}({settings})"

args = Args()

## Data Preparation

Aside from separating Gollum's/Smeagol's lines from the rest of the data, I'll have to use several preceding lines of dialogue as context for each response.

In [21]:
#load data
script = pd.read_excel("my_scripts2.xlsx")

#number of context lines
n = 7

#plain list of the dialogue lines, so the windows below are positional rather than index-label based
dialog = script['dialog'].tolist()

#each row holds the n preceding lines as context followed by the current line as response
#NOTE(review): the step of 2 presumably selects every other line as a response - confirm against the spreadsheet layout
contexted = [dialog[i - n:i + 1] for i in range(n, len(dialog), 2)]

#dataframe from contexted list
df = pd.DataFrame(contexted, columns=['context/' + str(i) for i in range(n)] + ['response'])
df.head(5)

#split data; seeded with the configured seed so the train/validation split is the same on every run
train_set, val_set = train_test_split(df, test_size=0.1, random_state=args.seed)

The data is going to be converted into a better format for the model. For each row, the context lines and the response will be concatenated into one string.

In [22]:
#function to construct conversation sequences
def construct_conv(row, tokenizer, eos=True):
    """Encode one conversation string into a flat list of token ids.

    Args:
        row: the conversation text.
        tokenizer: object providing ``encode`` and ``eos_token``.
        eos: when true, the tokenizer's end-of-sequence token is appended to the text
            before encoding.

    Returns:
        A list of token ids. It is always a list, including when the text encodes to
        a single token or to nothing at all.
    """
    text = row + tokenizer.eos_token if eos else row
    #encode() without return_tensors already yields a flat list of ids, so no tensor round trip is needed
    return list(tokenizer.encode(text))

#dataset class for handling conversations
class ConversationDataset(Dataset):
    """Tokenized conversations, one example per dataframe row, cached on disk.

    Every row is turned into a single string (all columns but the last joined with
    spaces, then a space and the last column; missing cells become empty strings)
    and encoded with ``construct_conv``. The encoded rows are pickled into
    ``args.cache_dir`` and reloaded from there unless ``args.overwrite_cache`` is set.

    Args:
        tokenizer: tokenizer used to encode the rows.
        args: configuration object; reads ``cache_dir``, ``model_type`` and
            ``overwrite_cache``.
        df: dataframe with the context columns first and the response column last.
        block_size: nominal block size; after the adjustment below it is only used
            in the cache file name.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):
        block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)
        directory = args.cache_dir

        #fingerprint of the rows, so that different frames (e.g. train and validation) never share a cache file
        row_hashes = pd.util.hash_pandas_object(df, index=True).to_numpy()
        fingerprint = format(int(np.sum(row_hashes, dtype=np.uint64)), "016x")
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size) + "_" + fingerprint
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            #NOTE: unpickling is only acceptable because this file is written locally by this class
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
            return

        logger.info("Creating features from dataset file at %s", directory)
        self.examples = []
        for _, row_series in df.iterrows():
            cells = ['' if pd.isna(cell) else str(cell) for cell in row_series]
            context_text = ' '.join(cells[:-1])
            response_text = cells[-1]
            self.examples.append(construct_conv(context_text + ' ' + response_text, tokenizer))

        logger.info("Saving features into cached file %s", cached_features_file)
        #the cache directory may not exist yet when the dataset is built before anything else uses it
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(cached_features_file, "wb") as handle:
            pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        """Return the number of encoded conversations."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return the token ids of example ``item`` as a 1-D long tensor."""
        return torch.tensor(self.examples[item], dtype=torch.long)

#function for loading and caching examples
def load_and_cache_examples(args, tokenizer, df_train, df_val, evaluate=False):
    """Build a ConversationDataset from ``df_val`` when ``evaluate`` is true, else from ``df_train``."""
    if evaluate:
        frame = df_val
    else:
        frame = df_train
    return ConversationDataset(tokenizer, args, frame)

#function to set the random seed for reproducibility
def set_seed(args):
    """Seed Python's ``random``, NumPy and PyTorch with ``args.seed``."""
    seed = args.seed
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

#function to sort checkpoints
def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False):
    """List the checkpoint paths found in ``args.output_dir``, oldest first.

    Args:
        args: configuration object; only ``output_dir`` is read.
        checkpoint_prefix: paths named ``<prefix>-*`` inside the output directory are
            considered.
        use_mtime: order by modification time when true; otherwise order by the
            number that follows ``<prefix>-`` and skip paths without such a number.

    Returns:
        The matching paths in ascending order.
    """
    candidates = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))

    if use_mtime:
        keyed = [(os.path.getmtime(path), path) for path in candidates]
    else:
        #the prefix is escaped so that characters such as "(" or "." in it are matched literally
        step_pattern = re.compile(r".*{}-([0-9]+)".format(re.escape(checkpoint_prefix)))
        keyed = []
        for path in candidates:
            found = step_pattern.match(path)
            if found:
                keyed.append((int(found.group(1)), path))

    return [path for _, path in sorted(keyed)]

#function to manage checkpoint rotation
def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False):
    """Delete the oldest checkpoints so that at most ``args.save_total_limit`` remain.

    Nothing happens when the limit is unset or not positive, or when the number of
    existing checkpoints does not exceed it.
    """
    limit = args.save_total_limit
    if not limit or limit <= 0:
        return

    existing = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
    surplus = len(existing) - limit
    if surplus <= 0:
        return

    #the list is ordered oldest first, so the surplus is taken from the front
    for stale in existing[:surplus]:
        logger.info("Deleting older checkpoint [%s] due to args.save_total_limit", stale)
        shutil.rmtree(stale)

## Training Function

In [23]:
def train(args, train_dataset, model, tokenizer):
    """Fine-tune ``model`` on ``train_dataset``.

    Args:
        args: configuration object; reads the batch size, optimizer, schedule, logging
            and checkpoint settings as well as ``args.device``.
        train_dataset: dataset whose items are 1-D tensors of token ids.
        model: causal language model that returns the loss as its first output when
            called with ``labels``.
        tokenizer: tokenizer matching ``model``; used for padding, for resizing the
            embeddings, and saved next to every checkpoint.

    Returns:
        Tuple ``(global_step, average_loss)``: the number of optimizer steps taken and
        the accumulated loss divided by that number (0.0 when no step was taken).
    """
    is_main_process = args.local_rank in [-1, 0]

    #initialize TensorBoard if master process
    if is_main_process:
        tb_writer = SummaryWriter()

    def collate(examples: List[torch.Tensor]):
        #inputs are padded with the pad token id (0 when the tokenizer has none);
        #labels are padded with -100 so that padded positions are ignored by the loss
        pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
        input_ids = pad_sequence(examples, batch_first=True, padding_value=pad_id)
        labels = pad_sequence(examples, batch_first=True, padding_value=-100)
        return input_ids, labels

    #select appropriate sampler
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last=True)

    #total number of training steps
    t_total = args.max_steps if args.max_steps > 0 else len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    #prepare model for training
    model = model.module if hasattr(model, "module") else model  #handling distributed training scenario
    model.resize_token_embeddings(len(tokenizer))

    #prepare optimizer and scheduler; biases and LayerNorm weights are excluded from weight decay
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], "weight_decay": args.weight_decay},
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    #load optimizer and scheduler if resuming training
    if args.model_name_or_path and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt")):
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    logger.info("***** Running training *****")
    logger.info(f"Num examples = {len(train_dataset)}")
    logger.info(f"Num Epochs = {args.num_train_epochs}")
    logger.info(f"Batch size = {args.train_batch_size}")
    logger.info(f"Total optimization steps = {t_total}")

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()

    set_seed(args)  #here for reproducibility

    reached_max_steps = False
    for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=not is_main_process):
        for step, (inputs, labels) in enumerate(tqdm(train_dataloader, desc="Iteration", disable=not is_main_process)):
            model.train()
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            outputs = model(inputs, labels=labels)
            loss = outputs[0]

            #average over the accumulated micro-batches so the gradient scale does not grow with the accumulation factor
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  #update learning rate schedule
                model.zero_grad()
                global_step += 1

                #log metrics
                if is_main_process and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                #save model checkpoint
                if is_main_process and args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(args.output_dir, f"checkpoint-{global_step}")
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = model.module if hasattr(model, "module") else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info(f"Saving model checkpoint to {output_dir}")

                    _rotate_checkpoints(args, "checkpoint")

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info(f"Saving optimizer and scheduler states to {output_dir}")

                #the schedule was sized for max_steps, so stop once that many steps have run
                if 0 < args.max_steps <= global_step:
                    reached_max_steps = True
                    break
        if reached_max_steps:
            break

    if is_main_process:
        tb_writer.close()

    #max(..., 1) avoids a ZeroDivisionError when no optimizer step was taken
    return global_step, tr_loss / max(global_step, 1)

## Evaluation Function

In [24]:
def evaluate(args, model, tokenizer, df_train, df_val, prefix=""):
    """Compute the perplexity of ``model`` on the validation dataframe.

    Args:
        args: configuration object; reads ``output_dir``, ``eval_batch_size`` and
            ``device`` (plus whatever the dataset loader reads).
        model: causal language model that returns the loss as its first output when
            called with ``labels``.
        tokenizer: tokenizer used to encode and pad the examples.
        df_train: passed through to the dataset loader; not used for evaluation.
        df_val: dataframe the evaluation dataset is built from.
        prefix: sub-directory of ``args.output_dir`` that receives ``eval_results.txt``.

    Returns:
        ``{"perplexity": tensor}``, the exponential of the mean batch loss.

    Raises:
        ValueError: if the evaluation loader yields no batch.
    """
    eval_output_dir = args.output_dir
    eval_dataset = load_and_cache_examples(args, tokenizer, df_train, df_val, evaluate=True)

    #the results file is written into <output_dir>/<prefix>, so that exact directory has to exist
    os.makedirs(os.path.join(eval_output_dir, prefix), exist_ok=True)

    #collate function to handle padding
    def collate(examples: List[torch.Tensor]):
        #inputs are padded with the pad token id (0 when the tokenizer has none);
        #labels are padded with -100 so that padded positions are ignored by the loss
        pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else 0
        input_ids = pad_sequence(examples, batch_first=True, padding_value=pad_id)
        labels = pad_sequence(examples, batch_first=True, padding_value=-100)
        return input_ids, labels

    #setup DataLoader for evaluation
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate, drop_last=True)

    logger.info(f"***** Running evaluation {prefix} *****")
    logger.info(f"Num examples = {len(eval_dataset)}")
    logger.info(f"Batch size = {args.eval_batch_size}")

    eval_loss, nb_eval_steps = 0.0, 0  #initialize evaluation loss and step counter
    model.eval()  #set model to evaluation mode

    for inputs, labels in tqdm(eval_dataloader, desc="Evaluating"):
        inputs = inputs.to(args.device)  #move inputs to device
        labels = labels.to(args.device)  #move labels to device

        with torch.no_grad():  #disable gradient calculation for evaluation
            outputs = model(inputs, labels=labels)  #forward pass
            lm_loss = outputs[0]  #extract loss
            eval_loss += lm_loss.mean().item()  #accumulate loss
        nb_eval_steps += 1  #increment step counter

    if nb_eval_steps == 0:
        #with drop_last=True this happens when there are fewer examples than eval_batch_size
        raise ValueError("Evaluation produced no batches; the evaluation set is smaller than eval_batch_size.")

    eval_loss /= nb_eval_steps  #calculate average loss
    perplexity = torch.exp(torch.tensor(eval_loss))  #calculate perplexity

    result = {"perplexity": perplexity}  #prepare results

    #log evaluation results
    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info(f"***** Eval results {prefix} *****")
        for key, value in result.items():
            logger.info(f"{key} = {value}")
            writer.write(f"{key} = {value}\n")

    return result

## Main

In [25]:
def main(df_trn, df_val):
    """Run the configured training and/or evaluation on CPU.

    Args:
        df_trn: dataframe used to build the training dataset.
        df_val: dataframe used to build the evaluation dataset.

    Returns:
        Dict mapping ``"<metric>_<prefix>"`` to the metric value for every evaluated
        checkpoint; empty when evaluation is disabled.

    Raises:
        ValueError: if resuming is requested but no checkpoint exists, or if the
            output directory is non-empty and may not be overwritten.
    """
    args = Args()  #initialize configuration arguments

    #check for continuation from checkpoint
    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if not sorted_checkpoints:
            raise ValueError("Used --should_continue but no checkpoint found in output_dir.")
        args.model_name_or_path = sorted_checkpoints[-1]

    #ensure output directory is ready
    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir and not args.should_continue:
        raise ValueError(f"Output directory ({args.output_dir}) exists and is not empty. Use --overwrite_output_dir.")

    device = torch.device("cpu")  #setup device for CPU training
    args.n_gpu = 0
    args.device = device

    #setup logging
    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.warning(f"Process rank: {args.local_rank}, device: {device}, n_gpu: {args.n_gpu}, distributed training: {bool(args.local_rank != -1)}")

    set_seed(args)  #set random seed for reproducibility

    #initialize model and tokenizer
    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, from_tf=False, config=config, cache_dir=args.cache_dir)
    model.to(args.device)

    #log the attribute dict so the actual settings show up in the log
    logger.info("Training/evaluation parameters %s", vars(args))

    if args.do_train:
        #begin training
        train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(f"global_step = {global_step}, average loss = {tr_loss}")

        #save trained model and tokenizer
        os.makedirs(args.output_dir, exist_ok=True)
        logger.info(f"Saving model checkpoint to {args.output_dir}")
        model_to_save = model.module if hasattr(model, "module") else model  #handle parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

    #begin evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        if args.eval_all_checkpoints:
            weight_files = sorted(glob.glob(f"{args.output_dir}/**/{WEIGHTS_NAME}", recursive=True))
            checkpoints = [os.path.dirname(weight_file) for weight_file in weight_files]
        else:
            checkpoints = [args.output_dir]
        logger.info(f"Evaluate the following checkpoints: {checkpoints}")
        for checkpoint in checkpoints:
            #basename instead of split("/") so that Windows paths with backslashes work as well
            prefix = os.path.basename(os.path.normpath(checkpoint)) if "checkpoint" in checkpoint else ""
            model = AutoModelForCausalLM.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, df_trn, df_val, prefix=prefix)
            results.update({f"{k}_{prefix}": v for k, v in result.items()})

    return results

Now the model can be trained on the data.

In [26]:
#run main() on the train/validation split; the returned results dict is shown as the cell output
main(train_set, val_set)

04/01/2024 13:33:15 - INFO - __main__ -   Training/evaluation parameters <__main__.Args object at 0x0000021083DACDD0>
04/01/2024 13:33:15 - INFO - __main__ -   Creating features from dataset file at cached
04/01/2024 13:33:15 - INFO - __main__ -   Saving features into cached file cached\gpt2_cached_lm_512
04/01/2024 13:33:15 - INFO - __main__ -   ***** Running training *****
04/01/2024 13:33:15 - INFO - __main__ -   Num examples = 103
04/01/2024 13:33:15 - INFO - __main__ -   Num Epochs = 5
04/01/2024 13:33:15 - INFO - __main__ -   Batch size = 1
04/01/2024 13:33:15 - INFO - __main__ -   Total optimization steps = 515


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/103 [00:00<?, ?it/s]

Iteration:   0%|          | 0/103 [00:00<?, ?it/s]

Iteration:   0%|          | 0/103 [00:00<?, ?it/s]

Iteration:   0%|          | 0/103 [00:00<?, ?it/s]

Iteration:   0%|          | 0/103 [00:00<?, ?it/s]

04/01/2024 13:45:06 - INFO - __main__ -   global_step = 515, average loss = 2.5333530737358387
04/01/2024 13:45:06 - INFO - __main__ -   Saving model checkpoint to output-small
04/01/2024 13:45:07 - INFO - __main__ -   Evaluate the following checkpoints: ['output-small']
04/01/2024 13:45:07 - INFO - __main__ -   Creating features from dataset file at cached
04/01/2024 13:45:07 - INFO - __main__ -   Saving features into cached file cached\gpt2_cached_lm_512
04/01/2024 13:45:07 - INFO - __main__ -   ***** Running evaluation  *****
04/01/2024 13:45:07 - INFO - __main__ -   Num examples = 12
04/01/2024 13:45:07 - INFO - __main__ -   Batch size = 1


Evaluating:   0%|          | 0/12 [00:00<?, ?it/s]

04/01/2024 13:45:11 - INFO - __main__ -   ***** Eval results  *****
04/01/2024 13:45:11 - INFO - __main__ -   perplexity = 5.865156173706055


{'perplexity_': tensor(5.8652)}

## Actual Chat

In [5]:
#load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
model = AutoModelForCausalLM.from_pretrained('output-small')
model.to(torch.device("cpu"))

#left padding
tokenizer.padding_side = 'left'

#upper bound on the length of a single reply, in tokens
max_new_tokens = 200
#the prompt is trimmed to this many tokens so that a full reply always fits into the model's context window
max_context_length = model.config.n_positions - max_new_tokens

#initialize chat history variable
chat_history_ids = None

#chat
print("Let's chat! (type 'quit' to stop)")
while True:
    user_input = input(">> User: ")
    if user_input.lower() == 'quit':
        break

    #encode new user input, adding EOS token
    new_user_input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors='pt')

    #append the new input to the history, if there is one
    if chat_history_ids is not None:
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
    else:
        bot_input_ids = new_user_input_ids

    #keep only the most recent tokens; the oldest part of the conversation is dropped first
    bot_input_ids = bot_input_ids[:, -max_context_length:]

    #response; max_new_tokens bounds the reply itself, so a long history can no longer use up the whole length budget
    chat_history_ids = model.generate(
        bot_input_ids,
        #nothing in the prompt is padding, so every position is attended to
        attention_mask=torch.ones_like(bot_input_ids),
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=20,
        top_p=0.9,
        temperature=0.7
    )

    #print the bot's response (everything generated after the prompt)
    bot_output = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
    print(f"Gollum: {bot_output}")

Let's chat! (type 'quit' to stop)


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Gollum: The last hobbit, yes. Now, if only I had a master's in drawing, I wouldn't have to destroy precious. I just want to collect the precious. What's your idea of a perfect meal? A fat, juicy fish, caught fresh from the river, yes, precious. Do you like the cold? Not too cold, no. Smeagol! Sneaky little fish, yes!


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Gollum: Sneak. We likes it, sss. But we needs to riddle ourselves, yes? The precious, yes’s.


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Gollum: ’talks to precious’’, smeagols, yes, talks to precious.’es, sakes with fishes, yes.
