# Config setup

In [2]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import pandas as pd
import numpy as np
import random
import logging
import pickle
import os
import shutil
from typing import Dict, List, Tuple

from sklearn.model_selection import train_test_split

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange
import csv
import random
import re
import os
import unicodedata
import codecs
from io import open
import itertools
import math

%matplotlib inline

In [3]:
%%capture
!pip install transformers

In [4]:
import transformers
from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    AutoModelWithLMHead,
    AutoTokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)

In [5]:
try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

# Configs
logger = logging.getLogger(__name__)

MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

In [6]:
use_cuda = torch.cuda.is_available()
device = torch.device('cuda' if use_cuda else 'cpu')
device

device(type='cuda')

# **Importing Data**

In [None]:
%%capture
!pip install opendatasets

In [None]:
import opendatasets as od
url = 'https://www.kaggle.com/datasets/rajathmc/cornell-moviedialog-corpus/data'
od.download(url)

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: rubenav
Your Kaggle Key: ··········
Downloading cornell-moviedialog-corpus.zip to ./cornell-moviedialog-corpus


100%|██████████| 9.58M/9.58M [00:01<00:00, 8.25MB/s]





In [None]:
metadata_path = "/content/cornell-moviedialog-corpus/movie_characters_metadata.txt"
conversations_path = "/content/cornell-moviedialog-corpus/movie_conversations.txt"
lines_path = "/content/cornell-moviedialog-corpus/movie_lines.txt"
titles_path = "/content/cornell-moviedialog-corpus/movie_titles_metadata.txt"
raw_script_urls_path = "/content/cornell-moviedialog-corpus/raw_script_urls.txt"

In [None]:
corpus_name = 'cornell-moviedialog-corpus'
corpus = os.path.join('/content/', corpus_name)
def printLines(filename, n=10):
    with open(filename, 'rb') as f:
        lines = f.readlines()
    for line in lines[:n]:
        print(line)

printLines(os.path.join(corpus,'movie_lines.txt'))

b'L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!\n'
b'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!\n'
b'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.\n'
b'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?\n'
b"L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.\n"
b'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow\n'
b"L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.\n"
b'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No\n'
b'L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?\n'
b'L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?\n'


In [None]:
column_names = ["lineID","characterID","movieID","character","text"]
def LoadLines(file, column_names):
    lines = {}
    with open(file, 'r', encoding='iso-8859-1') as f:
        for line in f:
            dict = {}
            list_field = line.split(' +++$+++ ')
            for i, field in enumerate(list_field):
                dict[column_names[i]] = field
            lines[dict['lineID']] = dict
    return lines

In [None]:
lines = LoadLines(os.path.join(corpus, 'movie_lines.txt'), column_names)

In [None]:
def Loadconversation(file, lines, column_names):
    conversation = []
    with open(file, 'r', encoding='iso-8859-1') as f:
        for line in f:
            dict_column = {}
            list_column = line.split(' +++$+++ ')
            for i, col in enumerate(list_column):
                dict_column[column_names[i]] = col
            line_id_list = eval(dict_column['utteranceIDs'])
            dict_column['lines'] = []
            for line in line_id_list:
                dict_column['lines'].append(lines[line])
            conversation.append(dict_column)
    return conversation
conversations = Loadconversation(os.path.join(corpus, 'movie_conversations.txt'), lines,["character1ID", "character2ID", "movieID", "utteranceIDs"])

In [None]:
def get_pair_conversation(conversations):
    """
    return list of pair conversation  [[input1, response1], [input2, response2],....]
    """
    pair = []
    for conversation in conversations:
        num_sentence = len(conversation['lines'])
        for i in range(num_sentence-1):
            input = conversation['lines'][i]['text'].strip()
            response = conversation['lines'][i+1]['text'].strip()
            if input and response:
                pair.append([input, response])
    return pair

In [None]:
# create new file to overwrite into it
os.chdir('/content/')
os.getcwd()
if not os.path.exists('data_save'):
    os.makedirs('data_save')
os.chdir('data_save')

path_save = '/content/data_save'
datafile = os.path.join(path_save, "formatted_movie_lines.txt")

delimiter = '\t'
#Unescape the delimiter
delimiter = str(codecs.decode(delimiter, "unicode_escape"))

print("\nWriting newly formatted file...")
with open(datafile, 'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter=delimiter, lineterminator='\n')
    for pair in get_pair_conversation(conversations):
      writer.writerow(pair)



Writing newly formatted file...


# **Preparing Data for Training**



In [None]:
datafile = '/content/formatted_movie_lines.txt'
lines = open(datafile, encoding='utf-8').\
        read().strip().split('\n')

In [None]:
df = pd.DataFrame(lines, columns = ['lines'])
df.head()

Unnamed: 0,lines
0,Can we make this quick? Roxanne Korrine and A...
1,"Well, I thought we'd start with pronunciation,..."
2,Not the hacking and gagging and spitting part....
3,You're asking me out. That's so cute. What's ...
4,"No, no, it's my fault -- we didn't have a prop..."


In [None]:
contexted = []

n = 3 # number of previous responses to include in the context

for i in range(n, len(df['lines'])):
  row = []
  prev = i - 1 - n
  for j in range(i, prev, -1):
    row.append(df['lines'][j])
  contexted.append(row)

In [None]:
columns = ['response', 'context']
columns = columns + ['context/'+str(i) for i in range(n-1)]
columns

['response', 'context', 'context/0', 'context/1']

In [None]:
df = pd.DataFrame.from_records(contexted, columns=columns)
df.head(5)

Unnamed: 0,response,context,context/0,context/1
0,You're asking me out. That's so cute. What's ...,Not the hacking and gagging and spitting part....,"Well, I thought we'd start with pronunciation,...",Can we make this quick? Roxanne Korrine and A...
1,"No, no, it's my fault -- we didn't have a prop...",You're asking me out. That's so cute. What's ...,Not the hacking and gagging and spitting part....,"Well, I thought we'd start with pronunciation,..."
2,"Cameron.\tThe thing is, Cameron -- I'm at the ...","No, no, it's my fault -- we didn't have a prop...",You're asking me out. That's so cute. What's ...,Not the hacking and gagging and spitting part....
3,"The thing is, Cameron -- I'm at the mercy of a...","Cameron.\tThe thing is, Cameron -- I'm at the ...","No, no, it's my fault -- we didn't have a prop...",You're asking me out. That's so cute. What's ...
4,Why?\tUnsolved mystery. She used to be really...,"The thing is, Cameron -- I'm at the mercy of a...","Cameron.\tThe thing is, Cameron -- I'm at the ...","No, no, it's my fault -- we didn't have a prop..."


In [None]:
def construct_conv(row, tokenizer, eos = True):
    # from: https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-list-of-lists
    flatten = lambda l: [item for sublist in l for item in sublist]
    conv = list(reversed([tokenizer.encode(x) + [tokenizer.eos_token_id] for x in row]))
    conv = flatten(conv)
    return conv

In [None]:
class ConversationDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):

        block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)

        directory = args.cache_dir
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size)
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            for _, row in df.iterrows():
                conv = construct_conv(row, tokenizer)
                if len(conv) > block_size: continue
                self.examples.append(conv)

            # Note that we are losing the last truncated example here for the sake of simplicity (no padding)
            # If your dataset is small, first you should loook for a bigger one :-) and second you
            # can change this behavior by adding (model specific) padding.

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)

In [None]:
def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):
    return ConversationDataset(tokenizer, args, df_val if evaluate else df_trn)


def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)


def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
    ordering_and_checkpoint_path = []

    glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))

    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    return checkpoints_sorted


def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
    if not args.save_total_limit:
        return
    if args.save_total_limit <= 0:
        return

    # Check if we should delete older checkpoint(s)
    checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
    if len(checkpoints_sorted) <= args.save_total_limit:
        return

    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
    for checkpoint in checkpoints_to_be_deleted:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
        shutil.rmtree(checkpoint)

# **Initializing Model**

In [None]:
#collapse
# Args to allow for easy convertion of python script to notebook
class Args():
    def __init__(self):
        self.output_dir = 'output'
        self.model_type = 'gpt2'
        self.model_name_or_path = 'microsoft/DialoGPT-small'
        self.config_name = 'microsoft/DialoGPT-small'
        self.tokenizer_name = 'microsoft/DialoGPT-small'
        self.cache_dir = 'cached'
        self.block_size = 512
        self.do_train = True
        self.do_eval = True
        self.evaluate_during_training = False
        self.per_gpu_train_batch_size = 4
        self.per_gpu_eval_batch_size = 4
        self.gradient_accumulation_steps = 8
        self.learning_rate = 5e-5
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 3
        self.max_steps = -1
        self.warmup_steps = 0
        self.logging_steps = 1000
        self.save_steps = 3500
        self.save_total_limit = None
        self.eval_all_checkpoints = False
        self.no_cuda = False
        self.overwrite_output_dir = True
        self.overwrite_cache = True
        self.should_continue = False
        self.seed = 42
        self.local_rank = -1
        self.fp16 = False
        self.fp16_opt_level = 'O1'

args = Args()

In [None]:
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last = True
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))
    # add_special_tokens_(model, tokenizer)


    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to gobal_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = (batch, batch)
            if inputs.shape[1] > 1024: continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step

# Evaluation of some model

def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_trn, df_val, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=True)
    os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate, drop_last = True
    )

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result

In [None]:
def main(df_trn, df_val):
    args = Args()

    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            args.model_name_or_path = sorted_checkpoints[-1]

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
        and not args.should_continue
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup CUDA, GPU & distributed training

    use_cuda = torch.cuda.is_available()
    device = torch.device('cuda' if use_cuda else 'cpu')
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    model = AutoModelWithLMHead.from_pretrained(
        args.model_name_or_path,
        from_tf=False,
        config=config,
        cache_dir=args.cache_dir,
    )
    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train:
        # Create output directory if needed
        os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelWithLMHead.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelWithLMHead.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, df_trn, df_val, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results

# **Training Model**

In [None]:
train_df, val_df = train_test_split(df, test_size = 0.2)

In [None]:
main(train_df, val_df)



Downloading (…)lve/main/config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]



Downloading model.safetensors:   0%|          | 0.00/351M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1161 > 1024). Running this sequence through the model will result in indexing errors


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/34179 [00:00<?, ?it/s]



Iteration:   0%|          | 0/34179 [00:00<?, ?it/s]



Iteration:   0%|          | 0/34179 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/8546 [00:00<?, ?it/s]

{'perplexity_': tensor(4.9225)}

# **Testing Chatbot**

In [None]:
test_chats = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/formatted_movie_lines.txt', sep='\t', names=['person_1','person_2'], engine='python')
test_chats.head()

Unnamed: 0,person_1,person_2
0,Can we make this quick? Roxanne Korrine and A...,"Well, I thought we'd start with pronunciation,..."
1,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part....
2,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...
3,You're asking me out. That's so cute. What's ...,Forget it.
4,"No, no, it's my fault -- we didn't have a prop...",Cameron.


In [None]:
temp_person_1 = list(test_chats['person_1'])[:1000]
temp_person_2 = list(test_chats['person_2'])[:1000]

In [None]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small', padding_side='left')
model = AutoModelWithLMHead.from_pretrained('/content/drive/MyDrive/Colab Notebooks/my_model/output')

person_1 = temp_person_1
person_2_response = temp_person_2
chatbot_response = []

step = 0
for i in range(len(person_1)):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    # input_sentence = input(">> User:")
    new_user_input_ids = tokenizer.encode(person_1[i] + tokenizer.eos_token, return_tensors='pt')
    # print(new_user_input_ids)

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens,
    chat_history_ids = model.generate(
        bot_input_ids, max_length=200,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=100,
        top_p=0.7,
        temperature = 0.5
    )

    response = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
    chatbot_response.append(response)
    # pretty print last ouput tokens from bot
    #print("ChatBot: {}".format(chatbot_response))
    if step < 0:
      step += 1
    else:
      step = 0

In [None]:
# !pip install rouge-score



In [None]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction

In [None]:
# Calculate ROUGE scores
r_scores = {key: [] for key in ['rouge1', 'rouge2', 'rougeL']}

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

for i in range(len(person_2_response)):
    ref = person_2_response[i]
    cand = chatbot_response[i]
    temp_scores = scorer.score(ref, cand)
    for key in temp_scores:
        r_scores[key].append(temp_scores[key])

# Calculate BLEU score
sf = SmoothingFunction()
b_scores = np.array([])

for i in range(len(person_2_response)):
    ref = person_2_response[i].split()
    cand = chatbot_response[i]
    b_scores = np.append(b_scores, sentence_bleu(ref, cand, smoothing_function=sf.method1))

In [None]:
# Raw rouge1 scores
r1_precision = np.array([])
r1_recall = np.array([])
r1_fmeasure = np.array([])

# Raw rouge2 scores
r2_precision = np.array([])
r2_recall = np.array([])
r2_fmeasure = np.array([])

# Raw rougeL scores
rL_precision = np.array([])
rL_recall = np.array([])
rL_fmeasure = np.array([])

In [None]:
rouge1_scores = r_scores['rouge1']
rouge2_scores = r_scores['rouge2']
rougeL_scores = r_scores['rougeL']

In [None]:
for i in range(len(rouge1_scores)):
    # Organize rouge1 metrics
    r1_precision = np.append(r1_precision, [rouge1_scores[i].precision])
    r1_recall = np.append(r1_recall, [rouge1_scores[i].recall])
    r1_fmeasure = np.append(r1_fmeasure, [rouge1_scores[i].fmeasure])

    # Organize rouge2 metrics
    r2_precision = np.append(r2_precision, [rouge2_scores[i].precision])
    r2_recall = np.append(r2_recall, [rouge2_scores[i].recall])
    r2_fmeasure = np.append(r2_fmeasure, [rouge2_scores[i].fmeasure])

    # Organize rougeL metrics
    rL_precision = np.append(rL_precision, [rougeL_scores[i].precision])
    rL_recall = np.append(rL_recall, [rougeL_scores[i].recall])
    rL_fmeasure = np.append(rL_fmeasure, [rougeL_scores[i].fmeasure])

In [None]:
# Avg rouge1 scores
avg_r1_precision = np.mean(r1_precision)
avg_r1_recall = np.mean(r1_recall)
avg_r1_fmeasure = np.mean(r1_fmeasure)

# Avg rouge2 scores
avg_r2_precision = np.mean(r2_precision)
avg_r2_recall = np.mean(r2_recall)
avg_r2_fmeasure = np.mean(r2_fmeasure)

# Avg rougeL scores
avg_rL_precision = np.mean(rL_precision)
avg_rL_recall = np.mean(rL_recall)
avg_rL_fmeasure = np.mean(rL_fmeasure)

# Avg bleu score
avg_bleu_score = np.mean(b_scores)

In [None]:
print(avg_r1_precision)
print(avg_r1_recall)
print(avg_r1_fmeasure)
print()

print(avg_r2_precision)
print(avg_r2_recall)
print(avg_r2_fmeasure)
print()

print(avg_rL_precision)
print(avg_rL_recall)
print(avg_rL_fmeasure)
print()

print(avg_bleu_score)

0.10136991834171288
0.14239660750263672
0.09769796561590607

0.012708500250253359
0.017803129827431492
0.011847092868368762

0.0864937873010742
0.12584437242482596
0.08447611453066203

0.03986924352446297


# **Exporting Folder**

In [None]:
!zip -r /content/output.zip /content/output

In [None]:
from google.colab import files
files.download("/content/output.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# **Freeplay Chat with Chatbot**

In [8]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-small')
model = AutoModelWithLMHead.from_pretrained('/content/drive/MyDrive/Colab Notebooks/my_model/output')

# Let's chat until you want to stop
step = 0
while (True):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    input_sentence = input(">> User:")
    new_user_input_ids = tokenizer.encode(input_sentence + tokenizer.eos_token, return_tensors='pt')
    # print(new_user_input_ids)

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens,
    chat_history_ids = model.generate(
        bot_input_ids, max_length=200,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=100,
        top_p=0.7,
        temperature = 0.8
    )

    # pretty print last ouput tokens from bot
    print("ChatBot: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))
    if step < 0:
      step += 1
    else:
      step = 0
    if input_sentence in ['q','quit', 'bye', 'Bye']: break



>> User:hi there


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


ChatBot: Hello.	You're late.
>> User:what is your name


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


ChatBot: What is your Name	Michael.
>> User:what is your favorite color?


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


ChatBot: What is your favourite color?	Red.
>> User:q


A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


ChatBot: So, what do you do now?	I'm just a little busy.
