# Imports

In [1]:
!pip -q install transformers

[K     |████████████████████████████████| 2.5MB 8.2MB/s 
[K     |████████████████████████████████| 3.3MB 51.5MB/s 
[K     |████████████████████████████████| 901kB 50.6MB/s 
[?25h

In [2]:
# all the imports

import glob
import logging
import os
import pickle
import random
import re
import shutil
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm.notebook import tqdm, trange

from pathlib import Path

from transformers import (
    MODEL_WITH_LM_HEAD_MAPPING,
    WEIGHTS_NAME,
    AdamW,
    AutoConfig,
    PreTrainedModel,
    PreTrainedTokenizer,
    get_linear_schedule_with_warmup,
)


try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

In [3]:
! unzip archive.zip

Archive:  archive.zip
  inflating: friends_quotes.csv      


In [4]:
data = pd.read_csv('friends_quotes.csv')
df = data

# Data Cleaning

In [5]:
df.sample(10)

Unnamed: 0,author,episode_number,episode_title,quote,quote_order,season
45937,Chandler,12.0,Joey Dates Rachel,You are not going to believe what I did today!,107.0,8.0
7328,CHAN,7.0,Ross Finds Out,"[slowly lifts coffee cup to his mouth] Ow, ow,...",19.0,2.0
19620,Chandler,7.0,Chandler Crosses The Line,Then why didnt you tell me to do that?!!,178.0,4.0
15110,Phoebe,15.0,Ross And Rachel Take A Break,Yeah. But Sergei said it took the Germans six ...,56.0,3.0
36130,Chandler,24.0,"The Proposal, Part I & II",Eh,295.0,6.0
35601,Phoebe,23.0,The Ring,(To Rachel) Watch this.,24.0,6.0
55136,Chandler,1.0,Joey and Rachel Kiss,"Not quite. Monica's still at the salon, and I'...",142.0,10.0
57286,Monica,8.0,The Late Thanksgiving,It's too late for apologies.,257.0,10.0
37499,Chandler,5.0,The Engagement Picture,Aww! (Smiles.),95.0,7.0
6825,JADE,5.0,Five Steaks And An Eggplant,"So, are we gonna get together or what?",13.0,2.0


In [6]:
x = df.author.unique()
x

array(['Monica', 'Joey', 'Chandler', ..., 'Passenger #2', 'Passenger #3',
       'Gate attendant #2'], dtype=object)

In [7]:
df=df.replace(['CHANDLER'],'Chandler')
df=df.replace(['chandler'],'Chandler')
df=df.replace(['CHAN'],'Chandler')
df=df.replace(['Chandler (to Monica)'],'Chandler')
df=df.replace(['Chandler and Joey'],'Chandler')
df=df.replace(['Chandler and Ross'],'Chandler')
df=df.replace(['CHANDLER AND JOEY'],'Chandler')
df=df.replace(['CHAN, JOEY, ROSS'],'Chandler')
df=df.replace(['Chandler (to Joey)'],'Chandler')
df=df.replace(['Chandler (nearly weeping)'],'Chandler')
df=df.replace(['Chandler (Stands up and walks to Joey)'],'Chandler')
df=df.replace(['CHANDLER and JOEY'],'Chandler')
df=df.replace(['Chandler and Monica'],'Chandler')
df=df.replace(['Chandler, Monica, and Rachel'],'Chandler')
df=df.replace(['Chandlers'],'Chandler')
df=df.replace(['Chandler and Joe'],'Chandler')
df=df.replace(['Chandler, Joey, and Phoebe'],'Chandler')

In [8]:
df.quote[df.author=='Chandler']

2        All right Joey, be nice. So does he have a hum...
6                                Sounds like a date to me.
7        Alright, so I'm back in high school, I'm stand...
9        Then I look down, and I realize there's a phon...
11                                           That's right.
                               ...                        
60247                                        Where's Ross?
60275    (to his children) Look around, you guys. This ...
60282                                            Oh, okay.
60286                              Oh, it's gonna be okay.
60290                                         Sure. Where?
Name: quote, Length: 8305, dtype: object

In [9]:
data = df
CHARACTER_NAME = 'Chandler'

In [10]:
contexted = []

# context window of size 7
n = 7

for i in data[data.author == CHARACTER_NAME].index:
  if i < n:
    continue
  row = []
  prev = i - 1 - n # we additionally substract 1, so row will contain current responce and 7 previous responces  
  for j in range(i, prev, -1):
    row.append(data.quote[j])
  contexted.append(row)

columns = ['response', 'context'] 
columns = columns + ['context/' + str(i) for i in range(n - 1)]

df = pd.DataFrame.from_records(contexted, columns=columns)

In [11]:
df.sample(10)

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
3100,Ha! Ha! Ha!,"Okay, bye!","Bye, Pheebs!","Ooh, uh (She grabs her coat and runs out.)",Okay!,"Yayohyay! Okay, I gotta go tell Frank and Al...",Yay!,Yay!
2695,No! No! No! I just kissed her.,And what?! Did you sleep with her?!,"Yeah, I mean when you were late last night, Ka...",What?,Its me. Im the other guy.,Who?,"See uh, thats-thats actually what I wanted t...","But hey, listen just so you know, you might ha..."
6451,"Honey, I dont like baths! Could you draw me a...",I drew you a bath!,What is it?,"No, but someones really not going to get over...",Sex on the balcony?,"Boy, do I have a surprise for you!",Hey.,"Is it me, or is veto starting to sound really ..."
6910,"(picking up the telephone, answering it with a...","Dedicated to the great work of Eric Aasen, Gui...",Thats the best birthday ever.,"Joey, happy birthday to you.",Joey! Joey.,"(with birthday cake, singing) Happy birthday t...","Well, I hope, you got some room left.",Excellent. The shrew in particular was exclusive.
6879,"But, those are for you.",(gets a pack out of his jacket),"Yes, but I just had one. Two. Two tiny cigaret...",What do I smell? (sniffs him) I smell smoke. H...,"(still backs away) Alright, the truth is, I so...",You dont need a shower.,(shrinks back) Right. You know what? Actually ...,"(comes over) O-kay, so you wanna play it that ..."
5357,I will tell the story! It was going great. I l...,"Well I had a great time! Umm, Chancy on the ot...",Hey! How was it?,(entering with Chandler) Hey.,I cant believe that! Now the only thing left ...,(recoils in horror) Women are mean!!! (Storms ...,"I dont want to talk about it. Yknow, you cou...",Hey! How was sailing?
825,"Wow, Heckles was voted class clown, and so was...",That's what it says.,Funniest? Heckles?,"""Heckles, you crack me up in science class. Yo...",He's even kind of cute.,"Wow, he looks so normal.","Check it out, check it out. Heckles' high scho...",Is there blood coming out of my ears?
2284,"Yeah, and I dont have any cologne.","Joey, what are you doing?! Its never gonna ha...","(to Chandler) And I ah, borrowed some of your ...",Oh!,"Oh, hey, you guys are finally gonna get to mee...","Okay, I need a date! (runs to her bedroom)",Bye-bye!,Bye!
2104,(to Joey) Is that your new walk?,Lets go.,Maybe she killed him?,Theyve been quiet for a long time.,Then how come it is?,"(crying) Yeah, but this cant be it, I mean.","(crying) No. I cant, youre a totally differe...","Look, look, theres got to be a way we can wor..."
6970,What...? That's not you! Life is good again! R...,THAT'S NOT ME!,"Oh, good, good. Play more, 'cause I wanna see ...",Cowboy boots? I've never worn cowboy boots in ...,"Yeah, oh, but I just keep picturing you rollin...","Chandler, this is not our problem. We've got e...","Oh, yeah, well, poor Richard. Y'... I can grow...","Isn't that sad? I mean, can you see how pathet..."


# Data Preparation 

In [12]:
trn_df, val_df = train_test_split(df, test_size=0.1)
trn_df.head()

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
2100,"Yeah, come on, its Ross and Rachel, theyve g...","Theyre gonna get through this, arent they?","Well, I should think so. You slept with someone.","You can have the last piece, if you want.","Yeah, and could you please chop some up and ju...","Thats okay, Ill just pick em off.","With ah, extra anchovies.",No anchovies.
7731,I bought you. How did I forget that that's all...,Wha... How the hell is that gonna help?,(to the dog) What are you barking at?,Come on robot!,Go! Go!,"Ok, on your mark... Get set... GO!!!","Ok, ladies and gentlemen, wind your toys! (the...","Phoebe, you get the bear, uhm, Joey, you get t..."
349,Oh my God.,"Really? Not even to, um, change his PAJAMAS?! ...",How would I know? I-I wasn't here.,No. (Monica brushes Coma Guy's hair in the oth...,I'm not really here. Just thought I'd drop the...,"Nothing, I just thought I'd stop by.. y'know, ...",What are you doing here?,Hi.
3073,"Hey, yknow what, if youre gonna do that, if ...","Oh, want a good name, go with Joey. Joeys you...","No, Im-Im not sure about Hulk, but I like th...","Ooh-ooh, Pheebs, you want a strong name? How a...","Well, it certainly worked for that Valdez kid.",Ugh! No! This is so hard! I went through this ...,Hey! Do we have a baby name yet?,Hey!
5148,I cant do that.,Okay. There may be a way that we can get the o...,"No, its not! When I looked at the other ring ...",Its not a stupid gumball machine looking ring...,I cant believe I let you talk me into buying ...,(entering from her room) Hey.,That wasso good. (Starts crying again.),I would really like that. (They kiss.)


In [13]:
trn_df, val_df = train_test_split(df, test_size=0.1)
trn_df.head()

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
2283,Bye!,"Okay, bye-bye!",Okay.,"All right, Ill see you tonight.","Okay. (to Phoebe and Chandler) Did ah, you guy...","I meant, me plus one!",But you said one.,"No. No, not at all, not at all. I actually was..."
6904,I suppose that Monica will have the manipulat...,You used to tell girls you were a Kennedy. (be...,I would never lie to get someone into bed.,"Well, we had a little fight.",Wher-where have you been?,Son of a bitch!,(entering with husband) Happy birthday!,And remember whatever comes up first. Okay? An...
3915,"Okay, there's something different though--Oh m...",Well-well that's 'cause I went down there and ...,"No, I'm-I'm serious!",Thanks!,You stink!,Oh it was great! It was great! I went down the...,(To Rachel) How did work go?,Yeah?! So's yours!
5183,"Okay, now will you guys get out of here? I wan...",Oh! Oop! (Hands him back the ring.),"Yeah, right here in my pocket. (Pats his pocke...","Yeah, yeah you have the ring?",Its the big night! We wanted to wish you good...,Will you marry me? Will you marry me? (Makes l...,"See? Now, he could date her.",Thats not what she said last night. (Ross gla...
8154,Oh yeah?,"Yeah, last Saturday. Wow! She was the first bl...",That's terrible!,You're kidding!,She died.,Yeah.,"Well, I think that shirt makes you look like y...",What do you think Pheebs?


# Creating The Functions 

In [14]:
# create dataset suitable for our model
def construct_conv(row, tokenizer, eos = True):
    flatten = lambda l: [item for sublist in l for item in sublist]
    conv = list(reversed([tokenizer.encode(x) + [tokenizer.eos_token_id] for x in row]))
    conv = flatten(conv)
    return conv

class ConversationDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, args, df, block_size=512):

        block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)

        directory = args.cache_dir
        cached_features_file = os.path.join(
            directory, args.model_type + "_cached_lm_" + str(block_size)
        )

        if os.path.exists(cached_features_file) and not args.overwrite_cache:
            logger.info("Loading features from cached file %s", cached_features_file)
            with open(cached_features_file, "rb") as handle:
                self.examples = pickle.load(handle)
        else:
            logger.info("Creating features from dataset file at %s", directory)

            self.examples = []
            for _, row in df.iterrows():
                conv = construct_conv(row, tokenizer)
                self.examples.append(conv)

            logger.info("Saving features into cached file %s", cached_features_file)
            with open(cached_features_file, "wb") as handle:
                pickle.dump(self.examples, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)

In [15]:
# Cacheing and storing of data/checkpoints

def load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False):
    return ConversationDataset(tokenizer, args, df_val if evaluate else df_trn)


def set_seed(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)


def _sorted_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> List[str]:
    ordering_and_checkpoint_path = []

    glob_checkpoints = glob.glob(os.path.join(args.output_dir, "{}-*".format(checkpoint_prefix)))

    for path in glob_checkpoints:
        if use_mtime:
            ordering_and_checkpoint_path.append((os.path.getmtime(path), path))
        else:
            regex_match = re.match(".*{}-([0-9]+)".format(checkpoint_prefix), path)
            if regex_match and regex_match.groups():
                ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path))

    checkpoints_sorted = sorted(ordering_and_checkpoint_path)
    checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted]
    return checkpoints_sorted


def _rotate_checkpoints(args, checkpoint_prefix="checkpoint", use_mtime=False) -> None:
    if not args.save_total_limit:
        return
    if args.save_total_limit <= 0:
        return

    # Check if we should delete older checkpoint(s)
    checkpoints_sorted = _sorted_checkpoints(args, checkpoint_prefix, use_mtime)
    if len(checkpoints_sorted) <= args.save_total_limit:
        return

    number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - args.save_total_limit)
    checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete]
    for checkpoint in checkpoints_to_be_deleted:
        logger.info("Deleting older checkpoint [{}] due to args.save_total_limit".format(checkpoint))
        shutil.rmtree(checkpoint)

In [16]:
from transformers import AutoModelWithLMHead, AutoModelForCausalLM, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelWithLMHead.from_pretrained("microsoft/DialoGPT-medium")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=26.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=642.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…






HBox(children=(FloatProgress(value=0.0, description='Downloading', max=862955157.0, style=ProgressStyle(descri…




In [17]:
"""
Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
using a masked language modeling (MLM) loss.
"""

# Configs
logger = logging.getLogger(__name__)

MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

In [18]:
# Args to allow for easy convertion of python script to notebook
class Args():
    def __init__(self):
        self.output_dir = 'output-small'
        self.model_type = 'gpt2'
        self.model_name_or_path = 'microsoft/DialoGPT-small'
        self.config_name = 'microsoft/DialoGPT-small'
        self.tokenizer_name = 'microsoft/DialoGPT-small'
        self.cache_dir = 'cached'
        self.block_size = 512
        self.do_train = True
        self.do_eval = True
        self.evaluate_during_training = False
        self.per_gpu_train_batch_size = 4
        self.per_gpu_eval_batch_size = 4
        self.gradient_accumulation_steps = 1
        self.learning_rate = 5e-5
        self.weight_decay = 0.0
        self.adam_epsilon = 1e-8
        self.max_grad_norm = 1.0
        self.num_train_epochs = 10
        self.max_steps = -1
        self.warmup_steps = 0
        self.logging_steps = 1000
        self.save_steps = 3500
        self.save_total_limit = None
        self.eval_all_checkpoints = False
        self.no_cuda = False
        self.overwrite_output_dir = True
        self.overwrite_cache = True
        self.should_continue = False
        self.seed = 42
        self.local_rank = -1
        self.fp16 = False
        self.fp16_opt_level = 'O1'

args = Args()

# Train and Evaluate 

In [19]:
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last = True
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    model = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))
    # add_special_tokens_(model, tokenizer)


    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to gobal_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = (batch, batch)
            if inputs.shape[1] > 1024: continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step

# Evaluation of some model

def evaluate(args, model: PreTrainedModel, tokenizer: PreTrainedTokenizer, df_trn, df_val, prefix="") -> Dict:
    # Loop to handle MNLI double evaluation (matched, mis-matched)
    eval_output_dir = args.output_dir

    eval_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=True)
    os.makedirs(eval_output_dir, exist_ok=True)
    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(
        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size, collate_fn=collate, drop_last = True
    )

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        inputs, labels = (batch, batch)
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        with torch.no_grad():
            outputs = model(inputs, labels=labels)
            lm_loss = outputs[0]
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {"perplexity": perplexity}

    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logger.info("***** Eval results {} *****".format(prefix))
        for key in sorted(result.keys()):
            logger.info("  %s = %s", key, str(result[key]))
            writer.write("%s = %s\n" % (key, str(result[key])))

    return result

In [20]:
# Main runner

def main(df_trn, df_val):
    args = Args()
    
    if args.should_continue:
        sorted_checkpoints = _sorted_checkpoints(args)
        if len(sorted_checkpoints) == 0:
            raise ValueError("Used --should_continue but no checkpoint was found in --output_dir.")
        else:
            args.model_name_or_path = sorted_checkpoints[-1]

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
        and not args.should_continue
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup CUDA, GPU & distributed training
    device = torch.device("cuda")
    args.n_gpu = torch.cuda.device_count()
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    config = AutoConfig.from_pretrained(args.config_name, cache_dir=args.cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name, cache_dir=args.cache_dir)
    model = AutoModelWithLMHead.from_pretrained(
        args.model_name_or_path,
        from_tf=False,
        config=config,
        cache_dir=args.cache_dir,
    )
    model.to(args.device)
    
    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, tokenizer, df_trn, df_val, evaluate=False)

        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use save_pretrained for the model and tokenizer, you can reload them using from_pretrained()
    if args.do_train:
        # Create output directory if needed
        os.makedirs(args.output_dir, exist_ok=True)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelWithLMHead.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelWithLMHead.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, df_trn, df_val, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results

# Run the Main Function 


In [21]:
main(trn_df, val_df)

06/24/2021 12:16:16 - INFO - filelock -   Lock 140091203285392 acquired on cached/0cbdd50f204f3ddbaa452e976340a5725f0b5ddb201704058c87e14d9679e070.e6898db50ba3aa698f0f652e876a1e4bd813321dea3e22b776f9a3c39d36aaab.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=641.0, style=ProgressStyle(description_…

06/24/2021 12:16:16 - INFO - filelock -   Lock 140091203285392 released on cached/0cbdd50f204f3ddbaa452e976340a5725f0b5ddb201704058c87e14d9679e070.e6898db50ba3aa698f0f652e876a1e4bd813321dea3e22b776f9a3c39d36aaab.lock





06/24/2021 12:16:17 - INFO - filelock -   Lock 140091189893136 acquired on cached/5f8cf488e0bdda2e393e798f478673a4d26c1386082a1a05e42269f3ecc89f50.67d01b18f2079bd75eac0b2f2e7235768c7f26bd728e7a855a1c5acae01a91a8.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=26.0, style=ProgressStyle(description_w…

06/24/2021 12:16:17 - INFO - filelock -   Lock 140091189893136 released on cached/5f8cf488e0bdda2e393e798f478673a4d26c1386082a1a05e42269f3ecc89f50.67d01b18f2079bd75eac0b2f2e7235768c7f26bd728e7a855a1c5acae01a91a8.lock





06/24/2021 12:16:17 - INFO - filelock -   Lock 140091189893136 acquired on cached/3cf340c89a43b5e6f31c4cd609fc2fc92f3d7aafdf6c8987e2ea9e02cb78b4e2.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…

06/24/2021 12:16:18 - INFO - filelock -   Lock 140091189893136 released on cached/3cf340c89a43b5e6f31c4cd609fc2fc92f3d7aafdf6c8987e2ea9e02cb78b4e2.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f.lock





06/24/2021 12:16:18 - INFO - filelock -   Lock 140091203285392 acquired on cached/4e3f74e7c741909c4d1b48a23febe75c1be66a20c2b98cf7db4b8b10f12dc10c.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…

06/24/2021 12:16:19 - INFO - filelock -   Lock 140091203285392 released on cached/4e3f74e7c741909c4d1b48a23febe75c1be66a20c2b98cf7db4b8b10f12dc10c.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b.lock





06/24/2021 12:16:20 - INFO - filelock -   Lock 140091189851216 acquired on cached/aeb12aa1fc2f135700fcf9f8f0eec86c0649dc5ce0df86677adf0388271f33f3.1010e0ba25016a38144b58e8852f1dcc18876341e3b5728a99b3ffa11cc733cd.lock


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=351265583.0, style=ProgressStyle(descri…

06/24/2021 12:16:27 - INFO - filelock -   Lock 140091189851216 released on cached/aeb12aa1fc2f135700fcf9f8f0eec86c0649dc5ce0df86677adf0388271f33f3.1010e0ba25016a38144b58e8852f1dcc18876341e3b5728a99b3ffa11cc733cd.lock





06/24/2021 12:16:40 - INFO - __main__ -   Training/evaluation parameters <__main__.Args object at 0x7f6987132390>
06/24/2021 12:16:40 - INFO - __main__ -   Creating features from dataset file at cached
06/24/2021 12:16:49 - INFO - __main__ -   Saving features into cached file cached/gpt2_cached_lm_512
06/24/2021 12:16:49 - INFO - __main__ -   ***** Running training *****
06/24/2021 12:16:49 - INFO - __main__ -     Num examples = 7472
06/24/2021 12:16:49 - INFO - __main__ -     Num Epochs = 10
06/24/2021 12:16:49 - INFO - __main__ -     Instantaneous batch size per GPU = 4
06/24/2021 12:16:49 - INFO - __main__ -     Total train batch size (w. parallel, distributed & accumulation) = 4
06/24/2021 12:16:49 - INFO - __main__ -     Gradient Accumulation steps = 1
06/24/2021 12:16:49 - INFO - __main__ -     Total optimization steps = 18680


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=10.0, style=ProgressStyle(description_width='…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1868.0, style=ProgressStyle(description_w…






HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1868.0, style=ProgressStyle(description_w…

06/24/2021 12:33:08 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-3500
06/24/2021 12:33:12 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-3500





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1868.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1868.0, style=ProgressStyle(description_w…

06/24/2021 12:49:38 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-7000
06/24/2021 12:49:42 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-7000





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1868.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1868.0, style=ProgressStyle(description_w…

06/24/2021 13:06:13 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-10500
06/24/2021 13:06:17 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-10500





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1868.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1868.0, style=ProgressStyle(description_w…

06/24/2021 13:22:41 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-14000
06/24/2021 13:22:45 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-14000





HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1868.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=1868.0, style=ProgressStyle(description_w…

06/24/2021 13:39:17 - INFO - __main__ -   Saving model checkpoint to output-small/checkpoint-17500
06/24/2021 13:39:21 - INFO - __main__ -   Saving optimizer and scheduler states to output-small/checkpoint-17500
06/24/2021 13:44:54 - INFO - __main__ -    global_step = 18680, average loss = 1.6208497326525937
06/24/2021 13:44:54 - INFO - __main__ -   Saving model checkpoint to output-small






06/24/2021 13:44:58 - INFO - __main__ -   Evaluate the following checkpoints: ['output-small']
06/24/2021 13:44:59 - INFO - __main__ -   Creating features from dataset file at cached
06/24/2021 13:45:01 - INFO - __main__ -   Saving features into cached file cached/gpt2_cached_lm_512
06/24/2021 13:45:01 - INFO - __main__ -   ***** Running evaluation  *****
06/24/2021 13:45:01 - INFO - __main__ -     Num examples = 831
06/24/2021 13:45:01 - INFO - __main__ -     Batch size = 4


HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=207.0, style=ProgressStyle(description_w…

06/24/2021 13:45:18 - INFO - __main__ -   ***** Eval results  *****
06/24/2021 13:45:18 - INFO - __main__ -     perplexity = tensor(4.6389)





{'perplexity_': tensor(4.6389)}

# Loading the trained model


In [22]:
tokenizer = AutoTokenizer.from_pretrained('microsoft/DialoGPT-medium')
model = AutoModelWithLMHead.from_pretrained('output-small')



In [34]:
# Let's chat for 4 lines
for step in range(4):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(input(">> User:") + tokenizer.eos_token, return_tensors='pt')
    # print(new_user_input_ids)

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(
        bot_input_ids, max_length=200,
        pad_token_id=tokenizer.eos_token_id,  
        no_repeat_ngram_size=3,       
        do_sample=True, 
        top_k=100, 
        top_p=0.7,
        temperature=0.8
    )
    
    # pretty print  last ouput tokens from bot
    print("Chandler: {}".format(tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))

>> User:Hi I am Jenny 
Chandler: Hi.
>> User:Will you go out with me?
Chandler: (to Chandler) Youre mom is gonna be so disappointed.
>> User:Should we go out for Pizza?
Chandler: Sure.
>> User:Coffee?
Chandler: !!!(to Monica) You dont understand! You don't make any noise when you get coffee, you just get a little sip of water.


# Loading it up on the Hugging Face

In [24]:
!sudo apt-get install git-lfs

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  git-lfs
0 upgraded, 1 newly installed, 0 to remove and 39 not upgraded.
Need to get 2,129 kB of archives.
After this operation, 7,662 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 git-lfs amd64 2.3.4-1 [2,129 kB]
Fetched 2,129 kB in 1s (1,526 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package git-lfs.
(Reading database ... 160772 files and directories c

In [25]:
!git config --global user.email "harshghodkar@gmail.com"
# Tip: using the same email as your huggingface.co account will link your commits to your profile
!git config --global user.name "Saviour1001"

In [26]:
MY_MODEL_NAME = 'ChandlerBot'
HUGGINGFACE_API_KEY = 'api_trRmwbvgsQFQjdYkLDtMlBDdMMPZTlQHng'
model.push_to_hub(MY_MODEL_NAME, use_auth_token=HUGGINGFACE_API_KEY)
tokenizer.push_to_hub(MY_MODEL_NAME, use_auth_token=HUGGINGFACE_API_KEY)

06/24/2021 13:51:23 - INFO - huggingface_hub.repository -   git version 2.17.1
Sorry, no usage text found for "git-lfs"
06/24/2021 13:51:44 - INFO - huggingface_hub.repository -   
Git LFS: (0 of 1 files) 0 B / 486.76 MB                                        
Git LFS: (0 of 1 files) 0 B / 486.76 MB                                        
Git LFS: (0 of 1 files) 355.45 KB / 486.76 MB                                  
Git LFS: (0 of 1 files) 3.07 MB / 486.76 MB                                    
Git LFS: (0 of 1 files) 9.43 MB / 486.76 MB                                    
Git LFS: (0 of 1 files) 17.11 MB / 486.76 MB                                   
Git LFS: (0 of 1 files) 26.21 MB / 486.76 MB                                   
Git LFS: (0 of 1 files) 34.27 MB / 486.76 MB                                   
Git LFS: (0 of 1 files) 42.82 MB / 486.76 MB                                   
Git LFS: (0 of 1 files) 51.93 MB / 486.76 MB                                   
Git LFS: (0 of 1 fi

'https://huggingface.co/Saviour/ChandlerBot/commit/29ddccf410f9c8100c56adeedeeb4a77155fac84'