# Fine-Tuning GPT

Reference tutorial (data loading and processing section): https://www.modeldifferently.com/en/2021/12/generaci%C3%B3n-de-fake-news-con-gpt-2/#data-loading-and-processing

In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import torch, os, re, pandas as pd, json
from sklearn.model_selection import train_test_split
from transformers import (
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM
)
from datasets import Dataset, list_metrics, load_metric


In [4]:
# Alternative base checkpoint (plain GPT-2), kept commented out for reference:
#MODEL_NAME = "gpt2"
#model_cls = GPT2LMHeadModel
#tokenizer_cls = GPT2Tokenizer

# Active configuration: the checkpoint name plus the classes whose
# `from_pretrained` is used further down to load the model and the tokenizer.
MODEL_NAME = "microsoft/DialoGPT-small"
model_cls = AutoModelForCausalLM
tokenizer_cls = AutoTokenizer

In [5]:
# Use the first CUDA device when one is present, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
dev = "cuda:0" if cuda_available else "cpu"
device = torch.device(dev)

# torch.cuda.current_device() raises on machines without CUDA, so it is only
# queried when CUDA is actually available; device_count() is safe either way.
current_cuda_device = torch.cuda.current_device() if cuda_available else None

print(f"""
    cuda: {cuda_available}
    current_device: {current_cuda_device}
    device_count: {torch.cuda.device_count()}
""")


    cuda: True
    current_device: 0
    device_count: 1



In [6]:
config = AutoConfig.from_pretrained(MODEL_NAME)

## Load Data, Tokenizer, and Model

### Data Loading from CSV

In [7]:
# Location of the processed dialogue export, relative to this notebook.
filepath = "../data/processed.csv"
# Only the speaker and utterance columns are read; `content` is renamed to `text`,
# which is the default `content_key` used by the data-preparation helpers below.
df = pd.read_csv(filepath, encoding="utf-8", usecols=["character", "content"]).rename(columns={"content": "text"})

# Show full cell contents instead of truncating long utterances.
# Note: this is a global pandas option and affects every later display.
pd.set_option("display.max_colwidth", None)
df.tail(10)

Unnamed: 0,character,text
32080,Charlotte,"What're the odds, right? But yes, we, uh...know each other. And he's also here in Boston."
32081,Toby,Right. Geeze. I...did not expect to ever hear from him again.
32082,Charlotte,...Surprise?
32083,Charlotte,"So, uh, would you want to talk to him? Because this is all stuff he really should get the chance to ask you about directly."
32084,Toby,Um...sure.
32085,Charlotte,"Ok, good. Um, we should probably go. But, uh, I'll see you tomorrow?"
32086,Charlotte,[_as she talks she takes the book off the table and carefully tucks it away in her bag_]
32087,Toby,"Um, yeah."
32088,Toby,Tomorrow.
32089,Charlotte,[_she will leave with the others_]


### Model

In [8]:
base_model = model_cls.from_pretrained(MODEL_NAME, config=config)

In [9]:
base_model.num_parameters

<bound method ModuleUtilsMixin.num_parameters of GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0

### Tokenizer

In [10]:
base_tokenizer = tokenizer_cls.from_pretrained(MODEL_NAME)

In [11]:
print(f"vocab_size: {base_tokenizer.vocab_size}")

vocab_size: 50257


In [12]:
vocabulary = base_tokenizer.get_vocab()
vocabulary["Hi"]

17250

In [13]:
base_tokenizer.eos_token

'<|endoftext|>'

In [14]:
base_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

## Conversational Model

### Data Preparation

In [109]:
import argparse
import tarfile
import os.path

from typing import Dict, Union

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoModelWithLMHead, AutoTokenizer
from datasets import Dataset


def load_csv(filename):
    """Read the CSV at ``filename`` into a DataFrame using pandas' default options."""
    frame = pd.read_csv(filename)
    return frame


def prepare_data(
    data_or_filename: Union[str, pd.DataFrame],
    filter_by: str = None,
    filter_value: str = None,
    content_key: str = "text",
    n: int = 7,
    test_size: float = 0.1,
    flatten: bool = True,
):
    """Build response/context rows and split them into train and validation frames.

    Args:
        data_or_filename: Either a DataFrame, or a string that is handed to
            ``load_csv`` to obtain one.
        filter_by: Forwarded to ``prepare_context``.
        filter_value: Forwarded to ``prepare_context``.
        content_key: Forwarded to ``prepare_context``.
        n: Forwarded to ``prepare_context``.
        test_size: Forwarded to ``train_test_split`` as ``test_size``.
        flatten: Accepted for interface compatibility; not used by this function.

    Returns:
        A ``(train, validation)`` pair as produced by ``train_test_split`` with
        ``shuffle=False``.
    """
    if isinstance(data_or_filename, str):
        frame = load_csv(data_or_filename)
    else:
        frame = data_or_filename

    context_options = {
        "filter_by": filter_by,
        "filter_value": filter_value,
        "content_key": content_key,
        "n": n,
    }
    windows = prepare_context(frame, **context_options)

    # shuffle=False: the split is taken without shuffling the rows first.
    train_part, val_part = train_test_split(windows, test_size=test_size, shuffle=False)
    return train_part, val_part


def prepare_context(
    data: pd.DataFrame,
    filter_by: str = None,
    filter_value: str = None,
    content_key: str = "text",
    n: int = 7,
    flatten: bool = True,
    eos_token: str = "<|endoftext|>",
):
    """Turn a column of consecutive utterances into response/context rows.

    For every selected row position ``p`` the output row holds the utterance at
    ``p`` (the response) followed by the ``n`` utterances before it, most recent
    first. Rows are addressed by position, so the frame's index labels do not
    matter.

    Args:
        data: Frame holding one utterance per row in ``content_key``.
        filter_by: Optional column name. When given, only rows whose value in
            this column equals ``filter_value`` are used as responses.
        filter_value: Value to match in ``filter_by``.
        content_key: Name of the column holding the utterance text.
        n: Number of preceding utterances kept as context (must be >= 1).
        flatten: Accepted for interface compatibility; currently unused.
        eos_token: Accepted for interface compatibility; currently unused.

    Returns:
        A DataFrame with ``n + 1`` columns: ``response``, ``context``,
        ``context/0`` ... ``context/{n - 2}``. It is empty (but has those
        columns) when no row has ``n`` predecessors or nothing matches the filter.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be at least 1, got {n}")

    # Pull the column out once instead of doing a per-cell .iloc lookup.
    utterances = data[content_key].tolist()

    if filter_by:
        matches = (data[filter_by] == filter_value).tolist()
        # A response needs n earlier rows, so the first usable position is n
        # (the same lower bound the unfiltered branch uses).
        positions = [pos for pos, hit in enumerate(matches) if hit and pos >= n]
    else:
        positions = range(n, len(utterances))

    # Slice [pos - n, pos] and reverse it: response first, then context going back in time.
    contexted = [utterances[pos - n:pos + 1][::-1] for pos in positions]

    columns = ["response", "context"] + ["context/" + str(i) for i in range(n - 1)]
    return pd.DataFrame(contexted, columns=columns)


def prepare_dataset(df):
    """Convert a frame of utterance columns into a ``Dataset`` holding only ``text``.

    The frame is passed through ``concat_text`` to build the ``text`` column,
    converted with ``Dataset.from_pandas``, and every other column is dropped.

    Args:
        df: Frame whose columns are the utterances to concatenate per row.

    Returns:
        The resulting ``Dataset`` with ``text`` as its only column.
    """
    # concat_text writes the "text" column in place, so work on a copy to leave
    # the caller's frame untouched.
    dataset = Dataset.from_pandas(concat_text(df.copy()))

    # Drop whatever else the conversion produced. This covers the source columns
    # and the "__index_level_0__" column, which is only present in some cases,
    # so it must not be removed unconditionally.
    extra_columns = [name for name in dataset.column_names if name != "text"]
    if extra_columns:
        dataset = dataset.remove_columns(extra_columns)
    return dataset


def concat_text(df, eos_token="<|endoftext|>"):
    """Add a ``text`` column that joins every column of each row with ``eos_token``.

    Args:
        df: Frame of string columns. It is modified in place.
        eos_token: Separator placed between the values and appended at the end.

    Returns:
        The same ``df`` object, now carrying the ``text`` column.
    """
    # Iterate plain tuples rather than DataFrame.apply so that an empty frame
    # yields an empty column and the separator is always passed along.
    df["text"] = [
        concat_text_in_row(row, eos_token)
        for row in df.itertuples(index=False, name=None)
    ]
    return df


def concat_text_in_row(row, eos_token="<|endoftext|>"):
    """Join the string values of ``row`` with ``eos_token`` and terminate with it.

    Args:
        row: Iterable of strings (for example a row tuple or a Series of strings).
        eos_token: Separator inserted between values and appended once at the end.

    Returns:
        The concatenated string, e.g. ``"a<eos>b<eos>"`` for ``["a", "b"]``.
    """
    return eos_token.join(row) + eos_token


def construct_conv(example, tokenizer, eos = True):
    """Encode each utterance of ``example`` and flatten them into one id list.

    Every item of ``example`` is encoded with
    ``tokenizer.encode(x, padding="max_length", max_length=250)``; the encoded
    utterances are then concatenated in reverse order of ``example``.

    Args:
        example: Iterable of utterance strings.
        tokenizer: Object exposing ``encode(...)`` and ``eos_token_id``.
        eos: When true (the default), ``tokenizer.eos_token_id`` is appended
            after each encoded utterance.

    Returns:
        A flat list of token ids.
    """
    # NOTE(review): the EOS id is appended after whatever padding encode() adds,
    # and no truncation is requested — confirm that this layout is intended.
    suffix = [tokenizer.eos_token_id] if eos else []
    encoded = [
        tokenizer.encode(x, padding="max_length", max_length=250) + suffix
        for x in example
    ]
    return [token for turn in reversed(encoded) for token in turn]

In [67]:
base_tokenizer.eos_token

'<|endoftext|>'

In [110]:
df.loc[df["text"] == "[*Charlotte immediately becomes very, very tense*]"]

Unnamed: 0,character,text
862,Charlotte,"[*Charlotte immediately becomes very, very tense*]"


In [114]:
df.iloc[862-7:862+10]

Unnamed: 0,character,text
855,Charlotte,...Do you...want to talk about it?
856,bitjockey,No. I'm fine. We need to make sure you're..okay
857,Charlotte,[*makes a face again*] I'm _really_ sorry for just...falling apart like that
858,bitjockey,(( 🤦🏽 ))
859,bitjockey,no you shouldn't be sorry
860,bitjockey,it happens
861,bitjockey,do you..take anything for that? to help?
862,Charlotte,"[*Charlotte immediately becomes very, very tense*]"
863,Charlotte,No.
864,bitjockey,Oh.


In [112]:
train_df, val_df = prepare_data(df, filter_by="character", filter_value="Charlotte", flatten=True)

['response', 'context', 'context/0', 'context/1', 'context/2', 'context/3', 'context/4', 'context/5']


In [113]:
train_df.loc[train_df["response"] == "[*Charlotte immediately becomes very, very tense*]"]

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
344,"[*Charlotte immediately becomes very, very tense*]",do you..take anything for that? to help?,it happens,no you shouldn't be sorry,(( 🤦🏽 )),[*makes a face again*] I'm _really_ sorry for just...falling apart like that,No. I'm fine. We need to make sure you're..okay,...Do you...want to talk about it?


In [20]:
df.loc[df["text"] == "Uhm, well. I need a refresher? I think?"]

Unnamed: 0,character,text
24214,bitjockey,"Uhm, well. I need a refresher? I think?"


In [21]:
df.iloc[24214:24250]

Unnamed: 0,character,text
24214,bitjockey,"Uhm, well. I need a refresher? I think?"
24215,Akari,Right.
24216,Description,"[_Akari launches into a technical explanation. The short version is that even if Nix and Temper manage to stop the degradation, the damage has already been done. Neither of them are certain they'll be able to fix that, so they're really more focused on keeping Shiro alive. In most cases, a fix to this kind of problem would be biotech replacements or treatments. However, since Shiro's system is so fucked up, the introduction of new cyberware was the catalyst for this in the first place, and the modifications are basically evolving. The closest comparison is a very bad autoimmune disorder.\nWhen Akari was in Montreal, her research area was working on new cyberware. It's currently theoretical, but Akari was able to figure out a way to integrate cyberware in such a way that it does less damage to the body. It's not nanotech but an almost symbiotic merging of biotech and cybertech. Akari is convinced the theory is sound, she just needs to work out the physical details_]"
24217,bitjockey,"[_nods_] Right, right...so..how can I help?"
24218,Akari,[_thinking_] I remember you being more interested in the practical than the theory. Is that still true?
24219,bitjockey,[_nods_]
24220,Akari,"I asked Shiro herself with some help on the chemistry and math, and I've got computer simulations running to try out some possible behavior and designs. But I'm still in need of help with actually moving it out of the theoretical. [_shakes head_] I need a proof-of-concept."
24221,Akari,How far into biomemetics are you willing to go for robots?
24222,bitjockey,What do you mean?
24223,Akari,"I suppose that's a bad way of wording it, considering who I'm talking to. [_shakes head_] How small are you able to go in building robots? Because I'm not sure if we're going to end up taking biomemetics to the cellular level here. But I know I need an excellent roboticist."


In [17]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [18]:
def preprocess(tokenizer):
    """Return a callable that builds an ``input_ids`` entry using ``tokenizer``.

    The returned ``_preprocess(examples)`` applies ``_tokenize`` to every item
    obtained by iterating over ``examples`` and returns the results under the
    ``"input_ids"`` key.
    """
    def _tokenize(example):
        # Encodes every item of `example` that does not contain "__index",
        # appends the EOS id to each, reverses the order and flattens the result.
        # NOTE(review): if `example` is a single string, iterating it yields
        # individual characters rather than utterances — confirm what is passed in.
        flatten = lambda l: [item for sublist in l for item in sublist]
        conv = list(reversed([tokenizer.encode(x, padding="max_length", max_length=250) + [tokenizer.eos_token_id] for x in example if "__index" not in x]))
        conv = flatten(conv)
        return conv
        
    def _preprocess(examples):
        # Earlier variant, kept for reference:
        #flatten = lambda l: [item for sublist in l for item in sublist]
        #conv = list([tokenizer.encode(x, padding="max_length", max_length=250) + [tokenizer.eos_token_id] for x in examples])
        #conv = flatten(conv)
        #return {"input_ids": conv}
        # NOTE(review): mapping over `examples` iterates its keys when it is a
        # dict, so `_tokenize` would receive column names rather than row
        # values — presumably not intended; verify against the caller.
        return {"input_ids": list(map(_tokenize, examples))}
    
    return _preprocess


def tokenize(example, tokenizer=None):
    """Encode each utterance of ``example`` and flatten them into one id list.

    Each item is encoded with
    ``tokenizer.encode(x, padding="max_length", max_length=250)``, followed by
    ``tokenizer.eos_token_id``; the encoded utterances are concatenated in
    reverse order of ``example``.

    Args:
        example: Iterable of utterance strings.
        tokenizer: Object exposing ``encode(...)`` and ``eos_token_id``. When
            omitted, the notebook-level ``base_tokenizer`` is used.

    Returns:
        A flat list of token ids.
    """
    if tokenizer is None:
        # Previously this read a global named `tokenizer`, which only exists
        # after the fine-tuned model has been reloaded much later in the
        # notebook; `base_tokenizer` is available from the start.
        tokenizer = base_tokenizer

    encoded = [
        tokenizer.encode(x, padding="max_length", max_length=250) + [tokenizer.eos_token_id]
        for x in example
    ]
    return [token for turn in reversed(encoded) for token in turn]


def debug_preprocess(tokenizer, max_length=250): 
    """Return a callable that encodes every string value of an example separately.

    Args:
        tokenizer: Object exposing ``encode(text, padding=..., max_length=...)``.
        max_length: Value forwarded to ``encode`` as ``max_length``
            (previously ignored in favour of a hard-coded 250).

    Returns:
        A function taking a dict-like ``examples``; it stores, under
        ``"input_ids"``, one encoded list per string-valued entry (in the
        mapping's iteration order) and returns the same, mutated object.
    """
    def _tokenize(examples):
        input_ids = [
            tokenizer.encode(value, padding="max_length", max_length=max_length)
            for value in examples.values()
            if isinstance(value, str)  # skips non-text entries such as integer index columns
        ]
        examples["input_ids"] = input_ids
        return examples
    return _tokenize

In [19]:
train_dataset

Dataset({
    features: ['response', 'context', 'context/0', 'context/1', 'context/2', 'context/3', 'context/4', 'context/5', '__index_level_0__'],
    num_rows: 4967
})

In [20]:
val_dataset

Dataset({
    features: ['response', 'context', 'context/0', 'context/1', 'context/2', 'context/3', 'context/4', 'context/5', '__index_level_0__'],
    num_rows: 552
})

In [21]:
from typing import List

from torch.nn.utils.rnn import pad_sequence


def preprocess_conv(tokenizer):
    """Return a callable that turns one example into a flat ``input_ids`` list.

    For every string-valued entry of the example, the text is encoded with
    ``tokenizer.encode`` and followed by ``tokenizer.eos_token_id``. The encoded
    entries are concatenated in the reverse of the mapping's iteration order and
    stored under ``"input_ids"`` on the same (mutated) example, which is returned.
    """
    def _construct(examples):
        turns = []
        for value in examples.values():
            # Non-string entries (e.g. integer index columns) carry no text to encode.
            if not isinstance(value, str):
                continue
            turns.append(tokenizer.encode(value) + [tokenizer.eos_token_id])

        token_ids = []
        for turn in reversed(turns):
            token_ids.extend(turn)

        examples["input_ids"] = token_ids
        return examples
    return _construct


def preprocess_function(tokenizer, max_length=512):
    """Return a callable that tokenizes one example as a single EOS-joined string.

    The string values of the example are taken in reverse iteration order,
    joined with ``tokenizer.eos_token`` and terminated by it. The text is then
    passed to ``tokenizer(..., padding="max_length", max_length=max_length)``,
    and the resulting ``input_ids`` and ``attention_mask`` are stored on the
    same (mutated) example, which is returned.
    """
    def _construct(examples):
        texts = [value for value in examples.values() if isinstance(value, str)]
        texts.reverse()

        separator = f"{tokenizer.eos_token}"
        joined = separator.join(texts) + tokenizer.eos_token

        tokenized = tokenizer(joined, padding="max_length",  max_length=max_length)
        for key in ("input_ids", "attention_mask"):
            examples[key] = tokenized[key]
        return examples
        
    return _construct


def collate(examples: List[torch.Tensor], tokenizer):
    """Stack variable-length tensors into one batch-first padded tensor.

    When the tokenizer has a pad token configured, its id is used as the fill
    value; otherwise ``pad_sequence``'s default fill value is used.
    """
    pad_options = {"batch_first": True}
    if tokenizer._pad_token is not None:
        pad_options["padding_value"] = tokenizer.pad_token_id
    return pad_sequence(examples, **pad_options)


In [22]:
base_tokenizer

PreTrainedTokenizerFast(name_or_path='microsoft/DialoGPT-small', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '[PAD]'})

In [23]:
tokenized_train_dataset = train_dataset.map(preprocess_function(tokenizer=base_tokenizer, max_length=256), remove_columns=list(train_dataset.features.keys()))
tokenized_val_dataset = val_dataset.map(preprocess_function(tokenizer=base_tokenizer, max_length=256), remove_columns=list(val_dataset.features.keys()))

  0%|          | 0/4967 [00:00<?, ?ex/s]

  0%|          | 0/552 [00:00<?, ?ex/s]

In [40]:
for i in range(10):
    decoded = base_tokenizer.decode(tokenized_train_dataset[i]["input_ids"])
    print(len(tokenized_train_dataset[i]["input_ids"]), decoded)
    

256 [_will follow Hopper_]<|endoftext|>[_once they reach Anton and Arcade_] Hello<|endoftext|>[_to Anton_] This is my mo--mentor, Akari.<|endoftext|>[_shakes hands_] Nice to meet you.<|endoftext|>[_noticed the slip, hides a smile_]<|endoftext|>[_laughs nervously_] Uhm, so Anton. You are a decker, right?<|endoftext|>That's right<|endoftext|>I hear this is where the tech talk is happening.<|endoftext|>[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][P

In [25]:
tokenized_train_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 4967
})

In [26]:
# Expose only `input_ids`, as torch tensors, when rows are read from the datasets.
# NOTE(review): `attention_mask` is also a feature of these datasets but is not
# listed here, so it is not returned — presumably the data collator rebuilds it; confirm.
tokenized_train_dataset.set_format(type="torch", columns=["input_ids"])
tokenized_val_dataset.set_format(type="torch", columns=["input_ids"])

In [27]:
tokenized_train_dataset[0].keys()

dict_keys(['input_ids'])

### Metrics

In [28]:
metric = load_metric("perplexity")


def compute_metrics(eval_pred):
    """Compute causal-LM perplexity from evaluation logits and labels.

    The previous version called ``np`` (never imported in this notebook) and
    passed ``predictions``/``references`` to the ``perplexity`` metric, which
    scores raw text with its own model and does not accept those arguments.
    Perplexity is therefore computed here directly as ``exp(mean cross-entropy)``.

    Args:
        eval_pred: Pair ``(logits, labels)``; arrays or tensors where logits are
            ``(..., seq_len, vocab)`` and labels are ``(..., seq_len)`` — this
            layout is assumed from the shift below. Label positions equal to
            ``-100`` are ignored.

    Returns:
        ``{"perplexity": float}``. The value is ``nan`` when every label is ignored.
    """
    logits, labels = eval_pred
    logits = torch.as_tensor(logits, dtype=torch.float32)
    labels = torch.as_tensor(labels, dtype=torch.long)

    # Causal LM: the logits at position t score the token at position t + 1.
    shift_logits = logits[..., :-1, :].reshape(-1, logits.shape[-1])
    shift_labels = labels[..., 1:].reshape(-1)

    loss = torch.nn.functional.cross_entropy(shift_logits, shift_labels, ignore_index=-100)
    return {"perplexity": torch.exp(loss).item()}


### Training

In [29]:
base_model.resize_token_embeddings(len(base_tokenizer))

Embedding(50258, 768)

In [30]:
# Directory name used below for checkpoints, logs and the final saved model.
FINETUNED_MODEL = 'SP-05162022a-myDialoGPT2-small'

# Collator for causal language modelling (mlm=False), returning PyTorch tensors.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=base_tokenizer,
    mlm=False,
    return_tensors="pt",
)

# Reset any trainer left over from a previous run of this cell.
trainer = None
training_args = TrainingArguments(
    output_dir=FINETUNED_MODEL,          # output directory
    evaluation_strategy="epoch",
    num_train_epochs=3,           # total # of training epochs
    per_device_train_batch_size=3,  # batch size per device during training
    per_device_eval_batch_size=3,   # batch size for evaluation
    weight_decay=0.01,           # strength of weight decay
    logging_dir=FINETUNED_MODEL,            # directory for storing logs
    # NOTE(review): with this flag evaluation presumably reports only the loss,
    # so a `compute_metrics` callback would not be invoked — confirm.
    prediction_loss_only=True,
)

In [31]:
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [32]:
trainer.train()

***** Running training *****
  Num examples = 4967
  Num Epochs = 3
  Instantaneous batch size per device = 3
  Total train batch size (w. parallel, distributed & accumulation) = 3
  Gradient Accumulation steps = 1
  Total optimization steps = 4968


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss
1,3.1853,2.858918
2,2.7895,2.627786
3,2.5986,2.557067


Saving model checkpoint to SP-05162022a-myDialoGPT2-small/checkpoint-500
Configuration saved in SP-05162022a-myDialoGPT2-small/checkpoint-500/config.json


Attempted to log scalar metric loss:
4.1728
Attempted to log scalar metric learning_rate:
4.496779388083736e-05
Attempted to log scalar metric epoch:
0.3


Model weights saved in SP-05162022a-myDialoGPT2-small/checkpoint-500/pytorch_model.bin
Saving model checkpoint to SP-05162022a-myDialoGPT2-small/checkpoint-1000
Configuration saved in SP-05162022a-myDialoGPT2-small/checkpoint-1000/config.json


Attempted to log scalar metric loss:
3.3785
Attempted to log scalar metric learning_rate:
3.993558776167472e-05
Attempted to log scalar metric epoch:
0.6


Model weights saved in SP-05162022a-myDialoGPT2-small/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to SP-05162022a-myDialoGPT2-small/checkpoint-1500
Configuration saved in SP-05162022a-myDialoGPT2-small/checkpoint-1500/config.json


Attempted to log scalar metric loss:
3.1853
Attempted to log scalar metric learning_rate:
3.490338164251208e-05
Attempted to log scalar metric epoch:
0.91


Model weights saved in SP-05162022a-myDialoGPT2-small/checkpoint-1500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 552
  Batch size = 3


Attempted to log scalar metric eval_loss:
2.8589184284210205
Attempted to log scalar metric eval_runtime:
8.4877
Attempted to log scalar metric eval_samples_per_second:
65.036
Attempted to log scalar metric eval_steps_per_second:
21.679
Attempted to log scalar metric epoch:
1.0


Saving model checkpoint to SP-05162022a-myDialoGPT2-small/checkpoint-2000
Configuration saved in SP-05162022a-myDialoGPT2-small/checkpoint-2000/config.json


Attempted to log scalar metric loss:
2.9583
Attempted to log scalar metric learning_rate:
2.9871175523349438e-05
Attempted to log scalar metric epoch:
1.21


Model weights saved in SP-05162022a-myDialoGPT2-small/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to SP-05162022a-myDialoGPT2-small/checkpoint-2500
Configuration saved in SP-05162022a-myDialoGPT2-small/checkpoint-2500/config.json


Attempted to log scalar metric loss:
2.8546
Attempted to log scalar metric learning_rate:
2.4838969404186795e-05
Attempted to log scalar metric epoch:
1.51


Model weights saved in SP-05162022a-myDialoGPT2-small/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to SP-05162022a-myDialoGPT2-small/checkpoint-3000
Configuration saved in SP-05162022a-myDialoGPT2-small/checkpoint-3000/config.json


Attempted to log scalar metric loss:
2.7895
Attempted to log scalar metric learning_rate:
1.9806763285024154e-05
Attempted to log scalar metric epoch:
1.81


Model weights saved in SP-05162022a-myDialoGPT2-small/checkpoint-3000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 552
  Batch size = 3


Attempted to log scalar metric eval_loss:
2.62778639793396
Attempted to log scalar metric eval_runtime:
8.5124
Attempted to log scalar metric eval_samples_per_second:
64.846
Attempted to log scalar metric eval_steps_per_second:
21.615
Attempted to log scalar metric epoch:
2.0


Saving model checkpoint to SP-05162022a-myDialoGPT2-small/checkpoint-3500
Configuration saved in SP-05162022a-myDialoGPT2-small/checkpoint-3500/config.json


Attempted to log scalar metric loss:
2.6801
Attempted to log scalar metric learning_rate:
1.4774557165861514e-05
Attempted to log scalar metric epoch:
2.11


Model weights saved in SP-05162022a-myDialoGPT2-small/checkpoint-3500/pytorch_model.bin
Saving model checkpoint to SP-05162022a-myDialoGPT2-small/checkpoint-4000
Configuration saved in SP-05162022a-myDialoGPT2-small/checkpoint-4000/config.json


Attempted to log scalar metric loss:
2.6111
Attempted to log scalar metric learning_rate:
9.742351046698874e-06
Attempted to log scalar metric epoch:
2.42


Model weights saved in SP-05162022a-myDialoGPT2-small/checkpoint-4000/pytorch_model.bin
Saving model checkpoint to SP-05162022a-myDialoGPT2-small/checkpoint-4500
Configuration saved in SP-05162022a-myDialoGPT2-small/checkpoint-4500/config.json


Attempted to log scalar metric loss:
2.5986
Attempted to log scalar metric learning_rate:
4.710144927536232e-06
Attempted to log scalar metric epoch:
2.72


Model weights saved in SP-05162022a-myDialoGPT2-small/checkpoint-4500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 552
  Batch size = 3


Attempted to log scalar metric eval_loss:
2.5570669174194336
Attempted to log scalar metric eval_runtime:
8.5078
Attempted to log scalar metric eval_samples_per_second:
64.882
Attempted to log scalar metric eval_steps_per_second:
21.627
Attempted to log scalar metric epoch:
3.0




Training completed. Do not forget to share your model on huggingface.co/models =)




Attempted to log scalar metric train_runtime:
860.1493
Attempted to log scalar metric train_samples_per_second:
17.324
Attempted to log scalar metric train_steps_per_second:
5.776
Attempted to log scalar metric total_flos:
1980352723968000.0
Attempted to log scalar metric train_loss:
2.982759656921485
Attempted to log scalar metric epoch:
3.0


TrainOutput(global_step=4968, training_loss=2.982759656921485, metrics={'train_runtime': 860.1493, 'train_samples_per_second': 17.324, 'train_steps_per_second': 5.776, 'total_flos': 1980352723968000.0, 'train_loss': 2.982759656921485, 'epoch': 3.0})

In [33]:
trainer.save_model(FINETUNED_MODEL)
base_tokenizer.save_pretrained(FINETUNED_MODEL)
config.save_pretrained(FINETUNED_MODEL)

Saving model checkpoint to SP-05162022a-myDialoGPT2-small
Configuration saved in SP-05162022a-myDialoGPT2-small/config.json
Model weights saved in SP-05162022a-myDialoGPT2-small/pytorch_model.bin
tokenizer config file saved in SP-05162022a-myDialoGPT2-small/tokenizer_config.json
Special tokens file saved in SP-05162022a-myDialoGPT2-small/special_tokens_map.json
Configuration saved in SP-05162022a-myDialoGPT2-small/config.json


In [34]:
finetuned_model = AutoModelForCausalLM.from_pretrained(FINETUNED_MODEL)
tokenizer = AutoTokenizer.from_pretrained(FINETUNED_MODEL)

loading configuration file SP-05162022a-myDialoGPT2-small/config.json
Model config GPT2Config {
  "_name_or_path": "SP-05162022a-myDialoGPT2-small",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "conversational": {
      "max_length": 1000
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.19.0",
  "use_cache": true,
  "vocab_size": 50258

In [35]:
def generate_responses(model, tokenizer, text, chat_history_ids=None, step=0):
    """Generate one reply to ``text``, continuing the conversation so far.

    Args:
        model: Model exposing ``generate(...)`` (and, optionally, ``device``).
        tokenizer: Tokenizer exposing ``encode``, ``decode``, ``eos_token`` and
            ``eos_token_id``.
        text: The new user message.
        chat_history_ids: Tensor of token ids returned by the previous call.
            Anything that is not a tensor (``None``, ``[]``) means "no history".
        step: Number of turns completed so far; history is only used when > 0.

    Returns:
        ``(response, chat_history_ids, step + 1)`` where ``response`` is the
        decoded reply and ``chat_history_ids`` includes prompt and reply tokens.
    """
    # Encode the new user input followed by the EOS token, as a PyTorch tensor.
    new_user_input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors='pt')

    # Keep the inputs on the model's device so that a GPU-resident model also works.
    target_device = getattr(model, "device", None)
    if target_device is not None:
        new_user_input_ids = new_user_input_ids.to(target_device)

    # Append the new input to the history only when a usable history exists;
    # previously step > 0 with a missing history crashed inside torch.cat.
    if step > 0 and torch.is_tensor(chat_history_ids):
        if target_device is not None:
            chat_history_ids = chat_history_ids.to(target_device)
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
    else:
        bot_input_ids = new_user_input_ids

    # Sample a reply. max_length=512 is passed to generate() as the length cap.
    # NOTE(review): presumably that cap covers history plus reply, so a long
    # conversation may leave no room for new tokens — confirm and trim if needed.
    chat_history_ids = model.generate(
        bot_input_ids,
        max_length=512,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=100,
        top_p=0.7
    )

    # Decode only the tokens generated after the prompt.
    response = tokenizer.decode(
        chat_history_ids[:, bot_input_ids.shape[-1]:][0],
        skip_special_tokens=True,
    )

    return response, chat_history_ids, step + 1

def chat(model, tokenizer):
    """Run an interactive chat loop on stdin/stdout until a quit command is typed.

    Each line read from the ``>> `` prompt is echoed, sent to
    ``generate_responses`` together with the running history and step counter,
    and the reply is printed. Typing ``/q``, ``/quit``, ``/e`` or ``/exit``
    ends the loop.
    """
    quit_commands = ("/q", "/quit", "/e", "/exit")
    turn = 0
    history = []

    text = input(">> ")
    while text not in quit_commands:
        print(f"User: {text}")
        reply, history, turn = generate_responses(
            model=model,
            tokenizer=tokenizer,
            text=text,
            chat_history_ids=history,
            step=turn,
        )
        print(f"Bot: {reply}")
        text = input(">> ")
        

In [36]:
chat(finetuned_model, tokenizer)

>>  Hello?


User: Hello?
Bot: *Kalahan is in a really awkward spot in the living room with Lauren.*


>>  what's going on


User: what's going on
Bot: *He's been standing there for a few minutes, his arms crossed, but he's still not sure if he should be standing or sitting.*


>>  Are you okay?


User: Are you okay?
Bot: ...No.


>>  What happened?


User: What happened?
Bot: It's..it's been a rough day.


>>  Has it?


User: Has it?
Bot: [_nods_]


>>  /quit


## Re-do dataset

In [73]:
train_df.iloc[3000]

response                                                    [_looks concerned_] ..what do you mean?
context                                                     Now...I think...it was the same as you.
context/0                         I...remember much...of it. I...lose sense of time...near the end.
context/1                                                                       It is...unpleasant.
context/2                                                                                  [_nods_]
context/3                                                                  ...perhaps...it is time.
context/4    [_smiles weakly_] I thought...with everything that..you might want to talk to someone.
context/5                                                                ...is that...why you came?
Name: 3466, dtype: object

In [74]:
df.loc[df["text"] == "[_looks concerned_] ..what do you mean?"]

Unnamed: 0,character,text
14345,bitjockey,[_looks concerned_] ..what do you mean?


In [75]:
df.iloc[14345:14352]

Unnamed: 0,character,text
14345,bitjockey,[_looks concerned_] ..what do you mean?
14346,Shiro,"[_sighs_] They tried to...program me, I suppose. Simsense. Simulations. Drugs, I think."
14347,Shiro,I...remember the tank. But...near the end...things are chaotic. The memories...slip.
14348,bitjockey,[_stiffens_]
14349,bitjockey,So there are...gaps too? In your memory?
14350,Shiro,At that point...yes.
14351,Shiro,I...some are there but...I can't...hold onto them.


In [39]:
chat(finetuned_model, tokenizer)

>>  Hiya


User: Hiya
Bot: So, uh, what's up?


>>  Nothing much. You?


User: Nothing much. You?
Bot: I...just woke up.


>>  You did?


User: You did?
Bot: Yeah.


>>  Did you sleep okay?


User: Did you sleep okay?
Bot: You know it sounds like you are going to have a panic attack and start screaming.


>>  What?


User: What?
Bot: _::sits up straighter::_


>>  How you doing?


User: How you doing?
Bot: Just.


>>  Just?


User: Just?
Bot: About.


>>  I don't know


User: I don't know
Bot: Did your dad..tell you?


>>  [_shrugs_]


User: [_shrugs_]
Bot: I was born in '97, so I...had some time to think about it before coming back to this.


>>  How old are you?


User: How old are you?
Bot: ...I'm not really sure.


>>  /quit


In [80]:
train_df.loc[train_df["response"] == "[_looks concerned_] ..what do you mean?"]

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
3466,[_looks concerned_] ..what do you mean?,Now...I think...it was the same as you.,I...remember much...of it. I...lose sense of time...near the end.,It is...unpleasant.,[_nods_],...perhaps...it is time.,[_smiles weakly_] I thought...with everything that..you might want to talk to someone.,...is that...why you came?


In [83]:
train_dataset[3000]

{'response': '[_looks concerned_] ..what do you mean?',
 'context': 'Now...I think...it was the same as you.',
 'context/0': 'I...remember much...of it. I...lose sense of time...near the end.',
 'context/1': 'It is...unpleasant.',
 'context/2': '[_nods_]',
 'context/3': '...perhaps...it is time.',
 'context/4': '[_smiles weakly_] I thought...with everything that..you might want to talk to someone.',
 'context/5': '...is that...why you came?',
 '__index_level_0__': 3466}