# Fine-Tuning GPT

https://www.modeldifferently.com/en/2021/12/generaci%C3%B3n-de-fake-news-con-gpt-2/#data-loading-and-processing

In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
import torch, os, re, pandas as pd, json
from sklearn.model_selection import train_test_split
from transformers import (
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM
)
from datasets import Dataset, list_metrics, load_metric


In [2]:
#MODEL_NAME = "gpt2"
#model_cls = GPT2LMHeadModel
#tokenizer_cls = GPT2Tokenizer

MODEL_NAME = "microsoft/DialoGPT-small"
model_cls = AutoModelForCausalLM
tokenizer_cls = AutoTokenizer

In [3]:
if torch.cuda.is_available():  
    dev = "cuda:0" 
else:  
    dev = "cpu"  

device = torch.device(dev) 

print(f"""
    cuda: {torch.cuda.is_available()}
    current_device: {torch.cuda.current_device()}
    device_count: {torch.cuda.device_count()}
""")


    cuda: True
    current_device: 0
    device_count: 1



In [4]:
config = AutoConfig.from_pretrained(MODEL_NAME)

## Load Data, Tokenizer, and Model

In [44]:
filepath = "../data/empatheticdialogues/train.csv"
df = pd.read_csv(filepath, encoding="utf-8", on_bad_lines="skip")

In [45]:
df[:10]

Unnamed: 0,conv_id,utterance_idx,context,prompt,speaker_idx,utterance,selfeval,tags
0,hit:0_conv:1,1,sentimental,I remember going to the fireworks with my best...,1,I remember going to see the fireworks with my ...,5|5|5_2|2|5,
1,hit:0_conv:1,2,sentimental,I remember going to the fireworks with my best...,0,Was this a friend you were in love with_comma_...,5|5|5_2|2|5,
2,hit:0_conv:1,3,sentimental,I remember going to the fireworks with my best...,1,This was a best friend. I miss her.,5|5|5_2|2|5,
3,hit:0_conv:1,4,sentimental,I remember going to the fireworks with my best...,0,Where has she gone?,5|5|5_2|2|5,
4,hit:0_conv:1,5,sentimental,I remember going to the fireworks with my best...,1,We no longer talk.,5|5|5_2|2|5,
5,hit:0_conv:1,6,sentimental,I remember going to the fireworks with my best...,0,Oh was this something that happened because of...,5|5|5_2|2|5,
6,hit:1_conv:2,1,afraid,i used to scare for darkness,2,it feels like hitting to blank wall when i se...,4|3|4_3|5|5,
7,hit:1_conv:2,2,afraid,i used to scare for darkness,3,Oh ya? I don't really see how,4|3|4_3|5|5,
8,hit:1_conv:2,3,afraid,i used to scare for darkness,2,dont you feel so.. its a wonder,4|3|4_3|5|5,
9,hit:1_conv:2,4,afraid,i used to scare for darkness,3,I do actually hit blank walls a lot of times b...,4|3|4_3|5|5,


In [46]:
df.iloc[2]["prompt"]

'I remember going to the fireworks with my best friend. There was a lot of people_comma_ but it only felt like us in the world.'

In [47]:
base_tokenizer = tokenizer_cls.from_pretrained(MODEL_NAME)

In [51]:
df["context"] = df[["conv_id", "prompt", "utterance"]].groupby("conv_id")["utterance"].transform(lambda x: base_tokenizer.eos_token.join(x))

In [52]:
df["context"][0]

'I remember going to see the fireworks with my best friend. It was the first time we ever spent time alone together. Although there was a lot of people_comma_ we felt like the only people in the world.<|endoftext|>Was this a friend you were in love with_comma_ or just a best friend?<|endoftext|>This was a best friend. I miss her.<|endoftext|>Where has she gone?<|endoftext|>We no longer talk.<|endoftext|>Oh was this something that happened because of an argument?'

### Data Loading from CSV

In [6]:
filepath = "../data/empatheticdialogues/train.csv"
df = pd.read_csv(filepath, encoding="utf-8", usecols=["character", "content"]).rename(columns={"content": "text"})

pd.set_option("display.max_colwidth", None)
df.tail(10)

Unnamed: 0,character,text
32080,Charlotte,"What're the odds, right? But yes, we, uh...know each other. And he's also here in Boston."
32081,Toby,Right. Geeze. I...did not expect to ever hear from him again.
32082,Charlotte,...Surprise?
32083,Charlotte,"So, uh, would you want to talk to him? Because this is all stuff he really should get the chance to ask you about directly."
32084,Toby,Um...sure.
32085,Charlotte,"Ok, good. Um, we should probably go. But, uh, I'll see you tomorrow?"
32086,Charlotte,[_as she talks she takes the book off the table and carefully tucks it away in her bag_]
32087,Toby,"Um, yeah."
32088,Toby,Tomorrow.
32089,Charlotte,[_she will leave with the others_]


### Model

In [7]:
base_model = model_cls.from_pretrained(MODEL_NAME, config=config)

In [8]:
base_model.num_parameters

<bound method ModuleUtilsMixin.num_parameters of GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0

### Tokenizer

In [36]:
base_tokenizer = tokenizer_cls.from_pretrained(MODEL_NAME)

In [10]:
print(f"vocab_size: {base_tokenizer.vocab_size}")

vocab_size: 50257


In [11]:
vocabulary = base_tokenizer.get_vocab()
vocabulary["Hi"]

17250

In [12]:
base_tokenizer.eos_token

'<|endoftext|>'

In [11]:
base_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

## Conversational Model

### Data Preparation

In [20]:
import argparse
import tarfile
import os.path

from typing import Dict, Union

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoModelWithLMHead, AutoTokenizer
from datasets import Dataset


def load_csv(filename):
    return pd.read_csv(filename)


def prepare_data(
    data_or_filename: Union[str, pd.DataFrame],
    filter_by: str = None,
    filter_value: str = None,
    content_key: str = "text",
    n: int = 7,
    test_size: float = 0.1,
):
    data = load_csv(data_or_filename) if isinstance(data_or_filename, str) else data_or_filename

    contexted_data = prepare_context(
        data,
        filter_by=filter_by,
        filter_value=filter_value,
        content_key=content_key,
        n=n,
    )

    trn_df, val_df = train_test_split(contexted_data, test_size=test_size, shuffle=False)
    
    return trn_df, val_df


def prepare_context(
    data: pd.DataFrame,
    filter_by: str = None,
    filter_value: str = None,
    content_key: str = "text",
    n: int = 7,
):
    if filter_by:
        indexes = data.loc[data[filter_by] == filter_value].index
        for idx, i in enumerate(indexes):
            if i > n:
                break
        indexes = indexes[idx:]
    else:
        indexes = range(n, len(data[content_key]))
        
    contexted = []

    for i in indexes:
        row = []
        prev = i - 1 - n
        for j in range(i, prev, -1):
            row.append(data.iloc[j][content_key])
        contexted.append(row)
            
    columns = ["response", "context"]
    columns = columns + ["context/" + str(i) for i in range(n - 1)]

    print(columns)
    df = pd.DataFrame.from_records(contexted, columns=columns)
    
    return df


def prepare_dataset(df):
    columns = [col for col in df] 
    dataset = Dataset.from_pandas(concat_text(df))
    dataset = dataset.remove_columns(columns + ['__index_level_0__'])
    return dataset


def concat_text(df):
    df["text"] = df.apply(concat_text_in_row, axis=1)
    return df


def concat_text_in_row(row, eos_token):
    concat_text = f"{eos_token}".join(row)
    # Add to end
    concat_text += eos_token
    print(concat_text)
    return concat_text


def construct_conv(example, tokenizer, eos = True):
    flatten = lambda l: [item for sublist in l for item in sublist]
    conv = list(reversed([tokenizer.encode(x, padding="max_length", max_length=250) + [tokenizer.eos_token_id] for x in example]))
    print(f"Conv Length: {len(conv)}") 
    print(set(list(map(len, conv))))
    conv = flatten(conv)
    return conv

In [21]:
base_tokenizer.eos_token

'<|endoftext|>'

In [22]:
train_df, val_df = prepare_data(df, filter_by="character", filter_value="bitjockey")

['response', 'context', 'context/0', 'context/1', 'context/2', 'context/3', 'context/4', 'context/5']


In [41]:
train_df

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
0,i nvr askd,@bitjockey im free nw...?,@Charlotte we nvr hd a ch4nce 2 t4lk,((just watch)),((I don't know what to do but I'm excited)),((convention accepted)),((this will indicate ooc responses)),yea
1,wht exactly do u kno abt m3,i nvr askd,@bitjockey im free nw...?,@Charlotte we nvr hd a ch4nce 2 t4lk,((just watch)),((I don't know what to do but I'm excited)),((convention accepted)),((this will indicate ooc responses))
2,((omg why do they type like this lmao)),wht exactly do u kno abt m3,i nvr askd,@bitjockey im free nw...?,@Charlotte we nvr hd a ch4nce 2 t4lk,((just watch)),((I don't know what to do but I'm excited)),((convention accepted))
3,((okay I have no idea what that says lol)),ur nm. rl 1,lik,um,((_dystopia_. Also charlotte is a little shit)),((omg why do they type like this lmao)),wht exactly do u kno abt m3,i nvr askd
4,@Charlotte do u ever type in real words,((I speak hackese)),((you speak charlotte!),((don't ask how I can do this lol)),"((your name, real one))",((would hopper?)),((okay I have no idea what that says lol)),ur nm. rl 1
...,...,...,...,...,...,...,...,...
4958,Hello. Do you...have some time? To talk?,[_is in the office working at a computer. She looks up when Hopper enters_] Hi Alex.,[_weak smile_] Thanks. [_heads there_],She's in the office.,"[_Meanwhile, elsewhere..._]","[_is idly running fingers through Shiro's hair as she sleeps_] Trust me, I'm the _last_ person who's going to fault you for that bit of worry.","I don't know her well, but I guess this isn't how it normally goes? Maybe she didn't want to distract you while you were working. [_sighs_] But you're probably thinking of all the bad case scenarios, aren't you?",[_nods_]
4959,"Uhm, so...I was wondering if....There was something you mentioned about helping Shiro. I was wondering if I could maybe help with that?","[_collateral nonchalantly wanders into the common room, sits down and resumes watching {whatever sci-fi show he was watching previous}._]",Of course.,Hello. Do you...have some time? To talk?,[_is in the office working at a computer. She looks up when Hopper enters_] Hi Alex.,[_weak smile_] Thanks. [_heads there_],She's in the office.,"[_Meanwhile, elsewhere..._]"
4960,"Uhm, well. I need a refresher? I think?",How much do you know?,"Oh! Sure, I'd love that.","Uhm, so...I was wondering if....There was something you mentioned about helping Shiro. I was wondering if I could maybe help with that?","[_collateral nonchalantly wanders into the common room, sits down and resumes watching {whatever sci-fi show he was watching previous}._]",Of course.,Hello. Do you...have some time? To talk?,[_is in the office working at a computer. She looks up when Hopper enters_] Hi Alex.
4961,"[_nods_] Right, right...so..how can I help?","[_Akari launches into a technical explanation. The short version is that even if Nix and Temper manage to stop the degradation, the damage has already been done. Neither of them are certain they'll be able to fix that, so they're really more focused on keeping Shiro alive. In most cases, a fix to this kind of problem would be biotech replacements or treatments. However, since Shiro's system is so fucked up, the introduction of new cyberware was the catalyst for this in the first place, and the modifications are basically evolving. The closest comparison is a very bad autoimmune disorder.\nWhen Akari was in Montreal, her research area was working on new cyberware. It's currently theoretical, but Akari was able to figure out a way to integrate cyberware in such a way that it does less damage to the body. It's not nanotech but an almost symbiotic merging of biotech and cybertech. Akari is convinced the theory is sound, she just needs to work out the physical details_]",Right.,"Uhm, well. I need a refresher? I think?",How much do you know?,"Oh! Sure, I'd love that.","Uhm, so...I was wondering if....There was something you mentioned about helping Shiro. I was wondering if I could maybe help with that?","[_collateral nonchalantly wanders into the common room, sits down and resumes watching {whatever sci-fi show he was watching previous}._]"


In [40]:
train_df.iloc[0].values

array(['i nvr askd', '@bitjockey im free nw...?',
       '@Charlotte we nvr hd a ch4nce 2 t4lk', '((just watch))',
       "((I don't know what to do but I'm excited))",
       '((convention accepted))', '((this will indicate ooc responses))',
       'yea'], dtype=object)

In [42]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [43]:
train_dataset

Dataset({
    features: ['response', 'context', 'context/0', 'context/1', 'context/2', 'context/3', 'context/4', 'context/5', '__index_level_0__'],
    num_rows: 4963
})

In [25]:
val_dataset

Dataset({
    features: ['response', 'context', 'context/0', 'context/1', 'context/2', 'context/3', 'context/4', 'context/5', '__index_level_0__'],
    num_rows: 552
})

In [44]:
def preprocess_function(tokenizer, max_length=512):
    def _construct(examples):
        flatten = lambda l: [item for sublist in l for item in sublist] 
        concat_text = f"{tokenizer.eos_token}".join(reversed([v for _, v in examples.items() if isinstance(v, str)]))
        concat_text = concat_text + tokenizer.eos_token
        tokenized = tokenizer(concat_text, padding="max_length",  max_length=max_length)
        examples["input_ids"] = tokenized["input_ids"]
        examples["attention_mask"] = tokenized["attention_mask"]
        return examples
        
    return _construct

In [45]:
tokenized_train_dataset = train_dataset.map(preprocess_function(tokenizer=base_tokenizer, max_length=256), remove_columns=list(train_dataset.features.keys()))
tokenized_val_dataset = val_dataset.map(preprocess_function(tokenizer=base_tokenizer, max_length=256), remove_columns=list(val_dataset.features.keys()))

  0%|          | 0/4963 [00:00<?, ?ex/s]

  0%|          | 0/552 [00:00<?, ?ex/s]

In [50]:
for i in range(100, 110):
    decoded = base_tokenizer.decode(tokenized_train_dataset[i]["input_ids"])
    print(len(tokenized_train_dataset[i]["input_ids"]), decoded, "\n")
    

256 hard to find someone who doesn’t exist anymore<|endoftext|>But enough about me<|endoftext|>I could make it harder to find. Make sure other people can't follow the trail I did. If you want.<|endoftext|>I’m..open to it<|endoftext|>I can put Pongo to work on it<|endoftext|>pngo?<|endoftext|>One of my sprites<|endoftext|>a sprite?<|endoftext|>[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]

In [30]:
tokenized_train_dataset

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 4963
})

In [51]:
tokenized_train_dataset.set_format(type="torch", columns=["input_ids"])
tokenized_val_dataset.set_format(type="torch", columns=["input_ids"])

In [53]:
base_tokenizer.decode(tokenized_train_dataset[3000]["input_ids"])

"In our defense, you _were_ a little obnoxious.<|endoftext|>Wow.<|endoftext|>Well I didn't exactly think I'd be there very long.<|endoftext|>[_frowns_]<|endoftext|>[*shrugs*] I'd been through a lot of fosters at that point.<|endoftext|>*Kalahan's ears perk up at the word 'fosters' and he frowns, somewhat confused, as he tries to remember if he already knew about this or not.*<|endoftext|>*He recalls that he did know she was adopted and seems to mellow out, but he's more interested in the situation now.*<|endoftext|>So..Medusa and Nix..you were already close?<|endoftext|>[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PA

### Metrics

In [54]:
metric = load_metric("perplexity")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


### Training

In [55]:
base_model.resize_token_embeddings(len(base_tokenizer))

Embedding(50258, 768)

In [32]:
FINETUNED_MODEL = 'CHARLOTTE-05162022a-myDialoGPT2-small'

data_collator = DataCollatorForLanguageModeling(
    tokenizer=base_tokenizer,
    mlm=False,
    return_tensors="pt",
)

trainer = None
training_args = None
training_args = TrainingArguments(
    output_dir=FINETUNED_MODEL,          # output directory
    evaluation_strategy="epoch",
    num_train_epochs=3,           # total # of training epochs
    per_device_train_batch_size=2,  # batch size per device during training
    per_device_eval_batch_size=2,   # batch size for evaluation
    weight_decay=0.01,           # strength of weight decay
    logging_dir=FINETUNED_MODEL,            # directory for storing logs
    prediction_loss_only=True,
)

In [33]:
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [34]:
trainer.train()

***** Running training *****
  Num examples = 7823
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 11736


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss
1,2.988,3.169161
2,2.5902,3.244871
3,2.4256,3.307563


Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-500
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-500/config.json


Attempted to log scalar metric loss:
4.3112
Attempted to log scalar metric learning_rate:
4.786980231765508e-05
Attempted to log scalar metric epoch:
0.13


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-500/pytorch_model.bin
Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-1000
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-1000/config.json


Attempted to log scalar metric loss:
3.5121
Attempted to log scalar metric learning_rate:
4.573960463531016e-05
Attempted to log scalar metric epoch:
0.26


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-1500
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-1500/config.json


Attempted to log scalar metric loss:
3.3265
Attempted to log scalar metric learning_rate:
4.360940695296524e-05
Attempted to log scalar metric epoch:
0.38


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-2000
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-2000/config.json


Attempted to log scalar metric loss:
3.2139
Attempted to log scalar metric learning_rate:
4.147920927062032e-05
Attempted to log scalar metric epoch:
0.51


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-2500
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-2500/config.json


Attempted to log scalar metric loss:
3.1432
Attempted to log scalar metric learning_rate:
3.934901158827539e-05
Attempted to log scalar metric epoch:
0.64


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-3000
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-3000/config.json


Attempted to log scalar metric loss:
3.0656
Attempted to log scalar metric learning_rate:
3.7218813905930474e-05
Attempted to log scalar metric epoch:
0.77


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-3000/pytorch_model.bin
Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-3500
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-3500/config.json


Attempted to log scalar metric loss:
2.988
Attempted to log scalar metric learning_rate:
3.508861622358555e-05
Attempted to log scalar metric epoch:
0.89


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-3500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 870
  Batch size = 2


Attempted to log scalar metric eval_loss:
3.169161319732666
Attempted to log scalar metric eval_runtime:
12.1362
Attempted to log scalar metric eval_samples_per_second:
71.687
Attempted to log scalar metric eval_steps_per_second:
35.843
Attempted to log scalar metric epoch:
1.0


Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-4000
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-4000/config.json


Attempted to log scalar metric loss:
2.9464
Attempted to log scalar metric learning_rate:
3.2958418541240625e-05
Attempted to log scalar metric epoch:
1.02


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-4000/pytorch_model.bin
Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-4500
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-4500/config.json


Attempted to log scalar metric loss:
2.7669
Attempted to log scalar metric learning_rate:
3.0828220858895703e-05
Attempted to log scalar metric epoch:
1.15


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-4500/pytorch_model.bin
Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-5000
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-5000/config.json


Attempted to log scalar metric loss:
2.7445
Attempted to log scalar metric learning_rate:
2.8698023176550785e-05
Attempted to log scalar metric epoch:
1.28


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-5000/pytorch_model.bin
Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-5500
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-5500/config.json


Attempted to log scalar metric loss:
2.6896
Attempted to log scalar metric learning_rate:
2.656782549420586e-05
Attempted to log scalar metric epoch:
1.41


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-5500/pytorch_model.bin
Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-6000
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-6000/config.json


Attempted to log scalar metric loss:
2.68
Attempted to log scalar metric learning_rate:
2.4437627811860943e-05
Attempted to log scalar metric epoch:
1.53


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-6000/pytorch_model.bin
Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-6500
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-6500/config.json


Attempted to log scalar metric loss:
2.6444
Attempted to log scalar metric learning_rate:
2.230743012951602e-05
Attempted to log scalar metric epoch:
1.66


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-6500/pytorch_model.bin
Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-7000
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-7000/config.json


Attempted to log scalar metric loss:
2.6484
Attempted to log scalar metric learning_rate:
2.01772324471711e-05
Attempted to log scalar metric epoch:
1.79


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-7000/pytorch_model.bin
Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-7500
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-7500/config.json


Attempted to log scalar metric loss:
2.5902
Attempted to log scalar metric learning_rate:
1.8047034764826175e-05
Attempted to log scalar metric epoch:
1.92


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-7500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 870
  Batch size = 2


Attempted to log scalar metric eval_loss:
3.244870901107788
Attempted to log scalar metric eval_runtime:
12.0984
Attempted to log scalar metric eval_samples_per_second:
71.91
Attempted to log scalar metric eval_steps_per_second:
35.955
Attempted to log scalar metric epoch:
2.0


Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-8000
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-8000/config.json


Attempted to log scalar metric loss:
2.5169
Attempted to log scalar metric learning_rate:
1.5916837082481257e-05
Attempted to log scalar metric epoch:
2.04


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-8000/pytorch_model.bin
Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-8500
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-8500/config.json


Attempted to log scalar metric loss:
2.4503
Attempted to log scalar metric learning_rate:
1.3786639400136334e-05
Attempted to log scalar metric epoch:
2.17


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-8500/pytorch_model.bin
Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-9000
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-9000/config.json


Attempted to log scalar metric loss:
2.4398
Attempted to log scalar metric learning_rate:
1.1656441717791411e-05
Attempted to log scalar metric epoch:
2.3


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-9000/pytorch_model.bin
Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-9500
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-9500/config.json


Attempted to log scalar metric loss:
2.4249
Attempted to log scalar metric learning_rate:
9.52624403544649e-06
Attempted to log scalar metric epoch:
2.43


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-9500/pytorch_model.bin
Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-10000
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-10000/config.json


Attempted to log scalar metric loss:
2.4436
Attempted to log scalar metric learning_rate:
7.3960463531015685e-06
Attempted to log scalar metric epoch:
2.56


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-10000/pytorch_model.bin
Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-10500
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-10500/config.json


Attempted to log scalar metric loss:
2.406
Attempted to log scalar metric learning_rate:
5.265848670756646e-06
Attempted to log scalar metric epoch:
2.68


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-10500/pytorch_model.bin
Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-11000
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-11000/config.json


Attempted to log scalar metric loss:
2.4101
Attempted to log scalar metric learning_rate:
3.135650988411725e-06
Attempted to log scalar metric epoch:
2.81


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-11000/pytorch_model.bin
Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-11500
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-11500/config.json


Attempted to log scalar metric loss:
2.4256
Attempted to log scalar metric learning_rate:
1.0054533060668031e-06
Attempted to log scalar metric epoch:
2.94


Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/checkpoint-11500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 870
  Batch size = 2


Attempted to log scalar metric eval_loss:
3.307562828063965
Attempted to log scalar metric eval_runtime:
12.0967
Attempted to log scalar metric eval_samples_per_second:
71.92
Attempted to log scalar metric eval_steps_per_second:
35.96
Attempted to log scalar metric epoch:
3.0




Training completed. Do not forget to share your model on huggingface.co/models =)




Attempted to log scalar metric train_runtime:
1450.5928
Attempted to log scalar metric train_samples_per_second:
16.179
Attempted to log scalar metric train_steps_per_second:
8.09
Attempted to log scalar metric total_flos:
3102890851584000.0
Attempted to log scalar metric train_loss:
2.808187192203077
Attempted to log scalar metric epoch:
3.0


TrainOutput(global_step=11736, training_loss=2.808187192203077, metrics={'train_runtime': 1450.5928, 'train_samples_per_second': 16.179, 'train_steps_per_second': 8.09, 'total_flos': 3102890851584000.0, 'train_loss': 2.808187192203077, 'epoch': 3.0})

In [36]:
trainer.save_model(FINETUNED_MODEL)
base_tokenizer.save_pretrained(FINETUNED_MODEL)
config.save_pretrained(FINETUNED_MODEL)

Saving model checkpoint to CHARLOTTE-05162022a-myDialoGPT2-small
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/config.json
Model weights saved in CHARLOTTE-05162022a-myDialoGPT2-small/pytorch_model.bin
tokenizer config file saved in CHARLOTTE-05162022a-myDialoGPT2-small/tokenizer_config.json
Special tokens file saved in CHARLOTTE-05162022a-myDialoGPT2-small/special_tokens_map.json
Configuration saved in CHARLOTTE-05162022a-myDialoGPT2-small/config.json


In [37]:
finetuned_model = AutoModelForCausalLM.from_pretrained(FINETUNED_MODEL)
tokenizer = AutoTokenizer.from_pretrained(FINETUNED_MODEL)

loading configuration file CHARLOTTE-05162022a-myDialoGPT2-small/config.json
Model config GPT2Config {
  "_name_or_path": "CHARLOTTE-05162022a-myDialoGPT2-small",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "conversational": {
      "max_length": 1000
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.19.0",
  "use_cache": true,
  "voca

In [38]:
def generate_responses(model, tokenizer, text, chat_history_ids=None, step=0):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors='pt')

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(
        bot_input_ids,
        max_length=512,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=100,
        top_p=0.7
    )
    
    response = tokenizer.decode(
        chat_history_ids[:, bot_input_ids.shape[-1]:][0],
        skip_special_tokens=True,
    )

    return response, chat_history_ids, step + 1

def chat(model, tokenizer):
    step = 0
    chat_history_ids = []
    
    while True: 
        text = input(">> ")
        if text in ["/q", "/quit", "/e", "/exit"]: break
        print(f"User: {text}")
        response, chat_history_ids, step = generate_responses(
            model=model,
            tokenizer=tokenizer,
            text=text,
            chat_history_ids=chat_history_ids,
            step=step
        )
        print(f"Bot: {response}")
        

In [39]:
chat(finetuned_model, tokenizer)

>>  Hello


User: Hello
Bot: *Kalahan chuckles and shrugs.*


>>  What's wrong with you?


User: What's wrong with you?
Bot: *He glances over at Amarok and Eliza.*


>>  What about them?


User: What about them?
Bot: [_shrugs_] I don't know. They seem nice.


>>  How are you doing?


User: How are you doing?
Bot: They're doing better. I think. I mean, it's nice to have something to do.


>>  No, I mean you.


User: No, I mean you.
Bot: [_quietly_] Right, that.


>>  This doesn't make sense, dude.


User: This doesn't make sense, dude.
Bot: It's...not the end of the world.


>>  But isn't it though?


User: But isn't it though?
Bot: It...is not?


>>  How's Shiro?


User: How's Shiro?
Bot: She's doing better


>>  That's good to hear.


User: That's good to hear.
Bot: I guess


>>  Who is she with?


User: Who is she with?
Bot: [*shrug*]


>>  What are you doing?


User: What are you doing?
Bot: Just hanging out


>>  Are we meeting up? Where?


User: Are we meeting up? Where?
Bot: Yep


>>  Where are we meeting up?


User: Where are we meeting up?
Bot: Yeah


>>  What is the location


User: What is the location
Bot: You know where I'm going to meet up


>>  No, I do not.


User: No, I do not.
Bot: [charlotte will show up soon


>>  Hey


User: Hey
Bot: [[*follows after him*] Did you guys have anything else to drink?


>>  No we don't


User: No we don't
Bot: [will wait unless you want to wait?]


>>  I don't


User: I don't
Bot: ....[*will wait if you want us to...?


>>  I really don't want to


User: I really don't want to
Bot: [_will show up unless you say something


>>  /quit


## Re-do dataset

In [73]:
train_df.iloc[3000]

response                                                    [_looks concerned_] ..what do you mean?
context                                                     Now...I think...it was the same as you.
context/0                         I...remember much...of it. I...lose sense of time...near the end.
context/1                                                                       It is...unpleasant.
context/2                                                                                  [_nods_]
context/3                                                                  ...perhaps...it is time.
context/4    [_smiles weakly_] I thought...with everything that..you might want to talk to someone.
context/5                                                                ...is that...why you came?
Name: 3466, dtype: object

In [74]:
df.loc[df["text"] == "[_looks concerned_] ..what do you mean?"]

Unnamed: 0,character,text
14345,bitjockey,[_looks concerned_] ..what do you mean?


In [75]:
df.iloc[14345:14352]

Unnamed: 0,character,text
14345,bitjockey,[_looks concerned_] ..what do you mean?
14346,Shiro,"[_sighs_] They tried to...program me, I suppose. Simsense. Simulations. Drugs, I think."
14347,Shiro,I...remember the tank. But...near the end...things are chaotic. The memories...slip.
14348,bitjockey,[_stiffens_]
14349,bitjockey,So there are...gaps too? In your memory?
14350,Shiro,At that point...yes.
14351,Shiro,I...some are there but...I can't...hold onto them.


In [39]:
chat(finetuned_model, tokenizer)

>>  Hiya


User: Hiya
Bot: So, uh, what's up?


>>  Nothing much. You?


User: Nothing much. You?
Bot: I...just woke up.


>>  You did?


User: You did?
Bot: Yeah.


>>  Did you sleep okay?


User: Did you sleep okay?
Bot: You know it sounds like you are going to have a panic attack and start screaming.


>>  What?


User: What?
Bot: _::sits up straighter::_


>>  How you doing?


User: How you doing?
Bot: Just.


>>  Just?


User: Just?
Bot: About.


>>  I don't know


User: I don't know
Bot: Did your dad..tell you?


>>  [_shrugs_]


User: [_shrugs_]
Bot: I was born in '97, so I...had some time to think about it before coming back to this.


>>  How old are you?


User: How old are you?
Bot: ...I'm not really sure.


>>  /quit


In [80]:
train_df.loc[train_df["response"] == "[_looks concerned_] ..what do you mean?"]

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
3466,[_looks concerned_] ..what do you mean?,Now...I think...it was the same as you.,I...remember much...of it. I...lose sense of time...near the end.,It is...unpleasant.,[_nods_],...perhaps...it is time.,[_smiles weakly_] I thought...with everything that..you might want to talk to someone.,...is that...why you came?


In [83]:
train_dataset[3000]

{'response': '[_looks concerned_] ..what do you mean?',
 'context': 'Now...I think...it was the same as you.',
 'context/0': 'I...remember much...of it. I...lose sense of time...near the end.',
 'context/1': 'It is...unpleasant.',
 'context/2': '[_nods_]',
 'context/3': '...perhaps...it is time.',
 'context/4': '[_smiles weakly_] I thought...with everything that..you might want to talk to someone.',
 'context/5': '...is that...why you came?',
 '__index_level_0__': 3466}