# Fine-Tuning GPT

https://www.modeldifferently.com/en/2021/12/generaci%C3%B3n-de-fake-news-con-gpt-2/#data-loading-and-processing

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch, os, re, pandas as pd, json
from sklearn.model_selection import train_test_split
from transformers import (
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM
)
from datasets import Dataset, list_metrics, load_metric


In [3]:
#MODEL_NAME = "gpt2"
#model_cls = GPT2LMHeadModel
#tokenizer_cls = GPT2Tokenizer

MODEL_NAME = "microsoft/DialoGPT-small"
model_cls = AutoModelForCausalLM
tokenizer_cls = AutoTokenizer

In [4]:
if torch.cuda.is_available():  
    dev = "cuda:0" 
else:  
    dev = "cpu"  

device = torch.device(dev) 

print(f"""
    cuda: {torch.cuda.is_available()}
    current_device: {torch.cuda.current_device()}
    device_count: {torch.cuda.device_count()}
""")


    cuda: True
    current_device: 0
    device_count: 1



In [5]:
config = AutoConfig.from_pretrained(MODEL_NAME)

## Load Data, Tokenizer, and Model

### Data Loading from CSV

In [6]:
filepath = "../data/processed.csv"
df = pd.read_csv(filepath, encoding="utf-8", usecols=["character", "content"]).rename(columns={"content": "text"})

pd.set_option("display.max_colwidth", None)
df.tail(10)

Unnamed: 0,character,text
32080,Charlotte,"What're the odds, right? But yes, we, uh...know each other. And he's also here in Boston."
32081,Toby,Right. Geeze. I...did not expect to ever hear from him again.
32082,Charlotte,...Surprise?
32083,Charlotte,"So, uh, would you want to talk to him? Because this is all stuff he really should get the chance to ask you about directly."
32084,Toby,Um...sure.
32085,Charlotte,"Ok, good. Um, we should probably go. But, uh, I'll see you tomorrow?"
32086,Charlotte,[_as she talks she takes the book off the table and carefully tucks it away in her bag_]
32087,Toby,"Um, yeah."
32088,Toby,Tomorrow.
32089,Charlotte,[_she will leave with the others_]


### Model

In [7]:
base_model = model_cls.from_pretrained(MODEL_NAME, config=config)

In [8]:
base_model.num_parameters

<bound method ModuleUtilsMixin.num_parameters of GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0

### Tokenizer

In [9]:
base_tokenizer = tokenizer_cls.from_pretrained(MODEL_NAME)

In [10]:
print(f"vocab_size: {base_tokenizer.vocab_size}")

vocab_size: 50257


In [11]:
vocabulary = base_tokenizer.get_vocab()
vocabulary["Hi"]

17250

In [21]:
base_tokenizer.eos_token

'<|endoftext|>'

## Conversational Model

### Data Preparation

In [68]:
import argparse
import tarfile
import os.path

from typing import Dict, Union

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoModelWithLMHead, AutoTokenizer
from datasets import Dataset


def load_csv(filename):
    return pd.read_csv(filename)


def prepare_data(
    data_or_filename: Union[str, pd.DataFrame],
    filter_by: str = None,
    filter_value: str = None,
    content_key: str = "text",
    n: int = 7,
    test_size: float = 0.1,
    flatten: bool = True,
):
    data = load_csv(data_or_filename) if isinstance(data_or_filename, str) else data_or_filename

    contexted_data = prepare_context(
        data,
        filter_by=filter_by,
        filter_value=filter_value,
        content_key=content_key,
        n=n,
    )

    trn_df, val_df = train_test_split(contexted_data, test_size=test_size)
    
    if flatten:
        train_dataset = prepare_dataset(trn_df)
        val_dataset = prepare_dataset(val_df)
        return train_dataset, val_dataset

    return trn_df, val_df


def prepare_context(
    data: pd.DataFrame,
    filter_by: str = None,
    filter_value: str = None,
    content_key: str = "text",
    n: int = 7,
):
    if filter_by:
        indexes = data.loc[data[filter_by] == filter_value].index
    else:
        indexes = range(n, len(data[content_key]))

    contexted = []

    for i in indexes:
        row = []
        prev = i - 1 - n
        for j in range(i, prev, -1):
            row.append(data.iloc[j][content_key])
        contexted.append(row)

    columns = ["response", "context"]
    columns = columns + ["context/" + str(i) for i in range(n - 1)]

    df = pd.DataFrame.from_records(contexted, columns=columns)
    return df


def prepare_dataset(df):
    columns = [col for col in df] 
    dataset = Dataset.from_pandas(concat_text(df))
    dataset = dataset.remove_columns(columns + ['__index_level_0__'])
    return dataset


def concat_text(df):
    df["text"] = df.apply(concat_text_in_row, axis=1)
    return df


def concat_text_in_row(row):
    concat_text = f"{base_tokenizer.eos_token}".join(row.values)
    # Add to end
    concat_text += base_tokenizer.eos_token
    return concat_text


def construct_conv(example, tokenizer, eos = True):
    flatten = lambda l: [item for sublist in l for item in sublist]
    conv = list(reversed([tokenizer.encode(x, padding="max_length", max_length=250) + [tokenizer.eos_token_id] for x in example]))
    print(f"Conv Length: {len(conv)}") 
    print(set(list(map(len, conv))))
    conv = flatten(conv)
    return conv

In [70]:
base_tokenizer.pad_token = "<|PAD|>"

In [71]:
conv1 = construct_conv(train_df.iloc[1], base_tokenizer)

Conv Length: 8
{251}


In [72]:
conv1

[29795,
 1462,
 575,
 9613,
 290,
 309,
 620,
 19181,
 62,
 60,
 1374,
 466,
 356,
 7765,
 683,
 510,
 30,
 22481,
 4692,
 1660,
 319,
 683,
 44825,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50256,
 50

In [26]:
train_df, val_df = prepare_data(df, filter_by="character", filter_value="bitjockey", flatten=False)

In [120]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [207]:
def preprocess(tokenizer):
    def _tokenize(example):
        flatten = lambda l: [item for sublist in l for item in sublist]
        conv = list(reversed([tokenizer.encode(x, padding="max_length", max_length=250) + [tokenizer.eos_token_id] for x in example if "__index" not in x]))
        conv = flatten(conv)
        return conv
        
    def _preprocess(examples):
        #flatten = lambda l: [item for sublist in l for item in sublist]
        #conv = list([tokenizer.encode(x, padding="max_length", max_length=250) + [tokenizer.eos_token_id] for x in examples])
        #conv = flatten(conv)
        #return {"input_ids": conv}
        return {"input_ids": list(map(_tokenize, examples))}
    
    return _preprocess


def tokenize(example):
    flatten = lambda l: [item for sublist in l for item in sublist]
    conv = list(reversed([tokenizer.encode(x, padding="max_length", max_length=250) + [tokenizer.eos_token_id] for x in example]))
    print(f"Conv Length: {len(conv)}") 
    print(set(list(map(len, conv))))
    conv = flatten(conv)
    return conv

def debug_preprocess(examples):
    print("count", len(examples)) 
    #conv = list(reversed([tokenizer.encode(x, padding="max_length", max_length=250) + [tokenizer.eos_token_id] for x in examples]))
    #print(conv)
    print([v for k,v in examples.items()])

In [189]:
train_dataset

Dataset({
    features: ['response', 'context', 'context/0', 'context/1', 'context/2', 'context/3', 'context/4', 'context/5', '__index_level_0__'],
    num_rows: 4967
})

In [190]:
val_dataset

Dataset({
    features: ['response', 'context', 'context/0', 'context/1', 'context/2', 'context/3', 'context/4', 'context/5', '__index_level_0__'],
    num_rows: 552
})

In [208]:
val_dataset.map(debug_preprocess)

  0%|          | 0/552 [00:00<?, ?ex/s]

count 9
['I am al right', 'me or yourself?', 'who are you trying to convince?', 'I am fine.', 'I am fine. I am okay.', 'Everything is fine.', "...you know you can tell me if something's wrong, ok?", '............k', 420]
count 9
['I had..one thing. To look forward to. That is..where Charlotte and I became acquiainted.', 'What?', '[_sighs_] There was..something.', '...any time?', "I couldn't really..look past that. There wasn't...", 'But you...never had a chance to really deal with all of it, because you were focused on just...getting to the next morning.', '[_Medusa looks like she wants to say something, but then thinks better of it._]', 'Okay.', 2807]
count 9
["(( I really haven't eaten anything btw so this is .. method acting or w/e ))", '@Charlotte hey', '(( 😂 ))', '[quietly] What the fuck', '[Mercifully,  no dreams haunted her during her nap. She heads over to her kitchen table where she left her cyberdeck and sits down. Her commlink right now is slaved to her cyberdeck so she can 

Dataset({
    features: ['response', 'context', 'context/0', 'context/1', 'context/2', 'context/3', 'context/4', 'context/5', '__index_level_0__'],
    num_rows: 552
})

In [196]:
val_df

Unnamed: 0,response,context,context/0,context/1,context/2,context/3,context/4,context/5
420,I am al right,me or yourself?,who are you trying to convince?,I am fine.,I am fine. I am okay.,Everything is fine.,"...you know you can tell me if something's wrong, ok?",............k
2807,I had..one thing. To look forward to. That is..where Charlotte and I became acquiainted.,What?,[_sighs_] There was..something.,...any time?,I couldn't really..look past that. There wasn't...,"But you...never had a chance to really deal with all of it, because you were focused on just...getting to the next morning.","[_Medusa looks like she wants to say something, but then thinks better of it._]",Okay.
433,(( I really haven't eaten anything btw so this is .. method acting or w/e )),@Charlotte hey,(( 😂 )),[quietly] What the fuck,"[Mercifully, no dreams haunted her during her nap. She heads over to her kitchen table where she left her cyberdeck and sits down. Her commlink right now is slaved to her cyberdeck so she can read it on a bigger screen or w/e. She reads the chat log between her and Charlotte.]",_It's past noon. And I haven't eaten anything_,Drek.,(( ugh why did I come up with such a brooding character 😛 ))
3450,[_shakes head_] It was disturbing enough for all of us who _did_ see. For you..I don't..think it would have been a good idea to see it.,[_quietly_] I...am not sure if...I should be glad I...didn't see it...or not.,...no.,[_sighs_] I figured as much. It's still...it isn't any better.,It was not...unexpected.,[_Shiro sets the tea and the book she was reading down_],[_shrugs slightly_],[_sits wherever there is space_] [_softly_] How...are you?
2157,I did.,You seemed like you enjoyed this afternoon.,"[_distributes it, I mean_]",[_presents the food to the squad_],"[_assuming you have ordered food, it arrives after said time passes_]","[_the food will arrive in the exact maximum estimated time the order claims, because this is Boston and a dystopia and so we can't have nice things and this is how it works anyway_]",Alright,Guess we know what Julia wants.
...,...,...,...,...,...,...,...,...
3862,Alright.,...So there will be options.,[*charlotte will name some games that he and kalahan totally talked about in the days leading up to this*],Do you know what we'll have to play? I don't even know what Kalahan enjoys playing...,Should be an interesting evening,Yeah.,"[*notices hopper and the moms (aka a convenient excuse to ignore nix's Look)*] Hey, you guys heading over too?","[_as she makes her way towards Kalahan's place_] You know, I don't actually know what games they'll have there."
351,yes but i really think that was my fault shti,okay. are they...getting help?,okay,the job went well but my...asosicate is not,i mean,sorry,"no everyone is not okay, no everyone is not alive, or no the job did not go south?",no
160,((i've given up trying to type in 1337 speak all the time)),yeah,@Charlotte r u free,"So, it's a day ending in 'y'. Got it.","so, you know. just Nix bein Nix",she did her stupid fuckn manipulative mastermind shit,Do I want to know why you have to yell at her?,👍
5262,Not unless you can help with...[_shrug_],Anything you might need help with?,"Oh, I see. I'm not up to anything important right now. You?","...But, uh, what're you up to?","[_nods_] He wasn't finished before, just out of tasks. And, well, he wanted to share what he'd found so far. For obvious reasons.",Oh? You did?,"I don't know? But, uh, I sent Wilbur back.",Oh no...it's alright. How are you doing?


In [54]:
tokenized_train_dataset = train_dataset.map(lambda e: base_tokenizer(e["text"], truncation=True, padding="max_length", max_length=250), remove_columns=["text"], batched=True)
tokenized_train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])
tokenized_val_dataset = val_dataset.map(lambda e: base_tokenizer(e["text"], truncation=True, padding="max_length", max_length=250), remove_columns=["text"], batched=True)
tokenized_val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask"])

  0%|          | 0/5 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

### Metrics

In [50]:
metric = load_metric("perplexity")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


### Training

In [48]:
base_model.resize_token_embeddings(len(base_tokenizer))

Embedding(50257, 768)

In [57]:
model_path = './my-DialoGPT'

data_collator = DataCollatorForLanguageModeling(
    tokenizer=base_tokenizer,
    mlm=False,
)

trainer = None
training_args = TrainingArguments(
    output_dir=model_path,          # output directory
    evaluation_strategy="epoch",
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=2,  # batch size per device during training
    per_device_eval_batch_size=2,   # batch size for evaluation
    weight_decay=0.01,               # strength of weight decay
    logging_dir=model_path,            # directory for storing logs
    prediction_loss_only=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [58]:
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [59]:
trainer.get_train_dataloader()

<torch.utils.data.dataloader.DataLoader at 0x7f01501b70d0>

In [60]:
trainer.train()

***** Running training *****
  Num examples = 4967
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 7452


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss
1,3.2241,2.903491
2,2.7628,2.639832
3,2.5225,2.549835


Saving model checkpoint to ./my-DialoGPT/checkpoint-500
Configuration saved in ./my-DialoGPT/checkpoint-500/config.json


Attempted to log scalar metric loss:
4.3744
Attempted to log scalar metric learning_rate:
4.6645195920558245e-05
Attempted to log scalar metric epoch:
0.2


Model weights saved in ./my-DialoGPT/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./my-DialoGPT/checkpoint-1000
Configuration saved in ./my-DialoGPT/checkpoint-1000/config.json


Attempted to log scalar metric loss:
3.575
Attempted to log scalar metric learning_rate:
4.329039184111648e-05
Attempted to log scalar metric epoch:
0.4


Model weights saved in ./my-DialoGPT/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./my-DialoGPT/checkpoint-1500
Configuration saved in ./my-DialoGPT/checkpoint-1500/config.json


Attempted to log scalar metric loss:
3.3675
Attempted to log scalar metric learning_rate:
3.993558776167472e-05
Attempted to log scalar metric epoch:
0.6


Model weights saved in ./my-DialoGPT/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./my-DialoGPT/checkpoint-2000
Configuration saved in ./my-DialoGPT/checkpoint-2000/config.json


Attempted to log scalar metric loss:
3.2241
Attempted to log scalar metric learning_rate:
3.6580783682232964e-05
Attempted to log scalar metric epoch:
0.81


Model weights saved in ./my-DialoGPT/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 552
  Batch size = 2


Attempted to log scalar metric eval_loss:
2.9034910202026367
Attempted to log scalar metric eval_runtime:
7.3398
Attempted to log scalar metric eval_samples_per_second:
75.206
Attempted to log scalar metric eval_steps_per_second:
37.603
Attempted to log scalar metric epoch:
1.0


Saving model checkpoint to ./my-DialoGPT/checkpoint-2500
Configuration saved in ./my-DialoGPT/checkpoint-2500/config.json


Attempted to log scalar metric loss:
3.1195
Attempted to log scalar metric learning_rate:
3.32259796027912e-05
Attempted to log scalar metric epoch:
1.01


Model weights saved in ./my-DialoGPT/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./my-DialoGPT/checkpoint-3000
Configuration saved in ./my-DialoGPT/checkpoint-3000/config.json


Attempted to log scalar metric loss:
2.8748
Attempted to log scalar metric learning_rate:
2.9871175523349438e-05
Attempted to log scalar metric epoch:
1.21


Model weights saved in ./my-DialoGPT/checkpoint-3000/pytorch_model.bin
Saving model checkpoint to ./my-DialoGPT/checkpoint-3500
Configuration saved in ./my-DialoGPT/checkpoint-3500/config.json


Attempted to log scalar metric loss:
2.8411
Attempted to log scalar metric learning_rate:
2.6516371443907677e-05
Attempted to log scalar metric epoch:
1.41


Model weights saved in ./my-DialoGPT/checkpoint-3500/pytorch_model.bin
Saving model checkpoint to ./my-DialoGPT/checkpoint-4000
Configuration saved in ./my-DialoGPT/checkpoint-4000/config.json


Attempted to log scalar metric loss:
2.7769
Attempted to log scalar metric learning_rate:
2.3161567364465916e-05
Attempted to log scalar metric epoch:
1.61


Model weights saved in ./my-DialoGPT/checkpoint-4000/pytorch_model.bin
Saving model checkpoint to ./my-DialoGPT/checkpoint-4500
Configuration saved in ./my-DialoGPT/checkpoint-4500/config.json


Attempted to log scalar metric loss:
2.7628
Attempted to log scalar metric learning_rate:
1.9806763285024154e-05
Attempted to log scalar metric epoch:
1.81


Model weights saved in ./my-DialoGPT/checkpoint-4500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 552
  Batch size = 2


Attempted to log scalar metric eval_loss:
2.6398324966430664
Attempted to log scalar metric eval_runtime:
7.3561
Attempted to log scalar metric eval_samples_per_second:
75.04
Attempted to log scalar metric eval_steps_per_second:
37.52
Attempted to log scalar metric epoch:
2.0


Saving model checkpoint to ./my-DialoGPT/checkpoint-5000
Configuration saved in ./my-DialoGPT/checkpoint-5000/config.json


Attempted to log scalar metric loss:
2.7169
Attempted to log scalar metric learning_rate:
1.6451959205582396e-05
Attempted to log scalar metric epoch:
2.01


Model weights saved in ./my-DialoGPT/checkpoint-5000/pytorch_model.bin
Saving model checkpoint to ./my-DialoGPT/checkpoint-5500
Configuration saved in ./my-DialoGPT/checkpoint-5500/config.json


Attempted to log scalar metric loss:
2.5583
Attempted to log scalar metric learning_rate:
1.3097155126140634e-05
Attempted to log scalar metric epoch:
2.21


Model weights saved in ./my-DialoGPT/checkpoint-5500/pytorch_model.bin
Saving model checkpoint to ./my-DialoGPT/checkpoint-6000
Configuration saved in ./my-DialoGPT/checkpoint-6000/config.json


Attempted to log scalar metric loss:
2.543
Attempted to log scalar metric learning_rate:
9.742351046698874e-06
Attempted to log scalar metric epoch:
2.42


Model weights saved in ./my-DialoGPT/checkpoint-6000/pytorch_model.bin
Saving model checkpoint to ./my-DialoGPT/checkpoint-6500
Configuration saved in ./my-DialoGPT/checkpoint-6500/config.json


Attempted to log scalar metric loss:
2.5223
Attempted to log scalar metric learning_rate:
6.3875469672571135e-06
Attempted to log scalar metric epoch:
2.62


Model weights saved in ./my-DialoGPT/checkpoint-6500/pytorch_model.bin
Saving model checkpoint to ./my-DialoGPT/checkpoint-7000
Configuration saved in ./my-DialoGPT/checkpoint-7000/config.json


Attempted to log scalar metric loss:
2.5225
Attempted to log scalar metric learning_rate:
3.032742887815352e-06
Attempted to log scalar metric epoch:
2.82


Model weights saved in ./my-DialoGPT/checkpoint-7000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 552
  Batch size = 2


Attempted to log scalar metric eval_loss:
2.549834728240967
Attempted to log scalar metric eval_runtime:
7.3521
Attempted to log scalar metric eval_samples_per_second:
75.081
Attempted to log scalar metric eval_steps_per_second:
37.541
Attempted to log scalar metric epoch:
3.0




Training completed. Do not forget to share your model on huggingface.co/models =)




Attempted to log scalar metric train_runtime:
916.2018
Attempted to log scalar metric train_samples_per_second:
16.264
Attempted to log scalar metric train_steps_per_second:
8.134
Attempted to log scalar metric total_flos:
1901129184000000.0
Attempted to log scalar metric train_loss:
2.954400595451515
Attempted to log scalar metric epoch:
3.0


TrainOutput(global_step=7452, training_loss=2.954400595451515, metrics={'train_runtime': 916.2018, 'train_samples_per_second': 16.264, 'train_steps_per_second': 8.134, 'total_flos': 1901129184000000.0, 'train_loss': 2.954400595451515, 'epoch': 3.0})

In [61]:
trainer.save_model("my-DialoGPT")
base_tokenizer.save_pretrained("my-DialoGPT")
config.save_pretrained("my-DialoGPT")

Saving model checkpoint to my-DialoGPT
Configuration saved in my-DialoGPT/config.json
Model weights saved in my-DialoGPT/pytorch_model.bin
tokenizer config file saved in my-DialoGPT/tokenizer_config.json
Special tokens file saved in my-DialoGPT/special_tokens_map.json
Configuration saved in my-DialoGPT/config.json


In [62]:
finetuned_model = AutoModelForCausalLM.from_pretrained("my-DialoGPT")
tokenizer = AutoTokenizer.from_pretrained("my-DialoGPT")

loading configuration file my-DialoGPT/config.json
Model config GPT2Config {
  "_name_or_path": "my-DialoGPT",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "conversational": {
      "max_length": 1000
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.19.0",
  "use_cache": true,
  "vocab_size": 50257
}

loading weights file my-DialoGPT/p

In [23]:
def generate_responses(model, tokenizer, text, chat_history_ids=None, step=0):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors='pt')

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(
        bot_input_ids,
        max_length=512,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=100,
        top_p=0.7
    )
    
    response = tokenizer.decode(
        chat_history_ids[:, bot_input_ids.shape[-1]:][0],
        skip_special_tokens=True,
    )

    return response, chat_history_ids, step + 1

def chat(model, tokenizer):
    step = 0
    chat_history_ids = []
    
    while True: 
        text = input(">> ")
        if text in ["/q", "/quit", "/e", "/exit"]: break
        print(f"User: {text}")
        response, chat_history_ids, step = generate_responses(
            model=model,
            tokenizer=tokenizer,
            text=text,
            chat_history_ids=chat_history_ids,
            step=step
        )
        print(f"Bot: {response}")
        

In [24]:
chat(finetuned_model, tokenizer)

>>  Hello


User: Hello
Bot: Hey. You're on the floor. [*to hopper*] How are you? [*to Charlotte*] Hey. [_to Amarok, Kalahan, and Hopper_] What happened? [*charlotte doesn't say anything, just looks around at everyone*] Hi. How are..you? [*will follow*] Oh hey Hopper. How about you? Have a nice day? [*gives hopper a quick hug*] I'll be right back, if you need me. [*looks around*] [_will head to the other room_] It's a little late for this, but if you want to come with, I'm in the common room, the people who are in the building are also here. [ _to Hopper, Kalahashi_]...Hey, are you coming? [*shakes head*]  Oh, hi.  Oh hey, Charlotte.  Sorry, I haven't seen you in awhile.  [_she takes the commlink off the couch and sits on the couch, reading the book, just reading it._]  I can meet you there if you're ready.  How about yourself, uhm, at the library.  Yeah, I'll go.  I'll let you know when I'm ready. [*sighs_]  Hey, did you want anything, you want me to come over?  I have something to sit down, or d

>>  /quit


In [8]:
text = "[_as she talks she takes the book off the table and carefully tucks it away in her bag_]"

In [15]:
tokenizer.pad_token = "<|pad|>"
tokenizer(text, padding="max_length", max_length=512)

{'input_ids': [29795, 292, 673, 6130, 673, 2753, 262, 1492, 572, 262, 3084, 290, 7773, 256, 6238, 340, 1497, 287, 607, 6131, 62, 60, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,

In [14]:
finetuned_model.max_length

AttributeError: 'GPT2LMHeadModel' object has no attribute 'max_length'