# Fine-Tuning GPT

https://www.modeldifferently.com/en/2021/12/generaci%C3%B3n-de-fake-news-con-gpt-2/#data-loading-and-processing

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch, os, re, pandas as pd, json
from sklearn.model_selection import train_test_split
from transformers import (
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM
)
from datasets import Dataset, list_metrics, load_metric


In [3]:
#MODEL_NAME = "gpt2"
#model_cls = GPT2LMHeadModel
#tokenizer_cls = GPT2Tokenizer

MODEL_NAME = "microsoft/DialoGPT-small"
model_cls = AutoModelForCausalLM
tokenizer_cls = AutoTokenizer

In [4]:
if torch.cuda.is_available():  
    dev = "cuda:0" 
else:  
    dev = "cpu"  

device = torch.device(dev) 

print(f"""
    cuda: {torch.cuda.is_available()}
    current_device: {torch.cuda.current_device()}
    device_count: {torch.cuda.device_count()}
""")


    cuda: True
    current_device: 0
    device_count: 1



In [5]:
config = AutoConfig.from_pretrained(MODEL_NAME)

## Load Data, Tokenizer, and Model

### Data Loading from CSV

In [6]:
filepath = "../data/processed.csv"
df = pd.read_csv(filepath, encoding="utf-8", usecols=["character", "content"]).rename(columns={"content": "text"})

pd.set_option("display.max_colwidth", None)
df.tail(10)

Unnamed: 0,character,text
32080,Charlotte,"What're the odds, right? But yes, we, uh...know each other. And he's also here in Boston."
32081,Toby,Right. Geeze. I...did not expect to ever hear from him again.
32082,Charlotte,...Surprise?
32083,Charlotte,"So, uh, would you want to talk to him? Because this is all stuff he really should get the chance to ask you about directly."
32084,Toby,Um...sure.
32085,Charlotte,"Ok, good. Um, we should probably go. But, uh, I'll see you tomorrow?"
32086,Charlotte,[_as she talks she takes the book off the table and carefully tucks it away in her bag_]
32087,Toby,"Um, yeah."
32088,Toby,Tomorrow.
32089,Charlotte,[_she will leave with the others_]


### Model

In [7]:
base_model = model_cls.from_pretrained(MODEL_NAME, config=config)

In [8]:
base_model.num_parameters

<bound method ModuleUtilsMixin.num_parameters of GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0

### Tokenizer

In [9]:
base_tokenizer = tokenizer_cls.from_pretrained(MODEL_NAME)

In [10]:
print(f"vocab_size: {base_tokenizer.vocab_size}")

vocab_size: 50257


In [11]:
vocabulary = base_tokenizer.get_vocab()
vocabulary["Hi"]

17250

In [12]:
base_tokenizer.eos_token

'<|endoftext|>'

In [13]:
base_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

## Conversational Model

### Data Preparation

In [14]:
import argparse
import tarfile
import os.path

from typing import Dict, Union

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoModelWithLMHead, AutoTokenizer
from datasets import Dataset


def load_csv(filename):
    return pd.read_csv(filename)


def prepare_data(
    data_or_filename: Union[str, pd.DataFrame],
    filter_by: str = None,
    filter_value: str = None,
    content_key: str = "text",
    n: int = 7,
    test_size: float = 0.1,
    flatten: bool = True,
):
    data = load_csv(data_or_filename) if isinstance(data_or_filename, str) else data_or_filename

    contexted_data = prepare_context(
        data,
        filter_by=filter_by,
        filter_value=filter_value,
        content_key=content_key,
        n=n,
    )

    trn_df, val_df = train_test_split(contexted_data, test_size=test_size)
    
    if flatten:
        train_dataset = prepare_dataset(trn_df)
        val_dataset = prepare_dataset(val_df)
        return train_dataset, val_dataset

    return trn_df, val_df


def prepare_context(
    data: pd.DataFrame,
    filter_by: str = None,
    filter_value: str = None,
    content_key: str = "text",
    n: int = 7,
):
    if filter_by:
        indexes = data.loc[data[filter_by] == filter_value].index
    else:
        indexes = range(n, len(data[content_key]))

    contexted = []

    for i in indexes:
        row = []
        prev = i - 1 - n
        for j in range(i, prev, -1):
            row.append(data.iloc[j][content_key])
        contexted.append(row)

    columns = ["response", "context"]
    columns = columns + ["context/" + str(i) for i in range(n - 1)]

    df = pd.DataFrame.from_records(contexted, columns=columns)
    return df


def prepare_dataset(df):
    columns = [col for col in df] 
    dataset = Dataset.from_pandas(concat_text(df))
    dataset = dataset.remove_columns(columns + ['__index_level_0__'])
    return dataset


def concat_text(df):
    df["text"] = df.apply(concat_text_in_row, axis=1)
    return df


def concat_text_in_row(row):
    concat_text = f"{base_tokenizer.eos_token}".join(row.values)
    # Add to end
    concat_text += base_tokenizer.eos_token
    return concat_text


def construct_conv(example, tokenizer, eos = True):
    flatten = lambda l: [item for sublist in l for item in sublist]
    conv = list(reversed([tokenizer.encode(x, padding="max_length", max_length=250) + [tokenizer.eos_token_id] for x in example]))
    print(f"Conv Length: {len(conv)}") 
    print(set(list(map(len, conv))))
    conv = flatten(conv)
    return conv

In [15]:
base_tokenizer._pad_token

'[PAD]'

In [16]:
train_df, val_df = prepare_data(df, filter_by="character", filter_value="bitjockey", flatten=False)

In [17]:
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)

In [18]:
def preprocess(tokenizer):
    def _tokenize(example):
        flatten = lambda l: [item for sublist in l for item in sublist]
        conv = list(reversed([tokenizer.encode(x, padding="max_length", max_length=250) + [tokenizer.eos_token_id] for x in example if "__index" not in x]))
        conv = flatten(conv)
        return conv
        
    def _preprocess(examples):
        #flatten = lambda l: [item for sublist in l for item in sublist]
        #conv = list([tokenizer.encode(x, padding="max_length", max_length=250) + [tokenizer.eos_token_id] for x in examples])
        #conv = flatten(conv)
        #return {"input_ids": conv}
        return {"input_ids": list(map(_tokenize, examples))}
    
    return _preprocess


def tokenize(example):
    flatten = lambda l: [item for sublist in l for item in sublist]
    conv = list(reversed([tokenizer.encode(x, padding="max_length", max_length=250) + [tokenizer.eos_token_id] for x in example]))
    print(f"Conv Length: {len(conv)}") 
    print(set(list(map(len, conv))))
    conv = flatten(conv)
    return conv


def debug_preprocess(tokenizer, max_length=250): 
    def _tokenize(examples):
        input_ids = [tokenizer.encode(v, padding="max_length", max_length=250) for k, v in examples.items() if isinstance(v, str)]
        examples["input_ids"] = input_ids
        return examples
    return _tokenize

In [19]:
train_dataset

Dataset({
    features: ['response', 'context', 'context/0', 'context/1', 'context/2', 'context/3', 'context/4', 'context/5', '__index_level_0__'],
    num_rows: 4967
})

In [20]:
val_dataset

Dataset({
    features: ['response', 'context', 'context/0', 'context/1', 'context/2', 'context/3', 'context/4', 'context/5', '__index_level_0__'],
    num_rows: 552
})

In [21]:
from typing import List

from torch.nn.utils.rnn import pad_sequence


def preprocess_conv(tokenizer):
    def _construct(examples):
        flatten = lambda l: [item for sublist in l for item in sublist]
        conv = list(reversed([tokenizer.encode(v) + [tokenizer.eos_token_id] for k,v in examples.items() if isinstance(v, str)]))
        conv = flatten(conv)
        examples["input_ids"] = conv
        return examples 
    return _construct


def collate(examples: List[torch.Tensor], tokenizer):
    if tokenizer._pad_token is None:
        return pad_sequence(examples, batch_first=True)
    return pad_sequence(
        examples, batch_first=True, padding_value=tokenizer.pad_token_id
    )


In [22]:
base_tokenizer

PreTrainedTokenizerFast(name_or_path='microsoft/DialoGPT-small', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '[PAD]'})

In [23]:
tokenized_train_dataset = train_dataset.map(preprocess_conv(tokenizer=base_tokenizer), remove_columns=list(train_dataset.features.keys()))
tokenized_val_dataset = val_dataset.map(preprocess_conv(tokenizer=base_tokenizer), remove_columns=list(val_dataset.features.keys()))

  0%|          | 0/4967 [00:00<?, ?ex/s]

  0%|          | 0/552 [00:00<?, ?ex/s]

In [24]:
tokenized_val_dataset["input_ids"][1]

[29795,
 301,
 3565,
 11,
 788,
 3013,
 2096,
 290,
 29959,
 607,
 1182,
 62,
 60,
 50256,
 40,
 655,
 765,
 284,
 766,
 262,
 804,
 319,
 14828,
 2501,
 338,
 1986,
 50256,
 29795,
 18487,
 947,
 17275,
 607,
 1182,
 62,
 60,
 50256,
 8491,
 345,
 41434,
 30,
 50256,
 45,
 3008,
 0,
 50256,
 58,
 9,
 28124,
 9,
 60,
 50256,
 40,
 1549,
 910,
 356,
 815,
 787,
 12612,
 19210,
 466,
 340,
 11,
 475,
 314,
 1101,
 407,
 5000,
 1654,
 339,
 4808,
 19188,
 62,
 50256,
 11028,
 986,
 392,
 339,
 492,
 26949,
 468,
 257,
 1256,
 319,
 465,
 2000,
 826,
 783,
 13,
 50256]

In [25]:
tokenized_val_dataset

Dataset({
    features: ['input_ids'],
    num_rows: 552
})

In [26]:
base_tokenizer.decode(tokenized_train_dataset["input_ids"][1])

"[_walks over to living room_] Hi..<|endoftext|>[_quietly_] I'll be in the study. [_she brushes her hand through Shiro's hair once before leaving the room_]<|endoftext|>[_to Hopper_] Hello.<|endoftext|>[_sits wherever there is space_] [_softly_] How...are you?<|endoftext|>[_shrugs slightly_]<|endoftext|>[_Shiro sets the tea and the book she was reading down_]<|endoftext|>It was not...unexpected.<|endoftext|>[_sighs_] I figured as much. It's still...it isn't any better.<|endoftext|>"

In [89]:
tokenized_train_dataset.set_format(type="torch", columns=["input_ids"])
tokenized_val_dataset.set_format(type="torch", columns=["input_ids"])

In [76]:
#train_ds = collate(tokenized_train_dataset["input_ids"], base_tokenizer)

In [77]:
#val_ds = collate(tokenized_val_dataset["input_ids"], base_tokenizer)

### Metrics

In [27]:
metric = load_metric("perplexity")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


### Training

In [28]:
base_model.resize_token_embeddings(len(base_tokenizer))

Embedding(50258, 768)

In [29]:
FINETUNED_MODEL = 'test-myDialoGPT2-small'

data_collator = DataCollatorForLanguageModeling(
    tokenizer=base_tokenizer,
    mlm=False,
    return_tensors="pt",
)

trainer = None
training_args = TrainingArguments(
    output_dir=FINETUNED_MODEL,          # output directory
    evaluation_strategy="epoch",
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=2,  # batch size per device during training
    per_device_eval_batch_size=2,   # batch size for evaluation
    weight_decay=0.01,               # strength of weight decay
    logging_dir=FINETUNED_MODEL,            # directory for storing logs
    prediction_loss_only=True,
)

In [30]:
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [99]:
trainer.get_train_dataloader()

<torch.utils.data.dataloader.DataLoader at 0x7f535d94ae80>

In [31]:
trainer.train()

***** Running training *****
  Num examples = 4967
  Num Epochs = 3
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 7452


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss
1,3.1247,2.782273
2,2.674,2.540396
3,2.428,2.458146


Saving model checkpoint to test-myDialoGPT2-small/checkpoint-500
Configuration saved in test-myDialoGPT2-small/checkpoint-500/config.json


Attempted to log scalar metric loss:
4.3066
Attempted to log scalar metric learning_rate:
4.6645195920558245e-05
Attempted to log scalar metric epoch:
0.2


Model weights saved in test-myDialoGPT2-small/checkpoint-500/pytorch_model.bin
Saving model checkpoint to test-myDialoGPT2-small/checkpoint-1000
Configuration saved in test-myDialoGPT2-small/checkpoint-1000/config.json


Attempted to log scalar metric loss:
3.4495
Attempted to log scalar metric learning_rate:
4.329039184111648e-05
Attempted to log scalar metric epoch:
0.4


Model weights saved in test-myDialoGPT2-small/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to test-myDialoGPT2-small/checkpoint-1500
Configuration saved in test-myDialoGPT2-small/checkpoint-1500/config.json


Attempted to log scalar metric loss:
3.2597
Attempted to log scalar metric learning_rate:
3.993558776167472e-05
Attempted to log scalar metric epoch:
0.6


Model weights saved in test-myDialoGPT2-small/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to test-myDialoGPT2-small/checkpoint-2000
Configuration saved in test-myDialoGPT2-small/checkpoint-2000/config.json


Attempted to log scalar metric loss:
3.1247
Attempted to log scalar metric learning_rate:
3.6580783682232964e-05
Attempted to log scalar metric epoch:
0.81


Model weights saved in test-myDialoGPT2-small/checkpoint-2000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 552
  Batch size = 2


Attempted to log scalar metric eval_loss:
2.782273054122925
Attempted to log scalar metric eval_runtime:
5.0384
Attempted to log scalar metric eval_samples_per_second:
109.559
Attempted to log scalar metric eval_steps_per_second:
54.779
Attempted to log scalar metric epoch:
1.0


Saving model checkpoint to test-myDialoGPT2-small/checkpoint-2500
Configuration saved in test-myDialoGPT2-small/checkpoint-2500/config.json


Attempted to log scalar metric loss:
3.0413
Attempted to log scalar metric learning_rate:
3.32259796027912e-05
Attempted to log scalar metric epoch:
1.01


Model weights saved in test-myDialoGPT2-small/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to test-myDialoGPT2-small/checkpoint-3000
Configuration saved in test-myDialoGPT2-small/checkpoint-3000/config.json


Attempted to log scalar metric loss:
2.7855
Attempted to log scalar metric learning_rate:
2.9871175523349438e-05
Attempted to log scalar metric epoch:
1.21


Model weights saved in test-myDialoGPT2-small/checkpoint-3000/pytorch_model.bin
Saving model checkpoint to test-myDialoGPT2-small/checkpoint-3500
Configuration saved in test-myDialoGPT2-small/checkpoint-3500/config.json


Attempted to log scalar metric loss:
2.76
Attempted to log scalar metric learning_rate:
2.6516371443907677e-05
Attempted to log scalar metric epoch:
1.41


Model weights saved in test-myDialoGPT2-small/checkpoint-3500/pytorch_model.bin
Saving model checkpoint to test-myDialoGPT2-small/checkpoint-4000
Configuration saved in test-myDialoGPT2-small/checkpoint-4000/config.json


Attempted to log scalar metric loss:
2.6973
Attempted to log scalar metric learning_rate:
2.3161567364465916e-05
Attempted to log scalar metric epoch:
1.61


Model weights saved in test-myDialoGPT2-small/checkpoint-4000/pytorch_model.bin
Saving model checkpoint to test-myDialoGPT2-small/checkpoint-4500
Configuration saved in test-myDialoGPT2-small/checkpoint-4500/config.json


Attempted to log scalar metric loss:
2.674
Attempted to log scalar metric learning_rate:
1.9806763285024154e-05
Attempted to log scalar metric epoch:
1.81


Model weights saved in test-myDialoGPT2-small/checkpoint-4500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 552
  Batch size = 2


Attempted to log scalar metric eval_loss:
2.5403964519500732
Attempted to log scalar metric eval_runtime:
5.0705
Attempted to log scalar metric eval_samples_per_second:
108.864
Attempted to log scalar metric eval_steps_per_second:
54.432
Attempted to log scalar metric epoch:
2.0


Saving model checkpoint to test-myDialoGPT2-small/checkpoint-5000
Configuration saved in test-myDialoGPT2-small/checkpoint-5000/config.json


Attempted to log scalar metric loss:
2.5895
Attempted to log scalar metric learning_rate:
1.6451959205582396e-05
Attempted to log scalar metric epoch:
2.01


Model weights saved in test-myDialoGPT2-small/checkpoint-5000/pytorch_model.bin
Saving model checkpoint to test-myDialoGPT2-small/checkpoint-5500
Configuration saved in test-myDialoGPT2-small/checkpoint-5500/config.json


Attempted to log scalar metric loss:
2.4874
Attempted to log scalar metric learning_rate:
1.3097155126140634e-05
Attempted to log scalar metric epoch:
2.21


Model weights saved in test-myDialoGPT2-small/checkpoint-5500/pytorch_model.bin
Saving model checkpoint to test-myDialoGPT2-small/checkpoint-6000
Configuration saved in test-myDialoGPT2-small/checkpoint-6000/config.json


Attempted to log scalar metric loss:
2.4644
Attempted to log scalar metric learning_rate:
9.742351046698874e-06
Attempted to log scalar metric epoch:
2.42


Model weights saved in test-myDialoGPT2-small/checkpoint-6000/pytorch_model.bin
Saving model checkpoint to test-myDialoGPT2-small/checkpoint-6500
Configuration saved in test-myDialoGPT2-small/checkpoint-6500/config.json


Attempted to log scalar metric loss:
2.4528
Attempted to log scalar metric learning_rate:
6.3875469672571135e-06
Attempted to log scalar metric epoch:
2.62


Model weights saved in test-myDialoGPT2-small/checkpoint-6500/pytorch_model.bin
Saving model checkpoint to test-myDialoGPT2-small/checkpoint-7000
Configuration saved in test-myDialoGPT2-small/checkpoint-7000/config.json


Attempted to log scalar metric loss:
2.428
Attempted to log scalar metric learning_rate:
3.032742887815352e-06
Attempted to log scalar metric epoch:
2.82


Model weights saved in test-myDialoGPT2-small/checkpoint-7000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 552
  Batch size = 2


Attempted to log scalar metric eval_loss:
2.458146333694458
Attempted to log scalar metric eval_runtime:
5.0748
Attempted to log scalar metric eval_samples_per_second:
108.774
Attempted to log scalar metric eval_steps_per_second:
54.387
Attempted to log scalar metric epoch:
3.0




Training completed. Do not forget to share your model on huggingface.co/models =)




Attempted to log scalar metric train_runtime:
706.9178
Attempted to log scalar metric train_samples_per_second:
21.079
Attempted to log scalar metric train_steps_per_second:
10.542
Attempted to log scalar metric total_flos:
1188951213312000.0
Attempted to log scalar metric train_loss:
2.866119614097935
Attempted to log scalar metric epoch:
3.0


TrainOutput(global_step=7452, training_loss=2.866119614097935, metrics={'train_runtime': 706.9178, 'train_samples_per_second': 21.079, 'train_steps_per_second': 10.542, 'total_flos': 1188951213312000.0, 'train_loss': 2.866119614097935, 'epoch': 3.0})

In [32]:
trainer.save_model(FINETUNED_MODEL)
base_tokenizer.save_pretrained(FINETUNED_MODEL)
config.save_pretrained(FINETUNED_MODEL)

Saving model checkpoint to test-myDialoGPT2-small
Configuration saved in test-myDialoGPT2-small/config.json
Model weights saved in test-myDialoGPT2-small/pytorch_model.bin
tokenizer config file saved in test-myDialoGPT2-small/tokenizer_config.json
Special tokens file saved in test-myDialoGPT2-small/special_tokens_map.json
Configuration saved in test-myDialoGPT2-small/config.json


In [33]:
finetuned_model = AutoModelForCausalLM.from_pretrained(FINETUNED_MODEL)
tokenizer = AutoTokenizer.from_pretrained(FINETUNED_MODEL)

loading configuration file test-myDialoGPT2-small/config.json
Model config GPT2Config {
  "_name_or_path": "test-myDialoGPT2-small",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "conversational": {
      "max_length": 1000
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.19.0",
  "use_cache": true,
  "vocab_size": 50258
}

loading weig

In [34]:
def generate_responses(model, tokenizer, text, chat_history_ids=None, step=0):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = tokenizer.encode(text + tokenizer.eos_token, return_tensors='pt')

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = model.generate(
        bot_input_ids,
        max_length=512,
        pad_token_id=tokenizer.eos_token_id,
        no_repeat_ngram_size=3,
        do_sample=True,
        top_k=100,
        top_p=0.7
    )
    
    response = tokenizer.decode(
        chat_history_ids[:, bot_input_ids.shape[-1]:][0],
        skip_special_tokens=True,
    )

    return response, chat_history_ids, step + 1

def chat(model, tokenizer):
    step = 0
    chat_history_ids = []
    
    while True: 
        text = input(">> ")
        if text in ["/q", "/quit", "/e", "/exit"]: break
        print(f"User: {text}")
        response, chat_history_ids, step = generate_responses(
            model=model,
            tokenizer=tokenizer,
            text=text,
            chat_history_ids=chat_history_ids,
            step=step
        )
        print(f"Bot: {response}")
        

In [None]:
chat(finetuned_model, tokenizer)

>>  Hello


User: Hello
Bot: *Kalahan is in a really awkward position, still looking at his own face, but his eyes are closed.*
*He glances over at Lauren and then back at Hopper.*


>>  What's going on?


User: What's going on?
Bot: *He looks at Lauren with some amusement.*


>>  Are you okay?


User: Are you okay?
Bot: [_looks at Hooper_] What's going?


>>  I don't know.


User: I don't know.
Bot: [_shrugs_] It's nothing, really.


>>  What do you think of Shiawase?


User: What do you think of Shiawase?
Bot: They're fine.


>>  Really?


User: Really?
Bot: Well, that's good to hear.


>>  Where are we meeting?


User: Where are we meeting?
Bot: Uhm...I think we should probably..check on Shiro


>>  Is Shiro okay?


User: Is Shiro okay?
Bot: [*nods*] Yeah, she is, I'll catch you later


>>  Wait for me


User: Wait for me
Bot: [_to Charlotte_] Are you...are you going to...?


>>  No I'm not


User: No I'm not
Bot: I'm fine. I just...


>>  I wish I was dead...


User: I wish I was dead...
Bot: [_sighs_].


>>  What?


User: What?
Bot: Do you need anything?


>>  No, do you?


User: No, do you?
Bot: Don't worry about it


>>  Okay, I won't.


User: Okay, I won't.
Bot: It's nothing.


>>  So what are you up to these days?


User: So what are you up to these days?
Bot: We can do this or...?


>>  I'm confused


User: I'm confused
Bot: I just..don't know how to start with that


>>  Well, I don't either


User: Well, I don't either
Bot: [_quietly_]


>>  So what's going on with you


User: So what's going on with you
Bot: I...[_sits down_] I don’t know how we should ask her to...


In [8]:
text = "[_as she talks she takes the book off the table and carefully tucks it away in her bag_]"

In [15]:
tokenizer.pad_token = "<|pad|>"
tokenizer(text, padding="max_length", max_length=512)

{'input_ids': [29795, 292, 673, 6130, 673, 2753, 262, 1492, 572, 262, 3084, 290, 7773, 256, 6238, 340, 1497, 287, 607, 6131, 62, 60, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,

In [14]:
finetuned_model.max_length

AttributeError: 'GPT2LMHeadModel' object has no attribute 'max_length'