# Fine-Tuning GPT

https://www.modeldifferently.com/en/2021/12/generaci%C3%B3n-de-fake-news-con-gpt-2/#data-loading-and-processing

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import torch, os, re, pandas as pd, json
from sklearn.model_selection import train_test_split
from transformers import (
    DataCollatorForLanguageModeling,
    DataCollatorWithPadding,
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    AutoConfig,
    AutoTokenizer,
    AutoModelForCausalLM
)
from datasets import Dataset, list_metrics, load_metric


In [3]:
#MODEL_NAME = "gpt2"
#model_cls = GPT2LMHeadModel
#tokenizer_cls = GPT2Tokenizer

MODEL_NAME = "microsoft/DialoGPT-small"
model_cls = AutoModelForCausalLM
tokenizer_cls = AutoTokenizer

In [4]:
if torch.cuda.is_available():  
    dev = "cuda:0" 
else:  
    dev = "cpu"  

device = torch.device(dev) 

print(f"""
    cuda: {torch.cuda.is_available()}
    current_device: {torch.cuda.current_device()}
    device_count: {torch.cuda.device_count()}
""")


    cuda: True
    current_device: 0
    device_count: 1



## Load Data, Tokenizer, and Model

### Data Loading from CSV

In [5]:
filepath = "../data/processed.csv"
df = pd.read_csv(filepath, encoding="utf-8", usecols=["character", "content"]).rename(columns={"content": "text"})

pd.set_option("display.max_colwidth", None)
df.tail(10)

Unnamed: 0,character,text
32080,Charlotte,"What're the odds, right? But yes, we, uh...know each other. And he's also here in Boston."
32081,Toby,Right. Geeze. I...did not expect to ever hear from him again.
32082,Charlotte,...Surprise?
32083,Charlotte,"So, uh, would you want to talk to him? Because this is all stuff he really should get the chance to ask you about directly."
32084,Toby,Um...sure.
32085,Charlotte,"Ok, good. Um, we should probably go. But, uh, I'll see you tomorrow?"
32086,Charlotte,[_as she talks she takes the book off the table and carefully tucks it away in her bag_]
32087,Toby,"Um, yeah."
32088,Toby,Tomorrow.
32089,Charlotte,[_she will leave with the others_]


### Model

In [6]:
base_model = model_cls.from_pretrained(MODEL_NAME)

In [7]:
base_model.num_parameters

<bound method ModuleUtilsMixin.num_parameters of GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0

### Tokenizer

In [8]:
base_tokenizer = tokenizer_cls.from_pretrained(MODEL_NAME)

In [9]:
print(f"vocab_size: {base_tokenizer.vocab_size}")

vocab_size: 50257


In [10]:
vocabulary = base_tokenizer.get_vocab()
vocabulary["Hi"]

17250

In [11]:
# Encode
text = "Hi, how are you?"
base_tokenizer.tokenize(text)

['Hi', ',', 'Ġhow', 'Ġare', 'Ġyou', '?']

In [12]:
text_ids = base_tokenizer.encode(text, return_tensors = 'pt')
text_ids

tensor([[17250,    11,   703,   389,   345,    30]])

In [13]:
generated_text_samples = base_model.generate(
    text_ids,
    max_length=100,
)

for i, beam in enumerate(generated_text_samples):
    print(f"{i}: {base_tokenizer.decode(beam, skip_special_tokens=True)}")
    print()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: Hi, how are you? Good morning everyone!



In [14]:
# Decode
for i, beam in enumerate(generated_text_samples):
    print(f"{i}: {base_tokenizer.decode(beam, skip_special_tokens=True)}")
    print()

0: Hi, how are you? Good morning everyone!



## Generate Text

### Greedy Search

In [15]:
# ejemplo de generación de texto
generated_text_samples = base_model.generate(
    text_ids,
    max_length= 100,
)

for i, beam in enumerate(generated_text_samples):
  print(f"{i}: {base_tokenizer.decode(beam, skip_special_tokens=True)}")
  print()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: Hi, how are you? Good morning everyone!



### Beam Search

In [16]:
# text generation example
generated_text_samples = base_model.generate(
    text_ids,
    max_length=50,
    num_beams=5,
    num_return_sequences= 5,
    early_stopping=True 
)

for i, beam in enumerate(generated_text_samples):
  print(f"{i}: {base_tokenizer.decode(beam, skip_special_tokens=True)}")
  print()


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: Hi, how are you? Good morning everyone!

1: Hi, how are you? Good morning!

2: Hi, how are you?'s hello

3: Hi, how are you? Good Morning everyone!

4: Hi, how are you? Good morning everyone.



In [17]:
# text generation example
generated_text_samples = base_model.generate(
    text_ids,
    max_length= 50,  
    num_beams=5,
    no_repeat_ngram_size=2,
    num_return_sequences= 5,
    early_stopping=True 
)

for i, beam in enumerate(generated_text_samples):
  print(f"{i}: {base_tokenizer.decode(beam, skip_special_tokens=True)}")
  print()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: Hi, how are you? Good morning everyone!

1: Hi, how are you? Good morning!

2: Hi, how are you?'s hello

3: Hi, how are you? Good Morning everyone!

4: Hi, how are you? Good morning everyone.



In [18]:
# text generation example
generated_text_samples = base_model.generate(
    text_ids,
    max_length= 50,  
    do_sample=True,  
    top_k=0,
    num_return_sequences= 5
)

for i, beam in enumerate(generated_text_samples):
  print(f"{i}: {base_tokenizer.decode(beam, skip_special_tokens=True)}")
  print()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: Hi, how are you? Good morning darling.

1: Hi, how are you?.

2: Hi, how are you?'s Hello.

3: Hi, how are you?'s hi... Cx

4: Hi, how are you? Good morning bass.



In [19]:
# text generation example
generated_text_samples = base_model.generate(
    text_ids,
    max_length= 50,  
    do_sample=True,  
    top_k=0,
    temperature=0.9,
    num_return_sequences= 5
)

for i, beam in enumerate(generated_text_samples):
  print(f"{i}: {base_tokenizer.decode(beam, skip_special_tokens=True)}")
  print()

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: Hi, how are you? I see you everywhere in r all.

1: Hi, how are you? Good Morning, all? Good Morning!

2: Hi, how are you?

3: Hi, how are you?'s hi.

4: Hi, how are you? How are we both still in school?



In [20]:
# text generation example
generated_text_samples = base_model.generate(
    text_ids,
    max_length= 50,  
    do_sample=True,  
    top_k=0,
    temperature=0.9,
    num_return_sequences= 5
)

for i, beam in enumerate(generated_text_samples):
  print(f"{i}: {base_tokenizer.decode(beam, skip_special_tokens=True)}")
  print()


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: Hi, how are you? Good morning stranger, good morning.

1: Hi, how are you? I won't say any more than that.

2: Hi, how are you? Good morning everyone!

3: Hi, how are you? Good morning all.

4: Hi, how are you? to : start listening to your favourite band, do some mushrooms



In [21]:
# text generation example
generated_text_samples = base_model.generate(
    text_ids,
    max_length= 50,  
    do_sample=True,  
    top_k=25,
    num_return_sequences= 5
)

for i, beam in enumerate(generated_text_samples):
  print(f"{i}: {base_tokenizer.decode(beam, skip_special_tokens=True)}")
  print()


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: Hi, how are you? Goodbye, I have seen you around

1: Hi, how are you? Zwei Zwei

2: Hi, how are you? hello everyone

3: Hi, how are you? Good morning! Hello!

4: Hi, how are you? hello kesh



In [22]:
# text generation example
generated_text_samples = base_model.generate(
    text_ids,
    max_length= 50,  
    do_sample=True,  
    top_k=0,
    top_p=0.92,
    num_return_sequences= 5
)

for i, beam in enumerate(generated_text_samples):
  print(f"{i}: {base_tokenizer.decode(beam, skip_special_tokens=True)}")
  print()


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: Hi, how are you? AMA

1: Hi, how are you?, hi

2: Hi, how are you? Hello Hello

3: Hi, how are you?..

4: Hi, how are you?. AMA



In [23]:
def generate_n_text_samples(model, tokenizer, input_text, device, n_samples = 5):
    text_ids = tokenizer.encode(input_text, return_tensors = 'pt')
    text_ids = text_ids.to(device)
    model = model.to(device)

    generated_text_samples = model.generate(
        text_ids, 
        max_length= 100,  
        num_return_sequences= n_samples,
        no_repeat_ngram_size= 2,
        repetition_penalty= 1.5,
        top_p= 0.92,
        temperature= .85,
        do_sample= True,
        top_k= 125,
        early_stopping= True
    )
    gen_text = []
    for t in generated_text_samples:
        text = tokenizer.decode(t, skip_special_tokens=True)
        gen_text.append(text)

        return gen_text


## Conversational Model

### Implementation

https://huggingface.co/microsoft/DialoGPT-medium

In [24]:
def generate_responses(text, chat_history_ids=None, step=0):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = base_tokenizer.encode(text + base_tokenizer.eos_token, return_tensors='pt')

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens, 
    chat_history_ids = base_model.generate(bot_input_ids, max_length=1000, pad_token_id=base_tokenizer.eos_token_id)
    
    response = base_tokenizer.decode(
        chat_history_ids[:, bot_input_ids.shape[-1]:][0],
        skip_special_tokens=True,
    )

    return response, chat_history_ids, step + 1

def chat():
    step = 0
    chat_history_ids = []
    
    while True: 
        text = input(">> ")
        if text in ["/q", "/quit", "/e", "/exit"]: break
        print(f"User: {text}")
        response, chat_history_ids, step = generate_responses(
            text,
            chat_history_ids=chat_history_ids,
            step=step
        )
        print(f"Bot: {response}")
        

### Demo

In [25]:
chat()

>>  /quit


### Data Preparation

In [26]:
import argparse
import tarfile
import os.path

from typing import Dict, Union

import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import AutoModelWithLMHead, AutoTokenizer
from datasets import Dataset


def load_csv(filename):
    return pd.read_csv(filename)


def prepare_data(
    data_or_filename: Union[str, pd.DataFrame],
    filter_by: str = None,
    filter_value: str = None,
    content_key: str = "text",
    n: int = 7,
    test_size: float = 0.1,
    flatten: bool = True,
):
    data = load_csv(data_or_filename) if isinstance(data_or_filename, str) else data_or_filename

    contexted_data = prepare_context(
        data,
        filter_by=filter_by,
        filter_value=filter_value,
        content_key=content_key,
        n=n,
    )

    trn_df, val_df = train_test_split(contexted_data, test_size=test_size)
    
    #train_dataset = Dataset.from_pandas(trn_df)
    #val_dataset = Dataset.from_pandas(val_df)
    
    train_dataset = prepare_dataset(trn_df)
    val_dataset = prepare_dataset(val_df)

    return train_dataset, val_dataset


def prepare_context(
    data: pd.DataFrame,
    filter_by: str = None,
    filter_value: str = None,
    content_key: str = "text",
    n: int = 7,
):
    if filter_by:
        indexes = data.loc[data[filter_by] == filter_value].index
    else:
        indexes = range(n, len(data[content_key]))

    contexted = []

    for i in indexes:
        row = []
        prev = i - 1 - n
        for j in range(i, prev, -1):
            row.append(data.iloc[j][content_key])
        contexted.append(row)

    columns = ["response", "context"]
    columns = columns + ["context/" + str(i) for i in range(n - 1)]

    df = pd.DataFrame.from_records(contexted, columns=columns)
    return df


def prepare_dataset(df):
    columns = [col for col in df] 
    dataset = Dataset.from_pandas(concat_text(df))
    dataset = dataset.remove_columns(columns + ['__index_level_0__'])
    return dataset


def concat_text(df):
    df["text"] = df.apply(concat_text_in_row, axis=1)
    return df


def concat_text_in_row(row):
    concat_text = f"{base_tokenizer.eos_token}".join(row.values)
    # Add to end
    concat_text += base_tokenizer.eos_token
    return concat_text

In [27]:
train_dataset, val_dataset = prepare_data(df, filter_by="character", filter_value="bitjockey")

In [28]:
train_dataset

Dataset({
    features: ['text'],
    num_rows: 4967
})

In [29]:
val_dataset

Dataset({
    features: ['text'],
    num_rows: 552
})

In [30]:
base_tokenizer.pad_token = "<|PAD|>"

In [31]:
def tokenize_function(examples):
    input_ids = base_tokenizer(examples["text"], padding="max_length")["input_ids"]
    return {"input_ids": input_ids}
    

def preprocess_function(examples):
    input_ids = list(map(tokenize_function, examples))
    return {"input_ids": input_ids}

    
tokenized_train_dataset = train_dataset.map(
    tokenize_function,
    remove_columns=["text"],
)

tokenized_val_dataset = val_dataset.map(
    tokenize_function,
    remove_columns=["text"],
)

  0%|          | 0/4967 [00:00<?, ?ex/s]

  0%|          | 0/552 [00:00<?, ?ex/s]

In [32]:
len(tokenized_train_dataset["input_ids"])

4967

In [33]:
metric = load_metric("perplexity")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)


In [34]:
model_path = './model_dialogpt'

data_collator = DataCollatorForLanguageModeling(
        tokenizer=base_tokenizer,
        mlm=False
    )

trainer = None
training_args = TrainingArguments(
    output_dir=model_path,          # output directory
    evaluation_strategy="epoch",
    num_train_epochs=3,              # total # of training epochs
    per_device_train_batch_size=1,  # batch size per device during training
    per_device_eval_batch_size=1,   # batch size for evaluation
    weight_decay=0.01,               # strength of weight decay
    logging_dir=model_path,            # directory for storing logs
    prediction_loss_only=True,
)

In [35]:
trainer = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    compute_metrics=compute_metrics,
    data_collator=data_collator,
)

In [36]:
trainer.train()

***** Running training *****
  Num examples = 4967
  Num Epochs = 3
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 14901


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 