# Fine-Tuning a Transformer model based on coonversation data

### Note: This is just a simple implementation of how one could approach to such an idea, the results might not be too astonishing 

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install datasets

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import pandas as pd
import numpy as np
import torch
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split

# Creating a dataframe from the conversation data

In [None]:
data = load_dataset("multi_woz_v22")
customer = []
server = []
for dialogues in data['train']['turns']:
    for i in range(len(dialogues['utterance'])):
    if i%2 ==0:
        customer.append(dialogues['utterance'][i])
    else:
        server.append(dialogues['utterance'][i])

df = pd.DataFrame({"question":customer,"answer":server})
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

# The dataset contains a conversation between customers and our support agents, we are going to train our model on this conversation

In [40]:
train_df

Unnamed: 0,question,answer
19498,I need to book a train for Friday please.,Great. Where are you headed?
25916,"if there ain't none, can you try a nightclub? ...",The place is a nightclub in the south.
26109,Can you select one of the restaurants with the...,The dojo noodle house would be an option for you
50373,"Ok, thank you.",Is there anything else I can help you with?
5329,I want to go to ely and arrive by 11:15.,"I'm sorry for the confusion, just to verify-ar..."
...,...,...
54343,No. Just pick something and give me the address.,Okay. I recommend Churchill College at Storey'...
38158,I would like to reserve it for 11:15 for 8 peo...,Booking was successful. The table will be rese...
860,I am leaving peterborough.,What time do you need to travel?
15795,"No. That's everything I needed and then some, ...",Thank you for calling. Have a great day!




## Defining Tokenizer
#### We are using a pretrained GPT2 tokenizer

In [None]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
tokenizer.add_special_tokens({'bos_token': '[start]', 'eos_token': '[end]'})

## Defining Custom Dataset

### Things that are required by the gpt2 model (or any other transformer based model) are input_ids (tokenized sentences), attention_mask and labels, all of which is defined in the code below

In [4]:
from torch.utils.data import Dataset

class QADataset(Dataset):
    def __init__(self, tokenizer, data):
        self.tokenizer = tokenizer
        self.data = data
        self.max_length = 150
    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        question = row['question']
        answer = row['answer']
        q_and_a = f'[start]{question}[end]{answer}'
        encoding = self.tokenizer(question,answer,
            padding='max_length',
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt"
        )
        input_ids = encoding['input_ids'].squeeze()
        attention_mask= encoding['attention_mask'].squeeze()
        
        item = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
        }
        return item

In [5]:
import torch
@torch.no_grad()
def validate(model, tokenizer,val_dl):
    model.eval()
    # tokenized = tokenizer(prompt, return_tensors='pt')
    for step, batch in enumerate(val_dl):
        if step==4:
            break
        inputs = {k: v.to(device) for k, v in batch.items()}
        sample_output = model.generate(**inputs, do_sample=True, max_length=100, top_k=20, top_p=1.0)
        generated_text = tokenizer.decode(sample_output[0], skip_special_tokens=True)
        print(f' This is Generated Text----------------> {generated_text}')

## Generated text output is showing as a single letter here because an error I made earlier, I only showed the first letter of the generated text, I did not rerun the cell because it was taking a lot of time on colab

In [6]:
train_dataset = QADataset(tokenizer,train_df)
val_dataset = QADataset(tokenizer,val_df)

config = GPT2Config.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2', config=config)
model.resize_token_embeddings(len(tokenizer))

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=True)

optimizer = optim.Adam(model.parameters(), lr=5e-5)
criterion = nn.CrossEntropyLoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(2):
    train_loss = 0.0
    
    model.train()
    for step,batch in enumerate(train_loader):
        optimizer.zero_grad()
        inputs = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**inputs, labels=inputs['input_ids'])
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        if (step + 1) % 100 == 0:
            print(f"Step {step + 1}/{len(train_loader)} - Loss: {train_loss / (step + 1):.4f}")
    train_loss /= len(train_dataset)
    validate(model,tokenizer,val_loader)
    
    print(f"Epoch {epoch+1} | Train Loss: {train_loss:.3f}")


Downloading pytorch_model.bin:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Step 100/2839 - Loss: 2.0290
Step 200/2839 - Loss: 1.4297
Step 300/2839 - Loss: 1.1987
Step 400/2839 - Loss: 1.0711
Step 500/2839 - Loss: 0.9858
Step 600/2839 - Loss: 0.9255
Step 700/2839 - Loss: 0.8794
Step 800/2839 - Loss: 0.8432
Step 900/2839 - Loss: 0.8142
Step 1000/2839 - Loss: 0.7882
Step 1100/2839 - Loss: 0.7672
Step 1200/2839 - Loss: 0.7486
Step 1300/2839 - Loss: 0.7313
Step 1400/2839 - Loss: 0.7168
Step 1500/2839 - Loss: 0.7033
Step 1600/2839 - Loss: 0.6910
Step 1700/2839 - Loss: 0.6801
Step 1800/2839 - Loss: 0.6702
Step 1900/2839 - Loss: 0.6613
Step 2000/2839 - Loss: 0.6526
Step 2100/2839 - Loss: 0.6453
Step 2200/2839 - Loss: 0.6385
Step 2300/2839 - Loss: 0.6315
Step 2400/2839 - Loss: 0.6254
Step 2500/2839 - Loss: 0.6191
Step 2600/2839 - Loss: 0.6138
Step 2700/2839 - Loss: 0.6085
Step 2800/2839 - Loss: 0.6033


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 150, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 150, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


 This is Generated Text----------------> I


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 150, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


 This is Generated Text----------------> Y
 This is Generated Text----------------> N


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 150, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


 This is Generated Text----------------> I
Epoch 1 | Train Loss: 0.038
Step 100/2839 - Loss: 0.4540
Step 200/2839 - Loss: 0.4511
Step 300/2839 - Loss: 0.4511
Step 400/2839 - Loss: 0.4486
Step 500/2839 - Loss: 0.4475
Step 600/2839 - Loss: 0.4474
Step 700/2839 - Loss: 0.4478
Step 800/2839 - Loss: 0.4467
Step 900/2839 - Loss: 0.4459
Step 1000/2839 - Loss: 0.4448
Step 1100/2839 - Loss: 0.4446
Step 1200/2839 - Loss: 0.4442
Step 1300/2839 - Loss: 0.4436
Step 1400/2839 - Loss: 0.4424
Step 1500/2839 - Loss: 0.4415
Step 1600/2839 - Loss: 0.4406
Step 1700/2839 - Loss: 0.4406
Step 1800/2839 - Loss: 0.4400
Step 1900/2839 - Loss: 0.4394
Step 2000/2839 - Loss: 0.4386
Step 2100/2839 - Loss: 0.4385
Step 2200/2839 - Loss: 0.4377
Step 2300/2839 - Loss: 0.4372
Step 2400/2839 - Loss: 0.4366
Step 2500/2839 - Loss: 0.4360
Step 2600/2839 - Loss: 0.4354
Step 2700/2839 - Loss: 0.4347
Step 2800/2839 - Loss: 0.4340


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 150, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 150, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


 This is Generated Text----------------> D
 This is Generated Text----------------> T


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 150, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


 This is Generated Text----------------> C

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 150, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.



 This is Generated Text----------------> Y
Epoch 2 | Train Loss: 0.027


## Saving

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
import os
os.chdir("/content/drive/My Drive/nlp")

## I am also saving the optimizer state dict in case that I want to train the model for more epochs

In [10]:
torch.save({'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),}, '/content/drive/My Drive/nlp/model.pt')

## Loading the model

In [15]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from torch.utils.data import Dataset, DataLoader

val_dataset = QADataset(tokenizer,val_df)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=True)

config = GPT2Config.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2', config=config)
model.resize_token_embeddings(len(tokenizer))

Embedding(50260, 768)

In [16]:
checkpoint = torch.load('model.pt',map_location=torch.device('cpu'))
model.load_state_dict(checkpoint['model_state_dict'])
model.resize_token_embeddings(len(tokenizer))

Embedding(50260, 768)

In [14]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## For using the model

In [53]:
import torch
@torch.no_grad()
def validate(model, tokenizer,val_dl,prompt="Hi"):
    model.eval()
    tokenized = tokenizer(prompt,return_tensors='pt')
    X = tokenized["input_ids"].to(device)
    a = tokenized["attention_mask"].to(device)
    output = model.generate(X, attention_mask=a ,max_new_tokens=80)
    output = tokenizer.decode(output[0],skip_special_tokens=True)
    print(f'This is output----------> {output}')

## The reason why the output is also showing the question is because in any generative model, it predicts the next "tokens" after a particular sequence, but this can be easily be taken care of to only show the output which is coming after the question has been asked

In [54]:
validate(model,tokenizer,val_loader,prompt="can you suggest hotels?")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


This is output----------> can you suggest hotels?There are 5 hotels that match your criteria. Do you have a price range in mind?


In [None]:
import tensorflow as tf
from transformers import AutoTokenizer, TFGPT2DoubleHeadsModel

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = TFGPT2DoubleHeadsModel.from_pretrained("gpt2")

# Add a [CLS] to the vocabulary (we should train it also!)
num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"})

embedding_layer = model.resize_token_embeddings(
    len(tokenizer)
)  # Update the model embeddings with the new vocabulary size

choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]
encoded_choices = [tokenizer.encode(s) for s in choices]
cls_token_location = [tokens.index(tokenizer.cls_token_id) for tokens in encoded_choices]

input_ids = tf.constant(encoded_choices)[None, :]  # Batch size: 1, number of choices: 2
mc_token_ids = tf.constant([cls_token_location])  # Batch size: 1

outputs = model(input_ids, mc_token_ids=mc_token_ids)
lm_prediction_scores, mc_prediction_scores = outputs[:2]