# Fine-tuning GPT-2 on a dataset in PyTorch

Fine-tune a pre-trained GPT-2 model on a dataset containing the scripts of a Python course. Let's see if the model can learn to teach Python!

For this experiment, we will use a pre-trained GPT-2 medium-sized model from the huggingface [transformers repository](https://github.com/huggingface/transformers).

This project is developed referencing https://gist.github.com/mf1024/3df214d2f17f3dcc56450ddf0d5a4cd7


In [1]:
!pip install transformers

[0m

In [None]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import numpy as np

import logging
import warnings

# Keep the notebook output readable: only CRITICAL records pass the root
# logger, and every Python warning is suppressed.
logging.getLogger().setLevel(logging.CRITICAL)
warnings.filterwarnings('ignore')

# Run on the GPU when one is visible to PyTorch, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

In [None]:
# Load the pre-trained 'gpt2-medium' tokenizer and language-model head,
# then move the model onto the selected device.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2-medium')
model = GPT2LMHeadModel.from_pretrained('gpt2-medium').to(device)

### PyTorch Dataset module for the dataset

We will use the dataset of lecture scripts. After each paragraph, we add "<|endoftext|>", which the GPT-2 model recognizes as the end-of-text marker. The marker allows us to concatenate many paragraphs into a single input sequence.

In [None]:
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader
import os
import json
import csv

class LecturesDataset(Dataset):
    """Lecture paragraphs read from ``trainingdata.csv``.

    Every item is a string of the form ``"data:<text><|endoftext|>"`` where
    ``<text>`` is the second column of one CSV row.

    Args:
        lecture_dataset_path: Directory that contains ``trainingdata.csv``.

    Raises:
        OSError: If the CSV file cannot be opened.
    """

    def __init__(self, lecture_dataset_path = '/kaggle/input/d/apurvapatil871/pythondata'):
        super().__init__()

        lecture_path = os.path.join(lecture_dataset_path, 'trainingdata.csv')

        # The attribute keeps its historical name (`joke_list`) so any code
        # that reads it directly keeps working; it holds lecture strings.
        self.joke_list = []
        self.end_of_text_token = "<|endoftext|>"

        # newline='' is what the csv module requires so that line breaks
        # inside quoted fields are parsed correctly; the explicit encoding
        # keeps the result independent of the platform default.
        with open(lecture_path, newline='', encoding='utf-8') as csv_file:
            # NOTE(review): every row is treated as data; if the file has a
            # header row it becomes a training sample too -- confirm.
            for row in csv.reader(csv_file, delimiter=','):
                # Blank lines come back as [] and malformed rows may lack the
                # text column; skip them instead of raising IndexError.
                if len(row) < 2:
                    continue
                self.joke_list.append(f"data:{row[1]}{self.end_of_text_token}")

    def __len__(self):
        """Return the number of loaded lecture strings."""
        return len(self.joke_list)

    def __getitem__(self, item):
        """Return the lecture string stored at index ``item``."""
        return self.joke_list[item]


In [None]:
# One paragraph per batch, in random order; the training loop packs several
# paragraphs into a single sequence itself.
dataset = LecturesDataset()
lectures_loader = DataLoader(
    dataset,
    shuffle=True,
    batch_size=1,
)

### Hyperparameters

We mostly tuned ***BATCH_SIZE***, ***EPOCHS***, and ***LEARNING_RATE***.

In [None]:
# Optimisation settings
LEARNING_RATE = 3e-5   # learning rate handed to the optimizer
WARMUP_STEPS = 500     # warm-up steps handed to the learning-rate scheduler
EPOCHS = 5             # number of passes over the data loader

# Batching settings
BATCH_SIZE = 16        # packed sequences processed between optimizer steps
MAX_SEQ_LEN = 40       # maximum number of tokens in one packed sequence

### Model training

We will train the model and save its weights after each epoch; then we will generate Python answers with each saved version of the weights to see which performs best.

In [None]:
# The AdamW class shipped with transformers is deprecated and missing from
# recent releases; the PyTorch implementation is the documented replacement.
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

model.train()

# eps and weight_decay are pinned to the defaults of the former transformers
# AdamW so switching implementations does not change the optimisation setup.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, eps=1e-6, weight_decay=0.0)

# The linear schedule needs the real number of optimizer steps: with a
# negative total its multiplier is clamped to 0 as soon as warm-up ends, which
# silently stops learning. The exact count depends on how paragraphs get
# packed, so use an upper bound: at most one packed sequence per paragraph and
# one optimizer step per BATCH_SIZE sequences.
total_training_steps = max(1, (EPOCHS * len(lectures_loader)) // BATCH_SIZE)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=total_training_steps,
)

proc_seq_count = 0   # packed sequences back-propagated since the last optimizer step
sum_loss = 0.0       # running loss since the last progress print
batch_count = 0      # optimizer steps since the last progress print

tmp_paragraphs_tens = None   # sequence currently being packed (None until the first paragraph)
models_folder = "/kaggle/working/trained_models"  # folder for the trained model weights

# exist_ok avoids the check-then-create race and also creates missing parents.
os.makedirs(models_folder, exist_ok=True)

for epoch in range(EPOCHS):

    print(f"EPOCH {epoch} started" + '=' * 30)

    for paragraph in lectures_loader:

        #################### Fit as many paragraphs into one MAX_SEQ_LEN sequence as possible ####
        paragraph_tens = torch.tensor(tokenizer.encode(paragraph[0])).unsqueeze(0).to(device)

        # Skip a sample that is longer than MAX_SEQ_LEN on its own.
        if paragraph_tens.size()[1] > MAX_SEQ_LEN:
            continue

        # The very first paragraph just starts the packed sequence.
        if not torch.is_tensor(tmp_paragraphs_tens):
            tmp_paragraphs_tens = paragraph_tens
            continue

        if tmp_paragraphs_tens.size()[1] + paragraph_tens.size()[1] <= MAX_SEQ_LEN:
            # Still room: append the paragraph and try to add more.
            # NOTE(review): the [:, 1:] slice drops the first token of every
            # appended paragraph; tokenizer.encode() is not shown adding a
            # leading special token here, so this may discard real text -- confirm.
            tmp_paragraphs_tens = torch.cat([tmp_paragraphs_tens, paragraph_tens[:, 1:]], dim=1)
            continue

        # The paragraph does not fit: train on what has been packed so far and
        # let this paragraph start the next sequence.
        work_paragraphs_tens = tmp_paragraphs_tens
        tmp_paragraphs_tens = paragraph_tens

        ################## Sequence ready, process it through the model ##################
        outputs = model(work_paragraphs_tens, labels=work_paragraphs_tens)
        loss = outputs[0]
        loss.backward()
        # .item() keeps the running total a plain float instead of a device tensor.
        sum_loss += loss.item()

        # Gradients accumulate over BATCH_SIZE sequences before each update.
        proc_seq_count += 1
        if proc_seq_count == BATCH_SIZE:
            proc_seq_count = 0
            batch_count += 1
            optimizer.step()
            scheduler.step()
            # The optimizer owns every model parameter, so this clears all gradients.
            optimizer.zero_grad()

            # Report the accumulated loss every 100 optimizer steps.
            if batch_count == 100:
                print(f"sum loss is {sum_loss}")
                batch_count = 0
                sum_loss = 0.0

    # Store the model after each epoch to compare the performance of them
    torch.save(model.state_dict(), os.path.join(models_folder, f"gpt2_medium_pythonlecturer_{epoch}.pt"))
            

### Generating the lectures

In [None]:
def choose_from_top(probs, n=2, random_seed=None):
    """Sample one index from the ``n`` largest entries of ``probs``.

    The ``n`` largest values are renormalised to sum to 1 and one of them is
    drawn according to those weights.

    Args:
        probs: 1-D NumPy array of probabilities.
        n: How many of the largest entries to sample from.
        random_seed: Value passed to ``np.random.seed`` before drawing;
            ``None`` reseeds NumPy's global generator unpredictably.

    Returns:
        The chosen position in ``probs`` as a plain ``int``.
    """
    # argpartition places the n largest entries in the last n slots (unordered).
    candidate_ids = np.argpartition(probs, -n)[-n:]
    weights = probs[candidate_ids]
    weights = weights / np.sum(weights)

    # Note: this reseeds NumPy's global random state on every call.
    np.random.seed(random_seed)
    drawn = np.random.choice(n, 1, p=weights)
    return int(candidate_ids[drawn][0])

In [None]:
#load the fine-tuned model
MODEL_EPOCH = 4
model_path = os.path.join(models_folder, f"gpt2_medium_pythonlecturer_{MODEL_EPOCH}.pt")
# map_location lets a checkpoint saved on GPU load in a CPU-only session too.
model.load_state_dict(torch.load(model_path, map_location=device))

#set num of paragraphs to generate
generated_paragraph = 5
first_sentence = "Q: What is python? \n A:"
max_paragraph_length = 100

#setting random seed
randomness = None #default: None, change this to a value for debugging purpose

paragraphs_output_file_path = os.path.join("/kaggle/working/", f'generated_lectures_{MODEL_EPOCH}.txt')
audio_file_folder = "/kaggle/working/"
# Start from a clean file: results below are appended paragraph by paragraph.
if os.path.exists(paragraphs_output_file_path):
    os.remove(paragraphs_output_file_path)

# Encode the end-of-text marker once instead of on every generated token.
end_of_text_ids = set(tokenizer.encode('<|endoftext|>'))

model.eval()
with torch.no_grad():
    for paragraph_idx in range(generated_paragraph):
        paragraph_finished = False
        cur_ids = torch.tensor(tokenizer.encode(first_sentence)).unsqueeze(0).to(device)

        for i in range(max_paragraph_length):
            # No labels here: only the logits are needed for sampling, so
            # there is no reason to compute a loss.
            logits = model(cur_ids)[0]
            # First (and only) batch entry, prediction for the last position.
            softmax_logits = torch.softmax(logits[0, -1], dim=0)

            # Sample from a wider candidate set for the first three tokens,
            # then narrow it down.
            n = 20 if i < 3 else 3

            # Randomly (from the top-n probability distribution) select the next token.
            next_token_id = choose_from_top(softmax_logits.to('cpu').numpy(), n=n, random_seed=randomness)

            # Add the chosen token to the running sequence.
            next_token = torch.tensor([[next_token_id]], dtype=torch.long, device=device)
            cur_ids = torch.cat([cur_ids, next_token], dim=1)

            if next_token_id in end_of_text_ids:
                paragraph_finished = True
                break

        # Paragraphs that hit max_paragraph_length without an end-of-text
        # marker are discarded.
        if paragraph_finished:
            output_list = cur_ids.squeeze().to('cpu').tolist()
            # The last id is the end-of-text marker that ended the loop; leave
            # it out so '<|endoftext|>' does not show up in the output. (The
            # stripped text used to be overwritten by a second full decode.)
            output_text = tokenizer.decode(output_list[:-1])
            print(output_text + "\n")

            with open(paragraphs_output_file_path, 'a', encoding='utf-8') as f:
                f.write(f"{output_text} \n\n")
                