# Fine tune GPT2-small
In this notebook, I will be fine tunning small gpt2

## Imports

In [1]:
import os
import sys
sys.path.append(os.path.join(os.getcwd(), "..", ".."))
from src.ds_loaders.gpt_2_dataset import GPT21024Dataset
import matplotlib.pyplot as plt

In [None]:
import argparse
from datetime import datetime
import json
import os
import pickle
import random
import sys
import time

import numpy as np
from pytorch_transformers import GPT2Tokenizer
import torch
from torch.nn import CrossEntropyLoss
import torch.nn.functional as F
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from tqdm import tnrange, tqdm, tqdm_notebook
from pytorch_transformers import ConstantLRSchedule, GPT2Config, GPT2LMHeadModel,AdamW, GPT2Tokenizer, WarmupLinearSchedule
# from tensorboardX import SummaryWriter

# from dataset import GPT21024Dataset
# from utils import add_special_tokens, beam_search, generate_beam_sample, generate_sample, sample_seq, set_seed, top_k_top_p_filtering

from src.ds_loaders.gpt_2_dataset import GPT21024DatasetTok
from src.utils.deep_tools import set_seed, get_model_tokenizer

In [3]:
d_path = "../../data/cnn_daily_1024_tok_clip_1000_gpt2.json"
ds_train = GPT21024DatasetTok(d_path, mode="train", length=100)
ds_val = GPT21024DatasetTok(d_path, mode="val", length=50)


In [4]:
print(len(ds_train[1]['document']))

1024


In [8]:
cnf = {
    "lr": 5e-5,
    "gradient_accumulation_steps": 2,
    "batch_size": 1,
    "num_workers": 4,
    "device": "cuda",
    "num_train_epochs": 5,
    "weights_dir": "weights_gpt2",
    "max_grad_norm": 1.0,
    "seed": 1
}

In [6]:
def train(model, tokenizer, train_dataset, valid_dataset, ignore_index):
	# """ Trains GPT2 model and logs necessary details.
	# 	Args:
	# 		args: dict that contains all the necessary information passed by user while training
 	# 		model: finetuned gpt/gpt2 model
	# 		tokenizer: GPT/GPT2 tokenizer
	# 		train_dataset: GPT21024Dataset object for training data
	# 		ignore_index: token not considered in loss calculation
	# """
    # writer = SummaryWriter('./logs')
    # TODO: wand init and log

    train_sampler = RandomSampler(train_dataset)
    train_dl = DataLoader(train_dataset,sampler=train_sampler,batch_size=cnf["batch_size"],num_workers=cnf["num_workers"])
    loss_fct = CrossEntropyLoss(ignore_index=ignore_index) #ignores padding token for loss calculation
    optimizer = AdamW(model.parameters(),lr=cnf["lr"])
    scheduler = WarmupLinearSchedule(optimizer, 100, 80000)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = tnrange(int(cnf["num_train_epochs"]), desc="Epoch")
    if cnf["seed"] is not None:
        set_seed(cnf)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dl, desc="Training")
        for step, batch in enumerate(epoch_iterator):
            inputs, labels = torch.tensor(batch['document']), torch.tensor(batch['document'])
            inputs = inputs.to(cnf["device"])
            labels = labels.to(cnf["device"])
            model.train()
            logits = model(inputs)[0]
            idx = batch['sum_idx'].item() # index of separator token
            # only consider loss on reference summary just like seq2seq models
            shift_logits = logits[..., idx:-1, :].contiguous()
            shift_labels = labels[..., idx+1:].contiguous()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            loss = loss/cnf["gradient_accumulation_steps"]
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), cnf["max_grad_norm"])
            tr_loss += loss.item()
            if (step + 1) % cnf["gradient_accumulation_steps"] == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                # writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                # writer.add_scalar('loss', (tr_loss - logging_loss)/args.gradient_accumulation_steps, global_step)
                logging_loss = tr_loss
                print("loss:", loss.item(), end='\n\n')
                # if (step + 1)/cnf["gradient_accumulation_steps"] == 1.0:
                # 	print('After 1st update: ', end='\n\n')
                # 	generate_sample(valid_dataset, tokenizer, num=2, eval_step=False)


            # if (step + 1) % (10*cnf["gradient_accumulation_steps"]) == 0:
                # results = evaluate(args, model, valid_dataset, ignore_index, global_step)
                # for key, value in results.items():
                #     writer.add_scalar('eval_{}'.format(key), value, global_step)
                # print('After', global_step+1,'updates: ', end='\n\n')
                # generate_sample(valid_dataset, tokenizer, num=2, eval_step=True)


In [None]:
def evaluate(model, eval_dataset, ignore_index, global_step=None):
    # """ Returns perplexity score on validation dataset.
    #     Args:
    #         args: dict that contains all the necessary information passed by user while training
    #         model: finetuned gpt/gpt2 model
    #         eval_dataset: GPT21024Dataset object for validation data
    #         global_step: no. of times gradients have backpropagated
    #         ignore_index: token not considered in loss calculation
    # """
    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)
    eval_output_dir = args.output_dir

    results = {}
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=cnf["batch_size"])
    loss_fct = CrossEntropyLoss(ignore_index=ignore_index) #ignores padding token for loss calculation

    eval_loss = 0.0
    nb_eval_steps = 0
    model.eval()

    for batch in tqdm_notebook(eval_dataloader, desc="Evaluating"):
        inputs, labels = batch['article'].to(cnf["device"]), batch['article'].to(args.device)

        with torch.no_grad():
            logits = model(inputs)[0]
            # idx = batch['sum_idx'].item() # index of separator token
            # only consider loss on reference summary just like seq2seq models
            shift_logits = logits[..., batch['sum_idx']:-1, :].contiguous()
            shift_labels = labels[..., batch['sum_idx']+1:].contiguous()
            lm_loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
            eval_loss += lm_loss.mean().item()
        nb_eval_steps += 1

    eval_loss = eval_loss / nb_eval_steps
    perplexity = torch.exp(torch.tensor(eval_loss))

    result = {
        "perplexity": perplexity
    }
    print("perplexity:", perplexity.item())

    if global_step:
        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
        with open(output_eval_file, "a") as f:
            for key in sorted(result.keys()):
                f.write('\n\n')
                f.write("time = %s, %s = %s, step = %s\n" % (datetime.now().strftime("%d/%m/%Y %H:%M:%S"), key, str(result[key]), str(global_step)))
    return result

In [7]:
tokenizer = get_model_tokenizer()
ignore_idx = tokenizer.pad_token_id
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))
print(f"len of tokenizer: {len(tokenizer)}")

100%|██████████| 665/665 [00:00<00:00, 162921.27B/s]
100%|██████████| 548118077/548118077 [09:32<00:00, 956737.75B/s]  


len of tokenizer: 50259


RuntimeError: Expected one of cpu, cuda, xpu, mkldnn, opengl, opencl, ideep, hip, ve, ort, mlc, xla, lazy, vulkan, meta, hpu device type at start of device string: gpu

In [9]:
model.to(cnf["device"])

AssertionError: Torch not compiled with CUDA enabled