# Fine-tune GPT-2

In [4]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import torch
from torch.utils.data import Dataset 
import random
import time
import datetime
import random
from transformers import GPT2LMHeadModel, GPT2Config
import numpy as np
from torch.utils.data import random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import requests

In [5]:
import csv
# import wikipedia sentences
import os

cwd = os.getcwd()  # Get the current working directory (cwd)
files = os.listdir(cwd)  # Get all the files in that directory
print("Files in %r: %s" % (cwd, files))

req = pd.read_fwf("/Users/wz/IdeaProjects/context-aware-embedding/Resources/code/input/japan_wiki.txt")
req



Files in '/Users/wz/IdeaProjects/context-aware-embedding/Resources/code': ['Prototype 0.0.ipynb', 'Messing around.ipynb', '.DS_Store', 'Prototype0.1.ipynb', 'input', 'yago2', 'Q&A Baseline.ipynb', '.ipynb_checkpoints', 'accuracy.py', 'wandb']


Unnamed: 0,Hokkaido was formerly known as Ezo Yezo Yeso or Yesso.
0,According to Matsuura the name was thought up...
1,In contrast to the island of Honshu Hokkaido ...
2,From the Middle Ages the people in Hokkaido b...
3,Hokkaido subsequently became known as Ezochi ...
4,The disputes eventually developed into war.
...,...
40383,1494 Hosokawa Masamoto becomes Kyoto kanrei.
40384,1545 Hōjō Ujiyasu defeats the Uesugi clan forc...
40385,1551 Mōri defeats the Ōuchi led by Sue Harukat...
40386,1554 Mōri succeeds to Ōuchi lands and power.


In [8]:
master = "https://raw.githubusercontent.com/mcelikkaya/medium_articles/main/japan_wiki.txt"
req = requests.get(master)
req = req.text

In [10]:
all_sentences = req.split("\n")
all_sentences = [s.replace("\r","") for s in all_sentences]

In [11]:
print("sample size : ",len(all_sentences))
print("samples     : " )
all_sentences[0:10]

sample size :  40389
samples     : 


['Hokkaido was formerly known as Ezo  Yezo  Yeso  or Yesso.',
 'According to Matsuura  the name was thought up because the Ainu called the region Kai.',
 'In contrast to the island of Honshu  Hokkaido saw an absence of conflict during this time period.',
 'From the Middle Ages  the people in Hokkaido began to be called Ezo.',
 'Hokkaido subsequently became known as Ezochi  蝦夷地  lit.',
 'The disputes eventually developed into war.',
 'Takeda Nobuhiro killed the Ainu leader  Koshamain  and defeated the opposition in 1457.',
 'The Matsumae family s economy relied upon trade with the Ainu.',
 'They held authority over the south of Ezochi until the end of the Edo period.',
 'There were numerous revolts by the Ainu against the feudal rule.']

In [12]:
from transformers import GPT2Tokenizer
#get pretrained tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<sos>', eos_token='<eos>', pad_token='<pad>')

#tokenizer some samples
print( tokenizer.encode("Japan Tokyo") )
print( tokenizer.encode("Japan") )
print( tokenizer.encode("japan tokyo") )
print( tokenizer.encode("japan") )
print( tokenizer.encode("tokyo") )

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


[16504, 11790]
[16504]
[73, 2674, 284, 2584, 78]
[73, 2674]
[83, 482, 8226]


In [13]:
max_len = max([len(tokenizer.encode(s)) for s in all_sentences])

print(f"max_len {max_len}")

max_len 85


In [14]:
#since we will be feeding with sentences from wikipedia
#we can mark beginning and end of sentences with with sos and eos
def tokenize_seq(sent,tokenizer,max_length):
    return tokenizer('<sos>'+ sent + '<eos>', truncation=True, max_length=max_length, padding="max_length")

class JapanDataset(Dataset):

    def __init__(self, sentences, tokenizer, gpt2_type="gpt2", max_length=max_len):

        self.tokenizer = tokenizer 
        self.input_ids = []
        self.attn_masks = []

        for sentence in sentences:      
            encodings = tokenize_seq(sentence,tokenizer,max_length)

            self.input_ids.append(torch.tensor(encodings['input_ids']))
            self.attn_masks.append(torch.tensor(encodings['attention_mask']))
    
    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]   

def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))    

In [15]:
import gc
gc.collect() 

52

In [16]:
#create an instance of Dataset
dataset = JapanDataset(all_sentences, tokenizer, max_length=max_len)

# Split into training and validation sets
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size

train_set, val_set = random_split(dataset, [train_size, val_size])
print("train_size :",train_size)
print("val_size   :",val_size)

gc.collect() 

train_size : 36350
val_size   : 4039


0

In [17]:
#lets check a sample from dataset 
#50257 beginning of sentence token
#50258 end of sentence token
#50259 pad token
dataset[0]

(tensor([50257,    39,   482,    74, 44354,   373, 15734,  1900,   355,   412,
         10872,   220,   575,  8471,    78,   220,  3363,    78,   220,   393,
           575,   408,    78,    13, 50258, 50259, 50259, 50259, 50259, 50259,
         50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
         50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
         50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
         50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
         50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259, 50259,
         50259, 50259, 50259, 50259, 50259]),
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [18]:
#define dataloaders
train_dataloader = DataLoader(train_set,  sampler = RandomSampler(train_set), batch_size = 32)
validation_dataloader = DataLoader(val_set, sampler = SequentialSampler(val_set), batch_size = 32 )

In [22]:
# Create default config
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)
# Load pretrained gpt2
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)
model.resize_token_embeddings(len(tokenizer))

# Create device
device = torch.device("cpu")
model.cpu()


optimizer = torch.optim.Adam(model.parameters(),lr = 0.0005)
model = model.to(device)

In [23]:
#at every step i want to check if generations are getting better.
def eval_keywords(keywords):
    model.eval()
    for keyword in keywords:
        input_seq = "<sos> " + keyword
        generated = torch.tensor(tokenizer.encode(input_seq)).unsqueeze(0)
        generated = generated.to(device)
        sample_outputs = model.generate(
                                generated, 
                                do_sample=True,   
                                top_k=30, 
                                max_length = 50,
                                top_p=0.90, 
                                num_return_sequences=2
                                )
    for i, sample_output in enumerate(sample_outputs):
        print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

keywords = ["Osaka","Japan","Kyoto","Yokohama","Kanto","Nikko","Japan has","Tokyo is the","Osaka is the","Kyoto is the"]

In [24]:
#call model with a batch of input
def process_one_batch(batch):
    b_input_ids = batch[0].to(device)
    b_labels = batch[0].to(device)
    b_masks = batch[1].to(device)
    outputs  = model(b_input_ids,  attention_mask = b_masks,labels=b_labels)
    return outputs

#do one epoch for training
def train_epoch():
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        
        model.zero_grad()        
        outputs = process_one_batch( batch)
        loss = outputs[0]  
        batch_loss = loss.item()
        total_train_loss += batch_loss

        loss.backward()
        optimizer.step()

        
    avg_train_loss = total_train_loss / len(train_dataloader)  
    print("avg_train_loss",avg_train_loss)  
    elapsed_time = format_time(time.time() - t0)
    print("elapsed time for 1 training epoch : ",elapsed_time)

#do one epoch for eval
def eval_epoch():
    t0 = time.time()
    total_eval_loss = 0
    nb_eval_steps = 0
    # Evaluate data for one epoch
    for batch in validation_dataloader:            
        
        with torch.no_grad():        
            outputs = process_one_batch( batch)
            loss = outputs[0]              
            batch_loss = loss.item()
            total_eval_loss += batch_loss         

    avg_val_loss = total_eval_loss / len(validation_dataloader)
    print("avg_val_loss",avg_val_loss) 
    elapsed_time = format_time(time.time() - t0)
    print("elapsed time for 1 eval epoch : ",elapsed_time)

In [25]:
#train eval 1 cycle
#then create sample sentences
train_epoch()
eval_epoch()
eval_keywords( keywords )

KeyboardInterrupt: 