In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
%%time
%%capture
!pip install transformers

CPU times: user 46.1 ms, sys: 24.7 ms, total: 70.8 ms
Wall time: 3.75 s


In [9]:
!nvidia-smi

Mon Nov 22 07:31:02 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P8    26W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [10]:
import os
import io
import requests
import numpy as np
import pandas as pd
import re
import zipfile
import random
import time
import csv
import datetime
from itertools import compress
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from transformers import AutoTokenizer, AutoConfig, AutoModelForPreTraining, \
                         AdamW, get_linear_schedule_with_warmup, \
                         TrainingArguments, BeamScorer, Trainer

import torch
from torch.utils.data import Dataset, random_split, DataLoader, \
                             RandomSampler, SequentialSampler

from IPython.display import clear_output

print(f"PyTorch version: {torch.__version__}")

PyTorch version: 1.10.0+cu111


In [11]:
DEBUG           = False

USE_APEX        = True
APEX_OPT_LEVEL  = 'O1'

MODEL           = 'gpt2' #{gpt2, gpt2-medium, gpt2-large, gpt2-xl}

UNFREEZE_LAST_N = 6 #The last N layers to unfreeze for training

SPECIAL_TOKENS  = { "bos_token": "<|BOS|>",
                    "eos_token": "<|EOS|>",
                    "unk_token": "<|UNK|>",                    
                    "pad_token": "<|PAD|>",
                    "sep_token": "<|SEP|>"}
                    
MAXLEN          = 768  #{768, 1024, 1280, 1600}

TRAIN_SIZE      = 0.8

if USE_APEX:
    TRAIN_BATCHSIZE = 4
    BATCH_UPDATE    = 16
else:
    TRAIN_BATCHSIZE = 2
    BATCH_UPDATE    = 32

EPOCHS          = 20
LR              = 5e-4
EPS             = 1e-8
WARMUP_STEPS    = 1e2

SEED            = 2020

In [12]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(SEED)

In [13]:
import pandas as pd
data = pd.read_csv('drive/MyDrive/Colab Notebooks/hotpot_dev_distractor_fullcontext_v1.csv')

In [14]:
pd.set_option('display.max_colwidth', None)
data.head(5)

Unnamed: 0.1,Unnamed: 0,question,text
0,5a8b57f25542995d1e6f1371,Were Scott Derrickson and Ed Wood of the same nationality?,<answer> yes <context> Scott Derrickson born July 16 1966 is an American director screenwriter and producer. He lives in Los Angeles California. He is best known for directing horror films such as Sinister The Exorcism of Emily Rose and Deliver Us From Evil as well as the 2016 Marvel Cinematic Universe installment Doctor Strange.Edward Davis Wood Jr. October 10 1924 – December 10 1978 was an American filmmaker actor writer producer and director.
1,5a8c7595554299585d9e36b6,What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?,<answer> Chief of Protocol <context> Kiss and Tell is a 1945 American comedy film starring then 17-year-old Shirley Temple as Corliss Archer. In the film two teenage girls cause their respective parents much concern when they start to become interested in boys. The parents' bickering about which girl is the worse influence causes more problems than it solves.Shirley Temple Black April 23 1928 – February 10 2014 was an American actress singer dancer businesswoman and diplomat who was Hollywood's number one box-office draw as a child actress from 1935 to 1938. As an adult she was named United States ambassador to Ghana and to Czechoslovakia and also served as Chief of Protocol of the United States.Shirley Temple Black April 23 1928 – February 10 2014 was an American actress singer dancer businesswoman and diplomat who was Hollywood's number one box-office draw as a child actress from 1935 to 1938. As an adult she was named United States ambassador to Ghana and to Czechoslovakia and also served as Chief of Protocol of the United States.
2,5a85ea095542994775f606a8,What science fantasy young adult series told in first person has a set of companion books narrating the stories of enslaved worlds and alien species?,<answer> Animorphs <context> The Hork-Bajir Chronicles is the second companion book to the Animorphs series written by K. A. Applegate. With respect to continuity within the series it takes place before book #23 The Pretender although the events told in the story occur between the time of The Ellimist Chronicles and The Andalite Chronicles. The book is introduced by Tobias who flies to the valley of the free Hork-Bajir where Jara Hamee tells him the story of how the Yeerks enslaved the Hork-Bajir and how Aldrea an Andalite and her companion Dak Hamee a Hork-Bajir tried to save their world from the invasion. Jara Hamee's story is narrated from the points of view of Aldrea Dak Hamee and Esplin 9466 alternating in similar fashion to the Megamorphs books.The Hork-Bajir Chronicles is the second companion book to the Animorphs series written by K. A. Applegate. With respect to continuity within the series it takes place before book #23 The Pretender although the events told in the story occur between the time of The Ellimist Chronicles and The Andalite Chronicles. The book is introduced by Tobias who flies to the valley of the free Hork-Bajir where Jara Hamee tells him the story of how the Yeerks enslaved the Hork-Bajir and how Aldrea an Andalite and her companion Dak Hamee a Hork-Bajir tried to save their world from the invasion. Jara Hamee's story is narrated from the points of view of Aldrea Dak Hamee and Esplin 9466 alternating in similar fashion to the Megamorphs books.The Hork-Bajir Chronicles is the second companion book to the Animorphs series written by K. A. Applegate. With respect to continuity within the series it takes place before book #23 The Pretender although the events told in the story occur between the time of The Ellimist Chronicles and The Andalite Chronicles. The book is introduced by Tobias who flies to the valley of the free Hork-Bajir where Jara Hamee tells him the story of how the Yeerks enslaved the Hork-Bajir and how Aldrea an Andalite and her companion Dak Hamee a Hork-Bajir tried to save their world from the invasion. Jara Hamee's story is narrated from the points of view of Aldrea Dak Hamee and Esplin 9466 alternating in similar fashion to the Megamorphs books.Animorphs is a science fantasy series of young adult books written by Katherine Applegate and her husband Michael Grant writing together under the name K. A. Applegate and published by Scholastic. It is told in first person with all six main characters taking turns narrating the books through their own perspectives. Horror war dehumanization sanity morality innocence leadership freedom and growing up are the core themes of the series.Animorphs is a science fantasy series of young adult books written by Katherine Applegate and her husband Michael Grant writing together under the name K. A. Applegate and published by Scholastic. It is told in first person with all six main characters taking turns narrating the books through their own perspectives. Horror war dehumanization sanity morality innocence leadership freedom and growing up are the core themes of the series.
3,5adbf0a255429947ff17385a,Are the Laleli Mosque and Esma Sultan Mansion located in the same neighborhood?,<answer> no <context> The Laleli Mosque Turkish Laleli Camii or Tulip Mosque is an 18th-century Ottoman imperial mosque located in Laleli Fatih Istanbul Turkey.The Esma Sultan Mansion Turkish Esma Sultan Yalısı a historical yalı English waterside mansion located at Bosphorus in Ortaköy neighborhood of Istanbul Turkey and named after its original owner Esma Sultan is used today as a cultural center after being redeveloped.
4,5a8e3ea95542995a26add48d,The director of the romantic comedy Big Stone Gap is based in what New York city?,<answer> Greenwich Village New York City <context> Big Stone Gap is a 2014 American drama romantic comedy film written and directed by Adriana Trigiani and produced by Donna Gigliotti for Altar Identity Studios a subsidiary of Media Society. Based on Trigiani's 2000 best-selling novel of the same name the story is set in the actual Virginia town of Big Stone Gap circa 1970s. The film had its world premiere at the Virginia Film Festival on November 6 2014.Adriana Trigiani is an Italian American best-selling author of sixteen books television writer film director and entrepreneur based in Greenwich Village New York City. Trigiani has published a novel a year since 2000.


In [15]:
data = data.to_dict()

In [16]:
colnames = ['_id', 'answer', 'question', 'supporting_facts', 'context', 'type', 'level']
#url = "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_train_v1.1.json"
url = "http://curtis.ml.cmu.edu/datasets/hotpot/hotpot_dev_distractor_v1.json"
train_data = pd.read_json(url)
train_data = train_data.drop(columns=['_id','supporting_facts', 'type', 'level'])
contexts = train_data.context.tolist()
# print(len(contexts))
ccontexts = []
ids = []
for i in range(len(contexts)):
   temp = contexts[i][0][1] + [" "] + contexts[i][1][1]
   temp = ''.join(temp)
   temp = temp.replace('"', "")
   temp = temp.replace("\'", "")
   temp = temp.replace(",", "")
   temp = temp.replace(';', "")
   temp = temp.replace(':', "")
   temp = temp.replace('[', "")
   temp = temp.replace(']', "")
   temp = temp.replace('(', "")
   temp = temp.replace(')', "")
   ccontexts.append(temp)
   ids.append(i)


train_data['contexts'] = ccontexts
train_data['id'] = ids
train_data = train_data.drop(columns=['context'])
pd.options.display.max_colwidth = 8000

# train_data = train_data.to_dict('records')

# train_data22 = train_data.to_dict('split')
train_data = train_data.set_index('id').T.to_dict(orient='list')

train_data

{0: ['yes',
  'Were Scott Derrickson and Ed Wood of the same nationality?',
  'Ed Wood is a 1994 American biographical period comedy-drama film directed and produced by Tim Burton and starring Johnny Depp as cult filmmaker Ed Wood. The film concerns the period in Woods life when he made his best-known films as well as his relationship with actor Bela Lugosi played by Martin Landau. Sarah Jessica Parker Patricia Arquette Jeffrey Jones Lisa Marie and Bill Murray are among the supporting cast. Scott Derrickson born July 16 1966 is an American director screenwriter and producer. He lives in Los Angeles California. He is best known for directing horror films such as Sinister The Exorcism of Emily Rose and Deliver Us From Evil as well as the 2016 Marvel Cinematic Universe installment Doctor Strange.'],
 1: ['Chief of Protocol',
  'What government position was held by the woman who portrayed Corliss Archer in the film Kiss and Tell?',
  'Meet Corliss Archer a program from radios Golden Age ra

In [17]:
x = train_data.items()

In [18]:
class myDataset(Dataset):

    def __init__(self, data, tokenizer, randomize=True):

        context, question, answer = [], [], []
        for k, v in data.items():
            context.append(v[2])
            question.append(v[1])
            answer.append(v[0])

        self.randomize = randomize
        self.tokenizer = tokenizer 
        self.context = context
        self.question = question
        self.answer = answer  
    #---------------------------------------------#

    def __len__(self):
        return len(self.question)

    #---------------------------------------------#
    
    def __getitem__(self, i):
        
        input = SPECIAL_TOKENS['bos_token'] + self.context[i] + \
                SPECIAL_TOKENS['sep_token'] + self.answer[i] + SPECIAL_TOKENS['sep_token'] + \
                self.question[i] + SPECIAL_TOKENS['eos_token']

        encodings_dict = tokenizer(input,                                   
                                   truncation=True, 
                                   max_length=MAXLEN, 
                                   padding="max_length")   
        
        input_ids = encodings_dict['input_ids']
        attention_mask = encodings_dict['attention_mask']
        
        return {'label': torch.tensor(input_ids),
                'input_ids': torch.tensor(input_ids), 
                'attention_mask': torch.tensor(attention_mask)}

In [19]:
def split_data(data, S=TRAIN_SIZE):
    # Shuffle ids
    ids = list(data.keys())
    random.shuffle(ids)

    # Split into training and validation sets    
    train_size = int(S * len(data))

    train_ids = ids[:train_size]
    val_ids = ids[train_size:]

    train_data = dict()
    for id in train_ids:
        train_data[id] = data[id]

    val_data = dict()
    for id in val_ids:
        val_data[id] = data[id]

    return train_data, val_data

### Loading Tokenizer, Config and Model

In [20]:
def get_tokenier(special_tokens=None):
    tokenizer = AutoTokenizer.from_pretrained(MODEL) #GPT2Tokenizer

    if special_tokens:
        tokenizer.add_special_tokens(special_tokens)
        print("Special tokens added")
    return tokenizer

def get_model(tokenizer, special_tokens=None, load_model_path=None):

    #GPT2LMHeadModel
    if special_tokens:
        config = AutoConfig.from_pretrained(MODEL, #change
                                            bos_token_id=tokenizer.bos_token_id,
                                            eos_token_id=tokenizer.eos_token_id,
                                            sep_token_id=tokenizer.sep_token_id,
                                            pad_token_id=tokenizer.pad_token_id,
                                            output_hidden_states=False)
    else: 
        config = AutoConfig.from_pretrained(MODEL,         #change                             
                                            pad_token_id=tokenizer.eos_token_id,
                                            output_hidden_states=False)    

    #----------------------------------------------------------------#
    model = AutoModelForPreTraining.from_pretrained(MODEL, config=config)

    if special_tokens:
        #Special tokens added, model needs to be resized accordingly
        model.resize_token_embeddings(len(tokenizer))

    if load_model_path:
        model.load_state_dict(torch.load(load_model_path))

    model.cuda()
    return model

In [22]:
%%time

tokenizer = get_tokenier(special_tokens=SPECIAL_TOKENS)
model = get_model(tokenizer, 
                  special_tokens=SPECIAL_TOKENS,
                  #load_model_path='drive/MyDrive/Colab Notebooks/pytorch_model_V2_1ep.bin'
                 )

Special tokens added
CPU times: user 4.94 s, sys: 1.73 s, total: 6.67 s
Wall time: 24.5 s


In [23]:
# - Freeze selective layers:
# - Freeze all layers except last n:
for parameter in model.parameters():
    parameter.requires_grad = False

for i, m in enumerate(model.transformer.h):        
    #Only un-freeze the last n transformer blocks
    if i+1 > 12 - UNFREEZE_LAST_N:
        for parameter in m.parameters():
            parameter.requires_grad = True 

for parameter in model.transformer.ln_f.parameters():        
    parameter.requires_grad = True

for parameter in model.lm_head.parameters():        
    parameter.requires_grad = True

In [24]:
train_data1, val_data1 = split_data(train_data)

train_dataset = myDataset(train_data1, tokenizer)
val_dataset = myDataset(val_data1, tokenizer, randomize=False)

f'There are {len(train_dataset) :,} samples for training, and {len(val_dataset) :,} samples for validation testing'

'There are 5,924 samples for training, and 1,481 samples for validation testing'

In [None]:
%%time

training_args = TrainingArguments(
    output_dir="drive/MyDrive/Colab Notebooks/gpt_model_output-final/",
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BATCHSIZE,
    per_device_eval_batch_size=TRAIN_BATCHSIZE,
    gradient_accumulation_steps=BATCH_UPDATE,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    fp16=True,
    fp16_opt_level=APEX_OPT_LEVEL,
    warmup_steps=WARMUP_STEPS,    
    learning_rate=LR,
    adam_epsilon=EPS,
    weight_decay=0.01,        
    save_total_limit=1,
    load_best_model_at_end=True,     
)

#---------------------------------------------------#
trainer = Trainer(
    model=model,
    args=training_args,    
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

#---------------------------------------------------#
trainer.train()
trainer.save_model()    

### Generating questions with Fine-tuned GPT-2 model

---

In [None]:
tokenizer = get_tokenier(special_tokens=SPECIAL_TOKENS)
model = get_model(tokenizer, 
                  special_tokens=SPECIAL_TOKENS,
                  load_model_path='pytorch_model.bin')

In [None]:
context = "Typologically, Estonian represents a transitional form from an agglutinating language to a fusional language. The canonical word order is SVO (subject–verb–object)."
answer = "a fusional language"

prompt = SPECIAL_TOKENS['bos_token'] + context + \
         SPECIAL_TOKENS['sep_token'] + answer + SPECIAL_TOKENS['sep_token']
         
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
device = torch.device("cuda")
generated = generated.to(device)

model.eval();

In [None]:
# Top-p (nucleus) text generation (10 samples):
sample_outputs = model.generate(generated, 
                                do_sample=True,   
                                min_length=50, 
                                max_length=MAXLEN,
                                top_k=30,                                 
                                top_p=0.7,        
                                temperature=0.9,
                                repetition_penalty=2.0,
                                num_return_sequences=3
                                )

for i, sample_output in enumerate(sample_outputs):
    question = tokenizer.decode(sample_output, skip_special_tokens=True)
    a = len(context) + len(answer)    
    print("{}: {}\n\n".format(i+1,  question[a:]))

In [None]:
# Beam-search text generation:
sample_outputs = model.generate(generated, 
                                do_sample=True,   
                                max_length=MAXLEN,                                                      
                                num_beams=5,
                                repetition_penalty=5.0,
                                early_stopping=True,      
                                num_return_sequences=1
                                )

for i, sample_output in enumerate(sample_outputs):
    question = tokenizer.decode(sample_output, skip_special_tokens=True)
    a = len(context) + len(answer)    
    print("{}: {}\n\n".format(i+1,  question[a:]))