In [1]:
import os 
import numpy as np
import torch
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers import GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup
from utils import get_tokenizer, set_seed
from gpt_dataset import GPT2Dataset
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F

import json
import argparse
import time 
from tqdm import tqdm_notebook, tnrange

In [2]:
parser = argparse.ArgumentParser()
parser.add_argument("--lr",default=5e-5, type=float, help="learning rate")
parser.add_argument("--seed",default=42, type=int,  help="seed to replicate results")
parser.add_argument("--n_gpu",default=1, type=int,  help="no of gpu available")
parser.add_argument("--gradient_accumulation_steps",default=2, type=int, help="gradient_accumulation_steps")
parser.add_argument("--batch_size",default=1, type=int,  help="batch_size")
parser.add_argument("--num_workers",default=4, type=int,  help="num of cpus available")
parser.add_argument("--device",default=torch.device('cpu'), type=torch.device, help="torch.device object")
parser.add_argument("--num_train_epochs",default=1, type=int,  help="no of epochs of training")
parser.add_argument("--output_dir",default='./output', type=str,  help="path to save evaluation results")
parser.add_argument("--model_dir",default='./weights', type=str,  help="path to save trained model")
parser.add_argument("--max_grad_norm",default=1.0, type=float, help="max gradient norm.")
parser.add_argument("--data_dir",default='./data', type=str, help="location of json dataset.")
# parser.add_argument("--ids_file",default='./data', type=str, help="location of train, valid and test file indexes")
args = parser.parse_args(["--device", "cuda", "--data_dir", "data/train-balanced-sarcasm.csv", "--model_dir", "data/"])
print(args)

Namespace(lr=5e-05, seed=42, n_gpu=1, gradient_accumulation_steps=2, batch_size=1, num_workers=4, device=device(type='cuda'), num_train_epochs=1, output_dir='./output', model_dir='data/', max_grad_norm=1.0, data_dir='data/train-balanced-sarcasm.csv')


In [2]:
from tqdm.notebook import tqdm, trange 

In [9]:
def train(args, model, tokenizer, train_dataset, ignore_index):
    writer = SummaryWriter('./logs')
    train_sampler = RandomSampler(train_dataset)
    train_dl = DataLoader(
        train_dataset, 
        sampler=train_sampler,
        batch_size=args.batch_size,
        num_workers=args.num_workers
    )
    loss_fact = CrossEntropyLoss(ignore_index=ignore_index)
    optimizer = AdamW(model.parameters(), lr=args.lr)
    scheduler = get_linear_schedule_with_warmup(optimizer, 100, 80000)
    
    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="epochs")
    set_seed(args)
    
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dl, desc='training')
        for step, batch in enumerate(epoch_iterator):
            inputs, labels = batch['context'], batch['context']
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model = model.to(args.device)
            model.train()
            logits = model(inputs)[0]
            
            loc_sep = batch['loc_sep']
            shifted_logits = logits[:, loc_sep:-1, :].contiguous()
            shifted_labels = labels[:, loc_sep+1:].contiguous()
            
            loss = loss_fact(shifted_logits.view(-1, shifted_logits.size(-1)), shifted_labels.view(-1))
            loss /= args.gradient_accumulation_steps
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            tr_loss += loss.item()
            
            if (step+1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                model.zero_grad()
                global_step += 1
                writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                writer.add_scalar('loss', (tr_loss - logging_loss)/args.gradient_accumulation_steps, global_step)
                logging_loss = tr_loss
                print("Loss: ", loss.item(), end="\n\n")

In [5]:
train_data = GPT2Dataset(args.data_dir)

100%|██████████| 17888/17888 [00:04<00:00, 3605.34it/s]


In [6]:
len(train_data)

17888

In [7]:
tokenizer = get_tokenizer()
ignore_index = tokenizer.pad_token_id

model = GPT2LMHeadModel.from_pretrained('gpt2')
model.resize_token_embeddings(len(tokenizer))

Embedding(50260, 768)

In [10]:
set_seed(args)
start = time.time()
print("Start time: ", start)
train(args, model, tokenizer, train_data, ignore_index)

Start time:  1691793548.2192638




epochs:   0%|          | 0/1 [00:00<?, ?it/s]

training:   0%|          | 0/17888 [00:00<?, ?it/s]



Loss:  51.353858947753906

Loss:  56.93653106689453

Loss:  49.18165969848633

Loss:  46.95329284667969

Loss:  56.562740325927734

Loss:  39.54401779174805

Loss:  52.501548767089844

Loss:  38.652687072753906

Loss:  53.08421325683594

Loss:  46.54054260253906

Loss:  48.969730377197266

Loss:  45.39422607421875

Loss:  47.49380874633789

Loss:  32.578643798828125

Loss:  49.13037872314453

Loss:  43.82064437866211

Loss:  33.50526809692383

Loss:  30.307722091674805

Loss:  53.80451583862305

Loss:  53.27412796020508

Loss:  40.14712905883789

Loss:  36.830081939697266

Loss:  36.907432556152344

Loss:  39.81113052368164

Loss:  33.867610931396484

Loss:  37.34580993652344

Loss:  32.081932067871094

Loss:  34.3828010559082

Loss:  20.223665237426758

Loss:  22.06003189086914

Loss:  24.696008682250977

Loss:  24.050020217895508

Loss:  22.766794204711914

Loss:  22.20594596862793

Loss:  12.083234786987305

Loss:  12.368781089782715

Loss:  16.92934799194336

Loss:  13.300006866455

KeyboardInterrupt: 

In [12]:
print('Saving trained model...')
model_file = os.path.join("data", 'model_data{}_trained_after_{}_epochs_only_sum_loss_ignr_pad.bin'.format(len(train_data),args.num_train_epochs))
config_file = os.path.join("data", 'config_data{}_trained_after_{}_epochs_only_sum_loss_ignr_pad.json'.format(len(train_data),args.num_train_epochs))
torch.save(model.state_dict(), model_file)
model.config.to_json_file(config_file)

Saving trained model...


In [13]:
torch.save(model, 'data/model_1.pt')

In [14]:
tokenizer.save_vocabulary('data/')

('data/vocab.json', 'data/merges.txt')

### Testing model

In [6]:
model = torch.load('./data/model_1.pt', map_location=torch.device('cpu'))
tokenizer = get_tokenizer()

In [7]:
from utils import gen_reply

In [9]:
gen_reply(model, tokenizer, tokenizer.encode(tokenizer.eos_token+"This cannot be true", return_tensors='pt').to('cpu'))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"you're right, but you should have worked hard for it, you know you need to work harder to make a living"

## Training on GPT3 generated dataset

In [4]:
from gpt_dataset import CustomDataset

In [5]:
parser = argparse.ArgumentParser()
parser.add_argument("--lr",default=5e-5, type=float, help="learning rate")
parser.add_argument("--seed",default=42, type=int,  help="seed to replicate results")
parser.add_argument("--n_gpu",default=1, type=int,  help="no of gpu available")
parser.add_argument("--gradient_accumulation_steps",default=2, type=int, help="gradient_accumulation_steps")
parser.add_argument("--batch_size",default=1, type=int,  help="batch_size")
parser.add_argument("--num_workers",default=4, type=int,  help="num of cpus available")
parser.add_argument("--device",default=torch.device('cuda'), type=torch.device, help="torch.device object")
parser.add_argument("--num_train_epochs",default=1, type=int,  help="no of epochs of training")
parser.add_argument("--output_dir",default='./output', type=str,  help="path to save evaluation results")
parser.add_argument("--model_dir",default='./weights', type=str,  help="path to save trained model")
parser.add_argument("--max_grad_norm",default=1.0, type=float, help="max gradient norm.")
parser.add_argument("--data_dir",default='./data', type=str, help="location of json dataset.")
# parser.add_argument("--ids_file",default='./data', type=str, help="location of train, valid and test file indexes")
args = parser.parse_args(["--device", "cuda", "--data_dir", "./data/dataset.csv", "--model_dir", "./data/model_3.pt"])
print(args)

Namespace(lr=5e-05, seed=42, n_gpu=1, gradient_accumulation_steps=2, batch_size=1, num_workers=4, device=device(type='cuda'), num_train_epochs=1, output_dir='./output', model_dir='./data/model_3.pt', max_grad_norm=1.0, data_dir='./data/dataset.csv')


In [6]:
train_data = CustomDataset(args.data_dir)

100%|██████████| 175672/175672 [00:41<00:00, 4243.85it/s]


In [7]:
tokenizer_3 = get_tokenizer()
ignore_index = tokenizer_3.pad_token_id

model_3 = GPT2LMHeadModel.from_pretrained('gpt2')
model_3.resize_token_embeddings(len(train_data))

Embedding(172177, 768)

In [10]:
set_seed(args)
start = time.time()
print("Start time: ", start)
train(args, model_3, tokenizer_3, train_data, ignore_index)

Start time:  1692021104.152762


epochs:   0%|          | 0/1 [00:00<?, ?it/s]

training:   0%|          | 0/172177 [00:00<?, ?it/s]



Loss:  60.52371597290039



KeyboardInterrupt: 

In [12]:
model_3 = torch.load('data/Model_3/model_3.pt')
tokenizer_3 = get_tokenizer()

In [14]:
from utils import gen_reply

In [17]:
gen_reply(model_3, tokenizer_3, tokenizer_3.encode(tokenizer_3.eos_token+"How are you", return_tensors='pt').to('cuda'))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'sorry, this is a long sentence. edit. sorry. my grammar is too long, sorry'