In [None]:
!unzip colab.zip

In [9]:
import numpy as np
import torch
import torch.nn as nn
from tqdm import tqdm
from torch.nn import functional as F
import random
import argparse
# Seed Python's built-in `random` module with a fixed value.
# NOTE(review): only `random` is seeded in this cell; the numpy and torch RNGs
# are not seeded here — confirm whether the utils/trainer code seeds them.
random.seed(0)

from utils import dataset
from utils import model 
from utils import trainer as t
from utils import utils

In [11]:
# Input corpora: pretraining text, finetuning examples, and the file the
# finetuned models are evaluated on.
pretrain_corpus_path = "data/wiki.txt"
finetune_corpus_path = "data/birth_places_train.tsv"
eval_corpus_path = "data/birth_dev.tsv"

# Artifacts of the vanilla model: pretrained weights, finetuned weights, and
# the predictions written during evaluation.
vanilla_pretrain_params = "data/vanilla.pretrain.params"
vanilla_finetune_params = "data/vanilla.finetune.params"
# NOTE(review): the filename says "nopretrain.test", yet this notebook pretrains
# the model and evaluates on eval_corpus_path (birth_dev.tsv). Left unchanged so
# anything already reading this file keeps working.
vanilla_outputs_path = "data/vanilla.nopretrain.test.predictions.txt"

# Artifacts of the synthesizer model. The predictions file must differ from
# vanilla_outputs_path, otherwise the synthesizer evaluation overwrites the
# vanilla predictions.
synthesizer_pretrain_params = "data/synthesizer.pretrain.params"
synthesizer_finetune_params = "data/synthesizer.finetune.params"
synthesizer_outputs_path = "data/synthesizer.pretrain.dev.predictions.txt"

# Vanilla model

In [4]:
# Pretrain a GPT configured with synthesizer=False on a CharCorruptionDataset
# built from the pretraining corpus, then save its weights to
# vanilla_pretrain_params.
block_size = 128

# Read the corpus inside a context manager so the file handle is closed
# as soon as the text is in memory.
with open(pretrain_corpus_path, encoding="utf8") as corpus_file:
    text = corpus_file.read()
pretrain_dataset = dataset.CharCorruptionDataset(text, block_size)

mconf = model.GPTConfig(pretrain_dataset.vocab_size, pretrain_dataset.block_size,
    n_layer=4, n_head=8, n_embd=256, synthesizer=False)

# Not used in this cell; the finetune/evaluation cell moves the model and the
# input tensors onto this device.
device = torch.cuda.current_device() if torch.cuda.is_available() else 'cpu'

#pretrain

m = model.GPT(mconf)

tconf = t.TrainerConfig(max_epochs=650, batch_size=128, learning_rate=6e-3,
                    lr_decay=True, warmup_tokens=512*20, final_tokens=200*len(pretrain_dataset)*block_size,
                    num_workers=4)

# The third positional argument is None — presumably "no held-out dataset";
# TODO confirm against utils.trainer.Trainer.
trainer = t.Trainer(m, pretrain_dataset, None, tconf)
trainer.train()
torch.save(m.state_dict(), vanilla_pretrain_params)

data has 418352 characters, 256 unique.
number of parameters: 3323392


  cpuset_checked))
epoch 1 iter 22: train loss 3.45291. lr 5.999655e-03: 100%|██████████| 23/23 [00:03<00:00,  6.10it/s]
epoch 2 iter 22: train loss 3.11841. lr 5.998582e-03: 100%|██████████| 23/23 [00:03<00:00,  6.44it/s]
epoch 3 iter 22: train loss 2.95602. lr 5.996780e-03: 100%|██████████| 23/23 [00:03<00:00,  6.40it/s]
epoch 4 iter 22: train loss 2.90039. lr 5.994250e-03: 100%|██████████| 23/23 [00:03<00:00,  6.31it/s]
epoch 5 iter 22: train loss 2.82573. lr 5.990993e-03: 100%|██████████| 23/23 [00:03<00:00,  6.31it/s]
epoch 6 iter 22: train loss 2.80457. lr 5.987009e-03: 100%|██████████| 23/23 [00:03<00:00,  6.40it/s]
epoch 7 iter 22: train loss 2.78576. lr 5.982299e-03: 100%|██████████| 23/23 [00:03<00:00,  6.34it/s]
epoch 8 iter 22: train loss 2.74151. lr 5.976865e-03: 100%|██████████| 23/23 [00:03<00:00,  6.39it/s]
epoch 9 iter 22: train loss 2.70407. lr 5.970707e-03: 100%|██████████| 23/23 [00:03<00:00,  6.34it/s]
epoch 10 iter 22: train loss 2.69707. lr 5.963828e-03: 100%|███

In [7]:
#finetune

# Build the network under its own name instead of rebinding `model`: `model` is
# the imported utils.model module, and later cells still call model.GPTConfig /
# model.GPT on it.
vanilla_model = model.GPT(mconf)
vanilla_model.load_state_dict(torch.load(vanilla_pretrain_params))
vanilla_model = vanilla_model.to(device)

# Context manager closes the finetuning corpus once it has been read.
with open(finetune_corpus_path, encoding="utf8") as finetune_file:
    fine_text = finetune_file.read()
train_dataset = dataset.NameDataset(pretrain_dataset, fine_text)

tconf = t.TrainerConfig(max_epochs=10, batch_size=256, learning_rate=6e-4,
            lr_decay=True, warmup_tokens=512*20, final_tokens=200*len(pretrain_dataset)*block_size,
            num_workers=4)

trainer = t.Trainer(vanilla_model, train_dataset, None, tconf)
trainer.train()
torch.save(vanilla_model.state_dict(), vanilla_finetune_params)

#evaluation

# For every line of the eval file: take the first tab-separated field, append
# the '⁇' marker, let the model extend the sequence by up to 32 steps, and keep
# the text between the first and second '⁇' as the prediction.
# Both files are opened as UTF-8 explicitly (the marker and the data may be
# non-ASCII) and are closed when the block exits.
predictions = []
with open(eval_corpus_path, encoding="utf8") as eval_file, \
        open(vanilla_outputs_path, 'w', encoding="utf8") as fout:
    for line in tqdm(eval_file):
        x = line.split('\t')[0]
        x = x + '⁇'
        # Encode characters to ids and add a leading batch dimension of size 1.
        x = torch.tensor([pretrain_dataset.stoi[s] for s in x], dtype=torch.long)[None,...].to(device)
        pred = utils.sample(vanilla_model, x, 32, sample=False)[0]
        completion = ''.join([pretrain_dataset.itos[int(i)] for i in pred])
        pred = completion.split('⁇')[1]
        predictions.append(pred)
        fout.write(pred + '\n')

total, correct = utils.evaluate_places(eval_corpus_path, predictions)
if total > 0:
    print('Correct: {} out of {}: {}%'.format(correct, total, correct/total*100))
else:
    print('Predictions written to {}; no targets provided'
            .format(vanilla_outputs_path))


number of parameters: 3323392


  cpuset_checked))
epoch 1 iter 7: train loss 0.76693. lr 5.999844e-04: 100%|██████████| 8/8 [00:02<00:00,  3.25it/s]
epoch 2 iter 7: train loss 0.60305. lr 5.999351e-04: 100%|██████████| 8/8 [00:02<00:00,  3.33it/s]
epoch 3 iter 7: train loss 0.51317. lr 5.998521e-04: 100%|██████████| 8/8 [00:02<00:00,  3.39it/s]
epoch 4 iter 7: train loss 0.45474. lr 5.997352e-04: 100%|██████████| 8/8 [00:02<00:00,  3.29it/s]
epoch 5 iter 7: train loss 0.38587. lr 5.995847e-04: 100%|██████████| 8/8 [00:02<00:00,  3.39it/s]
epoch 6 iter 7: train loss 0.34488. lr 5.994004e-04: 100%|██████████| 8/8 [00:02<00:00,  3.35it/s]
epoch 7 iter 7: train loss 0.27735. lr 5.991823e-04: 100%|██████████| 8/8 [00:02<00:00,  3.34it/s]
epoch 8 iter 7: train loss 0.23862. lr 5.989306e-04: 100%|██████████| 8/8 [00:02<00:00,  3.30it/s]
epoch 9 iter 7: train loss 0.20795. lr 5.986453e-04: 100%|██████████| 8/8 [00:02<00:00,  3.29it/s]
epoch 10 iter 7: train loss 0.16498. lr 5.983263e-04: 100%|██████████| 8/8 [00:02<00:00,  

Correct: 110.0 out of 500.0: 22.0%





# Synthesizer model

In [10]:
# Pretrain a GPT configured with synthesizer=True on a CharCorruptionDataset
# built from the pretraining corpus, then save its weights to
# synthesizer_pretrain_params.
block_size = 128

# Read the corpus inside a context manager so the file handle is closed
# as soon as the text is in memory.
with open(pretrain_corpus_path, encoding="utf8") as corpus_file:
    text = corpus_file.read()
pretrain_dataset = dataset.CharCorruptionDataset(text, block_size)

mconf = model.GPTConfig(pretrain_dataset.vocab_size, pretrain_dataset.block_size,
    n_layer=4, n_head=8, n_embd=256, synthesizer=True)

# Not used in this cell; the finetune/evaluation cell moves the model and the
# input tensors onto this device.
device = torch.cuda.current_device() if torch.cuda.is_available() else 'cpu'

#pretrain

m = model.GPT(mconf)

tconf = t.TrainerConfig(max_epochs=650, batch_size=128, learning_rate=6e-3,
                    lr_decay=True, warmup_tokens=512*20, final_tokens=200*len(pretrain_dataset)*block_size,
                    num_workers=4)

# The third positional argument is None — presumably "no held-out dataset";
# TODO confirm against utils.trainer.Trainer.
trainer = t.Trainer(m, pretrain_dataset, None, tconf)
trainer.train()
torch.save(m.state_dict(), synthesizer_pretrain_params)

data has 418352 characters, 256 unique.
number of parameters: 3076988


  cpuset_checked))
epoch 1 iter 22: train loss 3.44117. lr 5.999655e-03: 100%|██████████| 23/23 [00:03<00:00,  6.54it/s]
epoch 2 iter 22: train loss 3.44818. lr 5.998582e-03: 100%|██████████| 23/23 [00:03<00:00,  6.67it/s]
epoch 3 iter 22: train loss 3.16971. lr 5.996780e-03: 100%|██████████| 23/23 [00:03<00:00,  6.68it/s]
epoch 4 iter 22: train loss 2.97910. lr 5.994250e-03: 100%|██████████| 23/23 [00:03<00:00,  6.62it/s]
epoch 5 iter 22: train loss 2.84817. lr 5.990993e-03: 100%|██████████| 23/23 [00:03<00:00,  6.58it/s]
epoch 6 iter 22: train loss 2.78073. lr 5.987009e-03: 100%|██████████| 23/23 [00:03<00:00,  6.54it/s]
epoch 7 iter 22: train loss 2.73707. lr 5.982299e-03: 100%|██████████| 23/23 [00:03<00:00,  6.62it/s]
epoch 8 iter 22: train loss 2.66887. lr 5.976865e-03: 100%|██████████| 23/23 [00:03<00:00,  6.64it/s]
epoch 9 iter 22: train loss 2.60324. lr 5.970707e-03: 100%|██████████| 23/23 [00:03<00:00,  6.56it/s]
epoch 10 iter 22: train loss 2.54533. lr 5.963828e-03: 100%|███

NameError: ignored

In [12]:
# Write the state dict of `m` to synthesizer_pretrain_params.
# NOTE(review): this is the same statement that ends the synthesizer pretraining
# cell, so on a clean top-to-bottom run it just rewrites the same file.
torch.save(m.state_dict(), synthesizer_pretrain_params)

In [13]:
#finetune

# Build the network under its own name instead of rebinding `model`: `model` is
# the imported utils.model module, and rebinding it to a GPT instance would
# break any later model.GPTConfig / model.GPT call.
synthesizer_model = model.GPT(mconf)
synthesizer_model.load_state_dict(torch.load(synthesizer_pretrain_params))
synthesizer_model = synthesizer_model.to(device)

# Context manager closes the finetuning corpus once it has been read.
with open(finetune_corpus_path, encoding="utf8") as finetune_file:
    fine_text = finetune_file.read()
train_dataset = dataset.NameDataset(pretrain_dataset, fine_text)

tconf = t.TrainerConfig(max_epochs=10, batch_size=256, learning_rate=6e-4,
            lr_decay=True, warmup_tokens=512*20, final_tokens=200*len(pretrain_dataset)*block_size,
            num_workers=4)

trainer = t.Trainer(synthesizer_model, train_dataset, None, tconf)
trainer.train()
torch.save(synthesizer_model.state_dict(), synthesizer_finetune_params)

#evaluation

# For every line of the eval file: take the first tab-separated field, append
# the '⁇' marker, let the model extend the sequence by up to 32 steps, and keep
# the text between the first and second '⁇' as the prediction.
# Both files are opened as UTF-8 explicitly (the marker and the data may be
# non-ASCII) and are closed when the block exits.
predictions = []
with open(eval_corpus_path, encoding="utf8") as eval_file, \
        open(synthesizer_outputs_path, 'w', encoding="utf8") as fout:
    for line in tqdm(eval_file):
        x = line.split('\t')[0]
        x = x + '⁇'
        # Encode characters to ids and add a leading batch dimension of size 1.
        x = torch.tensor([pretrain_dataset.stoi[s] for s in x], dtype=torch.long)[None,...].to(device)
        pred = utils.sample(synthesizer_model, x, 32, sample=False)[0]
        completion = ''.join([pretrain_dataset.itos[int(i)] for i in pred])
        pred = completion.split('⁇')[1]
        predictions.append(pred)
        fout.write(pred + '\n')

total, correct = utils.evaluate_places(eval_corpus_path, predictions)
if total > 0:
    print('Correct: {} out of {}: {}%'.format(correct, total, correct/total*100))
else:
    # Report the file this cell actually wrote (the synthesizer predictions).
    print('Predictions written to {}; no targets provided'
            .format(synthesizer_outputs_path))


number of parameters: 3076988


  cpuset_checked))
epoch 1 iter 7: train loss 0.79474. lr 5.999844e-04: 100%|██████████| 8/8 [00:02<00:00,  3.24it/s]
epoch 2 iter 7: train loss 0.66680. lr 5.999351e-04: 100%|██████████| 8/8 [00:02<00:00,  3.49it/s]
epoch 3 iter 7: train loss 0.59799. lr 5.998521e-04: 100%|██████████| 8/8 [00:02<00:00,  3.46it/s]
epoch 4 iter 7: train loss 0.55612. lr 5.997352e-04: 100%|██████████| 8/8 [00:02<00:00,  3.48it/s]
epoch 5 iter 7: train loss 0.51575. lr 5.995847e-04: 100%|██████████| 8/8 [00:02<00:00,  3.44it/s]
epoch 6 iter 7: train loss 0.49178. lr 5.994004e-04: 100%|██████████| 8/8 [00:02<00:00,  3.46it/s]
epoch 7 iter 7: train loss 0.42224. lr 5.991823e-04: 100%|██████████| 8/8 [00:02<00:00,  3.45it/s]
epoch 8 iter 7: train loss 0.37323. lr 5.989306e-04: 100%|██████████| 8/8 [00:02<00:00,  3.44it/s]
epoch 9 iter 7: train loss 0.31156. lr 5.986453e-04: 100%|██████████| 8/8 [00:02<00:00,  3.49it/s]
epoch 10 iter 7: train loss 0.28890. lr 5.983263e-04: 100%|██████████| 8/8 [00:02<00:00,  

Correct: 66.0 out of 500.0: 13.200000000000001%



