# BERT

Training a BERT-style masked language model built on a shared (common) transformer implementation.

## Import Modules

In [1]:
# =============================================================================
# Libs
# =============================================================================
# Standard Library Imports
import random
import math
import re
from collections import Counter
from os.path import exists

# Third-Party Imports
import numpy as np
import torch
from torch.utils.data import Dataset
import torch.nn.functional as F
import torch.optim as optim
import torch.nn as nn

# Local Imports
!git clone https://github.com/Shilpaj1994/ERA.git
# Import files from the downloaded repository
sys.path.insert(0,'./ERA/Session17/')

## DataSet

In [2]:
# =============================================================================
# Dataset
# =============================================================================
class SentencesDataset(Dataset):
    """Masked-language-model dataset over pre-tokenized sentences.

    Every item is a window of exactly ``seq_len`` token ids, built by
    concatenating consecutive sentences starting at ``index`` (wrapping around
    the end of the corpus). Each token is replaced by ``<mask>`` with
    probability ``p_random_mask``; the target holds the original id at masked
    positions and ``<ignore>`` everywhere else.
    """

    #Init dataset
    def __init__(self, sentences, vocab, seq_len):
        """Build the word <-> id tables and remember the window length.

        Args:
            sentences: sequence of sentences, each a list of word strings.
            vocab: iterable of known words; the special tags ``<ignore>``,
                ``<oov>`` and ``<mask>`` are appended to it.
            seq_len: number of token ids in every returned sample.
        """
        dataset = self

        dataset.sentences = sentences
        # Drop duplicate entries (keeping first occurrences) so that ids are
        # contiguous in [0, len(vocab)). With duplicates the ids would have
        # gaps and the largest id would exceed len(vocab), which callers use
        # as the embedding table size.
        words = list(dict.fromkeys(list(vocab) + ['<ignore>', '<oov>', '<mask>']))
        dataset.vocab = {e:i for i, e in enumerate(words)}
        dataset.rvocab = {v:k for k,v in dataset.vocab.items()}
        dataset.seq_len = seq_len

        #special tags
        dataset.IGNORE_IDX = dataset.vocab['<ignore>'] #replacement tag for tokens to ignore
        dataset.OUT_OF_VOCAB_IDX = dataset.vocab['<oov>'] #replacement tag for unknown words
        dataset.MASK_IDX = dataset.vocab['<mask>'] #replacement tag for the masked word prediction task


    #fetch data
    def __getitem__(self, index, p_random_mask=0.15):
        """Return one randomly masked training sample.

        Args:
            index: index of the first sentence of the window.
            p_random_mask: per-token probability of being masked.

        Returns:
            dict with two ``torch.long`` tensors of length ``seq_len``:
            ``'input'`` (ids, with masked positions set to ``MASK_IDX``) and
            ``'target'`` (original id at masked positions, ``IGNORE_IDX``
            elsewhere).

        Raises:
            ValueError: if the corpus holds no tokens at all, in which case a
                window can never be filled.
        """
        dataset = self

        n_sentences = len(dataset)
        if n_sentences == 0:
            raise ValueError('cannot sample from a dataset without sentences')

        #keep appending consecutive sentences until the window is full
        tokens = []
        visited = 0
        while len(tokens) < dataset.seq_len:
            tokens.extend(dataset.get_sentence_idx(index % n_sentences))
            index += 1
            visited += 1
            # A full pass over the corpus that yields nothing means every
            # sentence is empty; without this guard the loop never ends.
            if visited >= n_sentences and not tokens:
                raise ValueError('cannot sample from a dataset whose sentences are all empty')

        #the loop guarantees at least seq_len tokens, so truncation is the
        #only adjustment needed (no padding can ever be required)
        tokens = tokens[:dataset.seq_len]

        #apply random mask (one random draw per token, in order)
        inputs, targets = [], []
        for w in tokens:
            if random.random() < p_random_mask:
                inputs.append(dataset.MASK_IDX)
                targets.append(w)
            else:
                inputs.append(w)
                targets.append(dataset.IGNORE_IDX)

        # Build integer tensors directly rather than converting through the
        # default float type.
        return {'input': torch.tensor(inputs, dtype=torch.long),
                'target': torch.tensor(targets, dtype=torch.long)}

    #return length
    def __len__(self):
        """Number of sentences in the corpus."""
        return len(self.sentences)

    #get words id
    def get_sentence_idx(self, index):
        """Map the words of sentence ``index`` to ids (``OUT_OF_VOCAB_IDX`` for unknown words)."""
        dataset = self
        sentence = dataset.sentences[index]
        return [dataset.vocab.get(w, dataset.OUT_OF_VOCAB_IDX) for w in sentence]


In [3]:
# =============================================================================
# Methods / Class
# =============================================================================
def get_batch(loader, loader_iter):
    """Fetch the next batch, restarting the loader once it is exhausted.

    Args:
        loader: iterable that can be re-iterated with ``iter(loader)``.
        loader_iter: iterator currently being consumed.

    Returns:
        ``(batch, iterator)`` where ``iterator`` is either ``loader_iter`` or
        a fresh iterator over ``loader`` if the previous one had run out.
    """
    exhausted = object()  # unique marker that no real batch can be identical to
    batch = next(loader_iter, exhausted)
    if batch is exhausted:
        # Start a new pass over the loader and take its first batch.
        loader_iter = iter(loader)
        batch = next(loader_iter)
    return batch, loader_iter

## Parameters

In [4]:
# =============================================================================
# #Init
# =============================================================================
print('initializing..')

# Reproducibility: the notebook draws random numbers for token masking
# (`random`), loader shuffling and weight init (`torch`); seed every generator
# up front so a fresh "Restart & Run All" starts from the same RNG state.
# NOTE(review): GPU kernels may still introduce run-to-run differences.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Model / data dimensions
batch_size = 1024 # B: samples per optimisation step
seq_len = 20      # T: tokens per sample
embed_size = 128  # C: embedding width
inner_ff_size = embed_size * 4  # feed-forward width passed to each encoder layer
n_heads = 8       # head count passed to each encoder layer
n_code = 8        # number of stacked encoder layers
n_vocab = 40000   # keep at most this many of the most frequent words
dropout = 0.1
# n_workers = 12

#optimizer
optim_kwargs = {'lr':1e-4, 'weight_decay':1e-4, 'betas':(.9,.999)}

initializing..


## DataSet Creation

In [5]:
# =============================================================================
# Input
# =============================================================================
#1) load text
print('loading text...')
pth = './ERA/Session17/datasets/BERT/training.txt'
# Context manager closes the handle deterministically; explicit encoding keeps
# the result independent of the platform default.
with open(pth, encoding='utf-8') as text_file:
    sentences = text_file.read().lower().split('\n')

#2) tokenize sentences (can be done during training, you can also use spacy udpipe)
print('tokenizing sentences...')
special_chars = ',?;.:/*!+-()[]{}"\'&'
# Surround every special character with spaces so it becomes its own token.
# The replacement is a raw string: `\g<0>` is a regex group reference, not a
# Python string escape.
special_char_re = re.compile(f'[{re.escape(special_chars)}]')
sentences = [special_char_re.sub(r' \g<0> ', s).split(' ') for s in sentences]
# Splitting on single spaces leaves empty strings behind; drop them.
sentences = [[w for w in s if len(w)] for s in sentences]

#3) create vocab if not already created
print('creating/loading vocab...')
pth = './ERA/Session17/datasets/BERT/vocab.txt'
if not exists(pth):
    words = [w for s in sentences for w in s]
    vocab = Counter(words).most_common(n_vocab) #keep the N most frequent words
    vocab = [w[0] for w in vocab]
    with open(pth, 'w', encoding='utf-8') as vocab_file:
        vocab_file.write('\n'.join(vocab))
else:
    with open(pth, encoding='utf-8') as vocab_file:
        vocab = vocab_file.read().split('\n')

#4) create dataset
print('creating dataset...')
dataset = SentencesDataset(sentences, vocab, seq_len)
# Pinned host memory only helps host-to-GPU copies, so request it only when a
# GPU is present. Add 'num_workers' here to load batches in parallel.
kwargs = {'shuffle':True,  'drop_last':True, 'pin_memory':torch.cuda.is_available(), 'batch_size':batch_size}
data_loader = torch.utils.data.DataLoader(dataset, **kwargs)

loading text...
tokenizing sentences...
creating/loading vocab...
creating dataset...


## Model

In [6]:
from super_repo.transformer import PositionalEmbedding, EncoderLayer

class BERT(nn.Module):
    """Encoder-only language model.

    Pipeline: token embedding + positional embedding -> ``n_code`` stacked
    ``EncoderLayer`` modules -> ``LayerNorm`` -> bias-free linear projection
    with ``n_embeddings`` output features.
    """

    def __init__(self, n_code, n_heads, embed_size, inner_ff_size, n_embeddings, seq_len, dropout=.1):
        """
        Args:
            n_code: number of encoder layers to stack.
            n_heads: forwarded to every ``EncoderLayer``.
            embed_size: embedding width (also the ``LayerNorm`` size).
            inner_ff_size: forwarded to every ``EncoderLayer``.
            n_embeddings: size of the embedding table and of the output layer.
            seq_len: forwarded to ``PositionalEmbedding`` and ``EncoderLayer``.
            dropout: forwarded to every ``EncoderLayer``.
        """
        super().__init__()

        # Input representation
        self.embeddings = nn.Embedding(n_embeddings, embed_size)
        self.pe = PositionalEmbedding(embed_size, seq_len)

        # Backbone: identical encoder layers applied one after another
        self.encoders = nn.ModuleList([
            EncoderLayer(n_heads, embed_size, inner_ff_size, seq_len, dropout)
            for _ in range(n_code)
        ])

        # Language-model head
        self.norm = nn.LayerNorm(embed_size)
        self.linear = nn.Linear(embed_size, n_embeddings, bias=False)

    def forward(self, x):
        """Map token ids ``x`` to per-position scores over the ``n_embeddings`` entries.

        NOTE(review): presumably ``x`` is a (batch, seq_len) tensor of ids —
        confirm against ``PositionalEmbedding``/``EncoderLayer``.
        """
        hidden = self.embeddings(x)
        hidden = hidden + self.pe(hidden)
        for layer in self.encoders:
            hidden = layer(hidden)
        return self.linear(self.norm(hidden))


# =============================================================================
# Model
# =============================================================================
#init model
print('initializing model...')
# Use the GPU when one is available and fall back to the CPU otherwise, instead
# of crashing on `.cuda()` on CPU-only hosts.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BERT(n_code, n_heads, embed_size, inner_ff_size, len(dataset.vocab), seq_len, dropout)
model = model.to(device)

initializing model...


## Train Model

In [7]:
# =============================================================================
# Optimizer
# =============================================================================
print('initializing optimizer and loss...')
optimizer = optim.Adam(model.parameters(), **optim_kwargs)
# Positions whose target is IGNORE_IDX (i.e. unmasked tokens) do not contribute
# to the loss.
loss_model = nn.CrossEntropyLoss(ignore_index=dataset.IGNORE_IDX)

# =============================================================================
# Train
# =============================================================================
print('training...')
print_each = 10
model.train()
# Send batches to whatever device the model lives on rather than hard-coding
# CUDA, so the loop also runs on CPU-only hosts.
device = next(model.parameters()).device
batch_iter = iter(data_loader)
n_iteration = 10_000
for it in range(n_iteration):

    #get batch (the loader is restarted transparently when it runs out)
    batch, batch_iter = get_batch(data_loader, batch_iter)

    #infer
    masked_input = batch['input'].to(device, non_blocking=True)
    masked_target = batch['target'].to(device, non_blocking=True)
    output = model(masked_input)

    #compute the cross entropy loss
    # Flatten every leading dimension: one row of scores per token position and
    # one target id per row. reshape(-1) always yields a 1-D tensor, whereas
    # view(-1, 1).squeeze() collapses to 0-D when there is a single token.
    output_v = output.reshape(-1, output.shape[-1])
    target_v = masked_target.reshape(-1)
    loss = loss_model(output_v, target_v)

    #compute gradients
    loss.backward()

    #apply gradients
    optimizer.step()

    #print step
    # The value labelled 'Δw' is the summed absolute gradient of the embedding
    # table, so it must be read before the gradients are reset below.
    if it % print_each == 0:
        print('it:', it,
              ' | loss', np.round(loss.item(),2),
              ' | Δw:', round(model.embeddings.weight.grad.abs().sum().item(),3))

    #reset gradients
    optimizer.zero_grad()

initializing optimizer and loss...
training...
it: 0  | loss 10.31  | Δw: 1.36
it: 10  | loss 9.52  | Δw: 0.69
it: 20  | loss 9.29  | Δw: 0.442
it: 30  | loss 9.1  | Δw: 0.353
it: 40  | loss 8.93  | Δw: 0.291
it: 50  | loss 8.75  | Δw: 0.255
it: 60  | loss 8.63  | Δw: 0.225
it: 70  | loss 8.5  | Δw: 0.203
it: 80  | loss 8.35  | Δw: 0.201
it: 90  | loss 8.15  | Δw: 0.189
it: 100  | loss 8.06  | Δw: 0.179
it: 110  | loss 7.83  | Δw: 0.172
it: 120  | loss 7.75  | Δw: 0.17
it: 130  | loss 7.63  | Δw: 0.167
it: 140  | loss 7.52  | Δw: 0.16
it: 150  | loss 7.42  | Δw: 0.157
it: 160  | loss 7.29  | Δw: 0.152
it: 170  | loss 7.17  | Δw: 0.151
it: 180  | loss 7.05  | Δw: 0.144
it: 190  | loss 6.98  | Δw: 0.148
it: 200  | loss 6.88  | Δw: 0.143
it: 210  | loss 6.84  | Δw: 0.143
it: 220  | loss 6.78  | Δw: 0.153
it: 230  | loss 6.7  | Δw: 0.149
it: 240  | loss 6.74  | Δw: 0.155
it: 250  | loss 6.75  | Δw: 0.167
it: 260  | loss 6.49  | Δw: 0.163
it: 270  | loss 6.52  | Δw: 0.167
it: 280  | loss 6.

## Analysis

In [8]:
# =============================================================================
# Results analysis
# =============================================================================
print('saving embeddings...')
# Export at most the first 3000 embeddings, capped at the vocabulary size so
# the id -> word lookup below cannot fail on small vocabularies.
N = min(3000, len(dataset.rvocab))
embedding_rows = model.embeddings.weight.detach().cpu().numpy()[0:N]
np.savetxt('values.tsv', np.round(embedding_rows, 2), delimiter='\t', fmt='%1.2f')
# One word per line, in the same row order as values.tsv.
s = [dataset.rvocab[i] for i in range(N)]
with open('names.tsv', 'w', encoding='utf-8') as names_file:
    names_file.write('\n'.join(s))

saving embeddings...
end


## Save Model

In [9]:
# Persist only the model's state dict (not the module object itself).
model_weights = model.state_dict()
torch.save(model_weights, 'BERT.pth')