In [1]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [2]:
!pip install torchtext==0.6.0

Collecting torchtext==0.6.0
  Downloading torchtext-0.6.0-py3-none-any.whl.metadata (6.3 kB)
Downloading torchtext-0.6.0-py3-none-any.whl (64 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.2/64.2 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchtext
Successfully installed torchtext-0.6.0


In [None]:
import numpy as np
import torch
import torchtext
from torch import nn
from torch.utils.data import DataLoader, Dataset
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
from torch import Tensor

import evaluate

import io
import os
import random
from collections import Counter
from typing import List, Optional, Any, Type, Union, Tuple
import math
import time
import itertools

from tqdm.notebook import tqdm
from IPython.display import clear_output

import wandb
!wandb login 

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
DEVICE  # bare expression so the notebook displays the selected device

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


device(type='cuda')

In [4]:
# Reserved vocabulary entries; each token's position in this list is used as its index.
SPECIALS = ['<unk>', '<pad>', '<bos>', '<eos>']
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = (
    SPECIALS.index(token) for token in ('<unk>', '<pad>', '<bos>', '<eos>')
)

In [5]:
def build_vocab(filepath, tokenizer, max_size, min_freq):
    """Build a torchtext ``Vocab`` from the token counts of a UTF-8 text file.

    Args:
        filepath: path of the text file; it is read line by line.
        tokenizer: callable applied to every line, returning its tokens.
        max_size: forwarded to ``Vocab`` as ``max_size``.
        min_freq: forwarded to ``Vocab`` as ``min_freq``.

    Returns:
        A ``Vocab`` created with ``SPECIALS`` as its special tokens.
    """
    with io.open(filepath, encoding="utf8") as corpus:
        token_counts = Counter(
            token
            for line in corpus
            for token in tokenizer(line)
        )
    return Vocab(token_counts, specials=SPECIALS, max_size=max_size, min_freq=min_freq)

def data_process(src_data, trg_data, src_vocab, trg_vocab, src_tokenizer, trg_tokenizer, max_len):
    """Convert parallel raw sentences into pairs of token-id tensors.

    Args:
        src_data, trg_data: iterables of raw strings, consumed in lockstep via ``zip``.
        src_vocab, trg_vocab: objects mapping a token to its integer id through ``[]``.
        src_tokenizer, trg_tokenizer: callables turning a string into tokens.
        max_len: each tensor is truncated to ``max_len - 2`` ids.

    Returns:
        A list of ``[src_tensor, trg_tensor]`` pairs of dtype ``torch.long``.
    """
    # Two slots are held back, presumably for the <bos>/<eos> markers
    # that tensor_transform adds later.
    keep = max_len - 2

    def encode(text, vocab, tokenizer):
        ids = [vocab[token] for token in tokenizer(text)]
        return torch.tensor(ids, dtype=torch.long)[:keep]

    return [
        [encode(raw_src, src_vocab, src_tokenizer), encode(raw_trg, trg_vocab, trg_tokenizer)]
        for raw_src, raw_trg in zip(src_data, trg_data)
    ]

def tensor_transform(token_ids):
    """Return ``token_ids`` with ``BOS_IDX`` prepended and ``EOS_IDX`` appended."""
    bos = torch.tensor([BOS_IDX])
    eos = torch.tensor([EOS_IDX])
    return torch.cat((bos, token_ids, eos))

def generate_batch(data_batch):
    """Collate (source, target) tensor pairs into two padded batch tensors.

    Every sequence is wrapped with the begin/end markers by ``tensor_transform``
    and then padded with ``PAD_IDX`` through ``pad_sequence``.

    Returns:
        ``(de_batch, en_batch)`` as produced by ``pad_sequence``.
    """
    wrapped = [
        (tensor_transform(de_item), tensor_transform(en_item))
        for de_item, en_item in data_batch
    ]
    de_sequences = [pair[0] for pair in wrapped]
    en_sequences = [pair[1] for pair in wrapped]
    return (
        pad_sequence(de_sequences, padding_value=PAD_IDX),
        pad_sequence(en_sequences, padding_value=PAD_IDX),
    )


In [6]:
def generate_square_subsequent_mask(size):
    """Return a ``(size, size)`` float mask on ``DEVICE`` for causal attention.

    Entries strictly above the main diagonal are ``-inf``; the diagonal and
    everything below it are ``0.0``.
    """
    blocked = torch.full((size, size), float('-inf'), device=DEVICE)
    # triu with diagonal=1 keeps only the strict upper triangle and zeroes the rest.
    return torch.triu(blocked, diagonal=1)

def create_mask(src, trg):
    """Build the attention and padding masks for a source/target batch.

    The first dimension of ``src`` and ``trg`` is used as the sequence length.

    Returns:
        ``(src_mask, trg_mask, src_padding_mask, trg_padding_mask)`` where
        ``src_mask`` is an all-False boolean square matrix, ``trg_mask`` is the
        causal mask from ``generate_square_subsequent_mask``, and the padding
        masks mark positions equal to ``PAD_IDX`` with dims 0 and 1 swapped.
    """
    src_len, trg_len = src.shape[0], trg.shape[0]

    src_mask = torch.zeros((src_len, src_len), dtype=torch.bool, device=DEVICE)
    trg_mask = generate_square_subsequent_mask(trg_len)

    src_padding_mask = src.eq(PAD_IDX).transpose(0, 1)
    trg_padding_mask = trg.eq(PAD_IDX).transpose(0, 1)

    return src_mask, trg_mask, src_padding_mask, trg_padding_mask


class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position signals to token embeddings, then apply dropout.

    Args:
        emb_size: width of the embedding (last dimension of the input); even
            and odd sizes are both supported.
        dropout: dropout probability applied to the sum.
        maxlen: number of positions precomputed in the table.
    """

    def __init__(
        self,
        emb_size,
        dropout,
        maxlen=5000
    ):
        super(PositionalEncoding, self).__init__()
        den = torch.exp(- torch.arange(0, emb_size, 2) * math.log(10000) / emb_size)
        pos = torch.arange(0, maxlen).reshape(maxlen, 1)
        angles = pos * den  # (maxlen, ceil(emb_size / 2))

        pos_embedding = torch.zeros((maxlen, emb_size))
        pos_embedding[:, 0::2] = torch.sin(angles)
        # For an odd emb_size there is one fewer odd column than even columns,
        # so trim the angles to avoid a shape mismatch (no-op for even sizes).
        pos_embedding[:, 1::2] = torch.cos(angles[:, :emb_size // 2])
        # Insert a singleton axis so the table broadcasts over the middle dimension.
        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        # Registered as a buffer: moves with the module but is not a trainable parameter.
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self, token_embedding):
        """Return ``dropout(token_embedding + positions)``.

        The table is sliced to ``token_embedding.size(0)`` rows, i.e. the first
        dimension of the input is treated as the sequence position.
        """
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0), :])

class Translator(nn.Module):
    """Sequence-to-sequence model: embeddings + positional encoding + ``nn.Transformer``.

    Source and target tokens have separate embedding tables and share one
    ``PositionalEncoding``. A final linear layer maps the transformer output
    to target-vocabulary logits.
    """

    def __init__(
            self,
            num_encoder_layers,
            num_decoder_layers,
            embed_size,
            num_heads,
            src_vocab_size,
            trg_vocab_size,
            dim_feedforward,
            dropout
        ):
        super().__init__()

        # Sub-modules are created in a fixed order so parameter registration
        # (and therefore random initialisation order) stays stable.
        self.src_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.trg_embedding = nn.Embedding(trg_vocab_size, embed_size)

        self.pos_enc = PositionalEncoding(embed_size, dropout)

        transformer_kwargs = dict(
            d_model=embed_size,
            nhead=num_heads,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.transformer = nn.Transformer(**transformer_kwargs)

        self.ff = nn.Linear(embed_size, trg_vocab_size)

        self._init_weights()

    def _init_weights(self):
        """Placeholder hook: currently leaves PyTorch's default initialisation in place."""
        pass

    def forward(self, src, trg, src_mask, trg_mask, src_padding_mask, trg_padding_mask, memory_key_padding_mask):
        """Run the full encoder-decoder and return target-vocabulary logits."""
        embedded_src = self.pos_enc(self.src_embedding(src))
        embedded_trg = self.pos_enc(self.trg_embedding(trg))

        hidden = self.transformer(
            embedded_src,
            embedded_trg,
            src_mask=src_mask,
            tgt_mask=trg_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=trg_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )

        return self.ff(hidden)

    def encode(self, src, src_mask):
        """Embed ``src`` and run only the transformer encoder."""
        embedded_src = self.pos_enc(self.src_embedding(src))
        return self.transformer.encoder(embedded_src, mask=src_mask)

    def decode(self, trg, memory, trg_mask):
        """Embed ``trg`` and run only the transformer decoder against ``memory``.

        The output is the decoder hidden state; ``self.ff`` is not applied here.
        """
        embedded_trg = self.pos_enc(self.trg_embedding(trg))
        return self.transformer.decoder(embedded_trg, memory, tgt_mask=trg_mask)

In [7]:
def greedy_decode(model : Translator, src, src_mask, max_len):
    """Greedily decode one source sentence into target token ids.

    Args:
        model: translator providing ``encode``, ``decode`` and the ``ff`` head.
        src: source token ids; moved to ``DEVICE``. Only a single sentence is
            supported because the chosen token is read with ``.item()``.
        src_mask: encoder attention mask; moved to ``DEVICE``.
        max_len: upper bound on the length of the returned sequence,
            including the leading ``BOS_IDX``.

    Returns:
        A ``(length, 1)`` long tensor on ``DEVICE`` that starts with ``BOS_IDX``
        and ends with ``EOS_IDX`` if that token was produced before the limit.
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)

    # Pure inference: disabling autograd avoids building (and retaining) a
    # computation graph on every decoding step.
    with torch.no_grad():
        memory = model.encode(src, src_mask).to(DEVICE)
        generated = torch.full((1, 1), BOS_IDX, dtype=torch.long, device=DEVICE)

        for _ in range(max_len - 1):
            # Boolean form of the causal mask: True marks positions that may not be attended.
            trg_mask = (generate_square_subsequent_mask(generated.shape[0])).type(torch.bool).to(DEVICE)
            out = model.decode(generated, memory, trg_mask)

            # Sequence is the first axis, so out[-1] is the newest position's state.
            logits = model.ff(out[-1])
            next_word = torch.argmax(logits, dim=1).item()

            next_token = torch.full((1, 1), next_word, dtype=torch.long, device=DEVICE)
            generated = torch.cat([generated, next_token], dim=0)
            if next_word == EOS_IDX:
                break

    return generated

def calc_test(model, test_data, trg_vocab):
    """Translate every source sentence with greedy decoding.

    Args:
        model: translator; switched to evaluation mode by this call.
        test_data: iterable of 1-D source token-id tensors (without begin/end markers).
        trg_vocab: target vocabulary; ``trg_vocab.itos`` maps ids back to tokens.

    Returns:
        A list with one translation string per input sentence. Tokens are joined
        by single spaces and ``<bos>``, ``<eos>`` and ``<unk>`` are left out.
    """
    model.eval()
    # Dropping these tokens before joining avoids the stray leading, trailing
    # and doubled spaces that removing them from the joined string leaves behind.
    hidden_tokens = {"<bos>", "<eos>", "<unk>"}
    result = list()

    # Inference only, so no autograd graph is needed.
    with torch.no_grad():
        for sentence in tqdm(test_data):
            sentence = tensor_transform(sentence).reshape((-1, 1))
            num_tokens = sentence.shape[0]

            src_mask = (torch.zeros(num_tokens, num_tokens)).type(torch.bool)
            # Allow the translation to be a few tokens longer than the source.
            trg_tokens = greedy_decode(model, sentence, src_mask, max_len=num_tokens+5).flatten()

            words = [trg_vocab.itos[i] for i in trg_tokens.tolist()]
            translation = " ".join(word for word in words if word not in hidden_tokens)
            result.append(translation)
    return result

def calc_bleu(result, reference):
    """Compute the corpus BLEU score of ``result`` against ``reference``.

    Args:
        result: list of predicted strings.
        reference: list of reference strings, one per prediction.

    Returns:
        The ``'bleu'`` entry of the dictionary returned by the ``evaluate`` metric.
    """
    metric = evaluate.load('bleu')
    # The metric takes a list of references per prediction; each gets exactly one.
    wrapped_references = [[ref] for ref in reference]
    scores = metric.compute(predictions=result, references=wrapped_references)
    return scores['bleu']


In [8]:
def count_parameters(model: nn.Module):
    """Return the total number of elements across the model's trainable parameters."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

def train(model: Translator,
          data: torch.utils.data.DataLoader,
          optimizer: torch.optim.Optimizer,
          scheduler : torch.optim.lr_scheduler.LRScheduler,
          loss_fn: nn.Module,
          epoch : int):
    """Train the model for one epoch with teacher forcing.

    Args:
        model: translator to optimise; put into training mode.
        data: loader yielding ``(src, trg)`` batches.
        optimizer: optimiser stepped once per batch.
        scheduler: learning-rate scheduler, also stepped once per batch.
        loss_fn: loss applied to flattened logits and flattened targets.
        epoch: epoch number, used only for the progress-bar caption.

    Returns:
        The mean of the per-batch losses over the epoch.
    """
    model.train()
    epoch_loss = 0
    progress_train = tqdm(total=len(data), desc=f"Train epoch {epoch}, Loss 0.0", leave=False)
    try:
        for src, trg in data:
            src, trg = src.to(DEVICE), trg.to(DEVICE)
            # Teacher forcing: the decoder sees trg[:-1] and must predict trg[1:].
            trg_input = trg[:-1]
            src_mask, trg_mask, src_padding_mask, trg_padding_mask = create_mask(src, trg_input)

            optimizer.zero_grad()
            # Call the module itself rather than .forward so registered hooks run.
            logits = model(src, trg_input, src_mask, trg_mask,
                           src_padding_mask, trg_padding_mask, src_padding_mask)

            loss = loss_fn(logits.reshape((-1, logits.shape[-1])), trg[1:].reshape(-1))
            loss.backward()
            optimizer.step()
            scheduler.step()

            batch_loss = loss.item()
            epoch_loss += batch_loss
            progress_train.update()
            progress_train.set_description(f"Train epoch {epoch}, Loss {batch_loss}")
    finally:
        # Always release the progress bar, even if a batch raises.
        progress_train.close()

    return epoch_loss / len(data)


def validate(model: Translator,
             data: torch.utils.data.DataLoader,
             loss_fn: nn.Module,
             epoch : int):
    """Compute the mean loss over a validation loader without updating the model.

    Args:
        model: translator to evaluate; put into evaluation mode.
        data: loader yielding ``(src, trg)`` batches.
        loss_fn: loss applied to flattened logits and flattened targets.
        epoch: epoch number, used only for the progress-bar caption.

    Returns:
        The mean of the per-batch losses.
    """
    model.eval()
    epoch_loss = 0
    progress_val = tqdm(total=len(data), desc=f"Validation epoch {epoch},  Loss 0.0", leave=False)

    try:
        with torch.no_grad():
            for src, trg in data:
                src, trg = src.to(DEVICE), trg.to(DEVICE)
                # Same input/target shift as in training: feed trg[:-1], score against trg[1:].
                trg_input = trg[:-1]
                src_mask, trg_mask, src_padding_mask, trg_padding_mask = create_mask(src, trg_input)
                # Call the module itself rather than .forward so registered hooks run.
                logits = model(src, trg_input, src_mask, trg_mask,
                               src_padding_mask, trg_padding_mask, src_padding_mask)
                loss = loss_fn(logits.reshape((-1, logits.shape[-1])), trg[1:].reshape(-1))

                batch_loss = loss.item()
                epoch_loss += batch_loss
                progress_val.update()
                progress_val.set_description(f"Validation epoch {epoch}, Loss {batch_loss}")
    finally:
        # Always release the progress bar, even if a batch raises.
        progress_val.close()

    return epoch_loss / len(data)


def train_model(config,
                train_data, val_data, de_vocab, en_vocab,
                test_de_tokens, test_en_text,
                wandb_run,
                model_state_dict=None):
    """Build a ``Translator``, train it and log every epoch to Weights & Biases.

    Args:
        config: dictionary with a ``'train'`` section (``batch_size``, ``lr``,
            ``betas``, ``weight_decay``, ``n_epoch``, ``warmup_epochs``) and a
            ``'model'`` section holding the ``Translator`` constructor arguments.
            ``config['model']['count_parameters']`` is written by this function.
        train_data, val_data: lists of ``[src_tensor, trg_tensor]`` pairs.
        de_vocab, en_vocab: source / target vocabularies; their lengths size the model.
        test_de_tokens: source token-id tensors translated after every epoch.
        test_en_text: reference strings for those translations (BLEU).
        wandb_run: W&B run used for config updates, metric logging and file uploads.
        model_state_dict: optional state dict loaded into the fresh model.

    Returns:
        The model as it is after the final epoch (not necessarily the best checkpoint).

    Side effects:
        Writes ``best.pt`` and ``last.pt`` to the working directory and hands them to W&B.
    """
    train_config = config['train']
    model_config = config['model']

    dl_train = DataLoader(train_data, batch_size=train_config['batch_size'], shuffle=True, collate_fn=generate_batch)
    dl_valid = DataLoader(val_data, batch_size=train_config['batch_size'], shuffle=False, collate_fn=generate_batch)

    model = Translator(
        num_encoder_layers=model_config['num_encoder_layers'],
        num_decoder_layers=model_config['num_decoder_layers'],
        embed_size=model_config['embed_size'],
        num_heads=model_config['num_heads'],
        src_vocab_size=len(de_vocab),
        trg_vocab_size=len(en_vocab),
        dim_feedforward=model_config['dim_feedforward'],
        dropout=model_config['dropout'],
    ).to(DEVICE)

    model_config['count_parameters'] = count_parameters(model)
    # Update the config of the run that was passed in, not whatever run the
    # global wandb module currently considers active.
    wandb_run.config.update({'model' : model_config}, allow_val_change=True)

    if model_state_dict is not None:
        model.load_state_dict(model_state_dict)

    # Padding positions do not contribute to the loss.
    loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
    optimizer = torch.optim.AdamW(model.parameters(),
                                  lr=train_config['lr'],
                                  betas=train_config['betas'],
                                  weight_decay=train_config['weight_decay'])

    # One-cycle schedule over all optimiser steps; it is stepped per batch inside
    # train(), and the warm-up share is the warm-up epochs over the total epochs.
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=train_config['lr'],
                                                    steps_per_epoch=len(dl_train),
                                                    epochs=train_config['n_epoch'],
                                                    pct_start=train_config['warmup_epochs']/train_config['n_epoch'])

    best_valid_loss = float('inf')
    for epoch in range(train_config['n_epoch']):
        train_loss = train(model, dl_train, optimizer, scheduler, loss_fn, epoch)
        valid_loss = validate(model, dl_valid, loss_fn, epoch)

        translation = calc_test(model, test_de_tokens, en_vocab)
        bleu = calc_bleu(translation, test_en_text)

        # Keep a separate checkpoint for the lowest validation loss seen so far.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), "best.pt")
            wandb_run.save("best.pt")

        torch.save(model.state_dict(), "last.pt")
        wandb_run.save("last.pt")

        print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')
        print(f'\t Test BLEU: {bleu * 100:.3f}')

        wandb_run.log({
            "Learning rate" : scheduler.get_last_lr()[-1],
            "Train PPL": math.exp(train_loss),
            "Train Loss": train_loss,
            "Valid PPL": math.exp(valid_loss),
            "Valid Loss": valid_loss,
            "Test BLEU": bleu * 100,
        })

    return model

In [9]:
# wandb.finish()

In [None]:
# Hyper-parameters for the whole experiment; the same dictionary is sent to wandb.init.
config = {
    'train' : {
        'batch_size' : 200,  # used for both the training and the validation DataLoader
        'val_ratio' : 0.05,  # share of the combined train+val corpus held out for validation
        'calc_bleu_size' : 500,  # number of validation pairs translated each epoch for BLEU
        'n_epoch' : 60,
        'lr' : 2e-3,  # AdamW learning rate and OneCycleLR max_lr
        'betas' : (0.9, 0.999), 
        'weight_decay' : 0.1,
        'warmup_epochs' : 5,  # divided by n_epoch to obtain OneCycleLR's pct_start
    },
    'vocab' : {
        'vocab_size' : 100000,  # passed to Vocab as max_size
        'max_seq_len' : 1000,  # sentences are truncated to max_seq_len - 2 token ids
        'min_freq' : 4,  # passed to Vocab as min_freq
    },
    # Constructor arguments of Translator (vocabulary sizes are added from the vocabs).
    'model' : {
        'num_encoder_layers' : 5,
        'num_decoder_layers' : 5,
        'embed_size' : 200,
        'num_heads' : 5,
        'dim_feedforward' : 256,
        'dropout' : 0.1,
    },
} 

def _read_lines(path):
    """Read a UTF-8 text file into a list of lines (newlines kept) and close it."""
    with io.open(path, encoding="utf8") as handle:
        return list(handle)


# --- Experiment setup -------------------------------------------------------
wandb_run = wandb.init(name="Full train 8 no init larger",
                       group="Full train",
                       project="DL LHW 2",
                       config=config)

data_dir = '/kaggle/input/bdz2-data'
train_filepaths = [f'{data_dir}/train.de-en.de', f'{data_dir}/train.de-en.en']
val_filepaths = [f'{data_dir}/val.de-en.de', f'{data_dir}/val.de-en.en']
# dup: the .de file is listed twice; only index 0 is read below.
test_filepaths = [f'{data_dir}/test1.de-en.de', f'{data_dir}/test1.de-en.de']

de_tokenizer = get_tokenizer(None)
en_tokenizer = get_tokenizer(None)

# Vocabularies are built from the training files only.
vocab_config = config['vocab']
de_vocab = build_vocab(train_filepaths[0], de_tokenizer, max_size=vocab_config['vocab_size'], min_freq=vocab_config['min_freq'])
en_vocab = build_vocab(train_filepaths[1], en_tokenizer, max_size=vocab_config['vocab_size'], min_freq=vocab_config['min_freq'])

# --- Data -------------------------------------------------------------------
# The provided train and val files are concatenated and re-split below.
# Files are read through a helper so every handle is closed.
full_de_text = _read_lines(train_filepaths[0]) + _read_lines(val_filepaths[0])
full_en_text = _read_lines(train_filepaths[1]) + _read_lines(val_filepaths[1])

all_data = data_process(full_de_text, full_en_text, de_vocab, en_vocab, de_tokenizer, en_tokenizer, vocab_config['max_seq_len'])

train_config = config['train']
valid_size = int(len(all_data) * train_config['val_ratio'])
# The first valid_size pairs form the validation set, the rest is used for training.
train_data, val_data = all_data[valid_size:], all_data[:valid_size]

valid_en_text = full_en_text[:valid_size]

# --- Training ---------------------------------------------------------------
# Per-epoch BLEU is computed on the first calc_bleu_size validation pairs.
model = train_model(config,
                    train_data, val_data, de_vocab, en_vocab,
                    [i[0] for i in val_data[:train_config['calc_bleu_size']]], valid_en_text[:train_config['calc_bleu_size']],
                    wandb_run=wandb_run)

# --- Test-set translation ---------------------------------------------------
test_de_text = _read_lines(test_filepaths[0])
# data_process needs a target side; the German text is passed twice and only the source tensors are kept.
test_data = [i[0] for i in data_process(test_de_text, test_de_text, de_vocab, en_vocab, de_tokenizer, en_tokenizer, vocab_config['max_seq_len'])]

translation = calc_test(model, test_data, en_vocab)

# Write with an explicit encoding (matching the UTF-8 inputs) and close the file before uploading it.
with open('translation.en', 'w', encoding='utf8') as translation_file:
    translation_file.write('\n'.join(translation))
    translation_file.write('\n')
wandb_run.save('translation.en')
wandb_run.finish()