In [2]:
import torch
import torch.nn as nn
import torch.functional as F
import torch.optim as optim
import spacy
import numpy as np
import pandas as pd
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset, DataLoader
import os
import matplotlib.pyplot as plt
from tqdm import tqdm
import random

In [3]:
!python -m spacy download en_core_web_sm
!python -m spacy download fr_core_news_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[K     |████████████████████████████████| 12.8 MB 26.2 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting fr-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.4.0/fr_core_news_sm-3.4.0-py3-none-any.whl (16.3 MB)
[K     |████████████████████████████████| 16.3 MB 84.8 MB/s 
Installing collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.4.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')


In [4]:
# Load the spaCy English and French pipelines once at module level; the
# EngVocabulary / FrVocabulary tokenizers below read these globals.
spacy_en = spacy.load('en_core_web_sm')
spacy_fr = spacy.load('fr_core_news_sm')


class Vocabulary:
    """Bidirectional token <-> index mapping seeded with four special tokens.

    Indices 0-3 are reserved for ``<PAD>``, ``<SOS>``, ``<EOS>`` and ``<UNK>``.
    Subclasses supply the actual ``tokenizer``; the base implementation returns
    an empty token list, so a plain ``Vocabulary`` never learns any word.

    Attributes:
        itos: dict mapping index -> token.
        stoi: dict mapping token -> index.
        frequency_threshold: a token is added once it has been seen exactly
            this many times during ``build_vocabulary``.
    """

    def __init__(self, frequency_threshold):
        self.itos = {
            0: '<PAD>',
            1: '<SOS>',
            2: '<EOS>',
            3: '<UNK>'
        }
        # Derive the inverse mapping so the two dicts cannot drift apart.
        self.stoi = {token: index for index, token in self.itos.items()}

        self.frequency_threshold = frequency_threshold

    def __len__(self):
        """Return the number of known tokens (special tokens included)."""
        return len(self.itos)

    def __getitem__(self, value):
        """Look up a token by ``int`` index, or an index by token string.

        Raises:
            KeyError: if the index or the token is not in the vocabulary.
        """
        if isinstance(value, int):
            return self.itos[value]
        return self.stoi[value]

    @staticmethod
    def tokenizer(text):
        """Split ``text`` into tokens. Placeholder: always returns ``[]``."""
        return []

    def build_vocabulary(self, sentence_list):
        """Add every token that reaches ``frequency_threshold`` occurrences.

        Tokens receive indices in the order in which they reach the threshold.
        Frequencies are counted per call. The method can be called again to
        extend the vocabulary: new tokens continue after the current last
        index, and tokens that are already known (including the special
        tokens) keep their existing index.

        Args:
            sentence_list: iterable of raw sentences passed to ``tokenizer``.
        """
        frequencies = {}
        # Continue after the last used index instead of a hard-coded 4 so that
        # a repeated call cannot overwrite previously assigned entries.
        idx = len(self.itos)
        for sentence in sentence_list:
            for word in self.tokenizer(sentence):
                frequencies[word] = frequencies.get(word, 0) + 1
                if frequencies[word] != self.frequency_threshold:
                    continue
                if word in self.stoi:
                    # Already known (e.g. a corpus token that spells a special
                    # token): never reassign its index.
                    continue
                self.itos[idx] = word
                self.stoi[word] = idx
                idx += 1

    def numericalize(self, text):
        """Tokenize ``text`` and map each token to its index (``<UNK>`` if unseen)."""
        unk_index = self.stoi['<UNK>']
        return [self.stoi.get(token, unk_index) for token in self.tokenizer(text)]

    def un_numericalize(self, encoding):
        """Map a sequence of indices back to a space-joined string.

        Args:
            encoding: iterable of indices; each element may be a plain ``int``
                or a single-element tensor (anything ``int()`` accepts).

        Returns:
            The tokens joined by single spaces; unknown indices become ``<UNK>``.
        """
        unk_token = self.itos[self.stoi['<UNK>']]
        return " ".join(
            self.itos.get(int(token), unk_token)
            for token in encoding
        )


class EngVocabulary(Vocabulary):
    """Vocabulary that tokenizes with the module-level ``spacy_en`` pipeline."""

    def __init__(self, frequency_threshold):
        super().__init__(frequency_threshold)

    @staticmethod
    def tokenizer(text):
        """Return the lower-cased text of every token ``spacy_en.tokenizer`` yields."""
        lowered_tokens = []
        for token in spacy_en.tokenizer(text):
            lowered_tokens.append(token.text.lower())
        return lowered_tokens


class FrVocabulary(Vocabulary):
    """Vocabulary that tokenizes with the module-level ``spacy_fr`` pipeline."""

    def __init__(self, frequency_threshold):
        super().__init__(frequency_threshold)

    @staticmethod
    def tokenizer(text):
        """Return the lower-cased text of every token ``spacy_fr.tokenizer`` yields."""
        lowered_tokens = []
        for token in spacy_fr.tokenizer(text):
            lowered_tokens.append(token.text.lower())
        return lowered_tokens

In [5]:
class CustomDataset(Dataset):
    """Parallel English/French sentence dataset read from two text files.

    ``root_dir`` must contain ``english.txt`` and ``french.txt`` with one
    sentence per line; line *i* of one file is paired with line *i* of the
    other. Each item is returned as ``(french_ids, english_ids)``, two 1-D
    tensors of token indices wrapped in ``<SOS>`` ... ``<EOS>``.

    Args:
        root_dir: directory holding the two text files.
        frequency_threshold_en: threshold for the English vocabulary
            (only used when ``vocab`` is None).
        frequency_threshold_fr: threshold for the French vocabulary
            (only used when ``vocab`` is None).
        vocab: optional pre-built ``(english_vocab, french_vocab)`` pair to
            reuse, e.g. the training vocabularies for a validation split.

    Raises:
        ValueError: if the two files do not contain the same number of lines.
    """

    def __init__(self, root_dir, frequency_threshold_en=2, frequency_threshold_fr=1, vocab=None):
        super(CustomDataset, self).__init__()
        self.root_dir = root_dir
        self.english = self._read_lines(os.path.join(root_dir, "english.txt"))
        self.french = self._read_lines(os.path.join(root_dir, "french.txt"))

        # Items are paired by line index, so a length mismatch means the
        # corpora are misaligned; fail early instead of at some random index.
        if len(self.english) != len(self.french):
            raise ValueError(
                f"english.txt has {len(self.english)} lines but "
                f"french.txt has {len(self.french)} lines"
            )

        if vocab is None:
            self.vocab_en = EngVocabulary(frequency_threshold_en)
            self.vocab_fr = FrVocabulary(frequency_threshold_fr)
            self.vocab_en.build_vocabulary(self.english)
            self.vocab_fr.build_vocabulary(self.french)
        else:
            self.vocab_en = vocab[0]
            self.vocab_fr = vocab[1]

    @staticmethod
    def _read_lines(path):
        """Read ``path`` and return its lines without the terminating newline."""
        # NOTE(review): assumes the corpus files are UTF-8 encoded - confirm.
        # Being explicit avoids depending on the platform default encoding.
        with open(path, encoding="utf-8") as handle:
            lines = handle.read().split("\n")
        # A file ending in "\n" produces one trailing empty string; drop only
        # that, so a final sentence without a trailing newline is kept.
        if lines and lines[-1] == "":
            lines.pop()
        return lines

    @staticmethod
    def _encode(vocabulary, sentence):
        """Return ``<SOS>`` + numericalized ``sentence`` + ``<EOS>`` as a tensor."""
        indices = [vocabulary.stoi['<SOS>']]
        indices += vocabulary.numericalize(sentence)
        indices.append(vocabulary.stoi['<EOS>'])
        return torch.tensor(indices)

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.english)

    def __getitem__(self, index):
        """Return ``(french_ids, english_ids)`` for the pair at ``index``."""
        numericalized_en = self._encode(self.vocab_en, self.english[index])
        numericalized_fr = self._encode(self.vocab_fr, self.french[index])
        return numericalized_fr, numericalized_en


class MyCollate:
    """Collate function that pads a batch of ``(french, english)`` tensor pairs.

    Each side is padded with its own pad index via ``pad_sequence`` using the
    default layout, so the sequence axis comes first in both results.
    """

    def __init__(self, pad_idx_fr, pad_idx_en):
        self.pad_idx_fr, self.pad_idx_en = pad_idx_fr, pad_idx_en

    def __call__(self, batch):
        """Return ``(padded_french, padded_english)`` for a list of pairs."""
        french_seqs, english_seqs = [], []
        for pair in batch:
            french_seqs.append(pair[0])
            english_seqs.append(pair[1])
        padded_fr = pad_sequence(french_seqs, padding_value=self.pad_idx_fr)
        padded_en = pad_sequence(english_seqs, padding_value=self.pad_idx_en)
        return padded_fr, padded_en


def get_loader(root_dir, batch_size, shuffle, vocab=None):
    """Build a ``CustomDataset`` for ``root_dir`` and a padding ``DataLoader`` over it.

    Args:
        root_dir: directory passed to ``CustomDataset``.
        batch_size: number of sentence pairs per batch.
        shuffle: whether the loader reshuffles every epoch.
        vocab: optional vocabulary pair forwarded to ``CustomDataset``.

    Returns:
        ``(dataset, loader)``.
    """
    dataset = CustomDataset(root_dir, vocab=vocab)
    collate = MyCollate(
        dataset.vocab_fr.stoi['<PAD>'],
        dataset.vocab_en.stoi['<PAD>'],
    )
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate)
    return dataset, loader

In [6]:
# Download and unpack the dataset archive only when no "data" entry exists in
# the current working directory, so re-running the cell skips the download.
# NOTE(review): presumably the archive unpacks into ./data (later cells read
# data/train and data/val) - confirm against the zip contents.
if not os.path.exists(os.path.join(os.getcwd(), "data")):
  !wget https://d2beiqkhq929f0.cloudfront.net/public_assets/assets/000/019/853/original/data.zip

  import zipfile
  # extractall() without a path argument unpacks into the current working directory.
  with zipfile.ZipFile("data.zip", 'r') as zip_ref:
      zip_ref.extractall()

--2022-11-19 04:33:38--  https://d2beiqkhq929f0.cloudfront.net/public_assets/assets/000/019/853/original/data.zip
Resolving d2beiqkhq929f0.cloudfront.net (d2beiqkhq929f0.cloudfront.net)... 13.33.28.35, 13.33.28.145, 13.33.28.138, ...
Connecting to d2beiqkhq929f0.cloudfront.net (d2beiqkhq929f0.cloudfront.net)|13.33.28.35|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1269302 (1.2M) [application/zip]
Saving to: ‘data.zip’


2022-11-19 04:33:39 (3.32 MB/s) - ‘data.zip’ saved [1269302/1269302]



In [7]:
class PositionalEncoding(nn.Module):
  """Fixed sinusoidal positional encoding.

  Even embedding columns hold ``sin(pos / 10000^(i / embed_size))`` and odd
  columns hold the matching ``cos`` term, where ``i`` runs over the even
  column indices.

  The table is computed once at construction and stored as a non-persistent
  buffer: it follows the module across ``.to(...)`` calls but is not written
  to ``state_dict``, so checkpoints keep the same keys as before.

  Args:
    embeds: embedding size (odd sizes are supported).
    max_len: longest sequence length that can be encoded.
    device: device on which the table is initially created.
  """

  def __init__(self, embeds, max_len, device):
    super(PositionalEncoding, self).__init__()
    self.embed_size = embeds
    self.max_len = max_len
    self.device = device

    pos = torch.arange(0, max_len, device=device).float().unsqueeze(dim=1)
    i = torch.arange(0, embeds, step=2, device=device).float()
    # [max_len, ceil(embeds / 2)]: one angle per (position, even column).
    angles = pos / (10000 ** (i / embeds))

    encoding = torch.zeros(max_len, embeds, device=device)
    encoding[:, 0::2] = torch.sin(angles)
    # For an odd embed size there is one fewer odd column than even columns,
    # so trim the angle table to fit.
    encoding[:, 1::2] = torch.cos(angles[:, :embeds // 2])
    self.register_buffer("encoding", encoding, persistent=False)

  def forward(self, x):
    """Return the encoding for the positions of ``x``.

    Args:
      x: 2-D tensor unpacked as ``(batch_size, seq_len)``; only its shape is
        used, never its values.

    Returns:
      Tensor of shape ``(batch_size, seq_len, embed_size)``. It is an expanded
      view of the cached table, so callers must not modify it in place.

    Raises:
      ValueError: if ``seq_len`` exceeds ``max_len``.
    """
    batch_size, seq_len = x.size()
    if seq_len > self.max_len:
      raise ValueError(
          f"sequence length {seq_len} exceeds max_len {self.max_len}"
      )
    return self.encoding[:seq_len, :].expand(batch_size, seq_len, self.embed_size)

In [20]:
class Transformer(nn.Module):
  """Sequence-to-sequence model built on ``nn.Transformer``.

  Source and target token indices are embedded, summed with a positional
  encoding, passed through ``nn.Transformer`` and projected onto the target
  vocabulary. All sequence tensors use the ``[seq_len, batch]`` layout.

  Args:
    embed_size: embedding / model dimension.
    src_vocab_size: number of source tokens.
    trg_vocab_size: number of target tokens (size of the output projection).
    src_pad_idx: padding index in the source vocabulary, used to mask padding.
    num_heads: number of attention heads.
    num_encoder_layers: number of encoder layers.
    num_decoder_layers: number of decoder layers.
    forward_expansion: multiplier applied to ``embed_size`` to obtain the
      feed-forward hidden width.
    dropout: dropout probability (embeddings and ``nn.Transformer``).
    max_len: maximum sequence length supported by the positional encoding.
    device: device handed to the positional encodings.
  """

  def __init__(self, embed_size, src_vocab_size, trg_vocab_size, src_pad_idx,
               num_heads, num_encoder_layers, num_decoder_layers, forward_expansion,
               dropout, max_len, device):
    super(Transformer, self).__init__()
    self.src_word_embedding = nn.Embedding(src_vocab_size, embed_size)
    self.src_positional_encoding = PositionalEncoding(embed_size, max_len, device)

    self.trg_word_embedding = nn.Embedding(trg_vocab_size, embed_size)
    self.trg_positional_encoding = PositionalEncoding(embed_size, max_len, device)

    self.device = device

    # nn.Transformer's fifth parameter is dim_feedforward, an absolute hidden
    # width. Passing the expansion factor positionally produced a feed-forward
    # layer only `forward_expansion` units wide. Checkpoints saved with that
    # width have differently shaped linear1/linear2 weights and will not load.
    self.transformer = nn.Transformer(
        d_model=embed_size,
        nhead=num_heads,
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
        dim_feedforward=forward_expansion * embed_size,
        dropout=dropout,
    )

    self.fc_out = nn.Linear(embed_size, trg_vocab_size)
    self.dropout = nn.Dropout(dropout)
    self.src_pad_idx = src_pad_idx

  def make_source_mask(self, x):
    """Return a boolean ``[batch, seq_len]`` mask that is True at source padding.

    Args:
      x: source indices with layout ``[seq_len, batch]``.
    """
    return x.permute(1, 0) == self.src_pad_idx

  def forward(self, source, target):
    """Compute target-vocabulary logits.

    Args:
      source: source indices, ``[src_seq_len, batch]``.
      target: target indices, ``[trg_seq_len, batch]``.

    Returns:
      Logits of shape ``[trg_seq_len, batch, trg_vocab_size]``.
    """
    trg_seq_len = target.shape[0]

    # PositionalEncoding.forward unpacks its input as (batch_size, seq_len) and
    # returns [batch, seq_len, embed]. The tensors here are [seq_len, batch],
    # so transpose on the way in and back on the way out; otherwise the
    # encoding would vary along the batch axis instead of the time axis.
    src_positions = self.src_positional_encoding(source.permute(1, 0)).permute(1, 0, 2)
    trg_positions = self.trg_positional_encoding(target.permute(1, 0)).permute(1, 0, 2)

    src_embeddings = self.dropout(self.src_word_embedding(source) + src_positions)
    trg_embeddings = self.dropout(self.trg_word_embedding(target) + trg_positions)

    src_padding_mask = self.make_source_mask(source)
    # Causal mask; placed on the input's device so the model still works
    # after being moved away from the device given at construction.
    trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_len).to(target.device)

    out = self.transformer(
        src_embeddings,
        trg_embeddings,
        tgt_mask=trg_mask,
        src_key_padding_mask=src_padding_mask,
        # Keep decoder cross-attention from attending to padded source positions.
        memory_key_padding_mask=src_padding_mask,
    )
    return self.fc_out(out)

In [67]:
# Build the training set first so its vocabularies can be reused below.
batch_size = 128
train_set, train_loader = get_loader("data/train", batch_size=batch_size, shuffle=True)
# The validation split reuses the training vocabularies, passed in the
# [english, french] order that CustomDataset reads (vocab[0], vocab[1]).
val_set, val_loader = get_loader("data/val", batch_size=batch_size, shuffle=True,
                                  vocab=[train_set.vocab_en, train_set.vocab_fr])

In [76]:
# Training hyperparameters.
num_epochs = 100
learning_rate = 3e-4


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The model translates French (source) into English (target).
src_vocab_size = len(train_set.vocab_fr)
trg_vocab_size = len(train_set.vocab_en)
embed_size = 512
num_heads = 8
num_encoder_layers = 6
num_decoder_layers = 6
dropout = 0.10
max_len = 1000
forward_expansion = 4
# The source padding index must come from the source (French) vocabulary. It
# was previously read from the English one, which only worked because both
# vocabularies reserve index 0 for '<PAD>'.
src_pad_idx = train_set.vocab_fr['<PAD>']

In [77]:
# Instantiate the translation model from the hyperparameters above and move it
# to the selected device.
model = Transformer(
    embed_size=embed_size,
    src_vocab_size=src_vocab_size,
    trg_vocab_size=trg_vocab_size,
    src_pad_idx=src_pad_idx,
    num_heads=num_heads,
    num_encoder_layers=num_encoder_layers,
    num_decoder_layers=num_decoder_layers,
    forward_expansion=forward_expansion,
    dropout=dropout,
    max_len=max_len,
    device=device,
)
model = model.to(device)

In [78]:
# The loss is computed against English target tokens, so the index to ignore
# is the English vocabulary's padding index (not the source one).
trg_pad_idx = train_set.vocab_en.stoi['<PAD>']
criterion = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Cut the learning rate by 10x when the monitored loss has not improved for
# 10 consecutive scheduler steps.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.1, patience=10, verbose=True
)

In [79]:
# Sanity check: pull one (french, english) batch from the training loader, move
# it to the device and display both tensor shapes.
f,e = next(iter(train_loader))
f,e = f.to(device), e.to(device)
f.shape, e.shape

(torch.Size([38, 128]), torch.Size([34, 128]))

In [80]:
# Sanity check: run one forward pass on the batch above and display the shape
# of the returned logits.
op = model(f,e)
op.shape

torch.Size([34, 128, 5893])

In [81]:
def translate(model, sentence, french, english, device, max_length=100):
  """Greedily translate one French sentence with ``model``.

  The model is switched to evaluation mode while decoding, so dropout is
  disabled and the result is deterministic. Its previous train/eval mode is
  restored afterwards, which makes the function safe to call from inside a
  training loop.

  Args:
    model: module called as ``model(source, target)`` with ``[seq_len, 1]``
      index tensors and returning ``[trg_seq_len, 1, vocab]`` scores.
    sentence: raw French sentence.
    french: source vocabulary (provides ``stoi`` and ``numericalize``).
    english: target vocabulary (provides ``stoi`` and integer indexing).
    device: device on which the index tensors are created.
    max_length: maximum number of tokens to generate.

  Returns:
    The generated tokens joined by spaces. The leading ``<SOS>`` is removed;
    a generated ``<EOS>`` is kept.
  """
  num_french = [french.stoi['<SOS>']]
  num_french += french.numericalize(sentence)
  num_french.append(french.stoi['<EOS>'])
  # Shape [src_len, 1]: a batch holding the single sentence.
  num_french = torch.tensor(num_french).unsqueeze(1).to(device)

  eos_idx = english.stoi['<EOS>']
  outputs = [english.stoi['<SOS>']]

  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      for _ in range(max_length):
        trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)
        output = model(num_french, trg_tensor)

        # Most likely token at the last decoded position.
        best_guess = output.argmax(2)[-1, :].item()
        outputs.append(best_guess)

        if best_guess == eos_idx:
          break
  finally:
    # Restore whichever mode the caller had the model in.
    model.train(was_training)

  translated_sentence = [english[idx] for idx in outputs]
  # remove start token
  return " ".join(translated_sentence[1:])

In [82]:
# Short aliases for the training vocabularies, used when translating the
# example sentence during training.
english_vocab = train_set.vocab_en
french_vocab = train_set.vocab_fr

In [None]:
# Fixed example translated before every epoch to monitor progress.
sentence = "Une petite fille grimpe dans une maisonnette en bois."
# Reference translation, kept for the reader; it is not used by the loop.
translation = "A little girl climbing into a wooden playhouse."

for epoch in range(1, num_epochs+1):
    # Decode the example with dropout disabled; model.train() below switches
    # back to training mode before any batch is processed.
    model.eval()
    with torch.inference_mode():
      print(translate(model, sentence, french_vocab, english_vocab, device))
    train_loop = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)
    model.train()
    losses = []
    for batch_idx, (french, english) in train_loop:
        french = french.to(device)
        english = english.to(device)
        # Local name on purpose: do not overwrite the notebook-level
        # `batch_size` used to build the data loaders.
        examples_in_batch = english.shape[1]

        optimizer.zero_grad()

        # Teacher forcing: feed all target tokens but the last and predict
        # the same sequence shifted by one position.
        output = model(french, english[:-1, :])

        output = output.reshape(-1, output.shape[2])
        targets = english[1:].reshape(-1)

        loss = criterion(output, targets)
        batch_loss = loss.item()
        losses.append(batch_loss)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        optimizer.step()

        per_example_loss = batch_loss / examples_in_batch
        train_loop.set_description(f"Epoch: {epoch}/{num_epochs}")
        train_loop.set_postfix({"batch_loss": batch_loss, "per_example_loss": per_example_loss})

    # Checkpoint once per 20th epoch, after the epoch has finished. This used
    # to sit inside the batch loop and rewrote the file on every batch.
    if epoch % 20 == 0:
        torch.save(model.state_dict(), "checkpoint.pt")
        print("model saving done")

    mean_loss = sum(losses) / len(losses)
    scheduler.step(mean_loss)

obstacle pillar drinks pillar 100 explosion marble marble 100 marble obstacle monster drinks drinks 100 drinks drinks drinks marble grocery retail tape drinks seated watches retail 100 lobby drinks drinks marble obstacle watches drinks watches obstacle union monster tape wrestler 100 lobby obstacle wrestler 100 drinks marble lobby 100 marble monster watches marble drinks drinks tape marble historic pillar 100 obstacle drinks tape watches explosion marble wrestler 100 100 drinks obstacle marble drinks 100 explosion drinks watches drinks marble drinks tape drinks knitting pillar watches 100 drinks lobby marble obstacle watches tape drinks wrestler drinks wrestler watches marble grocery lobby




a girl in a girl is playing a girl . <EOS>




a little girl in a pink dress is walking in a little girl in a small pool . <EOS>




a little girl in a little girl is jumping in a small wooden wooden . <EOS>




a little girl is climbing a wooden in a wooden area . <EOS>




a little girl in a wooden hat is walking in a wooden area . <EOS>




a little girl in a wooden room in a wooden room . <EOS>




a little girl is climbing a wooden wooden structure into a wooden wooden structure . <EOS>




a girl climbing a wooden structure in a wooden area . <EOS>




a little girl in a wooden coat is climbing a wooden wooden structure . <EOS>




a wooden little girl climbs a wooden wooden wooden wooden in a wooden area . <EOS>




a little girl is climbing a wooden pole in a <UNK> a hardwood wooden wooden wooden . <EOS>




a little girl climbing a wooden wooden wooden castle in the woods . <EOS>




a little girl is climbing a wooden wooden structure in the woods . <EOS>




a little girl is climbing a wooden playhouse in a wooden room . <EOS>




a little girl in a wooden room is climbing a wooden fence . <EOS>


