In [1]:
!pip install -qU portalocker

In [2]:
from google.colab import drive
# drive.mount('/content/drive')

In [3]:
!mkdir data
!cp "/content/drive/MyDrive/KAGGLE DATASETS/english2german/deu.txt.zip" "data/data.txt.zip"
!unzip "data/data.txt.zip" -d "data"

Archive:  data/data.txt.zip
  inflating: data/deu.txt            


In [4]:
drive.flush_and_unmount()

In [5]:
f = open("data/deu.txt", "r")
data = f.read()

In [6]:
line_split = data.split("\n")
line_split[:5]

['Go.\tGeh.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8597805 (Roujin)',
 'Hi.\tHallo!\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #380701 (cburgmer)',
 'Hi.\tGrüß Gott!\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #659813 (Esperantostern)',
 'Run!\tLauf!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #941078 (Fingerhut)',
 'Run.\tLauf!\tCC-BY 2.0 (France) Attribution: tatoeba.org #4008918 (JSakuragi) & #941078 (Fingerhut)']

# Fixing Dataset

In [7]:
eng_sents = []
ger_sents = []
for line in line_split:
  try:
    english_text, german_text, _ = line.split("\t")
    eng_sents.append(english_text)
    ger_sents.append(german_text)
  except:
    print(line.split("\t"))

print("English and german sentences")
len(eng_sents), len(ger_sents)

['']
English and german sentences


(221533, 221533)

In [8]:
eng_sents[:10], ger_sents[:10]

(['Go.',
  'Hi.',
  'Hi.',
  'Run!',
  'Run.',
  'Wow!',
  'Wow!',
  'Fire!',
  'Help!',
  'Help!'],
 ['Geh.',
  'Hallo!',
  'Grüß Gott!',
  'Lauf!',
  'Lauf!',
  'Potzdonner!',
  'Donnerwetter!',
  'Feuer!',
  'Hilfe!',
  'Zu Hülf!'])

In [9]:
!python -m spacy download de_core_news_sm

2023-10-21 14:06:12.026948: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-21 14:06:12.081985: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting de-core-news-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.6.0/de_core_news_sm-3.6.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m70.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: de-core-news-sm
Successfully installed de-c

In [10]:
!python -m spacy download en_core_web_sm

2023-10-21 14:06:26.423001: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-21 14:06:26.477092: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Collecting en-core-web-sm==3.6.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.6.0/en_core_web_sm-3.6.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m98.9 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the pack

# Building Tokenizer and Vocab

In [11]:
import torch
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer

In [12]:
english_tokenizer = get_tokenizer("spacy", "en_core_web_sm")
german_tokenizer = get_tokenizer("spacy", "de_core_news_sm")

In [13]:
english_tokenizer("Hello, this is ronit!")

['Hello', ',', 'this', 'is', 'ronit', '!']

In [14]:
german_tokenizer("Hallo, das ist Ronit!")

['Hallo', ',', 'das', 'ist', 'Ronit', '!']

In [15]:
def build_vocab(data, tokenizer):
  for sent in data:
    yield tokenizer(sent)

In [16]:
english_vocab = build_vocab_from_iterator(
    build_vocab(eng_sents, english_tokenizer),
    specials=["<pad>", "<unk>"]
)
english_vocab.set_default_index(english_vocab["<unk>"])

german_vocab = build_vocab_from_iterator(
    build_vocab(ger_sents, german_tokenizer),
    specials=["<pad>", "<unk>", "<sos>", "<eos>"]
)
german_vocab.set_default_index(german_vocab["<unk>"])

In [17]:
len(english_vocab), len(german_vocab)

(18014, 38614)

In [18]:
english_example = english_tokenizer("Hello how are you?, I am Ronit!")
indexes = english_vocab(english_example)
english_example, indexes, english_vocab.lookup_tokens(indexes)

(['Hello', 'how', 'are', 'you', '?', ',', 'I', 'am', 'Ronit', '!'],
 [3480, 120, 37, 6, 8, 18, 3, 162, 1, 185],
 ['Hello', 'how', 'are', 'you', '?', ',', 'I', 'am', '<unk>', '!'])

In [19]:
german_example = german_tokenizer("Hallo, wie geht es dir? Ich bin Ronit!")
german_example.append("<eos>")
german_example.insert(0, "<sos>")
indexes = german_vocab(german_example)
german_example, indexes, german_vocab.lookup_tokens(indexes)

(['<sos>',
  'Hallo',
  ',',
  'wie',
  'geht',
  'es',
  'dir',
  '?',
  'Ich',
  'bin',
  'Ronit',
  '!',
  '<eos>'],
 [2, 2083, 5, 59, 154, 19, 56, 8, 7, 48, 1, 24, 3],
 ['<sos>',
  'Hallo',
  ',',
  'wie',
  'geht',
  'es',
  'dir',
  '?',
  'Ich',
  'bin',
  '<unk>',
  '!',
  '<eos>'])

# Generating Batches and vectorizing text

In [20]:
from torchtext.data.functional import to_map_style_dataset

sentences = to_map_style_dataset(zip(eng_sents, ger_sents))
# german_sentences = to_map_style_dataset(ger_sents)
sentences = [list(sent) for sent in sentences]
sentences[:5]

[['Go.', 'Geh.'],
 ['Hi.', 'Hallo!'],
 ['Hi.', 'Grüß Gott!'],
 ['Run!', 'Lauf!'],
 ['Run.', 'Lauf!']]

In [21]:
def vectorize_batch(batch):

  english_sents = [sents[0] for sents in batch]
  german_sents = [sents[1] for sents in batch]

  english_max_len=max([len(english_tokenizer(sent)) for sent in english_sents])

  english_tokens = [english_vocab(english_tokenizer(sent)) for sent in english_sents]
  english_tokens = [tokens + ([0]*(english_max_len-len(tokens))) for tokens in english_tokens]

  german_max_len=max([len(german_tokenizer(sent)) for sent in german_sents])

  german_tokens_input = [german_vocab(german_tokenizer(sent)) for sent in german_sents]
  german_tokens_input = [german_vocab(["<sos>"])+tokens + german_vocab(["<eos>"]) + ([0]*(german_max_len-len(tokens))) for tokens in german_tokens_input]

  return (torch.tensor(english_tokens, dtype=torch.int32),
  torch.tensor(german_tokens_input, dtype=torch.long))

In [22]:
eng, ger = vectorize_batch(sentences[:5])
for s in eng:
  print(s)
  print(english_vocab.lookup_tokens(s.tolist()))

tensor([635,   2], dtype=torch.int32)
['Go', '.']
tensor([3023,    2], dtype=torch.int32)
['Hi', '.']
tensor([3023,    2], dtype=torch.int32)
['Hi', '.']
tensor([5413,  185], dtype=torch.int32)
['Run', '!']
tensor([5413,    2], dtype=torch.int32)
['Run', '.']


In [23]:
for s in ger:
  print(s)
  print(german_vocab.lookup_tokens(s.tolist()))

tensor([  2, 761,   4,   3,   0])
['<sos>', 'Geh', '.', '<eos>', '<pad>']
tensor([   2, 2083,   24,    3,    0])
['<sos>', 'Hallo', '!', '<eos>', '<pad>']
tensor([   2, 5882, 1636,   24,    3])
['<sos>', 'Grüß', 'Gott', '!', '<eos>']
tensor([   2, 5351,   24,    3,    0])
['<sos>', 'Lauf', '!', '<eos>', '<pad>']
tensor([   2, 5351,   24,    3,    0])
['<sos>', 'Lauf', '!', '<eos>', '<pad>']


In [24]:
from torch.utils.data import DataLoader

train_loader = DataLoader(sentences, batch_size=256, collate_fn=vectorize_batch, shuffle=True)

In [25]:
i=0
for eng, ger_in in train_loader:
  i+=1
  print(eng.shape, ger_in.shape)
  if i==5: break


torch.Size([256, 16]) torch.Size([256, 22])
torch.Size([256, 25]) torch.Size([256, 26])
torch.Size([256, 20]) torch.Size([256, 27])
torch.Size([256, 19]) torch.Size([256, 25])
torch.Size([256, 21]) torch.Size([256, 21])


# Saving the dataloader


In [26]:
torch.save(train_loader, "train_loader.pth")

In [27]:
saved_loader = torch.load("train_loader.pth")

In [28]:
i=0
for eng, ger_in in saved_loader:
  i+=1
  print(eng.shape, ger_in.shape)
  if i==5: break

torch.Size([256, 23]) torch.Size([256, 27])
torch.Size([256, 18]) torch.Size([256, 23])
torch.Size([256, 23]) torch.Size([256, 30])
torch.Size([256, 24]) torch.Size([256, 28])
torch.Size([256, 17]) torch.Size([256, 21])


# Model building

In [29]:
import torch
from torch import nn
from torch.nn import functional as F

In [30]:
# [batch, seq]
a = torch.randn(256, 25, 128)
print(a.shape)
ls = nn.LSTM(25, 75, num_layers=2, batch_first=True)
out, (h, c) = ls(a)
# [batch, seq], [1, hidden], [1, hidden]
out.shape, h.shape, c.shape

torch.Size([256, 25, 128])


(torch.Size([256, 25, 75]), torch.Size([2, 256, 75]), torch.Size([2, 256, 75]))

In [31]:
device = "cuda" if torch.cuda.is_available() else "cpu"

### Encoder

In [32]:
class Encoder(nn.Module):
  def __init__(self, input_dim, embed_dim, hidden_dim, n_layers, dropout):
    super(Encoder, self).__init__()
    self.hidden_dim = hidden_dim
    self.n_layers = n_layers

    self.embedding_layer = nn.Embedding(input_dim, embed_dim)
    self.rnn = nn.LSTM(embed_dim, hidden_dim, num_layers=n_layers, batch_first=True, dropout=dropout)
    self.dropout = nn.Dropout(dropout)

  def forward(self, src):
    # src: [batch, seq]
    embeddings = self.dropout(self.embedding_layer(src))

    # [batch, seq, embed], [1, hidden, embed], [1, hidden, embed]
    out, (h, c) = self.rnn(embeddings)
    # out: [batch, seq, hidden*n_direction]
    # h: [n_layers*n_direction, batch, hidden]
    # c: [n_layers*n_direction, batch, hidden]

    return h, c

In [33]:
class Decoder(nn.Module):
  def __init__(self, output_dim, embed_dim, hidden_dim, n_layers, dropout):
    super(Decoder, self).__init__()
    self.hidden_dim = hidden_dim
    self.n_layers = n_layers

    self.embedding_layer = nn.Embedding(output_dim, embed_dim)
    self.rnn = nn.LSTM(embed_dim, hidden_dim, num_layers=n_layers, batch_first=True, dropout=dropout)
    self.linear = nn.Linear(hidden_dim, output_dim)
    self.dropout = nn.Dropout(dropout)

  def forward(self, input, hidden, cell):

    # h: [n_layers*n_direction, batch, hidden]
    # c: [n_layers*n_direction, batch, hidden]


    # input: [batch_size]-->[1, batch_size]
    input = input.unsqueeze(0)

    embeddings = self.dropout(self.embedding_layer(input))
    # [1, batch, embed_dim]
    embeddings = embeddings.view(embeddings.size(1), embeddings.size(0), -1)
    # print("in decoder", embeddings.shape, hidden.shape, cell.shape)
    out, (h, c) = self.rnn(embeddings, (hidden, cell))

    prediction = self.linear(out.squeeze(0))


    return prediction, h, c

In [34]:
import random

In [35]:
class Seq2Seq(nn.Module):
  def __init__(self, encoder, decoder, device):
    super(Seq2Seq, self).__init__()

    self.encoder = encoder
    self.decoder = decoder
    self.device = device

    assert encoder.hidden_dim == decoder.hidden_dim, \
      'hidden dimensions of encoder and decoder must be equal.'
    assert encoder.n_layers == decoder.n_layers, \
      'n_layers of encoder and decoder must be equal.'

  def forward(self, src, trg, teacher_force_ratio=0.5):

    # src: [batch, seq]
    # trg: [batch, seq]

    trg_len = trg.shape[1]
    batch_size = trg.shape[0]

    # tensor to store decoder outputs
    outputs = torch.zeros(batch_size, trg_len, len(german_vocab)).to(self.device)

    h, c = self.encoder(src)

    input = trg[:, 0]
    for t in range(1, trg_len):
      # print(input.shape, h.shape, c.shape)
      # print("-"*50)
      out, h, c = self.decoder(input, h, c)
      outputs[:, t, :] = out[:, 0, :]

      teacher_force = random.random()<teacher_force_ratio

      # get the highest predicted token from our predictions.
      # top1 = out.argmax(1)

      # update input : use ground_truth when teacher_force
      # input = trg[: t] if teacher_force else top1
      input = trg[:, t]

    return outputs


In [36]:
INPUT_DIM = len(english_vocab)
OUTPUT_DIM = len(german_vocab)
ENC_EMBED_DIM = 256
DEC_EMBED_DIM = 256
HIDDEN_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

encoder = Encoder(INPUT_DIM, ENC_EMBED_DIM, HIDDEN_DIM, N_LAYERS, ENC_DROPOUT)
decoder = Decoder(OUTPUT_DIM, DEC_EMBED_DIM, HIDDEN_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(encoder, decoder, device).to(device)

In [37]:
def init_weights(m):
  for name, params in m.named_parameters():
    nn.init.uniform_(params.data, -0.08, 0.08)

model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding_layer): Embedding(18014, 256)
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding_layer): Embedding(38614, 256)
    (rnn): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.5)
    (linear): Linear(in_features=512, out_features=38614, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

In [38]:
def count_parameters(model):
  return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Number of trainable parameters: {count_parameters(model):,}")

Number of trainable parameters: 41,662,166


# Loss and optimizer

In [39]:
from torch import optim

optimizer = optim.Adam(
    params=model.parameters()
)

TRG_PAD_IDX = german_vocab.lookup_indices(["<pad>"])
loss_fn = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX[0])

# Training and Evaluation

In [40]:
def train(model, data, criterion, optimizer, clip):

  model.train()

  epoch_loss = 0

  for i, batch in enumerate(data):
    src, trg = batch

    optimizer.zero_grad()

    # trg: [batch, seq]
    # src = [batch, seq]
    # print(src.shape, trg.shape, "src and target")
    src = src.to(device)
    trg = trg.to(device)
    output = model(src, trg)

    output_dim = output.shape[-1]

    # now slice off the first column from both output and target

    output = output[1:].view(-1, output_dim)
    trg = trg[1:].view(-1)
    trg.to(torch.long)
    loss = criterion(output, trg)
    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

    optimizer.step()
    if i%100==0:
      print(loss.item())
    epoch_loss+=loss.item()

  return epoch_loss / len(data)


In [41]:
def epoch_time(start_time, end_time):
  elapsed_time = end_time - start_time
  elapsed_mins = int(elapsed_time  / 60)
  elapsed_secs = int(elapsed_time -  (elapsed_mins * 60))
  return  elapsed_mins, elapsed_secs

In [42]:
import time
import math

In [43]:
N_EPOCHS = 10

CLIP = 1

for epoch in range(N_EPOCHS):
  start_time = time.time()

  train_loss = train(model, train_loader, loss_fn, optimizer, CLIP)

  end_time = time.time()

  epoch_mins, epoch_secs = epoch_time(start_time, end_time)

  print(f"Epoch: {epoch+1:02} | Time {epoch_mins}m {epoch_secs}s")
  print(f"\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}")

10.55378246307373
5.762933731079102
5.381701469421387
5.128652572631836
5.065828800201416
4.699405193328857
4.571733474731445
4.618707180023193
4.444691181182861
Epoch: 01 | Time 7m 4s
	Train Loss: 5.052 | Train PPL: 156.299
4.200536251068115
4.142460346221924
4.129533290863037
4.046777725219727
3.940574884414673
4.115660190582275
3.9681921005249023
3.847494602203369
3.7532436847686768
Epoch: 02 | Time 7m 3s
	Train Loss: 4.033 | Train PPL:  56.432
3.6573238372802734
3.5768706798553467
3.538433790206909
3.3904600143432617
3.6309590339660645
3.4876420497894287
3.3868696689605713
3.396064043045044
3.3146250247955322
Epoch: 03 | Time 7m 3s
	Train Loss: 3.497 | Train PPL:  33.022
3.241957187652588
3.270005464553833
3.1759088039398193
3.068887948989868
3.134232759475708
3.1021978855133057
2.978907585144043
3.123380661010742
2.962014675140381
Epoch: 04 | Time 7m 4s
	Train Loss: 3.094 | Train PPL:  22.060
2.8101983070373535
2.83697772026062
2.6863787174224854
2.910993814468384
2.78710174560546

In [51]:
torch.save({
            'encoder_state_dict': encoder.state_dict(),
            'decoder_state_dict': decoder.state_dict(),
            'seq2seq_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()
            }, "all.pth")

In [50]:
torch.save(encoder.state_dict(), "encoder.pth")
torch.save(decoder.state_dict(), "decoder.pth")
torch.save(model.state_dict(), "model.pth")
torch.save(optimizer.state_dict(), "optimizer.pth")

In [53]:
torch.save(english_vocab, "english_vocab.pth")
torch.save(german_vocab, "german_vocab.pth")
torch.save(train_loader, "train_loader.pth")

In [342]:
def predict(text):
  encoder.eval()
  with torch.inference_mode():
    tokens = english_tokenizer(text)
    token_ids = english_vocab.lookup_indices(tokens)
    token_tensors = torch.tensor(token_ids).unsqueeze(0)

    german_start = ["<sos>"]

    german_start_id = german_vocab(german_start)
    german_start_tensor = torch.tensor(german_start_id).unsqueeze(0)

    hidden, cell = encoder(token_tensors.to(device))


    pred = ""
    i=0
    while pred!="<eos>":
      prediction, hidden, cell = decoder(german_start_tensor.to(device), hidden, cell)

      predictions = prediction.squeeze(0)

      pred = torch.argmax(predictions)
      pred = german_vocab.lookup_token(pred)
      print(pred, end=" ")

      if i>6: break
      i+=1


In [350]:
# google translation: Willkommen zurück
predict("Welcome back.")

Willkommen zurück . . . . . . 