In [21]:
# !pip install torch
# !pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu117
# !pip install torch==1.8.0+cpu torchvision==0.9.0+cpu torchaudio==0.8.0 -f https://download.pytorch.org/whl/torch_stable.html
# !pip install torchtext==0.9.1
# !pip install spacy

In [1]:
import torch

print(torch.cuda.is_available())

1.8.1+cpu
False


  from .autonotebook import tqdm as notebook_tqdm


In [21]:
import torchvision
import torchaudio
import spacy

print(torchvision.__version__)
print(torch.__version__)
print(torchaudio.__version__)
print(spacy.__version__)

0.9.0+cpu
1.8.1+cpu
0.8.0
3.5.3




In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import numpy as np
import spacy
import random
from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard

In [17]:
import torch
import spacy
from torchtext.data import bleu_score
import sys


def translate_sentence(model, sentence, german, english, device, max_length=50):
    # print(sentence)

    # sys.exit()

    # Load german tokenizer
    spacy_ger = spacy.load("de_core_news_sm")

    # Create tokens using spacy and everything in lower case (which is what our vocab is)
    if type(sentence) == str:
        tokens = [token.text.lower() for token in spacy_ger(sentence)]
    else:
        tokens = [token.lower() for token in sentence]

    # print(tokens)

    # sys.exit()
    # Add <SOS> and <EOS> in beginning and end respectively
    tokens.insert(0, german.init_token)
    tokens.append(german.eos_token)

    # Go through each german token and convert to an index
    text_to_indices = [german.vocab.stoi[token] for token in tokens]

    # Convert to Tensor
    sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)

    # Build encoder hidden, cell state
    with torch.no_grad():
        hidden, cell = model.encoder(sentence_tensor)

    outputs = [english.vocab.stoi["<sos>"]]

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)

        with torch.no_grad():
            output, hidden, cell = model.decoder(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        # Model predicts it's the end of the sentence
        if output.argmax(1).item() == english.vocab.stoi["<eos>"]:
            break

    translated_sentence = [english.vocab.itos[idx] for idx in outputs]

    # remove start token
    return translated_sentence[1:]


def bleu(data, model, german, english, device):
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]

        prediction = translate_sentence(model, src, german, english, device)
        prediction = prediction[:-1]  # remove <eos> token

        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)


def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
    print("=> Saving checkpoint")
    torch.save(state, filename)


def load_checkpoint(checkpoint, model, optimizer):
    print("=> Loading checkpoint")
    model.load_state_dict(checkpoint["state_dict"])
    optimizer.load_state_dict(checkpoint["optimizer"])

In [33]:
# !python -m spacy download de_core_news_sm
# !python -m spacy download en_core_web_sm

In [7]:
spacy_ger = spacy.load('de_core_news_sm')
spacy_eng = spacy.load('en_core_web_sm')

def tokenizer_ger(text):
    return [token.text for token in spacy_ger.tokenizer(text)]

def tokenizer_eng(text):
    return [token.text for token in spacy_eng.tokenizer(text)]    

print(tokenizer_eng("Hello is this even working!"))    

['Hello', 'is', 'this', 'even', 'working', '!']


In [8]:
german = Field(tokenize=tokenizer_ger, lower=True, 
            init_token='<start>', eos_token='<end>')

english = Field(tokenize=tokenizer_eng, lower=True, 
            init_token='<start>', eos_token='<end>')            

In [9]:
train_data, validation_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                    fields=(german, english))

In [10]:
german.build_vocab(train_data, max_size=7000, min_freq=2)
english.build_vocab(train_data, max_size=7000, min_freq=2)

In [11]:
class Encoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, num_layers, dropout_):
        """
        input_size = size of the vocabulary
        embedding_size = representation of word with d size vectors
        """
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(dropout_)
        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout_)

    def forward(self, x):
        """
        x = vector representation of the word that is tokenized and mapped to the index in vocabulary
        """ 
        # shape of x - (seq_length, N), we have sequence of words in N batches. So in encoder we can feed in the whole sentence.
        embedding = self.dropout(self.embedding(x))  
        # shape - (seq_length, N, embedding_size) 
        outputs, (hidden, cell) = self.rnn(embedding)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, input_size, embedding_size, hidden_size, output_size, num_layers, dropout_):
        """
        input_size = size of the vocabulary
        embedding_size = representation of word with d size vectors
        output_size = same size as input size, vector representation of word translated into the other language. It is the prob distribution over the vocuabulary - a size(vocab) lenght vector having probs of each word from the vocabulary.
        """
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout = nn.Dropout(dropout_)
        self.embedding = nn.Embedding(input_size, embedding_size)   
        self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=dropout_)
        # hidden size of the decoder and encoder are the same.
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, hidden, cell):
        # shape of x - (1, N), here in decoder we feed in a single word from the sentence.  
        x = x.unsqueeze(0) # reshaped from (N) to (1,N)
        embedding = self.dropout(self.embedding(x))  
        # shape - (1, N, embedding_size)
        outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell))
        # shape - (1, N, hidden_size)
        predictions = self.fc(outputs)
        # shape  - (1, N, len(vocab))
        predictions.squeeze(0) # changing the shape from (1, N, len(vocab)) to (N, len(vocab))

        return predictions, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, Encoder, Decoder):
        super(Seq2Seq, self).__init__()
        self.encoder = Encoder
        self.decoder = Decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """
        teacher_force_ratio = While decoding, predictions of words happen word-by-word, so if the predictions is wrong, then the wrong word gets fed into the decoder for next prediction. This ratio foretells ratio of how many predicted and actual words to feed for next prediction while decoding.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        target_vocab_size = len(english.vocab)

        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)

        hidden, cell = self.encoder(source)

        x = target[0] # grabbing start token
        for t in range(1, target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            outputs[t] = output
            best_guess = output.argmax(1)
            x = target[t] if random.random() < teacher_force_ratio else best_guess
        
        return outputs    

In [20]:
# Training the model now

num_epoch = 20
learning_rate = 0.001
batch_size = 64

load_model = False
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_size_encoder = len(german.vocab)
input_size_decoder = len(english.vocab)
output_size = len(english.vocab)
encoder_embedding_size = 300
decoder_embedding_size = 300
hidden_size = 1024
num_layers = 2
enc_dropout = 0.5
dec_dropout = 0.5

# Tensorboard
writer = SummaryWriter(f'runs/loss_plot')
step = 0

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, validation_data, test_data),
    batch_size=batch_size,
    sort_within_batch = True,
    sort_key = lambda x: len(x.src), # this helps in sorting sentences with similar sizes to same batches.
    device = device,
)

encoder_net = Encoder(input_size_encoder, encoder_embedding_size, hidden_size, num_layers, enc_dropout).to(device)
decoder_net = Decoder(input_size_decoder, decoder_embedding_size, hidden_size, output_size, num_layers, dec_dropout).to(device)

model = Seq2Seq(encoder_net, decoder_net).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

pad_idx = english.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model:
    load_checkpoint(torch.load('my_checkpoint.pth.ptar'), model, optimizer)

for epoch in range(num_epoch):
    print(f'epoch [{epoch}/{num_epoch}]')
    checkpoint = {'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict()}
    save_checkpoint(checkpoint)

    save_checkpoint(checkpoint)

    # sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."
    # model.eval()

    # translated_sentence = translate_sentence(
    #     model, sentence, german, english, device, max_length=50
    # )

    # print(f"Translated example sentence: \n {translated_sentence}")

    for batch_idx, batch in enumerate(train_iterator):
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)
        print(inp_data.shape, target.shape)

        output = model(inp_data, target) # shape (trg_len, batch_size, output_dim)
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1) 
        optimizer.zero_grad()
        loss = criterion(output, target)
        loss.backward()

        torch.nn.utils.clip_grad_norm(model.parameters(), max_norm=1)
        optimizer.step()

        writer.add_scalar("Training loss", loss, global_step=step)
        step += 1

epoch [0/20]
=> Saving checkpoint
=> Saving checkpoint
torch.Size([46, 64]) torch.Size([42, 64])


RuntimeError: input must have 3 dimensions, got 4