In [None]:

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator
import numpy as np
import random
# from torch.utils.tensorboard import SummaryWriter  # to print to tensorboard
from nlp_notebook.utils import translate_sentence, bleu, save_checkpoint, load_checkpoint

OSError: [WinError 127] The specified procedure could not be found

In [5]:
import spacy

In [None]:
# tokenize_ger Function:
# This function takes German text as input.
# It uses the spacy_ger tokenizer to split the text into individual tokens.
# The function returns a list of tokenized text.
# tokenize_eng Function:
# This function takes English text as input.
# It uses the spacy_eng tokenizer to split the text into individual tokens.
# The function returns a list of tokenized text.

In [14]:
# Load the spaCy pipelines; only their tokenizers are used below.
# Both model packages must be installed beforehand, e.g.
#   python -m spacy download de_core_news_sm
#   python -m spacy download en_core_web_sm
# The bare "en" shortcut link is not available in spaCy v3, so the English
# model is referenced by its full package name, like the German one.
spacy_ger = spacy.load("de_core_news_sm")  # German (Deutsch)
spacy_eng = spacy.load("en_core_web_sm")  # English


def tokenize_ger(text):
    """Return the token strings produced by the spacy_ger tokenizer for `text`."""
    pieces = []
    for token in spacy_ger.tokenizer(text):
        pieces.append(token.text)
    return pieces


def tokenize_eng(text):
    """Return the token strings produced by the spacy_eng tokenizer for `text`."""
    doc = spacy_eng.tokenizer(text)
    return list(token.text for token in doc)

OSError: [E050] Can't find model 'de_core_news_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:

# German field: tokenized with tokenize_ger, lowercased, and wrapped in
# <sos> ... <eos> sentinel tokens.
german = Field(
    tokenize=tokenize_ger,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)

# English field: same settings, but tokenized with tokenize_eng.
english = Field(
    tokenize=tokenize_eng,
    lower=True,
    init_token="<sos>",
    eos_token="<eos>",
)

In [None]:
# Field Class:
# The Field class represents a single field in a dataset, such as the source text or target text in a machine translation task.
# Parameters:
# Here's a breakdown of the parameters used to define the german and english fields:
# tokenize: A function that takes in text data and returns a list of tokenized words. In this case, tokenize_ger and tokenize_eng are used for German and English text, respectively.
# lower: A boolean indicating whether to convert all text to lowercase. In this case, it's set to True.
# init_token: A special token to add at the beginning of each sequence. In this case, it's set to "<sos>" (start of sequence).
# eos_token: A special token to add at the end of each sequence. In this case, it's set to "<eos>" (end of sequence).

In [None]:

# Load the three Multi30k splits, pairing the ".de" files with the German
# field and the ".en" files with the English field.
train_data, valid_data, test_data = Multi30k.splits(
    fields=(german, english),
    exts=(".de", ".en"),
)

# Vocabularies are built from the training split only: at most 10,000 entries
# each, and a token must occur at least twice to be included.
german.build_vocab(train_data, min_freq=2, max_size=10000)
english.build_vocab(train_data, min_freq=2, max_size=10000)

In [None]:

# This code snippet is used to load and preprocess the Multi30k dataset for a German-English machine translation task.
# Multi30k Dataset:
# The Multi30k dataset is a multilingual dataset containing about 30,000 images, each described by a caption in German, English, and French.
# Code Explanation:
# train_data, valid_data, test_data = Multi30k.splits(exts=(".de", ".en"), fields=(german, english)):
# This line splits the Multi30k dataset into training, validation, and testing sets.
# The exts parameter specifies the file extensions for the German (.de) and English (.en) datasets.
# The fields parameter maps the German and English fields to the corresponding datasets.
# german.build_vocab(train_data, max_size=10000, min_freq=2) and english.build_vocab(train_data, max_size=10000, min_freq=2):
# These lines build the vocabularies for the German and English fields using the training data.
# The max_size parameter limits the vocabulary size to 10,000 words.
# The min_freq parameter sets the minimum frequency for a word to be included in the vocabulary to 2.

In [None]:



class Encoder(nn.Module):
    """Embeds a batch of token-index sequences and runs them through a stacked LSTM.

    Only the LSTM's final hidden and cell states are returned; the per-step
    outputs are discarded.
    """

    def __init__(self, input_size, embedding_size, hidden_size, num_layers, p):
        """Build the embedding table, dropout layer and LSTM.

        Args:
            input_size: number of rows in the embedding table.
            embedding_size: width of each embedding vector.
            hidden_size: width of the LSTM hidden and cell states.
            num_layers: number of stacked LSTM layers.
            p: dropout probability, applied to the embeddings and passed to
                the LSTM as its `dropout` argument.
        """
        super().__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )

    def forward(self, x):
        """Encode `x` of shape (seq_length, N) and return `(hidden, cell)`."""
        # (seq_length, N) -> (seq_length, N, embedding_size)
        embedded = self.embedding(x)
        embedded = self.dropout(embedded)

        # The per-step outputs (seq_length, N, hidden_size) are not needed.
        _, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

In [None]:
# Encoder Class
# __init__ Method
# The __init__ method initializes the Encoder object with the following parameters:
# input_size: The size of the input vocabulary.
# embedding_size: The size of the embedding layer.
# hidden_size: The size of the hidden state in the LSTM.
# num_layers: The number of layers in the LSTM.
# p: The dropout probability.
# Inside the __init__ method:
# self.dropout = nn.Dropout(p): Initializes a dropout layer with the given probability.
# self.hidden_size = hidden_size and self.num_layers = num_layers: Stores the hidden size and number of layers for later use.
# self.embedding = nn.Embedding(input_size, embedding_size): Initializes an embedding layer that converts input indices into dense vectors.
# self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p): Initializes an LSTM layer with the specified parameters.
# forward Method
# The forward method defines the forward pass through the Encoder. It takes an input sequence x and returns the final hidden and cell states.
# Inside the forward method:
# embedding = self.dropout(self.embedding(x)): Embeds the input sequence and applies dropout.
# outputs, (hidden, cell) = self.rnn(embedding): Passes the embedded sequence through the LSTM and returns the outputs, hidden state, and cell state.
# return hidden, cell: Returns the final hidden and cell states.

In [None]:



class Decoder(nn.Module):
    """Single-step LSTM decoder: one token per batch element in, one score vector out."""

    def __init__(
        self, input_size, embedding_size, hidden_size, output_size, num_layers, p
    ):
        """Build the embedding table, dropout layer, LSTM and output projection.

        Args:
            input_size: number of rows in the embedding table.
            embedding_size: width of each embedding vector.
            hidden_size: width of the LSTM hidden and cell states.
            output_size: number of scores produced per batch element.
            num_layers: number of stacked LSTM layers.
            p: dropout probability, applied to the embeddings and passed to
                the LSTM as its `dropout` argument.
        """
        super().__init__()
        self.dropout = nn.Dropout(p)
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.embedding = nn.Embedding(input_size, embedding_size)
        self.rnn = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=p,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, hidden, cell):
        """Run one decoding step.

        Args:
            x: token indices of shape (N), one per batch element.
            hidden: LSTM hidden state carried over from the previous step.
            cell: LSTM cell state carried over from the previous step.

        Returns:
            `(predictions, hidden, cell)`, where `predictions` has shape
            (N, output_size) and the states are the updated LSTM states.
        """
        # The LSTM wants a leading sequence axis; a single step means length 1.
        step_input = x.unsqueeze(0)  # (N) -> (1, N)

        embedded = self.dropout(self.embedding(step_input))  # (1, N, embedding_size)

        outputs, (hidden, cell) = self.rnn(embedded, (hidden, cell))  # (1, N, hidden_size)

        # Project to scores, then drop the length-1 sequence axis so the result
        # is (N, output_size), the layout the loss function expects.
        predictions = self.fc(outputs).squeeze(0)

        return predictions, hidden, cell


In [None]:
# Decoder Class
# __init__ Method
# The __init__ method initializes the Decoder object with the following parameters:
# input_size: The size of the input vocabulary.
# embedding_size: The size of the embedding layer.
# hidden_size: The size of the hidden state in the LSTM.
# output_size: The size of the output vocabulary.
# num_layers: The number of layers in the LSTM.
# p: The dropout probability.
# Inside the __init__ method:
# self.dropout = nn.Dropout(p): Initializes a dropout layer with the given probability.
# self.hidden_size = hidden_size and self.num_layers = num_layers: Stores the hidden size and number of layers for later use.
# self.embedding = nn.Embedding(input_size, embedding_size): Initializes an embedding layer that converts input indices into dense vectors.
# self.rnn = nn.LSTM(embedding_size, hidden_size, num_layers, dropout=p): Initializes an LSTM layer with the specified parameters.
# self.fc = nn.Linear(hidden_size, output_size): Initializes a fully connected (dense) layer that produces the final output.
# forward Method
# The forward method defines the forward pass through the Decoder. It takes an input x, hidden state hidden, and cell state cell and returns the predictions, updated hidden state, and updated cell state.
# Inside the forward method:
# x = x.unsqueeze(0): Adds a leading sequence-length dimension of size 1 to the input x, turning shape (N) into (1, N).
# embedding = self.dropout(self.embedding(x)): Embeds the input x and applies dropout.
# outputs, (hidden, cell) = self.rnn(embedding, (hidden, cell)): Passes the embedded input through the LSTM and returns the outputs, updated hidden state, and updated cell state.
# predictions = self.fc(outputs): Produces the final output using the fully connected layer.
# predictions = predictions.squeeze(0): Removes the leading sequence-length dimension from the predictions, giving shape (N, output_size).

In [None]:


class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes the target one step at a time.

    The encoder's final hidden and cell states seed the decoder, which is then
    fed either the ground-truth token or its own previous prediction at each
    step (teacher forcing).
    """

    def __init__(self, encoder, decoder):
        """Store the two sub-networks.

        Args:
            encoder: module whose call returns `(hidden, cell)` for a source batch.
            decoder: module called as `decoder(x, hidden, cell)` that returns
                `(predictions, hidden, cell)` and exposes its output
                projection as `fc`.
        """
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, source, target, teacher_force_ratio=0.5):
        """Decode `target` step by step, conditioned on `source`.

        Args:
            source: source token indices, shape (source_len, N).
            target: target token indices, shape (target_len, N); row 0 is used
                as the first decoder input.
            teacher_force_ratio: probability, at each step, of feeding the
                ground-truth token instead of the decoder's own prediction.

        Returns:
            Tensor of shape (target_len, N, target_vocab_size). Row 0 is left
            as zeros because no prediction is made for the first position.
        """
        batch_size = source.shape[1]
        target_len = target.shape[0]
        # Take the vocabulary size from the decoder's own output layer and the
        # device from the input tensor, so the model does not depend on
        # notebook-level globals being defined before it is called.
        target_vocab_size = self.decoder.fc.out_features

        outputs = torch.zeros(
            target_len, batch_size, target_vocab_size, device=source.device
        )

        hidden, cell = self.encoder(source)

        # Grab the first input to the Decoder which will be <SOS> token
        x = target[0]

        for t in range(1, target_len):
            # Use previous hidden, cell as context from encoder at start
            output, hidden, cell = self.decoder(x, hidden, cell)

            # Store next output prediction
            outputs[t] = output

            # Get the best word the Decoder predicted (index in the vocabulary)
            best_guess = output.argmax(1)

            # Teacher forcing: with probability teacher_force_ratio the next
            # input is the ground-truth token, otherwise it is the decoder's
            # own prediction. Mixing the two exposes the model during training
            # to the kind of inputs it will see at inference time, when no
            # ground truth is available.
            x = target[t] if random.random() < teacher_force_ratio else best_guess

        return outputs


In [None]:
# Seq2Seq Class
# __init__ Method
# The __init__ method initializes the Seq2Seq object with the following parameters:
# encoder: The encoder network, which takes in a source sequence and produces a context vector.
# decoder: The decoder network, which takes in the context vector and generates a target sequence.
# forward Method
# The forward method defines the forward pass through the Seq2Seq model. It takes in the following parameters:
# source: The source sequence.
# target: The target sequence.
# teacher_force_ratio: The probability of using teacher forcing during training.
# Inside the forward method:
# hidden, cell = self.encoder(source): Passes the source sequence through the encoder and gets the final hidden and cell states.
# x = target[0]: Gets the first token of the target sequence, which is the <SOS> token.
# The model then enters a loop where it generates each token of the target sequence:
# output, hidden, cell = self.decoder(x, hidden, cell): Passes the current token x and the previous hidden and cell states through the decoder and gets the output and updated hidden and cell states.
# outputs[t] = output: Stores the output of the decoder at time step t.
# best_guess = output.argmax(1): Gets the index of the token with the highest probability in the output.
# x = target[t] if random.random() < teacher_force_ratio else best_guess: With probability teacher_force_ratio, 
# uses the actual next token in the target sequence; otherwise, uses the token predicted by the decoder.
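# This mixing of ground-truth and predicted tokens is called teacher forcing; it is only loosely reminiscent of reinforcement learning, since no reward signal is involved.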

In [None]:


### We're ready to define everything we need for training our Seq2Seq model ###

# Optimisation settings
num_epochs, learning_rate, batch_size = 100, 0.001, 64

# Model settings
load_model = False  # when True, a saved checkpoint is loaded before training
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
input_size_encoder = len(german.vocab)
# The decoder both reads and emits English tokens, so both sizes are the
# English vocabulary size.
input_size_decoder = output_size = len(english.vocab)
encoder_embedding_size = decoder_embedding_size = 300
hidden_size = 1024  # Needs to be the same for both RNN's
num_layers = 2
enc_dropout = dec_dropout = 0.5

# Tensorboard to get nice loss plot
# writer = SummaryWriter(f"runs/loss_plot")
step = 0  # counts optimizer updates across all epochs

In [None]:
# One iterator per split. Examples are ordered by source length inside each
# batch, and batches are created directly on `device`.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    device=device,
    batch_size=batch_size,
    sort_key=lambda example: len(example.src),
    sort_within_batch=True,
)

In [None]:
# BucketIterator.splits((train_data, valid_data, test_data), ...):
# This line splits the training, validation, and testing datasets into batches using the BucketIterator.
# The BucketIterator is a type of iterator that groups sequences of similar lengths together into batches, which can improve training efficiency.
# batch_size=batch_size:
# This parameter sets the size of each batch.
# sort_within_batch=True:
# This parameter sorts the sequences within each batch by length.
# sort_key=lambda x: len(x.src):
# This parameter specifies the key to use for sorting the sequences.
# In this case, the key is the length of the source sequence (x.src).
# device=device:
# This parameter specifies the device (e.g., GPU or CPU) to use for the iterators.

In [None]:

# Instantiate the encoder from the hyperparameters above and move it to `device`.
encoder_net = Encoder(
    input_size=input_size_encoder,
    embedding_size=encoder_embedding_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    p=enc_dropout,
)
encoder_net = encoder_net.to(device)

In [None]:
# Instantiate the decoder from the hyperparameters above and move it to `device`.
decoder_net = Decoder(
    input_size=input_size_decoder,
    embedding_size=decoder_embedding_size,
    hidden_size=hidden_size,
    output_size=output_size,
    num_layers=num_layers,
    p=dec_dropout,
)
decoder_net = decoder_net.to(device)

In [None]:

# Wire the encoder and decoder together and place the full model on `device`.
model = Seq2Seq(encoder_net, decoder_net).to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Index of the <pad> token in the English vocabulary. <pad> is the special
# token used to pad sequences in a batch to a uniform length.
pad_idx = english.vocab.stoi["<pad>"]

# Cross-entropy loss; ignore_index=pad_idx makes the loss skip every target
# position that holds the <pad> token.
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)

if load_model:
    # map_location remaps the stored tensors onto the current device, so a
    # checkpoint written on a GPU machine can still be restored on a CPU-only
    # one (and vice versa).
    # NOTE: torch.load unpickles the file; only load checkpoints you trust.
    load_checkpoint(
        torch.load("my_checkpoint.pth.tar", map_location=device), model, optimizer
    )

In [1]:



# Fixed German example sentence, translated once per epoch as a qualitative
# progress check.
sentence = "ein boot mit mehreren männern darauf wird von einem großen pferdegespann ans ufer gezogen."

for epoch in range(num_epochs):
    # Progress banner; `epoch` is zero-based.
    print(f"[Epoch {epoch} / {num_epochs}]")

    # Snapshot the model and optimizer state at the START of the epoch, i.e.
    # before this epoch's updates are applied.
    # NOTE(review): nothing is saved after the loop ends, so the updates from
    # the final epoch are never checkpointed — confirm this is intended.
    checkpoint = {"state_dict": model.state_dict(), "optimizer": optimizer.state_dict()}
    save_checkpoint(checkpoint)

    # Evaluation mode (disables dropout) while translating the example.
    model.eval()

    translated_sentence = translate_sentence(
        model, sentence, german, english, device, max_length=50
    )

    print(f"Translated example sentence: \n {translated_sentence}")

    # Back to training mode for the parameter updates below.
    model.train()

    for batch_idx, batch in enumerate(train_iterator):
        # Source and target token-index tensors for this batch, on `device`.
        inp_data = batch.src.to(device)
        target = batch.trg.to(device)

        # Forward prop (uses the model's default teacher_force_ratio).
        output = model(inp_data, target)

        # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
        # doesn't take input in that form. For example if we have MNIST we want to have
        # output to be: (N, 10) and targets just (N). Here we flatten the time and
        # batch dimensions together so every predicted position becomes one row.
        # The first position (the start token, for which no prediction is made)
        # is dropped from both tensors at the same time.
        output = output[1:].reshape(-1, output.shape[2])
        target = target[1:].reshape(-1)

        # Clear gradients left over from the previous batch before backprop.
        optimizer.zero_grad()
        loss = criterion(output, target)

        # Back prop
        loss.backward()

        # Clip the total gradient norm to 1 to avoid exploding gradients.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

        # Gradient descent step
        optimizer.step()

        # Plot to tensorboard
        # writer.add_scalar("Training loss", loss, global_step=step)
        # `step` counts optimizer updates across all epochs.
        step += 1


# The training loop leaves the model in train mode, so dropout would still be
# active while scoring. Switch to evaluation mode explicitly; what `bleu` does
# internally is not visible from this notebook.
model.eval()

# NOTE(review): the slice starts at index 1, so test example 0 is skipped and
# 99 examples are scored — confirm this is intended.
score = bleu(test_data[1:100], model, german, english, device)
print(f"Bleu score {score*100:.2f}")


OSError: [WinError 127] The specified procedure could not be found