In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import train_test_split

# Vocabulary class to handle mapping between words and numerical indices
class Vocabulary:
    def __init__(self):
        # Initialize dictionaries for word to index and index to word mappings
        self.word2index = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.index2word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.word_count = {}  # Keep track of word frequencies
        self.n_words = 4  # Start counting from 3 to account for special tokens

    def add_sentence(self, sentence):
        # Add all words in a sentence to the vocabulary
        for word in sentence.split(' '):
            self.add_word(word)

    def add_word(self, word):
        # Add a word to the vocabulary
        if word not in self.word2index:
            # Assign a new index to the word and update mappings
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.word_count[word] = 1
            self.n_words += 1
        else:
            # Increment word count if the word already exists in the vocabulary
            self.word_count[word] += 1

def tokenize_and_pad(sentences, vocab):
    # Calculate the maximum sentence length for padding
    max_length = max(len(sentence.split(' ')) for sentence in sentences) + 2  # +2 for SOS and EOS tokens
    tokenized_sentences = []
    for sentence in sentences:
        # Convert each sentence to a list of indices, adding SOS and EOS tokens
        tokens = [vocab.word2index["<SOS>"]] + [vocab.word2index[word] for word in sentence.split(' ')] + [vocab.word2index["<EOS>"]]
        # Pad sentences to the maximum length
        padded_tokens = tokens + [vocab.word2index["<PAD>"]] * (max_length - len(tokens))
        tokenized_sentences.append(padded_tokens)
    return torch.tensor(tokenized_sentences, dtype=torch.long)

# Custom Dataset class for English to French sentences
class EngFrDataset(Dataset):
    def __init__(self, pairs):
        self.eng_vocab = Vocabulary()
        self.fr_vocab = Vocabulary()
        self.pairs = []

        # Process each English-French pair
        for eng, fr in pairs:
            self.eng_vocab.add_sentence(eng)
            self.fr_vocab.add_sentence(fr)
            self.pairs.append((eng, fr))

        # Separate English and French sentences
        self.eng_sentences = [pair[0] for pair in self.pairs]
        self.fr_sentences = [pair[1] for pair in self.pairs]
        
        # Tokenize and pad sentences
        self.eng_tokens = tokenize_and_pad(self.eng_sentences, self.eng_vocab)
        self.fr_tokens = tokenize_and_pad(self.fr_sentences, self.fr_vocab)

        # Define the embedding layers for English and French
        self.eng_embedding = torch.nn.Embedding(self.eng_vocab.n_words, 100)  # Embedding size = 100
        self.fr_embedding = torch.nn.Embedding(self.fr_vocab.n_words, 100)    # Embedding size = 100

    def __len__(self):
        # Return the number of sentence pairs
        return len(self.pairs)

    def __getitem__(self, idx):
        # Get the tokenized and padded sentences by index
        eng_tokens = self.eng_tokens[idx]
        fr_tokens = self.fr_tokens[idx]
        # Lookup embeddings for the tokenized sentences
        eng_emb = self.eng_embedding(eng_tokens)
        fr_emb = self.fr_embedding(fr_tokens)
        return eng_tokens, fr_tokens, eng_emb, fr_emb

# Sample dataset of English-French sentence pairs
english_to_french = [
    ("I am cold", "J'ai froid"),
    ("You are tired", "Tu es fatigu√©"),
    # Add more pairs here
]

# Splitting dataset into train and test sets
train_data, test_data = train_test_split(english_to_french, test_size=0.2, random_state=42)

# Initialize train and test datasets
train_dataset = EngFrDataset(train_data)
test_dataset = EngFrDataset(test_data)

# Initialize train and test data loaders
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)


class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)  # Embedding layer
        self.gru = nn.GRU(hidden_size, hidden_size)  # GRU layer

    def forward(self, input, hidden):
        # Move input tensor to the correct device
        input = input.to(hidden.device)
        
        # Forward pass for the encoder
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        # Initializes hidden state
        return torch.zeros(1, 1, self.hidden_size, device=device)


class Decoder(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
                             
    def forward(self, input, hidden):
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, 1, self.hidden_size)  # GRU layer doesn't require cell state


# Assuming all words in the dataset + 'SOS' and 'EOS' tokens are included in eng_vocab and fr_vocab
input_size = train_dataset.eng_vocab.n_words
hidden_size = 256  # Example hidden size
output_size = train_dataset.fr_vocab.n_words

# Create the encoder and decoder instances
encoder = Encoder(input_size, hidden_size)
decoder = Decoder(hidden_size, output_size)

# Set the device to GPU if available, else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder = encoder.to(device)
decoder = decoder.to(device)

# Set the learning rate for optimization
learning_rate = 0.01

# Initializing optimizers for both encoder and decoder with Stochastic Gradient Descent (SGD)
encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)

# Negative Log Likelihood Loss function for calculating loss
criterion = nn.NLLLoss()

# Set number of epochs for training
n_epochs = 100

def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=12):
    # Initialize encoder hidden state
    encoder_hidden = encoder.initHidden()

    # Clear gradients for optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Calculate the length of input and target tensors
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # Initialize loss
    loss = 0

    # Encoding each token in the input
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei].unsqueeze(0), encoder_hidden)

    # Decoder's first input is the SOS token
    decoder_input = torch.tensor([[train_dataset.fr_vocab.word2index['<SOS>']]], device=device)

    # Decoder starts with the encoder's last hidden state
    decoder_hidden = encoder_hidden

    # Decoding loop
    for di in range(target_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        # Choose top1 word from decoder's output
        topv, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()  # Detach from history as input

        # Calculate loss
        loss += criterion(decoder_output, target_tensor[di].unsqueeze(0))
        if decoder_input.item() == train_dataset.fr_vocab.word2index['<EOS>']:  # Stop if EOS token is generated
            break

    # Backpropagation
    loss.backward()

    # Update encoder and decoder parameters
    encoder_optimizer.step()
    decoder_optimizer.step()

    # Return average loss
    return loss.item() / target_length

# Training loop
for epoch in range(n_epochs):
    total_loss = 0
    for eng_tokens, fr_tokens, _, _ in train_dataloader:  # Fixed variable name
        # Move tensors to the correct device
        input_tensor = eng_tokens.squeeze().to(device)
        target_tensor = fr_tokens.squeeze().to(device)
        
        # Perform a single training step and update total loss
        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        total_loss += loss
    
    # Print loss every 10 epochs
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {total_loss / len(train_dataloader)}')

def translate_sentence(encoder, decoder, sentence, eng_vocab, fr_vocab, device, max_length=12):
    # Preprocess the input sentence
    tokens = tokenize_and_pad([sentence], eng_vocab)
    input_tensor = tokens.squeeze().to(device)
    
    # Initialize encoder hidden state
    encoder_hidden = encoder.initHidden()

    # Encode the input sentence
    for ei in range(input_tensor.size(0)):
        encoder_output, encoder_hidden = encoder(input_tensor[ei].unsqueeze(0), encoder_hidden)

    # Initialize decoder input with SOS token
    decoder_input = torch.tensor([[fr_vocab.word2index['<SOS>']]], device=device)
    decoder_hidden = encoder_hidden
    
    # Initialize list to store decoded tokens
    decoded_tokens = []

    # Decode tokens until EOS token is generated or maximum length is reached
    for di in range(max_length):
        decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
        topv, topi = decoder_output.topk(1)
        token_index = topi.squeeze().detach().item()
        if token_index == fr_vocab.word2index['<EOS>']:
            break
        else:
            # Replace out-of-vocabulary token indices with '<UNK>' token index
            if token_index not in fr_vocab.index2word:
                token_index = fr_vocab.word2index['<UNK>']
            decoded_tokens.append(token_index)
        decoder_input = topi.squeeze().detach()

    # Convert decoded tokens to words
    translated_sentence = ' '.join([fr_vocab.index2word[token] for token in decoded_tokens])
    
    return translated_sentence


def evaluate_and_show_examples(encoder, decoder, dataloader, criterion, n_examples=5):
    # Switch model to evaluation mode
    encoder.eval()
    decoder.eval()

    total_loss = 0
    correct_predictions = 0
    predicted_examples = []

    # No gradient calculation
    with torch.no_grad():
        for i, (eng_tokens, fr_tokens, _, _) in enumerate(dataloader):
            # Move tensors to the correct device
            input_tensor = eng_tokens.squeeze().to(device)
            target_tensor = fr_tokens.squeeze().to(device)

            encoder_hidden = encoder.initHidden()

            input_length = input_tensor.size(0)
            target_length = target_tensor.size(0)

            loss = 0

            # Encoding step
            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei].unsqueeze(0), encoder_hidden)

            # Decoding step
            decoder_input = torch.tensor([[train_dataset.fr_vocab.word2index['<SOS>']]], device=device)
            decoder_hidden = encoder_hidden

            predicted_indices = []

            for di in range(target_length):
                decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden)
                topv, topi = decoder_output.topk(1)
                predicted_indices.append(topi.item())
                decoder_input = topi.squeeze().detach()

                loss += criterion(decoder_output, target_tensor[di].unsqueeze(0))
                if decoder_input.item() == train_dataset.fr_vocab.word2index['<EOS>']:
                    break

            # Calculate and print loss and accuracy for the evaluation
            total_loss += loss.item() / target_length
            if predicted_indices == target_tensor.tolist():
                correct_predictions += 1

            # Optionally, print some examples
            if i < n_examples:
                input_string = ' '.join([train_dataset.eng_vocab.index2word[index.item()] for index in input_tensor if
                                         index.item() not in (train_dataset.eng_vocab.word2index['<SOS>'],
                                                              train_dataset.eng_vocab.word2index['<EOS>'])])
                target_string = ' '.join([train_dataset.fr_vocab.index2word[index.item()] for index in target_tensor if
                                          index.item() not in (train_dataset.fr_vocab.word2index['<SOS>'],
                                                               train_dataset.fr_vocab.word2index['<EOS>'])])
                predicted_string = ' '.join([train_dataset.fr_vocab.index2word[index] for index in predicted_indices if
                                             index not in (train_dataset.fr_vocab.word2index['<SOS>'],
                                                           train_dataset.fr_vocab.word2index['<EOS>'])])
                predicted_examples.append((input_string, target_string, predicted_string))

        # Print overall evaluation results
        average_loss = total_loss / len(dataloader)
        accuracy = correct_predictions / len(dataloader)
        print(f'Evaluation Loss: {average_loss}, Accuracy: {accuracy}')
        return predicted_examples


# Training loop
for epoch in range(n_epochs):
    total_loss = 0
    for eng_tokens, fr_tokens, _, _ in train_dataloader:
        # Move tensors to the correct device
        input_tensor = eng_tokens.squeeze().to(device)
        target_tensor = fr_tokens.squeeze().to(device)

        # Perform a single training step and update total loss
        loss = train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)
        total_loss += loss

    # Print loss every 10 epochs
    if epoch % 10 == 0:
        print(f'Epoch {epoch}, Loss: {total_loss / len(train_dataloader)}')

# Perform evaluation with examples
predicted_examples = evaluate_and_show_examples(encoder, decoder, test_dataloader, criterion)


Epoch 0, Loss: 1.7408666610717773
Epoch 10, Loss: 0.755328893661499
Epoch 20, Loss: 0.3559364676475525
Epoch 30, Loss: 0.22786998748779297
Epoch 40, Loss: 0.1615726500749588
Epoch 50, Loss: 0.12053997814655304
Epoch 60, Loss: 0.09330885112285614
Epoch 70, Loss: 0.0744636133313179
Epoch 80, Loss: 0.060969337821006775
Epoch 90, Loss: 0.05101003497838974
Epoch 0, Loss: 0.04345971718430519
Epoch 10, Loss: 0.03759844973683357
Epoch 20, Loss: 0.032952744513750076
Epoch 30, Loss: 0.02920311689376831
Epoch 40, Loss: 0.026128022000193596
Epoch 50, Loss: 0.02357071079313755
Epoch 60, Loss: 0.02141736075282097
Epoch 70, Loss: 0.01958439312875271
Epoch 80, Loss: 0.018008815124630928
Epoch 90, Loss: 0.01664264127612114


../aten/src/ATen/native/cuda/Loss.cu:250: nll_loss_forward_reduce_cuda_kernel_2d: block: [0,0,0], thread: [0,0,0] Assertion `t >= 0 && t < n_classes` failed.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
