# In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Vocabulary class to handle mapping between words and numerical indices
class Vocabulary:
    """Two-way mapping between words and integer indices.

    Indices 0-3 are reserved for the special tokens ``<PAD>``, ``<SOS>``,
    ``<EOS>`` and ``<UNK>``. Every other word receives the next free index
    (starting at 4) the first time it is added.
    """

    def __init__(self):
        # Special tokens occupy the first four indices in both directions.
        self.word2index = {"<PAD>": 0, "<SOS>": 1, "<EOS>": 2, "<UNK>": 3}
        self.index2word = {0: "<PAD>", 1: "<SOS>", 2: "<EOS>", 3: "<UNK>"}
        self.word_count = {}  # word -> number of times it has been added
        self.n_words = 4  # next free index; 0-3 are taken by the special tokens

    def add_sentence(self, sentence):
        """Add every word of ``sentence`` (split on single spaces)."""
        for word in sentence.split(' '):
            self.add_word(word)

    def add_word(self, word):
        """Register ``word`` if it is new and increment its frequency count."""
        if word not in self.word2index:
            self.word2index[word] = self.n_words
            self.index2word[self.n_words] = word
            self.n_words += 1
        # The special tokens are pre-registered in word2index but have no
        # entry in word_count, so the count must default to zero rather than
        # assume the key already exists.
        self.word_count[word] = self.word_count.get(word, 0) + 1

def tokenize_and_pad(sentences, vocab):
    """Convert sentences to a padded tensor of token indices.

    Each sentence is split on single spaces, wrapped in ``<SOS>`` / ``<EOS>``
    and right-padded with ``<PAD>`` to the length of the longest sentence.

    Args:
        sentences: iterable of space-separated sentence strings.
        vocab: object exposing a ``word2index`` dict that contains the
            ``<PAD>``, ``<SOS>`` and ``<EOS>`` tokens (and normally ``<UNK>``).

    Returns:
        A ``torch.long`` tensor of shape ``(len(sentences), max_len + 2)``.
        For an empty ``sentences`` the result has shape ``(0, 2)``.

    Raises:
        KeyError: if a word is unknown and the vocabulary has no ``<UNK>``.
    """
    word2index = vocab.word2index
    pad = word2index["<PAD>"]
    sos = word2index["<SOS>"]
    eos = word2index["<EOS>"]
    unk = word2index.get("<UNK>")

    def encode(word):
        # Out-of-vocabulary words map to <UNK> when the vocabulary has one.
        index = word2index.get(word, unk)
        if index is None:
            raise KeyError(word)
        return index

    rows = [
        [sos] + [encode(word) for word in sentence.split(' ')] + [eos]
        for sentence in sentences
    ]

    # default=2 accounts for the SOS and EOS slots when there are no sentences.
    max_length = max(map(len, rows), default=2)
    if not rows:
        return torch.empty((0, max_length), dtype=torch.long)

    padded_rows = [row + [pad] * (max_length - len(row)) for row in rows]
    return torch.tensor(padded_rows, dtype=torch.long)

# Custom Dataset class for French to English sentences
class FrEnDataset(Dataset):
    """Dataset of French/English sentence pairs.

    On construction it builds one ``Vocabulary`` per language, converts all
    sentences to padded index tensors with ``tokenize_and_pad`` and creates a
    100-dimensional ``torch.nn.Embedding`` per language. Indexing returns the
    index tensors together with their embedding lookups.
    """

    def __init__(self, pairs):
        self.fr_vocab = Vocabulary()
        self.en_vocab = Vocabulary()

        # Keep a private copy of the pairs as (french, english) tuples.
        self.pairs = [(french, english) for french, english in pairs]

        # Grow both vocabularies from the sentences.
        for french, english in self.pairs:
            self.fr_vocab.add_sentence(french)
            self.en_vocab.add_sentence(english)

        # Per-language sentence lists, in pair order.
        self.fr_sentences = [french for french, _ in self.pairs]
        self.en_sentences = [english for _, english in self.pairs]

        # Padded index tensors, one row per sentence.
        self.fr_tokens = tokenize_and_pad(self.fr_sentences, self.fr_vocab)
        self.en_tokens = tokenize_and_pad(self.en_sentences, self.en_vocab)

        # One embedding table per language, sized to its vocabulary.
        embedding_dim = 100
        self.fr_embedding = torch.nn.Embedding(self.fr_vocab.n_words, embedding_dim)
        self.en_embedding = torch.nn.Embedding(self.en_vocab.n_words, embedding_dim)

    def __len__(self):
        """Number of sentence pairs."""
        return len(self.pairs)

    def __getitem__(self, idx):
        """Return ``(fr_tokens, en_tokens, fr_embeddings, en_embeddings)`` for ``idx``."""
        fr_ids = self.fr_tokens[idx]
        en_ids = self.en_tokens[idx]
        return fr_ids, en_ids, self.fr_embedding(fr_ids), self.en_embedding(en_ids)

# Sample dataset of French-English sentence pairs.
# Each entry is a (French sentence, English sentence) tuple whose words are
# separated by single spaces.
french_to_english = [
    ("J'ai froid", "I am cold"),
    ("Tu es fatigué", "You are tired"),
    ("Il a faim", "He is hungry"),
    ("Elle est heureuse", "She is happy"),
    ("Nous sommes amis", "We are friends"),
    ("Ils sont étudiants", "They are students"),
    ("Le chat dort", "The cat is sleeping"),
    ("Le soleil brille", "The sun is shining"),
    ("Nous aimons la musique", "We love music"),
    ("Elle parle français couramment", "She speaks French fluently")
    # Add more pairs as needed
]

# Build the dataset: this creates both vocabularies and the padded index
# tensors from the sample pairs above.
dataset_fr_en = FrEnDataset(french_to_english)

# Data loader that yields one sentence pair per batch, in shuffled order.
dataloader_fr_en = DataLoader(dataset_fr_en, batch_size=1, shuffle=True)
def train_fr_en(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion):
    """Run one optimisation step on a single sentence pair.

    The source sentence is fed to the encoder one token at a time. The
    decoder then starts from ``<SOS>`` and is unrolled for at most
    ``target_tensor.size(0)`` steps, stopping early once it predicts
    ``<EOS>``.

    Relies on the module-level ``device`` and on ``dataset_fr_en.en_vocab``
    for the ``<SOS>`` / ``<EOS>`` indices.

    Args:
        input_tensor: source token indices, indexed along dim 0.
        target_tensor: target token indices, indexed along dim 0.
        encoder: module providing ``initHidden()``, ``hidden_size`` and
            ``encoder(token, hidden) -> (output, hidden)``.
        decoder: module called as ``decoder(input, hidden, encoder_outputs)``
            and returning ``(log_probs, hidden, attention)``.
        encoder_optimizer: optimizer for the encoder parameters.
        decoder_optimizer: optimizer for the decoder parameters.
        criterion: loss applied to ``(log_probs, target)`` at every step.

    Returns:
        The accumulated loss divided by the target length, as a float.

    Raises:
        ValueError: if ``target_tensor`` is empty.
    """
    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)
    if target_length == 0:
        # Without any decoding step there is no loss tensor to backpropagate.
        raise ValueError("target_tensor must contain at least one token")

    # Fresh encoder state and cleared gradients for this sentence pair
    encoder_hidden = encoder.initHidden()
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    # Forward pass through the encoder, one source token at a time
    encoder_outputs = torch.zeros(input_length, encoder.hidden_size, device=device)
    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    en_word2index = dataset_fr_en.en_vocab.word2index
    sos_index = en_word2index['<SOS>']
    eos_index = en_word2index['<EOS>']

    # The decoder starts from <SOS> and the encoder's final hidden state
    decoder_input = torch.tensor([[sos_index]], device=device)
    decoder_hidden = encoder_hidden

    loss = 0

    # Free-running decoding (no teacher forcing): the decoder's own greedy
    # prediction is fed back as the next input.
    for di in range(target_length):
        decoder_output, decoder_hidden, _ = decoder(
            decoder_input, decoder_hidden, encoder_outputs)

        # Give the target the same leading (batch) shape as the log-probs,
        # e.g. (1,) for a (1, vocab) output; a bare 0-dim index does not
        # match a batched output.
        step_target = target_tensor[di].reshape(decoder_output.shape[:-1])
        loss += criterion(decoder_output, step_target)

        _, topi = decoder_output.topk(1)
        decoder_input = topi.squeeze().detach()  # detach from history as input
        if decoder_input.item() == eos_index:
            break

    # Backpropagation
    loss.backward()

    # Update parameters
    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length



class Encoder(nn.Module):
    """Single-layer LSTM encoder that consumes one token per call.

    Args:
        input_size: number of entries in the embedding table (source
            vocabulary size).
        hidden_size: size of both the embeddings and the LSTM state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)  # Embedding layer
        self.lstm = nn.LSTM(hidden_size, hidden_size)  # LSTM layer

    def forward(self, input, hidden):
        """Encode one token.

        Args:
            input: tensor expected to hold a single token index.
            hidden: ``(h, c)`` LSTM state tuple.

        Returns:
            ``(output, hidden)`` as produced by the LSTM for this one step.
        """
        # Move the token to wherever the recurrent state lives
        input = input.to(hidden[0].device)

        # Reshape the embedding to (seq_len=1, batch=1, hidden_size)
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.lstm(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return a zeroed ``(h, c)`` state on the module's own device."""
        # Derive device and dtype from the module's parameters so this does
        # not depend on a module-level ``device`` variable being defined.
        weight = self.embedding.weight
        return (weight.new_zeros(1, 1, self.hidden_size),
                weight.new_zeros(1, 1, self.hidden_size))


class Decoder(nn.Module):
    """Single-layer LSTM decoder that emits log-probabilities for one token.

    Args:
        hidden_size: size of both the embeddings and the LSTM state.
        output_size: number of output classes (target vocabulary size).
    """

    def __init__(self, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Decode one step.

        Args:
            input: tensor expected to hold a single token index.
            hidden: ``(h, c)`` LSTM state tuple.

        Returns:
            ``(log_probs, hidden)`` where ``log_probs`` has shape
            ``(1, output_size)``.
        """
        # Reshape the embedding to (seq_len=1, batch=1, hidden_size)
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.lstm(embedded, hidden)
        # output[0] drops the sequence dimension before the projection
        output = self.softmax(self.out(output[0]))
        return output, hidden

    def initHidden(self):
        """Return a zeroed ``(h, c)`` state on the module's own device."""
        # Allocate next to the parameters so the state follows the module
        # when it is moved to another device.
        weight = self.out.weight
        return (weight.new_zeros(1, 1, self.hidden_size),  # LSTM layer requires a tuple of hidden state and cell state
                weight.new_zeros(1, 1, self.hidden_size))


class Attention(nn.Module):
    """Additive attention over a sequence of encoder outputs.

    For every time step the decoder state and the encoder output are
    concatenated, passed through a linear layer and ``tanh``, and scored
    against the learned vector ``v``. Scores are normalised with a softmax
    over the time dimension.

    Args:
        hidden_size: size of the decoder state and of each encoder output.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.attn = nn.Linear(hidden_size * 2, hidden_size)
        self.v = nn.Parameter(torch.rand(hidden_size))

    def forward(self, hidden, encoder_outputs):
        """Compute attention weights.

        Args:
            hidden: decoder state of shape ``(1, batch, hidden_size)`` or
                ``(batch, hidden_size)``.
            encoder_outputs: tensor of shape ``(seq_len, batch, hidden_size)``;
                an unbatched ``(seq_len, hidden_size)`` tensor is treated as
                ``batch == 1``.

        Returns:
            Attention weights of shape ``(batch, 1, seq_len)`` that sum to 1
            along the last dimension.
        """
        if encoder_outputs.dim() == 2:
            # Unbatched encoder outputs: insert the batch dimension.
            encoder_outputs = encoder_outputs.unsqueeze(1)
        seq_len = encoder_outputs.size(0)

        # Pair the decoder state with every encoder time step.
        hidden = hidden.repeat(seq_len, 1, 1)
        energy = torch.tanh(self.attn(torch.cat((hidden, encoder_outputs), dim=2)))

        # Dot each energy vector with v: (seq_len, batch, hidden) @ (hidden,)
        # gives (seq_len, batch). Broadcasting v this way is valid for any
        # batch size, not only when batch happens to equal seq_len.
        attention_scores = torch.matmul(energy, self.v).transpose(0, 1)

        return torch.softmax(attention_scores, dim=1).unsqueeze(1)



# Model dimensions are taken from the vocabularies built by the dataset; each
# vocabulary size already includes the four special tokens
# (<PAD>, <SOS>, <EOS>, <UNK>).
input_size_fr_en = dataset_fr_en.fr_vocab.n_words
hidden_size_fr_en = 256  # Example hidden size
output_size_fr_en = dataset_fr_en.en_vocab.n_words

# Create the encoder and decoder instances
# NOTE(review): AttentionDecoder is not defined in this part of the file;
# presumably it comes from an earlier cell — confirm.
encoder_fr_en = Encoder(input_size_fr_en, hidden_size_fr_en)
decoder_fr_en = AttentionDecoder(hidden_size_fr_en, output_size_fr_en)

# Set the device to GPU if available, else CPU.
# `device` is also read as a global by train_fr_en and Encoder.initHidden,
# so it must be defined before the training loop runs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
encoder_fr_en = encoder_fr_en.to(device)
decoder_fr_en = decoder_fr_en.to(device)

# Set the learning rate for optimization
learning_rate_fr_en = 0.01

# Initializing optimizers for both encoder and decoder with Stochastic Gradient Descent (SGD)
encoder_optimizer_fr_en = optim.SGD(encoder_fr_en.parameters(), lr=learning_rate_fr_en)
decoder_optimizer_fr_en = optim.SGD(decoder_fr_en.parameters(), lr=learning_rate_fr_en)

# Negative Log Likelihood Loss function for calculating loss
criterion_fr_en = nn.NLLLoss()

# Set number of epochs for training
n_epochs_fr_en = 100

# Training loop: one optimisation step per sentence pair, n_epochs_fr_en passes
for epoch in range(n_epochs_fr_en):
    total_loss = 0
    # The embedding lookups returned by the dataset are not used here
    for fr_tokens, en_tokens, _, _ in dataloader_fr_en:
        # Drop the size-1 batch dimension and move tensors to the correct device
        fr_input_tensor = fr_tokens.squeeze().to(device)
        en_target_tensor = en_tokens.squeeze().to(device)

        # Perform a single training step and update total loss
        loss = train_fr_en(fr_input_tensor, en_target_tensor, encoder_fr_en, decoder_fr_en, encoder_optimizer_fr_en, decoder_optimizer_fr_en, criterion_fr_en)
        total_loss += loss

    # Print average loss after each epoch
    print(f'Epoch {epoch+1}/{n_epochs_fr_en}, Loss: {total_loss/len(dataloader_fr_en)}')

# Evaluate the model. The data loader passed here is the same one used for
# training; no separate validation split is created in this part of the file.
# NOTE(review): evaluate_fr_en is not defined in this part of the file;
# presumably it returns (input, target, predicted) triples — confirm.
evaluation_examples_fr_en = evaluate_fr_en(encoder_fr_en, decoder_fr_en, dataloader_fr_en, criterion_fr_en)

# Print the first five examples for a qualitative check
for example in evaluation_examples_fr_en[:5]:
    print(f'Input: {example[0]}')
    print(f'Target: {example[1]}')
    print(f'Predicted: {example[2]}\n')


# Cell output: RuntimeError: Tensors must have same number of dimensions: got 2 and 1