In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import random

In [None]:
# Toy parallel corpus: each entry is (English sentence, French translation).
data = [
    ("hello", "bonjour"),
    ("how are you", "comment ça va"),
    ("I love you", "je t'aime"),
    ("thank you", "merci"),
    ("good morning", "bonjour"),
    ("good night", "bonne nuit"),
    ("see you later", "à plus tard"),
]

# Split the pairs into two parallel tuples: sources and their translations.
input_sentences = tuple(english for english, _french in data)
target_sentences = tuple(french for _english, french in data)

# Display both tuples as the cell output.
(input_sentences, target_sentences)

In [None]:
#create vocab
def build_vocab(data, pad_token="<pad>"):
    """Build word <-> index lookup tables for a collection of sentences.

    Index 0 is reserved for ``pad_token`` so that the padding value 0 never
    collides with a real word. Real words are numbered from 1 in sorted
    order, which makes the mapping identical on every run (iterating a
    ``set`` directly gives an order that can change between interpreter
    sessions).

    Args:
        data: Iterable of whitespace-separated sentences (strings).
        pad_token: Symbol stored at index 0.

    Returns:
        A tuple ``(word_to_idx, idx_to_word, vocab_size)`` where
        ``vocab_size`` counts the padding entry plus every distinct word.
    """
    # Drop a literal occurrence of the pad symbol so it cannot get two indices.
    words = sorted(set(" ".join(data).split()) - {pad_token})

    # The pad entry is inserted first, so it is also the first dict item.
    word_to_idx = {pad_token: 0}
    word_to_idx.update((word, idx) for idx, word in enumerate(words, start=1))
    idx_to_word = {idx: word for word, idx in word_to_idx.items()}

    return word_to_idx, idx_to_word, len(word_to_idx)



# Build one vocabulary per language. Each call returns
# (word -> index mapping, index -> word mapping, vocabulary size).
input_vocab, input_idx_to_word, input_vocab_size = build_vocab(data=input_sentences)
target_vocab, target_idx_to_word, target_vocab_size = build_vocab(data=target_sentences)


# Encode sentences
def encode_sentence(sentence, vocab):
    """Convert a whitespace-separated sentence into a list of vocabulary indices.

    A trailing 0 (the padding value) is always appended after the word indices.

    Args:
        sentence: The sentence to encode.
        vocab: Mapping from word to integer index.

    Returns:
        List of indices, one per word, followed by a single 0.

    Raises:
        KeyError: If a word of ``sentence`` is missing from ``vocab``.
    """
    token_ids = []
    for word in sentence.split():
        token_ids.append(vocab[word])
    token_ids.append(0)  # trailing padding value
    return token_ids

# Turn every sentence into a list of indices (each list ends with a 0).
input_encoded = [encode_sentence(text, input_vocab) for text in input_sentences]
target_encoded = [encode_sentence(text, target_vocab) for text in target_sentences]

# Longest encoded sequence on each side; all rows are padded to this width.
max_input_len = max(map(len, input_encoded))
max_target_len = max(map(len, target_encoded))

# Right-pad with 0 so every row has the same length and can be stacked
# into a rectangular tensor.
input_encoded = [ids + [0] * (max_input_len - len(ids)) for ids in input_encoded]
target_encoded = [ids + [0] * (max_target_len - len(ids)) for ids in target_encoded]

In [None]:
# Create Dataset and DataLoader
class TranslationDataset(Dataset):
    """Dataset of paired (source, target) index sequences stored as long tensors."""

    def __init__(self, X, y):
        """Convert the source rows ``X`` and target rows ``y`` to ``torch.long`` tensors."""
        source_ids = torch.tensor(X, dtype=torch.long)
        target_ids = torch.tensor(y, dtype=torch.long)
        self.X, self.y = source_ids, target_ids

    def __len__(self):
        """Return the number of target rows."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(source, target)`` pair stored at position ``idx``."""
        return (self.X[idx], self.y[idx])

# Wrap the padded index sequences and serve them in shuffled mini-batches of two.
dataset = TranslationDataset(X=input_encoded, y=target_encoded)
train_loader = DataLoader(dataset=dataset, shuffle=True, batch_size=2)

In [None]:
# Define Encoder
class Encoder(nn.Module):
    """Embeds a batch of source index sequences and runs them through an LSTM."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        """Create the embedding table and a batch-first LSTM.

        Args:
            vocab_size: Number of rows in the embedding table.
            embedding_dim: Size of each embedding vector (the LSTM input size).
            hidden_dim: Size of the LSTM hidden state.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)

    def forward(self, x):
        """Return the LSTM's final ``(hidden, cell)`` state for the index batch ``x``."""
        # Per-step outputs are discarded; only the final state is passed on.
        _, final_state = self.lstm(self.embedding(x))
        hidden, cell = final_state
        return hidden, cell

# Define Decoder
class Decoder(nn.Module):
    """Embeds target indices, advances an LSTM, and projects its outputs to vocabulary scores."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        """Create the embedding table, a batch-first LSTM, and the output projection.

        Args:
            vocab_size: Number of embedding rows and of output scores per step.
            embedding_dim: Size of each embedding vector (the LSTM input size).
            hidden_dim: Size of the LSTM hidden state.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, hidden, cell):
        """Run the LSTM over ``x`` starting from ``(hidden, cell)``.

        Returns:
            ``(scores, hidden, cell)``: the linear projection of the LSTM
            outputs, and the updated LSTM state.
        """
        lstm_out, (next_hidden, next_cell) = self.lstm(self.embedding(x), (hidden, cell))
        scores = self.fc(lstm_out)
        return scores, next_hidden, next_cell

In [None]:
# Define Seq2Seq Model
class Seq2Seq(nn.Module):
    """Couples an encoder and a decoder and decodes with teacher forcing."""

    def __init__(self, encoder, decoder):
        """Store the ``encoder`` and ``decoder`` sub-modules."""
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt):
        """Encode ``src``, then decode one step per remaining column of ``tgt``.

        The decoder is fed column 0 of ``tgt`` first and, at every later step,
        the next ground-truth column (teacher forcing). The last column is
        never used as a decoder input.

        Returns:
            The per-step decoder outputs concatenated along dimension 1, one
            step for each column of ``tgt`` after the first.
        """
        hidden, cell = self.encoder(src)

        step_outputs = []
        decoder_input = tgt[:, 0].unsqueeze(1)  # first ground-truth token
        for next_token in tgt[:, 1:].unbind(dim=1):
            step_output, hidden, cell = self.decoder(decoder_input, hidden, cell)
            step_outputs.append(step_output)
            # Teacher forcing: feed the true token, not the model's prediction.
            decoder_input = next_token.unsqueeze(1)

        return torch.cat(step_outputs, dim=1)

In [None]:
# Hyperparameters
embedding_dim = 10
hidden_dim = 20
n_epochs = 5
random_seed = 42

# Seed PyTorch's global random number generator before any weights are
# created, so that parameter initialisation (and shuffling that draws from
# the global generator) repeats on every Restart & Run All.
torch.manual_seed(random_seed)

# Initialize models: the encoder reads the source vocabulary, the decoder
# scores the target vocabulary.
encoder = Encoder(input_vocab_size, embedding_dim, hidden_dim)
decoder = Decoder(target_vocab_size, embedding_dim, hidden_dim)
model = Seq2Seq(encoder, decoder)

# Loss and Optimizer
criterion = nn.CrossEntropyLoss(ignore_index=0)  # targets equal to 0 (padding) are ignored
optimizer = optim.Adam(model.parameters(), lr=0.01)

In [None]:
# Training Loop
for epoch in range(n_epochs):
    model.train()
    total_loss = 0.0
    counted_batches = 0  # batches that actually contributed a loss value

    for src, tgt in train_loader:
        # The model predicts positions 1..T-1, so the targets are the
        # sequence shifted left by one step.
        targets = tgt[:, 1:].contiguous().view(-1)

        # A batch made only of single-word targets has nothing but ignored
        # (padding) targets after the shift. The mean-reduced cross entropy
        # of such a batch is NaN and would poison the epoch average, so the
        # batch is skipped.
        if not (targets != criterion.ignore_index).any():
            continue

        optimizer.zero_grad()
        output = model(src, tgt)
        loss = criterion(output.view(-1, output.size(-1)), targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        counted_batches += 1

    # Average over the batches that were used; NaN if every batch was skipped.
    mean_loss = total_loss / counted_batches if counted_batches else float("nan")
    print(f'Epoch [{epoch + 1}/{n_epochs}], Loss: {mean_loss:.4f}')

In [None]:
# Function to translate an input sentence
def translate(sentence):
    """Greedily decode a translation of ``sentence`` with the trained model.

    The sentence is encoded with ``input_vocab``, right-padded with 0 to
    ``max_input_len`` and passed through the encoder. The decoder is then run
    one step at a time, feeding back its own most likely token, until it
    emits index 0 or ``max_target_len - 1`` tokens have been produced.

    Args:
        sentence: Whitespace-separated source sentence.

    Returns:
        The predicted target words joined by single spaces (possibly empty).

    Raises:
        KeyError: If a word of ``sentence`` is not in ``input_vocab``.
    """
    model.eval()
    encoded_input = encode_sentence(sentence, input_vocab)
    padded_input = encoded_input + [0] * (max_input_len - len(encoded_input))
    input_tensor = torch.tensor(padded_input, dtype=torch.long).unsqueeze(0)

    # Index 0 is the padding value and doubles as the stop signal below.
    stop_idx = 0
    # The first decoder input. It used to be read from the first entry of
    # ``input_vocab`` (the wrong vocabulary), whose index is 0, so decoding
    # starts from the same value as before, now stated explicitly.
    # NOTE(review): training feeds the first ground-truth target word as the
    # first decoder input, so the model never sees a dedicated start token.
    # Adding a <SOS> token during preprocessing would align training and
    # inference.
    start_idx = 0

    predicted_words = []
    with torch.no_grad():
        hidden, cell = model.encoder(input_tensor)
        decoder_input = torch.tensor([[start_idx]], dtype=torch.long)

        for _ in range(max_target_len - 1):
            output, hidden, cell = model.decoder(decoder_input, hidden, cell)
            predicted_idx = output[:, -1, :].argmax(dim=-1).item()
            if predicted_idx == stop_idx:  # Stop if we hit padding
                break
            # The decoder can score an index that has no vocabulary entry;
            # fall back to a placeholder instead of raising KeyError.
            predicted_words.append(target_idx_to_word.get(predicted_idx, "<unk>"))
            decoder_input = torch.tensor([[predicted_idx]], dtype=torch.long)

    return ' '.join(predicted_words)

In [None]:
# Try the model on one of the training sentences and show its output.
test_sentence = "see you later"
translated_sentence = translate(sentence=test_sentence)
print(f'Translation of "{test_sentence}": "{translated_sentence}"')