In [11]:
import torch
import torch.nn as nn
import torch.optim as optim

In [12]:
# Toy labelled corpus: one example sentence per language, stored as
# (sentence, language code) tuples.
training_data = list(
    zip(
        [
            "This is an English sentence.",
            "Das ist ein deutscher Satz.",
            "Ceci est une phrase en français.",
            "Questa è una frase in italiano.",
        ],
        ["en", "de", "fr", "it"],
    )
)

In [13]:
# Build the vocabulary: the set of distinct whitespace-separated tokens that
# occur in the training sentences (punctuation stays attached to its token).
vocab = set()
for text, _label in training_data:
    vocab.update(text.split())
vocab_size = len(vocab)

In [14]:
# Define the model
class LanguageIdentifier(nn.Module):
    """Sequence classifier: word embedding -> LSTM -> linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        hidden_size: embedding dimension, also used as the LSTM hidden size.
        output_size: number of output scores produced by the final linear layer.
    """

    def __init__(self, vocab_size, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return the linear-layer scores for a tensor of word indices.

        The LSTM is created with its default (sequence-first) layout, so
        indexing its output with ``[-1]`` keeps only the last step along the
        first dimension before the linear layer is applied.
        """
        embedded = self.embedding(x)
        lstm_out, _state = self.lstm(embedded)
        last_step = lstm_out[-1]
        return self.fc(last_step)

In [15]:
# Hyperparameters
num_epochs = 100
learning_rate = 1e-3
hidden_size = 128
# One output score per distinct language label in the training data.
output_size = len({label for _, label in training_data})

In [16]:

# Create the loss function, the model, and the optimizer that updates the
# model's parameters.
criterion = nn.CrossEntropyLoss()
model = LanguageIdentifier(
    vocab_size=vocab_size,
    hidden_size=hidden_size,
    output_size=output_size,
)
optimizer = optim.Adam(params=model.parameters(), lr=learning_rate)

In [17]:

# Train the model

# Build the lookup tables once instead of rebuilding and scanning a list for
# every word of every sample. Enumerating `vocab` and the label set yields the
# same positions that `list(...).index(...)` would, so the indices stay
# consistent with any other cell that derives them that way.
word_to_index = {word: index for index, word in enumerate(vocab)}
languages = list(set([lang for _, lang in training_data]))
lang_to_index = {lang: index for index, lang in enumerate(languages)}

for epoch in range(num_epochs):
    # One optimizer step per sentence (batch size 1).
    for sentence, lang in training_data:
        # Convert the sentence to a tensor of word indices
        sentence_tensor = torch.tensor([word_to_index[word] for word in sentence.split()])

        # Class-index target of shape (1,). For a single correct class this
        # gives the same loss as a one-hot probability target, and it is
        # accepted by every PyTorch version of CrossEntropyLoss.
        lang_tensor = torch.tensor([lang_to_index[lang]])

        # Forward pass: unsqueeze(1) adds a batch dimension of size 1
        output = model(sentence_tensor.unsqueeze(1))
        loss = criterion(output, lang_tensor)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print the loss every 10 epochs (this is the loss of the last sentence
    # processed in the epoch, not an average over the epoch)
    if epoch % 10 == 0:
        print(f"Epoch {epoch}: Loss = {loss.item()}")

Epoch 0: Loss = 1.5032371282577515
Epoch 10: Loss = 0.04966847598552704
Epoch 20: Loss = 0.009631828404963017
Epoch 30: Loss = 0.005111957434564829
Epoch 40: Loss = 0.0032707550562918186
Epoch 50: Loss = 0.002302140463143587
Epoch 60: Loss = 0.0017265664646402001
Epoch 70: Loss = 0.0013546108966693282
Epoch 80: Loss = 0.0010986251290887594
Epoch 90: Loss = 0.0009139174944721162


In [18]:
# Test the model
test_sentence = "est français."
test_words = test_sentence.split()

# Same word -> index mapping that list(vocab).index(word) produces, built once.
word_to_index = {word: index for index, word in enumerate(vocab)}

# The embedding table only has rows for training words, so fail with a clear
# message (still a ValueError) instead of an opaque "x is not in list".
unknown_words = [word for word in test_words if word not in word_to_index]
if unknown_words:
    raise ValueError(f"Words not in the training vocabulary: {unknown_words}")

test_sentence_tensor = torch.tensor([word_to_index[word] for word in test_words])

# Inference only: no gradients are needed, so skip building the autograd graph.
with torch.no_grad():
    output = model(test_sentence_tensor.unsqueeze(1))

# Map the highest-scoring output position back to its language label, using the
# same label ordering expression as the training cell.
predicted_lang = list(set([lang for _, lang in training_data]))[torch.argmax(output).item()]
print(f"Predicted language: {predicted_lang}")

Predicted language: fr
