### Text classification using LSTM

In this coding exercise, you will create a simple LSTM model using PyTorch to perform text classification on a dataset of short phrases. Your task is to fill in the missing parts of the code marked with `# TODO`.

You need to:

- Create a vocabulary to represent words as indices.
- Tokenize, encode, and pad the phrases.
- Convert the phrases and categories to PyTorch tensors.
- Instantiate the LSTM model with the vocabulary size, embedding dimensions, hidden dimensions, and output dimensions.
- Define the loss function and optimizer.
- Train the model for a number of epochs.
- Test the model on new phrases and print the category predictions.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Phrases (textual data) and their category labels (0 for sports, 1 for technology, 2 for food).
# Note: this dataset is far too small to train an LSTM realistically. Feel free to use
# a relevant data source or create your own dummy data for this exercise.
phrases = ["great goal scored", "amazing touchdown", "new phone release", "latest laptop model", "tasty pizza", "delicious burger"]
categories = [0, 0, 1, 1, 2, 2]

# Create a vocabulary to represent words as indices.
# Index 0 is reserved for the padding token; every other word receives the next
# free index in order of first appearance. Deriving the mapping from `phrases`
# (instead of hard-coding it) means that editing the data cannot leave a word
# without an index and trigger a KeyError while encoding.
vocab = {"<PAD>": 0}
for phrase in phrases:
    for word in phrase.split():
        # len(vocab) is evaluated before insertion, so it is the next unused index.
        vocab.setdefault(word, len(vocab))

# Tokenize (whitespace split), encode each word as its index, and right-pad
# every phrase with the <PAD> index up to the length of the longest phrase.
encoded_phrases = [[vocab[word] for word in phrase.split()] for phrase in phrases]
max_length = max(len(encoded) for encoded in encoded_phrases)
padded_phrases = [encoded + [vocab["<PAD>"]] * (max_length - len(encoded)) for encoded in encoded_phrases]

# Convert phrases and categories to PyTorch tensors.
# `inputs` has one row per phrase and `max_length` columns; both tensors are int64.
inputs = torch.tensor(padded_phrases, dtype=torch.long)
labels = torch.tensor(categories, dtype=torch.long)

In [3]:
# Define LSTM model
class PhraseClassifier(nn.Module):
    """Embedding -> single-layer LSTM -> linear layer producing class logits.

    Args:
        vocab_size: Number of entries in the embedding table.
        embedding_dim: Size of each token embedding (LSTM input size).
        hidden_dim: Size of the LSTM hidden state (linear layer input size).
        output_dim: Number of logits returned per sequence.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # No batch_first argument: the LSTM reads its input as (seq_len, batch, features).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return logits computed from the LSTM's final hidden state.

        Args:
            x: Integer tensor of token indices, sequence axis first
                (seq_len, batch), because the LSTM is not batch_first.

        Returns:
            Tensor of logits with one row per sequence in the batch.
        """
        token_vectors = self.embedding(x)
        # Per-step outputs and the final cell state are not needed for classification.
        _, (final_hidden, _) = self.lstm(token_vectors)
        # Drop the leading layer axis (size 1 for this single-layer LSTM).
        return self.fc(final_hidden.squeeze(0))

In [5]:
# Instantiate model and define loss and optimizer.
# The embedding table needs one row per vocabulary entry (including <PAD>);
# output_dim=3 matches the three category labels.
model = PhraseClassifier(len(vocab), embedding_dim=10, hidden_dim=20, output_dim=3)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model for a number of epochs.
# `inputs` holds one phrase per row, but the model's LSTM is not batch_first,
# so the batch is transposed to put the sequence axis first. The transpose does
# not change between epochs, so it is done once outside the loop.
epochs = 1000
train_batch = inputs.t()
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    predictions = model(train_batch)
    loss = criterion(predictions, labels)
    loss.backward()
    optimizer.step()

    if (epoch + 1) % 100 == 0:
        print(f"Epoch: {epoch + 1}, Loss: {loss.item()}")

# Test the model on new phrases.
model.eval()
with torch.no_grad():
    test_phrases = ["incredible match", "newest gadget", "yummy cake"]
    pad_index = vocab["<PAD>"]

    # Words missing from the vocabulary are encoded as <PAD>. Report them rather
    # than substituting silently: a phrase made only of unknown words becomes an
    # all-padding sequence, and all such phrases get the same prediction.
    unknown_words = sorted({word for phrase in test_phrases for word in phrase.split() if word not in vocab})
    if unknown_words:
        print(f"Warning: words not in the vocabulary are encoded as <PAD>: {unknown_words}")

    encoded_test_phrases = [[vocab.get(word, pad_index) for word in phrase.split()] for phrase in test_phrases]
    # Pad to the training length, or further if a test phrase is longer, so every
    # row has the same length and the list can always be turned into a tensor.
    test_length = max([max_length] + [len(phrase) for phrase in encoded_test_phrases])
    padded_test_phrases = [phrase + [pad_index] * (test_length - len(phrase)) for phrase in encoded_test_phrases]
    test_inputs = torch.tensor(padded_test_phrases, dtype=torch.long)
    test_predictions = torch.argmax(model(test_inputs.t()), dim=1)
    print("Test predictions:", test_predictions)

Epoch: 100, Loss: 0.346204549074173
Epoch: 200, Loss: 0.043716758489608765
Epoch: 300, Loss: 0.01583930291235447
Epoch: 400, Loss: 0.008529440499842167
Epoch: 500, Loss: 0.005532508250325918
Epoch: 600, Loss: 0.003964465111494064
Epoch: 700, Loss: 0.0030175100546330214
Epoch: 800, Loss: 0.0023911702446639538
Epoch: 900, Loss: 0.0019502732902765274
Epoch: 1000, Loss: 0.0016255333321169019
Test predictions: tensor([0, 0, 0])
