In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np
import math
import random

In [17]:
# Tokenize & build vocabulary
class TextDataset(Dataset):
    def __init__(self, sentences, vocab=None, seq_len=5):
        self.seq_len = seq_len
        tokens = [word for s in sentences for word in s]
        if vocab is None:
            self.vocab = {w: i+2 for i, (w, _) in enumerate(Counter(tokens).most_common())}
            self.vocab['<PAD>'] = 0
            self.vocab['<UNK>'] = 1
        else:
            self.vocab = vocab
        self.inv_vocab = {i: w for w, i in self.vocab.items()}

        self.data = []
        for s in sentences:
            indexed = [self.vocab.get(w, 1) for w in s]
            for i in range(len(indexed) - seq_len):
                self.data.append((indexed[i:i+seq_len], indexed[i+seq_len]))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        x, y = self.data[idx]
        return torch.tensor(x), torch.tensor(y)

In [18]:
# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_dim=64, hidden_dim=128):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        emb = self.embedding(x)
        out, _ = self.lstm(emb)
        out = out[:, -1, :]  # Take output of last LSTM step
        return self.fc(out)

In [19]:
# Train function
def train_model(model, dataloader, criterion, optimizer, epochs=5):
    for epoch in range(epochs):
        total_loss = 0
        for x, y in dataloader:
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            out = model(x)
            loss = criterion(out, y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")

In [20]:
# Evaluate accuracy
def evaluate_detailed_lstm(model, dataset, k=3):
    model.eval()
    total = 0
    correct = 0
    top_k_correct = 0
    log_probs = []

    with torch.no_grad():
        for x, y in DataLoader(dataset, batch_size=64):
            x, y = x.to(device), y.to(device)
            outputs = model(x)  # shape: [batch, vocab_size]
            probs = torch.softmax(outputs, dim=1)

            # Top-1 accuracy
            preds = probs.argmax(dim=1)
            correct += (preds == y).sum().item()

            # Top-K accuracy
            top_k_preds = probs.topk(k, dim=1).indices  # shape: [batch, k]
            top_k_correct += sum([y[i].item() in top_k_preds[i] for i in range(len(y))])

            # Perplexity (negative log-likelihood)
            probs_for_targets = probs[range(len(y)), y] + 1e-10  # avoid log(0)
            log_probs.extend(torch.log(probs_for_targets).tolist())

            total += y.size(0)

    accuracy = correct / total if total > 0 else 0
    top_k_accuracy = top_k_correct / total if total > 0 else 0
    perplexity = math.exp(-sum(log_probs) / total) if total > 0 else float("inf")

    return accuracy, top_k_accuracy, perplexity


In [21]:
def load_conll2003_file(filepath):
    sentences = []
    with open(filepath, 'r', encoding='utf-8') as file:
        words = []
        for line in file:
            line = line.strip()
            if not line:
                if words:
                    sentences.append(words)
                    words = []
            else:
                parts = line.split()
                if len(parts) >= 1:
                    words.append(parts[0])
        if words:
            sentences.append(words)
    return sentences

In [22]:
# Main runner
def run_lstm_text_prediction():
    # Load data
    sentences = load_conll2003_file("conll2003/eng.train")
    split = int(0.8 * len(sentences))
    train_data = sentences[:split]
    test_data = sentences[split:]

    # Dataset
    train_dataset = TextDataset(train_data)
    test_dataset = TextDataset(test_data, vocab=train_dataset.vocab)

    # Model
    global device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = LSTMModel(vocab_size=len(train_dataset.vocab)).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    # Train
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    train_model(model, train_loader, criterion, optimizer, epochs=5)

    # Evaluate
    acc, top_k_acc, ppl = evaluate_detailed_lstm(model, test_dataset, k=3)
    print(f"\nLSTM Evaluation Metrics:")
    print(f"Accuracy         : {acc:.2%}")
    print(f"Top-3 Accuracy   : {top_k_acc:.2%}")
    print(f"Perplexity       : {ppl:.2f}")

In [23]:
if __name__ == "__main__":
    run_lstm_text_prediction()

Epoch 1, Loss: 11700.6387
Epoch 2, Loss: 9994.0346
Epoch 3, Loss: 8903.7285
Epoch 4, Loss: 7959.2099
Epoch 5, Loss: 7146.5678

LSTM Evaluation Metrics:
Accuracy         : 15.22%
Top-3 Accuracy   : 25.31%
Perplexity       : 1258.34
