In [43]:
# Imports and Device Setup

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
import kagglehub
import os
from collections import Counter
import re

# Seed PyTorch's global RNG so that weight initialisation and DataLoader
# shuffling repeat on every "Restart & Run All". GPU kernels may still
# introduce small run-to-run differences.
SEED = 42
torch.manual_seed(SEED)

# Pick the best available backend: CUDA GPU, then Apple-silicon MPS, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: mps


In [44]:
# Download and load the Sherlock Holmes dataset into a DataFrame

# Fetch the corpus through kagglehub; `path` is the folder the download call returns.
path = kagglehub.dataset_download(
    "muhammadbilalhaneef/sherlock-holmes-next-word-prediction-corpus"
)
print("Dataset folder:", path)

# The text file inside the dataset folder is named "Sherlock Holmes.txt".
file_path = os.path.join(path, "Sherlock Holmes.txt")
with open(file_path, mode="r", encoding="utf-8") as corpus_file:
    lines = list(corpus_file)  # one entry per line, trailing newlines kept

# One DataFrame row per raw line of the file.
df = pd.DataFrame({"text": lines})
print(df.head())

Dataset folder: /Users/ma/.cache/kagglehub/datasets/muhammadbilalhaneef/sherlock-holmes-next-word-prediction-corpus/versions/1
                                                text
0                                                 \n
1                                                 \n
2                                                 \n
3                                                 \n
4                          THE ADVENTURES OF SHER...


In [45]:
# Tokenize text and create word-to-index mapping
# Glue all lines together, lower-case them, and pull out the word tokens.
text = " ".join(df["text"]).lower()
words = re.findall(r'\b\w+\b', text)

# Vocabulary ordered from most to least frequent word, so index 0 is the
# most common token in the corpus.
word_counts = Counter(words)
vocab = [word for word, _count in word_counts.most_common()]

# Forward (word -> id) and reverse (id -> word) lookup tables.
word2idx = {word: index for index, word in enumerate(vocab)}
idx2word = dict(enumerate(vocab))
print("Vocab size:", len(vocab))

Vocab size: 7901


In [46]:
# Prepare sequences of fixed length for next-word prediction
# Slide a window over the corpus: every run of `seq_length` word ids is one
# input, and the id that immediately follows the window is its target.
seq_length = 10
word_indices = [word2idx[token] for token in words]
num_windows = len(word_indices) - seq_length

X = torch.tensor(
    [word_indices[start:start + seq_length] for start in range(num_windows)],
    dtype=torch.long,
)
# The target of window `start` is the id at `start + seq_length`, i.e. the
# corpus with its first `seq_length` ids dropped.
y = torch.tensor(word_indices[seq_length:], dtype=torch.long)
print("Number of sequences:", len(X))

Number of sequences: 106022


In [47]:
# Wrap sequences into a PyTorch Dataset and DataLoader
class WordDataset(Dataset):
    """Map-style dataset that pairs each input sequence with its target."""

    def __init__(self, X, y):
        # X holds the inputs and y the matching targets; both are stored as given.
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of samples, taken from the length of ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at position ``idx``."""
        sample, target = self.X[idx], self.y[idx]
        return sample, target
# Serve the (sequence, target) pairs in shuffled mini-batches of 64.
dataset = WordDataset(X=X, y=y)
loader = DataLoader(dataset=dataset, shuffle=True, batch_size=64)

In [48]:
# Define LSTM model for next-word prediction
class LSTMModel(nn.Module):
    """Embedding -> multi-layer LSTM -> linear layer producing next-word logits."""

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        """Build the three sub-modules.

        Args:
            vocab_size: number of distinct tokens; sizes both the embedding
                table and the output layer.
            embed_size: dimensionality of each token embedding.
            hidden_size: width of the LSTM hidden state.
            num_layers: number of stacked LSTM layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.lstm = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x, hidden=None):
        """Score every vocabulary entry as the word following each sequence.

        Args:
            x: batch-first tensor of token ids, one row per sequence.
            hidden: optional initial LSTM state; ``None`` lets the LSTM start
                from its default state.

        Returns:
            ``(logits, hidden)`` where ``logits`` has one row per sequence and
            one column per vocabulary entry, and ``hidden`` is the state
            returned by the LSTM.
        """
        embedded = self.embedding(x)
        lstm_out, hidden = self.lstm(embedded, hidden)
        # Only the output at the final time step feeds the classifier.
        last_step = lstm_out[:, -1, :]
        logits = self.fc(last_step)
        return logits, hidden
# Hyperparameters: the vocabulary size sets both the embedding table size
# and the width of the output layer.
vocab_size = len(vocab)
embed_size, hidden_size, num_layers = 256, 512, 3

model = LSTMModel(
    vocab_size=vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
)
model = model.to(device)  # move the parameters onto the selected device

In [49]:
# Define loss function and optimizer
# Cross-entropy over the vocabulary logits, optimised with Adam for 50 epochs.
epochs = 50
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [50]:
# Train the LSTM model
# Each epoch makes one full pass over `loader` and prints the mean batch loss.
for epoch in range(epochs):
    # Put the model in training mode explicitly: if an earlier cell left it in
    # eval mode (e.g. text generation calls model.eval()), re-running this
    # cell would otherwise keep training in eval mode.
    model.train()
    total_loss = 0.0
    for inputs, targets in loader:
        # Batches come from the DataLoader; move them to the model's device.
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs, _ = model(inputs)  # the returned LSTM state is not needed here
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(loader):.4f}")

Epoch 1/50, Loss: 6.6210
Epoch 2/50, Loss: 6.4593
Epoch 3/50, Loss: 5.8445
Epoch 4/50, Loss: 5.4085
Epoch 5/50, Loss: 5.0538
Epoch 6/50, Loss: 4.6996
Epoch 7/50, Loss: 4.3029
Epoch 8/50, Loss: 3.8491
Epoch 9/50, Loss: 3.3550
Epoch 10/50, Loss: 2.8456
Epoch 11/50, Loss: 2.3545
Epoch 12/50, Loss: 1.9008
Epoch 13/50, Loss: 1.5013
Epoch 14/50, Loss: 1.1523
Epoch 15/50, Loss: 0.8635
Epoch 16/50, Loss: 0.6390
Epoch 17/50, Loss: 0.4805
Epoch 18/50, Loss: 0.3709
Epoch 19/50, Loss: 0.3220
Epoch 20/50, Loss: 0.2709
Epoch 21/50, Loss: 0.2483
Epoch 22/50, Loss: 0.2341
Epoch 23/50, Loss: 0.2202
Epoch 24/50, Loss: 0.2109
Epoch 25/50, Loss: 0.1998
Epoch 26/50, Loss: 0.1997
Epoch 27/50, Loss: 0.1876
Epoch 28/50, Loss: 0.1877
Epoch 29/50, Loss: 0.1806
Epoch 30/50, Loss: 0.1872
Epoch 31/50, Loss: 0.1673
Epoch 32/50, Loss: 0.1668
Epoch 33/50, Loss: 0.1868
Epoch 34/50, Loss: 0.1735
Epoch 35/50, Loss: 0.1605
Epoch 36/50, Loss: 0.1698
Epoch 37/50, Loss: 0.1638
Epoch 38/50, Loss: 0.1643
Epoch 39/50, Loss: 0.

In [None]:
# Generate text using the trained model
def predict_next(model, seed_text, next_words=10):
    """Greedily extend ``seed_text`` with ``next_words`` predicted words.

    The seed is lower-cased and tokenised with the same ``\\b\\w+\\b`` pattern
    used to build the training vocabulary, so punctuation attached to a seed
    word (e.g. ``"said,"``) no longer turns it into an unknown token. Words
    that are still missing from ``word2idx`` fall back to index 0 (the first
    vocabulary entry).

    Relies on the notebook-level ``word2idx``, ``idx2word`` and ``seq_length``.

    Args:
        model: network returning ``(logits, hidden)`` for a batch of id sequences.
        seed_text: text to continue.
        next_words: number of words to append.

    Returns:
        The lower-cased, whitespace-split seed words followed by the generated
        words, joined by single spaces.

    Raises:
        ValueError: if words are requested but ``seed_text`` contains no word
            token to condition on.
    """
    lowered = seed_text.lower()
    seed_words = lowered.split()
    # Model context: tokenised exactly like the training corpus.
    context = re.findall(r'\b\w+\b', lowered)
    if next_words > 0 and not context:
        raise ValueError("seed_text must contain at least one word")

    # Build inputs on whatever device the model lives on instead of relying
    # on a notebook global.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    generated = []
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(next_words):
                # Only the most recent `seq_length` tokens are fed to the model.
                window = [word2idx.get(w, 0) for w in context[-seq_length:]]
                batch = torch.tensor(window, dtype=torch.long, device=model_device).unsqueeze(0)
                logits, _hidden = model(batch)
                pred_idx = torch.argmax(logits, dim=1).item()
                next_word = idx2word[pred_idx]
                context.append(next_word)
                generated.append(next_word)
    finally:
        # Restore the mode the caller had, so generation does not silently
        # leave the model in eval mode.
        model.train(was_training)
    return " ".join(seed_words + generated)

# Demo: continue a short seed phrase by ten words.
print(predict_next(model, seed_text="sherlock holmes said", next_words=10))