In [14]:
!pip install torch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import string
from collections import Counter
from sklearn.model_selection import train_test_split



In [15]:
# Load the corpus and normalise it: lowercase, strip punctuation, collapse
# whitespace, then split into a flat list of word tokens.
with open("book.txt", encoding="utf-8") as f:
    text = f.read().lower()

# string.punctuation must be escaped before it is placed inside [...]:
# interpolated raw, its "\]" pair is parsed as an escaped "]", which leaves the
# backslash itself out of the character class so it was never stripped.
# NOTE(review): string.punctuation is ASCII-only; typographic quotes and dashes
# in the book survive this step and stay attached to words — confirm intended.
text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
text = re.sub(r'\s+', ' ', text).strip()
words = text.split()

print(f"Total words: {len(words)}")

Total words: 107585


In [16]:
# Frequency-ranked vocabulary: index 0 is the most common word in the corpus.
# Words with equal counts keep their first-seen order.
word_counts = Counter(words)
vocab = [token for token, _count in word_counts.most_common()]

# Forward (word -> id) and reverse (id -> word) lookup tables.
word_to_idx = dict(zip(vocab, range(len(vocab))))
idx_to_word = dict(enumerate(vocab))
VOCAB_SIZE = len(vocab)

In [17]:
# Number of context words fed to the model; each training row holds
# SEQUENCE_LENGTH context ids followed by the id of the word to predict.
SEQUENCE_LENGTH = 5

# Encode the corpus once, then slide a window of SEQUENCE_LENGTH + 1 ids over it
# with stride 1.
encoded_words = [word_to_idx[word] for word in words]
window_size = SEQUENCE_LENGTH + 1
sequences = np.array([
    encoded_words[start:start + window_size]
    for start in range(len(encoded_words) - SEQUENCE_LENGTH)
])

# Last column is the target, everything before it is the context.
X = sequences[:, :-1]
y = sequences[:, -1]

# NOTE(review): windows overlap, so a shuffled split places near-identical
# contexts in both partitions — confirm before relying on held-out metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [18]:
class WordDataset(Dataset):
    """Pairs of (context word ids, next word id) exposed as a torch Dataset.

    Args:
        X: 2-D array-like of integer word ids, one context window per row.
        y: 1-D array-like of integer word ids, one target per row of ``X``.

    Raises:
        ValueError: if ``X`` and ``y`` do not contain the same number of rows.
    """

    def __init__(self, X, y):
        # torch.tensor(..., dtype=torch.long) is the documented way to build a
        # tensor from existing data (the typed torch.LongTensor constructor is a
        # legacy API). It always copies, so the dataset never aliases the
        # caller's arrays.
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)
        # Fail fast on misaligned inputs instead of raising an IndexError (or
        # silently ignoring surplus targets) later inside a DataLoader worker.
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same length, got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Number of (context, target) samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(context_ids, target_id)`` tensors for sample ``idx``."""
        return self.X[idx], self.y[idx]

# Wrap both splits as datasets, then as mini-batch loaders. Only the training
# loader shuffles; the test loader iterates in stored order.
BATCH_SIZE = 128

train_dataset = WordDataset(X_train, y_train)
test_dataset = WordDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [19]:
class NextWordModel(nn.Module):
    """Embedding -> single-layer LSTM -> linear head over the vocabulary.

    Args:
        vocab_size: number of distinct word ids (embedding rows and output logits).
        embed_dim: width of each word embedding.
        hidden_dim: width of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # batch_first=True: the LSTM consumes (batch, seq, feature) tensors.
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Map a (batch, seq) tensor of word ids to (batch, vocab_size) logits."""
        embedded = self.embedding(x)
        hidden_states, _final_state = self.lstm(embedded)
        # Only the hidden state at the final timestep feeds the classifier.
        last_step = hidden_states[:, -1, :]
        return self.fc(last_step)

# Model hyper-parameters.
EMBED_DIM = 50
HIDDEN_DIM = 128

model = NextWordModel(VOCAB_SIZE, embed_dim=EMBED_DIM, hidden_dim=HIDDEN_DIM)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Kept as the cell's last expression so the notebook still shows the module summary.
model.to(device)

NextWordModel(
  (embedding): Embedding(10409, 50)
  (lstm): LSTM(50, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=10409, bias=True)
)

In [20]:
# Cross-entropy over the vocabulary logits, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

EPOCHS = 10
batches_per_epoch = len(train_loader)

for epoch_number in range(1, EPOCHS + 1):
    model.train()
    running_loss = 0
    for contexts, targets in train_loader:
        contexts = contexts.to(device)
        targets = targets.to(device)

        # Standard step: clear old gradients, forward, backward, update.
        optimizer.zero_grad()
        batch_loss = criterion(model(contexts), targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    # Reported value is the mean of the per-batch mean losses.
    print(f"Epoch {epoch_number}, Loss: {running_loss/batches_per_epoch:.4f}")

Epoch 1, Loss: 6.7638
Epoch 2, Loss: 6.0119
Epoch 3, Loss: 5.6125
Epoch 4, Loss: 5.2846
Epoch 5, Loss: 4.9884
Epoch 6, Loss: 4.7190
Epoch 7, Loss: 4.4679
Epoch 8, Loss: 4.2329
Epoch 9, Loss: 4.0128
Epoch 10, Loss: 3.8052


In [22]:
def predict_next_word(model, seed_text, top_k=3, *, vocab=None, index_to_word=None, seq_len=None):
    """Return the ``top_k`` most likely next words after ``seed_text``.

    The seed is normalised like the training corpus (lowercased, ASCII
    punctuation stripped) so that e.g. "Holmes," matches the vocabulary entry
    "holmes". Words that are not in the vocabulary are dropped: the vocabulary
    has no dedicated unknown/padding id, and index 0 is a real (the most
    frequent) word, so substituting or padding with 0 would feed the model
    words the user never typed. For the same reason short seeds are passed at
    their natural length instead of being left-padded; this requires a model
    that accepts variable-length sequences, as the LSTM defined in this
    notebook does.

    Args:
        model: module mapping a (1, seq) LongTensor of word ids to (1, vocab) scores.
        seed_text: free text whose continuation should be predicted.
        top_k: number of candidate words to return.
        vocab: word -> id mapping; defaults to the notebook-level ``word_to_idx``.
        index_to_word: id -> word mapping; defaults to the notebook-level ``idx_to_word``.
        seq_len: maximum number of trailing seed words to use; defaults to the
            notebook-level ``SEQUENCE_LENGTH``.

    Returns:
        List of words ordered from most to least likely. Empty when the seed
        contains no known word or when ``top_k`` is not positive.
    """
    vocab = word_to_idx if vocab is None else vocab
    index_to_word = idx_to_word if index_to_word is None else index_to_word
    seq_len = SEQUENCE_LENGTH if seq_len is None else seq_len

    model.eval()

    cleaned = seed_text.lower().translate(str.maketrans("", "", string.punctuation))
    tokens = [vocab[word] for word in cleaned.split() if word in vocab]
    tokens = tokens[-seq_len:]
    if not tokens or top_k <= 0:
        return []

    # Place the input wherever the model's weights live instead of relying on a
    # notebook-level ``device`` variable; parameter-less models run on the CPU.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    input_tensor = torch.tensor([tokens], dtype=torch.long, device=target_device)

    with torch.no_grad():
        scores = model(input_tensor).flatten()

    # Softmax is monotonic, so ranking the raw scores gives the same order.
    # k is clamped because torch.topk rejects k larger than the input size.
    k = min(top_k, scores.numel())
    best_ids = torch.topk(scores, k).indices.tolist()
    return [index_to_word[i] for i in best_ids]

# Ask for a seed phrase and show the model's most likely continuations.
seed = input("Insert Seed: ")
top_predictions = predict_next_word(model, seed)
print("Top predictions:", top_predictions)

Insert Seed:  To Sherlock Holmes


Top predictions: ['was', 'had', 'and']
