In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import nltk
import re
from nltk.corpus import gutenberg
from collections import Counter
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader




In [3]:
# Make sure the Gutenberg corpus is available locally so this cell also works
# on a fresh environment; nothing is downloaded when the data is already there.
try:
    nltk.data.find("corpora/gutenberg")
except LookupError:
    nltk.download("gutenberg")

# Join every token of the corpus into one space-separated string.
corpus = " ".join(gutenberg.words())

def clean_text(text):
    """Normalise raw text into a list of lowercase alphabetic tokens.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lowercased, and the string is split on whitespace.

    Args:
        text: Raw input string.

    Returns:
        list[str]: Lowercase tokens consisting only of the letters a-z.
    """
    letters_only = re.sub(r"[^a-zA-Z]", " ", text)
    return letters_only.lower().split()

# Token list for the whole corpus (lowercase, letters only).
cleaned_corpus = clean_text(text=corpus)


In [4]:
MAX_WORDS = 5000  # vocabulary size, counting index 0 which stands for unknown words
SEQ_LENGTH = 5  # number of context words used to predict the next word

# Keep the MAX_WORDS - 1 most frequent words. Ids start at 1 so that 0 can
# represent any word outside the vocabulary.
word_counts = Counter(cleaned_corpus)
vocab = [word for word, _ in word_counts.most_common(MAX_WORDS - 1)]
word_to_index = {word: idx + 1 for idx, word in enumerate(vocab)}

# Encode the corpus once (a single dictionary lookup per token) instead of
# re-encoding every word for each window it appears in.
encoded_corpus = np.fromiter(
    (word_to_index.get(word, 0) for word in cleaned_corpus),
    dtype=np.int64,
    count=len(cleaned_corpus),
)

# Each row is a window of SEQ_LENGTH context ids followed by the target id,
# giving len(cleaned_corpus) - SEQ_LENGTH rows in corpus order.
# sliding_window_view (NumPy >= 1.20) returns a read-only view, so copy it to
# obtain an ordinary writable array.
sequences = np.lib.stride_tricks.sliding_window_view(
    encoded_corpus, SEQ_LENGTH + 1
).copy()


In [5]:
# Split every window into its context (all ids but the last) and its target
# (the last id), converting both to int64 tensors.
X = torch.tensor(sequences[:, :-1], dtype=torch.long)
y = torch.tensor(sequences[:, -1], dtype=torch.long)


In [6]:
class TextDataset(Dataset):
    """Map-style dataset pairing each input with the target at the same position.

    Args:
        X: Indexable collection of inputs.
        y: Indexable collection of targets, aligned with ``X`` by position.
    """

    def __init__(self, X, y):
        self.X, self.y = X, y

    def __len__(self):
        """Return the number of samples, taken from the length of ``X``."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` pair stored at position ``idx``."""
        sample, target = self.X[idx], self.y[idx]
        return sample, target

# Hold out 20% of the windows for evaluation; the fixed random_state keeps the
# split repeatable.
# NOTE(review): consecutive windows overlap, so a random split places
# near-identical contexts on both sides — consider splitting the corpus before
# building the windows if an unbiased test score is needed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

BATCH_SIZE = 64

# Wrap each split in a dataset, then in a loader; only training batches are
# shuffled.
train_dataset, test_dataset = TextDataset(X_train, y_train), TextDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)


In [7]:
class LSTMModel(nn.Module):
    """Next-word predictor: embedding -> LSTM -> linear layer over the vocabulary.

    Args:
        vocab_size: Number of distinct token ids; also the width of the output layer.
        embedding_dim: Width of each token embedding.
        hidden_dim: Width of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim
        )
        self.lstm = nn.LSTM(
            input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x):
        """Return unnormalised vocabulary scores for each input sequence.

        Args:
            x: Integer tensor of token ids with the batch dimension first
                (the LSTM is built with ``batch_first=True``).

        Returns:
            Tensor holding one row of ``vocab_size`` scores per sequence.
        """
        embedded = self.embedding(x)
        _, (final_hidden, _) = self.lstm(embedded)
        # final_hidden[-1] is the hidden state of the last LSTM layer after
        # the final time step.
        logits = self.fc(final_hidden[-1])
        return logits


In [9]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

Using device: cuda


In [10]:
def train_model(model, train_loader, epochs=5, lr=0.001):
    """Train ``model`` with Adam and cross-entropy loss, printing the loss per epoch.

    The model and every batch are moved to the module-level ``device``.

    Args:
        model: Module mapping a batch of inputs to class scores.
        train_loader: Iterable of ``(inputs, targets)`` batches.
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate passed to the Adam optimizer.

    Returns:
        list[float]: Mean per-sample training loss of each epoch, in order.

    Raises:
        ValueError: If ``train_loader`` yields no samples.
    """
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    epoch_losses = []

    for epoch in range(epochs):
        model.train()
        loss_sum = 0.0
        sample_count = 0
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            loss = criterion(model(X_batch), y_batch)
            loss.backward()
            optimizer.step()

            # The criterion returns the batch mean; weight it by the batch
            # size so a smaller final batch does not count as much as a full
            # one in the epoch average.
            batch_size = y_batch.size(0)
            loss_sum += loss.item() * batch_size
            sample_count += batch_size

        if sample_count == 0:
            raise ValueError("train_loader yielded no samples")

        epoch_loss = loss_sum / sample_count
        epoch_losses.append(epoch_loss)
        print(f"Epoch {epoch+1}, Loss: {epoch_loss:.4f}")

    return epoch_losses

# Seed PyTorch's global RNG so that weight initialisation and batch shuffling
# are repeatable after a kernel restart (GPU kernels may still introduce small
# run-to-run differences).
torch.manual_seed(42)

lstm_model = LSTMModel(vocab_size=MAX_WORDS, embedding_dim=64, hidden_dim=128)
train_model(lstm_model, train_loader)



  from .autonotebook import tqdm as notebook_tqdm


Epoch 1, Loss: 5.0130
Epoch 2, Loss: 4.6727
Epoch 3, Loss: 4.5719
Epoch 4, Loss: 4.5129
Epoch 5, Loss: 4.4730


In [11]:
# Reverse lookup from id to word; it contains exactly the ids present in
# word_to_index.
index_to_word = {idx: word for word, idx in word_to_index.items()}

# Show a small sample instead of dumping the entire vocabulary into the output.
print(dict(list(index_to_word.items())[:10]))



In [15]:
def predict_next_word(model, text):
    """Predict the word most likely to follow ``text``.

    The text is tokenised with ``clean_text``; the last ``SEQ_LENGTH`` tokens
    are mapped to ids (0 for words missing from ``word_to_index``) and passed
    to ``model`` as a single-sequence batch on the module-level ``device``.

    Args:
        model: Module returning one row of vocabulary scores per sequence.
        text: Prompt whose continuation is wanted.

    Returns:
        str: The word for the highest-scoring id, or ``"<UNK>"`` when that id
        has no entry in ``index_to_word``.

    Raises:
        ValueError: If ``text`` contains no words after cleaning.
    """
    model.eval()
    words = clean_text(text)
    if not words:
        # An empty context would reach the model as a zero-length sequence
        # and fail with an opaque error; report the problem up front.
        raise ValueError("text must contain at least one word after cleaning")

    context_ids = [word_to_index.get(word, 0) for word in words[-SEQ_LENGTH:]]
    input_seq = torch.tensor([context_ids], dtype=torch.long).to(device)

    with torch.no_grad():
        output = model(input_seq)

    predicted_index = torch.argmax(output, dim=1).item()

    return index_to_word.get(predicted_index, "<UNK>")

# Quick sanity check of the trained model on a short prompt.
print(
    "LSTM Prediction:",
    predict_next_word(lstm_model, "emma was very"),
)



LSTM Prediction: much


In [None]:
import torch.nn.functional as F

def evaluate_model(model, test_loader):
    """Compute loss, accuracy and perplexity of ``model`` over ``test_loader``.

    Batches are moved to the module-level ``device``; gradients are disabled.

    Args:
        model: Module returning one row of class scores per input.
        test_loader: Iterable of ``(inputs, targets)`` batches.

    Returns:
        tuple[float, float, float]: ``(avg_loss, accuracy, perplexity)`` where
        ``avg_loss`` is the mean cross-entropy per sample, ``accuracy`` the
        fraction of samples whose top-scoring class equals the target, and
        ``perplexity`` is ``exp(avg_loss)``.

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    model.eval()
    # Sum (rather than average) inside each batch so the final division by the
    # sample count gives a true per-sample mean, even when the last batch is
    # smaller than the others.
    criterion = nn.CrossEntropyLoss(reduction="sum")
    loss_sum = 0.0
    total_correct = 0
    total_samples = 0

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            y_pred = model(X_batch)
            loss_sum += criterion(y_pred, y_batch).item()

            predicted = torch.argmax(y_pred, dim=1)
            total_correct += (predicted == y_batch).sum().item()
            total_samples += y_batch.size(0)

    if total_samples == 0:
        raise ValueError("test_loader yielded no samples")

    avg_loss = loss_sum / total_samples
    accuracy = total_correct / total_samples

    perplexity = torch.exp(torch.tensor(avg_loss)).item()

    return avg_loss, accuracy, perplexity

# Score the trained LSTM on the held-out loader and report the three metrics.
lstm_loss, lstm_acc, lstm_ppl = evaluate_model(model=lstm_model, test_loader=test_loader)

print(
    f"LSTM Performance:\n Loss: {lstm_loss:.4f}, Accuracy: {lstm_acc:.4f}, Perplexity: {lstm_ppl:.4f}"
)



LSTM Performance:
 Loss: 4.6347, Accuracy: 0.1993, Perplexity: 102.9951
