<a href="https://colab.research.google.com/github/Rithvickkr/Deeplearning-MODELS/blob/main/pytorch_lstm_next_word_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install huggingface_hub



In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from collections import Counter
from torch.utils.data import Dataset, DataLoader


In [27]:
from google.colab import userdata


In [31]:
from huggingface_hub import login
login(userdata.get('hf_token'))

In [10]:
import pandas as pd
import pandas as pd

splits = {'train': 'data/train/train.jsonl.zst', 'validation': 'data/validation/validation.jsonl.zst', 'test': 'data/test/test.jsonl.zst'}
df = pd.read_json("hf://datasets/dlwh/wikitext_2_detokenized/" + splits["train"], lines=True)


In [12]:
texts = df["text"].tolist()


In [14]:
tokens = [word for sentence in texts for word in sentence.split()]


In [15]:
# build vocab
word_counts = Counter(tokens)
vocab = {word: i+1 for i, (word, _) in enumerate(word_counts.items())}  # Start from index 1 (0 is padding)
vocab["<PAD>"] = 0  # Add padding token


In [None]:
len(vocab)

149583

In [16]:
encoded_texts = [[vocab[word] for word in sentence.split() if word in vocab] for sentence in texts]

In [18]:
class NextWordDataset(Dataset):
    def __init__(self, encoded_texts, seq_length=5):
        self.inputs = []
        self.targets = []

        for text in encoded_texts:
            for i in range(len(text) - seq_length):
                self.inputs.append(text[i:i+seq_length])  # Input sequence
                self.targets.append(text[i+seq_length])   # Next word (target)

        # Convert to PyTorch tensors
        self.inputs = torch.tensor(self.inputs, dtype=torch.long)
        self.targets = torch.tensor(self.targets, dtype=torch.long)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]


In [20]:
seq_length = 5  # Input sequence length
 # Reduce dataset to 1000 samples for testing
dataloader = DataLoader(NextWordDataset(encoded_texts, seq_length), batch_size=32, shuffle=True)


In [17]:
# 4️⃣ Define LSTM Model
class NextWordModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256):
        super(NextWordModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        _, (h_n, _) = self.lstm(x)  # Get last hidden state
        output = self.fc(h_n[-1])  # Fully connected layer
        return output


In [21]:
# Initialize model
vocab_size = len(vocab)
model = NextWordModel(vocab_size)

In [22]:
# 5️⃣ Training Setup
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


In [None]:
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    total_loss = 0
    for batch_idx, (inputs, targets) in enumerate(dataloader):
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Print every 10 batches
        if (batch_idx + 1) % 10 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(dataloader)}], Loss: {loss.item():.4f}")

    print(f"Epoch {epoch+1}/{num_epochs} - Avg Loss: {total_loss / len(dataloader):.4f}")


Epoch [1/10], Step [10/50782], Loss: 6.4307
Epoch [1/10], Step [20/50782], Loss: 6.8397
Epoch [1/10], Step [30/50782], Loss: 6.3567
Epoch [1/10], Step [40/50782], Loss: 8.0544
Epoch [1/10], Step [50/50782], Loss: 6.5673
Epoch [1/10], Step [60/50782], Loss: 6.7529
Epoch [1/10], Step [70/50782], Loss: 6.9953
Epoch [1/10], Step [80/50782], Loss: 6.9647
Epoch [1/10], Step [90/50782], Loss: 6.9387
Epoch [1/10], Step [100/50782], Loss: 6.2349
Epoch [1/10], Step [110/50782], Loss: 7.3900
Epoch [1/10], Step [120/50782], Loss: 7.9856
Epoch [1/10], Step [130/50782], Loss: 7.5673
Epoch [1/10], Step [140/50782], Loss: 6.7242
Epoch [1/10], Step [150/50782], Loss: 7.2163
Epoch [1/10], Step [160/50782], Loss: 6.7395
Epoch [1/10], Step [170/50782], Loss: 6.0334
Epoch [1/10], Step [180/50782], Loss: 7.6042
Epoch [1/10], Step [190/50782], Loss: 7.3243
Epoch [1/10], Step [200/50782], Loss: 7.4722
Epoch [1/10], Step [210/50782], Loss: 7.2350
Epoch [1/10], Step [220/50782], Loss: 7.0545
Epoch [1/10], Step 

In [None]:
torch.save(model.state_dict(), "next_word_model.pth")
print("✅ Model trained & saved successfully!")

✅ Model trained & saved successfully!


In [None]:
import torch

# Load trained model
model = NextWordModel(vocab_size)
model.load_state_dict(torch.load("next_word_model.pth"))
model.eval()

# Create reverse vocab mapping (index -> word)
index_to_word = {idx: word for word, idx in vocab.items()}

def predict_next_word(sentence):
    words = sentence.split()
    encoded = [vocab[word] for word in words if word in vocab]

    # Ensure sequence length
    if len(encoded) < seq_length:
        encoded = [0] * (seq_length - len(encoded)) + encoded  # Padding if needed

    # Convert to tensor
    input_tensor = torch.tensor([encoded[-seq_length:]], dtype=torch.long)

    # Forward pass
    output = model(input_tensor)

    # Get highest probability word index
    predicted_index = torch.argmax(output, dim=1).item()

    # Lookup word from index
    predicted_word = index_to_word.get(predicted_index, "UNKNOWN")

    return predicted_word

# Example Prediction
print(predict_next_word("battle"))


prosperous".
