In [2]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam
from torch.utils.data import DataLoader, TensorDataset
from torchsummary import summary   
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [3]:
import kagglehub
import shutil
import os

# Download dataset from Kaggle
path = kagglehub.dataset_download("ashishpandey2062/next-word-predictor-text-generator-dataset")

# Destination path
dest_path = r"E:\github\data science\data-science\NLP\next-word-predictor-text-generator-dataset"

# Copy everything from KaggleHub's cache to your chosen folder
if not os.path.exists(dest_path):
    os.makedirs(dest_path)

for item in os.listdir(path):
    s = os.path.join(path, item)
    d = os.path.join(dest_path, item)
    if os.path.isdir(s):
        shutil.copytree(s, d, dirs_exist_ok=True)
    else:
        shutil.copy2(s, d)

print("Dataset copied to:", dest_path)


Dataset copied to: E:\github\data science\data-science\NLP\next-word-predictor-text-generator-dataset


In [4]:
file_path = r"E:\github\data science\data-science\NLP\next-word-predictor-text-generator-dataset\next_word_predictor.txt"

with open(file_path, "r", encoding="utf-8") as f:
    text_data = f.read()
    text_data=text_data.lower()
print(text_data[:500])  # show first 500 characters


the sun was shining brightly in the clear blue sky, and a gentle breeze rustled the leaves of the tall trees. people were out enjoying the beautiful weather, some sitting in the park, others taking a leisurely stroll along the riverbank. children were playing games, and laughter filled the air.

as the day turned into evening, the temperature started to drop, and the sky transformed into a canvas of vibrant colors. families gathered for picnics, and the smell of barbecues wafted through the air.


In [5]:
import nltk
from nltk.tokenize import word_tokenize,sent_tokenize

nltk.download('punkt')          # sentence + word tokenizer
nltk.download('punkt_tab')     
sent=sent_tokenize(text_data)
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sent]

for i, tokens in enumerate(tokenized_sentences[:5]):
    print(f"Sentence {i+1} tokens: {tokens}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Sentence 1 tokens: ['the', 'sun', 'was', 'shining', 'brightly', 'in', 'the', 'clear', 'blue', 'sky', ',', 'and', 'a', 'gentle', 'breeze', 'rustled', 'the', 'leaves', 'of', 'the', 'tall', 'trees', '.']
Sentence 2 tokens: ['people', 'were', 'out', 'enjoying', 'the', 'beautiful', 'weather', ',', 'some', 'sitting', 'in', 'the', 'park', ',', 'others', 'taking', 'a', 'leisurely', 'stroll', 'along', 'the', 'riverbank', '.']
Sentence 3 tokens: ['children', 'were', 'playing', 'games', ',', 'and', 'laughter', 'filled', 'the', 'air', '.']
Sentence 4 tokens: ['as', 'the', 'day', 'turned', 'into', 'evening', ',', 'the', 'temperature', 'started', 'to', 'drop', ',', 'and', 'the', 'sky', 'transformed', 'into', 'a', 'canvas', 'of', 'vibrant', 'colors', '.']
Sentence 5 tokens: ['families', 'gathered', 'for', 'picnics', ',', 'and', 'the', 'smell', 'of', 'barbecues', 'wafted', 'through', 'the', 'air', '.']


In [6]:
all_words=[word for sen in tokenized_sentences for word in sen]
print(f"Total words: {all_words[:30]}")

Total words: ['the', 'sun', 'was', 'shining', 'brightly', 'in', 'the', 'clear', 'blue', 'sky', ',', 'and', 'a', 'gentle', 'breeze', 'rustled', 'the', 'leaves', 'of', 'the', 'tall', 'trees', '.', 'people', 'were', 'out', 'enjoying', 'the', 'beautiful', 'weather']


In [7]:
from collections import Counter
word_count=Counter(all_words)
vocab=sorted(word_count,key=word_count.get,reverse=True)
word2idx={word:idx+1 for idx,word in enumerate(vocab)}
idx2word={idx:word for word,idx in word2idx.items()}
print("Vocabulary size:", len(vocab))
print("First 10 word2idx mappings:", dict(list(word2idx.items())[:10]))

Vocabulary size: 5054
First 10 word2idx mappings: {',': 1, '.': 2, 'the': 3, ':': 4, 'and': 5, 'a': 6, 'of': 7, 'to': 8, 'i': 9, 'you': 10}


In [8]:
encodedsen=[[word2idx[word] for word in sen] for sen in tokenized_sentences]
x=[]
y=[]
for i in encodedsen:
    for j in range(1,len(i)):
        x.append(i[:j])
        y.append(i[j])

In [9]:
x=[torch.tensor(i) for i in x]
y=torch.tensor(y)
x=pad_sequence(x,batch_first=True)
print("Input shape:", x.shape)
print("Target shape:", y.shape)

Input shape: torch.Size([33359, 86])
Target shape: torch.Size([33359])


In [10]:
class nextword(nn.Module):
    def __init__(self,vocab_size,embedding_dim,hidden_dim):
        super(nextword,self).__init__()
        self.embedding=nn.Embedding(vocab_size+1,embedding_dim,padding_idx=0)
        self.lstm=nn.LSTM(embedding_dim,hidden_dim,batch_first=True)
        self.fc=nn.Linear(hidden_dim,vocab_size+1)
    def forward(self,x):
        x=self.embedding(x)
        x,_=self.lstm(x)
        x=self.fc(x)
        return x
    

In [11]:
model=nextword(len(word2idx),100,128)
sample=torch.randint(0,len(word2idx),(32,10))
output=model(sample)
print("Sample output shape:", output.shape)

Sample output shape: torch.Size([32, 10, 5055])


In [12]:
# x = model.embedding(x)
# x, _ = model.lstm(x)
# x = model.fc(x[:, -1, :])
# x  

In [13]:
from torch.utils.data import DataLoader, TensorDataset
dataset= TensorDataset(x,y)
train_size=int(0.8 * len(dataset))
val_size=len(dataset)-train_size
train_ds,val_ds=torch.utils.data.random_split(dataset,[train_size,val_size])
train_loader=DataLoader(train_ds,batch_size=32,shuffle=True)
val_loader=DataLoader(val_ds,batch_size=32)

import torch.optim as optim
criterion=nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [15]:
epochs=10
for epoch in range(epochs):
    model.train()
    total_loss=0
    for input,labels in train_loader:
        optimizer.zero_grad()
        output=model(input)
        output=output[:,-1,:]
        loss=criterion(output,labels)
        loss.backward()
        optimizer.step()
        total_loss+=loss.item()
    avg_loss=total_loss/len(train_loader)

    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

Epoch 1/10, Loss: 6.6157
Epoch 2/10, Loss: 6.2915
Epoch 3/10, Loss: 6.2705
Epoch 4/10, Loss: 6.2650
Epoch 5/10, Loss: 6.2630
Epoch 6/10, Loss: 6.2601
Epoch 7/10, Loss: 6.2583
Epoch 8/10, Loss: 6.2569
Epoch 9/10, Loss: 6.2548
Epoch 10/10, Loss: 6.2541


In [16]:
model.eval()
val_loss=0
with torch.no_grad():
    for input,labels in val_loader:
        output=model(input)
        output=output[:,-1,:]
        loss=criterion(output,labels)
        val_loss+=loss.item()
print(f"Validation Loss: {val_loss/len(val_loader):.4f}")


Validation Loss: 6.9445


In [28]:
def predict_next_word(model, text, word2idx, idx2word, top_k=5):
    model.eval()
    tokens = [word2idx[w] for w in text.lower().split() if w in word2idx]
    input_seq = torch.tensor(tokens).unsqueeze(0) # batch=1
    with torch.no_grad():
        with torch.cuda.amp.autocast():
            output = model(input_seq)[:, -1, :]   # last timestep
            probs = torch.softmax(output, dim=1)
            top_probs, top_idxs = probs.topk(top_k)
    return [idx2word[idx.item()] for idx in top_idxs[0]]


In [29]:
print(predict_next_word(model, 
      "As the stars began to twinkle in the night sky, the crowd", 
      word2idx, idx2word))


[',', '.', ':', 'the', 'and']


  with torch.cuda.amp.autocast():
