In [3]:
import torch
import torch.functional as F
import torch.nn as nn


from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import re
import os

In [1]:
import warnings

# Suppressing all warnings
warnings.filterwarnings("ignore")

In [4]:
from watermark import watermark

# Printing versions of libraries used
%load_ext watermark
%watermark -v -p torch,numpy,pandas,scikit-learn,seaborn,matplotlib,tensorflow

Python implementation: CPython
Python version       : 3.13.1
IPython version      : 8.30.0

torch       : 2.6.0+cu118
numpy       : 2.2.2
pandas      : 2.2.3
scikit-learn: 1.6.0
seaborn     : 0.13.2
matplotlib  : 3.9.3
tensorflow  : not installed



In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
import kagglehub

# Downloading latest version
path = kagglehub.dataset_download("beweird/text-for-next-word-predictor")
file_path = os.path.join(path, "leo tolstoy - war and peace.txt")
print("Path to dataset text file:", file_path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/beweird/text-for-next-word-predictor?dataset_version_number=1...


100%|██████████| 1.13M/1.13M [00:01<00:00, 752kB/s]

Extracting files...
Path to dataset text file: C:\Users\borat\.cache\kagglehub\datasets\beweird\text-for-next-word-predictor\versions\1\leo tolstoy - war and peace.txt





In [None]:
# Open and read the contents of the file
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

filtered_text = re.sub(r'-', ' ', text)
filtered_text = re.sub('[^a-zA-Z0-9 \.\n]', '', filtered_text)
filtered_text = filtered_text.lower()

lines=filtered_text.split(".")
words=['.']
for l in lines:
    for w in l.split():
        if (len(w)>0):
            words.append(w)
words=list(pd.Series(words).unique())


print("Total no. of lines: ", len(lines))
print("Total unique words: ", len(words))

Total no. of lines:  30588
Total unique words:  17877


First 10 words:  ['well prince so genoa and lucca are now just family estates of the buonapartes', ' but i warn you if you dont tell me that this means war if you still try to defend the infamies and horrors perpetrated by that antichrist  i really believe he is antichrist  i will have nothing more to do with you and you are no longer my friend no longer my faithful slave as you call yourself but how do you do i see i have frightened you  sit down and tell me all the news']


In [8]:
stoi={s:i for i,s in enumerate(words)}
itos={i:s for s,i in stoi.items()}
print(len(itos))

17877


In [26]:
block_size = 5 # how many characters we take to predict the next character
X = []
Y = []
for l in lines:
    context = [0] * block_size
    words = l.split()

    if len(words) == 0:
        continue

    for w in words:
        ix = stoi[w]
        X.append(context)
        Y.append(ix)
        context = context[1:] + [ix]
        

    ix = stoi['.']
    X.append(context)
    Y.append(ix)

X = torch.tensor(X, dtype=torch.long, device=device)
Y = torch.tensor(Y, dtype=torch.long, device=device)

print("X shape: ", X.shape)
print("Y shape: ", Y.shape)





X shape:  torch.Size([592621, 5])
Y shape:  torch.Size([592621])


In [27]:

emb_dim = 64 # Hyperparameter

# Embedding layer
emb=torch.nn.Embedding(len(stoi),emb_dim).to(device)
print(emb)

Embedding(17877, 64)


In [28]:
class Next_Word_Predictor(nn.Module):
    def __init__(self, block_size, vocab_size, emb_dim, hidden_dim, activation_fn, seed_value):
        super().__init__()
        self.block_size = block_size
        self.hyperparams = {'block_size':self.block_size, 'emb_dim':emb_dim, 'hidden_dim':hidden_dim, 'activation_fn':activation_fn, 'seed_value':seed_value}
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.linear1 = nn.Linear(block_size * emb_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
         
        if activation_fn == 'sigmoid':
            self.activation = torch.sigmoid  
        else:
            self.activation = torch.relu 

    def forward(self, x):
        # Embedding layer
        x = self.emb(x)
        x = x.view(x.shape[0], -1)  
        
        # Hidden layer
        x = self.linear1(x)
        x = self.activation(x)
        
        # Output layer
        x = self.linear2(x)
        
        return x

In [None]:
def train(X, Y, block_size, emb_dim, hidden_dim, activation_fn, seed_value, epochs=10, batch_size=32, learning_rate=0.001):
    model = Next_Word_Predictor(block_size, len(stoi), emb_dim, hidden_dim, activation_fn, seed_value).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for i in range(0, len(X), batch_size):
            x_batch = X[i:i+batch_size].to(device)
            y_batch = Y[i:i+batch_size].to(device)

            optimizer.zero_grad()
            outputs = model(x_batch)
            loss = criterion(outputs, y_batch)
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f'Epoch [{epoch+1}/{epochs}], Loss: {total_loss/((len(X)-1)//batch_size + 1):.4f}')

    return model

In [33]:
vocab_size = len(stoi)
hidden_dim = 1024
activation_fn = 'relu'
seed_value = 42
model = train(X, Y, block_size, emb_dim, hidden_dim, activation_fn, seed_value, epochs=100, batch_size=1024, learning_rate=0.001)

Epoch [1/100], Loss: 6.0911
Epoch [2/100], Loss: 5.1650
Epoch [3/100], Loss: 4.4374
Epoch [4/100], Loss: 3.8284
Epoch [5/100], Loss: 3.4647
Epoch [6/100], Loss: 3.2028
Epoch [7/100], Loss: 2.9985
Epoch [8/100], Loss: 2.8334
Epoch [9/100], Loss: 2.6965
Epoch [10/100], Loss: 2.5804
Epoch [11/100], Loss: 2.4803
Epoch [12/100], Loss: 2.3925
Epoch [13/100], Loss: 2.3144
Epoch [14/100], Loss: 2.2438
Epoch [15/100], Loss: 2.1798
Epoch [16/100], Loss: 2.1210
Epoch [17/100], Loss: 2.0669
Epoch [18/100], Loss: 2.0165
Epoch [19/100], Loss: 1.9694
Epoch [20/100], Loss: 1.9252
Epoch [21/100], Loss: 1.8837
Epoch [22/100], Loss: 1.8442
Epoch [23/100], Loss: 1.8074
Epoch [24/100], Loss: 1.7720
Epoch [25/100], Loss: 1.7383
Epoch [26/100], Loss: 1.7061
Epoch [27/100], Loss: 1.6754
Epoch [28/100], Loss: 1.6460
Epoch [29/100], Loss: 1.6181
Epoch [30/100], Loss: 1.5910
Epoch [31/100], Loss: 1.5651
Epoch [32/100], Loss: 1.5399
Epoch [33/100], Loss: 1.5159
Epoch [34/100], Loss: 1.4927
Epoch [35/100], Loss: 1

In [34]:
torch.save(model, 'model_variant_1.pth')

In [42]:
# Load the model with weights_only=False
model_1 = torch.load('model_variant_1.pth', map_location=device, weights_only=False)
model_1.eval()

Next_Word_Predictor(
  (emb): Embedding(17877, 64)
  (linear1): Linear(in_features=320, out_features=1024, bias=True)
  (linear2): Linear(in_features=1024, out_features=17877, bias=True)
)

In [45]:
# Generate names from trained model

def generate_next_words(model, itos, stoi, content, seed_value, k, temperature=1.0, max_len=10):
    torch.manual_seed(seed_value)
    
    block_size = model.block_size
    context = content.lower()
    context = re.sub('[^a-zA-Z0-9 \.]', '', context)
    context = re.sub('\.', ' . ', context)
    word_c = context.split()
    context = []
    for i in range(len(word_c)):
        try:
            if stoi[word_c[i]]:
                context.append(word_c[i])
        except:
            context = [stoi[w] for w in context]
            if len(context) <= block_size:
                context = [0] * (block_size - len(context)) + context
            elif len(context) > block_size:
                context = context[-block_size:]
            x = torch.tensor(context).view(1, -1).to(device)
            y_pred = model(x)
            logits = y_pred
            logits = logits/temperature

            ix = torch.distributions.categorical.Categorical(logits=logits).sample().item()
            word = itos[ix]
            content += " " + word
            context = context [1:] + [ix]
            context = [itos[w] for w in context]
            
    context = [stoi[w] for w in context]
               
    if len(context) <= block_size:
        context = [0] * (block_size - len(context)) + context
    elif len(context) > block_size:
        context = context[-block_size:]

    for i in range(k):
        x = torch.tensor(context).view(1, -1).to(device)
        y_pred = model(x)
        logits = y_pred
        logits = logits/temperature
        ix = torch.distributions.categorical.Categorical(logits=logits).sample().item()
        word = itos[ix]
        content += " " + word
        context = context [1:] + [ix]
        
    return content

In [47]:
para =""
content = input("Enter a paragraph to start the text generation: ")
k = int(input("Enter the number of next words to predict (k): "))

generated_text = generate_next_words(model_1, itos, stoi, content, seed_value, k, temperature = 1.0)
print("Generated text:")
print(generated_text)

Generated text:
and he quickly turned around and just to realize that his friend is no longer nothing of nicholas circumstances as if anything are in that power of life is an
