In [1]:
import torch
import torch.nn as nn
import torch.functional as F
import tensorflow as tf

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from collections import Counter


import re
import os

## Preprocessing and Vocabulary Construction

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
import kagglehub

In [4]:
device

device(type='cpu')

In [5]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mehmetlaudatekman/war-and-peace-project-gutenberg")
file_path = os.path.join(path, "war_peace_plain.txt")
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

In [6]:
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

filtered_text = re.sub(r'-', ' ', text)
filtered_text = re.sub('[^a-zA-Z0-9 \.\n]', '', filtered_text)
filtered_text = filtered_text.lower()

lines=filtered_text.split(".")
# text is separated by lines based on full stop.
words=['.']
for l in lines:
    for w in l.split():
        if (len(w)>0):
            words.append(w)
words=list(pd.Series(words).unique())
word_counts = Counter(words)

df_counts = pd.DataFrame(word_counts.items(), columns = ["word", "frequency"])
df_counts = df_counts.sort_values(by = "frequency", ascending=False)  #sorting the words by descending order of their frequency

# vocabulary size
print("Total no. of lines: ", len(lines))
print("Total unique words: ", len(words))

  filtered_text = re.sub('[^a-zA-Z0-9 \.\n]', '', filtered_text)


Total no. of lines:  30660
Total unique words:  19764


In [7]:
# Top 10 most frequenct words
print("Top 10 most frequent words are: \n")
top_10 = df_counts.head(10)
print(top_10.to_string(index = False))

# Bottom 10 least frequent words
print("Top 10 least frequent words are: \n")
bottom_10 = df_counts.tail(10)
print(bottom_10.to_string(index = False))

Top 10 most frequent words are: 

   word  frequency
      .          1
chapter          1
      i          1
   well          1
 prince          1
     so          1
  genoa          1
    and          1
  lucca          1
    are          1
Top 10 least frequent words are: 

           word  frequency
      firmament          1
         joshua          1
            nun          1
      defenders          1
      uninvited          1
    strengthens          1
   immovability          1
personalityfree          1
         earths          1
         unreal          1


## Model Design and Training 

In [8]:
# Creating a mapping between string and integer to help prediction

stoi = {s: i for i,s in enumerate(words)}
itos = {i: s for s, i in stoi.items()}

In [20]:
size = 5  # number of previous words used as context
X, Y = [], []

for line in lines:
    predata = [0] * size
    prewords = line.split()

    for i in range(len(prewords)):
        word1 = stoi[prewords[i]]
        
        X.append(predata.copy())   # store current context
        Y.append(word1)         # store next word
        
        # slide the window
        predata = predata[1:] + [word1]

        # handle end of sentence
        if i == len(prewords) - 1:
            eos = stoi['.']
            X.append(predata.copy())
            Y.append(eos)
            predata = predata[1:] + [eos]    # this helps to keep the length of the words needed to be predicted under 5

# Convert to tensors
X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)

X.shape, Y.shape, X.dtype, Y.dtype


(torch.Size([590609, 5]), torch.Size([590609]), torch.int64, torch.int64)

In [21]:
embed_dim = 64
# using 64 dimensional embedding 

embed = torch.nn.Embedding(len(stoi), embed_dim).to(device)
# creates embedding layer of 2nd parameter of data from the 1st parameter
print(embed)

Embedding(19764, 64)


In [22]:
class Next_Word_Predictor(nn.Module):
    def __init__(self, size, vocab_size, embed_dim, hidden_dim, activation_fn, seed_value):
        super().__init__()
        self.size = size
        self.hyperpams = {'size':self.size, 'embed_dim': embed_dim, 'hidden_dim': hidden_dim, 'activation_fn':activation_fn,'seed_value':seed_value}
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.linear1 = nn.Linear(size* embed_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

        self.activation_fn = torch.relu

    def forward(self, x):
        # Embedding Layer
        x = self.embed(x)
        x = x.view(x.shape[0], -1) # flattens the embedding

        # Hidden Layer
        x = self.linear1(x)  # maps the flattened vector to the hidden dimension
        x = self.activation_fn(x)  # adds an activation function to x

        # Output Layer
        x = self.linear2(x)

        return x


In [23]:
def train_model(X, Y, size, embed_dim, vocab_size, hidden_dim, activation_fn, seed_value, device, batch_size = 1024, epochs = 100, print_every = 10):
    torch.manual_seed(seed_value)
    model = Next_Word_Predictor(size, vocab_size, embed_dim, hidden_dim, activation_fn, seed_value).to(device)
    loss_fn = nn.CrossEntropyLoss()

    opt = torch.optim.AdamW(model.parameters(), lr = 0.001)

    for epoch in range(epochs):
        for i in range(0, X.shape[0], batch_size):
            x = X[i:i+batch_size].to(device)
            y = Y[i:i+batch_size].to(device)
            y_pred = model(x)
            loss = loss_fn(y_pred, y)

            loss.backward()
            opt.step()
            opt.zero_grad

        if(epoch%print_every == 0):
            print(f'Epoch {epoch}: Loss = {loss.item()}')

        return model

In [24]:
vocab_size = len(stoi)
hidden_dim = 1024
activation_fn = 'relu'
seed_value = 42

In [29]:
model = train_model(X, Y, size, embed_dim, vocab_size, hidden_dim, activation_fn, seed_value, device)

KeyboardInterrupt: 

In [15]:
# saving model
torch.save(model, "model1_task1")

NameError: name 'model' is not defined

In [34]:
# names generation from the trained model

def generateNextWord(model, itos, stoi, content, seed_value, k, temperature =  1, max_len = 10):
    torch.manual_seed(seed_value)

    size = model.size
    predata =  content.lower()
    predata = re.sub(r'[^a-zA-Z0-9 \.]', '', predata)  # remove unwanted punctuation
    predata = re.sub(r'\.', ' . ', predata)             # separate periods with spaces


    wordsNew = predata.split()
    predata = []

# Convert words to integer IDs
    for i in range(len(wordsNew)):
        try:
            if stoi[wordsNew[i]]:
                predata.append(wordsNew[i])
        except:
            predata = [stoi[w] for w in predata]
            if len(predata) <= size:
                predata = [0] * (size - len(predata)) + predata
            elif len(predata) > size:
                predata = predata[-size:] # take the last (size) elements
            x = torch.tensor(predata).view(1, -1).to(device)
            y_pred = model(x)
            logits = y_pred
            logits = logits/temperature

            word1 = torch.distributions.categorical.Categorical(logits = logits).sample().item()
            word = itos[word1]
            content += " " + word
            predata = predata[1:]+[word1]
            predata = [itos[w] for w in predata]


    predata = [stoi[w] for w in predata]

    if len(predata) <= size:
        predata = [0] * (size - len(predata)) + predata
    elif len(predata) > size:
        predata = predata[-size:]

    for i in range(k):
        x = torch.tensor(predata).view(1, -1).to(device)
        y_pred = model(x)
        logits = y_pred
        logits = logits/temperature
        word1 = torch.distributions.categorical.Categorical(logits=logits).sample().item()
        word = itos[word1]
        content += " " + word
        predata = predata[1:] + [word1]

    return content