In [2]:
import torch
import torch.nn as nn
import torch.functional as F
import tensorflow as tf

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from collections import Counter


import re
import os

## Preprocessing and Vocabulary Construction

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
import kagglehub

In [5]:
device

device(type='cpu')

In [6]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mehmetlaudatekman/war-and-peace-project-gutenberg")
file_path = os.path.join(path, "war_peace_plain.txt")
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

In [7]:
with open(file_path, 'r', encoding='utf-8') as file:
    text = file.read()

filtered_text = re.sub(r'-', ' ', text)
filtered_text = re.sub('[^a-zA-Z0-9 \.\n]', '', filtered_text)
filtered_text = filtered_text.lower()

lines=filtered_text.split(".")
# text is separated by lines based on full stop.
words=['.']
for l in lines:
    for w in l.split():
        if (len(w)>0):
            words.append(w)
words=list(pd.Series(words).unique())
word_counts = Counter(words)

df_counts = pd.DataFrame(word_counts.items(), columns = ["word", "frequency"])
df_counts = df_counts.sort_values(by = "frequency", ascending=False)  #sorting the words by descending order of their frequency

# vocabulary size
print("Total no. of lines: ", len(lines))
print("Total unique words: ", len(words))

  filtered_text = re.sub('[^a-zA-Z0-9 \.\n]', '', filtered_text)


Total no. of lines:  30660
Total unique words:  19764


In [8]:
# Top 10 most frequenct words
print("Top 10 most frequent words are: \n")
top_10 = df_counts.head(10)
print(top_10.to_string(index = False))

# Bottom 10 least frequent words
print("Top 10 least frequent words are: \n")
bottom_10 = df_counts.tail(10)
print(bottom_10.to_string(index = False))

Top 10 most frequent words are: 

   word  frequency
      .          1
chapter          1
      i          1
   well          1
 prince          1
     so          1
  genoa          1
    and          1
  lucca          1
    are          1
Top 10 least frequent words are: 

           word  frequency
      firmament          1
         joshua          1
            nun          1
      defenders          1
      uninvited          1
    strengthens          1
   immovability          1
personalityfree          1
         earths          1
         unreal          1


## Model Design and Training 

In [9]:
# Creating a mapping between string and integer to help prediction

stoi = {s: i for i,s in enumerate(words)}
itos = {i: s for s, i in stoi.items()}

In [10]:
size = 5  # number of previous words used as context
X, Y = [], []

for line in lines:
    predata = [0] * size
    prewords = line.split()

    for i in range(len(prewords)):
        word1 = stoi[prewords[i]]
        
        X.append(predata.copy())   # store current context
        Y.append(word1)         # store next word
        
        # slide the window
        predata = predata[1:] + [word1]

        # handle end of sentence
        if i == len(prewords) - 1:
            eos = stoi['.']
            X.append(predata.copy())
            Y.append(eos)
            predata = predata[1:] + [eos]    # this helps to keep the length of the words needed to be predicted under 5

# Convert to tensors
X = torch.tensor(X).to(device)
Y = torch.tensor(Y).to(device)

X.shape, Y.shape, X.dtype, Y.dtype


(torch.Size([590609, 5]), torch.Size([590609]), torch.int64, torch.int64)

In [11]:
embed_dim = 64
# using 64 dimensional embedding 

embed = torch.nn.Embedding(len(stoi), embed_dim).to(device)
# creates embedding layer of 2nd parameter of data from the 1st parameter
print(embed)

Embedding(19764, 64)


In [12]:
class Next_Word_Predictor(nn.Module):
    def __init__(self, size, vocab_size, embed_dim, hidden_dim, activation_fn, seed_value):
        super().__init__()
        self.size = size
        self.hyperpams = {'size':self.size, 'embed_dim': embed_dim, 'hidden_dim': hidden_dim, 'activation_fn':activation_fn,'seed_value':seed_value}
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.linear1 = nn.Linear(size* embed_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

        self.activation_fn = torch.relu

    def forward(self, x):
        # Embedding Layer
        x = self.embed(x)
        x = x.view(x.shape[0], -1) # flattens the embedding

        # Hidden Layer
        x = self.linear1(x)  # maps the flattened vector to the hidden dimension
        x = self.activation_fn(x)  # adds an activation function to x

        # Output Layer
        x = self.linear2(x)

        return x


In [13]:
def train_model(X, Y, size, embed_dim, vocab_size, hidden_dim, activation_fn, seed_value, device, batch_size=1024, epochs=100, print_every=2):
    torch.manual_seed(seed_value)
    model = Next_Word_Predictor(size, vocab_size, embed_dim, hidden_dim, activation_fn, seed_value).to(device)
    loss_fn = nn.CrossEntropyLoss()
    opt = torch.optim.AdamW(model.parameters(), lr=0.001)

    for epoch in range(epochs):
        for i in range(0, X.shape[0], batch_size):
            x = X[i:i+batch_size].to(device)
            y = Y[i:i+batch_size].to(device)
            y_pred = model(x)
            loss = loss_fn(y_pred, y)

            loss.backward()
            opt.step()
            opt.zero_grad()  # âœ… fixed

        if epoch % print_every == 0:
            print(f'Epoch {epoch}: Loss = {loss.item()}')

    return model  

In [14]:
vocab_size = len(stoi)
hidden_dim = 1024
activation_fn = 'relu'
seed_value = 42

In [23]:
model = train_model(X, Y, size, embed_dim, vocab_size, hidden_dim, activation_fn, seed_value, device)

Epoch 0: Loss = 6.131535530090332
Epoch 2: Loss = 4.148116111755371
Epoch 4: Loss = 2.7473084926605225
Epoch 6: Loss = 2.2369258403778076
Epoch 8: Loss = 1.9346868991851807
Epoch 10: Loss = 1.7186203002929688
Epoch 12: Loss = 1.546530842781067
Epoch 14: Loss = 1.413068175315857
Epoch 16: Loss = 1.2983931303024292
Epoch 18: Loss = 1.1931016445159912
Epoch 20: Loss = 1.107254147529602
Epoch 22: Loss = 1.0426493883132935
Epoch 24: Loss = 0.9838107824325562
Epoch 26: Loss = 0.9329420328140259
Epoch 28: Loss = 0.8849213123321533
Epoch 30: Loss = 0.8428866267204285
Epoch 32: Loss = 0.8042839765548706
Epoch 34: Loss = 0.7687357664108276
Epoch 36: Loss = 0.7372921705245972
Epoch 38: Loss = 0.7091547846794128
Epoch 40: Loss = 0.6839790344238281
Epoch 42: Loss = 0.661050021648407
Epoch 44: Loss = 0.6411907076835632
Epoch 46: Loss = 0.6211549043655396
Epoch 48: Loss = 0.6035213470458984
Epoch 50: Loss = 0.5879759192466736
Epoch 52: Loss = 0.5726709961891174
Epoch 54: Loss = 0.5587177276611328
Epo

In [24]:
# saving model
torch.save(model, "model1_task1")

In [16]:
vocab_size = len(stoi)
hidden_dim2 = 128
activation_fn2 = 'tanh'
seed_value2 = 123
embed_dim = 32

In [17]:
model2 = train_model(X, Y, size, embed_dim, vocab_size, hidden_dim2, activation_fn2, seed_value2, device)

KeyboardInterrupt: 

In [28]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# When loading the model:
model = torch.load("model1_task1.pt", map_location=device, weights_only=False)
model.to(device)
model.eval()

Next_Word_Predictor(
  (embed): Embedding(19764, 64)
  (linear1): Linear(in_features=320, out_features=1024, bias=True)
  (linear2): Linear(in_features=1024, out_features=19764, bias=True)
)

In [29]:
# names generation from the trained model

def generateNextWord(model, itos, stoi, content, seed_value, k, temperature =  1, max_len = 10):
    torch.manual_seed(seed_value)

    size = model.size
    predata =  content.lower()
    predata = re.sub(r'[^a-zA-Z0-9 \.]', '', predata)  # remove unwanted punctuation
    predata = re.sub(r'\.', ' . ', predata)             # separate periods with spaces


    wordsNew = predata.split()
    predata = []

# Convert words to integer IDs
    for i in range(len(wordsNew)):
        try:
            if stoi[wordsNew[i]]:
                predata.append(wordsNew[i])
        except:
            predata = [stoi[w] for w in predata]
            if len(predata) <= size:
                predata = [0] * (size - len(predata)) + predata
            elif len(predata) > size:
                predata = predata[-size:] # take the last (size) elements
            x = torch.tensor(predata).view(1, -1).to(device)
            y_pred = model(x)
            logits = y_pred
            logits = logits/temperature

            word1 = torch.distributions.categorical.Categorical(logits = logits).sample().item()
            word = itos[word1]
            content += " " + word
            predata = predata[1:]+[word1]
            predata = [itos[w] for w in predata]


    predata = [stoi[w] for w in predata]

    if len(predata) <= size:
        predata = [0] * (size - len(predata)) + predata
    elif len(predata) > size:
        predata = predata[-size:]

    for i in range(k):
        x = torch.tensor(predata).view(1, -1).to(device)
        y_pred = model(x)
        logits = y_pred
        logits = logits/temperature
        word1 = torch.distributions.categorical.Categorical(logits=logits).sample().item()
        word = itos[word1]
        content += " " + word
        predata = predata[1:] + [word1]

    return content

In [33]:
### Generating next word 

par = ""
content = input("Enter the beginning of a sentence: ")
k = int(input("Enter number of words to generate: "))
par += generateNextWord(model, itos, stoi, content, seed_value, k, temperature=1)
print("\nGenerated Sentence: \n")
print(par)


Generated Sentence: 

i saw the devil and  only troops that could save the answer and so that


## Embedding visualisation and Interpretation

In [38]:
embedding_weights = model.embed.weight.data.cpu().numpy()
# It is used for visualizing the word embeddings using techniques like PCA or t-SNE.

In [39]:
import random
from sklearn.manifold import TSNE
# random module is used to sampling words and TSME is used for reducing dimensions of embeddings for visualization

In [40]:
# groupig the words based on their suffixes
def groupWords(itos):
    groups = {
        'verb_ing' : [],
        'verb_ed' : [],
        'noun_s' : [],
        'adverb_ly' : [],
        'adjectives' : [],
        'pronouns' : ['i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them']
    }

    # dictionary groups with keys for different morphological categories

    for word in itos.values():
        if word.endswith('ing'):
            groups['verb_ing'].append(word)
        elif word.endswith('ed'):
            groups['verb_ed'].append(word)
        elif word.endswith('s'):
            groups['noun_s'].append(word)
        elif word.endswith('ly'):
            groups['adverb_ly'].append(word)
        elif word in ['big', 'small', 'happy', 'sad', 'quick', 'slow', 'bright', 'dark']:
            groups['adjectives'].append(word)

    return groups

In [41]:
# defining pronouns
pronouns = ['i', 'you', 'he', 'she', 'it', 'we', 'they', 'me', 'him', 'her', 'us', 'them']
groups = groupWords(itos)
groups['pronouns'] = [word for word in pronouns if word in stoi]
# Replace the pronouns group with those pronouns that appear in the vocabulary using stoi.

# Embedding for all words in the vocabulary

allWords = list(stoi.keys())
allWordEmbeddings = [model.embed.weight[stoi[word]].detach().cpu().numpy() for word in allWords]
allWordEmbeddings = np.array(allWordEmbeddings)

In [42]:
# visualising each group

for group_name, group in groups.items():
    selected_words = random.sample(group, min(50, len(group))) if group else []

    # Extract embeddings for the selected words
    selected_embeddings = []
    selected_word_labels = []

    for word in selected_words:
        if word in stoi:
            index = stoi[word]
            selected_embeddings.append(model.embed.weight[index].detach().cpu().numpy())
            selected_word_labels.append(word)

    selected_embeddings = torch.tensor(selected_embeddings)
    n_samples = selected_embeddings.shape[0]
    # stores the number of selected vectors

    # Reducing the dimensions of embeddings using t-SNE
    perplexity_value = min(30, n_samples - 1) if n_samples > 1 else 1
    tsne = TSNE(n_components=2, perplexity=perplexity_value, random_state=42)
    embeddings_tsne = tsne.fit_transform(allWordEmbeddings)

    # Mapping from embedding to words
    word_to_tsne = {word: embed for word, embed in zip(allWords, embeddings_tsne)}

    # Plotting the embeddings
    plt.figure(figsize=(12, 10))
    plt.scatter(embeddings_tsne[:, 0], embeddings_tsne[:, 1], alpha=0.3, label='All Words')

    # Highlight selected words in the group with distinct color
    for word in selected_word_labels:
        embed = word_to_tsne[word]
        plt.scatter(embed[0], embed[1], label=word, color='red', s=100)
        plt.annotate(word, (embed[0], embed[1]), textcoords="offset points", xytext=(0,10), ha='center', color='blue')

    plt.title(f'Word Embeddings Visualization - Group: {group_name}')
    plt.xlabel('TSNE Dimension 1')  
    plt.ylabel('TSNE Dimension 2')
    plt.legend()
    plt.show()

  selected_embeddings = torch.tensor(selected_embeddings)


KeyboardInterrupt: 

Task 1 is completed till visualisation

Streamlit application update is remaining

In [11]:
import pickle

# Example stoi dictionary
stoi1 = stoi

# Save to a .pkl file
with open("stoi1.pkl", "wb") as f:
    pickle.dump(stoi, f, protocol=pickle.HIGHEST_PROTOCOL)


In [12]:
import pickle

# Example stoi dictionary
itos1 = itos

# Save to a .pkl file
with open("itos1.pkl", "wb") as f:
    pickle.dump(itos1, f, protocol=pickle.HIGHEST_PROTOCOL)