In [25]:
#importing libraries
import torch
import torch.nn.functional as F
from torch import nn
import pandas as pd
import matplotlib.pyplot as plt
import urllib.request
import re
from collections import Counter

In [26]:
#setting device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [27]:
import re

#path of the RTF file
file_path = "Emma_by_Jane_Austen.rtf"

#reading the RTF file
with open(file_path, 'r', encoding='utf-8') as file:
    rtf_content = file.read()

#function to clean RTF file
def clean_rtf(rtf):
    # Remove RTF formatting
    # This regex removes everything that isn't plain text
    cleaned_text = re.sub(r'{\\.*?}', '', rtf)  # Remove RTF groups
    cleaned_text = re.sub(r'\\[a-z]+\d* ?', '', cleaned_text)  # Remove RTF commands
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Remove extra spaces
    return cleaned_text.strip()

#cleaning the RTF text
plain_text = clean_rtf(rtf_content)

#displaying the first 500 characters of the plain text
print(plain_text[:500])


{ The Project Gutenberg EBook of Emma, by Jane Austen\ \ This eBook is for the use of anyone anywhere at no cost and with\ almost no restrictions whatsoever. You may copy it, give it away or\ re-use it under the terms of the Project Gutenberg License included\ with this eBook or online at www.gutenberg.org\ \ \ Title: Emma\ \ Author: Jane Austen\ \ Release Date: August, 1994 [Etext #158]\ Posting Date: January 21, 2010\ Last Updated: October 17, 2016\ \ Language: English\ \ Character set encodin


In [28]:
#converting text to lowercase
plain_text = plain_text.lower()

#removing unwanted characters
cleaned_text = re.sub('[^a-zA-Z0-9 .]', '', plain_text)

#splitting into words
words = cleaned_text.split()

print(cleaned_text[:400])

 the project gutenberg ebook of emma by jane austen  this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. you may copy it give it away or reuse it under the terms of the project gutenberg license included with this ebook or online at www.gutenberg.org   title emma  author jane austen  release date august 1994 etext 158 posting date january 21 2010 las


In [29]:
#creating vocab of unique words
words_vocab = sorted(set(words))
stoi = {s: i for i, s in enumerate(words_vocab)}
itos = {i: s for i, s in enumerate(words_vocab)}

In [30]:
print(len(words))

160442


In [31]:
#function to create input-output pairs
def create_dataset(words, block_size):
    X, Y = [], []
    for i in range(len(words) - block_size):
        context = [stoi[words[j]] for j in range(i, i + block_size)]
        next_word = stoi[words[i + block_size]]
        X.append(context)
        Y.append(next_word)
    return torch.tensor(X).to(device), torch.tensor(Y).to(device)

In [32]:
class NextWordMLP(nn.Module):
    def __init__(self, block_size, vocab_size, emb_dim, hidden_size, activation):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim)
        self.lin1 = nn.Linear(block_size * emb_dim, hidden_size)
        self.activation = activation
        self.lin2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.emb(x)
        x = x.view(x.shape[0], -1)
        x = self.activation(self.lin1(x))
        x = self.lin2(x)
        return x

In [33]:
embedding_sizes = [64, 128]
context_lengths = [5, 10, 15]
activations = [F.relu, torch.tanh]
epochs = 500

In [52]:
def train_model(embedding_size, block_size, activation_fn):
    hidden_size = 512
    model = NextWordMLP(block_size, len(stoi), embedding_size, hidden_size, activation_fn).to(device)
    loss_fn = nn.CrossEntropyLoss()
    opt = torch.optim.AdamW(model.parameters(), lr=0.001)
    X, Y = create_dataset(words, block_size)

    
    batch_size = 512
    losses = []
    for epoch in range(epochs):
        epoch_loss = 0
        for i in range(0, len(X), batch_size):
            x_batch = X[i:i + batch_size]
            y_batch = Y[i:i + batch_size]
            y_pred = model(x_batch)
            loss = loss_fn(y_pred, y_batch)
            loss.backward()
            opt.step()
            opt.zero_grad()
            epoch_loss += loss.item()
        
        losses.append(epoch_loss / (len(X) // batch_size))
        if epoch % 1 == 0:
            print(f"Epoch {epoch}, Loss: {losses[-1]:.4f}")

        #early stopping in case loss plateaus
        if len(losses) > 10 and abs(losses[-1] - losses[-10]) < 0.001:
            print("Early stopping")
            break

    return model

In [35]:
def generate_text(model, itos, stoi, block_size, max_length=50):
    context = [0] * block_size
    generated_words = []
    for _ in range(max_length):
        x = torch.tensor(context).view(1, -1).to(device)
        y_pred = model(x)
        ix = torch.distributions.categorical.Categorical(logits=y_pred).sample().item()
        word = itos[ix]
        generated_words.append(word)
        context = context[1:] + [ix]
    return ' '.join(generated_words)

In [53]:
def save_model(model, embedding_size, block_size, activation_fn_name):
    model_filename = f"saved_models/model_emb{embedding_size}_ctx{block_size}_act{activation_fn_name}.pt"
    torch.save(model.state_dict(), model_filename)
    print(f"Model saved as {model_filename}")

In [54]:
embedding_size = embedding_sizes[0]
block_size = context_lengths[0]
activation_fn = activations[0]
print(f"\nTraining with Embedding Size: {embedding_size}, Context Length: {block_size}, Activation: {activation_fn.__name__}")
model1 = train_model(embedding_size, block_size, activation_fn)
save_model(model1, embedding_size, block_size, activation_fn.__name__)
print("Generated Text:")
print(generate_text(model1, itos, stoi, block_size, max_length=100))


Training with Embedding Size: 64, Context Length: 5, Activation: relu
Epoch 0, Loss: 6.5846
Epoch 1, Loss: 5.5008
Epoch 2, Loss: 4.7264
Epoch 3, Loss: 3.7315
Epoch 4, Loss: 3.1709
Epoch 5, Loss: 2.8659
Epoch 6, Loss: 2.6344
Epoch 7, Loss: 2.4448
Epoch 8, Loss: 2.2835
Epoch 9, Loss: 2.1424
Epoch 10, Loss: 2.0172
Epoch 11, Loss: 1.9042
Epoch 12, Loss: 1.8009
Epoch 13, Loss: 1.7061
Epoch 14, Loss: 1.6177
Epoch 15, Loss: 1.5356
Epoch 16, Loss: 1.4581
Epoch 17, Loss: 1.3852
Epoch 18, Loss: 1.3161
Epoch 19, Loss: 1.2507
Epoch 20, Loss: 1.1882
Epoch 21, Loss: 1.1287
Epoch 22, Loss: 1.0718
Epoch 23, Loss: 1.0170
Epoch 24, Loss: 0.9647
Epoch 25, Loss: 0.9147
Epoch 26, Loss: 0.8663
Epoch 27, Loss: 0.8202
Epoch 28, Loss: 0.7759
Epoch 29, Loss: 0.7332
Epoch 30, Loss: 0.6920
Epoch 31, Loss: 0.6530
Epoch 32, Loss: 0.6153
Epoch 33, Loss: 0.5790
Epoch 34, Loss: 0.5443
Epoch 35, Loss: 0.5114
Epoch 36, Loss: 0.4793
Epoch 37, Loss: 0.4489
Epoch 38, Loss: 0.4198
Epoch 39, Loss: 0.3921
Epoch 40, Loss: 0.3

In [55]:
embedding_size = embedding_sizes[0]
block_size = context_lengths[0]
activation_fn = activations[1]
print(f"\nTraining with Embedding Size: {embedding_size}, Context Length: {block_size}, Activation: {activation_fn.__name__}")
model2 = train_model(embedding_size, block_size, activation_fn)
save_model(model2, embedding_size, block_size, activation_fn.__name__)
print("Generated Text:")
print(generate_text(model2, itos, stoi, block_size, max_length=100))


Training with Embedding Size: 64, Context Length: 5, Activation: tanh
Epoch 0, Loss: 6.8163
Epoch 1, Loss: 5.5102
Epoch 2, Loss: 4.8018
Epoch 3, Loss: 4.1668
Epoch 4, Loss: 3.7250
Epoch 5, Loss: 3.4147
Epoch 6, Loss: 3.1718
Epoch 7, Loss: 2.9695
Epoch 8, Loss: 2.7932
Epoch 9, Loss: 2.6370
Epoch 10, Loss: 2.4941
Epoch 11, Loss: 2.3626
Epoch 12, Loss: 2.2395
Epoch 13, Loss: 2.1240
Epoch 14, Loss: 2.0145
Epoch 15, Loss: 1.9110
Epoch 16, Loss: 1.8115
Epoch 17, Loss: 1.7170
Epoch 18, Loss: 1.6260
Epoch 19, Loss: 1.5390
Epoch 20, Loss: 1.4551
Epoch 21, Loss: 1.3752
Epoch 22, Loss: 1.2977
Epoch 23, Loss: 1.2238
Epoch 24, Loss: 1.1525
Epoch 25, Loss: 1.0849
Epoch 26, Loss: 1.0195
Epoch 27, Loss: 0.9575
Epoch 28, Loss: 0.8982
Epoch 29, Loss: 0.8414
Epoch 30, Loss: 0.7873
Epoch 31, Loss: 0.7365
Epoch 32, Loss: 0.6873
Epoch 33, Loss: 0.6410
Epoch 34, Loss: 0.5972
Epoch 35, Loss: 0.5557
Epoch 36, Loss: 0.5164
Epoch 37, Loss: 0.4789
Epoch 38, Loss: 0.4437
Epoch 39, Loss: 0.4105
Epoch 40, Loss: 0.3