In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.optim import AdamW

from sklearn.model_selection import train_test_split


In [2]:
import requests

url = "https://www.gutenberg.org/files/100/100-0.txt"
response = requests.get(url)
text_data = response.text

# Save the dataset locally
with open("shakespeare.txt", "w", encoding="utf-8") as file:
    file.write(text_data)

print("Dataset saved successfully!")

Dataset saved successfully!


In [3]:
import re

def clean_text(text):
    text = text.lower()
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)  
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    return text

with open("shakespeare.txt", "r", encoding="utf-8") as file:
    raw_text = file.read()

cleaned_text = clean_text(raw_text)

In [4]:
len(cleaned_text)

5029939

In [5]:
cleaned_text_list = cleaned_text.split(" ")

In [6]:
cleaned_text_list = [word for word in cleaned_text_list if not word.startswith("_")]

In [7]:
vocab = sorted(set(cleaned_text_list))

In [53]:
vocab = [word for word in vocab if not word.startswith('_')]

In [9]:
vocab_size = len(vocab)

In [10]:
vocab_size

30766

In [11]:
word2idx = dict((word, idx) for idx, word in enumerate(vocab))
idx2word = dict((idx, word) for idx, word in enumerate(vocab))

In [13]:
step_size = 20
text_length = 10
X = []
y = []
for i in range(0, len(cleaned_text_list)-text_length, step_size):
    X.append(cleaned_text_list[i:i+text_length])
    y.append(cleaned_text_list[i+text_length])

In [15]:
len(X), len(y)

(47901, 47901)

In [16]:
glove_path = "/kaggle/input/glove-embeddings/glove.6B.200d.txt"
glove_embeddings = dict()

with open(glove_path,  "r", encoding = "utf-8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        embeddings = np.asarray(values[1:], dtype = "float32")
        glove_embeddings[word] = embeddings

In [17]:
df = pd.DataFrame({"texts" : X, "target" : y})

In [18]:
len(df)

47901

In [19]:
def tokenize(text):
    tokens = []
    for t in text:
        tokens.append(word2idx[t])
    return tokens

df["texts_tokenized"] = df["texts"].apply(tokenize)
df["target_tokenized"] = df["target"].apply(lambda x: word2idx[x])

In [20]:
df.head()

Unnamed: 0,texts,target,texts_tokenized,target_tokenized
0,"[start, of, the, project, gutenberg, ebook, th...",william,"[25157, 18183, 26614, 20691, 11931, 8364, 2661...",30130
1,"[that, ends, well, the, tragedy, of, antony, a...",you,"[26589, 8667, 29767, 26614, 27375, 18183, 1044...",30686
2,"[coriolanus, cymbeline, the, tragedy, of, haml...",first,"[5711, 6384, 26614, 27375, 18183, 12048, 20536...",10044
3,"[of, king, henry, the, fourth, the, life, of, ...",the,"[18183, 14534, 12537, 26614, 10666, 26614, 152...",26614
4,"[second, part, of, king, henry, the, sixth, th...",of,"[23207, 19010, 18183, 14534, 12537, 26614, 241...",18183


In [21]:
embedding_dim = 200
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for words, idx in word2idx.items():
    if words in glove_embeddings.keys():
        embedding_matrix[idx] = glove_embeddings[words]
    else:
        embedding_matrix[idx] = np.zeros(embedding_dim)

In [22]:
def embeddings(tokens):
    embedding = []
    for t in tokens:
        embedding.append(embedding_matrix[t])
    return np.array(embedding)

text_embeddings = df["texts_tokenized"].apply(embeddings)
target_tokenized = df["target_tokenized"]

In [23]:
target_tensors = torch.tensor(target_tokenized, dtype = torch.long)
text_tensors = torch.tensor(text_embeddings, dtype = torch.float32)

  text_tensors = torch.tensor(text_embeddings, dtype = torch.float32)


In [24]:
target_tensors.shape, text_tensors.shape

(torch.Size([47901]), torch.Size([47901, 10, 200]))

In [25]:
X_train, y_train = text_tensors, target_tensors

In [26]:
text_tensors.shape

torch.Size([47901, 10, 200])

In [27]:
class TextDataset(Dataset):
    def __init__(self, X, y):
        self.inputs = X
        self.target= y

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return {
            "inputs" : self.inputs[idx],
            "target" : self.target[idx]
        }

In [28]:
train_dataset = TextDataset(X_train, y_train)

In [29]:
batch_size = 32
train_dl = DataLoader(train_dataset, batch_size = batch_size, shuffle = False)

In [30]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [31]:
hidden_dim = 128
output_dim = vocab_size
num_layers = 5
dropout = 0.3

class Model(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim, num_layers, dropout):
        super(Model, self).__init__()
        self.bilstm = nn.LSTM(input_size = embedding_dim, hidden_size = hidden_dim, batch_first = True, num_layers = num_layers, bidirectional = True, dropout = dropout)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(2*hidden_dim, output_dim)

    def forward(self, inputs):
        output, _ = self.bilstm(inputs)
        output = self.dropout(output[:, -1, :])
        output = self.fc(output)
        return output

In [32]:
model = Model(embedding_dim, hidden_dim, output_dim, num_layers, dropout)
model.to(device)

Model(
  (bilstm): LSTM(200, 128, num_layers=5, batch_first=True, dropout=0.3, bidirectional=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=30766, bias=True)
)

In [33]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-5)
loss_fn = nn.CrossEntropyLoss()

In [45]:
%%time
num_epochs = 400
model_path = "best_model.pth"
best_accuracy = 0
losses = []
   
for epoch in range(num_epochs):
    print("="*100)
    print(f"EPOCH: {epoch+1}/{num_epochs}: ")
    print("-"*100)
    
    model.train()
    correct_prediction = 0 
    total_samples = 0  
    total_loss = 0 
    
    for batch in train_dl:
        inputs = batch["inputs"].to(device)
        target = batch["target"].to(device)

        optimizer.zero_grad()
        output = model(inputs)
        loss = loss_fn(output, target)

        preds = torch.argmax(output, dim=1)

        correct_prediction += (preds == target).sum().item() 
        total_samples += target.size(0) 

        total_loss += loss.item()
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
        optimizer.step()
    
    accuracy = (correct_prediction / total_samples) * 100
    avg_loss = total_loss / len(train_dl)
    
    print(f"\tCorrect Predictions (Train Dataset): {accuracy:.2f}%")
    print(f"\tLoss: {avg_loss:.4f}")

    if accuracy >  best_accuracy:
        best_accuracy = accuracy
        torch.save(model.state_dict(), model_path)

print("Best Model Accuracy: ", best_accuracy)


EPOCH: 1/100: 
----------------------------------------------------------------------------------------------------
	Correct Predictions (Train Dataset): 35.63%
	Loss: 3.3706
EPOCH: 2/100: 
----------------------------------------------------------------------------------------------------
	Correct Predictions (Train Dataset): 35.78%
	Loss: 3.3669
EPOCH: 3/100: 
----------------------------------------------------------------------------------------------------
	Correct Predictions (Train Dataset): 35.50%
	Loss: 3.3634
EPOCH: 4/100: 
----------------------------------------------------------------------------------------------------
	Correct Predictions (Train Dataset): 35.89%
	Loss: 3.3553
EPOCH: 5/100: 
----------------------------------------------------------------------------------------------------
	Correct Predictions (Train Dataset): 35.83%
	Loss: 3.3616
EPOCH: 6/100: 
----------------------------------------------------------------------------------------------------
	Correct 

In [55]:
print(f"Best Accuracy: {best_accuracy:.2f}%")

Best Accuracy: 45.86%


In [56]:
best_model = Model(embedding_dim, hidden_dim, output_dim, num_layers, dropout)
best_model.load_state_dict(torch.load("/kaggle/working/best_model.pth", weights_only = True))

<All keys matched successfully>

In [57]:
best_model.eval()

def generate(prompt, idx2word=idx2word):
    prompt = clean_text(prompt)
    prompt = prompt.lower()
    prompt = prompt.split()
    prompt_tokens = tokenize(prompt)
    prompt_embeddings = embeddings(prompt_tokens)
    
    prompt_tensors = torch.tensor(prompt_embeddings, dtype=torch.float32)
    prompt_tensors = prompt_tensors.unsqueeze(0)
    with torch.no_grad():
        output = best_model(prompt_tensors) 
        preds = torch.argmax(output, dim=1)

    character = idx2word[preds.item()]
    return " ".join(prompt) + " " + character 

def generate_text(prompt):
    text = prompt
    for i in range(10):
        text = generate(text)
    return text

In [58]:
prompt = "By God's grace "
generated_text = generate_text(prompt)
print(generated_text)

by gods grace it to point our eyes which in all wretched blood


In [59]:
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def compute_bleu(model, test_sentences):
    model.eval()
    total_bleu = 0
    count = 0
    smoothing = SmoothingFunction().method1

    for sentence in test_sentences:
        predicted_sentence = generate_text(sentence)
        
        if not predicted_sentence.strip(): 
            continue  
       
        reference = [nltk.word_tokenize(sentence)]
        candidate = nltk.word_tokenize(predicted_sentence)

        bleu_score = sentence_bleu(reference, candidate, smoothing_function=smoothing)
        total_bleu += bleu_score
        count += 1

    return total_bleu / count if count > 0 else 0  # Avoid division by zero

test_sentences = [
    "Methinks the night be filled with dreams untold, where lovers’ sighs do dance upon the air.",
    "Thou art the fairest star that lights my path, and yet thy love doth fade as morning mist.",
    "O gentle breeze, dost thou carry word of love, or sighs of sorrow lost to time’s embrace?",
    "Upon this parchment doth my heart confess, in ink of longing and of love’s despair.",
    "Wouldst thou grant me but a fleeting glance, ere time dost steal thy visage from mine eyes?",
    "The wretched night doth linger long, and yet no solace greets my weary soul.",
    "Lo! The heavens weep with trembling stars, as though the gods dost mourn love’s cruel fate.",
    "A kiss, my love, and all the world is naught but echoes of thy whispered name.",
    "The dawn doth break, yet in mine heart remains the shadow of the night’s sweet sorrow."
]

bleu = compute_bleu(model, test_sentences)
print(f"BLEU Score: {bleu:.4f}")

BLEU Score: 0.4227
