### Q1 - a: Write a very basic RNN model:

In [None]:
!pip install torch torchvision torchaudio




In [None]:
import torch.nn.functional as F

class RNNModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim):
        super(RNNModel, self).__init__()

        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # RNN layer
        self.rnn = nn.RNN(input_size=embedding_dim, hidden_size=hidden_dim, batch_first=True)

        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Convert word indices into embeddings
        embedded = self.embedding(x)  # Shape: (batch_size, seq_length, embedding_dim)

        # Pass through the RNN
        output, hidden = self.rnn(embedded)  # output: (batch_size, seq_length, hidden_dim)

        # Apply fully connected layer to all timesteps
        final_output = self.fc(output)  # Shape: (batch_size, seq_length, output_dim)

        return final_output  # Predict for all timesteps


### Q1 - b: Write a simple  general LSTM model:

In [None]:
import torch
import torch.nn as nn

class ImprovedLSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, fc_dim, output_dim, num_layers=2, dropout=0.5):
        super(ImprovedLSTM, self).__init__()

        # Embedding Layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # LSTM Layers
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, dropout=dropout)

        # Fully Connected Layers
        self.fc1 = nn.Linear(hidden_dim, fc_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_dim, output_dim)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = self.embedding(x)  # Shape: (batch_size, seq_length, embedding_dim)

        # LSTM forward pass
        lstm_output, (hidden, cell) = self.lstm(x)  # (batch_size, seq_length, hidden_dim)

        # Apply the first FC layer to all time steps
        x = self.fc1(lstm_output)  # Shape: (batch_size, seq_length, fc_dim)
        x = self.relu(x)
        x = self.dropout(x)

        # Final prediction layer
        x = self.fc2(x)  # Shape: (batch_size, seq_length, vocab_size)

        return x  # Predict a word at every timestep


In [None]:
!pip install pymupdf torch nltk




In [None]:
import fitz  # I want to use my own data set, so I need to convert pdf files in to txt
import os

def extract_text_from_pdfs(pdf_folder):

    all_text = ""

    # Loop through all PDF files in the folder
    for filename in sorted(os.listdir(pdf_folder)):  # Ensure correct order
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(pdf_folder, filename)
            print(f"Extracting text from: {filename}")

            # Open the PDF file
            doc = fitz.open(pdf_path)

            # Read text from each page
            for page in doc:
                all_text += page.get_text("text") + " "  # Extract text

            doc.close()  # Close file

    return all_text


pdf_folder = r"C:\Users\Rose\Documents\mcmaster\semester 2\NLP\harry potter data set"

# Extract text from all PDFs
corpus = extract_text_from_pdfs(pdf_folder)

# Display a preview of extracted text
print(corpus[:1000])  # Print the first 1000 characters to check


Extracting text from: 02_20_20_Harry_20_Potter_20and_20_Chamber_20of_20_Secrets_882fda9c0d.pdf
Extracting text from: Harry_Potter_and_the_Sorcerer_s_Stone_www_libpdf_blog_ir_aba92c0a66.pdf
CHAPTER ONE 
THE WORST BIRTHDAY 
 
Not for the first time, an argument had broken out over breakfast at number four, Privet Drive. Mr. 
Vernon Dursley had been woken in the early hours of the morning by a loud, hooting noise from his 
nephew Harry's room. 
“Third time this week!” he roared across the table. “If you can't control that owl, it'll have to go!” 
Harry tried, yet again, to explain. 
“She's bored,” he said. “She's used to flying around outside. If I could just let her out at night—” 
“Do I look stupid?” snarled Uncle Vernon, a bit of fried egg dangling from his bushy mustache. “I 
know what'll happen if that owl's let out.” 
He exchanged dark looks with his wife, Petunia. 
Harry tried to argue back but his words were drowned by a long, loud belch from the Dursleys' son, 
Dudley. 
“I want m

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize

# Download NLTK tokenizer model
nltk.download('punkt')

def preprocess_text(text):
    """
    Cleans and tokenizes text using NLTK.
    """
    # 1. Convert to lowercase
    text = text.lower()

    # 2. Remove special characters, punctuation, and numbers
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Keep only letters and spaces

    # 3. Tokenization (splitting text into words)
    tokens = word_tokenize(text)

    return tokens

# Apply tokenization to the extracted corpus
tokens = preprocess_text(corpus)

# Display first 50 tokens to check
print(tokens[:50])


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rose\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['chapter', 'one', 'the', 'worst', 'birthday', 'not', 'for', 'the', 'first', 'time', 'an', 'argument', 'had', 'broken', 'out', 'over', 'breakfast', 'at', 'number', 'four', 'privet', 'drive', 'mr', 'vernon', 'dursley', 'had', 'been', 'woken', 'in', 'the', 'early', 'hours', 'of', 'the', 'morning', 'by', 'a', 'loud', 'hooting', 'noise', 'from', 'his', 'nephew', 'harrys', 'room', 'third', 'time', 'this', 'week', 'he']


In [None]:
from collections import Counter

# Build word frequency dictionary
word_counts = Counter(tokens)

# Create a mapping from words to indices
word_to_index = {word: idx + 2 for idx, (word, _) in enumerate(word_counts.most_common())}
word_to_index["<pad>"] = 0  # Padding token
word_to_index["<unk>"] = 1  # Unknown words

# Reverse mapping (index → word) for later decoding
index_to_word = {idx: word for word, idx in word_to_index.items()}

# Convert words to indices
numerical_tokens = [word_to_index.get(word, word_to_index["<unk>"]) for word in tokens]

# Show first 50 token indices
print(numerical_tokens[:50])


[143, 43, 2, 1056, 867, 42, 26, 2, 129, 88, 51, 3433, 16, 868, 29, 60, 800, 17, 831, 354, 869, 754, 108, 176, 404, 16, 41, 1761, 12, 2, 1281, 832, 6, 2, 414, 68, 5, 437, 4277, 490, 35, 10, 3434, 90, 121, 801, 88, 40, 650, 7]


In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence

# Define sequence length
SEQ_LENGTH = 30

# Convert text into fixed length sequences
sequences = [numerical_tokens[i:i+SEQ_LENGTH] for i in range(0, len(numerical_tokens) - SEQ_LENGTH, SEQ_LENGTH)]

# Convert sequences to tensors
tensor_sequences = [torch.tensor(seq) for seq in sequences]

# Pad sequences to ensure they have the same length
padded_sequences = pad_sequence(tensor_sequences, batch_first=True, padding_value=word_to_index["<pad>"])

# Show shape of final dataset
print(padded_sequences.shape)  # Expected: (num_sequences, seqlen)


torch.Size([5487, 30])


In [None]:
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, sequences):
        """
        Custom PyTorch dataset for text sequences.
        :param sequences: List of padded sequences (tokens as tensors)
        """
        self.inputs = sequences[:, :-1]  # All words except last
        self.targets = sequences[:, 1:]  # Next word prediction (shifted by 1)

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]

# Create Dataset
dataset = TextDataset(padded_sequences)

# Create DataLoader for batching
batch_size = 32
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Show an example batch
sample_batch = next(iter(dataloader))
print("Sample batch input shape:", sample_batch[0].shape)  # Expected: (batch_size, SEQ_LENGTH-1)
print("Sample batch target shape:", sample_batch[1].shape)  # Expected: (batch_size, SEQ_LENGTH-1)


Sample batch input shape: torch.Size([32, 29])
Sample batch target shape: torch.Size([32, 29])


In [None]:
# Hyperparameters
VOCAB_SIZE = len(word_to_index)  # Vocabulary size
EMBEDDING_DIM = 128   # Embedding vector size
HIDDEN_DIM = 256      # Hidden state size
OUTPUT_DIM = VOCAB_SIZE  # Output match vocabulary size

# Initialize model
model = RNNModel(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM)

# Loss function & optimizer
criterion = nn.CrossEntropyLoss()  # Suitable for word prediction
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

print(model)

RNNModel(
  (embedding): Embedding(10318, 128)
  (rnn): RNN(128, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=10318, bias=True)
)


In [None]:
# Split dataset into training & validation (90% train, 10% validation)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Training loop
num_epochs = 10
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    total_train_loss = 0
    total_val_loss = 0
    correct = 0
    total = 0

    ###### TRAINING ######
    model.train()
    for batch_inputs, batch_targets in train_dataloader:
        batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)

        # Forward pass
        optimizer.zero_grad()
        outputs = model(batch_inputs)  # Shape: (batch_size, seq_length, vocab_size)

        # Compute loss
        loss = criterion(outputs.view(-1, VOCAB_SIZE), batch_targets.view(-1))
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    ###### VALIDATION ######
    model.eval()
    with torch.no_grad():  # Disable gradient calculation for validation
        for batch_inputs, batch_targets in val_dataloader:
            batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)

            # Forward pass
            outputs = model(batch_inputs)

            # Compute validation loss
            val_loss = criterion(outputs.view(-1, VOCAB_SIZE), batch_targets.view(-1))
            total_val_loss += val_loss.item()

            # Compute accuracy
            predictions = torch.argmax(outputs, dim=-1)  # Get the predicted word indices
            correct += (predictions == batch_targets).sum().item()  # Count correct predictions
            total += batch_targets.numel()  # Total words in validation set

    ###### PRINT EPOCH RESULTS ######
    train_loss = total_train_loss / len(train_dataloader)
    val_loss = total_val_loss / len(val_dataloader)
    val_accuracy = correct / total * 100  # Convert to percentage

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Validation Loss: {val_loss:.4f} | Validation Accuracy: {val_accuracy:.2f}%")


Epoch 1/10 | Train Loss: 7.0570 | Validation Loss: 6.6168 | Validation Accuracy: 7.35%
Epoch 2/10 | Train Loss: 6.2914 | Validation Loss: 6.2565 | Validation Accuracy: 9.30%
Epoch 3/10 | Train Loss: 5.8795 | Validation Loss: 6.0668 | Validation Accuracy: 10.76%
Epoch 4/10 | Train Loss: 5.5620 | Validation Loss: 5.9560 | Validation Accuracy: 11.66%
Epoch 5/10 | Train Loss: 5.2912 | Validation Loss: 5.8962 | Validation Accuracy: 12.37%
Epoch 6/10 | Train Loss: 5.0450 | Validation Loss: 5.8647 | Validation Accuracy: 13.00%
Epoch 7/10 | Train Loss: 4.8157 | Validation Loss: 5.8599 | Validation Accuracy: 13.15%
Epoch 8/10 | Train Loss: 4.6013 | Validation Loss: 5.8694 | Validation Accuracy: 13.25%
Epoch 9/10 | Train Loss: 4.3988 | Validation Loss: 5.8860 | Validation Accuracy: 13.40%
Epoch 10/10 | Train Loss: 4.2105 | Validation Loss: 5.9143 | Validation Accuracy: 13.49%


In [None]:
import random

def generate_text(model, seed_text, word_to_index, index_to_word, seq_length=30, num_words=100):

    model.eval()  # Set model to evaluation mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Tokenize seed text
    seed_tokens = [word_to_index.get(word, word_to_index["<unk>"]) for word in seed_text.lower().split()]

    # Pad if the seed sentence is shorter than seq_length
    while len(seed_tokens) < seq_length:
        seed_tokens.insert(0, word_to_index["<pad>"])

    generated_words = seed_text.split()  # Start with seed words

    with torch.no_grad():  # Disable gradient calculations
        for _ in range(num_words):
            # Convert to tensor and reshape for batch dimension
            input_tensor = torch.tensor(seed_tokens[-seq_length:], dtype=torch.long).unsqueeze(0).to(device)

            # Forward pass to get predictions
            output = model(input_tensor)

            # Get the most probable next word
            next_word_index = torch.argmax(output[:, -1, :]).item()
            next_word = index_to_word.get(next_word_index, "<unk>")  # Convert index to word

            # Append the predicted word
            generated_words.append(next_word)

            # Update the input sequence with the new word
            seed_tokens.append(next_word_index)

    return " ".join(generated_words)


In [None]:
# Example seed sentence
seed_sentence = "harry looked at "

# Generate text
generated_text = generate_text(model, seed_sentence, word_to_index, index_to_word, seq_length=30, num_words=100)

# Display generated text
print("Generated Text:")
print(generated_text)


Generated Text:
harry looked at the snake and the door of the stairs and the door of the stairs and the door of the stairs and the door of the stairs and the door of the stairs and the door of the stairs and the door of the stairs and the door of the stairs and the door of the stairs and the door of the stairs and the door of the stairs and the door of the stairs and the door of the stairs and the door of the stairs and the door of the stairs and the door of the stairs and the


### Q5:Analyzing the results

### Q5-1: update the hyper parameter too see the impact on the result of RNN:

In [None]:
# training with new hyper parameters:
# New hyperparameters
VOCAB_SIZE = len(word_to_index)
EMBEDDING_DIM = 256
HIDDEN_DIM = 512
OUTPUT_DIM = VOCAB_SIZE

# Initialize new RNN Model
model = RNNModel(vocab_size=VOCAB_SIZE,
                 embedding_dim=EMBEDDING_DIM,
                 hidden_dim=HIDDEN_DIM,
                 output_dim=OUTPUT_DIM)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define loss function (CrossEntropyLoss)
criterion = nn.CrossEntropyLoss()

# Adam optimizer for training
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005 )

# Print model summary
print(model)

# Training RNN one more tmie:
# Split dataset into training & validation (90% train, 10% validation)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])

# Create DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Training loop
num_epochs = 10  # Keeping it simple
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    total_train_loss = 0
    total_val_loss = 0
    correct = 0
    total = 0

    #TRAINING #
    model.train()
    for batch_inputs, batch_targets in train_dataloader:
        batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)

        # Forward pass
        optimizer.zero_grad()
        outputs = model(batch_inputs)  # Shape: (batch_size, seq_length, vocab_size)

        # Compute loss
        loss = criterion(outputs.view(-1, VOCAB_SIZE), batch_targets.view(-1))
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    # VALIDATION #
    model.eval()
    with torch.no_grad():  # Disable gradient calculation for validation
        for batch_inputs, batch_targets in val_dataloader:
            batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)

            # Forward pass
            outputs = model(batch_inputs)

            # Compute validation loss
            val_loss = criterion(outputs.view(-1, VOCAB_SIZE), batch_targets.view(-1))
            total_val_loss += val_loss.item()

            # Compute accuracy
            predictions = torch.argmax(outputs, dim=-1)  # Get the predicted word indices
            correct += (predictions == batch_targets).sum().item()  # Count correct predictions
            total += batch_targets.numel()  # Total words in validation set

    # PRINT EPOCH RESULTS
    train_loss = total_train_loss / len(train_dataloader)
    val_loss = total_val_loss / len(val_dataloader)
    val_accuracy = correct / total * 100  # Convert to percentage

    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {train_loss:.4f} | Validation Loss: {val_loss:.4f} | Validation Accuracy: {val_accuracy:.2f}%")


RNNModel(
  (embedding): Embedding(10318, 256)
  (rnn): RNN(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=10318, bias=True)
)
Epoch 1/10 | Train Loss: 7.0468 | Validation Loss: 6.5171 | Validation Accuracy: 7.91%
Epoch 2/10 | Train Loss: 6.2351 | Validation Loss: 6.2064 | Validation Accuracy: 9.39%
Epoch 3/10 | Train Loss: 5.8483 | Validation Loss: 6.0270 | Validation Accuracy: 10.78%
Epoch 4/10 | Train Loss: 5.5369 | Validation Loss: 5.9194 | Validation Accuracy: 11.35%
Epoch 5/10 | Train Loss: 5.2631 | Validation Loss: 5.8514 | Validation Accuracy: 12.26%
Epoch 6/10 | Train Loss: 5.0089 | Validation Loss: 5.8143 | Validation Accuracy: 12.59%
Epoch 7/10 | Train Loss: 4.7671 | Validation Loss: 5.7945 | Validation Accuracy: 12.88%
Epoch 8/10 | Train Loss: 4.5345 | Validation Loss: 5.7946 | Validation Accuracy: 12.92%
Epoch 9/10 | Train Loss: 4.3092 | Validation Loss: 5.8046 | Validation Accuracy: 13.22%
Epoch 10/10 | Train Loss: 4.0958 | Validation Loss: 5.827

In [None]:
# Example seed sentence
seed_sentence = "harry looked at"

# Generate text
generated_text = generate_text(model, seed_sentence, word_to_index, index_to_word, seq_length=30, num_words=100)

# Display generated text
print("Generated Text:")
print(generated_text)


Generated Text:
harry looked at the back of the library and then he heard the door and the snake had to be a lot of the library he said harry and ron and hermione were planning to the floor to the chamber of secrets for saken harry potter said ron in the world of the forest and the monster of the forest and the monster of the forest and the snake had been a long black traveling his head and the snake was the only one who had been a long thin package in the air and the other hand was a large black cloak


In [None]:
"""Analyzinng : As we can see the first text generated by RNN does not very make sense.
it looks like the real story when you read the first few sentences but when keep reading it
looks like a text that random words were sitted next to each other without any special meaning.In other words,
it is not coherent and is full of repetative words like "door" and "snake" it shows model is
not capable of generating a passage with more variety. After updating the hyperparameters, I was expecting improvement,
at least it is not keep repeating itself.
although we can not see any significant change in loss validation and ... but the results are way better!
in the new version it seems to be more dynamic like a human writing """


In [None]:
# Initialize LSTM model (same hyperparameters as RNN for fair comparison)
model = ImprovedLSTM(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    fc_dim=128,  # Same as before
    output_dim=VOCAB_SIZE,  # Predict next word from vocabulary
    num_layers=2,
    dropout=0.5
)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define loss function (same as RNN)
criterion = nn.CrossEntropyLoss()

# Define optimizer (same as RNN)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Print model summary
print(model)


ImprovedLSTM(
  (embedding): Embedding(10318, 256)
  (lstm): LSTM(256, 512, num_layers=2, batch_first=True, dropout=0.5)
  (fc1): Linear(in_features=512, out_features=128, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=128, out_features=10318, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)


In [None]:
# Training loop
num_epochs = 20  # Increase number of epochs
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ImprovedLSTM(vocab_size=VOCAB_SIZE, embedding_dim=100, hidden_dim=256, fc_dim=128, output_dim=VOCAB_SIZE)
model.to(device)

# Use AdamW optimizer and lower learning rate
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)
criterion = nn.CrossEntropyLoss()

for epoch in range(num_epochs):
    total_train_loss = 0
    total_val_loss = 0
    correct = 0
    total = 0

    # Training phase
    model.train()
    for input_sequences, target_labels in train_dataloader:
        input_sequences, target_labels = input_sequences.to(device), target_labels.to(device)

        # Forward pass
        optimizer.zero_grad()
        model_predictions = model(input_sequences)  # Shape: (batch_size, seq_length, vocab_size)

        # Ensure loss calculation uses correct shapes
        loss = criterion(model_predictions.reshape(-1, VOCAB_SIZE), target_labels.reshape(-1))  # Flatten both
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    # Validation phase
    model.eval()
    with torch.no_grad():
        for input_sequences, target_labels in val_dataloader:
            input_sequences, target_labels = input_sequences.to(device), target_labels.to(device)

            # Forward pass
            model_predictions = model(input_sequences)

            # Compute validation loss
            val_loss = criterion(model_predictions.reshape(-1, VOCAB_SIZE), target_labels.reshape(-1))
            total_val_loss += val_loss.item()

            # Compute accuracy
            predictions = torch.argmax(model_predictions, dim=-1)  # Shape: (batch_size, seq_length)
            correct += (predictions == target_labels).sum().item()
            total += target_labels.numel()

    # Print epoch results
    train_loss = total_train_loss / len(train_dataloader)
    val_loss = total_val_loss / len(val_dataloader)
    val_accuracy = correct / total * 100  # Convert accuracy to percentage

    print(f"Epoch {epoch + 1}/{num_epochs} | Train Loss: {train_loss:.4f} | Validation Loss: {val_loss:.4f} | Validation Accuracy: {val_accuracy:.2f}%")

Epoch 1/20 | Train Loss: 8.4431 | Validation Loss: 7.1369 | Validation Accuracy: 4.40%
Epoch 2/20 | Train Loss: 7.2676 | Validation Loss: 6.9093 | Validation Accuracy: 4.42%
Epoch 3/20 | Train Loss: 7.0748 | Validation Loss: 6.8546 | Validation Accuracy: 4.42%
Epoch 4/20 | Train Loss: 6.9925 | Validation Loss: 6.8215 | Validation Accuracy: 4.43%
Epoch 5/20 | Train Loss: 6.9338 | Validation Loss: 6.7920 | Validation Accuracy: 4.43%
Epoch 6/20 | Train Loss: 6.8905 | Validation Loss: 6.7749 | Validation Accuracy: 4.43%
Epoch 7/20 | Train Loss: 6.8615 | Validation Loss: 6.7680 | Validation Accuracy: 4.43%
Epoch 8/20 | Train Loss: 6.8408 | Validation Loss: 6.7621 | Validation Accuracy: 4.43%
Epoch 9/20 | Train Loss: 6.8197 | Validation Loss: 6.7569 | Validation Accuracy: 4.43%
Epoch 10/20 | Train Loss: 6.8029 | Validation Loss: 6.7524 | Validation Accuracy: 4.43%
Epoch 11/20 | Train Loss: 6.7905 | Validation Loss: 6.7507 | Validation Accuracy: 4.43%
Epoch 12/20 | Train Loss: 6.7766 | Valida

In [None]:
import random

def generate_text(model, seed_text, word_to_index, index_to_word, seq_length=30, num_words=100):

    model.eval()  # Set model to evaluation mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Tokenize seed text
    seed_tokens = [word_to_index.get(word, word_to_index["<unk>"]) for word in seed_text.lower().split()]

    # Pad if the seed sentence is shorter than seq_length
    while len(seed_tokens) < seq_length:
        seed_tokens.insert(0, word_to_index["<pad>"])

    generated_words = seed_text.split()  # Start with seed words

    with torch.no_grad():  # Disable gradient calculations
        for _ in range(num_words):
            # Convert to tensor and reshape for batch dimension
            input_tensor = torch.tensor(seed_tokens[-seq_length:], dtype=torch.long).unsqueeze(0).to(device)

            # Forward pass to get predictions
            output = model(input_tensor)

            # Get the most probable next word
            next_word_index = torch.argmax(output[:, -1, :]).item()
            next_word = index_to_word.get(next_word_index, "<unk>")  # Convert index to word

            # Append the predicted word
            generated_words.append(next_word)

            # Update the input sequence with the new word
            seed_tokens.append(next_word_index)

    return " ".join(generated_words)


In [None]:

# Example seed sentence
seed_sentence = "harry looked at chamber and saw jenny"

# Generate text
generated_text = generate_text(model, seed_sentence, word_to_index, index_to_word, seq_length=30, num_words=100)

# Display generated text
print("Generated Text:")
print(generated_text)

### analyzing LSTM

In [None]:
""" well the LSTM didnt work out well because I think the data set is very small for this model.
it is repeating it self I tried different learning rated but didn't see any significant improvement here. in my opinion for small data sets
small models are better. it is totally repeating itself and at the begining the results were better but i decided to make the lstm model mode complicated
to find a better results but apparently with my specific data set which is a small one, harry potter 1 & 2 it is not a good one! so myresults show that
although LSTM is a better model in comparison with RNN"""

In [None]:
# Visualization of Loss Curves
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
plt.plot(range(1, len(rnn_losses) + 1), rnn_losses, label=f'RNN (LR={best_rnn_lr})', color='blue')
plt.plot(range(1, len(lstm_losses) + 1), lstm_losses, label=f'LSTM (LR={best_lstm_lr})', color='orange')
plt.xlabel('Epoch')
plt.ylabel('Average Loss')
plt.title('Training Loss Curves for RNN and LSTM')
plt.legend()
plt.grid(True)
plt.show()

# Plot comparison across learning rates
plt.figure(figsize=(12, 6))
for lr, losses in rnn_results.items():
    plt.plot(range(1, len(losses) + 1), losses, label=f'RNN LR={lr}', linestyle='--')
for lr, losses in lstm_results.items():
    plt.plot(range(1, len(losses) + 1), losses, label=f'LSTM LR={lr}', linestyle='-')
plt.xlabel('Epoch')
plt.ylabel('Average Loss')
plt.title('Loss Curves Across Different Learning Rates')
plt.legend()
plt.grid(True)
plt.show()

NameError: name 'rnn_losses' is not defined

<Figure size 1200x600 with 0 Axes>