---

<h1 style="text-align: center;">INLP - Assignment 2</h1>

<div style="text-align: center;">
    <p>Name: Vedant Nipane</p>
    <p>Roll No: 2021102040</p>
</div>

---

# 0. Data Preprocessing

In [2]:
import os
import random
import re
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from collections import Counter
from datetime import datetime
from torch.utils.data import DataLoader, TensorDataset
import warnings
warnings.filterwarnings("ignore")


In [3]:
# Choose device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

text1_path = 'Pride and Prejudice - Jane Austen.txt'
text2_path = 'Ulysses - James Joyce.txt'

Using device: cpu


In [9]:

# Load text and split into sentences
def load_text(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        text = f.read().lower()  # Convert to lowercase

    sentences = re.split(r'[.!?]', text)  # Split on sentence boundaries
    sentences = [s.strip() for s in sentences if s.strip()]  # Remove empty lines
    return sentences

# Split into train and test (1000 test sentences, rest train)
def split_data(sentences, test_size=1000):
    random.shuffle(sentences)  # Shuffle sentences to ensure randomness
    test_sentences = sentences[:test_size]
    train_sentences = sentences[test_size:]
    return train_sentences, test_sentences

# Tokenize a list of sentences
def tokenize(sentences):
    tokenized_sentences = []
    for sentence in sentences:
        words = re.sub(r"[^a-zA-Z\s]", "", sentence).split()  # Remove special chars & split
        tokenized_sentences.append(words)
    return tokenized_sentences

# Build vocabulary from training data only
def build_vocab(tokenized_sentences, min_freq=1):
    word_counts = Counter(word for sentence in tokenized_sentences for word in sentence)
    vocab = {word: idx for idx, (word, freq) in enumerate(word_counts.items()) if freq >= min_freq}
    vocab["<UNK>"] = len(vocab)  # Add unknown token
    return vocab

# Convert words to numerical indices
def words_to_indices(sentences, vocab):
    indexed_sentences = []
    for sentence in sentences:
        indexed_sentences.append([vocab.get(word, vocab["<UNK>"]) for word in sentence])
    return indexed_sentences

# Generate n-gram dataset
def create_ngrams(indexed_sentences, n=3):
    data = []
    for sentence in indexed_sentences:
        if len(sentence) >= n:
            for i in range(len(sentence) - n):
                context = sentence[i : i + n]  # First (n) words
                target = sentence[i + n]  # Next word (prediction target)
                data.append((context, target))
    return data

# Convert dataset to PyTorch tensors
def prepare_tensors(data):
    contexts = torch.tensor([x[0] for x in data], dtype=torch.long)
    targets = torch.tensor([x[1] for x in data], dtype=torch.long)
    return contexts, targets


In [10]:
sentences1 = load_text(text1_path)

# Split into train and test (sentence-level split)
train_sentences1, test_sentences1 = split_data(sentences1)

# Tokenize sentences
train_tokens1 = tokenize(train_sentences1)
test_tokens1 = tokenize(test_sentences1)

# Build vocab from train only
train_vocab1 = build_vocab(train_tokens1, min_freq=1)

# Convert words to indices
train_indices1 = words_to_indices(train_tokens1, train_vocab1)
test_indices1 = words_to_indices(test_tokens1, train_vocab1)  # Use same vocab

# Generate n-grams
train_data_3gram1 = create_ngrams(train_indices1, n=3)
test_data_3gram1 = create_ngrams(test_indices1, n=3)

train_data_5gram1 = create_ngrams(train_indices1, n=5)
test_data_5gram1 = create_ngrams(test_indices1, n=5)

# Convert to tensors
X_train_3gram1, y_train_3gram1 = prepare_tensors(train_data_3gram1)
X_test_3gram1, y_test_3gram1 = prepare_tensors(test_data_3gram1)

X_train_5gram1, y_train_5gram1 = prepare_tensors(train_data_5gram1)
X_test_5gram1, y_test_5gram1 = prepare_tensors(test_data_5gram1)

# Print dataset sizes
print(f"Vocabulary Size (1): {len(train_vocab1)}")
print(f"Train 3-gram Dataset Size (1): {len(X_train_3gram1)}")
print(f"Test 3-gram Dataset Size (1): {len(X_test_3gram1)}")
print(f"Train 5-gram Dataset Size (1): {len(X_train_5gram1)}")
print(f"Test 5-gram Dataset Size (1): {len(X_test_5gram1)}")


Vocabulary Size (1): 6589
Train 3-gram Dataset Size (1): 89705
Test 3-gram Dataset Size (1): 14391
Train 5-gram Dataset Size (1): 78650
Test 5-gram Dataset Size (1): 12707


In [11]:
sentences2 = load_text(text2_path)

# Split into train and test (sentence-level split)
train_sentences2, test_sentences2 = split_data(sentences2)

# Tokenize sentences
train_tokens2 = tokenize(train_sentences2)
test_tokens2 = tokenize(test_sentences2)

# Build vocab from train only
train_vocab2 = build_vocab(train_tokens2, min_freq=1)

# Convert words to indices
train_indices2 = words_to_indices(train_tokens2, train_vocab2)
test_indices2 = words_to_indices(test_tokens2, train_vocab2)  # Use same vocab

# Generate n-grams
train_data_3gram2 = create_ngrams(train_indices2, n=3)
test_data_3gram2 = create_ngrams(test_indices2, n=3)

train_data_5gram2 = create_ngrams(train_indices2, n=5)
test_data_5gram2 = create_ngrams(test_indices2, n=5)

# Convert to tensors
X_train_3gram2, y_train_3gram2 = prepare_tensors(train_data_3gram2)
X_test_3gram2, y_test_3gram2 = prepare_tensors(test_data_3gram2)

X_train_5gram2, y_train_5gram2 = prepare_tensors(train_data_5gram2)
X_test_5gram2, y_test_5gram2 = prepare_tensors(test_data_5gram2)

# Print dataset sizes
print(f"Vocabulary Size (2): {len(train_vocab2)}")
print(f"Train 3-gram Dataset Size (2): {len(X_train_3gram2)}")
print(f"Test 3-gram Dataset Size (2): {len(X_test_3gram2)}")
print(f"Train 5-gram Dataset Size (2): {len(X_train_5gram2)}")
print(f"Test 5-gram Dataset Size (2): {len(X_test_5gram2)}")


Vocabulary Size (2): 28941
Train 3-gram Dataset Size (2): 191657
Test 3-gram Dataset Size (2): 7255
Train 5-gram Dataset Size (2): 160305
Test 5-gram Dataset Size (2): 6012


In [12]:
# Create a mapping from indices to words (inverse of train_vocab)
idx_to_word1 = {idx: word for word, idx in train_vocab1.items()}
idx_to_word2 = {idx: word for word, idx in train_vocab2.items()}

# 1. Feed Forward Neural Network

## 1.1 Model Architecture

In [4]:


class FFNNLanguageModel(nn.Module):
    def __init__(self, vocab_size, context_size, embedding_dim=100, hidden_dim=256):
        super(FFNNLanguageModel, self).__init__()
        
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        
        # Flatten the embeddings and feed through fully connected layers
        self.ff_layers = nn.Sequential(
            nn.Linear(context_size * embedding_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(hidden_dim, vocab_size)
        )
        
        self.log_softmax = nn.LogSoftmax(dim=1)
        
    def forward(self, x):
        # x shape: (batch_size, context_size)
        embeds = self.embeddings(x)  # (batch_size, context_size, embedding_dim)
        
        # Flatten the embeddings
        batch_size = embeds.shape[0]
        flattened = embeds.view(batch_size, -1)
        
        # Feed through layers
        hidden = self.ff_layers(flattened)
        log_probs = self.log_softmax(hidden)
        return log_probs


## 1.2 Model Tracker

In [9]:

class ModelTracker:
    def __init__(self):
        self.metrics = {
            'model_name': [],
            'epoch': [],
            'train_loss': [],
            'test_loss': [],
            'train_perplexity': [],
            'test_perplexity': []
        }
    
    def add_epoch_metrics(self, model_name, epoch, train_loss, test_loss):
        self.metrics['model_name'].append(model_name)
        self.metrics['epoch'].append(epoch)
        self.metrics['train_loss'].append(train_loss)
        self.metrics['test_loss'].append(test_loss)
        self.metrics['train_perplexity'].append(None)  # Will be updated later
        self.metrics['test_perplexity'].append(None)
    
    def update_final_perplexity(self, model_name, train_perp, test_perp):
        mask = (self.metrics['model_name'] == model_name)
        self.metrics['train_perplexity'][-1] = train_perp
        self.metrics['test_perplexity'][-1] = test_perp
    
    def save_metrics(self, filename='model_metrics.csv'):
        df = pd.DataFrame(self.metrics)
        df.to_csv(filename, index=False)
        return df


## 1.3 Function to calculate Perplexity and Saving them. 

In [10]:

def ensure_directories():
    """Create necessary directories if they don't exist"""
    directories = ['Models', 'Perplexity']
    for directory in directories:
        if not os.path.exists(directory):
            os.makedirs(directory)

def calculate_sentence_perplexity(model, sentence_indices, context_size, device='cuda'):
    """Calculate perplexity for a single sentence"""
    if len(sentence_indices) <= context_size:
        return float('inf')
    
    model.eval()
    total_log_prob = 0
    count = 0
    
    for i in range(len(sentence_indices) - context_size):
        context = torch.tensor([sentence_indices[i:i+context_size]], device=device)
        target = torch.tensor([sentence_indices[i+context_size]], device=device)
        
        with torch.no_grad():
            log_probs = model(context)
            total_log_prob += -log_probs[0][target].item()
            count += 1
    
    return np.exp(total_log_prob / count) if count > 0 else float('inf')

def save_perplexity_results(corpus_name, n_gram, dataset_type, sentences_indices, perplexities):
    """Save perplexity results to file"""
    file_name = f"2021102040_ffnn_{corpus_name}_N{n_gram}_{dataset_type}-perplexity.txt"
    file_path = os.path.join('Perplexity', file_name)
    
    with open(file_path, 'w') as f:
        avg_perplexity = np.mean([p for p in perplexities if p != float('inf')])
        f.write(f"Overall Average Perplexity: {avg_perplexity:.2f}\n\n")
        
        for idx, (sentence, perp) in enumerate(zip(sentences_indices, perplexities), 1):
            f.write(f" {sentence} - Perplexity: {perp:.2f}\n")

def evaluate_and_save_perplexity(model, sentences_indices, context_size, corpus_name, n_gram, dataset_type, device='cpu'):
    """Evaluate perplexity for each sentence and save results"""
    perplexities = []
    
    for sentence in sentences_indices:
        if len(sentence) > context_size:
            perp = calculate_sentence_perplexity(model, sentence, context_size, device)
            if perp != float('inf'):
                perplexities.append(perp)
    
    save_perplexity_results(corpus_name, n_gram, dataset_type, sentences_indices, perplexities)
    return np.mean(perplexities)


## 1.4 Training The Models

In [17]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

def train_and_evaluate_model(model, train_data, test_data, corpus_name, n_gram, word_to_idx, device='cpu', batch_size=64, num_epochs=5):
    """Complete training and evaluation pipeline with vocabulary saving"""
    ensure_directories()
    model_tracker = ModelTracker()
    
    # Create model name for saving
    model_name = f"2021102040_ffnn_{corpus_name}_N{n_gram}"
    model_path = os.path.join('Models', f"{model_name}.pth")  # Changed to .pth for clarity
    
    # Prepare data loaders
    train_loader = DataLoader(TensorDataset(*train_data), batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(TensorDataset(*test_data), batch_size=batch_size)
    
    # Training setup
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    best_test_loss = float('inf')
    
    # Training loop
    for epoch in range(num_epochs):
        print(f'Training for Epoch: {epoch+1}')
        model.train()
        total_train_loss = 0
        
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            
            optimizer.zero_grad()
            output = model(batch_x)
            loss = criterion(output, batch_y)
            
            loss.backward()
            optimizer.step()
            
            total_train_loss += loss.item()
        
        avg_train_loss = total_train_loss / len(train_loader)
        
        # Validation phase
        model.eval()
        total_test_loss = 0
        
        with torch.no_grad():
            for batch_x, batch_y in test_loader:
                batch_x, batch_y = batch_x.to(device), batch_y.to(device)
                output = model(batch_x)
                loss = criterion(output, batch_y)
                total_test_loss += loss.item()
        
        avg_test_loss = total_test_loss / len(test_loader)
        
        # Save best model with vocabulary
        if avg_test_loss < best_test_loss:
            best_test_loss = avg_test_loss
            save_dict = {
                "model_state_dict": model.state_dict(),
                "word_to_idx": word_to_idx,  # Save vocabulary
                "vocab_size": len(word_to_idx)
            }
            torch.save(save_dict, model_path)
        
        # Track metrics
        model_tracker.add_epoch_metrics(model_name, epoch+1, avg_train_loss, avg_test_loss)
        
        print(f'Epoch [{epoch+1}/{num_epochs}]')
        print(f'Training Loss: {avg_train_loss:.4f}')
        print(f'Test Loss: {avg_test_loss:.4f}')
        print('-' * 50)
    
    # Load best model for evaluation
    checkpoint = torch.load(model_path)
    model.load_state_dict(checkpoint["model_state_dict"])
    model.eval()
    
    return model, model_tracker


In [19]:
def run_all_experiments(experiment_data, train_vocab1, train_vocab2, device='cpu'):
    """Run all experiments for both corpora and both n-gram sizes"""
    all_metrics = []
    
    for corpus_name in ['corpus1', 'corpus2']:
        vocab = train_vocab1 if corpus_name == 'corpus1' else train_vocab2  
        
        for n_gram in [3, 5]:
            print(f"\nTraining {corpus_name} with {n_gram}-gram model")
            
            # Get the data for this experiment
            train_data = experiment_data[corpus_name][n_gram]['train']
            test_data = experiment_data[corpus_name][n_gram]['test']
            
            # Initialize model
            model = FFNNLanguageModel(
                vocab_size=len(vocab),
                context_size=n_gram,
                embedding_dim=100,
                hidden_dim=256
            ).to(device)
            
            # Train and evaluate
            model, tracker = train_and_evaluate_model(
                model,
                train_data,
                test_data,
                corpus_name,
                n_gram,
                vocab,
                device
            )
            
            all_metrics.append(tracker)
    
    # Combine and save all metrics
    combined_metrics = pd.concat([pd.DataFrame(tracker.metrics) for tracker in all_metrics])
    combined_metrics.to_csv('ffnn_all_metrics.csv', index=False)
    
    return combined_metrics


In [20]:

# For corpus 1 (Pride and Prejudice)
train_data1_3gram = (X_train_3gram1, y_train_3gram1)  # These are already tensors
test_data1_3gram = (X_test_3gram1, y_test_3gram1)

train_data1_5gram = (X_train_5gram1, y_train_5gram1)
test_data1_5gram = (X_test_5gram1, y_test_5gram1)

# For corpus 2 (Ulysses)
train_data2_3gram = (X_train_3gram2, y_train_3gram2)
test_data2_3gram = (X_test_3gram2, y_test_3gram2)

train_data2_5gram = (X_train_5gram2, y_train_5gram2)
test_data2_5gram = (X_test_5gram2, y_test_5gram2)

# Create a dictionary to store the data
experiment_data = {
    'corpus1': {
        3: {'train': train_data1_3gram, 'test': test_data1_3gram},
        5: {'train': train_data1_5gram, 'test': test_data1_5gram}
    },
    'corpus2': {
        3: {'train': train_data2_3gram, 'test': test_data2_3gram},
        5: {'train': train_data2_5gram, 'test': test_data2_5gram}
    }
}



In [21]:
# Call the function
metrics_df = run_all_experiments(
    experiment_data,
    train_vocab1,
    train_vocab2,
    device
)


Training corpus1 with 3-gram model
Training for Epoch: 1
Epoch [1/5]
Training Loss: 6.2643
Test Loss: 5.9381
--------------------------------------------------
Training for Epoch: 2
Epoch [2/5]
Training Loss: 5.6886
Test Loss: 5.7628
--------------------------------------------------
Training for Epoch: 3
Epoch [3/5]
Training Loss: 5.3843
Test Loss: 5.7431
--------------------------------------------------
Training for Epoch: 4
Epoch [4/5]
Training Loss: 5.1597
Test Loss: 5.7674
--------------------------------------------------
Training for Epoch: 5
Epoch [5/5]
Training Loss: 4.9792
Test Loss: 5.8399
--------------------------------------------------

Training corpus1 with 5-gram model
Training for Epoch: 1
Epoch [1/5]
Training Loss: 6.3483
Test Loss: 6.0485
--------------------------------------------------
Training for Epoch: 2
Epoch [2/5]
Training Loss: 5.8030
Test Loss: 5.8999
--------------------------------------------------
Training for Epoch: 3
Epoch [3/5]
Training Loss: 5.47

## 1.5 Calculating and Saving Perplexities

In [None]:

perplexity_df = pd.DataFrame(columns=['model_name', 'train_perplexity', 'test_perplexity'])

perplexity_df = pd.DataFrame({
    'model_name': pd.Series(dtype='str'),  # Or 'object'
    'train_perplexity': pd.Series(dtype='float'),
    'test_perplexity': pd.Series(dtype='float')
})
perplexity_df = pd.DataFrame([])


#### Perplexity for N = 3; Corpus - Pride and Prejudice

In [None]:
model_3gram1 = FFNNLanguageModel(vocab_size=len(train_vocab1), context_size=3, embedding_dim=100, hidden_dim=256)
checkpoint = torch.load("Models/2021102040_ffnn_corpus1_N3.pth")
model_3gram1.load_state_dict(checkpoint["model_state_dict"])  # Corrected key

word_to_idx = checkpoint["word_to_idx"]  # Retrieve the vocabulary mapping if stored

model_3gram1.to(device)
model_3gram1.eval()


FFNNLanguageModel(
  (embeddings): Embedding(6662, 100)
  (ff_layers): Sequential(
    (0): Linear(in_features=300, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=256, out_features=256, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=256, out_features=6662, bias=True)
  )
  (log_softmax): LogSoftmax(dim=1)
)

In [None]:
model_3gram1 = FFNNLanguageModel(vocab_size=len(train_vocab1), context_size=3, embedding_dim=100, hidden_dim=256)
checkpoint = torch.load("Models/2021102040_ffnn_corpus1_N3.pth")
model_3gram1.load_state_dict(checkpoint["model_state_dict"])  # Corrected key

word_to_idx = checkpoint["word_to_idx"]  # Retrieve the vocabulary mapping if stored

model_3gram1.to(device)
model_3gram1.eval()

print("\nCalculating training perplexity...")
train_perplexity = evaluate_and_save_perplexity(model_3gram1,train_indices1,context_size=3,corpus_name='corpus1',n_gram=3,dataset_type='train',device=device)
print("Calculating test perplexity...")
test_perplexity = evaluate_and_save_perplexity(model_3gram1,test_indices1,context_size=3,corpus_name='corpus1',n_gram=3,dataset_type='test',device=device)

print("\nFinal Results for 3-gram Model (Corpus 1):")
print(f"Training Perplexity: {train_perplexity:.2f}")
print(f"Test Perplexity: {test_perplexity:.2f}")

new_row = pd.Series({'model_name': "N = 3; Corpus 1", 'train_perplexity': train_perplexity, 'test_perplexity': test_perplexity})
perplexity_df = pd.concat([perplexity_df, new_row.to_frame().T], ignore_index=True)


#### Perplexity for N = 5; Corpus - Pride and Prejudice

In [None]:
model_5gram1 = FFNNLanguageModel(vocab_size=len(train_vocab1), context_size=5, embedding_dim=100, hidden_dim=256)
checkpoint = torch.load("Models/2021102040_ffnn_corpus1_N5.pth")

model_5gram1.load_state_dict(checkpoint["model_state_dict"])  # Load weights
word_to_idx = checkpoint["word_to_idx"]  # Load vocabulary mapping if stored

model_5gram1.to(device)
model_5gram1.eval()

print("\nCalculating training perplexity...")
train_perplexity = evaluate_and_save_perplexity(model_5gram1, train_indices1, context_size=5, corpus_name='corpus1', n_gram=5, dataset_type='train', device=device)
print("Calculating test perplexity...")
test_perplexity = evaluate_and_save_perplexity(model_5gram1, test_indices1, context_size=5, corpus_name='corpus1', n_gram=5, dataset_type='test', device=device)

print("\nFinal Results for 5-gram Model (Corpus 1):")
print(f"Training Perplexity: {train_perplexity:.2f}")
print(f"Test Perplexity: {test_perplexity:.2f}")

new_row = pd.Series({'model_name': "N = 5; Corpus 1", 'train_perplexity': train_perplexity, 'test_perplexity': test_perplexity})
perplexity_df = pd.concat([perplexity_df, new_row.to_frame().T], ignore_index=True)


#### Perplexity for N = 3; Corpus - Ulysses

In [None]:
model_3gram2 = FFNNLanguageModel(vocab_size=len(train_vocab2), context_size=3, embedding_dim=100, hidden_dim=256)
checkpoint = torch.load("Models/2021102040_ffnn_corpus2_N3.pth")

model_3gram2.load_state_dict(checkpoint["model_state_dict"])  # Load weights
word_to_idx = checkpoint["word_to_idx"]  # Load vocabulary mapping if stored

model_3gram2.to(device)
model_3gram2.eval()

print("\nCalculating training perplexity...")
train_perplexity = evaluate_and_save_perplexity(model_3gram2, train_indices2, context_size=3, corpus_name='corpus2', n_gram=3, dataset_type='train', device=device)
print("Calculating test perplexity...")
test_perplexity = evaluate_and_save_perplexity(model_3gram2, test_indices2, context_size=3, corpus_name='corpus2', n_gram=3, dataset_type='test', device=device)

print("\nFinal Results for 3-gram Model (Corpus 2):")
print(f"Training Perplexity: {train_perplexity:.2f}")
print(f"Test Perplexity: {test_perplexity:.2f}")

new_row = pd.Series({'model_name': "N = 3; Corpus 2", 'train_perplexity': train_perplexity, 'test_perplexity': test_perplexity})
perplexity_df = pd.concat([perplexity_df, new_row.to_frame().T], ignore_index=True)



Calculating training perplexity...
Calculating test perplexity...

Final Results for 3-gram Model (Corpus 2):
Training Perplexity: 10943.37
Test Perplexity: 14530.91


#### Perplexity for N = 5; Corpus - Ulysses

In [None]:
model_5gram2 = FFNNLanguageModel(vocab_size=len(train_vocab2), context_size=5, embedding_dim=100, hidden_dim=256)
checkpoint = torch.load("Models/2021102040_ffnn_corpus2_N5.pth")

model_5gram2.load_state_dict(checkpoint["model_state_dict"])  # Load weights
word_to_idx = checkpoint["word_to_idx"]  # Load vocabulary mapping if stored

model_5gram2.to(device)
model_5gram2.eval()

print("\nCalculating training perplexity...")
train_perplexity = evaluate_and_save_perplexity(model_5gram2, train_indices2, context_size=5, corpus_name='corpus2', n_gram=5, dataset_type='train', device=device)
print("Calculating test perplexity...")
test_perplexity = evaluate_and_save_perplexity(model_5gram2, test_indices2, context_size=5, corpus_name='corpus2', n_gram=5, dataset_type='test', device=device)

print("\nFinal Results for 5-gram Model (Corpus 2):")
print(f"Training Perplexity: {train_perplexity:.2f}")
print(f"Test Perplexity: {test_perplexity:.2f}")

new_row = pd.Series({'model_name': "N = 5; Corpus 2", 'train_perplexity': train_perplexity, 'test_perplexity': test_perplexity})
perplexity_df = pd.concat([perplexity_df, new_row.to_frame().T], ignore_index=True)



Calculating training perplexity...
Calculating test perplexity...

Final Results for 5-gram Model (Corpus 2):
Training Perplexity: 8193.14
Test Perplexity: 19272.27


### All Perplexities

In [68]:
perplexity_df

Unnamed: 0,model_name,train_perplexity,test_perplexity
0,N = 3; Corpus 1,396.950281,6805.919115
1,N = 5; Corpus 1,443.198667,185495.097141
2,N = 3; Corpus 2,10943.366095,14530.913292
3,N = 5; Corpus 2,8193.136195,19272.271428


## 1.6 Predicting Next word

In [35]:
def generate_prediction(model, context, idx_to_word, top_k=5):
    model.eval()
    with torch.no_grad():
        context_tensor = torch.tensor([context], device=device)
        log_probs = model(context_tensor)
        probs = torch.exp(log_probs)
        top_k_probs, top_k_indices = torch.topk(probs[0], k=top_k)
        
        predictions = [(idx_to_word[idx.item()], prob.item()) 
                      for idx, prob in zip(top_k_indices, top_k_probs)]
    return predictions

# Print some sample predictions
print("\nSample Predictions:")
for i in range(3):  # Show 3 random examples
    if len(X_test_3gram1) > i:
        context = X_test_3gram1[i].tolist()
        actual = idx_to_word1[y_test_3gram1[i].item()]
        context_words = [idx_to_word1[idx] for idx in context]
        
        print(f"\nContext: {' '.join(context_words)}")
        print(f"Actual next word: {actual}")
        print("Top 5 predictions:")
        predictions = generate_prediction(model_3gram1, context, idx_to_word1)
        for word, prob in predictions:
            print(f"{word}: {prob:.4f}")



Sample Predictions:

Context: having never even
Actual next word: fancied
Top 5 predictions:
to: 0.0126
the: 0.0100
and: 0.0093
of: 0.0082
be: 0.0079

Context: never even fancied
Actual next word: herself
Top 5 predictions:
and: 0.0434
the: 0.0376
to: 0.0291
that: 0.0281
of: 0.0209

Context: even fancied herself
Actual next word: in
Top 5 predictions:
as: 0.0535
and: 0.0445
to: 0.0362
of: 0.0348
in: 0.0269


In [5]:
def predict_next_word(model, sentence, word_to_idx, idx_to_word, context_size=3, device='cpu', top_k=5):
    sentence = sentence.lower()
    words = re.sub(r"[^a-zA-Z\s]", "", sentence).split()
    if len(words) < context_size:
        raise ValueError(f"Input sentence must have at least {context_size} words")
    
    context_words = words[-context_size:]
    context_indices = [word_to_idx.get(word, word_to_idx["<UNK>"]) for word in context_words]
    context_tensor = torch.tensor([context_indices], dtype=torch.long).to(device)
    model.eval()
    with torch.no_grad():
        log_probs = model(context_tensor)
        probs = torch.exp(log_probs)
        top_k_probs, top_k_indices = torch.topk(probs[0], k=top_k)
        
        predictions = [(idx_to_word[idx.item()], prob.item()) 
                      for idx, prob in zip(top_k_indices, top_k_probs)]
    
    return predictions

def print_predictions(sentence, predictions):
    print(f"\nInput sentence: {sentence}")
    print("Top 5 predicted next words:")
    for word, prob in predictions:
        print(f"{word}: {prob:.4f}")

### Testing On Saved Models

#### 3 gram model , Corpus 1

In [15]:
checkpoint = torch.load("Models/2021102040_ffnn_corpus1_N3.pth")

# Load the saved vocabulary
word_to_idx = checkpoint["word_to_idx"]
vocab_size = checkpoint["vocab_size"]
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Ensure vocab size is the same before initializing the model
model_3gram1 = FFNNLanguageModel(vocab_size=vocab_size, context_size=3, embedding_dim=100, hidden_dim=256)
model_3gram1.load_state_dict(checkpoint["model_state_dict"])
model_3gram1.eval()


FFNNLanguageModel(
  (embeddings): Embedding(6662, 100)
  (ff_layers): Sequential(
    (0): Linear(in_features=300, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=256, out_features=256, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=256, out_features=6662, bias=True)
  )
  (log_softmax): LogSoftmax(dim=1)
)

In [17]:
# Example usage
custom_sentence = "the cat sat on a mat"
predictions = predict_next_word(
    model=model_3gram1,
    sentence=custom_sentence,
    word_to_idx=word_to_idx,
    idx_to_word=idx_to_word,
    context_size=3
)

print_predictions(custom_sentence, predictions)


Input sentence: the cat sat on a mat
Top 5 predicted next words:
of: 0.0351
and: 0.0210
to: 0.0140
which: 0.0140
who: 0.0122


#### 5 gram model , Corpus 1

In [18]:
checkpoint = torch.load("Models/2021102040_ffnn_corpus1_N5.pth")

# Load the saved vocabulary
word_to_idx = checkpoint["word_to_idx"]
vocab_size = checkpoint["vocab_size"]
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Ensure vocab size is the same before initializing the model
model_5gram1 = FFNNLanguageModel(vocab_size=vocab_size, context_size=5, embedding_dim=100, hidden_dim=256)
model_5gram1.load_state_dict(checkpoint["model_state_dict"])
model_5gram1.eval()


FFNNLanguageModel(
  (embeddings): Embedding(6662, 100)
  (ff_layers): Sequential(
    (0): Linear(in_features=500, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=256, out_features=256, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=256, out_features=6662, bias=True)
  )
  (log_softmax): LogSoftmax(dim=1)
)

In [19]:
# Example usage
custom_sentence = "the cat sat on a mat"
predictions = predict_next_word(
    model=model_5gram1,
    sentence=custom_sentence,
    word_to_idx=word_to_idx,
    idx_to_word=idx_to_word,
    context_size=5
)

print_predictions(custom_sentence, predictions)


Input sentence: the cat sat on a mat
Top 5 predicted next words:
of: 0.1075
and: 0.0875
to: 0.0442
which: 0.0434
in: 0.0285


#### 3 gram model , Corpus 2

In [42]:
checkpoint = torch.load("Models/2021102040_ffnn_corpus2_N3.pth")

# Load the saved vocabulary
word_to_idx = checkpoint["word_to_idx"]
vocab_size = checkpoint["vocab_size"]
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Ensure vocab size is the same before initializing the model
model_3gram2 = FFNNLanguageModel(vocab_size=vocab_size, context_size=3, embedding_dim=100, hidden_dim=256)
model_3gram2.load_state_dict(checkpoint["model_state_dict"])
model_3gram2.eval()


FFNNLanguageModel(
  (embeddings): Embedding(29101, 100)
  (ff_layers): Sequential(
    (0): Linear(in_features=300, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=256, out_features=256, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=256, out_features=29101, bias=True)
  )
  (log_softmax): LogSoftmax(dim=1)
)

In [43]:
# Example usage
custom_sentence = "the cat sat on a mat"
predictions = predict_next_word(
    model=model_3gram2,
    sentence=custom_sentence,
    word_to_idx=word_to_idx,
    idx_to_word=idx_to_word,
    context_size=3
)

print_predictions(custom_sentence, predictions)


Input sentence: the cat sat on a mat
Top 5 predicted next words:
of: 0.0264
and: 0.0146
the: 0.0092
in: 0.0088
to: 0.0075


#### 5 gram model , Corpus 2

In [44]:
checkpoint = torch.load("Models/2021102040_ffnn_corpus2_N5.pth")

# Load the saved vocabulary
word_to_idx = checkpoint["word_to_idx"]
vocab_size = checkpoint["vocab_size"]
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Ensure vocab size is the same before initializing the model
model_5gram2 = FFNNLanguageModel(vocab_size=vocab_size, context_size=5, embedding_dim=100, hidden_dim=256)
model_5gram2.load_state_dict(checkpoint["model_state_dict"])
model_5gram2.eval()


FFNNLanguageModel(
  (embeddings): Embedding(29101, 100)
  (ff_layers): Sequential(
    (0): Linear(in_features=500, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=256, out_features=256, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=256, out_features=29101, bias=True)
  )
  (log_softmax): LogSoftmax(dim=1)
)

In [45]:
# Example usage
custom_sentence = "the cat sat on a mat"
predictions = predict_next_word(
    model=model_5gram2,
    sentence=custom_sentence,
    word_to_idx=word_to_idx,
    idx_to_word=idx_to_word,
    context_size=5
)

print_predictions(custom_sentence, predictions)


Input sentence: the cat sat on a mat
Top 5 predicted next words:
of: 0.0211
and: 0.0134
the: 0.0097
in: 0.0080
to: 0.0067


# 2. Vanilla RNN

## 2.1 Model Architecture

In [5]:

class VanillaRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        """
        Initialize the Vanilla RNN model.
        
        Args:
            vocab_size (int): Size of the vocabulary
            embedding_dim (int): Dimension of word embeddings
            hidden_dim (int): Dimension of hidden state
        """
        super(VanillaRNN, self).__init__()
        
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        
        # Word embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        
        # RNN layer
        self.rnn = nn.RNN(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            batch_first=True
        )
        
        # Output layer
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x, hidden=None):
        """
        Forward pass of the model.
        
        Args:
            x (torch.Tensor): Input tensor of shape (batch_size, sequence_length)
            hidden (torch.Tensor, optional): Initial hidden state
            
        Returns:
            tuple: (output probabilities, final hidden state)
        """
        batch_size = x.size(0)
        
        # Initialize hidden state if not provided
        if hidden is None:
            hidden = torch.zeros(1, batch_size, self.hidden_dim).to(x.device)
        
        # Embed input words
        embedded = self.embedding(x)  # Shape: (batch_size, sequence_length, embedding_dim)
        
        # Pass through RNN
        rnn_out, hidden = self.rnn(embedded, hidden)
        # rnn_out shape: (batch_size, sequence_length, hidden_dim)
        # hidden shape: (1, batch_size, hidden_dim)
        
        # Pass through final layer
        output = self.fc(rnn_out)  # Shape: (batch_size, sequence_length, vocab_size)
        
        return output, hidden
    
    def predict_next_word(self, context, idx_to_word, top_k=5):
        """
        Predict the next word given a context.
        
        Args:
            context (torch.Tensor): Input context tensor
            idx_to_word (dict): Mapping from indices to words
            top_k (int): Number of top predictions to return
            
        Returns:
            list: Top k predicted words with their probabilities
        """
        self.eval()
        with torch.no_grad():
            output, _ = self(context)
            probabilities = torch.softmax(output[:, -1], dim=-1)
            top_probs, top_indices = torch.topk(probabilities, top_k)
            
            predictions = [
                (idx_to_word[idx.item()], prob.item())
                for idx, prob in zip(top_indices[0], top_probs[0])
            ]
        
        return predictions


## 2.2 Model Training

In [21]:

# Training function
def train_rnn(model, train_data, criterion, optimizer, num_epochs, batch_size):
    """
    Train the RNN model.
    
    Args:
        model (VanillaRNN): The RNN model
        train_data (tuple): Tuple of (X_train, y_train)
        criterion: Loss function
        optimizer: Optimizer
        num_epochs (int): Number of training epochs
        batch_size (int): Batch size
    """
    X_train, y_train = train_data
    dataset_size = len(X_train)
    loss_df = []
    print(f'Training ....')

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        # Create batches
        for i in range(0, dataset_size, batch_size):
            batch_X = X_train[i:i + batch_size]
            batch_y = y_train[i:i + batch_size]
            
            # Forward pass
            output, _ = model(batch_X)
            loss = criterion(output[:, -1], batch_y)
            
            # Backward pass and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        avg_loss = total_loss / (dataset_size // batch_size)
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')
        loss_df.append(avg_loss)
    
    return loss_df

In [22]:
def train_helper(model,X_train,y_train,batch_size = 32,N_epochs = 5,learning_rate = 0.001):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Train the model
    loss_df = train_rnn(
        model,
        (X_train, y_train),
        criterion,
        optimizer,
        N_epochs,
        batch_size
    )
    return loss_df

In [6]:
# Hyperparameters
EMBEDDING_DIM = 128
HIDDEN_DIM = 256
BATCH_SIZE = 32
NUM_EPOCHS = 5
LEARNING_RATE = 0.001

### 2.2.1 Training on Pride and Prejudice

In [48]:
# Initialize model for the first text (Pride and Prejudice)
model_rnn_1 = VanillaRNN(
    vocab_size=len(train_vocab1),
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM
)

loss1 = train_helper(model_rnn_1,X_train_3gram1,y_train_3gram1)


Training ....
Epoch 1/5, Loss: 5.9888
Epoch 2/5, Loss: 5.0141
Epoch 3/5, Loss: 4.4084
Epoch 4/5, Loss: 3.9088
Epoch 5/5, Loss: 3.5051


In [49]:
# Define paths
model_path = "Models/2021102040_rnn_corpus1.pth"

# Ensure the directory exists
os.makedirs("Models", exist_ok=True)

# Save all required components
save_data = {
    "model_state_dict": model_rnn_1.state_dict(),  # Model weights
    "word_to_idx": train_vocab1,  # Vocabulary mapping
    "idx_to_word": idx_to_word1,  # Reverse mapping
    "hyperparams": {
        "embedding_dim": EMBEDDING_DIM,
        "hidden_dim": HIDDEN_DIM,
        "vocab_size": len(train_vocab1)    },
}

torch.save(save_data, model_path)

print(f"✅ Model and metadata saved at: {model_path}")


✅ Model and metadata saved at: Models/2021102040_rnn_corpus1.pth


### 2.2.2 Training on Ulysses

In [50]:
# Initialize model for the first text (Pride and Prejudice)
model_rnn_2 = VanillaRNN(
    vocab_size=len(train_vocab2),
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM
)

loss2 = train_helper(model_rnn_2,X_train_3gram2,y_train_3gram2)


Training ....
Epoch 1/5, Loss: 7.5252
Epoch 2/5, Loss: 6.1271
Epoch 3/5, Loss: 5.2261
Epoch 4/5, Loss: 4.4929
Epoch 5/5, Loss: 3.9424


In [None]:
# Define paths
model_path = "Models/2021102040_rnn_corpus2.pth"

# Ensure the directory exists
os.makedirs("Models", exist_ok=True)

# Save all required components
save_data = {
    "model_state_dict": model_rnn_2.state_dict(),  # Model weights
    "word_to_idx": train_vocab2,  # Vocabulary mapping
    "idx_to_word": idx_to_word2,  # Reverse mapping
    "hyperparams": {
        "embedding_dim": EMBEDDING_DIM,
        "hidden_dim": HIDDEN_DIM,
        "vocab_size": len(train_vocab2)    },
}

torch.save(save_data, model_path)

print(f"✅ Model and metadata saved at: {model_path}")


✅ Model and metadata saved at: Models/2021102040_rnn_corpus2.pth


### 2.2.3 Epoch Wise Loss for both Corpus

In [52]:
loss_df = pd.DataFrame({
    'Epoch': list(range(1, 6)),  # Epochs 1 to 5
    'Pride and Prejudice': loss1,
    'Ulysses': loss2
})

# Saving to a CSV file (optional)
loss_df.to_csv('rnn_loss_data.csv', index=False)

# Display the DataFrame
print(loss_df)

   Epoch  Pride and Prejudice   Ulysses
0      1             5.988782  7.525239
1      2             5.014135  6.127071
2      3             4.408369  5.226066
3      4             3.908786  4.492947
4      5             3.505142  3.942395


## 2.3 Perplexity Calculation

In [7]:

def calculate_sentence_perplexity_rnn(model, sentence_indices, device):
    
    model.eval()
    
    # Convert sentence indices to tensor and move to device
    sentence_tensor = torch.tensor(sentence_indices, dtype=torch.long, device=device).unsqueeze(0)  # Shape: (1, seq_len)
    
    # Initialize hidden state directly here instead of calling init_hidden()
    batch_size = sentence_tensor.shape[0]  # Should be 1
    hidden = torch.zeros(1, batch_size, model.hidden_dim, device=device)  # Manually initialized hidden state

    loss_function = nn.CrossEntropyLoss()
    total_loss = 0.0
    count = 0

    with torch.no_grad():
        for i in range(len(sentence_indices) - 1):  # Predict each word given previous ones
            input_word = sentence_tensor[:, i].unsqueeze(1)  # Shape: (1, 1)
            target_word = sentence_tensor[:, i + 1]  # Shape: (1,)

            output, hidden = model(input_word, hidden)  # Forward pass

            output = output.squeeze(1)  # Remove sequence dimension (1, vocab_size)
            loss = loss_function(output, target_word)  # Compute loss
            total_loss += loss.item()
            count += 1

    avg_loss = total_loss / count if count > 0 else float('inf')
    perplexity = torch.exp(torch.tensor(avg_loss)).item()  # Convert loss to perplexity

    return perplexity

def save_perplexity_results_rnn(corpus_name, dataset_type, sentences_indices, perplexities):
    """Save perplexity results to file for RNN"""
    file_name = f"2021102040_rnn_{corpus_name}_{dataset_type}-perplexity.txt"
    file_path = os.path.join('Perplexity', file_name)

    with open(file_path, 'w') as f:
        avg_perplexity = np.mean([p for p in perplexities if p != float('inf')])
        f.write(f"Overall Average Perplexity: {avg_perplexity:.2f}\n\n")
        
        for idx, (sentence, perp) in enumerate(zip(sentences_indices, perplexities), 1):
            f.write(f" {sentence} - Perplexity: {perp:.2f}\n")

def evaluate_and_save_perplexity_rnn(model, sentences_indices, corpus_name, dataset_type, device='cpu'):
    """Evaluate perplexity for each sentence using an RNN and save results"""
    perplexities = []

    for sentence in sentences_indices:
        if len(sentence) > 1:  # Ensure valid sentence length
            perp = calculate_sentence_perplexity_rnn(model, sentence, device)
            if perp != float('inf'):
                perplexities.append(perp)

    save_perplexity_results_rnn(corpus_name, dataset_type, sentences_indices, perplexities)
    return np.mean(perplexities)


In [105]:
perplexity_rnn_df = pd.DataFrame(columns=['Corpus', 'train_perplexity', 'test_perplexity'])

perplexity_rnn_df = pd.DataFrame({
    'Corpus': pd.Series(dtype='str'),  # Or 'object'
    'train_perplexity': pd.Series(dtype='float'),
    'test_perplexity': pd.Series(dtype='float')
})
perplexity_rnn_df = pd.DataFrame([])


#### Perplexity for Corpus - Pride and Prejudice

In [None]:


# # Load model weights
# state_dict = torch.load("Models/2021102040_rnn_corpus1.pt")
# # expected_vocab_size = state_dict['embedding.weight'].shape[0]
# # Initialize model
# model_rnn_1 = VanillaRNN(vocab_size=len(train_vocab1), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM)

# model_rnn_1.load_state_dict(state_dict)
# model_rnn_1.to(device)
# model_rnn_1.eval()

# Compute perplexity
print("\nCalculating training perplexity...")
train_perplexity = evaluate_and_save_perplexity_rnn(model_rnn_1, train_indices1, 'corpus1', 'train', device)

print("Calculating test perplexity...")
test_perplexity = evaluate_and_save_perplexity_rnn(model_rnn_1, test_indices1, 'corpus1', 'test', device)

# Print results
print("\nFinal Results for RNN (Corpus 1):")
print(f"Training Perplexity: {train_perplexity:.2f}")
print(f"Test Perplexity: {test_perplexity:.2f}")

# Store results in DataFrame
new_row = pd.DataFrame([{'Corpus': "Pride and Prejudice", 'train_perplexity': train_perplexity, 'test_perplexity': test_perplexity}])
perplexity_rnn_df = pd.concat([perplexity_rnn_df, new_row], ignore_index=True)


#### Perplexity for Corpus - Ulysses

In [None]:
# Initialize model
model_rnn_2 = VanillaRNN(vocab_size=len(train_vocab2), embedding_dim=EMBEDDING_DIM, hidden_dim=HIDDEN_DIM)

# Load model weights
state_dict = torch.load("Models/2021102040_rnn_corpus2.pt")
model_rnn_2.load_state_dict(state_dict)
model_rnn_2.to(device)
model_rnn_2.eval()

# Compute perplexity
print("\nCalculating training perplexity...")
train_perplexity = evaluate_and_save_perplexity_rnn(model_rnn_2, train_indices1, 'corpus2', 'train', device)

print("Calculating test perplexity...")
test_perplexity = evaluate_and_save_perplexity_rnn(model_rnn_2, test_indices1, 'corpus2', 'test', device)

# Print results
print("\nFinal Results for RNN (Corpus 2):")
print(f"Training Perplexity: {train_perplexity:.2f}")
print(f"Test Perplexity: {test_perplexity:.2f}")

# Store results in DataFrame
new_row = pd.DataFrame([{'Corpus': "Ulysses", 'train_perplexity': train_perplexity, 'test_perplexity': test_perplexity}])
perplexity_rnn_df = pd.concat([perplexity_rnn_df, new_row], ignore_index=True)


## 2.4 Next word predction for RNN

In [26]:

def sentence_to_tensor(sentence, word_to_idx, context_size, device='cpu'):
    """
    Convert a sentence into a tensor format suitable for the RNN model.
    
    Args:
        sentence (str): Input sentence.
        word_to_idx (dict): Mapping from words to indices.
        context_size (int): Number of words used as context.
        device (str): Device to place the tensor ('cpu' or 'cuda').

    Returns:
        torch.Tensor: Context tensor of shape (1, context_size).
    """
    sentence = sentence.lower()
    words = re.sub(r"[^a-zA-Z\s]", "", sentence).split()
    
    if len(words) < context_size:
        raise ValueError(f"Input sentence must have at least {context_size} words")
    
    # Extract the last `context_size` words
    context_words = words[-context_size:]
    
    # Convert words to indices
    context_indices = [word_to_idx.get(word, word_to_idx["<UNK>"]) for word in context_words]
    
    # Convert to tensor (1, context_size)
    context_tensor = torch.tensor([context_indices], dtype=torch.long).to(device)
    
    return context_tensor


In [54]:
# Example input sentence
custom_sentence = "I Love cat"

# Convert sentence to tensor
context_tensor = sentence_to_tensor(custom_sentence, word_to_idx=train_vocab2, context_size=3, device='cpu')

# Predict next word using your RNN model
predictions = model_rnn_2.predict_next_word(context_tensor, idx_to_word2, top_k=5)

# Print predictions
print_predictions(custom_sentence, predictions)



Input sentence: I Love cat
Top 5 predicted next words:
him: 0.0756
a: 0.0637
in: 0.0458
with: 0.0348
to: 0.0307


### Testing on saved models

In [31]:
model_path = "Models/2021102040_rnn_corpus1.pth"

# Load checkpoint
checkpoint = torch.load(model_path)

# Extract saved components
word_to_idx = checkpoint["word_to_idx"]
idx_to_word = checkpoint["idx_to_word"]
hyperparams = checkpoint["hyperparams"]
embedding_dim = hyperparams["embedding_dim"]
hidden_dim = hyperparams["hidden_dim"]
vocab_size = hyperparams["vocab_size"]
model_rnn_1 = VanillaRNN(vocab_size, embedding_dim, hidden_dim)

model_rnn_1.load_state_dict(checkpoint["model_state_dict"])

model_rnn_1.eval()

print("✅ Model successfully loaded!")


✅ Model successfully loaded!


In [29]:
# Example input sentence
custom_sentence = "I Love cat since"

# Convert sentence to tensor
context_tensor = sentence_to_tensor(custom_sentence, word_to_idx=word_to_idx, context_size=4, device='cpu')

# Predict next word using your RNN model
predictions = model_rnn_1.predict_next_word(context_tensor, idx_to_word, top_k=5)

# Print predictions
print_predictions(custom_sentence, predictions)



Input sentence: I Love cat since
Top 5 predicted next words:
we: 0.1124
the: 0.0970
her: 0.0689
having: 0.0626
any: 0.0607


In [40]:
model_path = "Models/2021102040_rnn_corpus2.pth"

# Load checkpoint
checkpoint = torch.load(model_path)

# Extract saved components
word_to_idx = checkpoint["word_to_idx"]
idx_to_word = checkpoint["idx_to_word"]
hyperparams = checkpoint["hyperparams"]
embedding_dim = hyperparams["embedding_dim"]
hidden_dim = hyperparams["hidden_dim"]
vocab_size = hyperparams["vocab_size"]
model_rnn_2 = VanillaRNN(vocab_size, embedding_dim, hidden_dim)

model_rnn_2.load_state_dict(checkpoint["model_state_dict"])

model_rnn_2.eval()

print("✅ Model successfully loaded!")


✅ Model successfully loaded!


In [41]:
# Example input sentence
custom_sentence = "I Love cat since"

# Convert sentence to tensor
context_tensor = sentence_to_tensor(custom_sentence, word_to_idx=word_to_idx, context_size=4, device='cpu')

# Predict next word using your RNN model
predictions = model_rnn_2.predict_next_word(context_tensor, idx_to_word, top_k=5)

# Print predictions
print_predictions(custom_sentence, predictions)



Input sentence: I Love cat since
Top 5 predicted next words:
your: 0.0855
the: 0.0751
jacquard: 0.0445
it: 0.0272
a: 0.0256
