In [1]:
!pip install pandas
!pip install matplotlib
!pip install portalocker
!pip install torch==2.2.0 torchvision==0.17 torchtext==0.17.0


Collecting portalocker
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker
Successfully installed portalocker-3.1.1
Collecting torch==2.2.0
  Downloading torch-2.2.0-cp311-cp311-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchvision==0.17
  Downloading torchvision-0.17.0-cp311-cp311-manylinux1_x86_64.whl.metadata (6.6 kB)
Collecting torchtext==0.17.0
  Downloading torchtext-0.17.0-cp311-cp311-manylinux1_x86_64.whl.metadata (7.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-man

In [2]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
import numpy as np
import time
import pandas as pd
import io
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset

torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")


class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.pool(self.relu(self.conv1(x)))  # 28x28 -> 14x14
        x = self.pool(self.relu(self.conv2(x)))  # 14x14 -> 7x7
        x = x.view(-1, 64 * 7 * 7)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x


class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=True, 
                           dropout=dropout if n_layers > 1 else 0,
                           batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.dropout(self.embedding(text))
        output, (hidden, cell) = self.lstm(embedded)
        hidden = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        return self.fc(hidden)

# Helper functions for IMDB dataset
def yield_tokens(data_iter, tokenizer):
    for _, text in data_iter:
        yield tokenizer(text)

def collate_batch(batch):
    text_list, label_list = [], []
    for (_label, _text) in batch:
        # Ensure label is either 0 or 1 (map 'pos' to 1, 'neg' to 0)
        if isinstance(_label, str):
            label = 1 if _label == 'pos' else 0
        else:
            # Make sure it's within valid range (0 or 1)
            label = int(_label) % 2  
        
        label_list.append(label)
        processed_text = torch.tensor(text_vocab(tokenizer(_text)), dtype=torch.int64)
        text_list.append(processed_text)

    text_list = pad_sequence(text_list, batch_first=True, padding_value=pad_idx)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    return text_list, label_list

# Function to train models
def train_model(model, optimizer_name, train_loader, val_loader, criterion, epochs, model_type):
    if optimizer_name == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=0.001)
    elif optimizer_name == 'RMSprop':
        optimizer = optim.RMSprop(model.parameters(), lr=0.001)
    elif optimizer_name == 'SGD':
        optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    elif optimizer_name == 'Adagrad':
        optimizer = optim.Adagrad(model.parameters(), lr=0.01)
    
    model = model.to(device)
    criterion = criterion.to(device)
    
    train_losses = []
    val_accuracies = []
    
    start_time = time.time()
    
    for epoch in range(epochs):
        # Training phase
        model.train()
        running_loss = 0.0
        for i, data in enumerate(train_loader):
            if model_type == 'CNN':
                inputs, labels = data
            else:  # RNN
                inputs, labels = data
            
            inputs, labels = inputs.to(device), labels.to(device)
            
            # Debugging: Check label range
            if model_type == 'LSTM':
                if torch.any(labels < 0) or torch.any(labels >= 2):  # For binary classification
                    print(f"Warning: Labels out of range found: {labels}")
                    # Fix labels to be within range
                    labels = torch.clamp(labels, 0, 1)
            
            optimizer.zero_grad()
            outputs = model(inputs)
            
            if model_type == 'CNN':
                loss = criterion(outputs, labels)
            else:  # RNN
                loss = criterion(outputs, labels)
                
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item()
            
            # Print progress every 100 batches
            if (i + 1) % 100 == 0:
                print(f'Epoch [{epoch+1}/{epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')
        
        epoch_loss = running_loss / len(train_loader)
        train_losses.append(epoch_loss)
        
        # Validation phase
        model.eval()
        correct = 0
        total = 0
        with torch.no_grad():
            for data in val_loader:
                if model_type == 'CNN':
                    inputs, labels = data
                else:  # RNN
                    inputs, labels = data
                    # Ensure labels are in valid range
                    if torch.any(labels < 0) or torch.any(labels >= 2):  # For binary classification
                        labels = torch.clamp(labels, 0, 1)
                
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        
        accuracy = 100 * correct / total
        val_accuracies.append(accuracy)
        
        print(f'Epoch {epoch+1}/{epochs}, Loss: {epoch_loss:.4f}, Validation Accuracy: {accuracy:.2f}%')
    
    training_time = time.time() - start_time
    print(f'Training complete in {training_time:.2f} seconds')
    
    # Compute final metrics
    model.eval()
    correct = 0
    total = 0
    true_positives = 0
    false_positives = 0
    false_negatives = 0
    
    # For multi-class (MNIST), we'll compute precision and recall for each class
    if model_type == 'CNN':
        num_classes = 10
        confusion_matrix = np.zeros((num_classes, num_classes), dtype=int)
    else:  # For binary classification (IMDB)
        num_classes = 2
        confusion_matrix = np.zeros((num_classes, num_classes), dtype=int)
    
    with torch.no_grad():
        for data in val_loader:
            if model_type == 'CNN':
                images, labels = data
                images, labels = images.to(device), labels.to(device)
                outputs = model(images)
            else:  # RNN
                texts, labels = data
                # Ensure labels are in valid range
                if torch.any(labels < 0) or torch.any(labels >= 2):
                    labels = torch.clamp(labels, 0, 1)
                texts, labels = texts.to(device), labels.to(device)
                outputs = model(texts)
            
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            
            # Update confusion matrix
            for t, p in zip(labels.view(-1), predicted.view(-1)):
                confusion_matrix[t.long(), p.long()] += 1
    
    accuracy = 100 * correct / total
    
    # Calculate precision, recall, and F1 score
    precisions = []
    recalls = []
    f1_scores = []
    
    for i in range(num_classes):
        true_pos = confusion_matrix[i, i]
        false_pos = confusion_matrix[:, i].sum() - true_pos
        false_neg = confusion_matrix[i, :].sum() - true_pos
        
        if true_pos + false_pos == 0:
            precision = 0
        else:
            precision = true_pos / (true_pos + false_pos)
        
        if true_pos + false_neg == 0:
            recall = 0
        else:
            recall = true_pos / (true_pos + false_neg)
        
        if precision + recall == 0:
            f1 = 0
        else:
            f1 = 2 * precision * recall / (precision + recall)
        
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
    
    # For RNN (binary classification), report macro-averaged metrics
    if model_type == 'LSTM':
        precision = sum(precisions) / len(precisions)
        recall = sum(recalls) / len(recalls)
        f1 = sum(f1_scores) / len(f1_scores)
    else:  # For CNN, report weighted metrics
        total_samples_per_class = confusion_matrix.sum(axis=1)
        precision = sum(p * s for p, s in zip(precisions, total_samples_per_class)) / total_samples_per_class.sum()
        recall = sum(r * s for r, s in zip(recalls, total_samples_per_class)) / total_samples_per_class.sum()
        f1 = sum(f * s for f, s in zip(f1_scores, total_samples_per_class)) / total_samples_per_class.sum()
    
    metrics = {
        'optimizer': optimizer_name,
        'model_type': model_type,
        'accuracy': accuracy,
        'precision': precision * 100,
        'recall': recall * 100,
        'f1_score': f1 * 100,
        'training_time': training_time,
        'train_losses': train_losses,
        'val_accuracies': val_accuracies
    }
    
    return metrics

# Function to plot results
def plot_results(metrics_list, title_prefix):
    # Plot training loss
    plt.figure(figsize=(14, 6))
    plt.subplot(1, 2, 1)
    for metrics in metrics_list:
        plt.plot(metrics['train_losses'], label=f"{metrics['optimizer']}")
    plt.title(f'{title_prefix} - Training Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    
    # Plot validation accuracy
    plt.subplot(1, 2, 2)
    for metrics in metrics_list:
        plt.plot(metrics['val_accuracies'], label=f"{metrics['optimizer']}")
    plt.title(f'{title_prefix} - Validation Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy (%)')
    plt.legend()
    plt.tight_layout()
    
    # Save the plot
    plt.savefig(f"{title_prefix.lower().replace(' ', '_')}_plots.png")
    plt.show()
    plt.close()

# Function to display metrics comparison
def display_metrics_comparison(metrics_list, model_type):
    df = pd.DataFrame([
        {
            'Optimizer': m['optimizer'],
            'Accuracy (%)': f"{m['accuracy']:.2f}",
            'Precision (%)': f"{m['precision']:.2f}",
            'Recall (%)': f"{m['recall']:.2f}",
            'F1 Score (%)': f"{m['f1_score']:.2f}",
            'Training Time (s)': f"{m['training_time']:.2f}"
        }
        for m in metrics_list
    ])
    
    print(f"\nMetrics Comparison for {model_type}:")
    print(df.to_string(index=False))
    return df

# Main execution
def main():
    # MNIST dataset
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    
    # Load MNIST dataset
    train_dataset = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform)
    test_dataset = torchvision.datasets.MNIST(root='./data', train=False, download=True, transform=transform)
    
    # Split training set into training and validation
    train_size = int(0.8 * len(train_dataset))
    val_size = len(train_dataset) - train_size
    train_mnist, val_mnist = torch.utils.data.random_split(train_dataset, [train_size, val_size])
    
    train_loader_mnist = DataLoader(train_mnist, batch_size=64, shuffle=True)
    val_loader_mnist = DataLoader(val_mnist, batch_size=64, shuffle=False)
    test_loader_mnist = DataLoader(test_dataset, batch_size=64, shuffle=False)
    
    # Load IMDB dataset
    global tokenizer, text_vocab, pad_idx
    tokenizer = get_tokenizer('basic_english')
    
    # Load IMDB dataset
    train_iter = IMDB(split='train')
    test_iter = IMDB(split='test')
    
    # Create vocabulary
    text_vocab = build_vocab_from_iterator(
        yield_tokens(train_iter, tokenizer),
        min_freq=5,
        specials=['<unk>', '<pad>']
    )
    text_vocab.set_default_index(text_vocab['<unk>'])
    
    # Define padding index
    pad_idx = text_vocab['<pad>']
    
    # Reset iterators
    train_iter, test_iter = IMDB(split='train'), IMDB(split='test')
    
    # Convert to list for easier splitting
    train_data = list(train_iter)
    test_data = list(test_iter)
    
    # Print IMDB dataset sample to debug
    print("IMDB sample entries:")
    for i in range(min(5, len(train_data))):
        print(f"Entry {i}: Label={train_data[i][0]}, Text preview: {train_data[i][1][:50]}...")
    
    # Split train data into train and validation
    train_size = int(0.8 * len(train_data))
    val_size = len(train_data) - train_size
    train_imdb, val_imdb = torch.utils.data.random_split(train_data, [train_size, val_size])
    
    # Create data loaders
    train_loader_imdb = DataLoader(train_imdb, batch_size=32, shuffle=True, collate_fn=collate_batch)
    val_loader_imdb = DataLoader(val_imdb, batch_size=32, shuffle=False, collate_fn=collate_batch)
    test_loader_imdb = DataLoader(test_data, batch_size=32, shuffle=False, collate_fn=collate_batch)
    
    # Check first batch of IMDB data to ensure labels are correct
    print("Checking first IMDB batch:")
    for texts, labels in train_loader_imdb:
        print(f"Text batch shape: {texts.shape}")
        print(f"Label batch: {labels}")
        print(f"Label min: {labels.min()}, Label max: {labels.max()}")
        break
    
    # Define CNN model hyperparameters
    cnn_epochs = 5
    
    # Define RNN model hyperparameters
    vocab_size = len(text_vocab)
    embedding_dim = 100
    hidden_dim = 256
    output_dim = 2  # Binary classification for sentiment
    n_layers = 2
    dropout = 0.5
    rnn_epochs = 3
    
    # Optimizers to test
    optimizers = ['Adam', 'RMSprop', 'SGD', 'Adagrad']
    
    # Train CNN models with different optimizers
    cnn_metrics = []
    for opt in optimizers:
        print(f"\n=== Training CNN with {opt} optimizer ===")
        model = CNN().to(device)
        criterion = nn.CrossEntropyLoss()
        metrics = train_model(model, opt, train_loader_mnist, val_loader_mnist, criterion, cnn_epochs, 'CNN')
        cnn_metrics.append(metrics)
    
    # Train RNN models with different optimizers
    rnn_metrics = []
    for opt in optimizers:
        print(f"\n=== Training LSTM with {opt} optimizer ===")
        model = LSTM(vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout).to(device)
        criterion = nn.CrossEntropyLoss()
        metrics = train_model(model, opt, train_loader_imdb, val_loader_imdb, criterion, rnn_epochs, 'LSTM')
        rnn_metrics.append(metrics)
    
    # Plot results
    plot_results(cnn_metrics, "CNN on MNIST")
    plot_results(rnn_metrics, "LSTM on IMDB")
    
    # Display metrics comparison
    cnn_df = display_metrics_comparison(cnn_metrics, "CNN on MNIST")
    rnn_df = display_metrics_comparison(rnn_metrics, "LSTM on IMDB")
    
    return cnn_df, rnn_df

if __name__ == "__main__":
    main()

Using device: cuda
IMDB sample entries:
Entry 0: Label=1, Text preview: I rented I AM CURIOUS-YELLOW from my video store b...
Entry 1: Label=1, Text preview: "I Am Curious: Yellow" is a risible and pretentiou...
Entry 2: Label=1, Text preview: If only to avoid making this type of film in the f...
Entry 3: Label=1, Text preview: This film was probably inspired by Godard's Mascul...
Entry 4: Label=1, Text preview: Oh, brother...after hearing about this ridiculous ...
Checking first IMDB batch:
Text batch shape: torch.Size([32, 930])
Label batch: tensor([0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,
        0, 0, 1, 0, 0, 1, 0, 0])
Label min: 0, Label max: 1

=== Training CNN with Adam optimizer ===
Epoch [1/5], Step [100/750], Loss: 0.5253
Epoch [1/5], Step [200/750], Loss: 0.3182
Epoch [1/5], Step [300/750], Loss: 0.1993
Epoch [1/5], Step [400/750], Loss: 0.1320
Epoch [1/5], Step [500/750], Loss: 0.1762
Epoch [1/5], Step [600/750], Loss: 0.0994
Epoch [1/5], Ste