In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from collections import Counter
import re
from sklearn.model_selection import train_test_split

# Seed PyTorch's global RNG for reproducibility, then pick the compute device:
# CUDA when torch reports it as available, otherwise the CPU.
torch.manual_seed(42)
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Custom IMDB Dataset class
class IMDBDataset(Dataset):
    """Map-style dataset that encodes whitespace-tokenised reviews as fixed-length index tensors.

    Args:
        reviews: Sequence of review strings; each one is split on whitespace.
        labels: Sequence of numeric labels aligned with ``reviews``.
        word2idx: Token-to-index mapping. Must contain the ``'<UNK>'`` and
            ``'<PAD>'`` keys, which are looked up on every item access.
        max_len: Length every encoded review is truncated or right-padded to.
    """

    def __init__(self, reviews, labels, word2idx, max_len=500):
        self.reviews = reviews
        self.labels = labels
        self.word2idx = word2idx
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return ``(indices, label)`` for the review at position ``idx``.

        ``indices`` is a ``torch.long`` tensor of exactly ``max_len`` entries:
        unknown tokens map to ``'<UNK>'`` and short reviews are right-padded
        with ``'<PAD>'``. ``label`` is a scalar ``torch.float`` tensor.

        Raises:
            KeyError: If ``word2idx`` lacks ``'<UNK>'`` or ``'<PAD>'``.
        """
        unk_idx = self.word2idx['<UNK>']
        pad_idx = self.word2idx['<PAD>']
        # Truncate before the vocabulary lookup so tokens beyond max_len are
        # never mapped only to be thrown away.
        tokens = self.reviews[idx].split()[:self.max_len]
        indices = [self.word2idx.get(token, unk_idx) for token in tokens]
        # Right-pad; multiplies by zero (no-op) when the review already fills max_len.
        indices.extend([pad_idx] * (self.max_len - len(indices)))
        return (
            torch.tensor(indices, dtype=torch.long),
            torch.tensor(self.labels[idx], dtype=torch.float),
        )

# Text preprocessing function
def preprocess_text(text):
    """Normalise raw review text for whitespace tokenisation.

    Strips HTML tags and punctuation, lower-cases the text, and trims leading
    and trailing whitespace.

    Args:
        text: Raw review string.

    Returns:
        The cleaned string. Runs of internal whitespace are not collapsed;
        consumers are expected to tokenise with ``str.split()``.
    """
    # Replace each tag with a space rather than nothing: markup such as
    # "end.<br /><br />Next" has no surrounding whitespace, so deleting the
    # tags outright would fuse the neighbouring words into one token.
    text = re.sub(r'<[^>]+>', ' ', text)
    # Drop punctuation (anything that is neither a word character nor whitespace).
    text = re.sub(r'[^\w\s]', '', text)
    return text.lower().strip()

# Build vocabulary
def build_vocab(reviews, min_freq=5):
    """Create a token-to-index mapping from whitespace-split reviews.

    Indices 0 and 1 are reserved for ``'<PAD>'`` and ``'<UNK>'``. Every token
    seen at least ``min_freq`` times then receives the next free index, in
    order of first appearance.

    Args:
        reviews: Iterable of review strings.
        min_freq: Minimum occurrence count for a token to be kept.

    Returns:
        Dict mapping token strings to integer indices.
    """
    frequencies = Counter(token for text in reviews for token in text.split())
    # Include special tokens
    vocab = {'<PAD>': 0, '<UNK>': 1}
    frequent_tokens = (
        token for token, seen in frequencies.items() if seen >= min_freq
    )
    for position, token in enumerate(frequent_tokens, start=len(vocab)):
        vocab[token] = position
    return vocab

# Load and preprocess IMDB dataset
def load_imdb_data(data_dir='/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'):
    """Read the IMDB CSV and return cleaned reviews with binary labels.

    Args:
        data_dir: Path to the CSV file (despite the name, this is passed
            straight to ``pd.read_csv``). The file must have ``review`` and
            ``sentiment`` columns.

    Returns:
        ``(reviews, labels)``: a list of strings run through
        ``preprocess_text`` and a list where ``'positive'`` is 1 and
        ``'negative'`` is 0.
    """
    frame = pd.read_csv(data_dir)
    sentiment_to_label = {'positive': 1, 'negative': 0}
    cleaned_reviews = [preprocess_text(raw) for raw in frame['review']]
    targets = frame['sentiment'].map(sentiment_to_label).tolist()
    return cleaned_reviews, targets

# # GRU Model
# class GRUModel(nn.Module):
#     def __init__(self, vocab_size, embed_size, hidden_size, output_size, num_layers=1, dropout=0.3):
#         super(GRUModel, self).__init__()
#         self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
#         self.gru = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
#         self.fc = nn.Linear(hidden_size, output_size)
#         self.sigmoid = nn.Sigmoid()

#     def forward(self, x):
#         embedded = self.embedding(x)  # [batch_size, seq_len, embed_size]
#         gru_out, _ = self.gru(embedded)  # [batch_size, seq_len, hidden_size]
#         last_out = gru_out[:, -1, :]  # Take the last time step
#         out = self.fc(last_out)
#         out = self.sigmoid(out)
#         return out

# GRU Model
class GRUModel(nn.Module):
    """Embedding -> (stacked) GRU -> linear -> sigmoid sequence classifier.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension.
        hidden_size: GRU hidden-state dimension.
        output_size: Number of outputs of the final linear layer.
        num_layers: Number of stacked GRU layers.
        dropout: Dropout between GRU layers; only applied when
            ``num_layers > 1`` (0 is passed to ``nn.GRU`` otherwise).
    """

    def __init__(self, vocab_size, embed_size, hidden_size, output_size, num_layers=1, dropout=0.3):
        super().__init__()
        # Index 0 is the padding token; its embedding stays zero.
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.gru = nn.GRU(embed_size, hidden_size, num_layers, batch_first=True, dropout=dropout if num_layers > 1 else 0)
        self.fc = nn.Linear(hidden_size, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid scores of shape ``[batch_size, output_size]``.

        Args:
            x: Integer token indices of shape ``[batch_size, seq_len]``, where
                padding positions hold the embedding's ``padding_idx``.
        """
        embedded = self.embedding(x)  # [batch_size, seq_len, embed_size]
        gru_out, _ = self.gru(embedded)  # [batch_size, seq_len, hidden_size]

        # Read the GRU output at the last *real* token of each sequence.
        # Taking gru_out[:, -1] would instead return the state after the GRU
        # has stepped through every trailing pad token, making the prediction
        # depend on how much padding a review happened to receive.
        positions = torch.arange(x.size(1), device=x.device)
        is_token = x != self.embedding.padding_idx
        # Largest position holding a non-pad token; 0 for an all-padding row.
        last_idx = (is_token * positions).max(dim=1).values
        batch_idx = torch.arange(x.size(0), device=x.device)
        last_out = gru_out[batch_idx, last_idx]  # [batch_size, hidden_size]

        out = self.fc(last_out)
        out = self.sigmoid(out)
        return out

# Training function
def _run_epoch(model, loader, criterion, run_device, optimizer=None):
    """Run one full pass over ``loader`` and return ``(mean_batch_loss, accuracy)``.

    When ``optimizer`` is given the model is put in training mode and updated
    after every batch; otherwise the model is put in eval mode and gradients
    are disabled.
    """
    training = optimizer is not None
    model.train(training)
    loss_sum = 0.0
    correct = 0
    seen = 0
    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs, labels = inputs.to(run_device), labels.to(run_device)
            if training:
                optimizer.zero_grad()
            # squeeze(-1) removes only the trailing output dimension. A bare
            # squeeze() would also drop the batch dimension for a batch of
            # one, and BCELoss rejects an input/target shape mismatch.
            outputs = model(inputs).squeeze(-1)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()
            loss_sum += loss.item()
            predicted = (outputs >= 0.5).float()
            seen += labels.size(0)
            correct += (predicted == labels).sum().item()
    # Loss is averaged over batches (not samples); accuracy over samples.
    return loss_sum / len(loader), correct / seen


def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs=30):
    """Train ``model`` and evaluate it on the validation set after every epoch.

    Args:
        model: Module whose output has a trailing dimension of size 1 holding
            probabilities (predictions are thresholded at 0.5).
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer over the model's parameters.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        ``(train_losses, val_losses, train_accs, val_accs)``: four lists with
        one entry per epoch.

    Raises:
        ZeroDivisionError: If either loader yields no batches.
    """
    # Move batches to wherever the model lives instead of relying on a
    # module-level global; fall back to the CPU for a parameterless model.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    train_losses, val_losses = [], []
    train_accs, val_accs = [], []

    for epoch in range(num_epochs):
        train_loss, train_acc = _run_epoch(model, train_loader, criterion, run_device, optimizer)
        train_losses.append(train_loss)
        train_accs.append(train_acc)

        val_loss, val_acc = _run_epoch(model, val_loader, criterion, run_device)
        val_losses.append(val_loss)
        val_accs.append(val_acc)

        print(f"Epoch {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Train Acc: {train_acc:.4f}, Val Loss: {val_loss:.4f}, Val Acc: {val_acc:.4f}")

    return train_losses, val_losses, train_accs, val_accs

# Plotting function
def plot_metrics(train_losses, val_losses, train_accs, val_accs):
    """Draw, save, and show the per-epoch loss and accuracy curves.

    Two figures are produced in order: loss (saved to ``loss_plot.png``) and
    accuracy (saved to ``accuracy_plot.png``). The epoch axis starts at 1 and
    its length is taken from ``train_losses``.
    """
    epochs = range(1, len(train_losses) + 1)

    def draw(train_values, val_values, train_label, val_label, title, y_label, out_file):
        # One figure: training curve in blue, validation curve in red.
        plt.figure(figsize=(10, 5))
        plt.plot(epochs, train_values, 'b-', label=train_label)
        plt.plot(epochs, val_values, 'r-', label=val_label)
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()
        plt.grid(True)
        plt.savefig(out_file)
        plt.show()
        plt.close()

    # Plot loss
    draw(
        train_losses, val_losses,
        'Training Loss', 'Validation Loss',
        'Training and Validation Loss', 'Loss', 'loss_plot.png',
    )

    # Plot accuracy
    draw(
        train_accs, val_accs,
        'Training Accuracy', 'Validation Accuracy',
        'Training and Validation Accuracy', 'Accuracy', 'accuracy_plot.png',
    )

# Main execution
def main():
    """Run the full pipeline: load data, train the GRU classifier, plot, and test.

    The data is split 50/50 into train+validation and test, and the first half
    is split again 80/20 into train and validation. Per-epoch metrics are
    printed and plotted, and the final test accuracy is printed.
    """
    # Load data
    reviews, labels = load_imdb_data()

    # Split data
    train_reviews, test_reviews, train_labels, test_labels = train_test_split(
        reviews, labels, test_size=0.5, random_state=42
    )
    train_reviews, val_reviews, train_labels, val_labels = train_test_split(
        train_reviews, train_labels, test_size=0.2, random_state=42
    )

    # Build the vocabulary from the training reviews only. Building it before
    # the split would let validation/test text decide which words are
    # in-vocabulary, leaking held-out information into the model's inputs.
    word2idx = build_vocab(train_reviews)
    print(f"Vocabulary size: {len(word2idx)}")

    # Create datasets
    train_dataset = IMDBDataset(train_reviews, train_labels, word2idx)
    val_dataset = IMDBDataset(val_reviews, val_labels, word2idx)
    test_dataset = IMDBDataset(test_reviews, test_labels, word2idx)

    # Create data loaders (only the training data is shuffled)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64)
    test_loader = DataLoader(test_dataset, batch_size=64)

    # Model parameters
    vocab_size = len(word2idx)
    embed_size = 128
    hidden_size = 256
    output_size = 1
    num_layers = 2

    # Initialize model, criterion, optimizer
    model = GRUModel(vocab_size, embed_size, hidden_size, output_size, num_layers).to(device)
    # The model already applies a sigmoid, so plain BCELoss is the matching loss.
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # Train model
    train_losses, val_losses, train_accs, val_accs = train_model(
        model, train_loader, val_loader, criterion, optimizer
    )

    # Plot metrics
    plot_metrics(train_losses, val_losses, train_accs, val_accs)

    # Evaluate on test set
    model.eval()
    test_correct = 0
    test_total = 0
    with torch.no_grad():
        for inputs, batch_labels in test_loader:
            inputs, batch_labels = inputs.to(device), batch_labels.to(device)
            # squeeze(-1) keeps the batch dimension even for a final batch of one.
            outputs = model(inputs).squeeze(-1)
            predicted = (outputs >= 0.5).float()
            test_total += batch_labels.size(0)
            test_correct += (predicted == batch_labels).sum().item()

    test_acc = test_correct / test_total
    print(f"Test Accuracy: {test_acc:.4f}")

if __name__ == "__main__":
    main()

In [None]:
!python --version
