# In [None]:
# NOTE: This code was used for the experiments as-is. It was run in Environment 1 (Kaggle), as described in the paper.
# Naming and structure may not follow programming best practices.
# The focus is on reproducibility.
# This code was developed for internal experimentation and contains hardcoded values for various test cases.
# It was not refactored for modularity, but the logic matches the experiments reported in the paper.

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import numpy as np
from tqdm import tqdm
import random
import re
import os
import urllib.request
import tarfile
from collections import Counter

def set_all_seeds(seed):
    """Seed every random number generator used by this script.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), and sets the cuDNN flags
    ``deterministic=True`` / ``benchmark=False``.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Pure-Python and NumPy generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators (CPU and every CUDA device).
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN flags: request deterministic kernels and turn the autotuner off.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Custom activation function; the variant is selected by name (default: ESwish)
class CustomActivation(nn.Module):
    """Element-wise activation selectable by name.

    The activation used to be switched by (un)commenting a ``return`` line in
    ``forward``. The same formulas are now kept in a lookup table so an
    experiment can pick one without editing the source.

    Args:
        name: Case-insensitive key of the activation to apply. One of
            ``relu``, ``eswish``, ``pmish``, ``mish``, ``gelu``, ``swish``,
            ``swishplus``, ``mishplus``, ``geluplus``. Defaults to
            ``"eswish"`` (``1.25 * x * sigmoid(x)``), the variant that was
            active in this file, so ``CustomActivation()`` is unchanged.

    Raises:
        ValueError: If ``name`` is not one of the keys listed above.
    """

    # Kept on the class so that an instance only stores the selected key.
    # The constants are the hardcoded values used in the experiments.
    _VARIANTS = {
        'relu': lambda x: F.relu(x),
        'eswish': lambda x: 1.25*x * torch.sigmoid(x),
        'pmish': lambda x: x * torch.tanh(F.softplus(0.5921*x)/0.5921),
        'mish': lambda x: x * torch.tanh(F.softplus(x)),
        'gelu': lambda x: F.gelu(x, approximate='tanh'),
        'swish': lambda x: x*torch.sigmoid(x),
        'swishplus': lambda x: x * (torch.sigmoid(x) + 0.125 * torch.exp(-0.25 * x**2)),
        'mishplus': lambda x: x * torch.tanh(F.softplus(x)) + 0.025 * x * torch.exp(-0.25 * x**2),
        'geluplus': lambda x: F.gelu(x, approximate='tanh') + 0.125 * x * torch.exp(-0.25 * x**2),
    }

    def __init__(self, name='eswish'):
        super().__init__()
        key = name.lower()
        if key not in self._VARIANTS:
            known = ', '.join(sorted(self._VARIANTS))
            raise ValueError(f"Unknown activation {name!r}; expected one of: {known}")
        self.kind = key

    def forward(self, x):
        """Apply the selected activation element-wise to ``x``."""
        return self._VARIANTS[self.kind](x)
        
# Simple tokenizer function
def simple_tokenize(text):
    """Lowercase ``text``, strip HTML tags and punctuation, split on whitespace.

    Only ASCII letters, digits and whitespace survive the cleaning step.

    NOTE: tags are deleted without inserting a space, so two words separated
    only by a tag (e.g. ``good<br />film``) come out as one fused token
    (``goodfilm``). This is left unchanged so the tokenization stays exactly
    as it was.

    Args:
        text: Raw review text.

    Returns:
        list[str]: The tokens, in order of appearance.
    """
    # Drop anything that looks like an HTML tag.
    without_tags = re.sub(r'<[^>]+>', '', text.lower())
    # Drop every character that is not alphanumeric or whitespace.
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', without_tags)
    return cleaned.split()

# IMDB Dataset preprocessing
class IMDBDataset(Dataset):
    """Dataset that turns raw review strings into fixed-length index tensors.

    Args:
        texts: Sequence of raw review strings.
        labels: Sequence of labels aligned with ``texts``.
        vocab: Mapping from token to integer index; must contain ``'<unk>'``
            and ``'<pad>'``.
        max_length: Number of token indices every sample is cut or padded to.
    """

    def __init__(self, texts, labels, vocab, max_length=256):
        self.max_length = max_length
        self.vocab = vocab
        self.labels = labels
        self.texts = texts

    def __len__(self):
        """Return the number of reviews."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``(indices, label)`` for the review at position ``idx``.

        ``indices`` is a ``torch.long`` tensor of length ``max_length`` and
        ``label`` is a ``torch.float`` scalar tensor.
        """
        # Tokenization happens on every access; nothing is cached.
        words = simple_tokenize(self.texts[idx])
        token_ids = [self.vocab.get(word, self.vocab['<unk>']) for word in words]

        # Positive overflow: too long, cut the tail.
        # Otherwise: pad at the end up to max_length.
        overflow = len(token_ids) - self.max_length
        if overflow > 0:
            del token_ids[self.max_length:]
        else:
            token_ids += [self.vocab['<pad>']] * (-overflow)

        sample = torch.tensor(token_ids, dtype=torch.long)
        target = torch.tensor(self.labels[idx], dtype=torch.float)
        return sample, target

def build_vocab(texts, min_freq=5, max_vocab=20000):
    """Build a token-to-index vocabulary from an iterable of raw texts.

    Indices 0 and 1 are reserved for ``'<pad>'`` and ``'<unk>'``. The remaining
    entries are taken from the ``max_vocab - 2`` most frequent tokens, keeping
    only those seen at least ``min_freq`` times, and are numbered in order of
    decreasing frequency.

    Args:
        texts: Iterable of raw strings.
        min_freq: Minimum number of occurrences for a token to be kept.
        max_vocab: Upper bound on the vocabulary size, special tokens included.

    Returns:
        dict[str, int]: The vocabulary.
    """
    frequencies = Counter()
    for document in texts:
        frequencies.update(simple_tokenize(document))

    vocab = {'<pad>': 0, '<unk>': 1}
    for token, count in frequencies.most_common(max_vocab - 2):
        if count < min_freq:
            # most_common() is sorted by decreasing count, so every
            # remaining token is below the threshold as well.
            break
        vocab[token] = len(vocab)

    return vocab

# Data loading functions
def download_imdb_data():
    """Download and extract the IMDB dataset unless it is already present.

    If ``aclImdb`` does not exist in the current working directory, the
    archive is downloaded and extracted there. The archive file is removed
    afterwards, whether or not the download and extraction succeeded.

    Returns:
        str: The dataset directory name, ``"aclImdb"``.

    Raises:
        tarfile.TarError: If the archive contains a member that would be
            written outside the extraction directory.
    """
    url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    filename = "aclImdb_v1.tar.gz"
    extract_path = "."
    dataset_dir = "aclImdb"

    if os.path.exists(dataset_dir):
        print("IMDB dataset already exists")
        return dataset_dir

    print("Downloading IMDB dataset...")
    try:
        urllib.request.urlretrieve(url, filename)

        print("Extracting dataset...")
        with tarfile.open(filename, 'r:gz') as tar:
            # The archive arrives over plain HTTP, so treat it as untrusted
            # and refuse members that would escape the extraction directory.
            if hasattr(tarfile, 'data_filter'):
                tar.extractall(extract_path, filter='data')
            else:
                # Older Pythons without extraction filters: best-effort
                # check of every member's resolved destination path.
                root = os.path.realpath(extract_path)
                prefix = os.path.join(root, '')
                for member in tar.getmembers():
                    target = os.path.realpath(os.path.join(root, member.name))
                    if target != root and not target.startswith(prefix):
                        raise tarfile.TarError(
                            f"Unsafe path in archive: {member.name!r}"
                        )
                tar.extractall(extract_path)
    finally:
        # Clean up, including after a failed download or extraction.
        if os.path.exists(filename):
            os.remove(filename)

    return dataset_dir

def read_imdb_files(data_dir, split='train'):
    """Read the IMDB review files of one split together with their labels.

    All ``.txt`` files in ``<data_dir>/<split>/pos`` are read first (label 1),
    followed by those in ``<data_dir>/<split>/neg`` (label 0). Within each
    directory the files are read in sorted filename order: ``os.listdir``
    returns entries in arbitrary order, which would make the sample order,
    and with it any seeded shuffling, depend on the filesystem.

    Args:
        data_dir: Root directory of the extracted dataset.
        split: Sub-directory to read, e.g. ``'train'`` or ``'test'``.

    Returns:
        tuple[list[str], list[int]]: Review texts and labels, index-aligned.
    """
    texts = []
    labels = []

    # Positive reviews first, then negative ones.
    for class_name, class_label in (('pos', 1), ('neg', 0)):
        class_dir = os.path.join(data_dir, split, class_name)
        for filename in sorted(os.listdir(class_dir)):
            if not filename.endswith('.txt'):
                continue
            with open(os.path.join(class_dir, filename), 'r', encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(class_label)

    return texts, labels

def load_imdb_data():
    """Fetch the IMDB dataset, read both splits and build the vocabulary.

    The vocabulary is built from the training texts only.

    Returns:
        tuple: ``(train_texts, train_labels, test_texts, test_labels, vocab)``.
    """
    print("Loading IMDB dataset...")

    # Makes sure the data is on disk and returns its directory.
    dataset_root = download_imdb_data()

    train_split = read_imdb_files(dataset_root, 'train')
    test_split = read_imdb_files(dataset_root, 'test')
    train_texts, train_labels = train_split
    test_texts, test_labels = test_split

    print(f"Loaded {len(train_texts)} training samples")
    print(f"Loaded {len(test_texts)} test samples")

    # Test texts are not used for the vocabulary.
    print("Building vocabulary...")
    vocab = build_vocab(train_texts)
    print(f"Vocabulary size: {len(vocab)}")

    return train_texts, train_labels, test_texts, test_labels, vocab

# LSTM Model with custom activation
class SentimentLSTM(nn.Module):
    """Bidirectional LSTM sentiment classifier with a three-layer MLP head.

    Pipeline: embedding -> bidirectional LSTM -> output at the last time step
    -> two (Linear, BatchNorm, activation, dropout) stages -> Linear -> sigmoid.

    NOTE(review): the feature vector is the LSTM output at the final sequence
    position. ``IMDBDataset`` pads at the end, so for short reviews that
    position is a pad token. Per the ``nn.LSTM`` output layout, the
    backward-direction half at that position has only processed that single
    step. Left unchanged here; confirm this is intended.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_dim: Embedding size.
        hidden_dim: LSTM hidden size per direction.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability for the LSTM and the MLP head.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=128, num_layers=2,
                 dropout=0.3):
        super().__init__()

        # Sub-modules are created in a fixed order: it decides how a seeded
        # RNG is consumed during weight initialization.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers,
            dropout=dropout,
            batch_first=True,
            bidirectional=True,
        )

        # Forward and backward states are concatenated, hence 2 * hidden_dim.
        recurrent_width = hidden_dim * 2
        self.fc1 = nn.Linear(recurrent_width, 64)
        self.bn1 = nn.BatchNorm1d(64)
        self.fc2 = nn.Linear(64, 32)
        self.bn2 = nn.BatchNorm1d(32)
        self.fc3 = nn.Linear(32, 1)

        self.dropout = nn.Dropout(dropout)

        # Shared by both hidden stages of the head.
        self.activation = CustomActivation()

    def forward(self, x):
        """Map a batch of token indices to probabilities of shape (batch, 1)."""
        sequence_states, _ = self.lstm(self.embedding(x))
        features = sequence_states[:, -1, :]

        # Each hidden stage: Linear -> BatchNorm -> activation -> dropout.
        for linear, norm in ((self.fc1, self.bn1), (self.fc2, self.bn2)):
            features = self.dropout(self.activation(norm(linear(features))))

        return torch.sigmoid(self.fc3(features))


def _run_epoch(model, loader, criterion, device, optimizer=None, desc=None):
    """Run one pass over ``loader``; update weights only if ``optimizer`` is given.

    Returns:
        tuple[float, float]: ``(mean of the per-batch losses, accuracy)``.
    """
    is_training = optimizer is not None
    model.train(is_training)

    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    batches = loader if desc is None else tqdm(loader, desc=desc)

    with torch.set_grad_enabled(is_training):
        for data, target in batches:
            data, target = data.to(device), target.to(device)

            if is_training:
                optimizer.zero_grad()

            # Drop only the trailing feature dimension. A bare squeeze()
            # would also drop the batch dimension of a single-sample batch,
            # leaving a 0-d output that no longer matches the (1,) target.
            output = model(data).squeeze(-1)
            loss = criterion(output, target)

            if is_training:
                loss.backward()
                optimizer.step()

            loss_sum += loss.item()
            predicted = (output > 0.5).float()
            n_correct += (predicted == target).sum().item()
            n_seen += target.size(0)

    # The loss is averaged over batches, not samples, so a smaller final
    # batch weighs the same as a full one.
    return loss_sum / len(loader), n_correct / n_seen


# Training function with best metrics tracking and tiebreaker
def train_model(model, train_loader, test_loader, epochs=15, lr=0.001):
    """Train ``model`` with Adam and BCE loss, evaluating after every epoch.

    The model is moved to CUDA when available, otherwise it stays on the CPU.
    After each epoch the metrics are printed and the best epoch so far is
    updated: higher test accuracy wins, and a tie is broken by lower test loss.

    NOTE(review): the "best" epoch is chosen using ``test_loader`` itself.

    Args:
        model: Module whose output is a probability of shape ``(batch, 1)``.
        train_loader: Iterable of ``(data, target)`` training batches.
        test_loader: Iterable of ``(data, target)`` evaluation batches.
        epochs: Number of epochs to run.
        lr: Learning rate for Adam.

    Returns:
        tuple: ``(train_losses, test_losses, train_accs, test_accs,
        best_metrics)``. The first four are per-epoch lists. ``best_metrics``
        is a dict with keys ``epoch``, ``train_loss``, ``train_acc``,
        ``test_loss`` and ``test_acc`` for the best epoch.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)

    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    train_losses, test_losses = [], []
    train_accs, test_accs = [], []

    # Track best metrics with tiebreaker
    best_test_acc = 0.0
    best_test_loss_at_best_acc = float('inf')  # For tiebreaker
    best_metrics = {
        'epoch': 0,
        'train_loss': 0.0,
        'train_acc': 0.0,
        'test_loss': 0.0,
        'test_acc': 0.0
    }

    for epoch in range(epochs):
        avg_train_loss, train_acc = _run_epoch(
            model, train_loader, criterion, device,
            optimizer=optimizer, desc=f'Epoch {epoch+1}/{epochs}'
        )
        avg_test_loss, test_acc = _run_epoch(model, test_loader, criterion, device)

        train_losses.append(avg_train_loss)
        test_losses.append(avg_test_loss)
        train_accs.append(train_acc)
        test_accs.append(test_acc)

        # Better accuracy wins; on equal accuracy the lower test loss wins.
        improved = test_acc > best_test_acc
        tie_broken = test_acc == best_test_acc and avg_test_loss < best_test_loss_at_best_acc
        if improved or tie_broken:
            best_test_acc = test_acc
            best_test_loss_at_best_acc = avg_test_loss
            best_metrics = {
                'epoch': epoch + 1,
                'train_loss': avg_train_loss,
                'train_acc': train_acc,
                'test_loss': avg_test_loss,
                'test_acc': test_acc
            }

        print(f'Epoch {epoch+1}/{epochs}:')
        print(f'Train Loss: {avg_train_loss:.4f}, Train Acc: {train_acc:.4f}')
        print(f'Test Loss: {avg_test_loss:.4f}, Test Acc: {test_acc:.4f}')
        print('-' * 50)

    return train_losses, test_losses, train_accs, test_accs, best_metrics

# Main execution
if __name__ == "__main__":
    # Seed everything first so weight init and batch shuffling are repeatable.
    Rseed = 48
    set_all_seeds(Rseed)
    print(Rseed)

    # Raw texts, labels and the vocabulary built from the training split.
    train_texts, train_labels, test_texts, test_labels, vocab = load_imdb_data()

    # Wrap both splits; only the training loader is shuffled.
    train_dataset = IMDBDataset(train_texts, train_labels, vocab)
    test_dataset = IMDBDataset(test_texts, test_labels, vocab)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

    print(f"\n{'='*60}")
    print("Training with Custom Activation Function")
    print(f"{'='*60}")

    # Hardcoded architecture used for the experiments.
    model_config = {
        'embed_dim': 128,
        'hidden_dim': 128,
        'num_layers': 2,
        'dropout': 0.3,
    }
    model = SentimentLSTM(vocab_size=len(vocab), **model_config)

    history = train_model(model, train_loader, test_loader, epochs=15, lr=0.001)
    train_losses, test_losses, train_accs, test_accs, best_metrics = history

    # Summary: last-epoch accuracy plus the metrics of the best epoch.
    print(f"\n{'='*60}")
    print("TRAINING COMPLETED")
    print(f"{'='*60}")
    print(f"Final Test Accuracy: {test_accs[-1]:.4f}")
    print(f"\nBEST METRICS (from epoch {best_metrics['epoch']}):")
    print(f"Best Test Accuracy: {best_metrics['test_acc']:.4f}")
    print(f"Train Loss at Best Epoch: {best_metrics['train_loss']:.4f}")
    print(f"Train Accuracy at Best Epoch: {best_metrics['train_acc']:.4f}")
    print(f"Test Loss at Best Epoch: {best_metrics['test_loss']:.4f}")
    print(f"{'='*60}")