In [None]:
### Work in progress: a few fixes are still needed ###

# Install necessary libraries (note: no versions are pinned on this line)
!pip install gensim transformers datasets torch scipy

# Core PyTorch and Data Handling
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import load_dataset
import gensim
import gensim.downloader as api
from gensim.models import Word2Vec
from sklearn.preprocessing import MinMaxScaler  # For attention weight normalization
from torch.optim.lr_scheduler import ReduceLROnPlateau  # For learning rate
from typing import Dict, List, Tuple, Optional
from datasets import load_dataset, DatasetDict
import gensim.downloader as api




# Device configuration: use the GPU when PyTorch can see one, otherwise the CPU.
# `device` is a module-level global: load_and_preprocess_data() and
# grid_search() read it directly instead of receiving it as an argument.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# For Grid Search and Hyperparameter Tuning
import itertools  # Used for iterating through parameter combinations
from sklearn.model_selection import KFold  # Used for k-fold cross-validation

# NLP and Transformers
from transformers import AutoTokenizer

# For plotting and visualization
import matplotlib.pyplot as plt
# Metric Recording
import time

 #   return device

def load_and_preprocess_data(dataset_name="imdb", max_train_samples=500, max_eval_samples=100):
    """Load a dataset, split it, shrink it for debugging and tokenize it.

    The ``train`` split is divided 80/20 into train/validation, each split is
    cut down to a small debugging subset, and only that subset is tokenized
    with the ``distilbert-base-uncased`` tokenizer (padded/truncated to the
    tokenizer's maximum length). Subsetting happens *before* tokenization so
    that the discarded rows are never tokenized; the selected rows are the
    same ones a tokenize-then-select order would keep.

    Args:
        dataset_name: Name passed to ``datasets.load_dataset``. The dataset
            must provide ``train`` and ``test`` splits with ``text`` and
            ``label`` columns.
        max_train_samples: Maximum number of training rows to keep, or
            ``None`` to keep all of them.
        max_eval_samples: Maximum number of rows to keep in each of the
            validation and test splits, or ``None`` to keep all of them.

    Returns:
        A 5-tuple ``(tokenized_dataset, w2v_model, tokenizer, labels,
        criterion)`` where

        * ``tokenized_dataset`` is a ``DatasetDict`` with ``train``,
          ``validation`` and ``test`` splits formatted as torch tensors with
          columns ``input_ids``, ``attention_mask`` and ``labels``;
        * ``w2v_model`` is the object returned by
          ``gensim.downloader.load('word2vec-google-news-300')``;
        * ``tokenizer`` is the HuggingFace tokenizer used for tokenization;
        * ``labels`` is a 1-D tensor with the labels of the full 80% training
          split (taken before the debugging subset is applied);
        * ``criterion`` is a ``BCEWithLogitsLoss`` with ``pos_weight=2.0``
          placed on the module-level ``device``.

    Raises:
        RuntimeError: If the Word2Vec vectors cannot be loaded. (Previously
            the function returned ``None`` here, which made the caller's
            five-way unpacking fail with an unrelated ``TypeError``.)
    """
    # Step 1: Load the dataset
    dataset = load_dataset(dataset_name)

    # Step 2: Carve a validation split out of the training data (80% / 20%)
    train_valid_split = dataset['train'].train_test_split(test_size=0.2)
    splits = DatasetDict({
        'train': train_valid_split['train'],
        'validation': train_valid_split['test'],
        'test': dataset['test']
    })

    # Step 3: Load the Word2Vec model; fail loudly so the caller sees the cause
    try:
        w2v_model = api.load('word2vec-google-news-300')
    except Exception as e:
        raise RuntimeError(f"Error loading Word2Vec model: {e}") from e
    print("Word2Vec model loaded successfully.")

    # Step 4: Initialize tokenizer
    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

    # Step 5: Labels of the full training split, read straight from the column
    labels = torch.tensor(list(splits['train']['label']))

    # Step 6: Reduce every split for debugging, never asking for more rows
    # than the split actually contains
    split_limits = {
        'train': max_train_samples,
        'validation': max_eval_samples,
        'test': max_eval_samples,
    }
    for split_name, limit in split_limits.items():
        if limit is not None:
            keep = min(limit, len(splits[split_name]))
            splits[split_name] = splits[split_name].select(range(keep))

    # Step 7: Tokenize the (reduced) splits in batches
    def tokenize_batch(batch):
        return tokenizer(batch["text"], padding="max_length", truncation=True)

    print("Tokenizing dataset in batches...")
    tokenized_dataset = splits.map(
        tokenize_batch,
        batched=True,  # Process in batches
        batch_size=64,  # Adjust this batch size based on memory limits
        remove_columns=["text"]  # Remove the original text column to save memory
    )

    # Step 8: Rename and reformat dataset
    tokenized_dataset = tokenized_dataset.rename_column("label", "labels")
    tokenized_dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

    # Step 9: Define loss function
    class_weight = 2.0  # Adjust based on your dataset imbalance
    weights = torch.tensor([class_weight]).to(device)
    criterion = nn.BCEWithLogitsLoss(pos_weight=weights)

    # Print dataset sizes for verification
    print(f"Train dataset size: {len(tokenized_dataset['train'])}")
    print(f"Validation dataset size: {len(tokenized_dataset['validation'])}")
    print(f"Test dataset size: {len(tokenized_dataset['test'])}")

    return tokenized_dataset, w2v_model, tokenizer, labels, criterion


def tokenize_data(example, tokenizer):
    """Run ``tokenizer`` over the ``'text'`` field of a single example.

    Args:
        example (dict): Mapping that holds the raw text under the key ``'text'``.
        tokenizer: Callable tokenizer; it is invoked with
            ``padding='max_length'`` and ``truncation=True``.

    Returns:
        Whatever the tokenizer returns for that text.
    """
    text = example['text']
    encoded = tokenizer(text, padding='max_length', truncation=True)
    return encoded




class LSTMModel(nn.Module):
    """LSTM classifier over frozen Word2Vec embeddings with attention pooling.

    Token IDs produced by a HuggingFace tokenizer are mapped back to token
    strings, looked up in the Word2Vec vocabulary, embedded, run through an
    LSTM, pooled by ``Attention`` and projected to ``output_dim`` logits.

    Args:
        hidden_dim: Size of the LSTM hidden state.
        layer_dim: Number of stacked LSTM layers.
        output_dim: Number of output logits.
        w2v_model: Word-vector object exposing ``vector_size``,
            ``key_to_index`` and ``vectors`` (gensim ``KeyedVectors`` style).
        dropout_rate: Dropout probability applied to the pooled vector.
        tokenizer: Tokenizer exposing ``convert_ids_to_tokens``. When omitted
            the module-level global ``tokenizer`` is used, which is what the
            class silently relied on before this parameter existed.

    Raises:
        ValueError: If no tokenizer is passed and no global ``tokenizer``
            is defined.
    """

    # Tokens added by the tokenizer that carry no lexical content
    _SPECIAL_TOKENS = frozenset({'[PAD]', '[CLS]', '[SEP]'})

    def __init__(self, hidden_dim, layer_dim, output_dim, w2v_model, dropout_rate=0.2, tokenizer=None):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.output_dim = output_dim
        self.w2v_model = w2v_model
        self.dropout_rate = dropout_rate
        self.max_length = 512  # Ensure a consistent maximum sequence length

        if tokenizer is None:
            # Backward compatibility with callers that do not pass a tokenizer
            tokenizer = globals().get('tokenizer')
            if tokenizer is None:
                raise ValueError(
                    "LSTMModel needs a tokenizer: pass tokenizer=... or define "
                    "a module-level `tokenizer` before constructing the model."
                )
        self.tokenizer = tokenizer  # Used to turn token IDs back into strings

        # Embedding layer (frozen Word2Vec vectors)
        self.embedding_layer = self.create_embedding_layer(w2v_model)

        # LSTM layer
        self.lstm = nn.LSTM(w2v_model.vector_size, hidden_dim, layer_dim, batch_first=True)

        # Attention mechanism
        self.attention = Attention(hidden_dim)

        # Fully connected layer
        self.fc = nn.Linear(hidden_dim, output_dim)

        # Dropout
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        """Compute logits for a batch of tokenizer IDs.

        Args:
            x: Integer tensor of tokenizer IDs, one row per sequence.

        Returns:
            Tensor of shape ``(batch, output_dim)`` with raw logits. If any
            step raises, the error is printed and a zero tensor of that shape
            is returned instead.
        """
        # Captured up front so the fallback below does not depend on which
        # intermediate value `x` happened to hold when the error occurred.
        batch_size, input_device = x.size(0), x.device
        try:
            # Convert input tokens to Word2Vec indices and embed them
            x = self.convert_tokens_to_w2v_indices(x)
            x = self.embedding_layer(x)

            # nn.LSTM starts from zero hidden/cell states when none are given
            out, _ = self.lstm(x)

            # Guard against NaN/Inf activations
            if not torch.isfinite(out).all():
                print("Warning: Invalid LSTM output detected. Replacing with zeros.")
                out = torch.zeros_like(out)

            # Attention pooling, dropout and classification
            context_vector = self.attention(out)['context_vector']
            return self.fc(self.dropout(context_vector))
        except Exception as e:
            # NOTE(review): the fallback tensor is not attached to the
            # autograd graph, so a training step that hits this path will
            # still fail at loss.backward(); consider re-raising instead.
            print(f"Critical error in forward pass: {e}")
            return torch.zeros(batch_size, self.output_dim, device=input_device)

    def create_embedding_layer(self, w2v_model):
        """Build a frozen embedding layer from the Word2Vec vectors.

        ``from_pretrained`` is used so the (large) table is not first filled
        with random values and then overwritten. The weight may share memory
        with ``w2v_model.vectors`` while it stays on the CPU.
        """
        pretrained = torch.as_tensor(w2v_model.vectors, dtype=torch.float32)
        return nn.Embedding.from_pretrained(pretrained, freeze=True)

    def convert_tokens_to_w2v_indices(self, input_ids):
        """Map tokenizer IDs to Word2Vec vocabulary indices.

        Special tokens are dropped, every remaining token is lower-cased and
        looked up in ``w2v_model.key_to_index`` (unknown tokens map to index
        0), and each row is truncated or right-padded with 0 to
        ``self.max_length`` entries.

        Args:
            input_ids: 2-D integer tensor of tokenizer IDs.

        Returns:
            ``torch.long`` tensor on the same device as ``input_ids``.
        """
        key_to_index = self.w2v_model.key_to_index
        special_tokens = self._SPECIAL_TOKENS
        max_length = self.max_length

        w2v_indices = []
        # A single .tolist() call avoids one device-to-host copy per row
        for ids in input_ids.tolist():
            tokens = self.tokenizer.convert_ids_to_tokens(ids)
            seq_indices = [
                key_to_index.get(token.lower(), 0)
                for token in tokens if token not in special_tokens
            ][:max_length]
            seq_indices.extend([0] * (max_length - len(seq_indices)))
            w2v_indices.append(seq_indices)

        return torch.tensor(w2v_indices, dtype=torch.long, device=input_ids.device)

class Attention(nn.Module):
    """Attention pooling that collapses a sequence of hidden states to one vector.

    Each hidden state is scored with ``tanh(h @ weight + bias) @ query``; the
    scores are soft-maxed over the sequence dimension and used to form a
    weighted sum of the hidden states.

    Args:
        hidden_size: Size of the last dimension of the hidden states.
    """

    def __init__(self, hidden_size):
        super().__init__()
        # torch.FloatTensor(n, m) / torch.empty hand back uninitialized
        # memory, which can contain NaN or Inf; the parameters are therefore
        # always given explicit starting values in reset_parameters().
        self.weight = nn.Parameter(torch.empty(hidden_size, hidden_size))
        self.bias = nn.Parameter(torch.empty(hidden_size))
        self.query = nn.Parameter(torch.empty(hidden_size))
        self.reset_parameters()

    def reset_parameters(self):
        """(Re-)initialize all attention parameters with finite values."""
        nn.init.xavier_uniform_(self.weight)
        nn.init.zeros_(self.bias)
        bound = max(self.query.numel(), 1) ** -0.5
        nn.init.uniform_(self.query, -bound, bound)

    def forward(self, hidden_states):
        """
        Args:
            hidden_states: Tensor of shape (batch_size, seq_length, hidden_size)
        Returns:
            A dictionary with:
              - 'context_vector': The attention-weighted sum of hidden states,
                shape (batch_size, hidden_size)
              - 'attention_weights': The weights assigned to each hidden state,
                shape (batch_size, seq_length); each row sums to 1
        """
        energy = torch.tanh(torch.matmul(hidden_states, self.weight) + self.bias)
        # Softmax over dim=1 normalizes across the sequence positions
        attention_weights = torch.softmax(torch.matmul(energy, self.query), dim=1)
        context_vector = torch.sum(hidden_states * attention_weights.unsqueeze(-1), dim=1)

        return {
            'context_vector': context_vector,
            'attention_weights': attention_weights,
        }


def grid_search(param_grid, dataset, w2v_model, num_epochs=3, k=3):
    """Pick the hyperparameter combination with the lowest k-fold validation loss.

    Every combination in the Cartesian product of ``param_grid`` is trained
    from scratch on each of ``k`` folds of ``dataset['train']`` for
    ``num_epochs`` epochs. Each fold contributes exactly one score: the mean
    per-batch validation loss measured *after* training has finished.
    (Scores used to be recorded after every epoch, which mixed under-trained
    epochs into the average and divided by zero for ``num_epochs=0``.)

    Args:
        param_grid: Mapping from hyperparameter name to a list of candidate
            values. Must contain ``'batch_size'``, ``'learning_rate'``,
            ``'hidden_dim'``, ``'dropout_rate'`` and ``'layer_dim'``.
        dataset: Mapping with a ``'train'`` entry whose items are dicts with
            ``'input_ids'`` and ``'labels'``.
        w2v_model: Word-vector model forwarded to ``LSTMModel``.
        num_epochs: Number of training epochs per fold.
        k: Number of cross-validation folds.

    Returns:
        dict: The best parameter combination, or ``{}`` when no combination
        produced a finite score lower than infinity.
    """
    train_data = dataset['train']
    param_names = list(param_grid.keys())
    best_score = float('inf')
    best_params = {}

    for values in itertools.product(*param_grid.values()):
        param_dict = dict(zip(param_names, values))
        batch_size = param_dict['batch_size']

        fold_scores = []
        # KFold only needs the number of samples, so split plain positions
        for train_idx, val_idx in KFold(n_splits=k).split(range(len(train_data))):
            # .tolist() turns numpy integers into plain ints before they are
            # used to index the dataset
            train_subset = torch.utils.data.Subset(train_data, train_idx.tolist())
            val_subset = torch.utils.data.Subset(train_data, val_idx.tolist())

            train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)

            # Fresh model and optimizer for every fold
            model = LSTMModel(
                hidden_dim=param_dict['hidden_dim'],
                layer_dim=param_dict['layer_dim'],
                output_dim=1,
                w2v_model=w2v_model,
                dropout_rate=param_dict['dropout_rate']
            ).to(device)

            optimizer = torch.optim.Adam(model.parameters(), lr=param_dict['learning_rate'])
            criterion = nn.BCEWithLogitsLoss()

            # Train
            model.train()
            for _ in range(num_epochs):
                for batch in train_loader:
                    inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device).float()
                    optimizer.zero_grad()
                    outputs = model(inputs)
                    loss = criterion(outputs.squeeze(1), labels)
                    loss.backward()
                    optimizer.step()

            # Validate once, on the fully trained fold model
            model.eval()
            val_loss = 0.0
            with torch.no_grad():
                for batch in val_loader:
                    inputs, labels = batch['input_ids'].to(device), batch['labels'].to(device).float()
                    outputs = model(inputs)
                    val_loss += criterion(outputs.squeeze(1), labels).item()

            fold_scores.append(val_loss / max(len(val_loader), 1))

        # Update best params
        if not fold_scores:
            continue
        avg_cv_score = sum(fold_scores) / len(fold_scores)
        if avg_cv_score < best_score:
            best_score = avg_cv_score
            best_params = param_dict

    return best_params




def visualize_attention(normalized_weights, tokens):
    """
    Draw attention weights as a heatmap and show the figure.

    Args:
        normalized_weights (np.ndarray): 2-D array of already-normalized
            attention weights; drawn as-is by ``plt.imshow``.
        tokens (list): Token strings used as x-axis tick labels, one tick per
            token starting at position 0.
    """
    heatmap_options = {
        'cmap': 'magma',
        'interpolation': 'nearest',
        'aspect': 'auto',
    }
    tick_positions = range(len(tokens))

    plt.figure(figsize=(8, 6))

    # Heatmap plus the colour scale that explains it
    plt.imshow(normalized_weights, **heatmap_options)
    plt.colorbar(label="Attention Weight")

    # Title and axis captions
    plt.title("Attention Visualization")
    plt.xlabel("Input Tokens")
    plt.ylabel("Attention Heads")

    # Vertical tick labels keep long token lists legible
    plt.xticks(tick_positions, tokens, rotation=90)

    # Grid overlay, then tighten the layout so labels do not overlap
    plt.grid(True)
    plt.tight_layout()

    plt.show()

def train_model(model, train_loader, valid_loader, criterion, optimizer, device, num_epochs, best_valid_loss, epochs_without_improvement, patience):
    """Train ``model`` with gradient accumulation and early stopping.

    Gradients are accumulated over ``accumulation_steps`` (4) batches before
    each optimizer step. The gradient buffer is cleared only after a step, so
    every batch contributes to an update; any batches left over at the end of
    an epoch are flushed with a final step. Each batch loss is divided by
    ``accumulation_steps`` before ``backward()`` so the accumulated gradient
    is an average rather than a sum.

    Args:
        model: Module called as ``model(input_ids)`` and returning logits of
            shape ``(batch, 1)``.
        train_loader: Iterable of batches with ``'input_ids'`` and ``'labels'``.
        valid_loader: Loader passed to ``evaluate_model`` after every epoch.
        criterion: Loss called as ``criterion(logits, float_labels)``.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device the batches are moved to.
        num_epochs: Maximum number of epochs.
        best_valid_loss: Best validation loss seen so far (starting value).
        epochs_without_improvement: Starting count of non-improving epochs.
        patience: Stop once this many consecutive epochs fail to improve.

    Returns:
        tuple: ``(model, training_time)`` with the wall-clock time in seconds.
    """
    accumulation_steps = 4
    start_time = time.time()

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0.0
        pending_batches = 0  # backward passes accumulated since the last step
        optimizer.zero_grad()

        for batch_idx, batch in enumerate(train_loader):
            inputs = batch['input_ids'].to(device)
            labels = batch['labels'].to(device).float()

            # Add input validation
            if torch.isnan(inputs).any() or torch.isnan(labels).any():
                print(f"NaN values detected in batch {batch_idx}")
                continue

            outputs = model(inputs)

            # Ensure outputs and labels are valid
            if not torch.isfinite(outputs).all() or not torch.isfinite(labels).all():
                print(f"Invalid outputs or labels in batch {batch_idx}")
                continue

            loss = criterion(outputs.squeeze(1), labels)

            if torch.isnan(loss):
                print(f"NaN loss detected in batch {batch_idx}")
                continue

            # Scale so the accumulated gradient averages over the group
            (loss / accumulation_steps).backward()
            pending_batches += 1
            total_loss += loss.item()

            if pending_batches == accumulation_steps:
                optimizer.step()
                optimizer.zero_grad()
                pending_batches = 0

        # Flush gradients from an incomplete final accumulation group
        if pending_batches:
            optimizer.step()
            optimizer.zero_grad()

        # max(..., 1) keeps an empty loader from raising ZeroDivisionError
        avg_train_loss = total_loss / max(len(train_loader), 1)
        valid_loss = evaluate_model(model, valid_loader, criterion, device)

        print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss:.4f}, Valid Loss: {valid_loss:.4f}")

        # Early stopping logic
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print("Early stopping triggered.")
                break

    training_time = time.time() - start_time
    return model, training_time

def evaluate_model(model, valid_loader, criterion, device):
    """Return the sample-weighted average loss of ``model`` over ``valid_loader``.

    The model is switched to evaluation mode and left in that mode.

    Args:
        model: Module called as ``model(input_ids)`` and returning logits of
            shape ``(batch, 1)``.
        valid_loader: Iterable of batches with ``'input_ids'`` and ``'labels'``.
        criterion: Loss called as ``criterion(logits, float_labels)``. The
            per-batch value is weighted by the batch size, which assumes the
            criterion returns a per-batch mean.
        device: Device the batches are moved to.

    Returns:
        float: Total weighted loss divided by the number of samples seen.

    Raises:
        ValueError: If the loader yields no samples (previously this surfaced
            as an unexplained ``ZeroDivisionError``).
    """
    model.eval()  # Set model to evaluation mode
    weighted_loss = 0.0
    total_samples = 0

    with torch.no_grad():
        for batch in valid_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device).float()

            # Forward pass (the model only consumes input_ids)
            outputs = model(input_ids)

            # squeeze(1) turns (batch, 1) logits into (batch,) to match labels
            loss = criterion(outputs.squeeze(1), labels)
            batch_size = labels.size(0)
            weighted_loss += loss.item() * batch_size
            total_samples += batch_size

    if total_samples == 0:
        raise ValueError(
            "evaluate_model received no samples; the loader is empty "
            "(check the dataset size, batch_size and drop_last)."
        )

    return weighted_loss / total_samples

def early_stopping(avg_valid_loss, best_valid_loss, epochs_without_improvement, patience, epoch):
    """Update the early-stopping bookkeeping for one finished epoch.

    Args:
        avg_valid_loss: Validation loss of the epoch that just ended.
        best_valid_loss: Lowest validation loss seen before this epoch.
        epochs_without_improvement: Consecutive epochs without improvement
            before this one.
        patience: Number of non-improving epochs that triggers a stop.
        epoch: Zero-based epoch index, used only for the log message.

    Returns:
        tuple: ``(best_valid_loss, epochs_without_improvement, stop)`` where
        ``stop`` is ``True`` when training should end.
    """
    # A strictly lower loss resets the counter and never stops training
    if avg_valid_loss < best_valid_loss:
        return avg_valid_loss, 0, False

    stale_epochs = epochs_without_improvement + 1
    if stale_epochs >= patience:
        print(f"Early stopping at epoch {epoch + 1}")
        return best_valid_loss, stale_epochs, True  # Stop training

    return best_valid_loss, stale_epochs, False  # Continue training

def evaluate_test_set(model, test_loader, device):
    """Return the fraction of correctly classified samples in ``test_loader``.

    This function used to be a line-for-line copy of
    ``calculate_test_accuracy``; it now delegates to it so the accuracy logic
    lives in one place.

    Args:
        model: Module called as ``model(input_ids)`` and returning logits.
        test_loader: Iterable of batches with ``'input_ids'`` and ``'labels'``.
        device: Device the batches are moved to.

    Returns:
        float: Accuracy as a fraction in ``[0, 1]``.
    """
    return calculate_test_accuracy(model, test_loader, device)

def calculate_test_accuracy(model, test_loader, device):
    """Compute binary classification accuracy over a data loader.

    Logits are passed through a sigmoid and thresholded at 0.5; a prediction
    counts as correct when it equals the label. The model is put into
    evaluation mode and gradients are disabled for the whole pass.

    Args:
        model: Module called as ``model(input_ids)`` and returning logits.
        test_loader: Iterable of batches with ``'input_ids'`` and ``'labels'``.
        device: Device the batches are moved to.

    Returns:
        float: Number of correct predictions divided by the number of samples.
    """
    model.eval()
    num_correct, num_seen = 0, 0

    with torch.no_grad():
        for batch in test_loader:
            inputs = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)

            # Sigmoid turns logits into probabilities; >= 0.5 means class 1
            logits = model(inputs)
            predicted = (torch.sigmoid(logits) >= 0.5).int()

            # Compare after dropping singleton dimensions on both sides
            hits = predicted.squeeze() == labels.squeeze()
            num_seen += labels.size(0)
            num_correct += hits.sum().item()

    return num_correct / num_seen

def create_and_initialize_model(param_grid, tokenized_dataset, w2v_model, num_epochs, device):
    """Run the grid search and build a fresh model from the winning parameters.

    Model, optimizer, loss and scheduler construction is delegated to
    ``_create_and_initialize_model_with_params`` so that it is not duplicated
    here.

    Args:
        param_grid: Hyperparameter grid forwarded to ``grid_search``.
        tokenized_dataset: Dataset mapping forwarded to ``grid_search``.
        w2v_model: Word-vector model used by the grid search and the model.
        num_epochs: Number of epochs used for each grid-search fold.
        device: Device the final model is moved to.

    Returns:
        tuple: ``(model, optimizer, criterion, scheduler, best_params)``.

    Raises:
        ValueError: If the grid search returns no parameters (for example
            because every candidate produced a NaN validation loss).
    """
    # Use grid_search to find the best hyperparameters
    best_params = grid_search(param_grid, tokenized_dataset, w2v_model, num_epochs=num_epochs, k=3)
    if not best_params:
        raise ValueError(
            "grid_search did not select any hyperparameters; check that "
            "param_grid is non-empty and that validation losses are finite."
        )

    # Build model, optimizer, criterion and scheduler from the best parameters
    model, optimizer, criterion, scheduler = _create_and_initialize_model_with_params(
        best_params, tokenized_dataset, w2v_model, num_epochs, device
    )

    return model, optimizer, criterion, scheduler, best_params


def prepare_data_and_evaluate(tokenized_dataset, best_params, best_model, device, tokenizer):
    """Build the data loaders and report the model's current test performance.

    Only the training loader drops its last incomplete batch; the validation
    and test loaders keep every sample so that reported metrics cover the
    whole split.

    Args:
        tokenized_dataset: Mapping with ``'train'``, ``'validation'`` and
            ``'test'`` datasets.
        best_params: Hyperparameters; only ``'batch_size'`` is read here.
        best_model: Model to evaluate and visualize.
        device: Device used for evaluation.
        tokenizer: Tokenizer forwarded to ``visualize_model_attention``.

    Returns:
        tuple: ``(train_loader, valid_loader, test_loader, best_valid_loss,
        epochs_without_improvement, patience)`` where the last three are the
        fixed early-stopping starting values ``float('inf')``, ``0`` and ``3``.
    """
    batch_size = best_params['batch_size']

    # Create DataLoaders for training, validation, and testing
    best_train_loader = DataLoader(
        tokenized_dataset['train'],
        batch_size=batch_size,
        shuffle=True,
        drop_last=True
    )
    best_valid_loader = DataLoader(
        tokenized_dataset['validation'],
        batch_size=batch_size,
        shuffle=False,
        drop_last=False  # evaluate on every validation sample
    )
    best_test_loader = DataLoader(
        tokenized_dataset['test'],
        batch_size=batch_size,
        shuffle=False,
        drop_last=False  # evaluate on every test sample
    )

    # Evaluate on the test set; accuracy is a fraction in [0, 1], so let the
    # format spec convert it to a percentage instead of appending a bare '%'
    test_accuracy = calculate_test_accuracy(best_model, best_test_loader, device)
    print(f"Test Accuracy: {test_accuracy:.2%}")

    # Get a batch from the test loader for visualization
    batch = next(iter(best_test_loader))
    inputs = batch['input_ids'].to(device)

    # Visualize attention
    visualize_model_attention(best_model, inputs, tokenizer, device)

    return best_train_loader, best_valid_loader, best_test_loader, float('inf'), 0, 3

def visualize_model_attention(model, inputs, tokenizer, device):
    """Plot the attention weights the model assigns to a batch of inputs.

    Args:
        model: Model exposing ``convert_tokens_to_w2v_indices``,
            ``embedding_layer``, ``lstm`` and ``attention``.
        inputs: Batch of tokenizer IDs, one row per sequence.
        tokenizer: Tokenizer used to turn the first row's IDs back into
            tokens for the x-axis labels.
        device: Unused; kept so existing callers do not break.

    Notes:
        * Weights are min-max scaled per sequence (row). ``MinMaxScaler``
          scales each column independently, so the matrix is transposed
          before and after scaling.
        * The model drops ``[PAD]``, ``[CLS]`` and ``[SEP]`` before embedding,
          so the same tokens are removed from the tick labels to keep
          position ``i`` aligned with the ``i``-th attention weight.
        * Tick labels come from the first sequence of the batch only.
    """
    # Inference only: no autograd graph is needed for plotting
    with torch.no_grad():
        word2vec_indices = model.convert_tokens_to_w2v_indices(inputs)
        word_embeddings = model.embedding_layer(word2vec_indices)
        hidden_states, _ = model.lstm(word_embeddings)
        attention_weights = model.attention(hidden_states)['attention_weights']

    # Move to CPU and convert to a NumPy array of shape (batch, seq_length)
    attention_weights = attention_weights.detach().cpu().numpy()

    # Normalize each sequence's weights to [0, 1] for better visualization
    scaler = MinMaxScaler()
    normalized_weights = scaler.fit_transform(attention_weights.T).T

    # Tokens of the first sequence, filtered the same way the model filters
    special_tokens = {'[PAD]', '[CLS]', '[SEP]'}
    tokens = [
        token
        for token in tokenizer.convert_ids_to_tokens(inputs[0].tolist())
        if token not in special_tokens
    ]
    tokens = tokens[:normalized_weights.shape[1]]

    # Visualize attention
    visualize_attention(normalized_weights, tokens)

def _create_and_initialize_model_with_params(params, tokenized_dataset, w2v_model, num_epochs, device):
    """Build a model plus its training objects from an explicit parameter dict.

    Args:
        params: Mapping with ``'hidden_dim'``, ``'layer_dim'``,
            ``'dropout_rate'`` and ``'learning_rate'``.
        tokenized_dataset: Not used by this function.
        w2v_model: Word-vector model forwarded to ``LSTMModel``.
        num_epochs: Not used by this function.
        device: Device the model is moved to.

    Returns:
        tuple: ``(model, optimizer, criterion, scheduler)`` — an ``LSTMModel``
        with a single output logit, an Adam optimizer, a ``BCEWithLogitsLoss``
        and a ``StepLR`` scheduler (``step_size=1``, ``gamma=0.7``).
    """
    model_kwargs = {
        'hidden_dim': params['hidden_dim'],
        'layer_dim': params['layer_dim'],
        'output_dim': 1,  # single logit for binary classification
        'w2v_model': w2v_model,
        'dropout_rate': params['dropout_rate'],
    }
    model = LSTMModel(**model_kwargs)
    model.to(device)

    learning_rate = params['learning_rate']
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Same loss as the grid search uses
    criterion = nn.BCEWithLogitsLoss()

    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.7)
    return model, optimizer, criterion, scheduler


In [None]:
# Driver cell: runs the whole pipeline at module (notebook) scope.
# NOTE: the statements below are intentionally not wrapped in main().
# LSTMModel picks up the name `tokenizer` from module scope when it is
# constructed, so `tokenizer` has to be a global by the time step 4 runs.
#def main():
"""Main function to execute the training and evaluation pipeline."""

# Number of epochs, used both for each grid-search fold and for final training
num_epochs = 10  # You can adjust the number of epochs as needed

# 1. Setup Environment and Device
#device = setup_environment_and_device()
# Rebinds the global `device` that the functions above read
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 2. Load and Preprocess Data
# `labels` is not used again in this cell, and the `criterion` returned here
# is replaced by the one returned in step 4.
tokenized_dataset, w2v_model, tokenizer, labels, criterion = load_and_preprocess_data()

# 3. Define Hyperparameter Grid (2 * 2 * 2 * 1 * 1 = 8 combinations)
param_grid = {
    'batch_size': [16, 8],
    'learning_rate': [0.0001, 0.0005],
    'hidden_dim': [64, 128],
    'dropout_rate': [0.1],
    'layer_dim': [1]
}

# 4. Perform grid search to find best params and initialize model
# `scheduler` is returned but never stepped anywhere in this cell.
best_model, optimizer, criterion, scheduler, best_params = create_and_initialize_model(
    param_grid, tokenized_dataset, w2v_model, num_epochs, device
)

print(f"Best Parameters Found: {best_params}")

# 5. Prepare Data and Evaluate (Initial)
# This evaluates and visualizes `best_model` before train_model() has run,
# i.e. it is a baseline for the freshly initialized model.
best_train_loader, best_valid_loader, best_test_loader, best_valid_loss, epochs_without_improvement, patience = prepare_data_and_evaluate(
    tokenized_dataset, best_params, best_model, device, tokenizer  # Note: best_params is obtained from create_and_initialize_model
)

# 6. Train Model with Early Stopping
best_model, training_time = train_model(best_model, best_train_loader, best_valid_loader, criterion, optimizer, device, num_epochs,
                                          best_valid_loss, epochs_without_improvement, patience)

# 7. Evaluate on Test Set and Visualize Attention
test_accuracy = calculate_test_accuracy(best_model, best_test_loader, device)
print(f"Final Test Accuracy: {test_accuracy:.4f}")

# Get a batch from the test loader for visualization
batch = next(iter(best_test_loader))
inputs = batch['input_ids'].to(device)

visualize_model_attention(best_model, inputs, tokenizer, device)

print(f"Total training time: {training_time:.2f} seconds")

#if __name__ == "__main__":
 #   main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

unsupervised-00000-of-00001.parquet:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]



TypeError: cannot unpack non-iterable NoneType object