# EnStack: Stacking Ensemble for Vulnerability Detection

This notebook demonstrates the complete pipeline for training and evaluating the EnStack model on Google Colab.

## Pipeline Overview:
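1. Set up the environment: mount Google Drive, clone the repository, and install dependencies
2. Import libraries and set up logging
3. Load the configuration and (optionally) override paths for the Colab environment
4. Train base models (CodeBERT, GraphCodeBERT, UniXcoder)
5. Extract features from the trained models
6. Prepare meta-features from the extracted features and the dataset labels
7. Train the meta-classifier (stacking)
8. Evaluate ensemble performance
9. Visualize the results (optional)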

## 1. Setup Environment

In [None]:
# Mount Google Drive into the Colab filesystem
from google.colab import drive

DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Clone repository (if not already cloned)
import os

REPO_URL = "https://github.com/YOUR_USERNAME/EnStack-paper.git"  # Update with your repo URL
REPO_DIR = "/content/EnStack-paper"

if not os.path.exists(REPO_DIR):
    !git clone {REPO_URL} {REPO_DIR}
else:
    print("Repository already exists. Pulling latest changes...")
    !cd {REPO_DIR} && git pull

# Change to repository directory
%cd {REPO_DIR}

In [None]:
# Install dependencies
!pip install -r requirements.txt -q

## 2. Import Libraries and Setup Logging

In [None]:
# Standard-library and third-party imports used by the notebook
import sys
import logging
import numpy as np
import torch
from pathlib import Path

# Add src to path
# NOTE(review): this must run before the project imports below. Those imports
# use the `src.` package prefix, so the extra `<cwd>/src` entry presumably
# exists for bare sibling imports inside the package — confirm whether it is
# still required.
sys.path.insert(0, str(Path.cwd() / "src"))

# Project helpers: configuration/logging/seeding/device utilities, dataloader
# and model factories, the trainer, and the stacking (meta-classifier) helpers
from src.utils import load_config, setup_logging, set_seed, get_device
from src.dataset import create_dataloaders
from src.models import create_model
from src.trainer import EnStackTrainer
from src.stacking import (
    prepare_meta_features,
    train_meta_classifier,
    evaluate_meta_classifier,
    save_meta_classifier,
)

# Setup logging
# `logger` is reused by every later cell to report pipeline progress
logger = setup_logging(level=logging.INFO)
logger.info("EnStack pipeline initialized")

## 3. Load and Configure

In [None]:
# Read the pipeline configuration from the repository checkout
CONFIG_PATH = "configs/config.yaml"
config = load_config(CONFIG_PATH)

# Override paths for Colab environment (if needed)
# config["data"]["root_dir"] = "/content/drive/MyDrive/EnStack_Data"
# config["training"]["output_dir"] = "/content/drive/MyDrive/EnStack_Data/checkpoints"

# Apply the configured random seed
seed = config["training"]["seed"]
set_seed(seed)

# Device used by every trainer created later
device = get_device()

print(f"Configuration loaded. Using device: {device}")

## 4. Train Base Models

In [None]:
# Settings shared by every base model
base_model_names = config["model"]["base_models"]
training_cfg = config["training"]
num_epochs = training_cfg["epochs"]

# Trained models and their trainers, keyed by base-model name
trained_models = {}
trainers = {}

for model_name in base_model_names:
    separator = "=" * 60
    logger.info(f"\n{separator}")
    logger.info(f"Training {model_name.upper()}")
    logger.info(f"{separator}\n")

    # Build the model together with its tokenizer
    model, tokenizer = create_model(model_name, config, pretrained=True)

    # Dataloaders take the tokenizer, so they are built once per model
    train_loader, val_loader, test_loader = create_dataloaders(config, tokenizer)

    # Each model writes to its own sub-directory of the configured output dir
    model_output_dir = f"{training_cfg['output_dir']}/{model_name}"
    trainer = EnStackTrainer(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        test_loader=test_loader,
        learning_rate=training_cfg["learning_rate"],
        device=device,
        output_dir=model_output_dir,
    )

    # Run training for the configured number of epochs
    history = trainer.train(num_epochs=num_epochs, save_best=True)

    # Report test metrics only when a test split is available
    if test_loader is not None:
        test_metrics = trainer.evaluate(test_loader, split_name="Test")
        logger.info(f"{model_name} Test Results: {test_metrics}")

    # Keep both objects; the feature-extraction cell looks trainers up by name
    trained_models[model_name] = model
    trainers[model_name] = trainer

    logger.info(f"\n{model_name} training completed\n")

## 5. Extract Features for Stacking

In [None]:
# Imported once for the whole cell (previously re-executed on every loop
# iteration); needed to rebuild each model's dataloaders below.
from transformers import AutoTokenizer

logger.info("\n" + "="*60)
logger.info("FEATURE EXTRACTION FOR STACKING")
logger.info("="*60 + "\n")

# Per-split lists holding one feature array per base model,
# in the same order as `base_model_names`
train_features_list = []
val_features_list = []
test_features_list = []

for model_name in base_model_names:
    logger.info(f"Extracting features from {model_name}...")

    trainer = trainers[model_name]

    # Recreate the dataloaders with this model's own tokenizer, loaded from
    # the checkpoint name configured in config["model"]["model_map"]
    tok = AutoTokenizer.from_pretrained(config["model"]["model_map"][model_name])
    train_loader, val_loader, test_loader = create_dataloaders(config, tok)

    # NOTE(review): the labels used for stacking are read from the data files
    # in on-disk order. This assumes the loaders yield samples in that same
    # order (no shuffling, no dropped batches) — confirm in create_dataloaders.

    # Extract features for every split that has a loader
    if train_loader:
        train_features = trainer.extract_features(train_loader)
        train_features_list.append(train_features)

    if val_loader:
        val_features = trainer.extract_features(val_loader)
        val_features_list.append(val_features)

    if test_loader:
        test_features = trainer.extract_features(test_loader)
        test_features_list.append(test_features)

logger.info("Feature extraction completed")

## 6. Prepare Meta-Features

In [None]:
# Get labels from datasets
import pickle
from pathlib import Path

def load_labels(data_path, label_column="target"):
    """Load the label column from a pickled dataset file.

    Args:
        data_path: Path (str or path-like) of a pickle file holding either a
            pandas DataFrame or an object accepted by ``pd.DataFrame(...)``.
        label_column: Name of the column that stores the labels.
            Defaults to ``"target"``.

    Returns:
        numpy.ndarray: The values of the label column, in file order.

    Raises:
        KeyError: If ``label_column`` is not present in the loaded data.
    """
    import pandas as pd

    # SECURITY: pickle.load can execute arbitrary code while deserializing;
    # only point this at dataset files from a trusted source.
    with open(data_path, "rb") as f:
        data = pickle.load(f)

    # Normalise non-DataFrame payloads (e.g. a list of records) to a DataFrame
    if not isinstance(data, pd.DataFrame):
        data = pd.DataFrame(data)

    # Fail with an actionable message instead of a bare KeyError
    if label_column not in data.columns:
        raise KeyError(
            f"Column '{label_column}' not found in {data_path}; "
            f"available columns: {list(data.columns)}"
        )
    return data[label_column].values

# Dataset files are resolved relative to the configured data root
data_cfg = config["data"]
root_dir = Path(data_cfg["root_dir"])

# Labels for each split, read straight from the dataset files
train_labels = load_labels(root_dir / data_cfg["train_file"])
val_labels = load_labels(root_dir / data_cfg["val_file"])
test_labels = load_labels(root_dir / data_cfg["test_file"])

# Build the meta-feature arrays; the second returned element is not needed here
train_meta_features, _ = prepare_meta_features(train_features_list, train_labels)
val_meta_features, _ = prepare_meta_features(val_features_list, val_labels)
test_meta_features, _ = prepare_meta_features(test_features_list, test_labels)

# Log the resulting shape of every split
for split_label, split_features in (
    ("Train", train_meta_features),
    ("Val", val_meta_features),
    ("Test", test_meta_features),
):
    logger.info(f"{split_label} meta-features shape: {split_features.shape}")

## 7. Train Meta-Classifier

In [None]:
logger.info("\n" + "=" * 60)
logger.info("TRAINING META-CLASSIFIER")
logger.info("=" * 60 + "\n")

# Classifier type comes from the config and falls back to "svm"
model_cfg = config["model"]
meta_classifier_type = model_cfg.get("meta_classifier", "svm")

# Fit the meta-classifier on the training meta-features with the configured seed
meta_seed = config["training"]["seed"]
meta_classifier = train_meta_classifier(
    train_meta_features,
    train_labels,
    classifier_type=meta_classifier_type,
    random_state=meta_seed,
)

# Save the fitted classifier in the training output directory
meta_output_dir = config["training"]["output_dir"]
meta_save_path = f"{meta_output_dir}/meta_classifier.pkl"
save_meta_classifier(meta_classifier, meta_save_path)

## 8. Evaluate Ensemble

In [None]:
logger.info("\n" + "=" * 60)
logger.info("ENSEMBLE EVALUATION")
logger.info("=" * 60 + "\n")

# Score the ensemble on the validation split
logger.info("Validation Set Results:")
val_metrics = evaluate_meta_classifier(meta_classifier, val_meta_features, val_labels)

# Score the ensemble on the test split
logger.info("\nTest Set Results:")
test_metrics = evaluate_meta_classifier(meta_classifier, test_meta_features, test_labels)

# Print one summary section per split
rule = "=" * 60
print("\n" + rule)
print("FINAL RESULTS SUMMARY")
print(rule)
for split_title, split_metrics in (("Validation", val_metrics), ("Test", test_metrics)):
    print(f"\n{split_title} Metrics:")
    for metric_name, metric_value in split_metrics.items():
        print(f"  {metric_name.capitalize()}: {metric_value:.4f}")
print("\n" + rule)

## 9. Visualization (Optional)

In [None]:
# Plot confusion matrix (optional)
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Ensemble predictions on the test split
test_predictions = meta_classifier.predict(test_meta_features)

# Draw the matrix on an explicitly created axes instead of the implicit one
fig, ax = plt.subplots()
cm = confusion_matrix(test_labels, test_predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(ax=ax, cmap="Blues")
ax.set_title("EnStack Confusion Matrix (Test Set)")
plt.show()