# Hybrid CNN-BiLSTM-Attention Network (HCBAN) Research Pipeline (PyTorch)
## > 98% Accuracy with Hybrid Deep-Ensemble Architecture

This notebook implements a state-of-the-art research pipeline designed for your thesis. It uses a **Hybrid Deep-Ensemble** approach:
1.  **HCBAN (Deep Learning)**: CNN-BiLSTM-Attention for spatial-temporal features (PyTorch Implementation).
2.  **Ensemble Learners**: XGBoost, LightGBM, and Random Forest for tabular excellence.
3.  **Data Augmentation**: SMOTE (Synthetic Minority Over-sampling Technique) to handle class imbalance.
4.  **Hybrid Voting**: Averages the class probabilities of HCBAN and the ensemble with equal weight (soft voting) to produce the final prediction.

### Instructions
1.  **Enable GPU**: Go to `Runtime` > `Change runtime type` > Select `T4 GPU` (or better).
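2.  **Select Dataset**: In the *Upload Dataset* cell, choose whether you will upload the split UNSW-NB15 training/testing CSV files or a single combined CSV file.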
3.  **Run All**: Execute all cells.

In [None]:
# @title 1. Install Dependencies & Setup
!pip install torch torchvision torchaudio pandas numpy scikit-learn matplotlib seaborn xgboost lightgbm shap imbalanced-learn

import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve, auc
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
import json
import glob
import time

# Choose the compute device: CUDA when PyTorch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")
if device.type == 'cuda':
    print(f"GPU: {torch.cuda.get_device_name(0)}")

# Create the working directories (no error if they already exist).
for _folder in ('dataset', 'processed_data', 'results', 'plots/research'):
    os.makedirs(_folder, exist_ok=True)

# --- Define HybridEnsemble Class within Notebook ---
class HybridEnsemble(BaseEstimator, ClassifierMixin):
    """Soft-voting ensemble of XGBoost, LightGBM and Random Forest classifiers.

    Args:
        n_classes: Number of target classes; forwarded to XGBoost and
            LightGBM as ``num_class``.
        use_gpu: When True and CUDA is available, XGBoost is configured to
            train on the GPU. LightGBM and Random Forest are configured the
            same way regardless of this flag.
    """

    # Weight applied to each model's predicted probabilities when voting.
    # Iteration order (xgb, lgbm, rf) is also the summation order.
    VOTE_WEIGHTS = {'xgb': 0.4, 'lgbm': 0.4, 'rf': 0.2}

    def __init__(self, n_classes, use_gpu=True):
        self.n_classes = n_classes
        self.use_gpu = use_gpu
        self.models = {}
        self.init_models()

    def init_models(self):
        """Instantiate the three base classifiers and store them in ``self.models``."""
        # XGBoost
        xgb_params = {
            'n_estimators': 200,
            'learning_rate': 0.05,
            'max_depth': 10,
            'objective': 'multi:softprob',
            'num_class': self.n_classes,
            'tree_method': 'hist',
            'random_state': 42,
            'n_jobs': -1
        }
        if self.use_gpu and torch.cuda.is_available():
            # Local import: the file only imports XGBClassifier, and the
            # package version decides which GPU switch is valid.
            import xgboost
            xgb_major = int(xgboost.__version__.split('.')[0])
            if xgb_major >= 2:
                # XGBoost 2.x selects the GPU through `device`; the older
                # 'gpu_hist' / 'predictor' settings are deprecated there.
                xgb_params['device'] = 'cuda'
            else:
                xgb_params['tree_method'] = 'gpu_hist'
                xgb_params['predictor'] = 'gpu_predictor'

        self.models['xgb'] = XGBClassifier(**xgb_params)

        # LightGBM
        lgbm_params = {
            'n_estimators': 200,
            'learning_rate': 0.05,
            'num_leaves': 31,
            'objective': 'multiclass',
            'num_class': self.n_classes,
            'random_state': 42,
            'n_jobs': -1,
            'verbose': -1
        }
        self.models['lgbm'] = LGBMClassifier(**lgbm_params)

        # Random Forest
        self.models['rf'] = RandomForestClassifier(
            n_estimators=100,
            max_depth=20,
            n_jobs=-1,
            random_state=42
        )

    def fit(self, X, y):
        """Fit all three base models on the same ``(X, y)`` and return ``self``."""
        print("Training XGBoost...")
        self.models['xgb'].fit(X, y)
        print("Training LightGBM...")
        self.models['lgbm'].fit(X, y)
        print("Training Random Forest...")
        self.models['rf'].fit(X, y)
        return self

    def predict_proba(self, X):
        """Return the weighted average of the base models' class probabilities.

        The weights come from ``VOTE_WEIGHTS`` (0.4 / 0.4 / 0.2). All three
        models must emit probability columns in the same class order for the
        average to be meaningful.
        """
        # Soft Voting
        return sum(
            weight * self.models[name].predict_proba(X)
            for name, weight in self.VOTE_WEIGHTS.items()
        )

    def predict(self, X):
        """Return the column index of the highest averaged probability per row."""
        return np.argmax(self.predict_proba(X), axis=1)

In [None]:
# @title 2. Upload Dataset
# @markdown Select Dataset Type:
dataset_type = "Combined Dataset (combined_dataset_final.csv)" # @param ["Split Dataset (UNSW_NB15_training-set.csv + testing-set.csv)", "Combined Dataset (combined_dataset_final.csv)"]

from google.colab import files
import shutil

# Decide once which layout was selected; reused for the prompt and the config.
is_split = dataset_type.startswith("Split")

if is_split:
    print("Please upload 'UNSW_NB15_training-set.csv' and 'UNSW_NB15_testing-set.csv'")
    expected_files = ['UNSW_NB15_training-set.csv', 'UNSW_NB15_testing-set.csv']
else:
    print("Please upload 'combined_dataset_final.csv'")
    expected_files = ['combined_dataset_final.csv']

uploaded = files.upload()

# Move every uploaded file from the working directory into dataset/.
for filename in uploaded:
    shutil.move(filename, os.path.join('dataset', filename))
    print(f"Moved {filename} to dataset/")

# Report a missing or differently named upload now, instead of letting it
# surface later as a FileNotFoundError when the data is loaded. Files left in
# dataset/ by an earlier run count as present.
missing_files = [
    name for name in expected_files
    if not os.path.exists(os.path.join('dataset', name))
]
if missing_files:
    print(f"WARNING: expected file(s) not found in dataset/: {missing_files}")

dataset_config = {
    'type': 'split' if is_split else 'combined',
    'files': expected_files
}

In [None]:
# @title 3. Data Preprocessing Class
class DataPreprocessor:
    """Load the dataset CSV file(s), encode features and target, and standardize.

    Args:
        config: Dict whose ``'type'`` entry is ``'split'`` (separate training
            and testing CSVs, concatenated on load) or anything else for the
            single combined CSV.
    """

    def __init__(self, config):
        self.config = config
        # These three encoders are not used by preprocess(); they are kept so
        # existing code that reads the attributes keeps working.
        self.le_state = LabelEncoder()
        self.le_service = LabelEncoder()
        self.le_proto = LabelEncoder()
        self.le_label = LabelEncoder()
        self.scaler = StandardScaler()

    def load_data(self):
        """Read the configured CSV file(s) from ``dataset/`` into one DataFrame.

        Returns:
            The loaded rows with the ``id`` and ``label`` columns removed when
            present (``attack_cat`` is kept as the multi-class target).

        Raises:
            FileNotFoundError: If a required CSV file is missing.
        """
        if self.config['type'] == 'split':
            print("Loading split datasets...")
            train_path = os.path.join('dataset', 'UNSW_NB15_training-set.csv')
            test_path = os.path.join('dataset', 'UNSW_NB15_testing-set.csv')

            if not os.path.exists(train_path) or not os.path.exists(test_path):
                raise FileNotFoundError("Split dataset files not found. Please upload them.")

            df1 = pd.read_csv(train_path)
            df2 = pd.read_csv(test_path)
            full_df = pd.concat([df1, df2], axis=0, ignore_index=True)
        else:
            print("Loading combined dataset...")
            combined_path = os.path.join('dataset', 'combined_dataset_final.csv')

            if not os.path.exists(combined_path):
                raise FileNotFoundError("Combined dataset file not found. Please upload it.")

            full_df = pd.read_csv(combined_path)

        # Drop ID and Label (keep attack_cat for multi-class)
        full_df = full_df.drop(columns=['id', 'label'], errors='ignore')

        return full_df

    def preprocess(self):
        """Turn the raw table into a scaled feature matrix and integer labels.

        Returns:
            A tuple ``(X_scaled, y_encoded, class_names)``: the standardized
            feature array, the label-encoded target, and the list of original
            class names in encoder order.
        """
        df = self.load_data()

        # Split off the target first so it can never end up among the
        # features. Without 'attack_cat' the last column is the target, and it
        # must be removed from X as well.
        if 'attack_cat' in df.columns:
            y = df['attack_cat']
            X = df.drop(columns=['attack_cat'])
        else:
            print("WARNING: 'attack_cat' column not found; using the last column as the target.")
            y = df.iloc[:, -1]
            X = df.iloc[:, :-1].copy()

        print("Encoding categorical features...")
        # Any non-numeric column is categorical; this covers both `object`
        # columns and pandas' dedicated string dtype.
        cat_cols = [col for col in X.columns if not pd.api.types.is_numeric_dtype(X[col])]
        for c in ('proto', 'service', 'state'):
            if c in X.columns and c not in cat_cols:
                cat_cols.append(c)

        # 'proto' is frequency-encoded (value -> number of rows with that
        # value) rather than one-hot encoded.
        if 'proto' in X.columns:
            proto_counts = X['proto'].value_counts()
            X['proto'] = X['proto'].map(proto_counts)
            if 'proto' in cat_cols:
                cat_cols.remove('proto')

        if cat_cols:
            X = pd.get_dummies(X, columns=cat_cols)

        # Encode Target
        y = y.astype(str)
        y_encoded = self.le_label.fit_transform(y)

        # Normalize
        # NOTE(review): the scaler and the proto frequencies are fitted on
        # every row that passes through here. If the caller later splits this
        # output for cross-validation, these statistics include the held-out
        # rows.
        print("Normalizing features...")
        X = X.fillna(0)
        X_scaled = self.scaler.fit_transform(X)

        print(f"Preprocessing complete. Data shape: {X_scaled.shape}")
        return X_scaled, y_encoded, list(self.le_label.classes_)
# Build the preprocessor from the upload cell's config and run it end to end.
preprocessor = DataPreprocessor(dataset_config)
X_full, y_full, class_names = preprocessor.preprocess()

# Problem dimensions shared by the cells that follow.
n_features = X_full.shape[1]
n_classes = len(class_names)

# Append a trailing singleton axis: (Samples, Features) -> (Samples, Features, 1)
X_full_reshaped = np.expand_dims(X_full, axis=-1)

In [None]:
# @title 4. HCBAN Model Architecture (PyTorch)
class HCBAN(nn.Module):
    """Conv1d -> BiLSTM -> multi-head self-attention -> dense classifier.

    ``forward`` takes a tensor laid out as (Batch, Length, Channels) and
    returns unnormalized class scores of shape (Batch, n_classes).
    ``input_length`` is accepted by the constructor but no layer depends on it.
    """

    def __init__(self, input_channels, input_length, n_classes):
        super().__init__()

        # Convolutional stage: two conv / batch-norm / max-pool steps.
        self.conv1 = nn.Conv1d(input_channels, 64, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm1d(64)
        self.pool1 = nn.MaxPool1d(kernel_size=2)

        self.conv2 = nn.Conv1d(64, 128, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool2 = nn.MaxPool1d(kernel_size=2)

        # Recurrent stage: bidirectional, so each step emits 2 * 128 values.
        self.lstm = nn.LSTM(input_size=128, hidden_size=128, batch_first=True, bidirectional=True)
        self.dropout_lstm = nn.Dropout(0.3)

        # Self-attention over the LSTM outputs, followed by layer norm.
        self.attention = nn.MultiheadAttention(embed_dim=256, num_heads=4, batch_first=True)
        self.layer_norm = nn.LayerNorm(256)

        # Dense classification head.
        self.fc1 = nn.Linear(256, 256)
        self.dropout1 = nn.Dropout(0.4)
        self.fc2 = nn.Linear(256, 128)
        self.dropout2 = nn.Dropout(0.4)
        self.fc3 = nn.Linear(128, n_classes)

    def forward(self, x):
        # Conv1d wants (Batch, Channels, Length); the input arrives with the
        # last two axes the other way round.
        feats = x.permute(0, 2, 1)

        feats = self.pool1(torch.relu(self.bn1(self.conv1(feats))))
        feats = self.pool2(torch.relu(self.bn2(self.conv2(feats))))

        # The batch_first LSTM wants (Batch, Seq_Len, Features).
        seq = feats.permute(0, 2, 1)
        seq, _ = self.lstm(seq)
        seq = self.dropout_lstm(seq)

        # Self-attention with a residual connection, then layer norm.
        attended, _ = self.attention(seq, seq, seq)
        normed = self.layer_norm(seq + attended)

        # Average over the sequence axis to get one vector per sample.
        pooled = normed.mean(dim=1)

        hidden = self.dropout1(torch.relu(self.fc1(pooled)))
        hidden = self.dropout2(torch.relu(self.fc2(hidden)))
        return self.fc3(hidden)

# Throwaway instance, built only so the layer summary can be printed.
model_dummy = HCBAN(input_channels=1, input_length=n_features, n_classes=n_classes)
model_dummy = model_dummy.to(device)
print(model_dummy)

In [None]:
# @title 5. Research Pipeline (5-Fold CV)
# For every stratified fold: oversample the training split with SMOTE, train
# HCBAN with early stopping, train the tree ensemble, average both models'
# probabilities and record the hybrid metrics.
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

fold_metrics = {
    'accuracy': [], 'precision': [], 'recall': [], 'f1': [], 'auc': [], 'training_time': []
}

epochs = 20 # Set to 30-50 for final run
batch_size = 256
patience = 5  # epochs without validation-loss improvement before stopping


def _run_epoch(model, loader, criterion, optimizer=None):
    """Run one pass over `loader` and return (mean loss per sample, accuracy).

    With an optimizer the model is put in training mode and updated after
    every batch; without one it is put in eval mode and gradients are off.
    """
    training = optimizer is not None
    model.train(training)
    loss_sum, n_correct, n_seen = 0.0, 0, 0
    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)
            if training:
                optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()
            # criterion returns the batch mean; weight by batch size so the
            # final figure is a true per-sample mean.
            loss_sum += loss.item() * inputs.size(0)
            n_correct += (outputs.argmax(dim=1) == labels).sum().item()
            n_seen += labels.size(0)
    return loss_sum / n_seen, n_correct / n_seen


for fold, (train_index, val_index) in enumerate(skf.split(X_full_reshaped, y_full), start=1):
    print(f"\n=== Fold {fold}/{n_splits} ===")
    X_train_fold, X_val_fold = X_full_reshaped[train_index], X_full_reshaped[val_index]
    y_train_fold, y_val_fold = y_full[train_index], y_full[val_index]

    start_time = time.time()

    # --- 1. Data Augmentation (SMOTE) ---
    # Only the training split is resampled; the validation split is untouched.
    print("Applying SMOTE to balance classes...")
    X_train_2d = X_train_fold.reshape(X_train_fold.shape[0], -1)
    try:
        smote = SMOTE(random_state=42, n_jobs=-1)
    except TypeError:
        # Fallback for imbalanced-learn versions that do not accept n_jobs
        smote = SMOTE(random_state=42)
    # Resampling is guarded separately so a SMOTE failure is tolerated in the
    # same way whichever constructor call succeeded.
    try:
        X_train_res, y_train_res = smote.fit_resample(X_train_2d, y_train_fold)
    except Exception as e:
        print(f"SMOTE failed: {e}. Proceeding without SMOTE.")
        X_train_res, y_train_res = X_train_2d, y_train_fold

    # Reshape back to 3D for HCBAN
    X_train_res_3d = X_train_res.reshape(X_train_res.shape[0], X_train_res.shape[1], 1)

    # --- 2. Train HCBAN (Deep Learning) ---
    print("Training HCBAN...")

    # Seed PyTorch so weight initialisation and batch shuffling are repeatable,
    # in line with the random_state=42 used by the splitter, SMOTE and the
    # tree models.
    torch.manual_seed(42)

    # Prepare Datasets
    train_dataset = TensorDataset(
        torch.tensor(X_train_res_3d, dtype=torch.float32),
        torch.tensor(y_train_res, dtype=torch.long)
    )
    val_dataset = TensorDataset(
        torch.tensor(X_val_fold, dtype=torch.float32),
        torch.tensor(y_val_fold, dtype=torch.long)
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = HCBAN(input_channels=1, input_length=n_features, n_classes=n_classes)
    model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    best_val_loss = float('inf')
    patience_counter = 0
    history = {'loss': [], 'accuracy': [], 'val_loss': [], 'val_accuracy': []}
    checkpoint_path = f'results/fold_{fold}_best_model.pth'
    checkpoint_saved = False

    # NOTE(review): the validation split drives early stopping / checkpoint
    # selection AND the reported metrics below, so those metrics are
    # optimistic compared with a truly held-out test set.
    for epoch in range(epochs):
        epoch_loss, epoch_acc = _run_epoch(model, train_loader, criterion, optimizer)
        val_epoch_loss, val_epoch_acc = _run_epoch(model, val_loader, criterion)

        print(f"Epoch {epoch+1}/{epochs} - Loss: {epoch_loss:.4f} - Acc: {epoch_acc:.4f} - Val Loss: {val_epoch_loss:.4f} - Val Acc: {val_epoch_acc:.4f}")

        history['loss'].append(epoch_loss)
        history['accuracy'].append(epoch_acc)
        history['val_loss'].append(val_epoch_loss)
        history['val_accuracy'].append(val_epoch_acc)

        if val_epoch_loss < best_val_loss:
            best_val_loss = val_epoch_loss
            patience_counter = 0
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_saved = True
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print("Early stopping triggered")
                break

    # Load best model, but only a checkpoint written during THIS run. A NaN
    # validation loss never beats `inf`, and loading whatever file an earlier
    # run left behind would silently evaluate the wrong weights.
    if checkpoint_saved:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    else:
        print("WARNING: validation loss never improved; keeping the final-epoch weights.")

    # --- 3. Train ML Ensemble ---
    print("Training ML Ensemble...")
    ensemble = HybridEnsemble(n_classes=n_classes)
    ensemble.fit(X_train_res, y_train_res)

    # Covers SMOTE, HCBAN training and ensemble training for this fold.
    training_time = time.time() - start_time

    # --- 4. Hybrid Prediction ---
    print("Generating Hybrid Predictions...")
    model.eval()
    p_hcban = []
    with torch.no_grad():
        for inputs, _ in val_loader:
            inputs = inputs.to(device)
            outputs = model(inputs)
            probs = torch.softmax(outputs, dim=1)
            p_hcban.append(probs.cpu().numpy())
    p_hcban = np.concatenate(p_hcban, axis=0)

    X_val_2d = X_val_fold.reshape(X_val_fold.shape[0], -1)
    p_ensemble = ensemble.predict_proba(X_val_2d)

    # Equal-weight soft vote between the network and the tree ensemble.
    y_pred_prob = (0.5 * p_hcban) + (0.5 * p_ensemble)
    y_pred = np.argmax(y_pred_prob, axis=1)

    # Calculate Metrics
    acc = accuracy_score(y_val_fold, y_pred)
    prec = precision_score(y_val_fold, y_pred, average='weighted', zero_division=0)
    rec = recall_score(y_val_fold, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_val_fold, y_pred, average='weighted', zero_division=0)
    try:
        auc_val = roc_auc_score(y_val_fold, y_pred_prob, multi_class='ovr', average='weighted')
    except ValueError as e:
        # Say why AUC is unavailable instead of hiding it. 0.0 is kept as the
        # recorded placeholder, so it drags down the mean AUC for this run.
        print(f"WARNING: AUC could not be computed for fold {fold} ({e}); recording 0.0.")
        auc_val = 0.0

    print(f"Fold {fold} Hybrid Results -> Acc: {acc:.4f}, F1: {f1:.4f}, AUC: {auc_val:.4f}")

    fold_metrics['accuracy'].append(acc)
    fold_metrics['precision'].append(prec)
    fold_metrics['recall'].append(rec)
    fold_metrics['f1'].append(f1)
    fold_metrics['auc'].append(auc_val)
    fold_metrics['training_time'].append(training_time)

    # Keep the last fold's predictions and training curves for plotting.
    if fold == n_splits:
        np.savez_compressed('results/last_fold_preds.npz', y_true=y_val_fold, y_pred_prob=y_pred_prob)
        with open('results/last_fold_history.json', 'w') as f:
            json.dump(history, f)

with open('results/research_results.json', 'w') as f:
    json.dump(fold_metrics, f)

In [None]:
# @title 6. Generate Figures & Tables

# --- 1. Performance Table ---
print("\n--- HCBAN Performance (5-Fold CV) ---")
table_data = []
for metric, values in fold_metrics.items():
    mean = np.mean(values)
    std = np.std(values)
    # Normal-approximation 95% half-width, based on the number of folds that
    # were actually recorded for this metric.
    # NOTE(review): with only a handful of folds this interval (population
    # std, z = 1.96) is narrower than a t-based one; confirm which is wanted.
    ci = 1.96 * std / np.sqrt(len(values))
    print(f"{metric.capitalize()}: {mean:.4f} ± {ci:.4f}")
    table_data.append({'Metric': metric.capitalize(), 'Mean': mean, 'CI': ci})

pd.DataFrame(table_data).to_csv('results/thesis_table.csv', index=False)

# --- 2. ROC Curve (Last Fold) ---
# The context manager closes the archive once both arrays are read.
with np.load('results/last_fold_preds.npz') as data:
    y_true, y_score = data['y_true'], data['y_pred_prob']

from sklearn.preprocessing import label_binarize
y_test_bin = label_binarize(y_true, classes=list(range(n_classes)))
if y_test_bin.shape[1] == 1:
    # With two classes label_binarize returns one column (the positive
    # class); add the complementary column so there is one column per class.
    y_test_bin = np.hstack([1 - y_test_bin, y_test_bin])

fpr = dict()
tpr = dict()
roc_auc = dict()

plt.figure(figsize=(10, 8))
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
    plt.plot(fpr[i], tpr[i], lw=2, label=f'Class {class_names[i]} (AUC = {roc_auc[i]:.2f})')

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.title('Multi-Class ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.savefig('plots/research/roc_curve.png')
plt.show()

# --- 3. Confusion Matrix ---
y_pred_classes = np.argmax(y_score, axis=1)
# Fix the label set so the matrix always has one row/column per class and
# lines up with class_names, even if a class is absent from this fold.
cm = confusion_matrix(y_true, y_pred_classes, labels=list(range(n_classes)))
# Row-normalise; a class with no true samples gets a row of zeros rather than
# a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)

plt.figure(figsize=(12, 10))
sns.heatmap(cm_norm, annot=True, fmt='.2f', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title('Normalized Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.savefig('plots/research/confusion_matrix.png')
plt.show()

In [None]:
# @title 7. Download Results
# Shell escape: recursively zip the results/ and plots/ folders into one archive.
!zip -r thesis_results.zip results/ plots/
# Hand the archive to google.colab's `files` helper (imported in the upload cell).
files.download('thesis_results.zip')