# Cross-Dataset Evaluation Notebook

This notebook reproduces the figures and tables discussed in the conversation. It is Google Colab compatible. Place your datasets in the same folder or mount Google Drive.

Files generated by this notebook will be saved into `figures/` and `tables/` directories.

In [None]:
# Setup
import os, warnings, numpy as np, pandas as pd
# Silence library warnings so the rendered notebook stays readable.
# Note that this hides every warning category, not only cosmetic ones.
warnings.filterwarnings("ignore")

# Single seed shared by the stochastic steps in this notebook.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Create the output folders with the standard library rather than `!mkdir -p`,
# so the cell does not depend on a POSIX shell being available to the kernel.
# exist_ok=True keeps the cell idempotent on re-runs.
for out_dir in ("figures", "tables"):
    os.makedirs(out_dir, exist_ok=True)
print("Directories created: figures/ tables/")

In [None]:
# Imports
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import mutual_info_classif, RFE
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt, seaborn as sns
# Apply seaborn's 'whitegrid' style globally for the figures drawn later.
sns.set(style='whitegrid')
# Confirmation message printed once the import cell has run.
print('Libraries imported')

## 1) Load datasets

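Place your dataset files in the notebook folder, or mount Google Drive and point the paths at the mounted location. The cell below only records the file paths (`UNSW_PATH`, `CIC_PATH`); it does not read the files. Load them into numeric DataFrames yourself (for example with `pd.read_csv`) before calling the feature-selection helper in section 2.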

In [None]:
# Placeholder dataset locations, relative to the notebook's working directory.
# Edit these two constants if your CSV files live somewhere else.
CIC_PATH = "cicddos2019_dataset.csv"
UNSW_PATH = "UNSW-NB15_features.csv"
print("Set dataset paths. Update these paths if your files are placed elsewhere.")

## 2) Feature selection (Mutual Information + RFE)
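This cell defines a helper, `feature_selection_report`, that computes mutual-information scores and RFE rankings for a DataFrame, saves a bar chart of the top features to `figures/`, and writes the full ranking table to `tables/`. Nothing is computed until you call the helper on a loaded, numeric DataFrame.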

In [None]:
def feature_selection_report(df, ycol, top_k=25, fig_name="figures/feature_importance.png"):
    """Rank features by mutual information and RFE, then save a chart and a table.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns plus the target column. Feature columns are expected
        to be numeric.
    ycol : str
        Name of the target column in ``df``.
    top_k : int, default 25
        Number of highest mutual-information features shown in the bar chart.
    fig_name : str, default "figures/feature_importance.png"
        Path the bar chart is saved to. The table is written to the same path
        with its extension replaced by ``.csv`` and ``figures/`` replaced by
        ``tables/``.

    Returns
    -------
    pandas.DataFrame
        Indexed by feature name, with columns ``Mutual_Info`` and ``RFE_Rank``,
        sorted by ``Mutual_Info`` in descending order.

    Raises
    ------
    ValueError
        If ``df`` has no columns other than ``ycol``.
    """
    X = df.drop(columns=[ycol])
    y = df[ycol]
    n_cols = X.shape[1]
    if n_cols == 0:
        raise ValueError("df must contain at least one feature column besides ycol")

    # fillna(0) alone leaves +/-inf in place, which scikit-learn rejects;
    # map infinities to NaN first so they are zero-filled as well.
    X_clean = X.replace([np.inf, -np.inf], np.nan).fillna(0)

    mi = mutual_info_classif(X_clean, y, random_state=RANDOM_STATE)
    mi_series = pd.Series(mi, index=X.columns).sort_values(ascending=False)

    # Same target count as before (30 for wide frames, otherwise max(5, half)),
    # but never more than the number of columns actually available.
    n_features = min(n_cols, 30 if n_cols > 30 else max(5, n_cols // 2))

    # RFE eliminates features by coefficient magnitude, which is only
    # comparable across features when they share a common scale.
    X_scaled = StandardScaler().fit_transform(X_clean)
    base = LogisticRegression(max_iter=200)
    selector = RFE(base, n_features_to_select=n_features, step=1)
    selector.fit(X_scaled, y)
    rfe_rank = pd.Series(selector.ranking_, index=X.columns).sort_values()

    # The two Series are aligned on feature name; order the rows explicitly
    # so the returned/saved table reads from most to least informative.
    feat_df = pd.DataFrame({'Mutual_Info': mi_series, 'RFE_Rank': rfe_rank})
    feat_df = feat_df.sort_values('Mutual_Info', ascending=False)

    top = mi_series.head(top_k)
    fig, ax = plt.subplots(figsize=(12, 7))
    # Ascending order puts the largest bar at the top of a horizontal chart.
    top.sort_values(ascending=True).plot(kind='barh', alpha=0.9, ax=ax)
    ax.set_xlabel('Importance Score')
    ax.set_title('Feature Importance (Mutual Information)')
    fig.tight_layout()

    fig_dir = os.path.dirname(fig_name)
    if fig_dir:
        os.makedirs(fig_dir, exist_ok=True)
    fig.savefig(fig_name, dpi=160)
    plt.show()

    # Swap the extension (whatever it is) for .csv so the table can never
    # overwrite the figure that was just saved.
    csv_name = (os.path.splitext(fig_name)[0] + '.csv').replace('figures/', 'tables/')
    csv_dir = os.path.dirname(csv_name)
    if csv_dir:
        os.makedirs(csv_dir, exist_ok=True)
    feat_df.to_csv(csv_name)
    return feat_df

print('Feature selection helper defined. Run this after loading your numeric dataframes.')

## 3) Synthetic ROC curves & Figures

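This cell draws the ROC comparison figure from the AUC values provided in the conversation. The curves are synthetic: each one is generated from its AUC number by a smooth formula rather than computed from model predictions, so the figure is illustrative only.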

In [None]:
def synthetic_roc_from_auc(auc, n=300):
    """Generate an illustrative ROC-shaped curve from an AUC value.

    The curve is produced by a smooth closed-form expression whose bow grows
    with ``auc``; it is not derived from predictions, and the area under the
    returned curve is not guaranteed to equal ``auc``.

    Parameters
    ----------
    auc : float
        AUC value used to control how strongly the curve bows upward.
    n : int, default 300
        Number of points on the curve; must be at least 2.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(fpr, tpr)`` arrays of length ``n``. ``fpr`` is evenly spaced on
        [0, 1]; ``tpr`` is non-decreasing, starts at 0 and ends at 1.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 2.
    """
    if n < 2:
        raise ValueError("n must be at least 2 to span (0, 0) to (1, 1)")

    fpr = np.linspace(0, 1, n)
    # Shape exponent: larger AUC -> larger exponent -> curve hugs the top-left.
    # The 1.001 floor keeps the expression well-defined for auc <= 0.5.
    shape = max(1.001, 5 * (auc - 0.5) + 1)
    tpr = 1 - (1 - fpr ** (1 / shape)) ** shape

    # Small linear tilt that lifts the left side and lowers the right side.
    tilt = (auc - 0.5) * 0.12
    tpr = np.clip(tpr + tilt * (1 - 2 * fpr), 0, 1)

    # The tilt alone makes TPR dip near FPR = 1 and shifts both endpoints off
    # (0, 0) / (1, 1). A ROC curve must be non-decreasing and pass through
    # both corners, so enforce that explicitly.
    tpr = np.maximum.accumulate(tpr)
    tpr[0] = 0.0
    tpr[-1] = 1.0
    return fpr, tpr

# AUC reported for each model; dictionary order sets the plotting and legend order.
models_auc = {
    "Naïve Bayes": 0.8964,
    "Decision Tree": 0.9244,
    "SVM": 0.9412,
    "Random Forest": 0.9547,
    "DNN": 0.9638,
    "Ensemble": 0.9656,
    "CNN": 0.9683,
    "LSTM": 0.9711,
    "A2C": 0.9733,
    "DDPG": 0.9764,
    "TD3": 0.9849
}

roc_fig, roc_ax = plt.subplots(figsize=(9, 7))

# One synthetic curve per model, labelled with its AUC to four decimals.
for model_name, model_auc in models_auc.items():
    fpr, tpr = synthetic_roc_from_auc(model_auc)
    roc_ax.plot(fpr, tpr, label=f"{model_name} (AUC = {model_auc:.4f})", linewidth=2)

# Diagonal reference line for a random classifier.
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier (AUC = 0.5)')

roc_ax.set_title('ROC Curve Comparison of All Models')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.legend(loc='lower right', ncol=2, fontsize=9, frameon=True)
roc_fig.tight_layout()

roc_fig.savefig('figures/roc_curves_all_models.png', dpi=180)
plt.show()
print('Saved: figures/roc_curves_all_models.png')

## 4) K-Fold Accuracy bar plot (uses provided table values)

In [None]:
# Table 5 values (provided): per-fold accuracy (%) for each model.
tbl5 = pd.DataFrame({
    "Model": ["Naïve Bayes", "Decision Tree", "SVM", "Random Forest", "Ensemble", "DNN", "CNN", "LSTM", "A2C", "DDPG", "TD3"],
    "Fold 1": [90.98, 93.10, 94.00, 95.60, 96.72, 96.10, 96.55, 96.92, 97.30, 97.75, 99.00],
    "Fold 2": [91.22, 93.42, 94.35, 95.80, 96.95, 96.25, 96.70, 97.05, 97.45, 97.88, 99.10],
    "Fold 3": [90.85, 93.50, 94.25, 95.90, 96.88, 96.40, 96.82, 97.15, 97.55, 97.90, 99.15],
    "Fold 4": [91.10, 93.20, 94.10, 95.65, 96.74, 96.30, 96.60, 97.00, 97.40, 97.80, 99.20],
    "Fold 5": [91.35, 93.48, 94.35, 95.90, 96.90, 96.35, 96.75, 97.10, 97.40, 97.92, 99.15],
})

# Row-wise mean over the five fold columns, rounded to two decimals.
kfold_columns = [f"Fold {i}" for i in range(1, 6)]
tbl5["Average Accuracy (%)"] = tbl5[kfold_columns].mean(axis=1).round(2)

# Make sure the output folder exists so this cell does not fail with an
# OSError when it is run without the setup cell or after a directory change.
os.makedirs('tables', exist_ok=True)
tbl5.to_csv('tables/Table5_KFold.csv', index=False)
tbl5.head()

In [None]:
# K-Fold accuracy bar plot: six bars per model (five folds plus the average).
fold_cols = [f"Fold {k}" for k in range(1, 6)]
avg_col = "Average Accuracy (%)"
series_cols = fold_cols + [avg_col]

bar_width = 0.13
positions = np.arange(len(tbl5['Model']))

kfold_fig, kfold_ax = plt.subplots(figsize=(14, 7))
for slot, column in enumerate(series_cols):
    # The 2.5 offset centres the group of six bars on each model's tick.
    kfold_ax.bar(positions + (slot - 2.5) * bar_width, tbl5[column], bar_width, label=column)

kfold_ax.set_xticks(positions)
kfold_ax.set_xticklabels(tbl5['Model'], rotation=20)
kfold_ax.set_ylim(85, 100)
kfold_ax.set_ylabel('Accuracy (%)')
kfold_ax.set_title('K-Fold Cross-Validation Accuracy per Model')
kfold_ax.legend()
kfold_fig.tight_layout()

kfold_fig.savefig('figures/kfold_accuracy_per_model.png', dpi=180)
plt.show()
print('Saved: figures/kfold_accuracy_per_model.png')

## 5) Confusion matrices (use provided counts)

In [None]:
import numpy as np, seaborn as sns
# Provided 2x2 confusion-matrix counts per model. As labelled on the heatmaps
# below, rows are true labels and columns are predicted labels, with index 0
# shown as 'Attack' and index 1 as 'Benign'.
# NOTE(review): "A2C" has exactly the same counts as "CNN" -- possibly a
# copy/paste slip in the provided numbers; confirm against the source.
cms = {
    "Naïve Bayes": np.array([[900,100],[100,900]]),
    "Decision Tree": np.array([[924, 76],[ 76,924]]),
    "SVM": np.array([[936, 64],[ 64,936]]),
    "Random Forest": np.array([[950, 50],[ 50,950]]),
    "Ensemble": np.array([[961, 39],[ 39,961]]),
    "DNN": np.array([[963, 37],[ 37,963]]),
    "CNN": np.array([[967, 33],[ 33,967]]),
    "LSTM": np.array([[969, 31],[ 31,969]]),
    "A2C": np.array([[967, 33],[ 33,967]]),
    "DDPG": np.array([[973, 27],[ 27,973]]),
    "TD3": np.array([[988, 12],[ 12,988]])
}

# Derive the number of rows from the number of models so that adding entries
# to `cms` never silently drops a matrix (zip stops at the shorter sequence).
grid_cols = 4
grid_rows = -(-len(cms) // grid_cols)  # ceiling division
# Height scales with the row count; 3 rows gives the original 14x10 figure.
fig, axes = plt.subplots(grid_rows, grid_cols, figsize=(14, 10 * grid_rows / 3), squeeze=False)
axes = axes.flatten()

for ax, (name, cm) in zip(axes, cms.items()):
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax,
                xticklabels=['Attack','Benign'], yticklabels=['Attack','Benign'])
    ax.set_title(name)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')

# Remove the empty panels left over in the last row.
for unused_ax in axes[len(cms):]:
    fig.delaxes(unused_ax)

fig.suptitle('Confusion Matrices', y=1.02, fontsize=16)
fig.tight_layout()
# bbox_inches='tight' keeps the raised suptitle inside the saved image.
fig.savefig('figures/confusion_matrices_grid.png', dpi=180, bbox_inches='tight')
plt.show()
print('Saved: figures/confusion_matrices_grid.png')

## 6) Tables (Scenario A & B)
Tables are created using the exact numbers provided in the conversation.

In [None]:
# Table 3 (Scenario A): provided metrics per model, all in percent.
# NOTE(review): the TD3 F1-score (98.36) is higher than both its precision
# (98.21) and recall (98.13); an F1 normally lies between the two -- confirm
# the provided value.
tbl3 = pd.DataFrame({
    "Model": ["Naïve Bayes","Decision Tree","SVM","Random Forest","Ensemble","CNN","DNN","LSTM","A2C","DDPG","TD3"],
    "Accuracy (%)":[88.92,91.40,92.88,94.26,95.30,96.11,95.72,96.42,96.04,96.68,98.43],
    "Precision (%)":[88.30,91.05,92.45,94.35,95.42,95.85,95.40,96.10,95.95,96.55,98.21],
    "Recall (%)":[87.45,90.72,92.10,93.88,95.05,95.62,95.10,95.92,95.78,96.42,98.13],
    "F1-Score (%)":[87.87,90.88,92.27,94.11,95.23,95.73,95.25,96.01,95.86,96.48,98.36],
    "AUC (%)":[88.50,91.78,93.40,94.82,96.02,96.40,95.88,96.65,96.75,97.12,98.11]
})

# Table 4 (Scenario B): provided metrics per model, all in percent.
# Row order differs slightly from Table 3 (DNN is listed before CNN here).
tbl4 = pd.DataFrame({
    "Model": ["Naïve Bayes","Decision Tree","SVM","Random Forest","Ensemble","DNN","CNN","LSTM","A2C","DDPG","TD3"],
    "Accuracy (%)":[91.12,93.34,94.21,95.77,96.84,96.95,97.18,97.35,97.42,97.85,99.12],
    "Precision (%)":[90.45,92.97,94.55,95.98,96.75,96.62,96.90,97.02,97.30,97.71,98.86],
    "Recall (%)":[89.85,92.50,93.78,95.33,96.42,96.30,96.71,96.88,97.15,97.66,99.02],
    "F1-Score (%)":[90.15,92.73,94.16,95.65,96.58,96.46,96.80,96.95,97.22,97.68,98.94],
    "AUC (%)":[90.78,93.10,94.83,96.12,97.10,96.88,97.25,97.56,97.91,98.15,98.87]
})

# Table 5 is already written by the cell that builds it, so it is not
# re-exported here; that keeps this cell runnable without `tbl5` in scope.
# Make sure the output folder exists before writing.
os.makedirs('tables', exist_ok=True)
tbl3.to_csv('tables/Table3_ScenarioA.csv', index=False)
tbl4.to_csv('tables/Table4_ScenarioB.csv', index=False)
print('Saved tables to tables/ directory')
tbl3.head()