In [1]:
# Cell 1: Setup
%load_ext autoreload
%autoreload 2

import sys
import gc
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import accuracy_score

# Setup path: make the directory that contains the `src` package importable.
# The notebook previously assumed that directory is exactly one level above the
# working directory, which fails with "No module named 'src'" when the kernel is
# started elsewhere. Search the working directory and its ancestors (nearest
# first) and fall back to the original assumption if nothing is found.
_notebook_cwd = Path.cwd()
project_root = next(
    (
        candidate
        for candidate in (_notebook_cwd, *_notebook_cwd.parents)
        if (candidate / "src").is_dir()
    ),
    _notebook_cwd.parent,
)
# Guard against piling up duplicate entries when this cell is re-run.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))
from src import config, preprocessing, autoencoder, rf_classifier, evaluation, utils

# Create a separate results folder for this Cross-Test run
exp_paths = utils.setup_experiment_folder()
print(f"üìÇ Saving results to: {exp_paths['root']}")

ModuleNotFoundError: No module named 'src'

In [None]:
# ==============================================================================
# SCENARIO 1: TRAIN ON 2017 -> TEST ON 2018
# ==============================================================================

# Cell 2: Load Data 2017 (TRAIN)
print("\n--- 1. LOADING TRAIN SET (CIC-IDS2017) ---")
# Per the original note, the loader returns a DataFrame; it is kept around
# unchanged so the raw table can still be inspected at this early stage.
df_train, y_train = preprocessing.load_single_dataset_year('2017', binary_mode=True)

# The models are fed plain NumPy arrays, so take the frame's underlying values.
X_train = df_train.values

train_shape = X_train.shape
print(f"Training Data Shape: {train_shape}")

# Per-class sample counts of the training labels.
label_counts = pd.Series(y_train).value_counts()
print(f"Label Distribution: {label_counts}")

In [None]:
# Cell 3: Scaling (Fit on Train)
# The scaler is fitted on the 2017 training matrix only; the same fitted object
# is reused later in the notebook to transform the 2018 test matrix.
print("\n--- 2. SCALING ---")
scaler = preprocessing.get_scaler()
X_train_scaled = scaler.fit_transform(X_train)
print("‚úÖ Scaler fitted on 2017.")

In [None]:
# Cell 4: Train Autoencoder on 2017
print("\n--- 3. TRAIN AUTOENCODER ---")

# Architecture settings are read from the project config.
ae_hyperparams = dict(
    input_dim=config.AE_INPUT_DIM,
    latent_dim=config.AE_LATENT_DIM,
    hidden_layers=config.AE_HIDDEN_LAYERS,
)
ae_model = autoencoder.DeepAutoencoder(**ae_hyperparams)

# The save path points into the current experiment's models folder.
ae_save_path = exp_paths['models'] / "ae_model_2017.pth"

# Train the AE on the scaled 2017 data; the returned model replaces the fresh one.
ae_model = autoencoder.train_ae(
    ae_model,
    X_train_scaled,
    save_path=ae_save_path,
)

# Latent representation of the training set, used for feature fusion later.
print("Extracting latent features for Train set...")
X_train_latent = autoencoder.extract_features(ae_model, X_train_scaled)

In [None]:
# Cell 5: Prepare Fusion Data for Train
print("\n--- 4. PREPARE FUSION DATA (TRAIN) ---")

# Positions of the mRMR features inside the full selected-feature list.
# Assumes the columns of X_train_scaled follow config.SELECTED_FEATURES order
# -- TODO confirm against the loader.
all_feature_names = config.SELECTED_FEATURES
mrmr_indices = list(map(all_feature_names.index, config.mRMR_FEATURES))

# Fused matrix: the mRMR columns of the scaled data followed by the AE latent features.
X_train_mrmr = X_train_scaled[:, mrmr_indices]
X_train_fusion = np.hstack((X_train_mrmr, X_train_latent))

print(f"Fusion Train Shape: {X_train_fusion.shape}")

# Drop the heavy intermediates to free RAM for the test set.
# X_train_mrmr and X_train_fusion are intentionally kept.
del X_train
del X_train_scaled
del df_train
gc.collect()

In [None]:
# Cell 6: Train Random Forest on 2017
# Calls rf_classifier.train_rf on the fused 2017 features and their labels,
# passing a save path inside this experiment's models folder.
print("\n--- 5. TRAIN CLASSIFIER (RF) ---")
rf_save_path = exp_paths['models'] / "rf_model_2017.joblib"
rf_model = rf_classifier.train_rf(X_train_fusion, y_train, save_path=rf_save_path)
print("‚úÖ RF Model Trained on 2017.")

In [None]:
# ==============================================================================
# MOVING ON TO THE TEST PHASE ON 2018
# ==============================================================================

# Cell 7: Load Data 2018 (TEST)
# Same loader call as for the training year, this time with year '2018'.
print("\n--- 6. LOADING TEST SET (CSE-CIC-IDS2018) ---")
df_test, y_test = preprocessing.load_single_dataset_year('2018', binary_mode=True)
# Take the underlying values, mirroring how X_train was built from df_train.
X_test = df_test.values

print(f"Test Data Shape: {X_test.shape}")

In [None]:
# Cell 8: Process Test Data (reuse the existing Scaler & AE)
# Nothing is re-fitted here: the test set goes through the scaler and the
# autoencoder that were fitted/trained on 2017.
print("\n--- 7. PROCESSING TEST DATA ---")

# A. Scale (using the 2017 scaler; transform only, no fit)
print("Scaling using 2017 scaler...")
X_test_scaled = scaler.transform(X_test)

# B. Extract Latent (using the 2017 AE)
print("Extracting latent features...")
X_test_latent = autoencoder.extract_features(ae_model, X_test_scaled)

# C. Slice mRMR Features (same column indices that were computed for the train set)
print("Slicing mRMR features...")
X_test_mrmr = X_test_scaled[:, mrmr_indices]

# D. Fusion (mRMR columns followed by latent features, same layout as the train fusion)
print("Fusing...")
X_test_fusion = np.hstack([X_test_mrmr, X_test_latent])
print(f"Fusion Test Shape: {X_test_fusion.shape}")

# Clean RAM: drop the large intermediates; X_test_mrmr and X_test_fusion are kept
del X_test, X_test_scaled, df_test
gc.collect()

In [None]:
# Cell 9: Evaluate Cross-Dataset
# Scores the 2017-trained hybrid classifier on the fused 2018 features.
# The returned `metrics` mapping is read below via its 'accuracy' and 'mcc'
# keys, and again by the later comparison cell.
print("\n--- 8. FINAL EVALUATION (Train: 2017 -> Test: 2018) ---")
metrics = evaluation.evaluate_model(
    rf_model, 
    X_test_fusion, 
    y_test, 
    save_dir=exp_paths['figures'],
    dataset_name="Cross-Test (Train 17 - Test 18)"
)

# Headline numbers, printed with four decimals.
print("\n=== K·∫æT QU·∫¢ ===")
print(f"Accuracy: {metrics['accuracy']:.4f}")
print(f"MCC:      {metrics['mcc']:.4f}")

In [None]:
# ==============================================================================
# BATTLE: HYBRID MODEL (AE+RF) vs. PURE RF (BASELINE)
# ==============================================================================

print("\nü•ä B·∫ÆT ƒê·∫¶U SO S√ÅNH: PROPOSED METHOD vs. BASELINE ü•ä")

# --- 1. Train Baseline (Pure RF with mRMR Features) ---
print("üëâ Training Baseline RF (Only mRMR 20 features)...")
# X_train_mrmr was already created in the fusion-preparation cell (sliced from X_train_scaled).
# It is not among the names deleted there (only X_train, X_train_scaled and df_train are),
# so it is still available here when the notebook is run top to bottom.
# If it is missing (e.g. cells were run out of order), it has to be rebuilt; X_train_scaled
# has been deleted by then, so rebuilding means reloading the 2017 data.

# Fallback to rebuild the baseline training data (run only if X_train_mrmr is gone):
# df_train_temp, y_train_temp = preprocessing.load_single_dataset_year('2017', binary_mode=True)
# scaler_temp = preprocessing.get_scaler()
# X_train_sc_temp = scaler_temp.fit_transform(df_train_temp.values)
# mrmr_indices = [config.SELECTED_FEATURES.index(f) for f in config.MRMR_FEATURES]
# NOTE(review): the fusion-preparation cell reads config.mRMR_FEATURES, while the line above
# uses config.MRMR_FEATURES -- confirm the attribute name in src/config before using this snippet.
# X_train_mrmr = X_train_sc_temp[:, mrmr_indices]
# y_train = y_train_temp
# -------------------------------------------------------------

# Baseline: same training helper and labels, but fed only the mRMR columns (no AE latent features).
rf_baseline = rf_classifier.train_rf(
    X_train_mrmr, 
    y_train, 
    save_path=exp_paths['models'] / "rf_baseline_2017.joblib"  # <-- saved into the current experiment folder
) 

# --- 2. Evaluate Baseline ---
print("Evaluating Baseline on Test set (2018)...")
# Likewise, X_test_mrmr must be available; it is created in the test-processing cell
# and is not among the names deleted there.
# No save directory is passed for the baseline (save_dir=None), unlike the hybrid
# evaluation -- presumably this skips writing figures; confirm in evaluation.evaluate_model.
baseline_metrics = evaluation.evaluate_model(
    rf_baseline, 
    X_test_mrmr, 
    y_test, 
    save_dir=None,
    dataset_name="Baseline (Pure RF)"
)

# --- 3. Direct comparison ---
# Hybrid results come from the earlier evaluation cell.
# Note: `metrics` holds the Hybrid model's results from that cell, so it must have
# been run before this one.
hybrid_acc = metrics['accuracy'] 
baseline_acc = baseline_metrics['accuracy']

hybrid_mcc = metrics['mcc']
baseline_mcc = baseline_metrics['mcc']

# Also pull the recall of the 'Attack' class for comparison (the most important figure).
# Assumes both results carry a nested 'report' mapping with an 'Attack' entry -- TODO confirm
# against evaluation.evaluate_model.
hybrid_recall = metrics['report']['Attack']['recall']
baseline_recall = baseline_metrics['report']['Attack']['recall']

print("\nüìä === B·∫¢NG K·∫æT QU·∫¢ ƒê·ªêI ƒê·∫¶U (Train 17 -> Test 18) ===")
# Comparison table: one row per metric with baseline, hybrid, and the signed difference.
print(f"{'Metric':<20} | {'Baseline (Pure RF)':<20} | {'Hybrid (Proposed)':<20} | {'Improvement':<15}")
print("-" * 85)

comparison_rows = (
    ('Accuracy', baseline_acc, hybrid_acc),
    ('MCC', baseline_mcc, hybrid_mcc),
    ('Recall (Attack)', baseline_recall, hybrid_recall),
)
for metric_label, baseline_value, hybrid_value in comparison_rows:
    # The column width (20) is part of the format spec instead of a fixed run of
    # 14 padding spaces, so a value with a minus sign (e.g. an MCC below zero)
    # no longer pushes the following columns out of alignment. Output for
    # non-negative values is unchanged.
    print(
        f"{metric_label:<20} | {baseline_value:<20.4f} | {hybrid_value:<20.4f} | "
        f"{hybrid_value - baseline_value:+.4f}"
    )

# --- 4. Plot the comparison chart ---
# NumPy (`np`) is already imported in the setup cell, so only pyplot is imported here.
import matplotlib.pyplot as plt

metrics_names = ['Accuracy', 'MCC', 'Recall (Attack)']
baseline_scores = [baseline_acc, baseline_mcc, baseline_recall]
hybrid_scores = [hybrid_acc, hybrid_mcc, hybrid_recall]

# One slot per metric; the two series sit side by side within each slot.
x = np.arange(len(metrics_names))
width = 0.35

fig, ax = plt.subplots(figsize=(10, 6))
rects1 = ax.bar(x - width/2, baseline_scores, width, label='Baseline (Pure RF)', color='gray')
rects2 = ax.bar(x + width/2, hybrid_scores, width, label='Hybrid (AE+RF)', color='royalblue')

ax.set_ylabel('Scores')
ax.set_title('Cross-Dataset Comparison: Pure RF vs Hybrid')
ax.set_xticks(x)
ax.set_xticklabels(metrics_names)
ax.legend()

# Keep the original [0, 1.05] range when every score is non-negative. If any
# score is below zero (possible for MCC), extend the lower limit so the bar is
# not clipped, and draw the zero line for reference.
lowest_score = min(baseline_scores + hybrid_scores)
y_floor = lowest_score - 0.05 if lowest_score < 0 else 0
ax.set_ylim([y_floor, 1.05])
if lowest_score < 0:
    ax.axhline(0, color='black', linewidth=0.8)

def autolabel(rects, axes=None):
    """Write each bar's height, formatted to four decimals, next to the bar tip.

    Args:
        rects: Iterable of bar objects exposing ``get_height()``, ``get_x()``
            and ``get_width()`` (e.g. the container returned by ``ax.bar``).
        axes: Object whose ``annotate`` method is called for every bar.
            Defaults to the notebook-level ``ax``, so existing
            ``autolabel(rects)`` calls keep working.

    Bars with a non-negative height get the label just above the tip. Bars
    with a negative height get it just below the tip, so the text does not
    overlap the bar.
    """
    target_axes = ax if axes is None else axes
    for rect in rects:
        height = rect.get_height()
        points_down = height < 0
        target_axes.annotate(
            f'{height:.4f}',
            xy=(rect.get_x() + rect.get_width() / 2, height),
            # 3 points away from the bar tip, on the side facing away from zero.
            xytext=(0, -3 if points_down else 3),
            textcoords="offset points",
            ha='center',
            va='top' if points_down else 'bottom',
        )

# Annotate every bar in both series with its value (baseline first, then hybrid).
for bar_group in (rects1, rects2):
    autolabel(bar_group)

# Tighten the layout, write the PNG into this experiment's figures folder, then render inline.
plt.tight_layout()
comparison_png = exp_paths['figures'] / "cross_dataset_comparison.png"
plt.savefig(comparison_png)
plt.show()