In [1]:
%load_ext autoreload
%autoreload 2

import sys
import gc
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

# Make the parent of the notebook's working directory importable so that the
# `src` package (config, preprocessing, evaluation, utils) can be resolved.
project_root = Path.cwd().parent
sys.path.append(str(project_root))
from src import config, preprocessing, evaluation, utils

# Create the folder that stores the results of this Mix-training scenario.
# The returned mapping is indexed below with 'root' (and by later cells with
# 'models' and 'figures').
exp_paths = utils.setup_experiment_folder()
print(f"üìÇ Saving Mix-training SVM results to: {exp_paths['root']}")

üìÇ New Experiment Created: C:\Users\Admin\Documents\ids_ae_rf_hybrid\results\experiments\exp_20251221_163337
üìÇ Saving Mix-training SVM results to: C:\Users\Admin\Documents\ids_ae_rf_hybrid\results\experiments\exp_20251221_163337


In [2]:
print("\n--- 1. LOADING & SAMPLING MIXED DATASETS ---")

# Number of rows drawn from each dataset year before the per-year split.
N_SAMPLES_PER_YEAR = 50000


def sample_year(year, n_rows, seed):
    """Load one dataset year and return an aligned random subsample.

    Row positions are drawn once and then applied to BOTH the feature frame
    and the label array, so features and labels stay aligned even when the
    loaded frame does not carry a default 0..N-1 index. (Indexing the labels
    with ``.iloc[sampled_frame.index]`` treats index *labels* as *positions*
    and is only correct for a default RangeIndex.)

    Assumes the loader returns labels in the same row order as the features
    (positional correspondence) -- TODO confirm against
    ``preprocessing.load_single_dataset_year``.

    Args:
        year: Dataset year identifier passed to the loader (e.g. '2017').
        n_rows: Number of rows to draw without replacement.
        seed: Random state forwarded to pandas' sampler.

    Returns:
        Tuple ``(features_sample, labels_sample)``: a DataFrame with
        ``n_rows`` rows and a NumPy array with the matching labels.

    Raises:
        ValueError: If the loader returns a different number of feature rows
            and labels, or if ``n_rows`` exceeds the number of rows
            (raised by pandas).
    """
    features, labels = preprocessing.load_single_dataset_year(year, binary_mode=True)
    labels = np.asarray(labels)
    if len(labels) != len(features):
        raise ValueError(
            f"Dataset {year}: {len(features)} feature rows but {len(labels)} labels"
        )

    # Sample integer positions (not index labels) so the same selection can be
    # applied positionally to both containers.
    positions = (
        pd.Series(np.arange(len(features)))
        .sample(n=n_rows, random_state=seed)
        .to_numpy()
    )
    return features.iloc[positions], labels[positions]


# Each full-year frame only lives inside sample_year(), so the 2017 data can
# be released before the (larger) 2018 data is loaded.
df_17_s, y_17_s = sample_year('2017', N_SAMPLES_PER_YEAR, config.RANDOM_STATE)
df_18_s, y_18_s = sample_year('2018', N_SAMPLES_PER_YEAR, config.RANDOM_STATE)
gc.collect()

# Split train/test independently for each year BEFORE mixing, so that each
# year keeps its own stratified hold-out set.
X_train_17, X_test_17, y_train_17, y_test_17 = train_test_split(
    df_17_s, y_17_s, test_size=0.2, random_state=config.RANDOM_STATE, stratify=y_17_s
)
X_train_18, X_test_18, y_train_18, y_test_18 = train_test_split(
    df_18_s, y_18_s, test_size=0.2, random_state=config.RANDOM_STATE, stratify=y_18_s
)

# Merge the two per-year training splits into the MIX (mixed-training) set.
X_train_mix = pd.concat([X_train_17, X_train_18])
y_train_mix = np.concatenate([y_train_17, y_train_18])

# Features and labels must still line up after sampling, splitting and merging.
assert len(X_train_mix) == len(y_train_mix)

print(f"Final Mixed Training Set Shape: {X_train_mix.shape}")


--- 1. LOADING & SAMPLING MIXED DATASETS ---
üîÑ Loading dataset year 2017 (Binary=True)...
‚úÖ Loaded 2017. Shape: (2830743, 65)
üîÑ Loading dataset year 2018 (Binary=True)...
‚úÖ Loaded 2018. Shape: (9625148, 65)
Final Mixed Training Set Shape: (80000, 65)


In [3]:
print("\n--- 2. PREPROCESSING & FEATURE SELECTION ---")

# Positions of the first 25 entries of config.mRMR_FEATURES within
# config.SELECTED_FEATURES.
# NOTE(review): using these positions to slice the raw value matrix assumes
# the DataFrame columns are ordered exactly like config.SELECTED_FEATURES --
# confirm against the loader.
all_features = config.SELECTED_FEATURES
mrmr_25_list = config.mRMR_FEATURES[:25]
indices = list(map(all_features.index, mrmr_25_list))

# Keep only the selected feature columns (positional slice of the values).
X_train_mix_f = X_train_mix.to_numpy()[:, indices]

# Fit the scaler on the MIX training set only, then transform that same set.
scaler = preprocessing.get_scaler()
X_train_mix_scaled = scaler.fit_transform(X_train_mix_f)

# Persist the fitted scaler next to the other model artifacts.
scaler_path = exp_paths['models'] / "scaler_mix_svm.joblib"
joblib.dump(scaler, scaler_path)


--- 2. PREPROCESSING & FEATURE SELECTION ---


['C:\\Users\\Admin\\Documents\\ids_ae_rf_hybrid\\results\\experiments\\exp_20251221_163337\\models\\scaler_mix_svm.joblib']

In [4]:
print("\n--- 3. TRAINING SVM ON MIXED DATASET ---")

# Hyper-parameters of the RBF-kernel SVM trained on the mixed 2017+2018 set.
svm_params = dict(
    kernel='rbf',
    C=1.0,
    probability=True,
    class_weight='balanced',
    random_state=config.RANDOM_STATE,
    verbose=True,
)

# fit() returns the estimator itself, so construction and training chain.
svm_model = SVC(**svm_params).fit(X_train_mix_scaled, y_train_mix)

# Persist the trained model in the experiment's models folder.
svm_model_path = exp_paths['models'] / "svm_mix_model.joblib"
joblib.dump(svm_model, svm_model_path)
print("‚úÖ SVM Mixed-Training Complete.")


--- 3. TRAINING SVM ON MIXED DATASET ---
[LibSVM]‚úÖ SVM Mixed-Training Complete.


In [6]:
print("\n--- 4. MULTI-SCENARIO EVALUATION ---")

# Three hold-out scenarios: each year on its own, plus both years combined.
test_scenarios = {
    "CIC2017 Holdout": (X_test_17, y_test_17),
    "CIC2018 Holdout": (X_test_18, y_test_18),
    "Combined MIX Test": (pd.concat([X_test_17, X_test_18]), np.concatenate([y_test_17, y_test_18]))
}

for name, (X_raw, y_true) in test_scenarios.items():
    print(f"\nTesting on: {name}")
    # Apply the same feature slice and the scaler fitted on the MIX training
    # set; the scaler is never refitted on test data.
    X_scaled = scaler.transform(X_raw.values[:, indices])

    metrics = evaluation.evaluate_model(
        svm_model, X_scaled, y_true,
        save_dir=exp_paths['figures'],
        dataset_name=f"SVM MIX - {name}"
    )

    print(f"Results for {name}: Accuracy={metrics['accuracy']:.4f}, MCC={metrics['mcc']:.4f}")

# The inputs of this cell (X_test_17, X_test_18) and the scaled training
# matrix are deliberately NOT deleted here. Deleting them made this cell and
# the training cell raise NameError when re-run, and deleting the test frames
# released no memory anyway because `test_scenarios` still references them.


--- 4. MULTI-SCENARIO EVALUATION ---

Testing on: CIC2017 Holdout

üìä Evaluating on SVM MIX - CIC2017 Holdout...
   ‚úÖ Accuracy: 0.8880
   ‚≠ê MCC:      0.7282
   üìù Report saved to: report_SVM_MIX_-_CIC2017_Holdout.txt
   üñºÔ∏è Confusion Matrix saved to: cm_SVM_MIX_-_CIC2017_Holdout.png
Results for CIC2017 Holdout: Accuracy=0.8880, MCC=0.7282

Testing on: CIC2018 Holdout

üìä Evaluating on SVM MIX - CIC2018 Holdout...
   ‚úÖ Accuracy: 0.8935
   ‚≠ê MCC:      0.7581
   üìù Report saved to: report_SVM_MIX_-_CIC2018_Holdout.txt
   üñºÔ∏è Confusion Matrix saved to: cm_SVM_MIX_-_CIC2018_Holdout.png
Results for CIC2018 Holdout: Accuracy=0.8935, MCC=0.7581

Testing on: Combined MIX Test

üìä Evaluating on SVM MIX - Combined MIX Test...
   ‚úÖ Accuracy: 0.8908
   ‚≠ê MCC:      0.7435
   üìù Report saved to: report_SVM_MIX_-_Combined_MIX_Test.txt
   üñºÔ∏è Confusion Matrix saved to: cm_SVM_MIX_-_Combined_MIX_Test.png
Results for Combined MIX Test: Accuracy=0.8908, MCC=0.7435


287