In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import joblib
import os

In [None]:
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))

binary_path = os.path.join(ROOT_DIR, "TrafficLabelling", "data_binary.csv")
multi_path = os.path.join(ROOT_DIR, "TrafficLabelling", "data_multi_class.csv")

data_binary = pd.read_csv(binary_path)
data_multiclass = pd.read_csv(multi_path)

In [51]:
X_binary_train, X_binary_test, y_binary_train, y_binary_test = train_test_split(
    data_binary.drop(columns=['Label']), data_binary['Label'],
    test_size=0.2,
    stratify=data_binary['Label'],  # ✅ giữ tỉ lệ lớp
    random_state=42
)
X_multiclass_train, X_multiclass_test, y_multiclass_train, y_multiclass_test = train_test_split(
    data_multiclass.drop(columns=['Label']), data_multiclass['Label'],
    test_size=0.2,
    stratify=data_multiclass['Label'],  # ✅ giữ tỉ lệ lớp
    random_state=42
)

In [52]:
# 1. Nhóm feature dạng count/flag (0/1 hoặc nhỏ)
flag_features = ['Fwd PSH Flags', 'Fwd URG Flags', 'FIN Flag Count',
                 'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count', 'URG Flag Count']

# 2. Nhóm feature có outlier lớn, lệch mạnh
outlier_features = ['Flow Duration', 'Flow Bytes/s', 'Flow Packets/s',
                    'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Min',
                    'Fwd IAT Std', 'Fwd IAT Min',
                    'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
                    'Active Mean', 'Active Std', 'Active Max', 'Idle Std']

# 3. Nhóm còn lại (Port, Length, Ratio, Packet stats…) outlier vừa phải, lệnh vừa
other_features = [col for col in data_binary.columns if col not in flag_features + outlier_features +["Label"] ]
other_features

['Destination Port',
 'Total Fwd Packets',
 'Total Length of Fwd Packets',
 'Fwd Packet Length Max',
 'Fwd Packet Length Min',
 'Fwd Packet Length Mean',
 'Bwd Packet Length Min',
 'Bwd Packets/s',
 'Min Packet Length',
 'Packet Length Mean',
 'Down/Up Ratio',
 'min_seg_size_forward']

In [53]:
class LogTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Đảm bảo đầu vào là DataFrame hoặc ndarray
        X = np.asarray(X, dtype=np.float64)
        # Clip để tránh log âm và log(0)
        X = np.clip(X, a_min=0, a_max=None)
        return np.log1p(X)
flag_pipeline = 'passthrough'

# Nhóm 2: outlier -> log rồi StandardScaler
outlier_pipeline = Pipeline([
    ('log', LogTransformer()),
    ('scale', StandardScaler())
])

# Nhóm 3: other -> RobustScaler để giảm outlier
other_pipeline = Pipeline([
    ('scale', RobustScaler())
])

# ===== 4. Kết hợp lại thành ColumnTransformer =====
preprocessor = ColumnTransformer(
    transformers=[
        ('flag', flag_pipeline, flag_features),
        ('outlier', outlier_pipeline, outlier_features),
        ('other', other_pipeline, other_features)
    ],
    remainder='drop'  # bỏ các cột khác (Label)
)

# ===== 5. Tạo pipeline tổng =====
scaling_pipeline = Pipeline(steps=[
    ('preprocess', preprocessor)
])

In [54]:
# --- Fit scaler CHỈ TRÊN TRAIN ---
scaling_pipeline.fit(X_binary_train)

# --- Transform cả train & test ---
X_binary_train_scaled = scaling_pipeline.transform(X_binary_train)
X_binary_test_scaled = scaling_pipeline.transform(X_binary_test)

X_multiclass_train_scaled = scaling_pipeline.transform(X_multiclass_train)
X_multiclass_test_scaled = scaling_pipeline.transform(X_multiclass_test)

X_binary_train_scaled = pd.DataFrame(
    X_binary_train_scaled,
    columns=X_binary_train.columns,
    index=X_binary_train.index  # giữ nguyên chỉ số hàng
)
X_binary_test_scaled = pd.DataFrame(
    X_binary_test_scaled,
    columns=X_binary_test.columns,
    index=X_binary_test.index
)

X_multiclass_train_scaled = pd.DataFrame(
    X_multiclass_train_scaled,
    columns=X_multiclass_train.columns,
    index=X_multiclass_train.index
)
X_multiclass_test_scaled = pd.DataFrame(
    X_multiclass_test_scaled,
    columns=X_multiclass_test.columns,
    index=X_multiclass_test.index
)

In [55]:
X_binary_train_scaled.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Length of Fwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Bwd Packet Length Min,Flow Bytes/s,Flow Packets/s,...,RST Flag Count,PSH Flag Count,ACK Flag Count,URG Flag Count,Down/Up Ratio,min_seg_size_forward,Active Mean,Active Std,Active Max,Idle Std
1988865,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.059926,-1.667575,1.038465,...,-0.354286,-0.5,-0.055556,-0.772727,0.0,2.218656,-0.055556,-0.503521,0.0,-0.333333
1634870,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.141386,0.421047,-0.588836,...,22.388571,4.081081,-0.055556,0.114082,0.0,-0.00064,-0.055556,7.276002,0.0,0.666667
1662109,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.620244,0.213026,-0.574992,...,6.937143,12.108108,-0.055556,2.852273,0.0,-0.00089,-0.055556,2.0436,0.0,-0.333333
66321,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.511496,-0.621941,-1.296158,...,-0.034286,-0.22973,-0.055556,-0.613636,0.0,-0.002683,-0.055556,6.829854,-1.0,-0.333333
2524042,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.944094,-0.166176,-0.941224,...,14.731429,13.783784,-0.055556,5.893939,0.0,-0.002423,-0.055556,1.924503,-1.0,0.666667


In [56]:
joblib.dump(scaling_pipeline, "scaling_pipeline.pkl")

['scaling_pipeline.pkl']

In [59]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
def train_compare_models(X_train, X_test, y_train, y_test, class_names=None):
    """
    Huấn luyện và so sánh 3 model: RandomForest, LightGBM, XGBoost
    """
    # ---------------------
    # Chuẩn bị model
    # ---------------------
    models = {
        "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1,class_weight='balanced'),
        "LightGBM": lgb.LGBMClassifier(n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1),
        # "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', n_estimators=100, random_state=42)
    }
    
    results = {}
    
    # ---------------------
    # Huấn luyện từng model
    # ---------------------
    for name, model in models.items():
        print(f"\n===== Training {name} =====")
        model.fit(X_train.drop(columns=['Label'], errors='ignore'), y_train)
        y_pred = model.predict(X_test.drop(columns=['Label'], errors='ignore'))
        
        acc = accuracy_score(y_test, y_pred)
        print(f"Accuracy: {acc:.4f}")
        print("Classification Report:")
        print(classification_report(y_test, y_pred, target_names=class_names))
        
        results[name] = {"model": model, "accuracy": acc}
    # ---------------------
    # So sánh accuracy
    # ---------------------
    print("\n===== Accuracy Comparison =====")
    for name, info in results.items():
        print(f"{name}: {info['accuracy']:.4f}")
    
    return results

In [60]:
binary_classes = ['BENIGN', 'ATTACK']
results_binary = train_compare_models(
    X_binary_train, X_binary_test,
    y_binary_train, y_binary_test,
    class_names=binary_classes
)


===== Training RandomForest =====
Accuracy: 0.9977
Classification Report:
              precision    recall  f1-score   support

      BENIGN       0.99      1.00      0.99    111055
      ATTACK       1.00      1.00      1.00    454073

    accuracy                           1.00    565128
   macro avg       1.00      1.00      1.00    565128
weighted avg       1.00      1.00      1.00    565128


===== Training LightGBM =====
[LightGBM] [Info] Number of positive: 1816292, number of negative: 444217
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064277 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6424
[LightGBM] [Info] Number of data points in the train set: 2260509, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Accura

In [61]:
binary_classes = ['BENIGN', 'ATTACK']
results_scaled_binary = train_compare_models(
    X_binary_train_scaled, X_binary_test_scaled,
    y_binary_train, y_binary_test,
    class_names=binary_classes
)


===== Training RandomForest =====
Accuracy: 0.9977
Classification Report:
              precision    recall  f1-score   support

      BENIGN       0.99      1.00      0.99    111055
      ATTACK       1.00      1.00      1.00    454073

    accuracy                           1.00    565128
   macro avg       1.00      1.00      1.00    565128
weighted avg       1.00      1.00      1.00    565128


===== Training LightGBM =====
[LightGBM] [Info] Number of positive: 1816292, number of negative: 444217
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.064136 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6426
[LightGBM] [Info] Number of data points in the train set: 2260509, number of used features: 35
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Start training from score 0.000000
Accura

In [62]:
multiclass_classes = data_multiclass['Label'].unique().tolist()
results_multiclass = train_compare_models(
    X_multiclass_train, X_multiclass_test,
    y_multiclass_train, y_multiclass_test,
    class_names=multiclass_classes
)


===== Training RandomForest =====
Accuracy: 0.9998
Classification Report:
                precision    recall  f1-score   support

      DOS_DDOS       1.00      1.00      1.00      2766
PROBE_PORTSCAN       1.00      1.00      1.00     76101
 R2L_U2R_OTHER       1.00      1.00      1.00     31786
      CRED_WEB       1.00      1.00      1.00       402

      accuracy                           1.00    111055
     macro avg       1.00      1.00      1.00    111055
  weighted avg       1.00      1.00      1.00    111055


===== Training LightGBM =====
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.011707 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5662
[LightGBM] [Info] Number of data points in the train set: 444217, number of used features: 33
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training fr

In [63]:
multiclass_classes = data_multiclass['Label'].unique().tolist()
results_scaled_multiclass = train_compare_models(
    X_multiclass_train_scaled, X_multiclass_test_scaled,
    y_multiclass_train, y_multiclass_test,
    class_names=multiclass_classes
)


===== Training RandomForest =====
Accuracy: 0.9998
Classification Report:
                precision    recall  f1-score   support

      DOS_DDOS       1.00      1.00      1.00      2766
PROBE_PORTSCAN       1.00      1.00      1.00     76101
 R2L_U2R_OTHER       1.00      1.00      1.00     31786
      CRED_WEB       1.00      1.00      1.00       402

      accuracy                           1.00    111055
     macro avg       1.00      1.00      1.00    111055
  weighted avg       1.00      1.00      1.00    111055


===== Training LightGBM =====
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010993 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5666
[LightGBM] [Info] Number of data points in the train set: 444217, number of used features: 33
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training fr

In [64]:
# 1️⃣ Kiểm tra có cột 'Label' trong feature không
print('Label' in X_binary_train.columns)
print('Label' in X_multiclass_train.columns)

# 2️⃣ Kiểm tra data leakage
print("Scaler fitted trên train:", scaling_pipeline.feature_names_in_[:5])

# 3️⃣ Kiểm tra cross-validation (đánh giá thật)
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
scores = cross_val_score(rf, X_binary_train, y_binary_train, cv=5, scoring='accuracy')
print("Cross-val accuracy:", scores.mean())

False
False
Scaler fitted trên train: ['Destination Port' 'Flow Duration' 'Total Fwd Packets'
 'Total Length of Fwd Packets' 'Fwd Packet Length Max']
Cross-val accuracy: 0.9977447556554624


In [None]:
import joblib
import os
import stat
def save_models(results, type="binary", save_dir="./models"):
    if os.path.exists(save_dir):
        print(f"Thư mục '{save_dir}' đã tồn tại.")
    else:
        os.makedirs(save_dir)
        print(f"Tạo thư mục mới: '{save_dir}'")
    for model_name, info in results.items():
        model = info["model"]
        save_path = os.path.join(save_dir, f"{model_name}_{type}.joblib")
        # Lưu bằng joblib
        joblib.dump(model, save_path)
        print(f"Saved: {save_path}")
        # Khóa quyền ghi (read-only)
        try:
            if os.name == "nt":  # Windows
                os.chmod(save_path, stat.S_IREAD)
            else:  # Linux / macOS
                os.chmod(save_path, 0o444)
            print(f"Locked (read-only): {save_path}")
        except Exception as e:
            print(f"Could not lock {save_path}: {e}")
model_save = os.path.join(ROOT_DIR, "models")
save_models(results_binary, type="binary",save_dir=model_save)
save_models(results_scaled_binary, type="binary_scaled",save_dir=model_save)
save_models(results_multiclass, type="multiclass",save_dir=model_save)
save_models(results_scaled_multiclass, type="multiclass_scaled",save_dir=model_save)

Tạo thư mục mới: './models'
Saved: ./models\RandomForest_binary.joblib
Locked (read-only): ./models\RandomForest_binary.joblib
Saved: ./models\LightGBM_binary.joblib
Locked (read-only): ./models\LightGBM_binary.joblib
Thư mục './models' đã tồn tại.
Saved: ./models\RandomForest_binary_scaled.joblib
Locked (read-only): ./models\RandomForest_binary_scaled.joblib
Saved: ./models\LightGBM_binary_scaled.joblib
Locked (read-only): ./models\LightGBM_binary_scaled.joblib
Thư mục './models' đã tồn tại.
Saved: ./models\RandomForest_multiclass.joblib
Locked (read-only): ./models\RandomForest_multiclass.joblib
Saved: ./models\LightGBM_multiclass.joblib
Locked (read-only): ./models\LightGBM_multiclass.joblib
Thư mục './models' đã tồn tại.
Saved: ./models\RandomForest_multiclass_scaled.joblib
Locked (read-only): ./models\RandomForest_multiclass_scaled.joblib
Saved: ./models\LightGBM_multiclass_scaled.joblib
Locked (read-only): ./models\LightGBM_multiclass_scaled.joblib
