In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import RobustScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import joblib
import os

In [None]:
# Project root: the parent of the current working directory.
ROOT_DIR = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Both labelled datasets are read from <ROOT_DIR>/TrafficLabelling.
binary_path, multi_path = (
    os.path.join(ROOT_DIR, "TrafficLabelling", csv_name)
    for csv_name in ("data_binary.csv", "data_multi_class.csv")
)

# One DataFrame per labelling scheme (binary vs. multi-class).
data_binary, data_multiclass = pd.read_csv(binary_path), pd.read_csv(multi_path)

In [None]:
def _stratified_split(frame):
    """Split ``frame`` 80/20 into train/test features and labels.

    The 'Label' column is removed from the features and used both as the
    target and as the stratification key. Returns the four objects produced
    by ``train_test_split``: X_train, X_test, y_train, y_test.
    """
    target = frame['Label']
    return train_test_split(
        frame.drop(columns=['Label']),
        target,
        test_size=0.2,
        stratify=target,  # keep the class proportions in both partitions
        random_state=42,
    )


X_binary_train, X_binary_test, y_binary_train, y_binary_test = _stratified_split(data_binary)
X_multiclass_train, X_multiclass_test, y_multiclass_train, y_multiclass_test = _stratified_split(data_multiclass)

In [None]:
# 1. Count/flag features (0/1 or small values, per the author's grouping)
flag_features = [
    'Fwd PSH Flags', 'Fwd URG Flags',
    'FIN Flag Count', 'RST Flag Count', 'PSH Flag Count',
    'ACK Flag Count', 'URG Flag Count',
]

# 2. Features the author identified as having large outliers / strong skew
outlier_features = [
    'Flow Duration', 'Flow Bytes/s', 'Flow Packets/s',
    'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Min',
    'Fwd IAT Std', 'Fwd IAT Min',
    'Bwd IAT Total', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
    'Active Mean', 'Active Std', 'Active Max', 'Idle Std',
]

# 3. Everything else (ports, lengths, ratios, packet stats, ...): moderate
#    outliers and skew. Built by exclusion so the order follows data_binary.
_already_grouped = set(flag_features) | set(outlier_features) | {"Label"}
other_features = [name for name in data_binary.columns if name not in _already_grouped]
other_features

In [None]:
class LogTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that applies ``log1p`` to non-negative values.

    Inputs are cast to float64, negative entries are clamped to zero, and
    ``log(1 + x)`` is applied element-wise. Nothing is learned in ``fit``.
    """

    def __init__(self):
        """The transformer has no hyperparameters."""

    def fit(self, X, y=None):
        """No-op fit; returns ``self`` so the transformer can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return ``log1p`` of ``X`` after clamping negatives to zero.

        ``X`` may be anything ``np.asarray`` can convert to a float64 array
        (e.g. a DataFrame or ndarray); the result is always an ndarray.
        """
        values = np.asarray(X, dtype=np.float64)
        # Clamp at zero so the logarithm never sees a negative argument.
        non_negative = np.clip(values, a_min=0, a_max=None)
        return np.log1p(non_negative)
# Group 1: flag/count features are forwarded unchanged.
flag_pipeline = 'passthrough'

# Group 2: outlier-heavy features -> log1p, then StandardScaler.
outlier_pipeline = Pipeline(steps=[
    ('log', LogTransformer()),
    ('scale', StandardScaler()),
])

# Group 3: remaining features -> RobustScaler to dampen outliers.
other_pipeline = Pipeline(steps=[('scale', RobustScaler())])

# ===== 4. Combine the three groups in one ColumnTransformer =====
_column_groups = [
    ('flag', flag_pipeline, flag_features),
    ('outlier', outlier_pipeline, outlier_features),
    ('other', other_pipeline, other_features),
]
preprocessor = ColumnTransformer(
    transformers=_column_groups,
    remainder='drop',  # discard any column not listed above (e.g. Label)
)

# ===== 5. Wrap everything in the overall pipeline =====
scaling_pipeline = Pipeline(steps=[('preprocess', preprocessor)])

In [None]:
# --- Fit scaler CHỈ TRÊN TRAIN ---
scaling_pipeline.fit(X_binary_train)

# --- Transform cả train & test ---
X_binary_train_scaled = scaling_pipeline.transform(X_binary_train)
X_binary_test_scaled = scaling_pipeline.transform(X_binary_test)

X_multiclass_train_scaled = scaling_pipeline.transform(X_multiclass_train)
X_multiclass_test_scaled = scaling_pipeline.transform(X_multiclass_test)

X_binary_train_scaled = pd.DataFrame(
    X_binary_train_scaled,
    columns=X_binary_train.columns,
    index=X_binary_train.index  # giữ nguyên chỉ số hàng
)
X_binary_test_scaled = pd.DataFrame(
    X_binary_test_scaled,
    columns=X_binary_test.columns,
    index=X_binary_test.index
)

X_multiclass_train_scaled = pd.DataFrame(
    X_multiclass_train_scaled,
    columns=X_multiclass_train.columns,
    index=X_multiclass_train.index
)
X_multiclass_test_scaled = pd.DataFrame(
    X_multiclass_test_scaled,
    columns=X_multiclass_test.columns,
    index=X_multiclass_test.index
)

In [None]:
# Preview the first rows of the scaled binary training frame.
X_binary_train_scaled.head()

In [None]:
# Persist the fitted preprocessing pipeline. The relative file name resolves
# against the current working directory.
# NOTE(review): the pipeline contains the notebook-defined LogTransformer, so
# loading it elsewhere presumably needs that class to be importable — confirm.
joblib.dump(scaling_pipeline, "scaling_pipeline.pkl")

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
def train_compare_models(X_train, X_test, y_train, y_test, class_names=None):
    """
    Train every candidate classifier and compare their test accuracy.

    The active candidates are RandomForest and LightGBM (the XGBoost entry is
    commented out). A 'Label' column, if present in the feature frames, is
    dropped before fitting and predicting.

    Parameters
    ----------
    X_train, X_test : feature frames (must support ``.drop(columns=...)``).
    y_train, y_test : target labels for the two splits.
    class_names : optional display names forwarded to ``classification_report``
        as ``target_names``.

    Returns
    -------
    dict mapping model name -> {"model": fitted estimator, "accuracy": float}.
    """
    # ---------------------
    # Candidate models
    # ---------------------
    candidates = {
        "RandomForest": RandomForestClassifier(
            n_estimators=100, random_state=42, n_jobs=-1, class_weight='balanced'
        ),
        "LightGBM": lgb.LGBMClassifier(
            n_estimators=100, random_state=42, class_weight='balanced', n_jobs=-1
        ),
        # "XGBoost": xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', n_estimators=100, random_state=42)
    }

    results = {}

    # ---------------------
    # Fit and evaluate each candidate in turn
    # ---------------------
    for name, model in candidates.items():
        print(f"\n===== Training {name} =====")

        train_features = X_train.drop(columns=['Label'], errors='ignore')
        model.fit(train_features, y_train)

        test_features = X_test.drop(columns=['Label'], errors='ignore')
        predictions = model.predict(test_features)

        acc = accuracy_score(y_test, predictions)
        print(f"Accuracy: {acc:.4f}")
        print("Classification Report:")
        print(classification_report(y_test, predictions, target_names=class_names))

        results[name] = {"model": model, "accuracy": acc}

    # ---------------------
    # Side-by-side accuracy summary
    # ---------------------
    print("\n===== Accuracy Comparison =====")
    for name in results:
        print(f"{name}: {results[name]['accuracy']:.4f}")

    return results

In [None]:
# Display names for the report rows, given in label order.
# NOTE(review): assumes the first label is benign and the second is attack — confirm.
binary_classes = ['BENIGN', 'ATTACK']
results_binary = train_compare_models(
    X_train=X_binary_train,
    X_test=X_binary_test,
    y_train=y_binary_train,
    y_test=y_binary_test,
    class_names=binary_classes,
)

In [None]:
# Same comparison as the unscaled binary run, on the scaled feature frames.
# NOTE(review): assumes the first label is benign and the second is attack — confirm.
binary_classes = ['BENIGN', 'ATTACK']
results_scaled_binary = train_compare_models(
    X_train=X_binary_train_scaled,
    X_test=X_binary_test_scaled,
    y_train=y_binary_train,
    y_test=y_binary_test,
    class_names=binary_classes,
)

In [None]:
# classification_report pairs target_names with the class labels in SORTED
# order. Series.unique() returns labels in order of first appearance, which
# would attach the wrong name to each report row, so the labels are sorted
# first. They are converted to str because target_names must be strings.
multiclass_classes = [str(label) for label in sorted(data_multiclass['Label'].unique())]
results_multiclass = train_compare_models(
    X_multiclass_train, X_multiclass_test,
    y_multiclass_train, y_multiclass_test,
    class_names=multiclass_classes
)

In [None]:
# classification_report pairs target_names with the class labels in SORTED
# order. Series.unique() returns labels in order of first appearance, which
# would attach the wrong name to each report row, so the labels are sorted
# first. They are converted to str because target_names must be strings.
multiclass_classes = [str(label) for label in sorted(data_multiclass['Label'].unique())]
results_scaled_multiclass = train_compare_models(
    X_multiclass_train_scaled, X_multiclass_test_scaled,
    y_multiclass_train, y_multiclass_test,
    class_names=multiclass_classes
)

In [None]:
# 1️⃣ Check whether a 'Label' column is present among the feature columns
print('Label' in X_binary_train.columns)
print('Label' in X_multiclass_train.columns)

# 2️⃣ Data-leakage check: print the first five input feature names recorded on the fitted pipeline
# NOTE(review): this only shows which columns the pipeline saw during fit; it
# does not by itself demonstrate that no test rows were used — confirm separately.
print("Scaler fitted trên train:", scaling_pipeline.feature_names_in_[:5])

# 3️⃣ 5-fold cross-validation on the (unscaled) binary training split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# Unlike the comparison run, this forest does not set class_weight.
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
scores = cross_val_score(rf, X_binary_train, y_binary_train, cv=5, scoring='accuracy')
print("Cross-val accuracy:", scores.mean())

In [None]:
import joblib
import os
import stat
def save_models(results, type="binary", save_dir="./models"):
    """Persist every fitted model in ``results`` and mark the files read-only.

    Parameters
    ----------
    results : dict
        Mapping of model name -> dict holding the fitted estimator under the
        "model" key (the structure returned by ``train_compare_models``).
    type : str
        Suffix appended to each file name: ``<model_name>_<type>.joblib``.
    save_dir : str
        Target directory; created if it does not exist.

    Notes
    -----
    Each file is made read-only after it is written. A file left read-only by
    a previous run is made writable again before being overwritten, so the
    function can be called repeatedly (e.g. on Restart & Run All) without
    failing with ``PermissionError``.
    """
    if os.path.exists(save_dir):
        print(f"Thư mục '{save_dir}' đã tồn tại.")
    else:
        # exist_ok guards against the directory appearing between the check and the call.
        os.makedirs(save_dir, exist_ok=True)
        print(f"Tạo thư mục mới: '{save_dir}'")

    # Windows only honours the read-only bit; POSIX gets r--r--r--.
    read_only_mode = stat.S_IREAD if os.name == "nt" else 0o444

    for model_name, info in results.items():
        save_path = os.path.join(save_dir, f"{model_name}_{type}.joblib")

        # Unlock a file locked by an earlier run, otherwise the dump below
        # cannot open it for writing.
        if os.path.exists(save_path):
            os.chmod(save_path, stat.S_IREAD | stat.S_IWRITE)

        # Serialise with joblib
        joblib.dump(info["model"], save_path)
        print(f"Saved: {save_path}")

        # Lock the file (read-only); best effort, a failure is only reported.
        try:
            os.chmod(save_path, read_only_mode)
            print(f"Locked (read-only): {save_path}")
        except OSError as e:
            print(f"Could not lock {save_path}: {e}")
# All model files go to <ROOT_DIR>/models, one file per (model, variant) pair.
model_save = os.path.join(ROOT_DIR, "models")
for variant, trained in (
    ("binary", results_binary),
    ("binary_scaled", results_scaled_binary),
    ("multiclass", results_multiclass),
    ("multiclass_scaled", results_scaled_multiclass),
):
    save_models(trained, type=variant, save_dir=model_save)