# Devide validate (test after training model)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv(r"..\data\CIC-IDS2017_processed\training\balanced_dataset_v1.csv")

df_train, df_val = train_test_split(df, test_size=0.05, random_state=42, stratify=df["Label"])

print("Train size:", df_train.shape)
print("Validation size:", df_val.shape)

# Lưu ra file nếu muốn
df_train.to_csv(r"..\data\CIC-IDS2017_processed\validate\train_95.csv", index=False)
df_val.to_csv(r"..\data\CIC-IDS2017_processed\validate\val_05.csv", index=False)

Train size: (975379, 37)
Validation size: (51336, 37)


# Training model

In [38]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import joblib
# 1. Đọc dữ liệu
df_all = pd.read_csv(r"..\data\CIC-IDS2017_processed\training\all_95.csv")

# 2. Tách features (X) và label (y)
X = df_all.drop(columns=["Label"])
y = df_all["Label"]

# 3. Chia train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 4. Khởi tạo mô hình RandomForest
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    random_state=42,
    verbose=1,
    n_jobs=-1
)

# 5. Train
rf.fit(X_train, y_train)

# 6. Dự đoán
y_pred = rf.predict(X_test)

# 7. Đánh giá
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
joblib.dump(rf, r"..\Output_model\CIC-IDS-2017\random_forest_model.pkl")
joblib.dump(list(X.columns), r"..\Output_model\CIC-IDS-2017\feature_order.pkl")
print("RF Model saved!")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   43.2s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.1min finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.6s finished


Accuracy: 0.9992310689167299
[[81783    15    33]
 [   12 72319     0]
 [   90     0 40824]]
                     precision    recall  f1-score   support

             BENIGN       1.00      1.00      1.00     81831
               DDoS       1.00      1.00      1.00     72331
Unauthorized Access       1.00      1.00      1.00     40914

           accuracy                           1.00    195076
          macro avg       1.00      1.00      1.00    195076
       weighted avg       1.00      1.00      1.00    195076

RF Model saved!


# Infer

## Validation

In [None]:
import pandas as pd
import joblib
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# ==== 1. Load file validation 5% ====
df_val = pd.read_csv(r"..\data\CIC-IDS2017_processed\validate\val_05.csv")

X_val = df_val.drop(columns=["Label"])
y_val = df_val["Label"]

# ==== 2. Load model đã train ====
rf = joblib.load(r"..Output_model\CIC-IDS-2017\random_forest_model.pkl")

# ==== 3. Infer & đánh giá ====
# --- RandomForest ---
y_pred_rf = rf.predict(X_val)
print("\n===== RandomForest on Validation (5%) =====")
print("Accuracy:", accuracy_score(y_val, y_pred_rf))
print(confusion_matrix(y_val, y_pred_rf))
print(classification_report(y_val, y_pred_rf))


[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.2s finished



===== RandomForest on Validation (5%) =====
Accuracy: 0.9991429016674458
[[21515     6    14]
 [    6 19028     0]
 [   18     0 10749]]
                     precision    recall  f1-score   support

             BENIGN       1.00      1.00      1.00     21535
               DDoS       1.00      1.00      1.00     19034
Unauthorized Access       1.00      1.00      1.00     10767

           accuracy                           1.00     51336
          macro avg       1.00      1.00      1.00     51336
       weighted avg       1.00      1.00      1.00     51336



## Infer test on 10 samples

In [None]:
import pandas as pd
import joblib
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

df_val = pd.read_csv(r"..\data\CIC-IDS2017_processed\validate\val_05.csv")

X_val = df_val.drop(columns=["Label"])
y_val = df_val["Label"]

rf = joblib.load(r"..\Output_model\CIC-IDS-2017\random_forest_model.pkl")

y_pred_rf = rf.predict(X_val[0:10])
print("Predict labels",y_pred_rf)
print("Ground truth labels", y_val[0:10])


Predict labels ['Unauthorized Access' 'DDoS' 'DDoS' 'DDoS' 'Unauthorized Access' 'BENIGN'
 'BENIGN' 'DDoS' 'DDoS' 'Unauthorized Access']
Ground truth labels 0    Unauthorized Access
1                   DDoS
2                   DDoS
3                   DDoS
4    Unauthorized Access
5                 BENIGN
6                 BENIGN
7                   DDoS
8                   DDoS
9    Unauthorized Access
Name: Label, dtype: object


[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished


## Infer + norm + test (using input raw)

In [31]:
import pandas as pd
df_raw = pd.read_csv(r"..\data\CIC-IDS2017\Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv")
current_column_names = df_raw.columns
stripped_column_names = {col: col.strip() for col in current_column_names}
df_raw = df_raw.rename(columns=stripped_column_names)
importance_colums = [
    "PSH Flag Count", "min_seg_size_forward", "Flow IAT Max", "Flow IAT Min", "ACK Flag Count", "Destination Port", "Bwd Packet Length Mean", "Bwd Packet Length Max", "Bwd Packet Length Min", "Init_Win_bytes_forward",
    "Fwd IAT Max", "Idle Mean", "Idle Max", "Avg Bwd Segment Size", "Bwd Packet Length Std", "Bwd IAT Mean", "Fwd IAT Std", "Down/Up Ratio", "Max Packet Length", "Average Packet Size",
    "Min Packet Length", "Packet Length Std", "Fwd Packets/s", "Packet Length Mean", "Flow IAT Std", "URG Flag Count", "FIN Flag Count", "Fwd Packet Length Min", "Subflow Fwd Packets", "Bwd IAT Max",
    "Packet Length Variance", "Fwd IAT Mean", "Flow Duration", "Fwd IAT Total", "Bwd IAT Std", "Flow IAT Mean"
]
# Check cột có/không
missing_cols = [col for col in importance_colums if col not in df_raw.columns]

if not missing_cols:
    print("All columns are present.")

importance_colums_with_label = importance_colums + ["Label"]
df_normal_data = df_raw[importance_colums_with_label]
print(df_normal_data[0:10])

All columns are present.
   PSH Flag Count  min_seg_size_forward  Flow IAT Max  Flow IAT Min  \
0               0                    20             3             3   
1               0                    20           109           109   
2               0                    20            52            52   
3               0                    20            34            34   
4               0                    20             3             3   
5               0                    20          1022          1022   
6               0                    20             4             4   
7               0                    20            42            42   
8               0                    20             4             4   
9               0                    20             4             4   

   ACK Flag Count  Destination Port  Bwd Packet Length Mean  \
0               1             54865                     0.0   
1               1             55054                     6.0   
2   

### preprocessing raw data to infer

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np
from functools import partial

def scale_numerical_features(data_df, importance_colums_):
    """
    Norm numerical features using StandardScaler.
    Arguments:
        Input:  
            - data_df: A pandas DataFrame containing numerical features that need to be scaled.
            - importance_colums_: A list of column names to be scaled.
        Output:
            - data_df: A pandas DataFrame containing the scaled numerical features in the required columns.
    """
    scaler = StandardScaler()
    numerical_data = np.array([data_df[col] for col in importance_colums_]).T
    scaled_data = scaler.fit_transform(numerical_data)
    for i, col in enumerate(importance_colums_):
        data_df[col] = scaled_data[:, i].tolist()
    return data_df

df_normal_data = scale_numerical_features(df_normal_data, importance_colums)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_df[col] = scaled_data[:, i].tolist()


Unnamed: 0,PSH Flag Count,min_seg_size_forward,Flow IAT Max,Flow IAT Min,ACK Flag Count,Destination Port,Bwd Packet Length Mean,Bwd Packet Length Max,Bwd Packet Length Min,Init_Win_bytes_forward,...,Fwd Packet Length Min,Subflow Fwd Packets,Bwd IAT Max,Packet Length Variance,Fwd IAT Mean,Flow Duration,Fwd IAT Total,Bwd IAT Std,Flow IAT Mean,Label
0,-0.735674,-0.35585,-0.505203,-0.037003,0.991113,2.327831,-0.794893,-0.738327,-0.331193,-0.52433,...,-0.133981,-0.186406,-0.282318,-0.677831,-0.428095,-0.51521,-0.487105,-0.294079,-0.585057,BENIGN
1,-0.735674,-0.35585,-0.5052,-0.036864,0.991113,2.337398,-0.789538,-0.736707,-0.212335,-0.524827,...,-0.133981,-0.251245,-0.282318,-0.677831,-0.428095,-0.515207,-0.487106,-0.294079,-0.585018,BENIGN
2,-0.735674,-0.35585,-0.505202,-0.036939,0.991113,2.337449,-0.789538,-0.736707,-0.212335,-0.524827,...,-0.133981,-0.251245,-0.282318,-0.677831,-0.428095,-0.515209,-0.487106,-0.294079,-0.585039,BENIGN
3,-0.735674,-0.35585,-0.505202,-0.036963,0.991113,1.891022,-0.789538,-0.736707,-0.212335,-0.524578,...,-0.133981,-0.251245,-0.282318,-0.677831,-0.428095,-0.515209,-0.487106,-0.294079,-0.585046,BENIGN
4,-0.735674,-0.35585,-0.505203,-0.037003,0.991113,2.32773,-0.794893,-0.738327,-0.331193,-0.524454,...,-0.133981,-0.186406,-0.282318,-0.677831,-0.428095,-0.51521,-0.487105,-0.294079,-0.585057,BENIGN
5,-0.735674,-0.35585,-0.505165,-0.035662,0.991113,2.328135,-0.794893,-0.738327,-0.331193,-0.524454,...,-0.133981,-0.186406,-0.282318,-0.677831,-0.427923,-0.515178,-0.487073,-0.294079,-0.58468,BENIGN
6,-0.735674,-0.35585,-0.505203,-0.037002,0.991113,2.330868,-0.794893,-0.738327,-0.331193,-0.524454,...,-0.133981,-0.186406,-0.282318,-0.677831,-0.428095,-0.51521,-0.487105,-0.294079,-0.585057,BENIGN
7,-0.735674,-0.35585,-0.505202,-0.036952,0.991113,2.330868,-0.789538,-0.736707,-0.212335,-0.524454,...,-0.133981,-0.251245,-0.282318,-0.677831,-0.428095,-0.515209,-0.487106,-0.294079,-0.585043,BENIGN
8,-0.735674,-0.35585,-0.505203,-0.037002,0.991113,0.020369,-0.794893,-0.738327,-0.331193,-0.524454,...,-0.133981,-0.186406,-0.282318,-0.677831,-0.428095,-0.51521,-0.487105,-0.294079,-0.585057,BENIGN
9,-0.735674,-0.35585,-0.505203,-0.037002,0.991113,2.34241,-0.794893,-0.738327,-0.331193,-0.410741,...,-0.133981,-0.186406,-0.282318,-0.67778,-0.428095,-0.51521,-0.487105,-0.294079,-0.585057,BENIGN


In [54]:
print(df_normal_data["Label"].unique())

['BENIGN' 'DDoS']


### Infer with newdata

In [None]:
import pandas as pd
import joblib
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

df_ddos = df_normal_data[df_normal_data["Label"] == "DDoS"]

X_features = df_ddos.drop(columns=["Label"])
y_features = df_ddos["Label"]

print(X_features.shape, y_features.shape)

rf = joblib.load(r"..\Output_model\CIC-IDS-2017\random_forest_model.pkl")
feature_order = joblib.load(r"..\Output_model\CIC-IDS-2017\feature_order.pkl")
X_features = X_features[feature_order]
y_pred_rf = rf.predict(X_features[0:5])
print("Predict labels",y_pred_rf)
print("Ground truth labels", y_features[0:10])

(128027, 36) (128027,)
Predict labels ['DDoS' 'BENIGN' 'DDoS' 'DDoS' 'DDoS']
Ground truth labels 18883    DDoS
18884    DDoS
18885    DDoS
18886    DDoS
18887    DDoS
18888    DDoS
18889    DDoS
18890    DDoS
18891    DDoS
18892    DDoS
Name: Label, dtype: object


[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.0s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.0s finished


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib

# ====== 1) Fit & save scaler ======
df_all = pd.read_csv(r"..\data\CIC-IDS2017_processed\training\all_95.csv")

# Giả sử tất cả cột trừ "Label" là numeric cần scale.
# Nếu bạn chỉ muốn scale 1 subset, thay bằng list tên cột numeric.
feature_cols = [c for c in df_all.columns if c != "Label"]

X = df_all[feature_cols]
y = df_all["Label"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
scaler.fit(X_train)  # chỉ fit trên train

# Lưu scaler + thứ tự cột
joblib.dump(scaler, r"..\Output_model\CIC-IDS-2017\scaler.pkl")
joblib.dump(feature_cols, r"..\Output_model\CIC-IDS-2017\feature_order.pkl")
print("Saved scaler + feature order")


# ====== 2) Load & scale một điểm duy nhất ======
def load_scaler_and_feature_order(scaler_path, feat_path):
    scaler = joblib.load(scaler_path)
    feature_order = joblib.load(feat_path)
    return scaler, feature_order

def scale_single_point(point, scaler, feature_order, fill_missing=0.0):
    """
    point: dict | pd.Series | pd.DataFrame(1 x n)
    scaler: StandardScaler đã fit
    feature_order: list[str] đúng thứ tự cột đã fit
    fill_missing: giá trị điền khi thiếu cột (nên để 0.0 hoặc np.nan + Imputer trước đó)
    """
    # Chuyển point → DataFrame 1 hàng
    if isinstance(point, dict):
        df = pd.DataFrame([point])
    elif isinstance(point, pd.Series):
        df = point.to_frame().T
    elif isinstance(point, pd.DataFrame):
        # đảm bảo đúng 1 hàng
        if len(point) != 1:
            raise ValueError("DataFrame point must have exactly 1 row.")
        df = point.copy()
    else:
        raise TypeError("point must be dict, pd.Series, or 1-row pd.DataFrame.")

    # Chỉ giữ các cột đã dùng khi fit, và sắp xếp đúng thứ tự
    df = df.reindex(columns=feature_order)

    # Điền giá trị cho cột bị thiếu
    missing_cols = [c for c in feature_order if c not in df.columns or df[c].isna().any()]
    if missing_cols:
        df[missing_cols] = df[missing_cols].fillna(fill_missing)

    # Nếu có cột dtype object (chuỗi), sẽ lỗi -> cần encode trước từ khi fit.
    # Ở đây giả định tất cả numeric rồi.

    # transform yêu cầu shape (1, n_features)
    scaled = scaler.transform(df)
    # Trả về numpy 1D hoặc dict theo cột
    scaled_row = scaled[0]
    return scaled_row, pd.Series(scaled_row, index=feature_order).to_dict()

# ====== Ví dụ dùng ======
if __name__ == "__main__":
    scaler, feature_order = load_scaler_and_feature_order(
        r"..\Output_model\CIC-IDS-2017\scaler.pkl",
        r"..\Output_model\CIC-IDS-2017\feature_order.pkl"
    )

    # Một điểm mới (ví dụ rút từ luồng realtime)
    new_point = {
        "psh_flag_count": 0,
        "min_seg_size_forward": 20,
        "flow_iat_max": 1500,
        "flow_iat_min": 0,
        "ack_flag_count": 2,
        "destination_port": 443,
        "bwd_packet_length_mean": 120.5,
        # ... điền đủ các cột bạn đã fit; cột thiếu sẽ được fill_missing
    }

    scaled_row, scaled_dict = scale_single_point(
        new_point, scaler, feature_order, fill_missing=0.0
    )
    print("Scaled (np.array):", scaled_row)
    # Nếu bạn muốn đưa vào model.predict, đảm bảo dùng đúng thứ tự cột:
    X_one = np.array(scaled_row, dtype=float).reshape(1, -1)
    # y_hat = model.predict(X_one)


Saved scaler + feature order
Scaled (np.array): [-4.46507239e-03 -5.70765566e-02  2.32924741e+01 -1.44700973e-02
  5.94629439e-02 -8.86123345e-03  1.89690870e-03  4.93716877e-03
  2.30553407e-02 -1.16573827e-02 -2.30524898e-02 -5.90002125e-03
  6.30697453e-02  7.97997722e-03  1.34315696e-02 -2.35404603e-02
  8.71524310e-03  2.82054544e-03 -1.13678281e-02  7.47945686e-02
  8.69043895e-03 -7.94099942e-03  6.77509090e-03 -1.75164856e-02
 -1.33230103e-02 -3.06246978e-04 -3.11471202e-02 -1.07781678e-02
 -2.33648465e-03  7.87929861e-02 -1.10653369e-02 -1.07634481e-02
 -2.20253534e-03  8.27672213e-03 -1.75164856e-02  7.44301345e-03]


In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib
from pathlib import Path

# ========= Cấu hình đường dẫn =========
CSV_PATH = r"C:\Code\DDoS-Prevention-System\Research\data\CIC-IDS2017\Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv"   # file CSV cần validate
MODEL_PATH = r"..\Output_model\CIC-IDS-2017\random_forest_model.pkl"  # model đã train (không kèm scaler)
SCALER_PATH = r"..\Output_model\CIC-IDS-2017\scaler.pkl"              # scaler đã fit trên train
FEAT_ORDER_PATH = r"..\Output_model\CIC-IDS-2017\feature_order.pkl"   # list thứ tự cột dùng khi fit

# ====== Mapping nhóm label lớn (tuỳ mục đích báo cáo) ======
label_category_mapping = {
    'BENIGN': 'Normal',
    'DoS Hulk': 'DDoS',
    'DoS GoldenEye': 'DDoS',
    'DoS slowloris': 'DDoS',
    'DoS Slowhttptest': 'DDoS',
    'DDoS': 'DDoS',
    'PortScan': 'Unauthorized Access',
    'FTP-Patator': 'Unauthorized Access',
    'SSH-Patator': 'Unauthorized Access',
    'Web Attack � Brute Force': 'Unauthorized Access',
    'Web Attack � XSS': 'Unauthorized Access',
    'Web Attack � Sql Injection': 'Unauthorized Access',
    'Infiltration': 'Unauthorized Access',
    'Bot': 'Unauthorized Access',
    'Heartbleed': 'Unauthorized Access'
}

def load_assets(model_path, scaler_path, feat_order_path):
    model = joblib.load(model_path)
    scaler = joblib.load(scaler_path)
    feature_order = joblib.load(feat_order_path)
    return model, scaler, feature_order

def safe_read_csv(path):
    df = pd.read_csv(path)
    # Strip khoảng trắng ở tên cột
    df.columns = [c.strip() for c in df.columns]
    return df

def coerce_numeric(df, cols):
    """Ép kiểu numeric cho các cột trong 'cols'; nếu lỗi -> NaN."""
    for c in cols:
        df[c] = pd.to_numeric(df[c], errors='coerce')
    return df

def fillna_with_scaler_means(df, feature_order, scaler):
    """Điền NaN theo mean đã học trong StandardScaler (theo đúng thứ tự cột)."""
    mean_vec = scaler.mean_
    mean_map = {col: mean_vec[i] for i, col in enumerate(feature_order)}
    df = df.copy()
    for col in feature_order:
        if col in df.columns:
            df[col] = df[col].fillna(mean_map[col])
        else:
            # nếu thiếu cột -> tạo cột và fill bằng mean
            df[col] = mean_map[col]
    return df

def transform_with_scaler(df_X, feature_order, scaler):
    # Reindex đúng thứ tự cột
    X = df_X.reindex(columns=feature_order)
    # Transform -> numpy
    X_scaled = scaler.transform(X.values)
    return X_scaled

def map_to_category(labels):
    return [label_category_mapping.get(lbl, lbl) for lbl in labels]

# ================== MAIN ==================
if __name__ == "__main__":
    # 1) Load model/scaler/feature_order
    model, scaler, feature_order = load_assets(MODEL_PATH, SCALER_PATH, FEAT_ORDER_PATH)

    # 2) Đọc CSV cần validate
    df = safe_read_csv(CSV_PATH)

    # 3) Xác định cột label trong file (thường là "Label")
    #    CSV mẫu bạn dán có "Label" cuối cùng – nhưng mình vẫn
    #    kiểm tra linh hoạt đề phòng tên/ khoảng trắng.
    label_col_candidates = ["Label", "label", "Labels"]
    label_col = next((c for c in label_col_candidates if c in df.columns), None)
    if label_col is None:
        raise ValueError(f"Không tìm thấy cột Label trong file. Có các cột: {list(df.columns)[:10]} ...")

    # 4) Chuẩn bị X, ép numeric + fillna = scaler.mean_
    #    Lưu ý: phải dùng đúng 'feature_order' đã lưu khi train.
    df = coerce_numeric(df, [c for c in df.columns if c != label_col])

    # Một số cột có thể thiếu/trùng tên -> chuẩn hoá schema theo feature_order
    # Điền NaN/thiếu bằng mean đã học (từ scaler)
    df_filled = fillna_with_scaler_means(df, feature_order, scaler)

    # 5) Scale theo scaler đã fit
    X_scaled = transform_with_scaler(df_filled[feature_order], feature_order, scaler)

    # 6) Predict
    y_true = df[label_col].astype(str).values  # ground truth
    y_pred = model.predict(X_scaled)

    # 7) Đánh giá (exact labels)
    print("=== Đánh giá theo label gốc ===")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Confusion matrix:\n", confusion_matrix(y_true, y_pred, labels=sorted(pd.unique(np.concatenate([y_true, y_pred])))))
    print("Classification report:\n", classification_report(y_true, y_pred, zero_division=0))

    # 8) Đánh giá theo nhóm lớn (mapping)
    y_true_cat = map_to_category(y_true)
    y_pred_cat = map_to_category(y_pred)

    print("\n=== Đánh giá theo nhóm lớn (mapping) ===")
    print("Accuracy (category):", accuracy_score(y_true_cat, y_pred_cat))
    print("Confusion matrix (category):\n", confusion_matrix(y_true_cat, y_pred_cat, labels=sorted(pd.unique(np.concatenate([y_true_cat, y_pred_cat])))))
    print("Classification report (category):\n", classification_report(y_true_cat, y_pred_cat, zero_division=0))

    # 9) Xem nhanh vài dòng dự đoán
    preview = pd.DataFrame({
        "y_true": y_true,
        "y_pred": y_pred,
        "y_true_cat": y_true_cat,
        "y_pred_cat": y_pred_cat
    }).head(10)
    print("\nPreview dự đoán:")
    print(preview.to_string(index=False))


[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  26 tasks      | elapsed:    0.1s
[Parallel(n_jobs=12)]: Done 100 out of 100 | elapsed:    0.5s finished


=== Đánh giá theo label gốc ===
Accuracy: 0.4452066032038594
Confusion matrix:
 [[127537      0]
 [158930      0]]
Classification report:
               precision    recall  f1-score   support

      BENIGN       0.45      1.00      0.62    127537
    PortScan       0.00      0.00      0.00    158930

    accuracy                           0.45    286467
   macro avg       0.22      0.50      0.31    286467
weighted avg       0.20      0.45      0.27    286467


=== Đánh giá theo nhóm lớn (mapping) ===
Accuracy (category): 0.4452066032038594
Confusion matrix (category):
 [[127537      0]
 [158930      0]]
Classification report (category):
                      precision    recall  f1-score   support

             Normal       0.45      1.00      0.62    127537
Unauthorized Access       0.00      0.00      0.00    158930

           accuracy                           0.45    286467
          macro avg       0.22      0.50      0.31    286467
       weighted avg       0.20      0.45     