# 03 - Model Training 🤖

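> Huấn luyện và tinh chỉnh 3 mô hình cơ sở (Random Forest, LightGBM, Neural Network) bằng BayesSearchCV kết hợp SMOTE, sau đó kết hợp chúng bằng Stacking. Đánh giá sơ bộ bằng điểm F1 qua kiểm định chéo và lưu các mô hình để sử dụng ở bước sau.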


## Import thư viện

In [12]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
# Import các công cụ từ scikit-optimize và imblearn
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline # Đổi tên để tránh trùng lặp
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

import joblib
import lightgbm as lgb
import seaborn as sns
import matplotlib.pyplot as plt

## Tải dữ liệu đã tách

In [13]:
# Feature matrices written by the earlier train/test split step.
X_train, X_test = (
    pd.read_csv("../data/processed/X_train.csv"),
    pd.read_csv("../data/processed/X_test.csv"),
)
# Label files are collapsed with squeeze() so they can be passed as targets
# (presumably each file holds a single label column -- confirm upstream).
y_train, y_test = (
    pd.read_csv("../data/processed/y_train.csv").squeeze(),
    pd.read_csv("../data/processed/y_test.csv").squeeze(),
)
# One shared cross-validation splitter, reused by every search and by the
# stacking step. Stratified folds keep the class ratio in each fold, which
# suits the imbalanced target.
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [14]:

# Quick sanity check: show the shape of the training and test feature frames.
for _label, _frame in (
    ("✅ Dữ liệu huấn luyện:", X_train),
    ("✅ Dữ liệu kiểm tra:", X_test),
):
    print(_label, _frame.shape)

✅ Dữ liệu huấn luyện: (9712, 18)
✅ Dữ liệu kiểm tra: (2428, 18)


In [15]:
# Directory that receives the serialized models. It is created if missing and
# left as-is when it already exists.
models_dir = '../models'
os.makedirs(name=models_dir, exist_ok=True)

## 1: Huấn luyện Random Forest

In [None]:
# Random Forest: Bayesian hyper-parameter search over a SMOTE + classifier
# pipeline, scored with F1 on the shared stratified CV splitter.

# Pipeline with an oversampling step followed by the forest. Keeping SMOTE
# inside the pipeline means it is refitted together with the model on each
# training split.
pipeline_rf = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(random_state=42)),
])

# Search ranges; the `classifier__` prefix addresses the pipeline step of
# that name.
search_space_rf = dict(
    classifier__n_estimators=Integer(200, 800),
    classifier__max_depth=Integer(10, 50),
    classifier__min_samples_split=Integer(2, 10),
    classifier__min_samples_leaf=Integer(1, 5),
    classifier__max_features=Categorical(['sqrt', 'log2', 0.6, 0.8]),
    classifier__class_weight=Categorical([None, 'balanced', 'balanced_subsample']),
)

# 32 search iterations, 5-fold CV each, run on all available cores.
bayes_search_rf = BayesSearchCV(
    estimator=pipeline_rf,
    search_spaces=search_space_rf,
    n_iter=32,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

print("Bắt đầu tinh chỉnh cho Random Forest...")
bayes_search_rf.fit(X_train, y_train)

# Keep the best pipeline found by the search (later reused by the stacking step).
best_rf_model = bayes_search_rf.best_estimator_

print("\nCác tham số tốt nhất cho Random Forest:")
print(bayes_search_rf.best_params_)
print(f"Điểm F1 tốt nhất: {bayes_search_rf.best_score_}")

# Serialize the winning pipeline next to the other models.
rf_model_path = os.path.join(models_dir, 'best_rf_model.pkl')
joblib.dump(best_rf_model, rf_model_path)
print(f"Đã lưu mô hình Random Forest tốt nhất tại: {rf_model_path}")

Bắt đầu tinh chỉnh cho Random Forest...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds 

In [17]:
# Write the tuned Random Forest pipeline to disk and report where it went.
rf_model_path = os.path.join(models_dir, 'best_rf_model.pkl')
joblib.dump(best_rf_model, rf_model_path)
print(f"Đã lưu mô hình Random Forest tốt nhất tại: {rf_model_path}")

Đã lưu mô hình Random Forest tốt nhất tại: ../models\best_rf_model.pkl


In [18]:
# Save the list of training column names so it can be used at predict time
# to arrange incoming features in the same order.
feature_order = X_train.columns.to_list()
joblib.dump(value=feature_order, filename='../models/feature_order.pkl')

['../models/feature_order.pkl']

## 2: Huấn luyện LightGBM

In [None]:
# LightGBM: Bayesian hyper-parameter search over a SMOTE + classifier
# pipeline, scored with F1 on the shared stratified CV splitter.

# Pipeline with an oversampling step followed by the gradient-boosted model.
# `subsample_freq=1` is required for the tuned `subsample` values to matter:
# LightGBM only performs row subsampling (bagging) when the bagging frequency
# is non-zero, and the classifier's default is 0, which silently disables it.
pipeline_lgbm = ImbPipeline([
    ('smote', SMOTE(random_state=42)),
    ('classifier', LGBMClassifier(random_state=42, subsample_freq=1))
])

# Search ranges; the `classifier__` prefix addresses the pipeline step of
# that name.
search_space_lgbm = {
    'classifier__n_estimators': Integer(200, 1500),
    'classifier__learning_rate': Real(0.01, 0.1, 'log-uniform'),
    'classifier__num_leaves': Integer(20, 60),
    'classifier__reg_alpha': Real(0.0, 1.0, 'uniform'),
    'classifier__reg_lambda': Real(0.0, 1.0, 'uniform'),
    'classifier__colsample_bytree': Real(0.6, 1.0, 'uniform'),
    # Row-sampling fraction; effective only because subsample_freq is set above.
    'classifier__subsample': Real(0.6, 1.0, 'uniform')
}

# 50 search iterations, 5-fold CV each, run on all available cores.
bayes_search_lgbm = BayesSearchCV(
    estimator=pipeline_lgbm,
    search_spaces=search_space_lgbm,
    n_iter=50,
    cv=cv_strategy,
    n_jobs=-1,
    verbose=2,
    scoring='f1',
    random_state=42
)
print("\nBắt đầu tinh chỉnh cho LightGBM...")
bayes_search_lgbm.fit(X_train, y_train)

# Keep the best pipeline found by the search (later reused by the stacking step).
best_lgbm_model = bayes_search_lgbm.best_estimator_
print("\nCác tham số tốt nhất cho LightGBM:")
print(bayes_search_lgbm.best_params_)
print(f"Điểm F1 tốt nhất: {bayes_search_lgbm.best_score_}")

# Serialize the winning pipeline next to the other models.
lgbm_model_path = os.path.join(models_dir, 'best_lgbm_model.pkl')
joblib.dump(best_lgbm_model, lgbm_model_path)
print(f"Đã lưu mô hình LightGBM tốt nhất tại: {lgbm_model_path}")


Bắt đầu tinh chỉnh cho LightGBM...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for 

In [20]:

# Save the training column names alongside the LightGBM model so the same
# column order is available when loading it.
feature_order = X_train.columns.to_list()
joblib.dump(value=feature_order, filename='../models/feature_order_lightgbm.pkl')

['../models/feature_order_lightgbm.pkl']

## 3: Huấn luyện Neural Network (MLPClassifier)

In [None]:
# Neural network: Bayesian hyper-parameter search over a
# scaler + SMOTE + MLP pipeline, scored with F1 on the shared CV splitter.

# The network architecture is fixed at (128, 64, 32) hidden units; only the
# activation, L2 penalty and initial learning rate are searched.
mlp_classifier = MLPClassifier(
    hidden_layer_sizes=(128, 64, 32),
    max_iter=1000,
    early_stopping=True,
    random_state=42,
)

# Features are standardized first, then oversampled, then fed to the MLP.
pipeline_mlp = ImbPipeline(steps=[
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('classifier', mlp_classifier),
])

# Search ranges; the `classifier__` prefix addresses the pipeline step of
# that name.
search_space_mlp = dict(
    classifier__activation=Categorical(['relu', 'tanh']),
    classifier__alpha=Real(1e-5, 1e-1, 'log-uniform'),
    classifier__learning_rate_init=Real(1e-4, 1e-1, 'log-uniform'),
)

# 32 search iterations, 5-fold CV each, run on all available cores.
bayes_search_mlp = BayesSearchCV(
    estimator=pipeline_mlp,
    search_spaces=search_space_mlp,
    n_iter=32,
    scoring='f1',
    cv=cv_strategy,
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

print("\nBắt đầu tinh chỉnh cho MLP...")
bayes_search_mlp.fit(X_train, y_train)

# Keep the best pipeline found by the search (later reused by the stacking step).
best_mlp_model = bayes_search_mlp.best_estimator_

print("\nCác tham số tốt nhất cho MLP:")
print(bayes_search_mlp.best_params_)
print(f"Điểm F1 tốt nhất trên tập CV: {bayes_search_mlp.best_score_:.4f}")

# Serialize the winning pipeline next to the other models.
mlp_model_path = os.path.join(models_dir, 'best_mlp_model.pkl')
joblib.dump(best_mlp_model, mlp_model_path)
print(f"Đã lưu mô hình MLP tốt nhất tại: {mlp_model_path}")


Bắt đầu tinh chỉnh cho MLP...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each 

## 4: Stacking 

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

# Base learners: the three tuned pipelines, each paired with a short name.
estimators = list(zip(
    ('rf', 'lgbm', 'mlp'),
    (best_rf_model, best_lgbm_model, best_mlp_model),
))

# A logistic regression combines the base learners' predictions; the same
# stratified splitter used for tuning is passed as the stacker's `cv`.
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    cv=cv_strategy,
)

print("\nBắt đầu huấn luyện mô hình Stacking...")
stacking_model.fit(X_train, y_train)
print("Hoàn thành!")

# Serialize the fitted ensemble next to the individual models.
stacking_model_path = os.path.join(models_dir, 'stacking_model.pkl')
joblib.dump(stacking_model, stacking_model_path)

print(f"Đã lưu thành công mô hình Stacking cuối cùng tại: {stacking_model_path}")

# Also store the training column names for the stacking model.
feature_order = X_train.columns.to_list()
joblib.dump(value=feature_order, filename='../models/feature_stacking.pkl')


Bắt đầu huấn luyện mô hình Stacking...
[LightGBM] [Info] Number of positive: 6643, number of negative: 6643
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000771 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2648
[LightGBM] [Info] Number of data points in the train set: 13286, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 5314, number of negative: 5314
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000514 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2649
[LightGBM] [Info] Number of data points in the train set: 10628, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 5314, number of negative: 5314
[LightGBM] [Info]

['../models/feature_stacking.pkl']