In [4]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, f1_score, confusion_matrix, accuracy_score
import joblib
import warnings
warnings.filterwarnings('ignore')

# === 1. Đọc dữ liệu và tạo feature engineering ===
df = pd.read_csv("D:/dow/project/data/full_dataset_with_gan.csv")
base = [
    'Normalized_Temp', 'Normalized_Vibration', 'Normalized_Pressure',
    'Normalized_Voltage', 'Normalized_Current',
    'FFT_Feature1', 'FFT_Feature2', 'Anomaly_Score'
]
df = df.sort_values(['Sensor_ID', 'Year', 'Month', 'Day', 'Hour', 'Minute'])

for col in base:
    for w in [3, 7, 15, 30]:
        df[f'{col}_rollmean{w}'] = df.groupby('Sensor_ID')[col].transform(lambda x: x.rolling(w, min_periods=1).mean())
        df[f'{col}_rollstd{w}']  = df.groupby('Sensor_ID')[col].transform(lambda x: x.rolling(w, min_periods=1).std().fillna(0))
    df[f'{col}_delta'] = df.groupby('Sensor_ID')[col].diff().fillna(0)
df['fault_rate_50'] = df.groupby('Sensor_ID')['Fault_Status'].transform(lambda x: x.rolling(50, min_periods=1).mean())

drop_cols = ['Sensor_ID', 'Fault_Status', 'Fault_Type', 'Year','Month','Day','Hour','Minute']
feature_cols = [c for c in df.columns if c not in drop_cols]

# === 2. Chia train/test ===
X = df[feature_cols].fillna(0)
y = df['Fault_Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# === 3. Tính scale_pos_weight cho dữ liệu mất cân bằng ===
n0 = (y_train == 0).sum()
n1 = (y_train == 1).sum()
scale_pos_weight = n0 / n1
print("scale_pos_weight =", round(scale_pos_weight, 2))

# === 4. Tạo thư mục lưu kết quả ===
result_dir = r"D:/dow/project/evaluation/feature"
os.makedirs(result_dir, exist_ok=True)

# === 5. Huấn luyện và đánh giá XGBoost ===
model_xgb = XGBClassifier(
    use_label_encoder=False, eval_metric='logloss', random_state=42, 
    n_estimators=400, learning_rate=0.13, max_depth=6, subsample=0.93, 
    colsample_bytree=0.71, scale_pos_weight=scale_pos_weight
)
model_xgb.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_test)

# Báo cáo đánh giá XGBoost (in ra console)
print("\n=== XGBoost Evaluation ===")
print(classification_report(y_test, y_pred_xgb, digits=4))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_xgb))
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("Macro F1:", f1_score(y_test, y_pred_xgb, average='macro'))

# Báo cáo đánh giá XGBoost (lưu file csv)
xgb_report_dict = classification_report(y_test, y_pred_xgb, digits=4, output_dict=True)
xgb_report_df = pd.DataFrame(xgb_report_dict).transpose()
xgb_report_df.to_csv(os.path.join(result_dir, "xgb_feature_report.csv"))
pd.DataFrame(confusion_matrix(y_test, y_pred_xgb)).to_csv(
    os.path.join(result_dir, "xgb_feature_confusion_matrix.csv"), index=False, header=False)

# Lưu model XGBoost
joblib.dump(model_xgb, os.path.join(result_dir, "xgb_feature_model.pkl"))

# Feature importance XGBoost (vừa in vừa lưu ảnh/csv)
xgb_fi_df = pd.DataFrame({
    "feature": feature_cols,
    "importance": model_xgb.feature_importances_
}).sort_values("importance", ascending=False)

print("\nTop 15 XGBoost Feature Importances:")
print(xgb_fi_df.head(15))
xgb_fi_df.to_csv(os.path.join(result_dir, "xgb_feature_importance.csv"), index=False)

plt.figure(figsize=(7, 9))
plt.barh(xgb_fi_df['feature'].head(20)[::-1], xgb_fi_df['importance'].head(20)[::-1])
plt.title("XGBoost Feature Importances (Top 20)")
plt.tight_layout()
plt.savefig(os.path.join(result_dir, "xgb_feature_importance.png"))
plt.close()

# === 6. Huấn luyện và đánh giá LightGBM ===
model_lgbm = LGBMClassifier(
    random_state=42, n_estimators=400, learning_rate=0.13, max_depth=6,
    subsample=0.93, colsample_bytree=0.71, scale_pos_weight=scale_pos_weight
)
model_lgbm.fit(X_train, y_train)
y_pred_lgbm = model_lgbm.predict(X_test)

print("\n=== LightGBM Evaluation ===")
print(classification_report(y_test, y_pred_lgbm, digits=4))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lgbm))
print("Accuracy:", accuracy_score(y_test, y_pred_lgbm))
print("Macro F1:", f1_score(y_test, y_pred_lgbm, average='macro'))

lgbm_report_dict = classification_report(y_test, y_pred_lgbm, digits=4, output_dict=True)
lgbm_report_df = pd.DataFrame(lgbm_report_dict).transpose()
lgbm_report_df.to_csv(os.path.join(result_dir, "lgbm_feature_report.csv"))
pd.DataFrame(confusion_matrix(y_test, y_pred_lgbm)).to_csv(
    os.path.join(result_dir, "lgbm_feature_confusion_matrix.csv"), index=False, header=False)

joblib.dump(model_lgbm, os.path.join(result_dir, "lgbm_feature_model.pkl"))

lgbm_fi_df = pd.DataFrame({
    "feature": feature_cols,
    "importance": model_lgbm.feature_importances_
}).sort_values("importance", ascending=False)

print("\nTop 15 LightGBM Feature Importances:")
print(lgbm_fi_df.head(15))
lgbm_fi_df.to_csv(os.path.join(result_dir, "lgbm_feature_importance.csv"), index=False)

plt.figure(figsize=(7, 9))
plt.barh(lgbm_fi_df['feature'].head(20)[::-1], lgbm_fi_df['importance'].head(20)[::-1])
plt.title("LightGBM Feature Importances (Top 20)")
plt.tight_layout()
plt.savefig(os.path.join(result_dir, "lgbm_feature_importance.png"))
plt.close()


scale_pos_weight = 1.15

=== XGBoost Evaluation ===
              precision    recall  f1-score   support

           0     0.6073    0.6593    0.6322      6759
           1     0.5673    0.5118    0.5381      5901

    accuracy                         0.5905     12660
   macro avg     0.5873    0.5855    0.5852     12660
weighted avg     0.5887    0.5905    0.5884     12660

Confusion Matrix:
 [[4456 2303]
 [2881 3020]]
Accuracy: 0.590521327014218
Macro F1: 0.5851843342019655

Top 15 XGBoost Feature Importances:
                           feature  importance
80                   fault_rate_50    0.048574
7                    Anomaly_Score    0.025070
5                     FFT_Feature1    0.023519
6                     FFT_Feature2    0.021824
1             Normalized_Vibration    0.021146
2              Normalized_Pressure    0.013962
3               Normalized_Voltage    0.012937
0                  Normalized_Temp    0.012774
25      Normalized_Vibration_delta    0.012427
49    Norma