In [11]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
import joblib
import warnings
warnings.filterwarnings('ignore')

result_dir = r'D:/dow/project/evaluation/case3'
os.makedirs(result_dir, exist_ok=True)

# --- 1. Đọc và xử lý dữ liệu ---
df = pd.read_csv(r'D:/dow/project/data/full_dataset_with_gan.csv')

def add_features(df):
    df = df.copy()
    base = [
        'Normalized_Temp', 'Normalized_Vibration', 'Normalized_Pressure',
        'Normalized_Voltage', 'Normalized_Current',
        'FFT_Feature1', 'FFT_Feature2', 'Anomaly_Score'
    ]
    df = df.sort_values(['Sensor_ID', 'Year', 'Month', 'Day', 'Hour', 'Minute'])
    for col in base:
        for w in [3, 7, 15]:
            df[f'{col}_rollmean{w}'] = df.groupby('Sensor_ID')[col].transform(lambda x: x.rolling(w, min_periods=1).mean())
            df[f'{col}_rollstd{w}'] = df.groupby('Sensor_ID')[col].transform(lambda x: x.rolling(w, min_periods=1).std().fillna(0))
    df['Temp_Vibration_ratio'] = df['Normalized_Temp'] / (df['Normalized_Vibration'] + 1e-5)
    df['Delta_Current'] = df['Normalized_Current'].diff().fillna(0)
    return df

df = add_features(df)
drop_cols = ['Sensor_ID', 'Fault_Status', 'Fault_Type', 'Year','Month','Day','Hour','Minute']
feature_cols = [c for c in df.columns if c not in drop_cols]

def split_by_device(df, test_size=0.15, val_size=0.15):
    train_list, test_list, val_list = [], [], []
    for device in df['Sensor_ID'].unique():
        sub = df[df['Sensor_ID'] == device]
        train_sub, test_val_sub = train_test_split(sub, test_size=(test_size + val_size), random_state=42, shuffle=True)
        test_sub, val_sub = train_test_split(test_val_sub, test_size=val_size / (test_size + val_size), random_state=42)
        train_list.append(train_sub)
        test_list.append(test_sub)
        val_list.append(val_sub)
    train = pd.concat(train_list)
    test = pd.concat(test_list)
    val = pd.concat(val_list)
    return train, test, val

def save_metrics(y_true, y_pred, best_params, case, model, model_obj):
    report_dict = classification_report(y_true, y_pred, digits=4, output_dict=True)
    report_df = pd.DataFrame(report_dict).transpose()
    report_file = os.path.join(result_dir, f"{case}_{model}_report.csv")
    report_df.to_csv(report_file)
    cm = confusion_matrix(y_true, y_pred)
    pd.DataFrame(cm).to_csv(os.path.join(result_dir, f"{case}_{model}_confusion_matrix.csv"), index=False, header=False)
    with open(os.path.join(result_dir, f"{case}_{model}_best_params.txt"), "w", encoding="utf-8") as f:
        f.write(str(best_params))
    # Save model
    joblib.dump(model_obj, os.path.join(result_dir, f"{case}_{model}.pkl"))

In [12]:
def print_metrics(y_true, y_pred, name):
    print(f"\n=== [{name}] ===")
    print("Classification Report:")
    print(classification_report(y_true, y_pred, digits=4))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Macro F1:", f1_score(y_true, y_pred, average='macro'))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))

def run_case(case_name, X_train, y_train, X_test, y_test):
    print(f"\n===== {case_name} =====")
    # ---- Tuning XGBoost ----
    xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    xgb_param = {
        'max_depth': [3, 5, 7],
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1, 0.2],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'scale_pos_weight': [1.0, float(sum(y_train==0))/sum(y_train==1)]
    }
    grid_xgb = GridSearchCV(xgb, xgb_param, scoring='f1_macro', n_jobs=-1, cv=3, verbose=1)
    grid_xgb.fit(X_train, y_train)
    print("Best XGBoost params:", grid_xgb.best_params_)
    y_pred_xgb = grid_xgb.predict(X_test)
    print_metrics(y_test, y_pred_xgb, "XGBoost (tuned)")
    save_metrics(y_test, y_pred_xgb, grid_xgb.best_params_, case_name, "xgboost", grid_xgb.best_estimator_)

    # ---- Tuning LightGBM ----
    lgbm = LGBMClassifier(random_state=42)
    lgbm_param = {
        'max_depth': [3, 5, 7],
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1, 0.2],
        'subsample': [0.8, 1.0],
        'colsample_bytree': [0.8, 1.0],
        'scale_pos_weight': [1.0, float(sum(y_train==0))/sum(y_train==1)]
    }
    grid_lgbm = GridSearchCV(lgbm, lgbm_param, scoring='f1_macro', n_jobs=-1, cv=3, verbose=1)
    grid_lgbm.fit(X_train, y_train)
    print("Best LightGBM params:", grid_lgbm.best_params_)
    y_pred_lgbm = grid_lgbm.predict(X_test)
    print_metrics(y_test, y_pred_lgbm, "LightGBM (tuned)")
    save_metrics(y_test, y_pred_lgbm, grid_lgbm.best_params_, case_name, "lightgbm", grid_lgbm.best_estimator_)


In [13]:
# -- CASE 1: Dependent split per device
train, test, val = split_by_device(df)
X_train, y_train = train[feature_cols], train['Fault_Status']
X_test, y_test = test[feature_cols], test['Fault_Status']
run_case("case1_dependent_per_device", X_train, y_train, X_test, y_test)


===== case1_dependent_per_device =====
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best XGBoost params: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'scale_pos_weight': 1.1612147251244995, 'subsample': 1.0}

=== [XGBoost (tuned)] ===
Classification Report:
              precision    recall  f1-score   support

           0     0.5784    0.7351    0.6474      4964
           1     0.5861    0.4118    0.4837      4522

    accuracy                         0.5810      9486
   macro avg     0.5822    0.5734    0.5655      9486
weighted avg     0.5821    0.5810    0.5694      9486

Confusion Matrix:
 [[3649 1315]
 [2660 1862]]
Accuracy: 0.5809614168247944
Macro F1: 0.5655433724580133
Precision: 0.5860875039345295
Recall: 0.4117647058823529
Fitting 3 folds for each of 144 candidates, totalling 432 fits
[LightGBM] [Info] Number of positive: 20482, number of negative: 23784
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the

In [14]:
# -- CASE 2: Independent split per device
unique_sensors = df['Sensor_ID'].unique()
np.random.seed(42)
np.random.shuffle(unique_sensors)
n = len(unique_sensors)
train_sensor = unique_sensors[:int(0.7*n)]
test_sensor = unique_sensors[int(0.7*n):int(0.85*n)]
train = df[df['Sensor_ID'].isin(train_sensor)]
test = df[df['Sensor_ID'].isin(test_sensor)]
X_train, y_train = train[feature_cols], train['Fault_Status']
X_test, y_test = test[feature_cols], test['Fault_Status']
run_case("case2_independent_per_device", X_train, y_train, X_test, y_test)


===== case2_independent_per_device =====
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best XGBoost params: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'scale_pos_weight': 1.1384771868642836, 'subsample': 0.8}

=== [XGBoost (tuned)] ===
Classification Report:
              precision    recall  f1-score   support

           0     0.5757    0.7105    0.6360      5015
           1     0.5608    0.4138    0.4762      4480

    accuracy                         0.5705      9495
   macro avg     0.5682    0.5622    0.5561      9495
weighted avg     0.5687    0.5705    0.5606      9495

Confusion Matrix:
 [[3563 1452]
 [2626 1854]]
Accuracy: 0.5705107951553449
Macro F1: 0.5561311265205364
Precision: 0.5607985480943739
Recall: 0.4138392857142857
Fitting 3 folds for each of 144 candidates, totalling 432 fits
[LightGBM] [Info] Number of positive: 20646, number of negative: 23505
[LightGBM] [Info] Auto-choosing col-wise multi-threading, t

In [15]:
# -- CASE 3: Random split all data
X = df[feature_cols]
y = df['Fault_Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)
run_case("case3_random_split_all_data", X_train, y_train, X_test, y_test)


===== case3_random_split_all_data =====
Fitting 3 folds for each of 144 candidates, totalling 432 fits
Best XGBoost params: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'scale_pos_weight': 1.154066799552725, 'subsample': 0.8}

=== [XGBoost (tuned)] ===
Classification Report:
              precision    recall  f1-score   support

           0     0.5787    0.7185    0.6411     10055
           1     0.5650    0.4113    0.4761      8934

    accuracy                         0.5740     18989
   macro avg     0.5718    0.5649    0.5586     18989
weighted avg     0.5723    0.5740    0.5635     18989

Confusion Matrix:
 [[7225 2830]
 [5259 3675]]
Accuracy: 0.574016535889199
Macro F1: 0.5585890330054701
Precision: 0.5649500384319754
Recall: 0.41134989926124915
Fitting 3 folds for each of 144 candidates, totalling 432 fits
[LightGBM] [Info] Number of positive: 20569, number of negative: 23738
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the