In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight
import warnings
warnings.filterwarnings('ignore')

# 1. Đọc & tạo feature
df = pd.read_csv(r'D:\dow\project\data\full_dataset_with_gan.csv')

def add_features(df):
    df = df.copy()
    base = [
        'Normalized_Temp', 'Normalized_Vibration', 'Normalized_Pressure',
        'Normalized_Voltage', 'Normalized_Current',
        'FFT_Feature1', 'FFT_Feature2', 'Anomaly_Score'
    ]
    df = df.sort_values(['Sensor_ID', 'Year', 'Month', 'Day', 'Hour', 'Minute'])
    for col in base:
        for w in [3, 7, 15]:
            df[f'{col}_rollmean{w}'] = df.groupby('Sensor_ID')[col].transform(lambda x: x.rolling(w, min_periods=1).mean())
            df[f'{col}_rollstd{w}'] = df.groupby('Sensor_ID')[col].transform(lambda x: x.rolling(w, min_periods=1).std().fillna(0))
    df['Temp_Vibration_ratio'] = df['Normalized_Temp'] / (df['Normalized_Vibration'] + 1e-5)
    df['Delta_Current'] = df['Normalized_Current'].diff().fillna(0)
    return df

df = add_features(df)
drop_cols = ['Sensor_ID', 'Fault_Status', 'Fault_Type', 'Year','Month','Day','Hour','Minute']
feature_cols = [c for c in df.columns if c not in drop_cols]

# 2. In độ đo
def print_metrics(y_true, y_pred, name):
    print(f"\n=== [{name}] ===")
    print("Classification Report:")
    print(classification_report(y_true, y_pred, digits=4))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Macro F1:", f1_score(y_true, y_pred, average='macro'))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))

# 3. GridSearchCV
def tune_xgb(X, y):
    # Tính scale_pos_weight nếu nhị phân, class_weight nếu đa lớp thì tự nhân sample_weight khi fit
    classes = np.unique(y)
    if len(classes) == 2:
        n_neg = np.sum(y == classes[0])
        n_pos = np.sum(y == classes[1])
        scale_pos_weight = n_neg / n_pos if n_pos > 0 else 1.0
    else:
        scale_pos_weight = 1.0  # cho đa lớp
    param_grid = {
        "n_estimators": [100, 150],
        "max_depth": [3, 5, 7],
        "learning_rate": [0.01, 0.05, 0.1],
        "subsample": [0.7, 0.85, 1.0],
        "colsample_bytree": [0.7, 0.85, 1.0],
        "scale_pos_weight": [scale_pos_weight] if len(classes) == 2 else [1.0]
    }
    xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    gs = GridSearchCV(xgb, param_grid, scoring="f1_macro", n_jobs=-1, cv=3, verbose=0)
    gs.fit(X, y)
    print(f"Best XGB params: {gs.best_params_}")
    return gs.best_estimator_

def tune_lgbm(X, y):
    classes = np.unique(y)
    # Có thể truyền class_weight='balanced' hoặc dict
    lgbm = LGBMClassifier(random_state=42, class_weight='balanced')
    param_grid = {
        "n_estimators": [100, 150],
        "max_depth": [3, 5, 7],
        "learning_rate": [0.01, 0.05, 0.1],
        "subsample": [0.7, 0.85, 1.0],
        "colsample_bytree": [0.7, 0.85, 1.0]
    }
    gs = GridSearchCV(lgbm, param_grid, scoring="f1_macro", n_jobs=-1, cv=3, verbose=0)
    gs.fit(X, y)
    print(f"Best LGBM params: {gs.best_params_}")
    return gs.best_estimator_


In [18]:
# 4. Đánh giá với từng kiểu chia
def run_case(case_name, X_train, y_train, X_test, y_test):
    print(f"\n===== {case_name} =====")
    # XGB
    xgb_best = tune_xgb(X_train, y_train)
    y_pred_xgb  = xgb_best.predict(X_test)
    print_metrics(y_test, y_pred_xgb,  "XGBoost Tuned")
    # LGBM
    lgbm_best = tune_lgbm(X_train, y_train)
    y_pred_lgbm = lgbm_best.predict(X_test)
    print_metrics(y_test, y_pred_lgbm, "LightGBM Tuned")
    return {
        'case': case_name,
        'XGB_macro_f1': f1_score(y_test, y_pred_xgb, average='macro'),
        'LGBM_macro_f1': f1_score(y_test, y_pred_lgbm, average='macro'),
        'XGB_accuracy': accuracy_score(y_test, y_pred_xgb),
        'LGBM_accuracy': accuracy_score(y_test, y_pred_lgbm)
    }

# 5. Chia dữ liệu
def split_by_device(df, test_size=0.15, val_size=0.15):
    train_list, test_list, val_list = [], [], []
    for device in df['Sensor_ID'].unique():
        sub = df[df['Sensor_ID'] == device]
        train_sub, test_val_sub = train_test_split(sub, test_size=(test_size + val_size), random_state=42, shuffle=True)
        test_sub, val_sub = train_test_split(test_val_sub, test_size=val_size / (test_size + val_size), random_state=42)
        train_list.append(train_sub)
        test_list.append(test_sub)
        val_list.append(val_sub)
    train = pd.concat(train_list)
    test = pd.concat(test_list)
    val = pd.concat(val_list)
    return train, test, val

# 6. Chạy các CASE
results = []

In [19]:
# CASE 1: Dependent split per device
train, test, val = split_by_device(df)
X_train, y_train = train[feature_cols], train['Fault_Status']
X_test, y_test = test[feature_cols], test['Fault_Status']
results.append(run_case("CASE 1: Dependent split per device", X_train, y_train, X_test, y_test))


===== CASE 1: Dependent split per device =====
Best XGB params: {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 150, 'scale_pos_weight': 1.1612147251244995, 'subsample': 0.85}

=== [XGBoost Tuned] ===
Classification Report:
              precision    recall  f1-score   support

           0     0.5726    0.7510    0.6498      4964
           1     0.5845    0.3846    0.4639      4522

    accuracy                         0.5763      9486
   macro avg     0.5786    0.5678    0.5568      9486
weighted avg     0.5783    0.5763    0.5612      9486

Confusion Matrix:
 [[3728 1236]
 [2783 1739]]
Accuracy: 0.5763230023192073
Macro F1: 0.5568396247387843
Precision: 0.5845378151260504
Recall: 0.3845643520566121
[LightGBM] [Info] Number of positive: 20482, number of negative: 23784
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.012335 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Tot

In [20]:
# CASE 2: Independent split per device
unique_sensors = df['Sensor_ID'].unique()
np.random.seed(42)
np.random.shuffle(unique_sensors)
n = len(unique_sensors)
train_sensor = unique_sensors[:int(0.7*n)]
test_sensor = unique_sensors[int(0.7*n):int(0.85*n)]
train = df[df['Sensor_ID'].isin(train_sensor)]
test = df[df['Sensor_ID'].isin(test_sensor)]
X_train, y_train = train[feature_cols], train['Fault_Status']
X_test, y_test = test[feature_cols], test['Fault_Status']
results.append(run_case("CASE 2: Independent split per device", X_train, y_train, X_test, y_test))



===== CASE 2: Independent split per device =====
Best XGB params: {'colsample_bytree': 0.85, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150, 'scale_pos_weight': 1.1384771868642836, 'subsample': 0.7}

=== [XGBoost Tuned] ===
Classification Report:
              precision    recall  f1-score   support

           0     0.5781    0.7332    0.6465      5015
           1     0.5732    0.4011    0.4720      4480

    accuracy                         0.5765      9495
   macro avg     0.5757    0.5672    0.5592      9495
weighted avg     0.5758    0.5765    0.5642      9495

Confusion Matrix:
 [[3677 1338]
 [2683 1797]]
Accuracy: 0.5765139547130068
Macro F1: 0.5592343624858398
Precision: 0.5732057416267943
Recall: 0.4011160714285714
[LightGBM] [Info] Number of positive: 20646, number of negative: 23505
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011356 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] To

In [None]:
# CASE 3: Random split all data
X = df[feature_cols]
y = df['Fault_Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)
results.append(run_case("CASE 3: Random split all data", X_train, y_train, X_test, y_test))



===== CASE 3: Random split all data =====


In [None]:
# 7. Tổng hợp & Chọn case tốt nhất
result_df = pd.DataFrame(results)
print("\n==== Tổng hợp macro F1 từng case ====")
print(result_df[['case', 'XGB_macro_f1', 'XGB_accuracy', 'LGBM_macro_f1', 'LGBM_accuracy']])

idx_best = result_df['XGB_macro_f1'].idxmax()
print(f"\n==> Case XGB tốt nhất: {result_df.loc[idx_best, 'case']} với Macro F1 = {result_df.loc[idx_best, 'XGB_macro_f1']:.4f}")

idx_best_lgbm = result_df['LGBM_macro_f1'].idxmax()
print(f"==> Case LGBM tốt nhất: {result_df.loc[idx_best_lgbm, 'case']} với Macro F1 = {result_df.loc[idx_best_lgbm, 'LGBM_macro_f1']:.4f}")
