In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
import warnings
warnings.filterwarnings('ignore')

# ------------------------- 1. Đọc và xử lý dữ liệu -------------------------
df = pd.read_csv(r'D:\dow\project\data\full_dataset_with_gan.csv')

def add_features(df):
    df = df.copy()
    base = [
        'Normalized_Temp', 'Normalized_Vibration', 'Normalized_Pressure',
        'Normalized_Voltage', 'Normalized_Current',
        'FFT_Feature1', 'FFT_Feature2', 'Anomaly_Score'
    ]
    df = df.sort_values(['Sensor_ID', 'Year', 'Month', 'Day', 'Hour', 'Minute'])
    for col in base:
        for w in [3, 7, 15]:
            df[f'{col}_rollmean{w}'] = df.groupby('Sensor_ID')[col].transform(lambda x: x.rolling(w, min_periods=1).mean())
            df[f'{col}_rollstd{w}'] = df.groupby('Sensor_ID')[col].transform(lambda x: x.rolling(w, min_periods=1).std().fillna(0))
    df['Temp_Vibration_ratio'] = df['Normalized_Temp'] / (df['Normalized_Vibration'] + 1e-5)
    df['Delta_Current'] = df['Normalized_Current'].diff().fillna(0)
    return df

df = add_features(df)
drop_cols = ['Sensor_ID', 'Fault_Status', 'Fault_Type', 'Year','Month','Day','Hour','Minute']
feature_cols = [c for c in df.columns if c not in drop_cols]

# ------------------------- 2. Hàm đánh giá -------------------------
def print_metrics(y_true, y_pred, name):
    print(f"\n=== [{name}] ===")
    print("Classification Report:")
    print(classification_report(y_true, y_pred, digits=4))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print("Macro F1:", f1_score(y_true, y_pred, average='macro'))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall:", recall_score(y_true, y_pred))

# ------------------------- 3. Hàm chạy từng kịch bản -------------------------
def run_case(case_name, X_train, y_train, X_test, y_test):
    print(f"\n===== {case_name} =====")
    
    # XGBoost
    xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    xgb.fit(X_train, y_train)
    y_pred_xgb = xgb.predict(X_test)
    print_metrics(y_test, y_pred_xgb, "XGBoost")

    # LightGBM
    lgbm = LGBMClassifier(random_state=42)
    lgbm.fit(X_train, y_train)
    y_pred_lgbm = lgbm.predict(X_test)
    print_metrics(y_test, y_pred_lgbm, "LightGBM")

# ------------------------- 4. Hàm chia theo thiết bị (phụ thuộc) -------------------------
def split_by_device(df, test_size=0.15, val_size=0.15):
    train_list, test_list, val_list = [], [], []
    for device in df['Sensor_ID'].unique():
        sub = df[df['Sensor_ID'] == device]
        train_sub, test_val_sub = train_test_split(sub, test_size=(test_size + val_size), random_state=42, shuffle=True)
        test_sub, val_sub = train_test_split(test_val_sub, test_size=val_size / (test_size + val_size), random_state=42)
        train_list.append(train_sub)
        test_list.append(test_sub)
        val_list.append(val_sub)
    train = pd.concat(train_list)
    test = pd.concat(test_list)
    val = pd.concat(val_list)
    return train, test, val



In [7]:
# ====================== 5. Các kịch bản chia dữ liệu ======================

## CASE 1: Dependent split per device
train, test, val = split_by_device(df)
X_train, y_train = train[feature_cols], train['Fault_Status']
X_test, y_test = test[feature_cols], test['Fault_Status']
run_case("CASE 1: Dependent split per device", X_train, y_train, X_test, y_test)



===== CASE 1: Dependent split per device =====

=== [XGBoost] ===
Classification Report:
              precision    recall  f1-score   support

           0     0.5677    0.7200    0.6348      4964
           1     0.5643    0.3981    0.4668      4522

    accuracy                         0.5665      9486
   macro avg     0.5660    0.5590    0.5508      9486
weighted avg     0.5660    0.5665    0.5547      9486

Confusion Matrix:
 [[3574 1390]
 [2722 1800]]
Accuracy: 0.5665190807505798
Macro F1: 0.5508092391825063
Precision: 0.5642633228840125
Recall: 0.3980539584254755
[LightGBM] [Info] Number of positive: 20482, number of negative: 23784
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.017620 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14790
[LightGBM] [Info] Number of data points in the train set: 44266, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.462703 -> i

In [8]:
## CASE 2: Independent split per device
unique_sensors = df['Sensor_ID'].unique()
np.random.seed(42)
np.random.shuffle(unique_sensors)
n = len(unique_sensors)
train_sensor = unique_sensors[:int(0.7*n)]
test_sensor = unique_sensors[int(0.7*n):int(0.85*n)]
train = df[df['Sensor_ID'].isin(train_sensor)]
test = df[df['Sensor_ID'].isin(test_sensor)]
X_train, y_train = train[feature_cols], train['Fault_Status']
X_test, y_test = test[feature_cols], test['Fault_Status']
run_case("CASE 2: Independent split per device", X_train, y_train, X_test, y_test)



===== CASE 2: Independent split per device =====

=== [XGBoost] ===
Classification Report:
              precision    recall  f1-score   support

           0     0.5654    0.6995    0.6253      5015
           1     0.5419    0.3980    0.4589      4480

    accuracy                         0.5572      9495
   macro avg     0.5536    0.5487    0.5421      9495
weighted avg     0.5543    0.5572    0.5468      9495

Confusion Matrix:
 [[3508 1507]
 [2697 1783]]
Accuracy: 0.5572406529752502
Macro F1: 0.5421283009518304
Precision: 0.5419452887537994
Recall: 0.39799107142857143
[LightGBM] [Info] Number of positive: 20646, number of negative: 23505
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009748 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14790
[LightGBM] [Info] Number of data points in the train set: 44151, number of used features: 58
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.467622 -

In [9]:
## CASE 3: Random split all data
X = df[feature_cols]
y = df['Fault_Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)
run_case("CASE 3: Random split all data", X_train, y_train, X_test, y_test)



===== CASE 3: Random split all data =====

=== [XGBoost] ===
Classification Report:
              precision    recall  f1-score   support

           0     0.5695    0.7201    0.6360     10055
           1     0.5515    0.3873    0.4550      8934

    accuracy                         0.5635     18989
   macro avg     0.5605    0.5537    0.5455     18989
weighted avg     0.5610    0.5635    0.5509     18989

Confusion Matrix:
 [[7241 2814]
 [5474 3460]]
Accuracy: 0.5635367844541577
Macro F1: 0.5455179843167859
Precision: 0.55148230793752
Recall: 0.38728453100514887
[LightGBM] [Info] Number of positive: 20569, number of negative: 23738
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010130 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 14790
[LightGBM] [Info] Number of data points in the train set: 44307, number of used features: 58
[Li