## 📌 Notebook 1: Sinh dữ liệu lỗi bằng CTGAN (10k lỗi, đúng tỷ lệ, seed cố định)

In [None]:
!pip install -q ctgan pandas scikit-learn

In [None]:
import io
import random

import numpy as np
import pandas as pd
import torch
from ctgan import CTGAN
from google.colab import files
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Fix the seed of every random number generator used in this notebook
# (Python's `random`, NumPy's global RNG, and PyTorch) so runs are repeatable.
SEED = 42
for seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    seed_rng(SEED)

In [None]:
# Upload the original dataset through the Colab file picker.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; please select the source CSV file.")

# files.upload() returns a dict of {filename: file content as bytes}.
# pd.read_csv needs a path or a file-like object, so wrap the raw bytes of the
# first uploaded file in an in-memory buffer instead of passing them directly.
csv_name = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[csv_name]))

# Assemble one datetime column from the separate date/time component columns.
df['Timestamp'] = pd.to_datetime(df[['Year', 'Month', 'Day', 'Hour', 'Minute']])

In [None]:
# Prepare the original fault records that CTGAN will be trained on.
cat_cols = ['Fault_Type']
num_cols = ['Temperature', 'Vibration', 'Pressure', 'Voltage', 'Current',
            'FFT_Feature1', 'FFT_Feature2', 'Anomaly_Score']
used_cols = cat_cols + num_cols

# Work on a copy of the fault rows so the raw frame `df` is left untouched.
df_fault = df.loc[df['Fault_Status'] == 1].copy()

# Integer-encode each categorical column; the fitted encoders are kept so the
# generated codes can be mapped back to the original labels later.
label_encoders = {col: LabelEncoder().fit(df_fault[col]) for col in cat_cols}
for col, encoder in label_encoders.items():
    df_fault[col] = encoder.transform(df_fault[col])

# Standardize each numeric column with its own scaler; the fitted scalers are
# kept so generated values can be converted back to the original units.
scalers = {col: StandardScaler().fit(df_fault[[col]]) for col in num_cols}
for col, col_scaler in scalers.items():
    df_fault[col] = col_scaler.transform(df_fault[[col]])

In [None]:
# Train CTGAN on the prepared fault rows and draw synthetic samples.
ctgan = CTGAN(epochs=300)
ctgan.fit(df_fault[used_cols], discrete_columns=cat_cols)
samples = ctgan.sample(10000)

# Map categorical codes back to their original labels. Codes are rounded and
# clipped into the encoder's valid range before decoding.
for col in cat_cols:
    encoder = label_encoders[col]
    max_code = len(encoder.classes_) - 1
    codes = np.round(samples[col]).astype(int)
    samples[col] = encoder.inverse_transform(np.clip(codes, 0, max_code))

# Undo the standardization so numeric columns are back on their original scale.
for col in num_cols:
    col_scaler = scalers[col]
    samples[col] = col_scaler.inverse_transform(samples[[col]])

In [None]:
# Assign evenly spread calendar dates, a random time of day and a random
# Sensor_ID to every generated row, then flag all rows as faults.
# The row count is taken from `samples` itself rather than a hard-coded 10000,
# so this cell stays correct if the number of generated rows changes.
n_samples = len(samples)

# One entry per calendar day covered by the original data.
date_range = pd.date_range(df['Timestamp'].min().date(), df['Timestamp'].max().date(), freq='D')
if len(date_range) == 0:
    raise ValueError("The original data covers no calendar days; cannot assign dates.")

samples['Sensor_ID'] = np.random.choice(df['Sensor_ID'].unique(), size=n_samples)

# Cycle through the days in order (0, 1, ..., D-1, 0, 1, ...) so rows are
# spread evenly over the date range; no temporary columns are needed.
day_index = np.arange(n_samples) % len(date_range)
sample_dates = pd.Series(date_range[day_index], index=samples.index)
samples['Year'] = sample_dates.dt.year
samples['Month'] = sample_dates.dt.month
samples['Day'] = sample_dates.dt.day

# Hour is drawn before Minute to keep the seeded random sequence unchanged.
samples['Hour'] = np.random.randint(0, 24, size=n_samples)
samples['Minute'] = np.random.randint(0, 60, size=n_samples)
samples['Fault_Status'] = 1

In [None]:
# Save the generated fault rows to CSV (without the index column) and
# trigger a browser download of that file from Colab.
output_csv = "ctgan_generated_faults_only.csv"
samples.to_csv(output_csv, index=False)
files.download(output_csv)