In [6]:
# 0. Mounting Google Drive
from google.colab import drive
drive.mount('/content/drive')

# 1. Import Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.metrics import classification_report, roc_auc_score
from scipy.stats.mstats import winsorize
from imblearn.over_sampling import SMOTE

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

# 2. Set Seed untuk Konsistensi
SEED = 44
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = '1'

# 3. Load Dataset
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Skripsi/Dataset/diabetes_012_health_indicators_BRFSS2015.csv')
X = data.drop('Diabetes_012', axis=1)
y = data['Diabetes_012']

# 4. Preprocessing
## 4.1 Winsorizing untuk fitur BMI (0.5%-99.5%)
X['BMI'] = winsorize(X['BMI'], limits=[0.005, 0.005])

## 4.2 Cap untuk fitur MentHlth & PhysHlth (batas atas 30)
X['MentHlth'] = np.where(X['MentHlth'] > 30, 30, X['MentHlth'])
X['PhysHlth'] = np.where(X['PhysHlth'] > 30, 30, X['PhysHlth'])

## 4.3 Definisi fitur scaler
robust_features = ['BMI', 'MentHlth', 'PhysHlth']
minmax_features = ['Age', 'Education', 'Income', 'GenHlth']

## 4.4 Terapkan Scalers
scaler_robust = RobustScaler()
scaler_minmax = MinMaxScaler()

X[robust_features] = scaler_robust.fit_transform(X[robust_features])
X[minmax_features] = scaler_minmax.fit_transform(X[minmax_features])


# 6. Split Data (Train/Test)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# 6.1 Simpan Split Data Awal (Sebelum SMOTE)
# X_train.to_csv('/content/drive/MyDrive/Colab Notebooks/Skripsi/Dataset/Hasil Eksperimen/X_train_before_smote.csv', index=False)
# y_train.to_frame().to_csv('/content/drive/MyDrive/Colab Notebooks/Skripsi/Dataset/Hasil Eksperimen/y_train_before_smote.csv', index=False)
# X_test.to_csv('/content/drive/MyDrive/Colab Notebooks/Skripsi/Dataset/Hasil Eksperimen/X_test.csv', index=False)
# y_test.to_frame().to_csv('/content/drive/MyDrive/Colab Notebooks/Skripsi/Dataset/Hasil Eksperimen/y_test.csv', index=False)

# 7. SMOTE on Train Data Only
smote = SMOTE(random_state=SEED)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# 7.1 Simpan Dataset Hasil SMOTE
df_train_smote = pd.DataFrame(X_train_smote, columns=X_train.columns)
df_train_smote['Diabetes_012'] = y_train_smote
df_train_smote.to_csv('/content/drive/MyDrive/Colab Notebooks/Skripsi/Dataset/Output DNN+SMOTE.csv', index=False)

# 8. One-hot Encoding
y_train_encoded = to_categorical(y_train_smote)
y_test_encoded = to_categorical(y_test)

# 9. Split Train → Train Main & Validation
X_train_main, X_val, y_train_main, y_val = train_test_split(
    X_train_smote, y_train_encoded, test_size=0.2, random_state=SEED, stratify=y_train_smote
)

# 10. Build DNN Model (Tanpa Self-Attention)
model = Sequential([
    Dense(64, activation='relu', input_shape=(X.shape[1],)),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(3, activation='softmax')
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)


# 11. Train Model with EarlyStopping
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(
    X_train_main, y_train_main,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=50,
    callbacks=[early_stop],
    verbose=1
)

# 12. Evaluate on Test Set
y_pred_prob = model.predict(X_test)
y_pred_class = np.argmax(y_pred_prob, axis=1)
y_true_class = np.argmax(y_test_encoded, axis=1)

# 13. Metrik Evaluasi
print("\n=== Classification Report (Macro) Model DNN + SMOTE===")
print(classification_report(y_true_class, y_pred_class, digits=4))
roc_auc = roc_auc_score(y_test_encoded, y_pred_prob, average='macro', multi_class='ovr')
print(f"\nMacro-average ROC AUC: {roc_auc:.4f}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8207/8207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 3ms/step - accuracy: 0.5269 - loss: 0.9425 - val_accuracy: 0.5740 - val_loss: 0.8851
Epoch 2/100
[1m8207/8207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 2ms/step - accuracy: 0.5825 - loss: 0.8772 - val_accuracy: 0.6097 - val_loss: 0.8401
Epoch 3/100
[1m8207/8207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 3ms/step - accuracy: 0.6169 - loss: 0.8328 - val_accuracy: 0.6314 - val_loss: 0.8080
Epoch 4/100
[1m8207/8207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 3ms/step - accuracy: 0.6354 - loss: 0.8027 - val_accuracy: 0.6419 - val_loss: 0.7896
Epoch 5/100
[1m8207/8207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 2ms/step - accuracy: 0.6465 - loss: 0.7823 - val_accuracy: 0.6444 - val_loss: 0.7838
Epoch 6/100
[1m8207/8207[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 3ms/step - accuracy: 0.6540 - loss: 0.7683 - val_accuracy: 0.6447 - val_loss: 0.7817
Epoch 7/100
[1m