## IMPORT LIBRARY

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

## LOAD DATA

In [None]:
# Load dataset dari path
file_path = '/content/diabetes_prediction_dataset.csv'
df = pd.read_csv(file_path)

# Tampilkan 5 baris
print(df.head())

   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [None]:
df.describe()

Unnamed: 0,age,hypertension,heart_disease,bmi,HbA1c_level,blood_glucose_level,diabetes
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,41.885856,0.07485,0.03942,27.320767,5.527507,138.05806,0.085
std,22.51684,0.26315,0.194593,6.636783,1.070672,40.708136,0.278883
min,0.08,0.0,0.0,10.01,3.5,80.0,0.0
25%,24.0,0.0,0.0,23.63,4.8,100.0,0.0
50%,43.0,0.0,0.0,27.32,5.8,140.0,0.0
75%,60.0,0.0,0.0,29.58,6.2,159.0,0.0
max,80.0,1.0,1.0,95.69,9.0,300.0,1.0


In [None]:
# Jumlah baris (data) dan kolom
rows, cols = df.shape
print(f"Jumlah data (baris): {rows}")
print(f"Jumlah fitur (kolom): {cols}")

Jumlah data (baris): 100000
Jumlah fitur (kolom): 9


## PREPROCESSING

In [None]:
df.isnull().sum()  # cek jumlah nilai yang hilang
df.dropna(inplace=True)  # atau gunakan imputasi jika sesuai

In [None]:
df.duplicated()

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
99995,True
99996,False
99997,False
99998,False


In [None]:
print(f"Jumlah baris sebelum: {df.shape[0]}")
print(f"Duplikat persis: {df.duplicated().sum()}")

Jumlah baris sebelum: 100000
Duplikat persis: 3854


In [None]:
# Cek duplikat persis
duplicates = df[df.duplicated()]
print("Duplikat persis yang ditemukan:", len(duplicates))

# Jika ingin aman: hanya hapus jika semua kolom, termasuk target, sama
df = df.drop_duplicates()

Duplikat persis yang ditemukan: 3854


In [None]:
from sklearn.preprocessing import LabelEncoder

# Otomatis encode semua kolom kategorikal
categorical_cols = df.select_dtypes(include='object').columns

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # simpan encoder jika perlu inverse_transform

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = le.fit_transform(df[col])


In [None]:
X = df.drop('diabetes', axis=1)
y = df['diabetes']

## SPLIT DATA

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## MODELLING & EVALUASI

In [None]:
model = keras.Sequential([
    layers.Dense(16, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(8, activation='relu'),
    layers.Dense(1, activation='sigmoid')  # Karena ini binary classification
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Buat callback EarlyStopping
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,           # Berhenti jika tidak ada perbaikan selama 5 epoch
    restore_best_weights=True
)

# Latih model dengan callback
model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stop],  # Tambahkan di sini
    verbose=1
)

Epoch 1/100
[1m1923/1923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9035 - loss: 0.2675 - val_accuracy: 0.9595 - val_loss: 0.1166
Epoch 2/100
[1m1923/1923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9581 - loss: 0.1211 - val_accuracy: 0.9604 - val_loss: 0.1141
Epoch 3/100
[1m1923/1923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.9591 - loss: 0.1162 - val_accuracy: 0.9626 - val_loss: 0.1111
Epoch 4/100
[1m1923/1923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9592 - loss: 0.1145 - val_accuracy: 0.9631 - val_loss: 0.1096
Epoch 5/100
[1m1923/1923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - accuracy: 0.9609 - loss: 0.1115 - val_accuracy: 0.9648 - val_loss: 0.1058
Epoch 6/100
[1m1923/1923[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 3ms/step - accuracy: 0.9619 - loss: 0.1098 - val_accuracy: 0.9659 - val_loss: 0.1010
Epoch 7/10

<keras.src.callbacks.history.History at 0x7c9e774c1210>

In [None]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Akurasi pada data test: {accuracy:.2f}")

[1m601/601[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9715 - loss: 0.0837
Akurasi pada data test: 0.97


In [None]:
gender = input("Gender (Male/Female): ")
age = int(input("Umur: "))
hypertension = int(input("Hipertensi (0 = Tidak, 1 = Ya): "))
heart_disease = int(input("Penyakit jantung (0 = Tidak, 1 = Ya): "))
smoking_history = input("Riwayat merokok (never/former/current): ")
bmi = float(input("BMI: "))
hba1c = float(input("HbA1c Level: "))
glucose = float(input("Blood Glucose Level: "))

sample = pd.DataFrame([{
    'gender': label_encoders['gender'].transform([gender])[0],
    'age': age,
    'hypertension': hypertension,
    'heart_disease': heart_disease,
    'smoking_history': label_encoders['smoking_history'].transform([smoking_history])[0],
    'bmi': bmi,
    'HbA1c_level': hba1c,
    'blood_glucose_level': glucose
}])

sample_scaled = scaler.transform(sample)
pred = model.predict(sample_scaled)[0][0]

print(f"Probabilitas diabetes: {pred * 100:.2f}%")
print("Prediksi:", "Diabetes" if pred >= 0.5 else "Tidak Diabetes")


Gender (Male/Female): Male
Umur: 50
Hipertensi (0 = Tidak, 1 = Ya): 1
Penyakit jantung (0 = Tidak, 1 = Ya): 1
Riwayat merokok (never/former/current): current
BMI: 26
HbA1c Level: 7.7
Blood Glucose Level: 130
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Probabilitas diabetes: 100.00%
Prediksi: Diabetes
