In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Load the raw dataset from a CSV file located next to the notebook.
DATA_PATH = 'diabetes_prediction_dataset.csv'
data = pd.read_csv(DATA_PATH)

In [3]:
# Peek at the first five rows to sanity-check the parsed columns.
first_rows = data.head()
print(first_rows)

   gender   age  hypertension  heart_disease smoking_history    bmi  \
0  Female  80.0             0              1           never  25.19   
1  Female  54.0             0              0         No Info  27.32   
2    Male  28.0             0              0           never  27.32   
3  Female  36.0             0              0         current  23.45   
4    Male  76.0             1              1         current  20.14   

   HbA1c_level  blood_glucose_level  diabetes  
0          6.6                  140         0  
1          6.6                   80         0  
2          5.7                  158         0  
3          5.0                  155         0  
4          4.8                  155         0  


In [4]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB
None


In [5]:
# Count missing values per column before any cleaning is applied.
missing_per_column = data.isnull().sum()
print(missing_per_column)

gender                 0
age                    0
hypertension           0
heart_disease          0
smoking_history        0
bmi                    0
HbA1c_level            0
blood_glucose_level    0
diabetes               0
dtype: int64


In [6]:
# Remove fully duplicated rows. The result is rebound instead of using
# inplace=True: the resulting frame is the same, but the step stays a plain
# expression with a visible output and avoids the discouraged in-place API.
data = data.drop_duplicates()

In [7]:
# One-hot encode the two text-valued columns so every feature is numeric/boolean.
categorical_columns = ['gender', 'smoking_history']
data = pd.get_dummies(data, columns=categorical_columns)

In [8]:
# 'diabetes' is the target column; every other column is used as a feature.
target_column = 'diabetes'
y = data[target_column]
X = data.drop(columns=[target_column])

In [9]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [10]:
# Report the shape of each split part, in the same order as they were unpacked.
split_parts = (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
)
for label, part in split_parts:
    print(f"Ukuran {label}: {part.shape}")

Ukuran X_train: (76916, 15)
Ukuran X_test: (19230, 15)
Ukuran y_train: (76916,)
Ukuran y_test: (19230,)


- Inisialisasi model Decision Tree

In [11]:
# Decision tree with default hyperparameters; the seed is pinned for repeatable runs.
dtc = DecisionTreeClassifier(
    random_state=42,
)

- Melakukan training pada model

In [12]:
# Train the tree on the training portion only.
dtc.fit(X=X_train, y=y_train)

- Melakukan prediksi pada data testing

In [13]:
# Predict labels for the held-out test rows.
y_pred = dtc.predict(X=X_test)

- Mengevaluasi performa model

In [14]:
# Compute all test-set metrics first, then print them in a fixed order:
# overall accuracy, per-class precision/recall/F1, and the confusion matrix.
accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print(f"Akurasi model: {accuracy}")
print("Classification Report:")
print(report_text)
print("Confusion Matrix:")
print(confusion)

Akurasi model: 0.9496099843993759
Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     17509
           1       0.71      0.74      0.72      1721

    accuracy                           0.95     19230
   macro avg       0.84      0.85      0.85     19230
weighted avg       0.95      0.95      0.95     19230

Confusion Matrix:
[[16989   520]
 [  449  1272]]


- Melakukan 10-fold cross-validation

In [16]:
# 10-fold cross-validation over the full feature matrix and target.
cv_scores = cross_val_score(estimator=dtc, X=X, y=y, cv=10)

- Menampilkan hasil cross-validation

In [17]:
# Show the per-fold scores followed by their mean.
mean_cv_score = cv_scores.mean()
print(f"Cross-validation scores: {cv_scores}")
print(f"Rata-rata cross-validation score: {mean_cv_score}")

Cross-validation scores: [0.95091004 0.95288612 0.9475819  0.95111804 0.94862194 0.9474779
 0.95017683 0.94892865 0.94976077 0.95090493]
Rata-rata cross-validation score: 0.9498367110885809
