In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
# Load the dataset from the CSV file (path is relative to the working directory).
df = pd.read_csv(filepath_or_buffer='Multiclass_Diabetes_Dataset.csv')

In [3]:
# Preview ten randomly drawn rows; no seed is passed, so the rows differ per run.
df.sample(n=10)


Unnamed: 0,Gender,AGE,Urea,Cr,HbA1c,Chol,TG,HDL,LDL,VLDL,BMI,Class
103,1,49,5.0,74,6.2,2.0,0.8,0.6,1.0,0.4,25.0,1
144,0,56,1.9,20,11.7,5.5,5.3,0.9,1.3,1.2,33.0,2
196,1,53,5.9,67,9.9,5.2,1.8,0.9,3.6,0.6,33.0,2
129,1,41,5.9,62,5.9,4.7,5.3,0.9,1.7,2.4,21.0,1
203,1,26,7.5,79,8.8,3.5,2.0,2.0,0.9,0.5,28.0,2
19,0,33,5.3,62,5.0,4.2,1.5,1.2,2.3,0.6,24.0,0
160,1,57,7.4,43,9.0,3.9,1.4,0.7,2.6,0.6,26.0,2
57,1,44,6.8,64,4.9,4.9,2.8,2.0,1.8,1.2,21.0,0
26,0,44,4.3,49,4.0,5.6,1.4,1.4,3.6,0.6,22.0,0
235,1,68,7.4,98,12.9,8.8,4.5,1.1,5.6,2.1,29.0,2


In [4]:
# Target vector: the 'Class' column.
y = df['Class']
# Feature matrix: every remaining column once 'Class' is removed.
x = df.drop(columns=['Class'])

In [5]:
# Hold out 20% of the rows for testing, stratified on the labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Logistic Regression

In [6]:
from sklearn.preprocessing import StandardScaler
# Baseline logistic-regression classifier: fixed seed, at most 100 solver iterations.
model_log = LogisticRegression(
    random_state=42,
    max_iter=100,
)

### Model Training and Evaluation

In [7]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so the test rows never influence the statistics.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [8]:
# Train the baseline model on the standardized training features.
model_log.fit(X=X_train_scaled, y=y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,100


In [9]:
# Predict class labels for the standardized test features.
y_pred_log = model_log.predict(X=X_test_scaled)

In [10]:
# Fraction of test rows whose predicted class matches the true class.
accuracy_score(y_true=y_test, y_pred=y_pred_log)

0.8867924528301887

In [11]:
# Confusion matrix for the baseline model: rows are true classes, columns are predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred_log)

array([[17,  1,  1],
       [ 1,  5,  2],
       [ 1,  0, 25]])

In [12]:
# Build the classification report as a dict and flip it so each class or
# aggregate becomes a row and each metric becomes a column.
df_report_log = pd.DataFrame(
    classification_report(y_test, y_pred_log, output_dict=True)
).T
df_report_log

Unnamed: 0,precision,recall,f1-score,support
0,0.894737,0.894737,0.894737,19.0
1,0.833333,0.625,0.714286,8.0
2,0.892857,0.961538,0.925926,26.0
accuracy,0.886792,0.886792,0.886792,0.886792
macro avg,0.873642,0.827092,0.844983,53.0
weighted avg,0.884546,0.886792,0.882799,53.0


## Grid Search CV

In [13]:
from sklearn.pipeline import Pipeline

In [14]:
# Bundle standardization and logistic regression into one estimator.
# The step name 'model' is what the 'model__' prefix in the parameter grid targets.
pipe = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('model', LogisticRegression()),
    ]
)

In [15]:
# Candidate values for C on the pipeline's 'model' step, from 0.01 up to 100.
param_grid_log = dict(model__C=[0.01, 0.1, 1, 10, 100])

In [16]:
# 5-fold cross-validated search over the grid, scored by accuracy, with n_jobs=-1.
# Fitted on the unscaled training split because the pipeline does its own scaling.
gridsearch = GridSearchCV(pipe, param_grid_log, scoring='accuracy', cv=5, n_jobs=-1)
gridsearch.fit(X=X_train, y=y_train)

0,1,2
,estimator,Pipeline(step...egression())])
,param_grid,"{'model__C': [0.01, 0.1, ...]}"
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [17]:
# Predict on the raw test features with the fitted search object, then print
# the plain-text per-class report.
y_pred_grid = gridsearch.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=y_pred_grid))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89        19
           1       0.83      0.62      0.71         8
           2       0.89      0.96      0.93        26

    accuracy                           0.89        53
   macro avg       0.87      0.83      0.84        53
weighted avg       0.88      0.89      0.88        53



# Random Forest Classification

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [19]:
# Random forest with 99 trees. The seed is fixed so that bootstrap sampling and
# feature sub-sampling, and therefore the reported accuracy, are reproducible
# across runs, in line with the seeded split and the other seeded models.
model_rfc = RandomForestClassifier(n_estimators=99, random_state=42)

In [20]:
# Train the random forest on the unscaled training features.
model_rfc.fit(X=X_train, y=y_train)

0,1,2
,n_estimators,99
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [21]:
# Predict class labels for the unscaled test features with the random forest.
y_pred_rfc = model_rfc.predict(X=X_test)

In [22]:
# Test-set accuracy of the random forest.
accuracy_score(y_true=y_test, y_pred=y_pred_rfc)

0.9811320754716981

# Decision Tree


In [23]:
from sklearn.tree import DecisionTreeClassifier, plot_tree

In [24]:
# Single decision tree with a fixed seed; all other settings are left at their defaults.
model_dtree = DecisionTreeClassifier(
    random_state=42,
)

In [25]:
# Train the decision tree on the unscaled training features.
model_dtree.fit(X=X_train, y=y_train)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,42
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [26]:
# Predict class labels for the unscaled test features with the decision tree.
y_pred_dtree = model_dtree.predict(X=X_test)

In [27]:
# Test-set accuracy of the decision tree.
accuracy_score(y_true=y_test, y_pred=y_pred_dtree)

0.9433962264150944

In [28]:
# Visualize the fitted decision tree.
# The earlier, commented-out attempt was marked as failed. It passed
# feature_names=df.columns, which still contains the 'Class' target, and
# class_names=df['Class'], which is the whole label column rather than one
# name per class; this is the likely cause of the failure.
# plot_tree needs the names of the features the tree was trained on and one
# string per class, in the order given by the estimator's classes_ attribute.
plt.figure(figsize=(15, 10))
plot_tree(
    model_dtree,
    filled=True,
    feature_names=list(x.columns),
    class_names=[str(label) for label in model_dtree.classes_],
)
plt.show()