In [1]:
# import libraries
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix, classification_report


In [2]:
# Load the cleaned diabetes dataset and separate the predictors from the target.
df = pd.read_csv("diabetes_data_clean.csv")

# Every column except 'class' is a feature; 'class' is the label to predict.
X = df.drop(columns='class')
y = df['class']


In [3]:
# Split the data into train (80%) and test (20%) sets.
# stratify=y keeps the class proportions the same in both splits.
# random_state pins the shuffle so that Restart Kernel -> Run All reproduces
# the same split, and therefore the same metrics in the cells below.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [4]:
# Begin model training with a DummyClassifier to establish a baseline that
# ignores the features entirely.
# The strategy is set explicitly: the library default changed from
# "stratified" (random predictions) to "prior" in scikit-learn 0.24, so
# relying on the default gives a different baseline on older installs.
# With "prior", predict() always returns the most frequent training class.
dummy = DummyClassifier(strategy="prior")
dummy.fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)

In [5]:
# Evaluate the baseline: rows are the true classes, columns the predicted ones.
confusion_matrix(y_true=y_test, y_pred=dummy_pred)

array([[ 0, 40],
       [ 0, 64]], dtype=int64)

In [6]:
# Per-class precision / recall / F1 for the baseline.
# The baseline never predicts class 0, so precision for that class is 0/0.
# zero_division=0 reports it as 0.0 without emitting an undefined-metric
# warning for every averaged row.
print(classification_report(y_test, dummy_pred, zero_division=0))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        40
           1       0.62      1.00      0.76        64

    accuracy                           0.62       104
   macro avg       0.31      0.50      0.38       104
weighted avg       0.38      0.62      0.47       104



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
# First real model: logistic regression.
# max_iter=10000 gives the solver a generous iteration budget to converge.
logr = LogisticRegression(max_iter=10000).fit(X_train, y_train)
logr_pred = logr.predict(X_test)

In [8]:
# Confusion matrix for logistic regression on the held-out test set.
confusion_matrix(y_true=y_test, y_pred=logr_pred)

array([[35,  5],
       [ 4, 60]], dtype=int64)

In [9]:
# Per-class precision / recall / F1 for logistic regression.
print(classification_report(y_true=y_test, y_pred=logr_pred))

              precision    recall  f1-score   support

           0       0.90      0.88      0.89        40
           1       0.92      0.94      0.93        64

    accuracy                           0.91       104
   macro avg       0.91      0.91      0.91       104
weighted avg       0.91      0.91      0.91       104



In [10]:
# Try a decision tree.
# random_state is fixed because the tree permutes candidate features at each
# split; without it, ties between equally good splits can be broken
# differently on each run and the fitted tree (and its metrics) may change.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)
tree_pred = tree.predict(X_test)

In [11]:
# Confusion matrix for the decision tree on the held-out test set.
confusion_matrix(y_true=y_test, y_pred=tree_pred)

array([[38,  2],
       [ 1, 63]], dtype=int64)

In [12]:
# Per-class precision / recall / F1 for the decision tree.
print(classification_report(y_true=y_test, y_pred=tree_pred))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96        40
           1       0.97      0.98      0.98        64

    accuracy                           0.97       104
   macro avg       0.97      0.97      0.97       104
weighted avg       0.97      0.97      0.97       104



In [13]:
# Try a random forest.
# random_state is fixed because the forest draws bootstrap samples and random
# feature subsets; without it the metrics and the feature importances
# inspected below differ on every run.
forest = RandomForestClassifier(random_state=42)
forest.fit(X_train, y_train)
forest_pred = forest.predict(X_test)

In [14]:
# Confusion matrix for the random forest on the held-out test set.
confusion_matrix(y_true=y_test, y_pred=forest_pred)

array([[37,  3],
       [ 0, 64]], dtype=int64)

In [15]:
# Per-class precision / recall / F1 for the random forest.
print(classification_report(y_true=y_test, y_pred=forest_pred))

              precision    recall  f1-score   support

           0       1.00      0.93      0.96        40
           1       0.96      1.00      0.98        64

    accuracy                           0.97       104
   macro avg       0.98      0.96      0.97       104
weighted avg       0.97      0.97      0.97       104



In [16]:
# Per-feature importance scores from the fitted random forest, one value per
# column of the training data, in column order.
forest.feature_importances_

array([0.09118487, 0.09600694, 0.2332844 , 0.20225181, 0.05276632,
       0.01618913, 0.02714777, 0.02469652, 0.03045743, 0.02652772,
       0.04423708, 0.02946612, 0.04332907, 0.02432374, 0.0396608 ,
       0.0184703 ])

In [17]:
# Feature names, to match against the positions in the importance array.
X.columns


Index(['age', 'ismale', 'polyuria', 'polydipsia', 'sudden weight loss',
       'weakness', 'polyphagia', 'genital thrush', 'visual blurring',
       'itching', 'irritability', 'delayed healing', 'partial paresis',
       'muscle stiffness', 'alopecia', 'obesity'],
      dtype='object')

In [18]:
# Pair each feature name with its random-forest importance and list the
# features from most to least important.
(
    pd.DataFrame(
        {
            'feature': X.columns,
            'importance': forest.feature_importances_,
        }
    )
    .sort_values(by='importance', ascending=False)
)

Unnamed: 0,feature,importance
2,polyuria,0.233284
3,polydipsia,0.202252
1,ismale,0.096007
0,age,0.091185
4,sudden weight loss,0.052766
10,irritability,0.044237
12,partial paresis,0.043329
14,alopecia,0.039661
8,visual blurring,0.030457
11,delayed healing,0.029466


Summary:
1. Trained a baseline model
2. Trained three different models - logistic regression, decision tree, random forest
3. Identified the most important features in the best-performing model (random forest)