# 👨‍⚕️ Early Stage Diabetes Risk Prediction
---

Given data about *patient symptoms*, let's try to predict if a given patient is **at risk for diabetes or not**.

# Getting Started

In [1]:
# Tools
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler

# Models
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Disable warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
data = pd.read_csv('../input/early-stage-diabetes-risk-prediction-dataset/diabetes_data_upload.csv')
data

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity,class
0,40,Male,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No,Yes,Yes,Yes,Positive
1,58,Male,No,No,No,Yes,No,No,Yes,No,No,No,Yes,No,Yes,No,Positive
2,41,Male,Yes,No,No,Yes,Yes,No,No,Yes,No,Yes,No,Yes,Yes,No,Positive
3,45,Male,No,No,Yes,Yes,Yes,Yes,No,Yes,No,Yes,No,No,No,No,Positive
4,60,Male,Yes,Yes,Yes,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Yes,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,39,Female,Yes,Yes,Yes,No,Yes,No,No,Yes,No,Yes,Yes,No,No,No,Positive
516,48,Female,Yes,Yes,Yes,Yes,Yes,No,No,Yes,Yes,Yes,Yes,No,No,No,Positive
517,58,Female,Yes,Yes,Yes,Yes,Yes,No,Yes,No,No,No,Yes,Yes,No,Yes,Positive
518,32,Female,No,No,No,Yes,No,No,Yes,Yes,No,Yes,No,No,Yes,No,Negative


In [3]:
data.isna().sum()

Age                   0
Gender                0
Polyuria              0
Polydipsia            0
sudden weight loss    0
weakness              0
Polyphagia            0
Genital thrush        0
visual blurring       0
Itching               0
Irritability          0
delayed healing       0
partial paresis       0
muscle stiffness      0
Alopecia              0
Obesity               0
class                 0
dtype: int64

There are no missing values.

In [4]:
{column: len(data[column].unique()) for column in data.columns}

{'Age': 51,
 'Gender': 2,
 'Polyuria': 2,
 'Polydipsia': 2,
 'sudden weight loss': 2,
 'weakness': 2,
 'Polyphagia': 2,
 'Genital thrush': 2,
 'visual blurring': 2,
 'Itching': 2,
 'Irritability': 2,
 'delayed healing': 2,
 'partial paresis': 2,
 'muscle stiffness': 2,
 'Alopecia': 2,
 'Obesity': 2,
 'class': 2}

Most of the columns contain only 2 classes except the age columns.

The target is **Positive** or **Negative**.

# Preprocessing

In [5]:
def preprocess_inputs(df):
    df = df.copy()
    
    # Split X and y
    X = df.drop('class', axis=1)
    y = df['class']
    
    # Binary encode X
    X = X.replace({'No': 0, 'Yes': 1})
    X = X.replace({'Female': 0, 'Male': 1})
    
    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=42)
    
    # Scale X
    scaler = StandardScaler()
    X_train = pd.DataFrame(scaler.fit_transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)
    
    
    return X_train, X_test, y_train, y_test

In [6]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)
X_train.describe()

Unnamed: 0,Age,Gender,Polyuria,Polydipsia,sudden weight loss,weakness,Polyphagia,Genital thrush,visual blurring,Itching,Irritability,delayed healing,partial paresis,muscle stiffness,Alopecia,Obesity
count,364.0,364.0,364.0,364.0,364.0,364.0,364.0,364.0,364.0,364.0,364.0,364.0,364.0,364.0,364.0,364.0
mean,-1.268826e-16,1.073622e-16,-8.784182e-17,-2.4400510000000003e-17,2.440051e-18,-3.4160710000000005e-17,-1.9520400000000003e-17,1.122423e-16,5.368111000000001e-17,1.708035e-16,-8.296172000000001e-17,-5.856121e-17,-1.634834e-16,2.7450570000000003e-17,0.0,-1.9520400000000003e-17
std,1.001376,1.001376,1.001376,1.001376,1.001376,1.001376,1.001376,1.001376,1.001376,1.001376,1.001376,1.001376,1.001376,1.001376,1.001376,1.001376
min,-2.606318,-1.325736,-0.9728947,-0.8708927,-0.7997747,-1.174344,-0.9258201,-0.5094267,-0.9464847,-1.016622,-0.5477226,-0.9309493,-0.841974,-0.8043997,-0.754298,-0.417365
25%,-0.8166253,-1.325736,-0.9728947,-0.8708927,-0.7997747,-1.174344,-0.9258201,-0.5094267,-0.9464847,-1.016622,-0.5477226,-0.9309493,-0.841974,-0.8043997,-0.754298,-0.417365
50%,-0.003128833,0.754298,-0.9728947,-0.8708927,-0.7997747,0.8515389,-0.9258201,-0.5094267,-0.9464847,0.9836501,-0.5477226,-0.9309493,-0.841974,-0.8043997,-0.754298,-0.417365
75%,0.729018,0.754298,1.02786,1.148247,1.250352,0.8515389,1.080123,-0.5094267,1.056541,0.9836501,-0.5477226,1.074172,1.187685,1.243163,1.325736,-0.417365
max,3.413556,0.754298,1.02786,1.148247,1.250352,0.8515389,1.080123,1.962991,1.056541,0.9836501,1.825742,1.074172,1.187685,1.243163,1.325736,2.395984


For the training set, the mean is close to 0 and the standard deviation is close to 1 (standard scaling).

# Training

In [7]:
models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(),
    "Support Vector Machine (Linear Kernel)": LinearSVC(),
    "   Support Vector Machine (RBF Kernel)": SVC(),
    "                        Neural Network": MLPClassifier(),
    "                         Random Forest": RandomForestClassifier(),
    "                     Gradient Boosting": GradientBoostingClassifier()    
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + "trained.")

                   Logistic Regressiontrained.
                   K-Nearest Neighborstrained.
                         Decision Treetrained.
Support Vector Machine (Linear Kernel)trained.
   Support Vector Machine (RBF Kernel)trained.
                        Neural Networktrained.
                         Random Foresttrained.
                     Gradient Boostingtrained.


# Cross Validation

In [8]:
# Evaluate the model
for name, model in models.items():
    scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=5)
    print(name, "average score: {:.2f} %".format(scores.mean() * 100))

                   Logistic Regression average score: 89.01 %
                   K-Nearest Neighbors average score: 92.04 %
                         Decision Tree average score: 96.43 %
Support Vector Machine (Linear Kernel) average score: 89.56 %
   Support Vector Machine (RBF Kernel) average score: 93.96 %
                        Neural Network average score: 93.68 %
                         Random Forest average score: 95.88 %
                     Gradient Boosting average score: 95.89 %


# Results

In [9]:
# Predict on test set
for name, model in models.items():
    print(name + "score on test set: {:.2f} %".format(model.score(X_test, y_test) * 100))

                   Logistic Regressionscore on test set: 93.59 %
                   K-Nearest Neighborsscore on test set: 90.38 %
                         Decision Treescore on test set: 95.51 %
Support Vector Machine (Linear Kernel)score on test set: 93.59 %
   Support Vector Machine (RBF Kernel)score on test set: 99.36 %
                        Neural Networkscore on test set: 97.44 %
                         Random Forestscore on test set: 99.36 %
                     Gradient Boostingscore on test set: 98.72 %


# Conclusion

According to the cross validation, the best models are Decision Tree and Random Forest. It seems that bagging was not helpful in that case.

One can notice that other models such as Neural Network give a very good accuracy on the test set, but not as good during cross validation.

Sometimes, there is some randomness because of the little amount of data. Cross validation seems to be the best



**Thank you for reading, have a nice day!**