## Exploring the dataset

In [31]:
import pandas as pd
import numpy as np

In [32]:
data = pd.read_csv(r"data/diabetes.csv")

Check first 5 and last 5 rows of dataset

In [33]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [34]:
data.tail()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


Size of dataset

In [35]:
rows, columns = data.shape
print("Number of Rows", rows)
print("Number of Columns", columns)

Number of Rows 768
Number of Columns 9


Get Information About Our Dataset Like Total Number Rows, Total Number of Columns, Datatypes of Each Column And Memory Requirement

In [36]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


Overall statistics

In [37]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


Check Null Values In The Dataset


In [38]:
data.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

There are no null values but there may be zero values so lets check

In [39]:
data.replace(0, np.nan).isnull().sum()

Pregnancies                 111
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64

## Data Imputation

Replacing zero values with mean of that column

In [40]:
columns_to_impute = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
for column in columns_to_impute:
    data[column] = data[column].replace(0, data[column].mean())

Store Feature Matrix In X and Response(Target) In Vector y

In [41]:
X = data.drop("Outcome", axis=1)
y = data["Outcome"]

## Choosing appropriate model

Splitting The Dataset Into The Training Set And Test Set

In [42]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)

Building Scikit-Learn Pipelines to choose appropriate model


In [43]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.pipeline import Pipeline

In [44]:
pipeline_logistic_regression = Pipeline(
    [("scaler", StandardScaler()), ("logistic_regression", LogisticRegression())]
)

pipeline_knn = Pipeline(
    [("scaler", StandardScaler()), ("k_nearest_neighbors", KNeighborsClassifier())]
)

pipeline_svc = Pipeline(
    [("scaler", StandardScaler()), ("support_vector_classifier", SVC())]
)

pipeline_decision_tree = Pipeline(
    [("decision_tree_classifier", DecisionTreeClassifier())]
)

pipeline_random_forest = Pipeline(
    [("random_forest_classifier", RandomForestClassifier())]
)

pipeline_gradient_boosting = Pipeline(
    [("gradient_boosting_classifier", GradientBoostingClassifier())]
)

pipelines = [
    pipeline_logistic_regression,
    pipeline_knn,
    pipeline_svc,
    pipeline_decision_tree,
    pipeline_random_forest,
    pipeline_gradient_boosting,
]

In [45]:
pipelines

[Pipeline(steps=[('scaler', StandardScaler()),
                 ('logistic_regression', LogisticRegression())]),
 Pipeline(steps=[('scaler', StandardScaler()),
                 ('k_nearest_neighbors', KNeighborsClassifier())]),
 Pipeline(steps=[('scaler', StandardScaler()),
                 ('support_vector_classifier', SVC())]),
 Pipeline(steps=[('decision_tree_classifier', DecisionTreeClassifier())]),
 Pipeline(steps=[('random_forest_classifier', RandomForestClassifier())]),
 Pipeline(steps=[('gradient_boosting_classifier', GradientBoostingClassifier())])]

Fitting data to all the scalers and models

In [46]:
for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [47]:
pipe_dict = {
    0: "Logistic Regression",
    1: "K-Nearest Neighbors",
    2: "Support Vector Classifier",
    3: "Decision Tree",
    4: "Random Forest",
    5: "Gradient Boosting Classifier",
}

In [48]:
for i, model in enumerate(pipelines):
    print("{} Test Accuracy:{}".format(pipe_dict[i], model.score(X_test, y_test) * 100))

Logistic Regression Test Accuracy:76.62337662337663
K-Nearest Neighbors Test Accuracy:76.62337662337663
Support Vector Classifier Test Accuracy:73.37662337662337
Decision Tree Test Accuracy:72.72727272727273
Random Forest Test Accuracy:74.67532467532467
Gradient Boosting Classifier Test Accuracy:76.62337662337663


The best results are given by Logistic Regression, K-Nearest Neighbors, and Gradient Boosting Classifier.

But lets do hyper parameter tuning on Random forest classifier and Gradient Boosting classifier.

In [49]:
from sklearn.model_selection import GridSearchCV

In [50]:
random_forest = RandomForestClassifier()
gradient_boosting = GradientBoostingClassifier()

param_grid_rf = {
    "n_estimators": [50, 75, 100],
    "max_depth": [3, 5, 10],
    "min_samples_split": [2, 10, 20],
}

param_grid_gbc = {
    "n_estimators": [50, 75, 100],
    "learning_rate": [0.01, 0.1, 0.2],
    "max_depth": [3, 5, 10],
}

In [51]:
models = {
    "Random Forest": (random_forest, param_grid_rf),
    "Gradient Boosting Classifier": (gradient_boosting, param_grid_gbc),
}

In [52]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


best_models = {}
for name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring="accuracy", n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_models[name] = grid_search.best_estimator_
    print(f"Best parameters for {name}: {grid_search.best_params_}")
    print(f"Best score for {name}: {grid_search.best_score_}")

for name, model in best_models.items():
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nAccuracy for {name} on test data: {accuracy}")
    print(f"Classification Report for {name}:\n{classification_report(y_test, y_pred)}")

Best parameters for Random Forest: {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 50}
Best score for Random Forest: 0.7785019325603092
Best parameters for Gradient Boosting Classifier: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 75}
Best score for Gradient Boosting Classifier: 0.767119818739171

Accuracy for Random Forest on test data: 0.7402597402597403
Classification Report for Random Forest:
              precision    recall  f1-score   support

           0       0.80      0.80      0.80        99
           1       0.64      0.64      0.64        55

    accuracy                           0.74       154
   macro avg       0.72      0.72      0.72       154
weighted avg       0.74      0.74      0.74       154


Accuracy for Gradient Boosting Classifier on test data: 0.7532467532467533
Classification Report for Gradient Boosting Classifier:
              precision    recall  f1-score   support

           0       0.82      0.79      0.80        99
           1 

As gradient boosting classifier performs better lets use that model.

In [53]:
gbc = best_models["Gradient Boosting Classifier"]

Prediction on New Data


In [54]:
new_data = pd.DataFrame(
    {
        "Pregnancies": 6,
        "Glucose": 148.0,
        "BloodPressure": 72.0,
        "SkinThickness": 35.0,
        "Insulin": 79.799479,
        "BMI": 33.6,
        "DiabetesPedigreeFunction": 0.627,
        "Age": 50,
    },
    index=[0],
)

In [55]:
p = gbc.predict(new_data)

In [56]:
if p[0] == 0:
    print("non-diabetic")
else:
    print("diabetic")

diabetic


In [57]:
data.head(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,79.799479,33.6,0.627,50,1
1,1,85.0,66.0,29.0,79.799479,26.6,0.351,31,0
2,8,183.0,64.0,20.536458,79.799479,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [58]:
def prediction(a,b,c,d,e,f,g,h): 
    
    result=gbc.predict([[a,b,c,d,e,f,g,h]])
    
    if result:
        return("You are diabetic\n")
    else:
        return("You are not diabetic\n")
    

In [59]:
import gradio as gr
app=gr.Interface(fn=prediction,
                inputs=[gr.Number(label="Pregnancies"),
                        gr.Number(label="Glucose"),
                        gr.Number(label="BloodPressure"),
                        gr.Number(label="SkinThickness"),
                        gr.Number(label="Insulin"),
                        gr.Number(label="BMI"), 
                        gr.Number(label="DiabetesPedigreeFunction"),
                        gr.Number(label="Age")
                       ],
                 outputs=gr.Text(),
                 title="Diabetes Prediction")

In [60]:
import warnings
warnings.filterwarnings("ignore")
app.launch()

IMPORTANT: You are using gradio version 3.46.0, however version 4.29.0 is available, please upgrade.
--------
Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.


