# Predicting Diabetes 🩺

#### If you like my work, it would be great if you could upvote this notebook!
#### If not, leaving a comment on what I need to work on and improve would be really helpful!

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter("ignore")

## Loading up the data

In [None]:
# Relative path of the diabetes CSV that the whole notebook works on.
DATA_PATH = "../input/pima-indians-diabetes-database/diabetes.csv"

df = pd.read_csv(DATA_PATH)
df.head()

* **Pregnancies** : Number of Pregnancies

* **Glucose** : Plasma glucose concentration

* **BloodPressure** : Diastolic blood pressure (mm Hg)

* **SkinThickness** : Triceps skin fold thickness (mm)

* **Insulin** : 2-Hour serum insulin (mu U/ml)

* **BMI** : Body Mass Index (weight in kg/(height in m)^2)

* **DiabetesPedigreeFunction** : Diabetes pedigree function (a function which scores likelihood of diabetes based on family history).

* **Age** : Age (years)

* **Outcome** : Whether the patient is diabetic or not, 0 represents the person is not diabetic and 1 represents that the person is diabetic.

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Looking for missing values in the dataset.
# NOTE: this only counts NaN entries. A later cell treats zeros in several
# columns as missing values, and those are not detected here.
df.isna().sum()

In [None]:
# Data type of every column
df.dtypes

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap

corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, fmt='.1g', cmap="viridis", cbar=False, ax=ax);

In [None]:
# The "seaborn" style sheet was renamed "seaborn-v0_8" in matplotlib 3.6 and the
# old name was later removed, so prefer the new name and fall back to the old one.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
fig, ax = plt.subplots(figsize=(7, 7))

# value_counts() orders by frequency, not by class value. Reindex to (0, 1) so
# every wedge is guaranteed to get the label and colour of its own class.
outcome_counts = df["Outcome"].value_counts().reindex([0, 1], fill_value=0)
ax.pie(x=outcome_counts,
       colors=["seagreen", "firebrick"],
       labels=["Non-Diabetic", "Diabetic"],
       shadow=True,
       explode=(0, 0.1))
plt.show()

In [None]:
# In these columns a zero is treated as a missing measurement, so mark it as NaN
# and then show the resulting number of nulls per column.
zero_as_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[zero_as_missing] = df[zero_as_missing].replace(0, np.nan)
df.isnull().sum()

In [None]:
# Fill the nulls of each listed column with that column's own median.
# NOTE(review): the medians are computed on the full frame, i.e. before the
# train/test split done further down, so test rows influence the fill values.

median_filled = ["Glucose", "BloodPressure", "SkinThickness", "BMI", "Insulin"]
df[median_filled] = df[median_filled].fillna(df[median_filled].median())

In [None]:
# The "seaborn" style sheet was renamed "seaborn-v0_8" in matplotlib 3.6 and the
# old name was later removed, so prefer the new name and fall back to the old one.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
fig, ax = plt.subplots(3, 3, figsize=(20, 25))

# (column, colour) for each panel in row-major order; every row shares a colour.
panels = [
    ("Age", "darkorange"), ("Pregnancies", "darkorange"), ("BloodPressure", "darkorange"),
    ("BMI", "mediumblue"), ("Glucose", "mediumblue"), ("Insulin", "mediumblue"),
    ("DiabetesPedigreeFunction", "darkgreen"), ("SkinThickness", "darkgreen"), ("Outcome", "darkgreen"),
]

for axis, (column, colour) in zip(ax.flat, panels):
    # histplot with a density-scaled histogram plus KDE replaces the deprecated
    # sns.distplot; cut=3 lets the KDE curve extend past the data like distplot did.
    sns.histplot(x=df[column], kde=True, stat="density", kde_kws={"cut": 3},
                 color=colour, ax=axis)
    axis.set_xlabel(column, fontsize=15)

In [None]:
# The "seaborn" style sheet was renamed "seaborn-v0_8" in matplotlib 3.6 and the
# old name was later removed, so prefer the new name and fall back to the old one.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
fig, ax = plt.subplots(4, 2, figsize=(20, 25))

# (column, palette) for each panel in row-major order; every row shares a palette.
panels = [
    ("Age", "rocket"), ("Pregnancies", "rocket"),
    ("Insulin", "dark"), ("Glucose", "dark"),
    ("BMI", "flare"), ("BloodPressure", "flare"),
    ("SkinThickness", "viridis"), ("DiabetesPedigreeFunction", "viridis"),
]

# One histogram + KDE per feature, split by Outcome, instead of eight copy-pasted calls.
for axis, (column, palette) in zip(ax.flat, panels):
    sns.histplot(x=df[column], hue=df["Outcome"], palette=palette, kde=True, ax=axis)
    axis.set_xlabel(column, fontsize=15)

In [None]:
# The "seaborn" style sheet was renamed "seaborn-v0_8" in matplotlib 3.6 and the
# old name was later removed, so prefer the new name and fall back to the old one.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=df['Glucose'], y=df['Age'], hue=df['Outcome'], palette='hls', legend=True, ax=ax)

# Relabel the legend seaborn built (entries in ascending Outcome order: 0, 1) while
# keeping its handles. Passing only `labels` lets matplotlib pair them with whatever
# artists come first on the axes, so a label can end up with the wrong colour.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=['Healthy', 'Diabetic'], title='Result', loc='upper left');

In [None]:
# The "seaborn" style sheet was renamed "seaborn-v0_8" in matplotlib 3.6 and the
# old name was later removed, so prefer the new name and fall back to the old one.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=df['BloodPressure'], y=df['Age'], hue=df['Outcome'], palette='hls', legend=True, ax=ax)

# Relabel the legend seaborn built (entries in ascending Outcome order: 0, 1) while
# keeping its handles. Passing only `labels` lets matplotlib pair them with whatever
# artists come first on the axes, so a label can end up with the wrong colour.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=['Healthy', 'Diabetic'], title='Result', loc='upper left');

In [None]:
# The "seaborn" style sheet was renamed "seaborn-v0_8" in matplotlib 3.6 and the
# old name was later removed, so prefer the new name and fall back to the old one.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=df['BMI'], y=df['Age'], hue=df['Outcome'], palette='hls', legend=True, ax=ax)

# Relabel the legend seaborn built (entries in ascending Outcome order: 0, 1) while
# keeping its handles. Passing only `labels` lets matplotlib pair them with whatever
# artists come first on the axes, so a label can end up with the wrong colour.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=['Healthy', 'Diabetic'], title='Result');

In [None]:
# The "seaborn" style sheet was renamed "seaborn-v0_8" in matplotlib 3.6 and the
# old name was later removed, so prefer the new name and fall back to the old one.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x=df['Insulin'], y=df['Age'], hue=df['Outcome'], palette='hls', legend=True, ax=ax)

# Relabel the legend seaborn built (entries in ascending Outcome order: 0, 1) while
# keeping its handles. Passing only `labels` lets matplotlib pair them with whatever
# artists come first on the axes, so a label can end up with the wrong colour.
handles, _ = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=['Healthy', 'Diabetic'], title='Result');

In [None]:
# First rows of the frame after the zero-replacement and median-fill cells above
df.head()

## Splitting the data into training and test datasets
Here, we are trying to predict whether a patient has diabetes using the given data. Hence, `Outcome` will be the target (`y`), and the remaining columns will be the input features (`X`).

In [None]:
# X data: every column except the target
X = df.drop(columns=["Outcome"])
X.head()

In [None]:
# y data: the Outcome column is the target (0 = not diabetic, 1 = diabetic)
y = df["Outcome"]
y.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Number of rows in the training and test splits
len(X_train), len(X_test)

In [None]:
# Standardise the features. The scaler learns its statistics from the training
# split only and then applies the same transformation to both splits.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, fitted on the scaled training split.
lr = LogisticRegression().fit(X_train, y_train)
lr

In [None]:
# Accuracy of the fitted model on the held-out test split, printed as a percentage.
LogisticRegressionScore = lr.score(X_test, y_test)
accuracy_percent = LogisticRegressionScore * 100
print("Accuracy obtained by Logistic Regression model:", accuracy_percent)

In [None]:
# Having a look at the confusion matrix

from sklearn.metrics import confusion_matrix, classification_report

y_pred = lr.predict(X_test)
cf_matrix = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots()
# fmt="d": the cells are integer counts, and the default ".2g" format would show
# any count of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(cf_matrix, annot=True, fmt="d", cmap="Spectral", ax=ax)
# confusion_matrix puts the true classes on the rows and the predictions on the columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title("Confusion Matrix for Logistic Regression", fontsize=14, fontname="Helvetica", y=1.03);

## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed (same value as the train/test split) so the forest, and every score
# derived from it, is identical on each run of the notebook.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [None]:
# Accuracy of the fitted model on the held-out test split, printed as a percentage.
RandomForestClassifierScore = rf.score(X_test, y_test)
accuracy_percent = RandomForestClassifierScore * 100
print("Accuracy obtained by Random Forest Classifier model:", accuracy_percent)

In [None]:
# Having a look at the confusion matrix
y_pred = rf.predict(X_test)
cf_matrix = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots()
# fmt="d": the cells are integer counts, and the default ".2g" format would show
# any count of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(cf_matrix, annot=True, fmt="d", cmap="Spectral", ax=ax)
# confusion_matrix puts the true classes on the rows and the predictions on the columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title("Confusion Matrix for Random Forest Classifier", fontsize=14, fontname="Helvetica", y=1.03);

## K Neighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier that votes among the 4 closest training points.
knn = KNeighborsClassifier(n_neighbors=4).fit(X_train, y_train)
knn

In [None]:
# Accuracy of the fitted model on the held-out test split, printed as a percentage.
KNeighborsClassifierScore = knn.score(X_test, y_test)
accuracy_percent = KNeighborsClassifierScore * 100
print("Accuracy obtained by K Neighbors Classifier model:", accuracy_percent)

In [None]:
# Having a look at the confusion matrix
y_pred = knn.predict(X_test)
cf_matrix = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots()
# fmt="d": the cells are integer counts, and the default ".2g" format would show
# any count of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(cf_matrix, annot=True, fmt="d", cmap="Spectral", ax=ax)
# confusion_matrix puts the true classes on the rows and the predictions on the columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title("Confusion Matrix for K Neighbors Classifier", fontsize=14, fontname="Helvetica", y=1.03);

## Decision Tree Classifier

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Fixed seed (same value as the train/test split) so the fitted tree and its
# score are identical on each run of the notebook.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(X_train, y_train)

In [None]:
# Accuracy of the fitted model on the held-out test split, printed as a percentage.
DecisionTreeClassifierScore = tree.score(X_test, y_test)
accuracy_percent = DecisionTreeClassifierScore * 100
print("Accuracy obtained by Decision Tree Classifier model:", accuracy_percent)

In [None]:
# Confusion matrix
y_pred = tree.predict(X_test)
cf_matrix = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots()
# fmt="d": the cells are integer counts, and the default ".2g" format would show
# any count of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(cf_matrix, annot=True, fmt="d", cmap="Spectral", ax=ax)
# confusion_matrix puts the true classes on the rows and the predictions on the columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title("Confusion Matrix for Decision Tree Classifier", fontsize=14, fontname="Helvetica", y=1.03);

## CatBoost Classifier

In [None]:
from catboost import CatBoostClassifier

# CatBoost classifier limited to 40 boosting iterations, fitted on the training split.
cat = CatBoostClassifier(iterations=40).fit(X_train, y_train);

In [None]:
# Score of the fitted model on the held-out test split, printed as a percentage.
CatBoostClassifierScore = cat.score(X_test, y_test)
accuracy_percent = CatBoostClassifierScore * 100
print("Accuracy obtained by CatBoost Classifier model:", accuracy_percent)

In [None]:
# Confusion matrix
y_pred = cat.predict(X_test)
cf_matrix = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots()
# fmt="d": the cells are integer counts, and the default ".2g" format would show
# any count of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(cf_matrix, annot=True, fmt="d", cmap="Spectral", ax=ax)
# confusion_matrix puts the true classes on the rows and the predictions on the columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title("Confusion Matrix for CatBoost Classifier", fontsize=14, fontname="Helvetica", y=1.03);

## Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Fixed seed (same value as the train/test split) so the fitted model and its
# score are identical on each run of the notebook.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)

In [None]:
# Accuracy of the fitted model on the held-out test split, printed as a percentage.
GradientBoostingClassifierScore = gb.score(X_test, y_test)
accuracy_percent = GradientBoostingClassifierScore * 100
print("Accuracy obtained by Gradient Boosting Classifier model:", accuracy_percent)

In [None]:
# Confusion matrix
y_pred = gb.predict(X_test)
cf_matrix = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots()
# fmt="d": the cells are integer counts, and the default ".2g" format would show
# any count of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(cf_matrix, annot=True, fmt="d", cmap="Spectral", ax=ax)
# confusion_matrix puts the true classes on the rows and the predictions on the columns.
ax.set_xlabel("Predicted label")
ax.set_ylabel("True label")
ax.set_title("Confusion Matrix for Gradient Boosting Classifier", fontsize=14, fontname="Helvetica", y=1.03);

## Comparing performance of the models

In [None]:
# The "seaborn" style sheet was renamed "seaborn-v0_8" in matplotlib 3.6 and the
# old name was later removed, so prefer the new name and fall back to the old one.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

# Dedicated names: reusing `y` here would overwrite the target Series with a
# list of scores and break any earlier cell that is re-run afterwards.
model_names = ["LogisticRegression",
               "Decision Tree Classifier",
               "RandomForestClassifier",
               "KNeighborsClassifier",
               "CatBoost Classifier",
               "Gradient Boosting Classifier"]

# Test-set accuracy of each model, in the same order as model_names.
model_scores = [LogisticRegressionScore,
                DecisionTreeClassifierScore,
                RandomForestClassifierScore,
                KNeighborsClassifierScore,
                CatBoostClassifierScore,
                GradientBoostingClassifierScore]

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=model_names, y=model_scores, palette="crest", ax=ax)
ax.set_ylabel("Model Accuracy")
plt.xticks(rotation=40)
ax.set_title("Model Comparison - Model Accuracy", fontsize=14, fontname="Helvetica", y=1.03);

## Hyperparameter Tuning on Random Forest Classifier

In [None]:
from sklearn.model_selection import GridSearchCV

# Candidate random-forest settings; the search tries every combination of them.
param_grid = dict(
    bootstrap=[True],
    max_depth=[80, 90, 100, 110],
    max_features=[2, 3],
    min_samples_leaf=[3, 4, 5],
    min_samples_split=[8, 10, 12],
    criterion=['gini', 'entropy'],
    n_estimators=[100, 200, 300, 1000],
)

# 3-fold cross-validated search, run in parallel (n_jobs=-1) with progress logging.
grid_search = GridSearchCV(rf, param_grid, cv=3, n_jobs=-1, verbose=2)

In [None]:
# Run the cross-validated search over the whole grid on the training split
grid_search.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validation score
grid_search.best_params_

In [None]:
# Mean cross-validation score of the best combination (measured on training folds, not on the test split)
grid_search.best_score_

In [None]:
# Test-set predictions from the grid search (used for the classification report at the end)
grid_search_rfc_predict = grid_search.predict(X_test)

In [None]:
# Compare like with like: RandomForestClassifierScore is an accuracy on the test
# split, so score the tuned model on that same split. best_score_ is a mean
# cross-validation score on training folds and is not comparable to it.
GridSearchRandomForestScore = grid_search.score(X_test, y_test)
print('Improvement in Random Forest Classifier after GridSearchCV: {:0.2f}%.'.format(100 * (GridSearchRandomForestScore - RandomForestClassifierScore) / RandomForestClassifierScore))

In [None]:
# Comparing the results after the improvement in Random Forest Classifier

# The "seaborn" style sheet was renamed "seaborn-v0_8" in matplotlib 3.6 and the
# old name was later removed, so prefer the new name and fall back to the old one.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

# Dedicated names so the target Series `y` is not overwritten by a list of scores.
rf_labels = ["Random Forest Classifier",
             "GridSearch-RandomForestClassifier"]

# Both bars are accuracies on the same test split. best_score_ is a mean
# cross-validation score on training folds and would not be comparable.
rf_scores = [RandomForestClassifierScore,
             grid_search.score(X_test, y_test)]

fig, ax = plt.subplots(figsize=(7, 7))
sns.barplot(x=rf_labels, y=rf_scores, palette="crest", ax=ax)
ax.set_ylabel("Accuracy")
plt.xticks(rotation=45)
ax.set_title("Random Forest Classifier  vs  GridSearched Random Forest Classifier", fontsize=14, fontname="Helvetica", y=1.03);

## Hyperparameter Tuning on Logistic Regression

In [None]:
# LogisticRegression accepts only certain solver/penalty pairs, so the search is
# split into one sub-grid per penalty. Putting every penalty and solver in a single
# grid produces many invalid combinations whose fits simply fail.
C_values = np.logspace(-4, 4, 20)
max_iter_values = [100, 1000, 2500, 5000]

param_grid = [
    {   # L2 is supported by every solver
        "penalty": ["l2"],
        "C": C_values,
        "solver": ["sag", "saga", "lbfgs", "liblinear", "newton-cg"],
        "max_iter": max_iter_values,
    },
    {   # L1 is supported by liblinear and saga only
        "penalty": ["l1"],
        "C": C_values,
        "solver": ["liblinear", "saga"],
        "max_iter": max_iter_values,
    },
    {   # elastic net is spelled "elasticnet", is saga-only and needs an l1_ratio
        "penalty": ["elasticnet"],
        "C": C_values,
        "solver": ["saga"],
        "l1_ratio": [0.25, 0.5, 0.75],
        "max_iter": max_iter_values,
    },
    {   # no regularisation: C has no effect here and liblinear does not support it
        # NOTE(review): recent scikit-learn releases spell this penalty None
        # instead of "none" -- adjust to the installed version.
        "penalty": ["none"],
        "solver": ["sag", "saga", "lbfgs", "newton-cg"],
        "max_iter": max_iter_values,
    },
]

# 5-fold cross-validated search over all sub-grids, run in parallel.
grid_search_lr = GridSearchCV(estimator = lr,
                              param_grid = param_grid,
                              cv = 5, n_jobs = -1, verbose = True)

In [None]:
# Run the cross-validated search over the whole grid on the training split
grid_search_lr.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validation score
grid_search_lr.best_params_

In [None]:
# Mean cross-validation score of the best combination (measured on training folds, not on the test split)
grid_search_lr.best_score_

In [None]:
# Test-set predictions from the logistic-regression grid search
grid_search_lr_predict = grid_search_lr.predict(X_test)

In [None]:
# Compare like with like: LogisticRegressionScore is an accuracy on the test
# split, so score the tuned model on that same split. best_score_ is a mean
# cross-validation score on training folds and is not comparable to it.
GridSearchLogisticRegressionScore = grid_search_lr.score(X_test, y_test)
print('Improvement in Logistic Regression after GridSearchCV: {:0.2f}%.'.format(100 * (GridSearchLogisticRegressionScore - LogisticRegressionScore) / LogisticRegressionScore))

In [None]:
# Comparing the results after the improvement in Logistic Regression

# The "seaborn" style sheet was renamed "seaborn-v0_8" in matplotlib 3.6 and the
# old name was later removed, so prefer the new name and fall back to the old one.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

# Dedicated names so the target Series `y` is not overwritten by a list of scores.
lr_labels = ["Logistic Regression",
             "GridSearch-LogisticRegression"]

# Both bars are accuracies on the same test split. best_score_ is a mean
# cross-validation score on training folds and would not be comparable.
lr_scores = [LogisticRegressionScore,
             grid_search_lr.score(X_test, y_test)]

fig, ax = plt.subplots(figsize=(7, 7))
sns.barplot(x=lr_labels, y=lr_scores, palette="crest", ax=ax)
ax.set_ylabel("Accuracy")
plt.xticks(rotation=45)
ax.set_title("LogisticRegression  vs  GridSearched LogisticRegression", fontsize=14, fontname="Helvetica", y=1.03);

In [None]:
# Comparing both the improved models

# The "seaborn" style sheet was renamed "seaborn-v0_8" in matplotlib 3.6 and the
# old name was later removed, so prefer the new name and fall back to the old one.
plt.style.use("seaborn-v0_8" if "seaborn-v0_8" in plt.style.available else "seaborn")

# Dedicated names so the target Series `y` is not overwritten by a list of scores.
tuned_labels = ["GridSearch LogisticRegression", "GridSearch RandomForestClassifier"]

# Both values are mean cross-validation scores from the two searches
# (5 folds for logistic regression, 3 folds for the random forest).
tuned_scores = [grid_search_lr.best_score_, grid_search.best_score_]

fig, ax = plt.subplots(figsize=(7, 7))
sns.barplot(x=tuned_labels, y=tuned_scores, palette="viridis", ax=ax)
ax.set_ylabel("Accuracy")
plt.xticks(rotation=45)
ax.set_title("GridSearched LogisticRegression  vs  GridSearched RandomForestClassifier", fontsize=14, fontname="Helvetica", y=1.03);

In [None]:
# Classification report of the grid-searched Random Forest Classifier,
# computed from its predictions on the test split

print(classification_report(y_test, grid_search_rfc_predict))