# <span style = "color:orange;">Diabetes Prediction:</span>
![](https://res.cloudinary.com/grohealth/image/upload/c_fill,f_auto,fl_lossy,h_650,q_auto,w_1085/v1581695681/DCUK/Content/causes-of-diabetes.png)

## Importing Libraries

In [None]:
# Basic Libraries
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Classification Models
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier

# Helper Libraries
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.datasets import make_classification, load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, precision_recall_curve, auc, roc_curve, recall_score, classification_report, f1_score

import warnings
# Suppress all warnings for the remainder of the session (no category filter is given).
warnings.filterwarnings("ignore")

import os
# Show what is available in the input directory one level above the working directory.
print(os.listdir("../input"))

## First look at the Data

In [None]:
# Load the dataset into `df`, the DataFrame used by every later cell, and preview its first rows.
df = pd.read_csv('../input/pima-indians-diabetes-database/diabetes.csv')
df.head()

## Contents of Data
**Pregnancies:**                   Number of times pregnant

**Glucose:**                       Plasma Glucose Concentration (mg/dl)

**Blood Pressure:**                Diastolic Blood Pressure(mmHg)

**Skin Thickness:**                A value used to estimate body fat.

**Insulin:**                       2-Hour Serum Insulin (mu U/ml)

**BMI:**                           Body Mass Index (weight in kg / (height in m)^2)

**Diabetes Pedigree Function:**    It provides information about diabetes history in relatives and genetics.

**Age:**                           Age (years)

**Outcome:**                       0 = Not Diabetic, 1 = Diabetic

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

The columns are on very different scales and some contain large values, so the features should be normalized before modelling.

## Checking missing values

In [None]:
# Number of NaN entries per column.
df.isna().sum()

The data does not contain any NaN values.

## Checking duplicate values

In [None]:
# Rows that exactly repeat an earlier row (an empty result means no duplicates).
df[df.duplicated()]

No duplicates present

# Data Analysis

In [None]:
# Number of rows per Outcome value, most frequent first.
df['Outcome'].value_counts()

In [None]:
# Class balance of the target, shown as a bar chart (left) and a pie chart (right).
#
# Counts are looked up by label (0 = negative, 1 = positive) rather than by position:
# `value_counts()` is ordered by frequency while `unique()` is ordered by first
# appearance, so pairing the two positionally only labels the bars correctly when
# the negative class happens to be the majority.
outcome_counts = df['Outcome'].value_counts().reindex([0, 1], fill_value=0)
negative_count = int(outcome_counts.loc[0])
positive_count = int(outcome_counts.loc[1])

# constrained_layout already spaces the two axes; it is incompatible with
# plt.subplots_adjust, so no manual adjustment is made.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), constrained_layout=True)

ax1.bar([0, 1], [negative_count, positive_count], color=['blue', 'orange'], width=0.8)
ax1.set_xticks([0, 1])
ax1.set_xticklabels(('Negative', 'Positive'))

ax2.pie((negative_count, positive_count), labels=('Negative', 'Positive'),
        autopct='%1.1f%%', shadow=True, startangle=90, explode=[0, 0.1])

plt.show()

## Age dependence on Diabetes

In [None]:
# Age histograms: the overall distribution (left) and the split by outcome (right).
feature = 'Age'
# Two-year-wide bin edges; matplotlib does not draw values that fall outside them.
# NOTE(review): confirm that 25-79 covers the ages present in the data.
bin_x = range(25, 80, 2)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 5), constrained_layout=True)

ax1.hist(df[feature], bins=bin_x, rwidth=0.9)
ax1.set_title(f'{feature} Distribution', fontsize=20)

# Positives are drawn first with wide bars; negatives are overlaid with narrower
# bars so both series stay visible inside the same bin.
for outcome, label, bar_width in ((1, 'Positive', 0.9), (0, 'Negative', 0.5)):
    ax2.hist(df.loc[df['Outcome'] == outcome, feature], label=label, bins=bin_x, rwidth=bar_width)
ax2.legend()
ax2.set_title('Diabetes: Positive vs Negative', fontsize=20)

# Both panels share the same tick positions and axis labels.
for axis in (ax1, ax2):
    axis.set_xticks(range(25, 80, 2))
    axis.set_xlabel(feature, fontsize=15)
    axis.set_ylabel('Count', fontsize=15)

plt.show()

In [None]:
# Age vs. outcome: per-class age histograms (top row) and, for every age, the share
# of patients with / without diabetes (bottom row).
#
# The percentages are computed from the Outcome column itself so that the curves
# match the panel titles. Grouping by ['Age', 'Pregnancies'] and selecting
# Pregnancies == 1 would instead plot the share of patients with exactly one pregnancy.
# Assumes Outcome is coded 0/1, as the Outcome == 0 / Outcome == 1 filters do.
age_bins = range(25, 80, 2)  # defined locally so the cell does not rely on `bin_x` from another cell
MARKER_SCALE = 30            # marker area contributed by each patient

by_age = df.groupby('Age')['Outcome'].agg(total='count', positive='sum')
by_age['negative'] = by_age['total'] - by_age['positive']
pct_positive = by_age['positive'] / by_age['total'] * 100
pct_negative = 100 - pct_positive
ages = by_age.index

fig, ax = plt.subplots(2, 2, figsize=(20, 12))
plt.subplots_adjust(hspace=0.5)

# Top row: age histogram of each class.
histogram_panels = (
    (ax[0, 0], 1, 'Patients having Diabetes'),
    (ax[0, 1], 0, 'People not having Diabetes'),
)
for axis, outcome, title in histogram_panels:
    axis.hist(df.loc[df['Outcome'] == outcome, 'Age'].tolist(), bins=age_bins, rwidth=0.8)
    axis.set_xticks(range(30, 80, 2))
    axis.set_xlabel('Age Range', fontsize=15)
    axis.set_ylabel('Patient Count', fontsize=15)
    axis.set_title(title, fontsize=20)

# Bottom row: percentage of each class per age; marker area grows with the number
# of patients of that class at that age.
percentage_panels = (
    (ax[1, 0], pct_positive, by_age['positive'], '% of Patients with Diabetes by age'),
    (ax[1, 1], pct_negative, by_age['negative'], '% of Patients without Diabetes by age'),
)
age_ticks = range(int(ages.min()), int(ages.max()) + 1, 2)
for axis, pct, n_patients, title in percentage_panels:
    axis.scatter(ages, pct, s=n_patients * MARKER_SCALE, edgecolors='r', c='yellow')
    axis.plot(ages, pct)
    axis.set_xticks(age_ticks)
    axis.set_yticks(range(0, 101, 10))  # a percentage can take any value in 0-100
    axis.set_xlabel('Age', fontsize=15)
    axis.set_ylabel('%', fontsize=15)
    axis.set_title(title, fontsize=20)

plt.show()

## Blood Pressure Distribution (w.r.t Outcome)

In [None]:
# Blood-pressure histograms: the overall distribution (left) and the split by outcome (right).
feature = 'BloodPressure'
# Bin edges every 2 units from 40 to 98; matplotlib does not draw values outside them.
bin_x = range(40, 100, 2)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 5), constrained_layout=True)

ax1.hist(df[feature], bins=bin_x, rwidth=0.9)
ax1.set_title(f'{feature} Distribution', fontsize=20)

# Positives are drawn first with wide bars; negatives are overlaid with narrower
# bars so both series stay visible inside the same bin.
for outcome, label, bar_width in ((1, 'Positive', 0.9), (0, 'Negative', 0.5)):
    ax2.hist(df.loc[df['Outcome'] == outcome, feature], label=label, bins=bin_x, rwidth=bar_width)
ax2.legend()
ax2.set_title('Diabetes: Positive vs Negative', fontsize=20)

# Both panels share the same tick positions and axis labels.
for axis in (ax1, ax2):
    axis.set_xticks(range(40, 100, 2))
    axis.set_xlabel(feature, fontsize=15)
    axis.set_ylabel('Count', fontsize=15)

plt.show()

## Distribution of other features (w.r.t Outcome)

In [None]:
# Violin plot of six features against Outcome, laid out on a 3x2 grid in row-major order.
f, ax = plt.subplots(3, 2, figsize=(18, 18))

violin_features = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for axis, name in zip(ax.flat, violin_features):
    sns.violinplot(x='Outcome', y=name, data=df, ax=axis)
    axis.set_title(f'Outcome vs {name}', fontsize=20)

plt.show()

## Feature Analysis

Based on this [Notebook](https://www.kaggle.com/vincentlugat/pima-indians-diabetes-eda-prediction-0-906) by Vincent Lugat.

In [None]:
# Glucose vs. Age coloured by outcome, with region N1 outlined in red.
fig = plt.figure(figsize=(12, 8))
ax1 = sns.scatterplot(x=df['Glucose'], y=df['Age'], hue="Outcome",
                      data=df, edgecolor='black')

plt.annotate('N1', size=25, color='black', xy=(80, 30), xytext=(60, 35),
             arrowprops={'facecolor': 'black', 'shrink': 0.05})

# Rectangle spanning Glucose 70-130 and Age 20-30, drawn edge by edge:
# top, right, bottom, left.
n1_box_edges = (
    ([70, 130], [30, 30]),
    ([130, 130], [20, 30]),
    ([70, 130], [20, 20]),
    ([70, 70], [20, 30]),
)
for edge_x, edge_y in n1_box_edges:
    plt.plot(edge_x, edge_y, linewidth=2, color='red')

plt.title('Glucose vs Age')
plt.show()

In [None]:
# Class balance of the whole dataset, of region N1 (Glucose <= 130 and Age <= 30)
# and of every row outside N1.
in_n1 = (df['Glucose'] <= 130) & (df['Age'] <= 30)
df1 = df[in_n1]
df2 = df[~in_n1]  # true complement of N1, so the 'Rest' panel accounts for all remaining rows

f, ax = plt.subplots(1, 3, figsize=(18, 5))

panels = (
    (df, 'Complete Distribution'),
    (df1, 'N1 Distribution'),
    (df2, 'Rest Distribution'),
)
for axis, (subset, title) in zip(ax, panels):
    # Look the counts up by label (0 = negative, 1 = positive). value_counts() is
    # ordered by frequency, so pairing it positionally with fixed tick labels puts
    # the positive count under 'Negative' whenever positives are the majority.
    counts = subset['Outcome'].value_counts().reindex([0, 1], fill_value=0)
    axis.bar([0, 1], counts.to_numpy(), color=['blue', 'orange'], width=0.8)
    axis.set_xticks([0, 1])
    axis.set_xticklabels(('Negative', 'Positive'))
    axis.set_title(title, fontsize=20)

plt.show()

In [None]:
# Glucose vs. BloodPressure coloured by outcome, with region N2 outlined in red.
fig = plt.figure(figsize=(12, 8))
ax1 = sns.scatterplot(x=df['Glucose'], y=df['BloodPressure'], hue="Outcome",
                      data=df, edgecolor='black')

plt.annotate('N2', size=25, color='black', xy=(70, 80), xytext=(50, 110),
             arrowprops={'facecolor': 'black', 'shrink': 0.05})

# Rectangle spanning Glucose 40-105 and BloodPressure 0-80, drawn edge by edge:
# top, left, bottom, right.
n2_box_edges = (
    ([40, 105], [80, 80]),
    ([40, 40], [0, 80]),
    ([40, 105], [0, 0]),
    ([105, 105], [0, 80]),
)
for edge_x, edge_y in n2_box_edges:
    plt.plot(edge_x, edge_y, linewidth=2, color='red')

plt.title('Glucose vs BloodPressure')
plt.show()

In [None]:
# Class balance of the whole dataset, of region N2 (Glucose <= 120 and
# BloodPressure <= 80) and of every row outside N2.
# NOTE(review): the red box on the 'Glucose vs BloodPressure' scatter plot ends at
# Glucose = 105, not 120 - confirm which threshold is intended.
in_n2 = (df['Glucose'] <= 120) & (df['BloodPressure'] <= 80)
df1 = df[in_n2]
df2 = df[~in_n2]  # true complement of N2, so the 'Rest' panel accounts for all remaining rows

f, ax = plt.subplots(1, 3, figsize=(18, 5))

panels = (
    (df, 'Complete Distribution'),
    (df1, 'N2 Distribution'),
    (df2, 'Rest Distribution'),
)
for axis, (subset, title) in zip(ax, panels):
    # Look the counts up by label (0 = negative, 1 = positive). value_counts() is
    # ordered by frequency, so pairing it positionally with fixed tick labels puts
    # the positive count under 'Negative' whenever positives are the majority.
    counts = subset['Outcome'].value_counts().reindex([0, 1], fill_value=0)
    axis.bar([0, 1], counts.to_numpy(), color=['blue', 'orange'], width=0.8)
    axis.set_xticks([0, 1])
    axis.set_xticklabels(('Negative', 'Positive'))
    axis.set_title(title, fontsize=20)

plt.show()

In [None]:
# SkinThickness vs. BMI coloured by outcome, with region N3 outlined in red.
fig = plt.figure(figsize=(12, 8))

ax1 = sns.scatterplot(x=df['SkinThickness'], y=df['BMI'], hue="Outcome",
                      data=df, edgecolor='black')

plt.annotate('N3', size=25, color='black', xy=(20, 20), xytext=(50, 25),
             arrowprops={'facecolor': 'black', 'shrink': 0.05})

# Rectangle spanning SkinThickness 0-20 and BMI 16-30, drawn edge by edge:
# top, left, bottom, right.
n3_box_edges = (
    ([0, 20], [30, 30]),
    ([0, 0], [16, 30]),
    ([0, 20], [16, 16]),
    ([20, 20], [16, 30]),
)
for edge_x, edge_y in n3_box_edges:
    plt.plot(edge_x, edge_y, linewidth=2, color='red')

plt.title('SkinThickness vs BMI')
plt.show()

In [None]:
# Class balance of the whole dataset, of region N3 (SkinThickness <= 20 and
# BMI <= 30) and of every row outside N3.
in_n3 = (df['SkinThickness'] <= 20) & (df['BMI'] <= 30)
df1 = df[in_n3]
df2 = df[~in_n3]  # true complement of N3, so the 'Rest' panel accounts for all remaining rows

f, ax = plt.subplots(1, 3, figsize=(18, 5))

panels = (
    (df, 'Complete Distribution'),
    (df1, 'N3 Distribution'),
    (df2, 'Rest Distribution'),
)
for axis, (subset, title) in zip(ax, panels):
    # Look the counts up by label (0 = negative, 1 = positive). value_counts() is
    # ordered by frequency, so pairing it positionally with fixed tick labels puts
    # the positive count under 'Negative' whenever positives are the majority.
    counts = subset['Outcome'].value_counts().reindex([0, 1], fill_value=0)
    axis.bar([0, 1], counts.to_numpy(), color=['blue', 'orange'], width=0.8)
    axis.set_xticks([0, 1])
    axis.set_xticklabels(('Negative', 'Positive'))
    axis.set_title(title, fontsize=20)

plt.show()

## Creating extra features

Grouping / Categorising Continuous Columns

In [None]:
# One-hot encode five equal-width bins of each continuous feature and append the
# indicator columns to `df`.
#
# `prefix=col` gives every indicator column a plain string name of the form
# '<column>_<interval>'. Without it the new column labels are pandas Interval
# objects, which mixes label types with the original string-named columns.
# `dtype=int` keeps the indicators as numeric 0/1 regardless of the pandas default.
categorical = ['Glucose', 'BMI', 'Age']
binned_frames = [
    pd.get_dummies(pd.cut(df[col], 5), prefix=col, dtype=int)
    for col in categorical
]
df = pd.concat([df, *binned_frames], axis=1)

Features Based on above analysis

In [None]:
# Binary flags: 1 when the source column is strictly above its cut-off, else 0.
flag_rules = {
    'N1': ('Age', 30),
    'N2': ('Glucose', 125),
    'N3': ('BloodPressure', 80),
    'N4': ('SkinThickness', 20),
    'N5': ('BMI', 30),
}
for flag, (source_col, cutoff) in flag_rules.items():
    df[flag] = (df[source_col] > cutoff).astype('int64')

# Pairwise sums of the flags (0, 1 or 2): how many of the two conditions hold.
for left, right in (('N1', 'N2'), ('N2', 'N3'), ('N4', 'N5')):
    df[f'{left}&{right}'] = df[left] + df[right]

In [None]:
# Preview the frame again now that the engineered columns have been added.
df.head()

## Feature Correlations

In [None]:
# Pairwise correlation matrix of all columns, shaded with the 'Oranges' colour map.
df.corr().style.background_gradient(cmap = 'Oranges')

In [None]:
# Bar chart of every column's correlation with Outcome, sorted ascending.
outcome_correlations = df.corr()['Outcome'].sort_values()
plt.figure(figsize=(12, 6))
outcome_correlations.plot(kind='bar');

In [None]:
# Correlation of every column with Outcome as a one-column table, strongest first.
x = df.corr()
outcome_ranking = x[['Outcome']].sort_values(by='Outcome', ascending=False)
outcome_ranking.style.background_gradient(cmap='Greens')

## Classification Pipeline

In [None]:
class Classifier:
    """Cross-validate a fixed panel of classifiers on a single DataFrame.

    Every model in ``Models`` is evaluated with ``sklearn.model_selection.cross_validate``
    using the metrics in ``scoring``; ``score_models`` returns one row of mean
    scores per model.

    Attributes:
        df: The DataFrame passed to the constructor.
        target: Name of the label column.
        cols: All column labels of ``df`` except ``target``.
        scaler: A ``StandardScaler`` fitted on all rows of the feature columns.
            It is kept for callers that want to transform the full frame; the
            cross-validation itself fits a fresh scaler inside every fold.
        model_results: Column -> list mapping holding the results of the most
            recent ``score_models`` call.
    """

    Models = {
        'SVC': SVC(),
        'LGBMClassifier': LGBMClassifier(),
        'GaussianNB': GaussianNB(),
        'SGD': SGDClassifier(),
        'DecisionTree': DecisionTreeClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'KNeighborsClassifier': KNeighborsClassifier(),
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
        'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
        'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
        'CatBoostClassifier': CatBoostClassifier(verbose= False, eval_metric = 'AUC'),
        'XGBClassifier': XGBClassifier(objective= 'binary:logistic', eval_metric='auc'),
        'ExtraTreesClassifier': ExtraTreesClassifier()
    }
    # NOTE(review): precision/recall are macro-averaged while F1 is weighted - confirm this mix is intended.
    scoring = ['accuracy', 'precision_macro', 'recall_macro' , 'f1_weighted', 'roc_auc']

    # (results-table column, key in the dict returned by cross_validate), in output order.
    _METRICS = (
        ('Fitting time', 'fit_time'),
        ('Scoring time', 'score_time'),
        ('Accuracy', 'test_accuracy'),
        ('Precision', 'test_precision_macro'),
        ('Recall', 'test_recall_macro'),
        ('F1_Score', 'test_f1_weighted'),
        ('AUC', 'test_roc_auc'),
    )

    def __init__(self, data, target):
        """Store the data and fit the convenience scaler.

        Args:
            data: DataFrame holding the feature columns and the label column.
            target: Name of the label column in ``data``.

        Raises:
            ValueError: If ``target`` is not one of the columns of ``data``.
        """
        self.df = data
        self.target = target
        self.cols = list(data.columns)
        self.cols.remove(target)
        # Per-instance state: a scaler or result store defined on the class would be
        # shared by (and overwritten through) every instance.
        self.scaler = StandardScaler()
        self.scaler.fit(self._features())
        self.model_results = self._empty_results()

    def _features(self):
        """Return the feature columns as a float array (column labels are dropped)."""
        return self.df[self.cols].to_numpy(dtype=float)

    def _empty_results(self):
        """Return a fresh column -> list mapping for the results table."""
        columns = ['Model'] + [column for column, _ in self._METRICS]
        return {column: [] for column in columns}

    def cross_validation(self, model, folds):
        """Cross-validate one model and return the mean of every metric.

        Standardisation is part of the cross-validated pipeline, so the scaler is
        fitted on the training portion of each fold only and no statistics of the
        validation rows leak into training.

        Args:
            model: An unfitted scikit-learn compatible classifier.
            folds: Passed to ``cross_validate`` as ``cv``.

        Returns:
            A 7-tuple of means: fit time, score time, accuracy, macro precision,
            macro recall, weighted F1 and ROC AUC.
        """
        estimator = Pipeline(steps=[('scaler', StandardScaler()), ('model', model)])
        scores = cross_validate(estimator, self._features(), self.df[self.target],
                                scoring=self.scoring, cv=folds, verbose=False)
        return tuple(scores[key].mean() for _, key in self._METRICS)

    def score_models(self, folds = 10):
        """Cross-validate every model in ``Models``.

        Results are collected from scratch on every call, so calling the method
        repeatedly (or on several instances) never duplicates rows.

        Args:
            folds: Passed to ``cross_validate`` as ``cv`` (default 10).

        Returns:
            A DataFrame with one row per model and the columns Model, Fitting time,
            Scoring time, Accuracy, Precision, Recall, F1_Score and AUC.
        """
        results = self._empty_results()
        for model_name, model in self.Models.items():
            metric_means = self.cross_validation(model, folds)
            results['Model'].append(model_name)
            for (column, _), value in zip(self._METRICS, metric_means):
                results[column].append(value)

        self.model_results = results
        return pd.DataFrame(results)

In [None]:
# Cross-validate every model in Classifier.Models on `df` with 'Outcome' as the
# label, using score_models' default of 10 folds.
pipeline = Classifier(df, 'Outcome')
results = pipeline.score_models()

## Results

In [None]:
# Rank the models by their mean cross-validated AUC, best first.
results.sort_values(by='AUC', ascending=False)

In [None]:
# Line plot of the two headline metrics for every model, in the column order of `results`.
plt.figure(figsize=(15, 5))

headline_metrics = [name for name in results.columns if name in ('AUC', 'Accuracy')]
for col in headline_metrics:
    plt.plot(results['Model'], results[col], '-o', label=col)

plt.title('Mean Score with different params')
plt.xticks(rotation=90)  # model names are long; rotate them so they do not overlap
plt.grid(True)
plt.legend()
plt.show()

I will use Logistic Regression for further predictions as it showed promising results.

In [None]:
# Feature columns: every column of `df` except the label.
cols = df.columns.tolist()
cols.remove('Outcome')

In [None]:
# Hold out 10% of the rows as a test set, stratified on the label so both splits
# keep the same class ratio; the seed makes the split reproducible.
features, labels = df[cols], df['Outcome']
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.1,
    stratify=labels,
    random_state=123,
)

## Oversampling using SMOTE

In [None]:
smote = SMOTE(random_state = 123)
X_train, y_train = smote.fit_resample(X_train, y_train)

## GridSearch Pipeline

In [None]:
pipeline = Pipeline(steps = [['scaler', MinMaxScaler()],
                             ['classifier', LogisticRegression(random_state=11, max_iter=1000)]])

stratified_kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=11)

param_grid = {'classifier__C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}

grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           scoring='roc_auc',
                           cv=stratified_kfold,
                           n_jobs=-1)

grid_search.fit(X_train, y_train)
cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')

## Results

In [None]:
# Test-set predictions from the fitted grid search: hard labels and the
# probability of the positive class (second column of predict_proba).
y_pred = grid_search.predict(X_test)
y_prob = grid_search.predict_proba(X_test)[:, 1]

# Points of the precision-recall and ROC curves, plus the area under the ROC curve.
precision, recall, th = precision_recall_curve(y_test, y_prob)
false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(false_positive_rate, true_positive_rate)
roc_auc

In [None]:
# ROC curve of the tuned model on the test set, with the chance diagonal for reference.
plt.figure(figsize=(10, 10))
plt.title('Receiver Operating Characteristic')
plt.plot(false_positive_rate, true_positive_rate, color='red', label=f'AUC = {roc_auc:0.2f}')
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], linestyle='--')  # a classifier with no skill follows this diagonal
plt.axis('tight')
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')

In [None]:
# Precision-recall curve of the tuned model on the test set.
# The title and axis labels describe what is actually plotted (recall on x,
# precision on y) instead of repeating the ROC plot's labels.
plt.figure(figsize=(10, 10))
plt.title('Precision-Recall Curve')
plt.plot(recall, precision, color='red', label='LogisticRegression')
# No-skill baseline: a classifier that ignores the features has a precision equal
# to the share of positives. Assumes y_test is coded 0/1, so its mean is that share.
plt.axhline(np.mean(y_test), linestyle='--', label='No skill')
plt.legend(loc='lower right')
plt.axis('tight')
plt.ylabel('Precision')
plt.xlabel('Recall')

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Plot a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: Square array of counts with true labels on the rows and predicted
            labels on the columns.
        classes: Tick labels, one per row/column of ``cm``.
        normalize: When True, every row is divided by its sum so each cell shows
            the fraction of that true class; rows that sum to zero are shown as 0.
        title: Title of the plot.
        cmap: Colour map used for the heat map.
    """
    cm = np.asarray(cm)

    # Normalise before drawing so the colours and the printed numbers describe the
    # same values.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Use white text on dark cells and black text on light cells.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        cell_text = format(cm[i, j], '.2f') if normalize else cm[i, j]
        plt.text(j, i, cell_text,
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
# Confusion matrix of the test-set predictions, shown as raw counts (normalize is left at its default).
plot_confusion_matrix(confusion_matrix(y_test, y_pred), classes=['Negative', 'Positive'], title='Confusion matrix')

## Thank you very much for your attention. Kindly upvote if you like my work.😊