# Diabetes Prediction - Classification

 **This script will apply cross validation technique for different Algorithms and choose the best one based on accuracy**
 

In [5]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Load the Diabetes Data Set



In [6]:
dataset = pd.read_csv('datasets_diabetes_prima.csv')

# Analyse the Data


In [7]:
dataset.count()


Pregnancies                 768
Glucose                     768
BloodPressure               768
SkinThickness               768
Insulin                     768
BMI                         768
DiabetesPedigreeFunction    768
Age                         768
Outcome                     768
dtype: int64

In [8]:
dataset.info()
dataset.shape
dataset.index


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               768 non-null int64
Insulin                     768 non-null int64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


RangeIndex(start=0, stop=768, step=1)

In [9]:
dataset.head(10)
print(dataset['Outcome'].unique())

[1 0]


# Find whether data is balanced

In [10]:

dataset['Outcome'].count()
dataset['Outcome'].value_counts()


0    500
1    268
Name: Outcome, dtype: int64

# Missing Value Treatment as the Many columns have min value 0 means there are 0 values

In [11]:

print(dataset.describe())
#Many columns have min value 0 means there are 0 values

dataset.head(20)
print(dataset.isnull().sum())

       Pregnancies     Glucose  BloodPressure  SkinThickness     Insulin  \
count   768.000000  768.000000     768.000000     768.000000  768.000000   
mean      3.845052  120.894531      69.105469      20.536458   79.799479   
std       3.369578   31.972618      19.355807      15.952218  115.244002   
min       0.000000    0.000000       0.000000       0.000000    0.000000   
25%       1.000000   99.000000      62.000000       0.000000    0.000000   
50%       3.000000  117.000000      72.000000      23.000000   30.500000   
75%       6.000000  140.250000      80.000000      32.000000  127.250000   
max      17.000000  199.000000     122.000000      99.000000  846.000000   

              BMI  DiabetesPedigreeFunction         Age     Outcome  
count  768.000000                768.000000  768.000000  768.000000  
mean    31.992578                  0.471876   33.240885    0.348958  
std      7.884160                  0.331329   11.760232    0.476951  
min      0.000000                  

In [12]:
# Replace 0 with Null
dataset.columns
#dataset.replace(0, np.nan, inplace=True)

dataset_target = dataset.filter(['Outcome'], axis=1)
#dataset_target = dataset.iloc[:, -1] this will return a series and not a dataframe
dataset_target.columns
dataset.drop(['Outcome'],axis=1, inplace = True)
dataset.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [13]:
#print(dataset.isnull().sum())
dataset_target.columns

Index(['Outcome'], dtype='object')

In [14]:
#Replace all Null with mean values as it is all integer columns
dataset =dataset.apply(lambda x: x.fillna(x.mean())) 

dataset 

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33
5,5,116,74,0,0,25.6,0.201,30
6,3,78,50,32,88,31.0,0.248,26
7,10,115,0,0,0,35.3,0.134,29
8,2,197,70,45,543,30.5,0.158,53
9,8,125,96,0,0,0.0,0.232,54


In [15]:
print(dataset.isnull().sum())


#Apply missing Value Treatment to Independent variables
dataset_new=pd.concat([dataset, dataset_target], axis=1)
dataset_target.columns
dataset_new.columns

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
dtype: int64


Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')

# Model Building

In [16]:
X = dataset_new.iloc[:, :-1].values
y = dataset_new.iloc[:, -1].values

In [17]:

dataset_new['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

# Apply Cross validation - KNN nearest neighbors

In [20]:

from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
knnclassifier = KNeighborsClassifier(n_neighbors=4)
print(cross_val_score(knnclassifier, X, y, cv=10, scoring ='accuracy').mean())
#71.8 ----> 10 folds

0.7187115516062884


# Apply Cross Validation - Logistic Regression

In [23]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
print (cross_val_score(logreg, X, y, cv=10, scoring = 'accuracy').mean())
#76.8 --> 10 folds

0.7669685577580314




# Apply Cross Validation - SVM

In [25]:
# Try SVM model 
from sklearn.svm import SVC
classifierSVM = SVC(kernel = 'linear', random_state = 0)
print (cross_val_score(classifierSVM, X, y, cv=10, scoring = 'accuracy').mean())
#76.69 --> 10 folds

0.7669685577580314


# Apply Cross Validation -Random Forest

In [26]:
#try Random Forest
from sklearn.ensemble import RandomForestClassifier
classifierRF = RandomForestClassifier(n_estimators = 20, criterion = 'entropy', random_state = 0)
print (cross_val_score(classifierRF, X, y, cv=10, scoring = 'accuracy').mean())
#76.56 --> 10 folds

0.7656356801093643
