# Diabetes Classifier

This is a simple classifier based on a Decsion Tree model which uses Random Forest Algorithm to predict whether a patient has diabetes or not.

# Importing Libraries

In [753]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from jupyterthemes import jtplot
jtplot.style()

# Loading Data

In [754]:
diabetes = pd.read_csv('C:/Users/LENOVO/# Jupyter Notebook Files/Data Sets/diabetes/diabetes.csv')
diabetes

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


# Data Cleaning

* Detecteing Null values

In [755]:
diabetes.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

No Null values were detected in the Data set

# Separating Dependant variable from Independant variables

In [756]:
y = diabetes.pop('Outcome')
x = diabetes

# Train Test Split

In [757]:
from sklearn.model_selection import train_test_split

In [758]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.4)

# Checking for Zero/Improper values in the data

In [759]:
print("Total no. of rows : {}".format(len(diabetes)))
print("Number of rows missing in 'Glucose' : {}".format(len(diabetes.loc[diabetes['Glucose'] == 0])))
print("Number of rows missing in 'Blood Pressure' {}".format(len(diabetes.loc[diabetes['BloodPressure'] == 0])))
print("Number of rows missing in 'Skin Thickness' : {}".format(len(diabetes.loc[diabetes['SkinThickness'] == 0])))
print("Number of rows missing in 'Insulin' : {}".format(len(diabetes.loc[diabetes['Insulin'] == 0])))
print("Number of rows missing in 'BMI' : {}".format(len(diabetes.loc[diabetes['BMI'] == 0])))
print("Number of rows missing in 'Diabetes Pedigree Function' : {}".format(len(diabetes.loc[diabetes['DiabetesPedigreeFunction'] == 0]))) 
print("Number of rows missing in 'Age' : {}".format(len(diabetes.loc[diabetes['Age'] == 0])))

Total no. of rows : 768
Number of rows missing in 'Glucose' : 5
Number of rows missing in 'Blood Pressure' 35
Number of rows missing in 'Skin Thickness' : 227
Number of rows missing in 'Insulin' : 374
Number of rows missing in 'BMI' : 11
Number of rows missing in 'Diabetes Pedigree Function' : 0
Number of rows missing in 'Age' : 0


# Filling Zero/Improper Values with Mean

In [760]:
 from sklearn.impute import SimpleImputer

In [761]:
fill_values = SimpleImputer(missing_values=0, strategy='mean')
x_train = fill_values.fit_transform(x_train)
x_test = fill_values.fit_transform(x_test)

# Model Creation

In [762]:
from sklearn.ensemble import RandomForestClassifier

In [763]:
diabetes_classifier = RandomForestClassifier()

# Training the Model

In [764]:
diabetes_classifier.fit(x_train,y_train)

RandomForestClassifier()

# Testing the Model

In [765]:
diabetes_predicted = diabetes_classifier.predict(x_test)

# Accuracy of the Model

In [766]:
score = diabetes_classifier.score(x_test,y_test)
score*100

76.94805194805194

# Cross Validation set

In [767]:
from sklearn.model_selection import cross_val_score
score = cross_val_score(diabetes_classifier,x_test,y_test)
score.mean()

0.7662083553675304

# Finding the Best Algorithm to use

In [768]:
def get_score(model,x_train,x_test,y_train,y_test):
    model.fit(x_train,y_train)
    return cross_val_score(model,x_test,y_test)

In [769]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

In [770]:
log_reg_score = get_score(LogisticRegression(),x_train,x_test,y_train,y_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [771]:
svm_score = get_score(SVC(),x_train,x_test,y_train,y_test)

In [772]:
dec_tree_score = get_score(DecisionTreeClassifier(),x_train,x_test,y_train,y_test)

In [773]:
ran_forest_score = get_score(RandomForestClassifier(),x_train,x_test,y_train,y_test)

In [774]:
scores = {
    'Logistic Regresssion' :  log_reg_score,
    'SVM' : svm_score,
    'Decision Tree' : dec_tree_score ,
    'Random Forest' : ran_forest_score 
}

In [775]:
scores = pd.DataFrame(scores)
scores

Unnamed: 0,Logistic Regresssion,SVM,Decision Tree,Random Forest
0,0.741935,0.741935,0.66129,0.709677
1,0.741935,0.693548,0.66129,0.774194
2,0.758065,0.725806,0.741935,0.758065
3,0.688525,0.770492,0.688525,0.721311
4,0.737705,0.737705,0.590164,0.770492


In [776]:
print("Logistic Regression : {}".format(int(100*scores['Logistic Regresssion'].mean())))
print("SVM : {}".format(int(100*scores['SVM'].mean())))
print("Decision Tree : {}".format(int(100*scores['Decision Tree'].mean())))
print("Random Forest : {}".format(int(100*scores['Random Forest'].mean())))

Logistic Regression : 73
SVM : 73
Decision Tree : 66
Random Forest : 74


# Best Model
Logitic Regression Model and Random Forest Algorithms proved to have the best accuracy in the Cross Validation set.

# Conclusion
Since Random Forest Model was used earlier as a trail and error choice and proved to be one of the best algorithm for this data set, Algorithm for the  Model will not be changed.