##Importing libraries to implement various machine learning algorithms

In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from numpy import loadtxt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

##Reading our dataset file i.e. the diabetes.csv files using the 'pd.read_csv' command

In [0]:
data = pd.read_csv('/content/diabetes.csv')

# Link to the diabetes dataset - https://www.kaggle.com/johndasilva/diabetes

##The len command is used to find the length of the dataset i.e. the number of rows in our dataset.

In [67]:
len(data)

768

##The ".head()" command is used to check the first 5 rows in our dataset with all the columns included. We can specify the number of rows in the brackets, but the default value is 5.

In [68]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [0]:
zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']
for column in zero_not_accepted:
     data[column] = data[column].replace(0,np.NaN)
     mean = int(data[column].mean(skipna=True))
     data[column] = data[column].replace(np.NaN,mean)

##Initializing the training and the target columns

In [0]:
X = data.iloc[:,:8]
y = data.iloc[:,8]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = .2, random_state = 0) 
#The train_test_split function to split our training and testing data into 80% for training and 20% for testing

##For implementing the KNN algorithm we need to perform feature scaling

In [0]:
sc_X = StandardScaler()
X_train1 = sc_X.fit_transform(X_train)
X_test1 = sc_X.transform(X_test)

##Let's find the number of neighbors to initialize the KNN algorithm

In [72]:
import math
math.sqrt(len(y_test))


12.409673645990857

## Creating a KNN object called clf and training the model using the fit function

In [73]:
clf = KNeighborsClassifier(n_neighbors = 12, p = 2, metric = 'euclidean')
clf.fit(X_train1,y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=12, p=2,
                     weights='uniform')

#Using the ".predict" function on our testing dataset to predict the output

In [0]:
y_pred = clf.predict(X_test1)

In [75]:
y_pred # Displaying the output array

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## Confusion matrix is used to represent the True and the False values.

In [76]:
cm = confusion_matrix(y_test,y_pred)
cm

array([[97, 10],
       [21, 26]])

## The "accuracy_score" function shows the prediction accuracy of our KNN model.

In [77]:
accuracy = accuracy_score(y_test,y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 79.87%


#### Creating a Random Forest Classifier object called Rclf and training the model using the fit function

In [0]:
#Create a Random Forest Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training data
clf.fit(X_train,y_train)

#Predict output for the test
y_pred=clf.predict(X_test)

##We will evaluate the model by finding the accuracy of the Random Forest Model

In [79]:
# Model Accuracy, how often is the classifier correct?
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 81.17%


##Let's try using the XGBoost algorithm to predict the diabetes in a patient

In [80]:
model = XGBClassifier()
model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

##Let's test the XGBoost model to predict the diabetes and finally find the accuracy

In [81]:
# make predictions for test data
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
# evaluate predictions
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 81.17%


##**Observations**: We implemented 3 machine learning algorithms i.e KNearest Neighbors, Random Forest and XGBoost to predict diabetes in a patient. We achieved highest accuracy using XGBoost and Random Forest algorithms. Thus we can use any of the model between them to predict diabetes.