# Diabetes Prediction
--------------------------------

Name: Logesh.V <br>
Email: vlogesh2001@gmail.com <br>

--------------------------------

In [1]:
#Import Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix

<br>Reading the data csv file as dataframe and printing first 5 rows

In [2]:
df = pd.read_csv("diabetes.csv")
print(df.shape)
no_of_features = df.shape[1]  #finding no.of feature columns
df.head()

(768, 9)


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


<br>Converting Dataframe to Numpy Arrays, so it would be numerical values instead of string</br>
<br>X -> input data features
<br>Y -> input data classification</br>

In [3]:
X = df.iloc[:,0:no_of_features-1].values
Y = df.iloc[:,no_of_features-1].values

print(X.shape, "\t", Y.shape)

(768, 8) 	 (768,)


<br> **As we can see the dataset has different ranges, we need to scale out all the features to common range**

In [4]:
np.set_printoptions(precision=2, suppress=True, threshold=25) #changing printing options

print("X data: \n", X)
sc = StandardScaler()
X_sc = sc.fit_transform(X)
print("\n Scaled X data: \n", X_sc)

X data: 
 [[  6.   148.    72.   ...  33.6    0.63  50.  ]
 [  1.    85.    66.   ...  26.6    0.35  31.  ]
 [  8.   183.    64.   ...  23.3    0.67  32.  ]
 ...
 [  5.   121.    72.   ...  26.2    0.24  30.  ]
 [  1.   126.    60.   ...  30.1    0.35  47.  ]
 [  1.    93.    70.   ...  30.4    0.32  23.  ]]

 Scaled X data: 
 [[ 0.64  0.85  0.15 ...  0.2   0.47  1.43]
 [-0.84 -1.12 -0.16 ... -0.68 -0.37 -0.19]
 [ 1.23  1.94 -0.26 ... -1.1   0.6  -0.11]
 ...
 [ 0.34  0.    0.15 ... -0.74 -0.69 -0.28]
 [-0.84  0.16 -0.47 ... -0.24 -0.37  1.17]
 [-0.84 -0.87  0.05 ... -0.2  -0.47 -0.87]]


<br>Spliting the dataset into Train Data and Test Data based on some ratio

In [5]:
X_train, X_test, Y_train,Y_test = train_test_split(X_sc, Y, test_size=0.15, random_state=0)
print("Train Set Dimensions:\n", X_train.shape, "\n", Y_train.shape)
print("\nTest Set Dimensions:\n", X_test.shape, "\n", Y_test.shape)

Train Set Dimensions:
 (652, 8) 
 (652,)

Test Set Dimensions:
 (116, 8) 
 (116,)


## Logistic Regression Classification

In [6]:
#1-> Logistic Regression (This is classified based on 2 Outcome classifaction here)

log = LogisticRegression()
log.fit(X_train, Y_train)
Y_pred = log.predict(X_test)
CM = confusion_matrix(Y_test, Y_pred)

print("(Logistic Regression) Accuracy:", accuracy_score(Y_test, Y_pred)*100, "%")
print("\n[Prediction | Actual results]:")
print(np.concatenate((Y_pred.reshape(-1,1), Y_test.reshape(-1,1)), axis =1))
print("\n Confusion Matrix:\n", CM)
print("\nTotal Correct Prediction =", CM.trace(), "/", X_test.shape[0])

(Logistic Regression) Accuracy: 82.75862068965517 %

[Prediction | Actual results]:
[[1 1]
 [0 0]
 [0 0]
 ...
 [0 1]
 [1 1]
 [1 1]]

 Confusion Matrix:
 [[71  7]
 [13 25]]

Total Correct Prediction = 96 / 116


## **K**-Nearest Neighbors Classification

In [7]:
#2-> KNN Classification
#The best accuracy is usually in the middle not too high or not too low, so it needs to be found out by hit and trial
acc = 0
K = 0
for i in range(1,no_of_features):    
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, Y_train)
    Y_pred = knn.predict(X_test)
    
    print("K-Value =", i, "\t\tAccuracy:", accuracy_score(Y_test, Y_pred)*100, "%")
    
    if(accuracy_score(Y_test, Y_pred)*100 > acc):
        acc = accuracy_score(Y_test, Y_pred)*100
        Y_pred_final = Y_pred
        K = i

print("\n(KNN-classification) K-Neighbhor:", K, "\t\tAccuracy:", accuracy_score(Y_test, Y_pred_final)*100, "%")
print("\n[Prediction | Actual results]:")
print(np.concatenate((Y_pred_final.reshape(-1,1), Y_test.reshape(-1,1)), axis =1))
CM = confusion_matrix(Y_test, Y_pred)
print("\n Confusion Matrix:\n", CM)
print("\nTotal Correct Prediction =", CM.trace(), "/", X_test.shape[0])

K-Value = 1 		Accuracy: 68.96551724137932 %
K-Value = 2 		Accuracy: 74.13793103448276 %
K-Value = 3 		Accuracy: 75.86206896551724 %
K-Value = 4 		Accuracy: 75.0 %
K-Value = 5 		Accuracy: 82.75862068965517 %
K-Value = 6 		Accuracy: 80.17241379310344 %
K-Value = 7 		Accuracy: 78.44827586206897 %
K-Value = 8 		Accuracy: 76.72413793103449 %

K-Neighbhor: 5 		Accuracy: 82.75862068965517 %

[Prediction | Actual results]:
[[1 1]
 [0 0]
 [0 0]
 ...
 [1 1]
 [1 1]
 [1 1]]

 Confusion Matrix:
 [[68 10]
 [17 21]]

Total Correct Prediction = 89 / 116


## Decision Tree Classification

In [8]:
#3-> Decision Tree

dt = DecisionTreeClassifier()
dt.fit(X_train, Y_train)
Y_pred = dt.predict(X_test)
CM = confusion_matrix(Y_test, Y_pred)

print("(Decision Tree) Accuracy:", accuracy_score(Y_test, Y_pred)*100, "%")
print("\n[Prediction | Actual results]:")
print(np.concatenate((Y_pred.reshape(-1,1), Y_test.reshape(-1,1)), axis =1))
print("\n Confusion Matrix:\n", CM)
print("\nTotal Correct Prediction =", CM.trace(), "/", X_test.shape[0])

(Decision Tree) Accuracy: 68.10344827586206 %

[Prediction | Actual results]:
[[1 1]
 [0 0]
 [0 0]
 ...
 [1 1]
 [1 1]
 [1 1]]

 Confusion Matrix:
 [[55 23]
 [14 24]]

Total Correct Prediction = 79 / 116


## SVM (Linear & **RBF**) Classification

In [9]:
#3-> SVM

svc = SVC(kernel = 'linear', random_state = 0)
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)

print("(SVC - 'linear') Accuracy:", accuracy_score(Y_test, Y_pred)*100, "%")
print("\n[Prediction | Actual results]:")
print(np.concatenate((Y_pred.reshape(-1,1), Y_test.reshape(-1,1)), axis =1))
CM = confusion_matrix(Y_test, Y_pred)
print("\n Confusion Matrix:\n", CM)
print("\nTotal Correct Prediction =", CM.trace())


svc = SVC(kernel = 'rbf', random_state = 0)
svc.fit(X_train, Y_train)
Y_pred = svc.predict(X_test)

print("\n\n\n(SVC - 'rbf') Accuracy:", accuracy_score(Y_test, Y_pred)*100, "%")
print("\n[Prediction | Actual results]:")
print(np.concatenate((Y_pred.reshape(-1,1), Y_test.reshape(-1,1)), axis =1))
CM = confusion_matrix(Y_test, Y_pred)
print("\n Confusion Matrix:\n", CM)
print("\nTotal Correct Prediction =", CM.trace(), "/", X_test.shape[0])

(SVC - 'linear') Accuracy: 82.75862068965517 %

[Prediction | Actual results]:
[[1 1]
 [0 0]
 [0 0]
 ...
 [0 1]
 [1 1]
 [1 1]]

 Confusion Matrix:
 [[71  7]
 [13 25]]

Total Correct Prediction = 96



(SVC - 'rbf') Accuracy: 79.3103448275862 %

[Prediction | Actual results]:
[[1 1]
 [0 0]
 [0 0]
 ...
 [1 1]
 [0 1]
 [1 1]]

 Confusion Matrix:
 [[68 10]
 [14 24]]

Total Correct Prediction = 92 / 116


## Naive Bayes Classification

In [10]:
#4-> Naive Bayes Classification

NB = GaussianNB()
NB.fit(X_train, Y_train)
Y_pred = NB.predict(X_test)
CM = confusion_matrix(Y_test, Y_pred)

print("(Naive Bayes) Accuracy:", accuracy_score(Y_test, Y_pred)*100, "%")
print("\n[Prediction | Actual results]:")
print(np.concatenate((Y_pred.reshape(-1,1), Y_test.reshape(-1,1)), axis =1))
print("\n Confusion Matrix:\n", CM)
print("\nTotal Correct Prediction =", CM.trace(), "/", X_test.shape[0])

(Naive Bayes) Accuracy: 80.17241379310344 %

[Prediction | Actual results]:
[[1 1]
 [0 0]
 [0 0]
 ...
 [1 1]
 [1 1]
 [1 1]]

 Confusion Matrix:
 [[67 11]
 [12 26]]

Total Correct Prediction = 93 / 116


--------------------------------------------------------------------------------------------------
### x---------------------------- Thanking You ----------------------------x
--------------------------------------------------------------------------------------------------