In [1]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler


In [2]:
# Read the diabetes records from the CSV file into a pandas DataFrame
diabetes_dataset = pd.read_csv(filepath_or_buffer='diabetes.csv')

In [3]:
# Preview the first five records of the dataset
diabetes_dataset.head(n=5)

Unnamed: 0,Patient no.,Glucose,BP,SkinThickness,Insulin,BMI,DPedigreeFun,Age,Outcome
0,1,148,72,35,0,33.6,0.627,50,1
1,2,85,66,29,0,26.6,0.351,31,0
2,3,183,64,0,0,23.3,0.672,32,0
3,4,89,66,23,94,28.1,0.167,21,1
4,5,137,40,35,168,43.1,2.288,33,1


In [4]:
# Preview the last five records of the dataset
diabetes_dataset.tail(n=5)

Unnamed: 0,Patient no.,Glucose,BP,SkinThickness,Insulin,BMI,DPedigreeFun,Age,Outcome
393,394,116,72,12,87,22.1,0.463,37,0
394,395,158,78,0,0,32.9,0.803,31,0
395,396,127,58,24,275,27.7,1.6,25,0
396,397,96,56,34,115,24.7,0.944,39,1
397,398,131,66,40,0,34.3,0.196,22,1


In [5]:
# Dimensions of the dataset as a (rows, columns) tuple
diabetes_dataset.shape

(398, 9)

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
# NOTE(review): the recorded output shows a minimum of 0 for Glucose, BP,
# SkinThickness, Insulin and BMI; those zeros may be placeholders for missing
# measurements rather than real readings - confirm before modelling.
diabetes_dataset.describe()

Unnamed: 0,Patient no.,Glucose,BP,SkinThickness,Insulin,BMI,DPedigreeFun,Age,Outcome
count,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0,398.0
mean,199.5,121.158291,69.055276,20.351759,81.658291,32.128392,0.488784,33.133166,0.432161
std,115.036951,32.508361,19.059286,15.596515,121.765989,8.137649,0.350244,11.339277,0.496
min,1.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,100.25,100.0,64.0,0.0,0.0,27.325,0.25125,24.0,0.0
50%,199.5,116.5,72.0,23.0,36.5,32.0,0.381,29.0,0.0
75%,298.75,142.75,80.0,32.0,128.75,36.6,0.6535,40.0,1.0
max,398.0,197.0,122.0,60.0,846.0,67.1,2.329,69.0,1.0


In [7]:
# How many records fall into each Outcome class (class balance check)
diabetes_dataset.loc[:, 'Outcome'].value_counts()

Outcome
0    226
1    172
Name: count, dtype: int64

In [8]:
# Per-class average of every column, with the rows keyed by Outcome
diabetes_dataset.groupby(by='Outcome').mean()

Unnamed: 0_level_0,Patient no.,Glucose,BP,SkinThickness,Insulin,BMI,DPedigreeFun,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,199.243363,120.871681,67.79646,19.884956,80.743363,31.933186,0.479611,32.331858
1,199.837209,121.534884,70.709302,20.965116,82.860465,32.384884,0.500837,34.186047


In [9]:
# Separate the target label from the predictor columns.
# NOTE(review): only 'Outcome' is removed here, so the 'Patient no.' column
# stays in X and is fed to the models as a feature - confirm that is intended.
Y = diabetes_dataset['Outcome']
X = diabetes_dataset.drop(columns='Outcome')
print(Y)


0      1
1      0
2      0
3      1
4      1
      ..
393    0
394    0
395    0
396    1
397    1
Name: Outcome, Length: 398, dtype: int64


In [10]:
scaler = StandardScaler()
scaler.fit(X)
standardized_data = scaler.transform(X)
X = standardized_data
Y = diabetes_dataset['Outcome']

In [11]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, stratify=Y, random_state=2)
print(X.shape, X_train.shape, X_test.shape)

(398, 8) (318, 8) (80, 8)


In [12]:
# Fit a support vector classifier with a linear kernel on the training split
classifier = svm.SVC(kernel='linear')
classifier.fit(X=X_train, y=Y_train)

In [13]:
# accuracy score on the training data
X_train_prediction = classifier.predict(X_train)
# accuracy_score takes (y_true, y_pred): ground truth first, as in the later
# logistic-regression / KNN evaluation. The score itself is unchanged by the order.
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)
print('Accuracy score of the training data: ', training_data_accuracy)

Accuracy score of the training data:  0.5691823899371069


In [14]:
# accuracy score on the test data (rows the classifier was not fitted on)
X_test_prediction = classifier.predict(X_test)
# accuracy_score takes (y_true, y_pred): ground truth first.
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)
# NOTE(review): in the recorded run this score (0.5625) is roughly the share of
# the majority class (226 of 398 records), i.e. about what always predicting 0
# would achieve - worth checking the features and labels.
print('Accuracy score of the test data: ', test_data_accuracy)


Accuracy score of the test data:  0.5625


In [15]:
# Logistic Regression Algorithm
# (LogisticRegression is imported in the notebook's import cell; random_state
# is fixed so repeated runs give the same fitted model.)
logreg = LogisticRegression(random_state = 42)
logreg.fit(X_train, Y_train)


In [16]:
# K nearest neighbors Algorithm
# (KNeighborsClassifier is imported in the notebook's import cell.)
# Minkowski distance with p = 2 is the ordinary Euclidean distance.
# NOTE(review): with two classes an even n_neighbors (24) can produce tied
# votes - consider an odd value, ideally chosen by cross-validation.
knn = KNeighborsClassifier(n_neighbors = 24, metric = 'minkowski', p = 2)
knn.fit(X_train, Y_train)

In [17]:
# Score the held-out rows with both fitted models (logistic regression, then KNN)
Y_pred_logreg, Y_pred_knn = (model.predict(X_test) for model in (logreg, knn))

In [18]:
# Evaluating using accuracy_score metric (already imported in the first cell,
# so no re-import is needed here); arguments are (y_true, y_pred)
accuracy_logreg = accuracy_score(Y_test, Y_pred_logreg)
accuracy_knn = accuracy_score(Y_test, Y_pred_knn)


In [19]:
# Accuracy on test set, as a percentage rounded to two decimals so binary
# floating-point noise (e.g. 51.24999999999999) does not reach the output
print(f"Logistic Regression: {accuracy_logreg * 100:.2f}")
print(f"K Nearest neighbors: {accuracy_knn * 100:.2f}")

Logistic Regression: 58.75
K Nearest neighbors: 51.24999999999999


In [None]:
# Confusion matrix for the KNN predictions on the test split
# (confusion_matrix is imported in the notebook's import cell).
# Rows are the true classes, columns the predicted classes.
cm = confusion_matrix(Y_test, Y_pred_knn)
cm

In [None]:
# Classification report (per-class precision, recall, F1 and support) for the
# KNN predictions on the test split; classification_report is imported in the
# notebook's import cell.
print(classification_report(Y_test, Y_pred_knn))

In [None]:
# Raw (unscaled) values for one record, listed in the same order as the feature
# columns used for training (every dataset column except 'Outcome').
# NOTE(review): the first feature column is 'Patient no.', so the leading 8 is
# interpreted as a patient number - confirm this is the intended value.
input_data = (8,115,0,0,0,35.3 ,0.134,29)

# Wrap the single record in a one-row DataFrame that carries the training
# column names. The scaler was fitted on a named DataFrame, so a bare numpy
# array makes scikit-learn warn about missing feature names and hides which
# value belongs to which column. Building the frame also raises immediately if
# the number of values does not match the number of feature columns.
feature_columns = diabetes_dataset.drop(columns='Outcome').columns
input_frame = pd.DataFrame([input_data], columns=feature_columns)

# standardize the input data with the already-fitted scaler
std_data = scaler.transform(input_frame)
print(std_data)

# classify the standardized record with the linear SVM
prediction = classifier.predict(std_data)
print(prediction)

if prediction[0] == 0:
  print('the person is not diabetic')
else:
  print('The person is diabetic')