**Predict the onset of diabetes based on diagnostic measures**


The dataset consists of several medical predictor variables and one target variable, Outcome. The predictor variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.

In [0]:
!pip install sklearn

In [0]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

In [5]:
# Colab-only step: presumably prompts for a local file to upload into the
# runtime. The next cell reads 'diabetes.csv' from the working directory, so
# that is the file to provide here.
from google.colab import files
uploaded = files.upload()

In [6]:
# Load the diabetes data, then report the row count and preview the first rows.
dataset = pd.read_csv('diabetes.csv')
print(dataset.shape[0])
print(dataset.head(5))

768
   Pregnancies  Glucose  BloodPressure  ...  DiabetesPedigreeFunction  Age  Outcome
0            6      148             72  ...                     0.627   50        1
1            1       85             66  ...                     0.351   31        0
2            8      183             64  ...                     0.672   32        1
3            1       89             66  ...                     0.167   21        0
4            0      137             40  ...                     2.288   33        1

[5 rows x 9 columns]


In [0]:
# A value of 0 is not accepted for these measurements; it is treated as a
# missing reading and replaced with the column mean.
zero_not_accepted = ["Glucose", "BloodPressure", "SkinThickness", "BMI", "Insulin"]

# NOTE(review): the mean is computed over the whole dataset, i.e. before the
# train/test split performed later, so test rows influence the imputed values.
for column in zero_not_accepted:
  # Mark zeros as missing. np.nan is the supported spelling: the np.NaN alias
  # was removed in NumPy 2.0 and raises AttributeError there.
  dataset[column] = dataset[column].replace(0, np.nan)
  # Mean of the non-missing readings, truncated to an integer by int().
  # NOTE(review): truncation also drops the fractional part for float-valued
  # columns such as BMI - confirm this is intended.
  mean = int(dataset[column].mean(skipna=True))
  # Fill the missing readings with that mean.
  dataset[column] = dataset[column].fillna(mean)
  

In [0]:
# Spot-check one imputed column (the same check applies to Glucose,
# BloodPressure, BMI and Insulin).
print(dataset.loc[:, 'SkinThickness'])

In [0]:
# Separate predictors from the target, then hold out 20% of the rows for testing.
X = dataset.iloc[:, :8]  # columns 0-7: the eight predictor variables
y = dataset.iloc[:, 8]   # column 8: the target (Outcome)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [0]:
# Feature scaling: learn the scaling parameters from the training split only,
# then apply that same fitted scaler to both splits.
sc_X = StandardScaler().fit(X_train)

X_train, X_test = sc_X.transform(X_train), sc_X.transform(X_test)

In [28]:
import math

# Square roots of the split sizes; the next cell uses the test-split value as
# the starting point for choosing K.
print("The root of y_train:", math.sqrt(y_train.shape[0]))
print("The root of y_test: ", math.sqrt(y_test.shape[0]))

The root of y_train: 24.779023386727733
The root of y_test:  12.409673645990857


In [29]:
# Choosing K: sqrt(len(y_test)) is about 12, and an odd K is wanted
# (presumably so a two-class vote cannot tie), hence K = 12 - 1 = 11.
# NOTE(review): K is derived here from the test-split size; the training-split
# root printed above is about 24.8 - confirm which one was intended.
classifier = KNeighborsClassifier(
    n_neighbors=11,
    metric='euclidean',
    p=2,
)
# Fit the K-NN model on the training split.
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='euclidean',
                     metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                     weights='uniform')

In [31]:
# Predict a class label for every row of the test split and show the labels.
y_pred = classifier.predict(X=X_test)
print(y_pred)

[1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 1 0
 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 0 0 1 0 1 1 0 0 1 1 1 1 0 0 0 0 0 0 1
 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 1 1 0 0 0 0 1 1 0 1 0 1 0
 0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0
 0 0 0 0 0 0]


In [34]:
# Confusion matrix of the true test labels against the predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)

[[94 13]
 [15 32]]


In [36]:
# F1 score of the predictions on the test split.
print(f1_score(y_true=y_test, y_pred=y_pred))

0.6956521739130436


In [37]:
# Fraction of test rows whose predicted label matches the true label.
print(accuracy_score(y_true=y_test, y_pred=y_pred))

0.8181818181818182
