Using KNN to predict Diabetes. 1 is true, meaning that they have diabetes, and 0 is false.

In [66]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
import matplotlib.pyplot as plt

#Loading the data
data = pd.read_csv('data.csv')

#Creates dataframe
df = pd.DataFrame(data)

print("The length of the data set is: " + str(len(df)))

#Displays the first 5 rows of the dataframe
print(df.head(5))

columns_to_replace = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']

#Replaces 0 with NaN in the specified columns
df[columns_to_replace] = df[columns_to_replace].replace(0, np.nan)

#Fills NaN values with the mean of each column (basically replacing the NaN values with data of an 'average' person)
df[columns_to_replace] = df[columns_to_replace].fillna(df[columns_to_replace].mean())

#Checks if there are any missing values by displaying the first 20 rows of 'Glucose' column
print(df['Glucose'].head(20))

#Selecting columns 0 to 8 as features (since the last column is the target)
X = df.iloc[:, 0:8]

#Selecting the target column
y = df.iloc[:, 8]

#Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

print(len(X_train))
print(len(y_train))
print(len(X_test))
print(len(y_test))

#Scaling the different features
#Scaling data (so all data is on a consistent scale. E.g. instead of one column having data ranging from 5-500 and another being from 1-20, all features will be scaled to range from -1 to 1)
xscaler = StandardScaler()
X_train = xscaler.fit_transform(X_train)
X_test = xscaler.transform(X_test)  # We only transform the test set

#Fit training data to the K Neighbors Classifier
#Using euclidean distance as the metric for the KNN algorithm (essentially finding the distance between two points in a 2D plane- in this case, the distance between two data points; the closer the points, the more similar they are; the further apart, the more different they are; the algorithm uses this to classify the data points; pythagorean theorem is used to calculate the distance between the points)
knn = KNeighborsClassifier(n_neighbors=5, metric='euclidean')

#Fit the model on the training data
knn.fit(X_train, y_train)




The length of the data set is: 768
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  
0     148.0
1      85.0
2     183.0
3      89.0
4     137.0
5     116.0
6      78.0
7     115.0
8     197.0
9     125.0
10    110.0
11    168.0
12    139.0
13    189.0
14    166.0
15    100.0
16    118.0
17    107.0
18    103.0
19    115.0
Name: Glucose, dtype: float6

In [67]:
#Predict the test set results
y_pred = knn.predict(X_test)

#Print the predicted results
print(y_pred)

#Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)


print("\n\n\nConfusion Matrix:\n", conf_matrix)
print("Accuracy:", accuracy)
print("F1 Score:", f1)

[1 0 0 1 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1 1 1 0 0 0 1 1 0 0 0 0 1 1 1 1 1 1 1
 1 0 0 0 0 0 1 0 0 1 0 0 1 0 1 1 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 1 0 0 0
 0 1 1 0 1 0 1 0 0 0 0 1 1 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 1 1 1 0 0 1 0 1 0
 0 1 1 0 1 1 0 0 1 0 0 1 0 1 0 1 1 1 1 1 1 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0
 0 1 0 0 1 1]



Confusion Matrix:
 [[74 25]
 [14 41]]
Accuracy: 0.7467532467532467
F1 Score: 0.6776859504132231
