In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import silhouette_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler


*   Data Preparation


In [None]:
# Read the dataset from its CSV file into a DataFrame and preview it.
csv_path = "#2320306.csv"

data = pd.read_csv(csv_path)
data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0,140,65.0,26,130,42.6,0.431,24.0,1
1,3,113,50.0,10,85,29.5,0.626,25.0,0
2,4,142,86.0,0,0,44.0,0.645,22.0,1
3,0,117,80.0,31,53,45.2,0.089,24.0,0
4,1,87,68.0,34,77,37.6,0.401,24.0,0
...,...,...,...,...,...,...,...,...,...
695,6,85,78.0,0,0,31.2,0.382,42.0,0
696,7,62,78.0,0,0,32.6,0.391,41.0,0
697,3,148,66.0,25,0,32.5,0.256,22.0,0
698,2,88,74.0,19,53,29.0,0.229,22.0,0


The dataset contains 700 entries with eight input features (**Pregnancies, Glucose, BloodPressure, SkinThickness, Insulin, BMI, DiabetesPedigreeFunction, Age**) and one target variable, **Outcome**.


*   Data Cleaning


In [None]:
# Summary statistics for every numeric column (count, mean, std, min, quartiles, max);
# a `count` lower than the number of rows indicates missing values in that column.
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,700.0,700.0,695.0,700.0,700.0,689.0,700.0,684.0,700.0
mean,3.894286,121.177143,69.020144,20.457143,79.78,31.882003,0.473219,33.378655,0.355714
std,3.375028,31.912569,19.323565,15.884405,116.012641,7.693876,0.335401,11.65028,0.479072
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,64.0,0.0,0.0,27.3,0.245,24.0,0.0
50%,3.0,117.0,72.0,23.0,27.0,32.0,0.3745,29.0,0.0
75%,6.0,141.0,80.0,32.0,129.25,36.4,0.61225,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,59.4,2.42,81.0,1.0


In [None]:
# List each column's dtype and non-null count to spot columns with missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               700 non-null    int64  
 1   Glucose                   700 non-null    int64  
 2   BloodPressure             695 non-null    float64
 3   SkinThickness             700 non-null    int64  
 4   Insulin                   700 non-null    int64  
 5   BMI                       689 non-null    float64
 6   DiabetesPedigreeFunction  700 non-null    float64
 7   Age                       684 non-null    float64
 8   Outcome                   700 non-null    int64  
dtypes: float64(4), int64(5)
memory usage: 49.3 KB


In [None]:
# Count the missing (NaN) entries in each column.

data.isna().sum()

Pregnancies                  0
Glucose                      0
BloodPressure                5
SkinThickness                0
Insulin                      0
BMI                         11
DiabetesPedigreeFunction     0
Age                         16
Outcome                      0
dtype: int64

This shows that we have **32** missing values spread across 3 input features: **BloodPressure** (5), **BMI** (11) and **Age** (16).


In [None]:
# Fill the gaps in the three incomplete columns with each column's own median.

incomplete_cols = ['BloodPressure', 'BMI', 'Age']
column_medians = data[incomplete_cols].median()
data[incomplete_cols] = data[incomplete_cols].fillna(column_medians)

data.isnull().sum() # Confirm that no missing values remain in the data

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

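This shows that missing values are no longer present in the dataset. Note that `describe()` above reports a minimum of 0 for Glucose, BloodPressure, SkinThickness, Insulin and BMI; these zeros are left untouched here, although they may be placeholders for missing measurements.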



*   Clustering as Preprocessing



In [None]:
# Build the feature matrix used for clustering.
# 'Outcome' is the target variable, so it is excluded from the features.

X = data.drop("Outcome", axis="columns")
X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0,140,65.0,26,130,42.6,0.431,24.0
1,3,113,50.0,10,85,29.5,0.626,25.0
2,4,142,86.0,0,0,44.0,0.645,22.0
3,0,117,80.0,31,53,45.2,0.089,24.0
4,1,87,68.0,34,77,37.6,0.401,24.0
...,...,...,...,...,...,...,...,...
695,6,85,78.0,0,0,31.2,0.382,42.0
696,7,62,78.0,0,0,32.6,0.391,41.0
697,3,148,66.0,25,0,32.5,0.256,22.0
698,2,88,74.0,19,53,29.0,0.229,22.0


In [None]:
# Min-max normalise every feature to the [0, 1] range so that no single column
# dominates the Euclidean distances K-Means relies on.
feature_min = X.min()
feature_range = X.max() - feature_min
# A constant column has zero range; dividing by it would produce NaN (0/0), which
# K-Means rejects. Scale such columns by 1 instead, which maps them to 0.
feature_range = feature_range.replace(0, 1)
X = (X - feature_min) / feature_range

In [None]:
# Configure K-Means: 3 clusters, a fixed seed for reproducible centroids,
# and scikit-learn's automatic choice for the number of initialisations.

kmeans_params = {"n_clusters": 3, "random_state": 42, "n_init": "auto"}
kmeans = KMeans(**kmeans_params)

In [None]:
# Fit K-Means on the feature matrix X (the normalised features without the Outcome column).

kmeans.fit(X)

In [None]:
kmeans.labels_ # Cluster index (0, 1 or 2) assigned to each row of X

array([1, 2, 2, 1, 2, 2, 2, 0, 2, 1, 0, 2, 2, 0, 1, 1, 0, 2, 0, 2, 2, 0,
       1, 1, 0, 2, 0, 0, 0, 2, 2, 2, 1, 2, 2, 1, 1, 2, 1, 0, 1, 0, 0, 1,
       2, 2, 1, 1, 1, 2, 1, 1, 2, 0, 2, 0, 0, 2, 1, 1, 1, 1, 2, 2, 1, 1,
       2, 1, 0, 2, 0, 2, 0, 2, 0, 1, 1, 1, 0, 0, 2, 2, 2, 2, 1, 2, 2, 2,
       2, 0, 2, 2, 1, 2, 0, 2, 1, 1, 0, 0, 0, 1, 0, 2, 2, 0, 2, 1, 2, 2,
       2, 1, 2, 2, 0, 1, 1, 0, 0, 2, 2, 2, 0, 2, 0, 2, 2, 0, 2, 2, 1, 2,
       2, 0, 1, 2, 1, 0, 2, 2, 2, 2, 1, 0, 0, 2, 1, 1, 2, 2, 0, 0, 0, 1,
       2, 2, 2, 0, 2, 2, 0, 1, 2, 2, 2, 1, 1, 0, 2, 2, 0, 2, 2, 2, 0, 2,
       2, 0, 2, 2, 2, 2, 0, 1, 2, 2, 2, 1, 0, 2, 2, 1, 2, 0, 2, 2, 2, 1,
       1, 1, 0, 2, 2, 2, 0, 0, 2, 0, 0, 0, 0, 0, 2, 2, 0, 2, 1, 2, 2, 2,
       0, 1, 0, 2, 2, 1, 0, 0, 2, 2, 2, 1, 2, 2, 2, 0, 2, 1, 2, 2, 0, 2,
       1, 0, 0, 1, 2, 2, 2, 2, 2, 1, 0, 2, 1, 1, 0, 1, 2, 1, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 2, 2, 1, 1, 0, 2, 1, 0, 2, 1, 0, 1, 2, 0, 2, 1, 2,
       1, 0, 2, 2, 1, 0, 2, 0, 1, 0, 2, 1, 0, 2, 1,

In [None]:
kmeans.cluster_centers_
# This shows the 3 centroids found by K-Means: one row per cluster, one column per
# feature, expressed in the normalised feature space that the model was fitted on.

array([[0.45362517, 0.65020451, 0.6277926 , 0.18369744, 0.07666172,
        0.55091222, 0.17495085, 0.42317829],
       [0.10978295, 0.69376293, 0.59450338, 0.34554097, 0.2028925 ,
        0.61058896, 0.21348817, 0.13083779],
       [0.14192657, 0.52591818, 0.5033282 , 0.13602468, 0.0388881 ,
        0.48023298, 0.13620837, 0.09328859]])



*   Classification




In [None]:
# Feature matrix for the classification model: every column except the target 'Outcome'.
# Note: this lowercase `x` holds the un-normalised values and is distinct from the
# uppercase `X` used for clustering.

x = data.drop("Outcome", axis="columns")
x

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0,140,65.0,26,130,42.6,0.431,24.0
1,3,113,50.0,10,85,29.5,0.626,25.0
2,4,142,86.0,0,0,44.0,0.645,22.0
3,0,117,80.0,31,53,45.2,0.089,24.0
4,1,87,68.0,34,77,37.6,0.401,24.0
...,...,...,...,...,...,...,...,...
695,6,85,78.0,0,0,31.2,0.382,42.0
696,7,62,78.0,0,0,32.6,0.391,41.0
697,3,148,66.0,25,0,32.5,0.256,22.0
698,2,88,74.0,19,53,29.0,0.229,22.0


In [None]:
# Target vector for classification: the 'Outcome' column.
y = data.loc[:, "Outcome"]
y

0      1
1      0
2      1
3      0
4      0
      ..
695    0
696    0
697    0
698    0
699    1
Name: Outcome, Length: 700, dtype: int64

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split reproducible.

split_parts = train_test_split(x, y, test_size=0.3, random_state=10)
x_train, x_test, y_train, y_test = split_parts
x_train # preview the training features

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
116,1,196,76.0,36,249,36.5,0.875,29.0
217,0,73,0.0,0,0,21.1,0.342,25.0
411,2,90,60.0,0,0,23.5,0.191,25.0
167,10,75,82.0,0,0,33.3,0.263,38.0
223,3,102,44.0,20,94,30.8,0.400,26.0
...,...,...,...,...,...,...,...,...
369,5,97,76.0,27,0,35.6,0.378,52.0
320,8,194,80.0,0,0,26.1,0.551,67.0
527,0,95,85.0,25,36,37.4,0.247,24.0
125,8,120,0.0,0,0,30.0,0.183,38.0


In [None]:
x_test # preview the held-out test features (30% of the rows)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
27,15,136,70.0,32,110,37.1,0.153,43.0
641,5,128,80.0,0,0,34.6,0.144,45.0
152,11,103,68.0,40,0,46.2,0.126,42.0
302,8,100,76.0,0,0,38.7,0.190,42.0
541,7,97,76.0,32,91,40.9,0.871,29.0
...,...,...,...,...,...,...,...,...
237,2,127,46.0,21,335,34.4,0.176,22.0
322,4,110,66.0,0,0,31.9,0.471,29.0
462,2,75,64.0,24,55,29.7,0.370,33.0
109,1,95,74.0,21,73,25.9,0.673,36.0


In [None]:
# Build the classifier as a pipeline: min-max scaling followed by 5-nearest-neighbours.
# KNN ranks neighbours by distance, so on the raw columns the wide-range features
# (e.g. Insulin spans 0-846 while DiabetesPedigreeFunction stays below 3) would
# dominate the vote. The scaler is fitted inside the pipeline, i.e. only on the data
# passed to .fit(), so no test-set statistics leak into training.

KNN_model = make_pipeline(
    MinMaxScaler(),
    KNeighborsClassifier(n_neighbors = 5),
) # exposes the same .fit() / .predict() interface as the bare classifier

In [None]:
# Fit the model on the training split only; the test split is held back for evaluation.

KNN_model.fit(x_train, y_train) # Learns from the training features together with their Outcome labels.

In [None]:
# Predict the Outcome for both splits with the fitted model
# (training predictions first, then test predictions).
yhat_train, yhat_test = (KNN_model.predict(features) for features in (x_train, x_test))

In [None]:
# Report the accuracy of the predictions: first line is the training split,
# second line is the test split.

for y_true, y_pred in ((y_train, yhat_train), (y_test, yhat_test)):
    print(accuracy_score(y_true, y_pred))

0.789795918367347
0.7523809523809524
