#### Case 1: Diabetes Classification Analysis


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

##### 1. Determine the accuracy of predicting the outcome when only one of the following feature pairs is considered at a time:
a. Glucose and Blood Pressure

b. Glucose and Insulin

c. Insulin and BMI

d. BMI and Diabetes Pedigree Function

In [3]:
# Load the datasets.
# `df` supplies the feature columns and the 'Outcome' labels used for training;
# `test_df` is a separate table that the fitted classifiers are asked to predict on.

df = pd.read_csv(filepath_or_buffer='../../datasets/diabetes.csv')
test_df = pd.read_csv(filepath_or_buffer='../../datasets/diabetes_data_table.csv')

In [13]:
# Evaluate a 5-nearest-neighbour classifier on each feature pair in isolation:
# fit on a train split of `df`, report train/test accuracy, then predict the
# rows of the separate `test_df` table.
feature_pairs = [
    ('Glucose', 'BloodPressure'),
    ('Glucose', 'Insulin'),
    ('Insulin', 'BMI'),
    ('BMI', 'DiabetesPedigreeFunction'),
]

y = df['Outcome']

for pair in feature_pairs:
    columns = list(pair)
    X = df[columns]
    # random_state is fixed so the split is reproducible from run to run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)

    # Name the pair first; without it the result groups cannot be told apart.
    print(f'Feature pair: {pair[0]} and {pair[1]}')
    print('Accuracy of K-NN classifier on training set: {:.2f}%'.format(
        knn.score(X_train, y_train) * 100))
    print('Accuracy of K-NN classifier on test set: {:.2f}%'.format(
        knn.score(X_test, y_test) * 100))

    # Select this pair's columns from the separate table once and reuse them
    # for both the class predictions and the class probabilities.
    test_features = test_df[columns]

    # These are predicted class labels, not an accuracy figure.
    print('Predicted outcome for each row of the test table:')
    print(knn.predict(test_features))

    print('Predicted class probabilities for each row of the test table:')
    print(knn.predict_proba(test_features))
    print("")


Accuracy of K-NN classifier on training set: 78.82%
Accuracy of K-NN classifier on test set: 73.44%
Predicting the accuracy based on the test set:
[0 1 1 0 1 0 1 0 0 1 1 0 1 0 1]
Predicting the probability based on the test set:
[[0.8 0.2]
 [0.4 0.6]
 [0.  1. ]
 [1.  0. ]
 [0.2 0.8]
 [1.  0. ]
 [0.4 0.6]
 [1.  0. ]
 [0.8 0.2]
 [0.4 0.6]
 [0.4 0.6]
 [1.  0. ]
 [0.  1. ]
 [0.8 0.2]
 [0.4 0.6]]

Accuracy of K-NN classifier on training set: 77.43%
Accuracy of K-NN classifier on test set: 73.96%
Predicting the accuracy based on the test set:
[0 0 1 0 1 0 1 0 0 0 1 0 1 0 1]
Predicting the probability based on the test set:
[[1.  0. ]
 [0.8 0.2]
 [0.2 0.8]
 [0.8 0.2]
 [0.4 0.6]
 [1.  0. ]
 [0.4 0.6]
 [1.  0. ]
 [0.8 0.2]
 [0.6 0.4]
 [0.4 0.6]
 [0.8 0.2]
 [0.2 0.8]
 [0.8 0.2]
 [0.4 0.6]]

Accuracy of K-NN classifier on training set: 77.08%
Accuracy of K-NN classifier on test set: 68.23%
Predicting the accuracy based on the test set:
[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
Predicting the probability ba

##### 3. Create a program that will accept the features in the data set. Then using K Nearest Neighbor Aggregation,
determine if the patient is diabetic or not.

a. Glucose and Blood Pressure

b. Glucose and Insulin

c. Glucose and Age

d. Insulin and BMI

e. Insulin and Age

In [14]:
# Train one K-NN model per feature pair.
# `models` is filled in the same order as `pairs`, so models[i] belongs to pairs[i].
pairs: list[tuple[str, str]] = [
    ('Glucose', 'BloodPressure'),
    ('Glucose', 'Insulin'),
    ('Glucose', 'Age'),
    ('Insulin', 'BMI'),
    ('Insulin', 'Age'),
]

y = df['Outcome']
models: list[KNeighborsClassifier] = []

for pair in pairs:
    # Restrict the feature matrix to the two columns of the current pair.
    X = df[[*pair]]
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # fit() returns the fitted estimator, so construction and training chain.
    knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
    models.append(knn)

In [17]:
import pickle

# Save each fitted model under a file name derived from its feature pair.
# `models` was filled in the same order as `pairs`, so zip() lines them up.
for pair, model in zip(pairs, models):
    model_name = f'../../models/knn_{pair[0].lower()}_{pair[1].lower()}_model.pkl'
    # The context manager closes the file handle as soon as the dump finishes,
    # including when pickle.dump raises. open(..., 'wb') does not create
    # missing directories, so ../../models must already exist.
    with open(model_name, 'wb') as model_file:
        pickle.dump(model, model_file)
