# K-Nearest Neighbors Classifier

In [32]:
# Import necessary libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [33]:
# Load dataset

dataset = pd.read_csv('dataset.csv')

# Replace 'Unknown' with NaN in the two target columns.
# The result is assigned back to the frame instead of calling
# `dataset[col].replace(..., inplace=True)`: in-place mutation of a column
# selection is chained assignment, which recent pandas versions warn about and
# which no longer updates the parent frame once Copy-on-Write is enabled.
target_columns = ['Genetic Disorder', 'Disorder Subclass']
dataset[target_columns] = dataset[target_columns].replace('Unknown', np.nan)

# Drop every row that has a missing value in any column (features or targets)
dataset.dropna(inplace=True)

# Allocate features and labels

X = dataset.iloc[:, :-2]  # Features
y = dataset.iloc[:, -2:]  # Labels (last two columns)

# Perform one-hot encoding on the categorical features
X = pd.get_dummies(X, drop_first=True)
# drop_first is used to drop one of the columns for each categorical feature to avoid multicollinearity.

# The targets are intentionally left as their original class labels.
# The previous guard `if y.nunique().any() > 2:` compared a boolean with 2, so it
# was always False and the one-hot encoding of `y` never ran; the evaluation
# cells further down index the two label columns directly and report per-class
# metrics by label, so the dead branch is removed rather than "fixed".


In [34]:
# Split the data into training and test sets (75% / 25%)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,   # training as 75%
    random_state=1,    # fixed seed so the split is reproducible
)


# Standardize features (important for KNN, which is distance based).
# The scaler learns its statistics from the training rows only and the same
# transformation is then applied to both partitions.

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [35]:
# Build the multi-output KNN model: one KNeighborsClassifier per target column

k_neighbors = 3
knn = MultiOutputClassifier(
    KNeighborsClassifier(n_neighbors=k_neighbors)
)

# 5-fold cross-validation on the training partition.
# No `scoring` argument is given, so the estimator's own `score` is used —
# presumably exact-match accuracy over both targets; confirm in the sklearn docs.
# NOTE(review): X_train_scaled was standardized on the whole training set before
# the folds are formed, so the held-out folds are not completely unseen.
cv_scores = cross_val_score(
    estimator=knn,
    X=X_train_scaled,
    y=y_train,
    cv=5,
)

print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Score:", cv_scores.mean())

# Fit on the full training partition, then predict the held-out test rows
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)


Cross-Validation Scores: [0.14259328 0.15367566 0.1344662  0.14591799 0.13889915]
Mean Cross-Validation Score: 0.14311045437753972


In [36]:
# Evaluate the model

# Separate the multi output targets: `last1` is the last label column,
# `last2` the second to last one (same column order in y_test and y_pred).
y_test_last1 = y_test.iloc[:, -1]
y_test_last2 = y_test.iloc[:, -2]

y_pred_last1 = y_pred[:, -1]
y_pred_last2 = y_pred[:, -2]


# Balanced accuracy per target column. Each score must be computed from its own
# column; previously both were computed from the `last2` vectors, so the value
# reported for the last column was just a copy of the second-to-last one.
balanced_acc_last1 = balanced_accuracy_score(y_test_last1, y_pred_last1)
balanced_acc_last2 = balanced_accuracy_score(y_test_last2, y_pred_last2)


# Use dataset without 'Unknown' values in any column

In [37]:
# Train models without unknown data

# Treat the 'Unknown' marker as missing everywhere in the frame (features as
# well as targets) ...
dataset.replace(to_replace='Unknown', value=np.nan, inplace=True)

# ... and discard every row that is left with at least one missing value
dataset.dropna(axis=0, how='any', inplace=True)

In [38]:
# Allocate features and labels

X = dataset.iloc[:, :-2]  # Features
y = dataset.iloc[:, -2:]  # Labels (last two columns)

# Perform one-hot encoding on the categorical features
X = pd.get_dummies(X, drop_first=True)
# drop_first is used to drop one of the columns for each categorical feature to avoid multicollinearity.

# The targets are intentionally left as their original class labels.
# The previous guard `if y.nunique().any() > 2:` compared a boolean with 2, so it
# was always False and the one-hot encoding of `y` never ran; the evaluation
# cells index the two label columns directly, so the dead branch is removed
# rather than "fixed".

In [39]:
# Split the data into training and test sets (75% / 25%)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,   # training as 75%
    random_state=1,    # fixed seed so the split is reproducible
)


# Standardize features (important for KNN, which is distance based).
# The scaler learns its statistics from the training rows only and the same
# transformation is then applied to both partitions.

scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [43]:
# Build the multi-output KNN model: one KNeighborsClassifier per target column

k_neighbors = 3
knn = MultiOutputClassifier(
    KNeighborsClassifier(n_neighbors=k_neighbors)
)

# 5-fold cross-validation on the training partition.
# No `scoring` argument is given, so the estimator's own `score` is used —
# presumably exact-match accuracy over both targets; confirm in the sklearn docs.
# NOTE(review): X_train_scaled was standardized on the whole training set before
# the folds are formed, so the held-out folds are not completely unseen.
cv_scores = cross_val_score(
    estimator=knn,
    X=X_train_scaled,
    y=y_train,
    cv=5,
)

print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Score:", cv_scores.mean())

# Fit on the full training partition, then predict the held-out test rows
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)


Cross-Validation Scores: [0.1589404  0.16666667 0.11333333 0.18666667 0.12666667]
Mean Cross-Validation Score: 0.15045474613686535


In [41]:
# Evaluate the model

# Separate the multi output targets: `last1` is the last label column,
# `last2` the second to last one (same column order in y_test and y_pred).
y_test_last1_new = y_test.iloc[:, -1]
y_test_last2_new = y_test.iloc[:, -2]

y_pred_last1_new = y_pred[:, -1]
y_pred_last2_new = y_pred[:, -2]


# Balanced accuracy per target column. Each score must be computed from its own
# column; previously both were computed from the `last2` vectors, so the value
# reported for the last column was just a copy of the second-to-last one.
balanced_acc_last1_new = balanced_accuracy_score(y_test_last1_new, y_pred_last1_new)
balanced_acc_last2_new = balanced_accuracy_score(y_test_last2_new, y_pred_last2_new)


In [42]:
# Metrics and information for each output column.
# Each run is a list of (heading, true labels, predicted labels, balanced accuracy)
# entries; the first run is the model evaluated earlier in the notebook, the
# second one is the model trained after removing every 'Unknown' value.
last_column_title = '\nClassification Report (Last Column):'
second_last_title = '\nClassification Report (Second to Last Column):'

report_runs = (
    [
        (last_column_title, y_test_last1, y_pred_last1, balanced_acc_last1),
        (second_last_title, y_test_last2, y_pred_last2, balanced_acc_last2),
    ],
    [
        (last_column_title, y_test_last1_new, y_pred_last1_new, balanced_acc_last1_new),
        (second_last_title, y_test_last2_new, y_pred_last2_new, balanced_acc_last2_new),
    ],
)

for run_index, run_entries in enumerate(report_runs):
    if run_index == 1:
        # Visual divider between the two experiments
        print('\n-------------------------REMOVING UNKNOWNS ---------------------------')

    for title, true_labels, predicted_labels, balanced_acc in run_entries:
        print(title)
        print(classification_report(true_labels, predicted_labels))
        print("balanced accuracy:", balanced_acc)



Classification Report (Last Column):
                                     precision    recall  f1-score   support

                        Alzheimer's       0.03      0.06      0.04        32
                             Cancer       0.00      0.00      0.00        22
                    Cystic fibrosis       0.17      0.36      0.23       745
                           Diabetes       0.11      0.16      0.13       417
                    Hemochromatosis       0.10      0.10      0.10       315
Leber's hereditary optic neuropathy       0.04      0.04      0.04       141
                     Leigh syndrome       0.27      0.22      0.24      1186
             Mitochondrial myopathy       0.23      0.14      0.17      1016
                          Tay-Sachs       0.19      0.06      0.10       638

                           accuracy                           0.18      4512
                          macro avg       0.13      0.13      0.12      4512
                       weighted avg 