# K-Nearest Neighbours

In [None]:
# Import necessary libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import balanced_accuracy_score, f1_score, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler


# A single encoder instance; the encoding cells below re-fit it on each column in turn.
label_encoder = LabelEncoder()

# Load dataset
df = pd.read_csv('dataset.csv')

# Replace 'Unknown' with NaN in the target columns.
# The result is assigned back to `df` rather than calling
# `.replace(..., inplace=True)` on a column selection: that chained in-place form
# triggers a pandas warning and does not modify `df` under Copy-on-Write.
target_columns = ['Genetic Disorder', 'Disorder Subclass']
df[target_columns] = df[target_columns].replace('Unknown', np.nan)

# NOTE: dropna() without `subset` removes every row that has a NaN in *any*
# column, not only the rows whose target was 'Unknown'.
df = df.dropna()

# Encode Data

Before Feature Selection

In [None]:
# Encode categorical variables
# Despite its name, this list also ends with the two target columns
# ('Genetic Disorder', 'Disorder Subclass'); they are split off later.
all_x_columns = [
    'Patient Age', "Genes in mother's side", 'Inherited from father', 'Maternal gene', 'Paternal gene',
    'Blood cell count (mcL)', "Mother's age", "Father's age", 'Status', 'Respiratory Rate (breaths/min)',
    'Heart Rate (rates/min)', 'Follow-up', 'Gender', 'Birth asphyxia',
    'Autopsy shows birth defect (if applicable)', 'Place of birth', 'Folic acid details (peri-conceptional)',
    'H/O serious maternal illness', 'H/O radiation exposure (x-ray)', 'H/O substance abuse',
    'Assisted conception IVF/ART', 'History of anomalies in previous pregnancies', 'No. of previous abortion',
    'Birth defects', 'White Blood cell count (thousand per microliter)', 'Blood test result',
    'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5', 'Genetic Disorder', 'Disorder Subclass'
]

# Work on a copy so the loaded frame `df` keeps its original values.
df_encoded = df.copy()
for column in all_x_columns:
    # Leave numeric columns untouched: converting numbers to strings and
    # label-encoding them assigns codes in lexicographic order ('10.0' sorts
    # before '2.0'), which scrambles the ordering that KNN distances rely on.
    if pd.api.types.is_numeric_dtype(df_encoded[column]):
        continue
    # The shared encoder is re-fitted per column, so after the loop it only
    # remembers the classes of the last column it encoded.
    df_encoded[column] = label_encoder.fit_transform(df_encoded[column].astype(str))


Feature Selection (optional: run the next cell to keep only the ANOVA-selected features, or skip it to use all features)

In [None]:
# With feature selection ANOVA
# Hard-coded subset of feature columns, followed by the two target columns.
selected_columns = ["Symptom 5", "Symptom 4", "Symptom 3", "Symptom 2",
                    "Genes in mother's side", "Inherited from father","Genetic Disorder","Disorder Subclass"]

# Build the encoded frame from a selection of `df` instead of overwriting `df`
# itself. The loaded frame keeps all of its columns, so this cell can be re-run
# and the full-feature encoding cell still works after this one has been run.
df_encoded = df[selected_columns].copy()
for column in selected_columns:
    # The shared encoder is re-fitted on every column in turn.
    df_encoded[column] = label_encoder.fit_transform(df_encoded[column].astype(str))

In [None]:
# Separate the two target columns (y) from the remaining feature columns (X).
target_names = ['Genetic Disorder', 'Disorder Subclass']
y = df_encoded[target_names]
X = df_encoded.drop(columns=target_names)

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=1,
)

# Standardize features (important for KNN): the scaler is fitted on the training
# rows only and then applied to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Untuned base estimators: a wrapper that fits one KNN per target column, plus
# one standalone KNN for each target.
knn_multi = MultiOutputClassifier(KNeighborsClassifier())
knn_gd = KNeighborsClassifier()
knn_ds = KNeighborsClassifier()

Run one or the other of the following two cells to choose the neighbour-weighting scheme for models accounting for unbalanced data:

In [None]:
# Candidate value(s) for the KNN `weights` hyper-parameter; read by the
# hyper-parameter grids defined in the tuning cells below.
weights = ['uniform']

In [None]:
# Candidate value(s) for the KNN `weights` hyper-parameter; read by the
# hyper-parameter grids defined in the tuning cells below. Running this cell
# replaces any value set by the 'uniform' cell.
weights = ['distance']

Computed for each target separately (one tuned classifier per target column)

In [None]:
# Tuning
# Hyper-parameter candidates sampled by the randomized search. `weights` is the
# list set by whichever weighting cell was run last.
param_grid = {
    'n_neighbors': [9, 15, 17, 21],
    'algorithm': ['auto', 'ball_tree', 'kd_tree'],
    'leaf_size': [10, 20, 30, 40],
    'metric': ['minkowski', 'euclidean', 'manhattan'],
    'weights':weights
}


def _tune_and_predict(estimator, target_name):
    """Tune a KNN classifier for one target column and predict the test set.

    Parameters
    ----------
    estimator : unfitted classifier handed to the randomized search.
    target_name : name of the column of `y_train` to fit against.

    Returns
    -------
    (search, predictions) : the fitted RandomizedSearchCV object and its
        predictions for `X_test_scaled`.
    """
    search = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_grid,
        n_iter=100,
        cv=3,
        verbose=2,
        refit=True,
        scoring='balanced_accuracy',
        # Fixed seed so the same candidates are sampled on every run.
        random_state=1,
    )
    search.fit(X_train_scaled, y_train[target_name])
    return search, search.predict(X_test_scaled)


# Disorder Subclass
knn_tuned_ds, y_pred_ds = _tune_and_predict(knn_ds, 'Disorder Subclass')

# Genetic Disorder
knn_tuned_gd, y_pred_gd = _tune_and_predict(knn_gd, 'Genetic Disorder')

# `knn_tuned` used to be reassigned and ended up holding the Genetic Disorder
# search (the one fitted last); keep that name pointing at the same model.
knn_tuned = knn_tuned_gd

# Assess performance

y_test_array = y_test.to_numpy()

# Select the ground truth by column name so the metrics cannot silently pair a
# prediction with the wrong target if the column order of `y_test` changes.
y_true_gd = y_test['Genetic Disorder'].to_numpy()
y_true_ds = y_test['Disorder Subclass'].to_numpy()

f1_score_genetic_disorder = f1_score(y_true_gd, y_pred_gd, average='weighted')
f1_score_disorder_subclass = f1_score(y_true_ds, y_pred_ds, average='weighted')

f1_score_macro_genetic_disorder = f1_score(y_true_gd, y_pred_gd, average='macro')
f1_score_macro_disorder_subclass = f1_score(y_true_ds, y_pred_ds, average='macro')

accuracy_genetic_disorder = accuracy_score(y_true_gd, y_pred_gd)
accuracy_disorder_subclass = accuracy_score(y_true_ds, y_pred_ds)

balanced_accuracy_genetic_disorder = balanced_accuracy_score(y_true_gd, y_pred_gd)
balanced_accuracy_disorder_subclass = balanced_accuracy_score(y_true_ds, y_pred_ds)

Multioutput

In [None]:
# Tuning
# Same candidates as the per-target search, but addressed through the
# `estimator__` prefix because the KNN is wrapped in a MultiOutputClassifier.
# Note that this replaces the `param_grid` defined by the per-target cell.
param_grid = {
    'estimator__n_neighbors': [9, 15, 17, 21],
    'estimator__algorithm': ['auto', 'ball_tree', 'kd_tree'],
    'estimator__leaf_size': [10, 20, 30, 40],
    'estimator__metric': ['minkowski', 'euclidean', 'manhattan'],
    'estimator__weights': weights
}


# Both targets at once (Genetic Disorder and Disorder Subclass).
# No `scoring` is passed, so the search ranks candidates with the wrapped
# estimator's own `score` method rather than the balanced accuracy used by the
# per-target search.
knn_multi_tuned = RandomizedSearchCV(
    estimator=knn_multi,
    param_distributions=param_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    refit=True,
    # Fixed seed so the same candidates are sampled on every run.
    random_state=1,
)

knn_multi_tuned.fit(X_train_scaled, y_train)
y_pred = knn_multi_tuned.predict(X_test_scaled)

# Assess performance

y_test_array = y_test.to_numpy()

# Prediction columns follow the column order of `y_train`; resolve the positions
# by name instead of hard-coding 0 and 1.
gd_idx = y_train.columns.get_loc('Genetic Disorder')
ds_idx = y_train.columns.get_loc('Disorder Subclass')
y_true_gd = y_test['Genetic Disorder'].to_numpy()
y_true_ds = y_test['Disorder Subclass'].to_numpy()
y_hat_gd = y_pred[:, gd_idx]
y_hat_ds = y_pred[:, ds_idx]

f1_score_genetic_disorder = f1_score(y_true_gd, y_hat_gd, average='weighted')
f1_score_disorder_subclass = f1_score(y_true_ds, y_hat_ds, average='weighted')

f1_score_macro_genetic_disorder = f1_score(y_true_gd, y_hat_gd, average='macro')
f1_score_macro_disorder_subclass = f1_score(y_true_ds, y_hat_ds, average='macro')

accuracy_genetic_disorder = accuracy_score(y_true_gd, y_hat_gd)
accuracy_disorder_subclass = accuracy_score(y_true_ds, y_hat_ds)

balanced_accuracy_genetic_disorder = balanced_accuracy_score(y_true_gd, y_hat_gd)
balanced_accuracy_disorder_subclass = balanced_accuracy_score(y_true_ds, y_hat_ds)

Print results

In [None]:
# Report the metrics left behind by whichever tuning cell was run last.
# Each entry pairs the exact label to print with the value computed for it.
genetic_disorder_report = (
    ("F1 Score for Genetic Disorder:", f1_score_genetic_disorder),
    ("Macro F1 Score for Genetic Disorder:", f1_score_macro_genetic_disorder),
    ("Accuracy for Genetic Disorder:", accuracy_genetic_disorder),
    ("Balanced Accuracy for Genetic Disorder:", balanced_accuracy_genetic_disorder),
)
disorder_subclass_report = (
    ("F1 Score for Disorder Subclass:", f1_score_disorder_subclass),
    ("Macro F1 Score for Disorder Subclass:", f1_score_macro_disorder_subclass),
    ("Accuracy for Disorder Subclass:", accuracy_disorder_subclass),
    ("Balanced Accuracy for Disorder Subclass:", balanced_accuracy_disorder_subclass),
)

print("Classification for Genetic Disorder:")

for label, value in genetic_disorder_report:
    print(label, value)

# The leading newline separates the two sections with a blank line.
print("\nClassification for Disorder Subclass:")

for label, value in disorder_subclass_report:
    print(label, value)
