# K-Nearest Neighbors Classifier

In [6]:
# Import necessary libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [7]:
# Load dataset

dataset = pd.read_csv('dataset.csv')

# Treat 'Unknown' in either target column as a missing value.
# The result is assigned back to the frame instead of calling
# replace(..., inplace=True) on a column selection: that chained in-place form
# raises a FutureWarning in pandas 2.x and no longer updates `dataset` once
# Copy-on-Write is active.
target_columns = ['Genetic Disorder', 'Disorder Subclass']
dataset[target_columns] = dataset[target_columns].replace('Unknown', np.nan)

# Drop every row that has a missing value in any column
dataset = dataset.dropna()

# Allocate features and labels
# NOTE(review): assumes the two target columns named above are the last two
# columns of dataset.csv - confirm against the file.

X = dataset.iloc[:, :-2]  # Features
y = dataset.iloc[:, -2:]  # Labels (last two columns)

# Perform one-hot encoding on the categorical features.
# drop_first drops one indicator column per categorical feature to avoid multicollinearity.
X = pd.get_dummies(X, drop_first=True)

# The targets are intentionally left as two label columns (not one-hot encoded).
# The former guard `y.nunique().any() > 2` compared a boolean with 2, so it was
# always False and the encoding never ran; the evaluation cell also selects the
# targets by position (last / second-to-last column), which requires exactly one
# column per target.



In [8]:
# Split the data into a training set (75%) and a test set (remaining 25%)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=1,  # fixed seed so the split is reproducible
)


# Standardize features (important for KNN).
# The scaler is fitted on the training split only and then applied to both splits.

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [9]:
# Build the KNN model: MultiOutputClassifier fits one KNeighborsClassifier per target column

k_neighbors = 3
knn = MultiOutputClassifier(
    KNeighborsClassifier(n_neighbors=k_neighbors)
)

# 5-fold cross-validation on the scaled training data.
# No `scoring` is passed, so the estimator's own `score` method is used; for
# MultiOutputClassifier a sample only counts as correct when every target is
# predicted correctly.
cv_scores = cross_val_score(knn, X_train_scaled, y_train, cv=5)

print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Score:", np.mean(cv_scores))

# Fit on the whole training set, then predict both targets for the test set
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)


Cross-Validation Scores: [0.14259328 0.15367566 0.1344662  0.14591799 0.13889915]
Mean Cross-Validation Score: 0.14311045437753972


In [5]:
# Evaluate the model

# Separate the multi-output targets by position:
# "last1" is the last target column, "last2" the second-to-last one.
y_test_last1 = y_test.iloc[:, -1]
y_test_last2 = y_test.iloc[:, -2]

# y_pred is indexed as a 2-D array whose columns follow the column order of y_test
y_pred_last1 = y_pred[:, -1]
y_pred_last2 = y_pred[:, -2]


# Balanced accuracy per target. Each score must use its own column pair;
# previously both scores were computed from the second-to-last column.
balanced_acc_last1 = balanced_accuracy_score(y_test_last1, y_pred_last1)
balanced_acc_last2 = balanced_accuracy_score(y_test_last2, y_pred_last2)


NameError: name 'y_pred' is not defined

# Use dataset without Unknown values

In [None]:
# Train models without unknown data

# Turn every 'Unknown' entry (in any column) into NaN, then keep only the rows
# that have no missing value left
dataset = dataset.replace('Unknown', np.nan).dropna()

In [None]:
# Allocate features and labels

X = dataset.iloc[:, :-2]  # Features
y = dataset.iloc[:, -2:]  # Labels (last two columns)

# Perform one-hot encoding on the categorical features.
# drop_first drops one indicator column per categorical feature to avoid multicollinearity.
X = pd.get_dummies(X, drop_first=True)

# The targets are intentionally left as two label columns (not one-hot encoded).
# The former guard `y.nunique().any() > 2` compared a boolean with 2, so it was
# always False and the encoding never ran; the evaluation cell also selects the
# targets by position (last / second-to-last column), which requires exactly one
# column per target.

In [None]:
# Split the data into a training set (75%) and a test set (remaining 25%)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=1,  # fixed seed so the split is reproducible
)


# Standardize features (important for KNN).
# The scaler is fitted on the training split only and then applied to both splits.

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Build the KNN model: MultiOutputClassifier fits one KNeighborsClassifier per target column

k_neighbors = 3
knn = MultiOutputClassifier(
    KNeighborsClassifier(n_neighbors=k_neighbors)
)

# 5-fold cross-validation on the scaled training data.
# No `scoring` is passed, so the estimator's own `score` method is used; for
# MultiOutputClassifier a sample only counts as correct when every target is
# predicted correctly.
cv_scores = cross_val_score(knn, X_train_scaled, y_train, cv=5)

print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Score:", np.mean(cv_scores))

# Fit on the whole training set, then predict both targets for the test set
knn.fit(X_train_scaled, y_train)
y_pred = knn.predict(X_test_scaled)


Cross-Validation Scores: [0.1589404  0.16666667 0.11333333 0.18666667 0.12666667]
Mean Cross-Validation Score: 0.15045474613686535


In [None]:
# Evaluate the model

# Separate the multi-output targets by position:
# "last1" is the last target column, "last2" the second-to-last one.
# The `_new` suffix keeps these results apart from the first scenario's variables.
y_test_last1_new = y_test.iloc[:, -1]
y_test_last2_new = y_test.iloc[:, -2]

# y_pred is indexed as a 2-D array whose columns follow the column order of y_test
y_pred_last1_new = y_pred[:, -1]
y_pred_last2_new = y_pred[:, -2]


# Balanced accuracy per target. Each score must use its own column pair;
# previously both scores were computed from the second-to-last column.
balanced_acc_last1_new = balanced_accuracy_score(y_test_last1_new, y_pred_last1_new)
balanced_acc_last2_new = balanced_accuracy_score(y_test_last2_new, y_pred_last2_new)


In [None]:
# Metrics and information for each output column

# Previously collected metrics.
# NOTE(review): the existing metrics are read from 'metrics_report.xlsx' while the
# combined result is written to 'metrics_report.csv' - confirm both paths are intended.
existing_metrics_df = pd.read_excel('metrics_report.xlsx')

# Ordered (metric, value) pairs for both scenarios.
# A list of pairs is used instead of a dict literal because the two scenarios
# share the same metric names: repeated keys in a dict literal keep only the
# last value, which silently discarded the original-dataset results.
metric_rows = [
    ('Model', 'KNN - W/ Unknowns'),

    # Metrics for the original dataset
    ('Scenario', 'Original'),
    ('Balanced Accuracy (Last Column)', balanced_acc_last1),
    ('Balanced Accuracy (Second to Last Column)', balanced_acc_last2),
    ('Classification Report (Last Column)', classification_report(y_test_last1, y_pred_last1)),
    ('Classification Report (Second to Last Column)', classification_report(y_test_last2, y_pred_last2)),

    # Metrics for the dataset without samples with unknown values
    ('Scenario', 'KNN - Removed Unknowns'),
    ('Balanced Accuracy (Last Column)', balanced_acc_last1_new),
    ('Balanced Accuracy (Second to Last Column)', balanced_acc_last2_new),
    ('Classification Report (Last Column)', classification_report(y_test_last1_new, y_pred_last1_new)),
    ('Classification Report (Second to Last Column)', classification_report(y_test_last2_new, y_pred_last2_new)),
]

# One row per pair, in the order listed above
metrics_df = pd.DataFrame(metric_rows, columns=['Metric', 'Value'])

# Append the new rows below the previously collected metrics
combined_metrics_df = pd.concat([existing_metrics_df, metrics_df], ignore_index=True)

# Save the combined DataFrame as CSV (without the index column)
combined_metrics_df.to_csv('metrics_report.csv', index=False)