In [1]:
# Import necessary libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import balanced_accuracy_score, hamming_loss, f1_score, recall_score, roc_auc_score, accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import ParameterGrid, GridSearchCV


In [2]:
# Load dataset
dataset= pd.read_csv('dataset.csv')

# Replace 'Unknown' with NaN
dataset.replace('Unknown', np.nan, inplace=True)

# Eliminate samples with
dataset.dropna(inplace=True)



# Features to use
features = ['Patient Age', "Mother's age", "Father's age", 'No. of previous abortion',
    'White Blood cell count (thousand per microliter)']
targets = ['Genetic Disorder', 'Disorder Subclass']

# Select only the desired columns
selected_columns = features + targets
dataset = dataset[selected_columns]



# Encode categorical variables
label_encoder = LabelEncoder()

quantitative_with_unknowns_or_ordered_columns = [
    'Patient Age', "Mother's age", "Father's age", 'No. of previous abortion',
    'White Blood cell count (thousand per microliter)']


# Create a copy for encoding
dataset_encoded = dataset.copy()

# Encode target variables 'Genetic Disorder' and 'Disorder Subclass'
# Create mapping
genetic_disorder_mapping = {label: i for i, label in enumerate(dataset_encoded['Genetic Disorder'].unique()) if pd.notna(label)}
disorder_subclass_mapping = {label: i for i, label in enumerate(dataset_encoded['Disorder Subclass'].unique())}

# Replace each original value with its corresponding encoded value as per the mapping
dataset_encoded['Genetic Disorder'] = dataset_encoded['Genetic Disorder'].map(genetic_disorder_mapping)
dataset_encoded['Disorder Subclass'] = dataset_encoded['Disorder Subclass'].map(disorder_subclass_mapping)


# Correspond weights to classes
# Get values mapped
encoded_values_genetic_disorder = dataset_encoded['Genetic Disorder'].unique()
encoded_values_disorder_subclass = dataset_encoded['Disorder Subclass'].unique()


# Allocate features and labels
X = dataset.drop(columns=['Genetic Disorder', 'Disorder Subclass'])
y = dataset[['Genetic Disorder', 'Disorder Subclass']]

In [3]:
# Split the data into validation, validation and test

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75,  random_state=1) # training as 75% 

# Standardize features (important for KNN)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Tuning

In [4]:
# Tune model for Genetic Disorder
knn_genetic_disorder_tuning = KNeighborsClassifier()

# Define the parameter grid to search
param_grid_genetic_disorder = {
    'n_neighbors': [9, 15, 17, 21],
    'algorithm': ['auto', 'ball_tree', 'kd_tree'],
    'leaf_size': [10, 20, 30, 40],
    'metric': ['minkowski', 'euclidean', 'manhattan'],
    'weights': ['uniform', 'distance']
}

grid_search_genetic_disorder = GridSearchCV(
    knn_genetic_disorder_tuning, param_grid_genetic_disorder, cv=5, scoring='accuracy', n_jobs=-1
)
grid_search_genetic_disorder.fit(X_train_scaled, y_train['Genetic Disorder'])

# Print the best parameters and corresponding score
print("Best Parameters:", grid_search_genetic_disorder.best_params_)
print("Best Score:", grid_search_genetic_disorder.best_score_)


# Tune model for Disorder Subclass
knn_disorder_subclass_tuning = KNeighborsClassifier()

# Define the parameter grid to search
param_grid_disorder_subclass = {
    'n_neighbors': [9, 15, 17, 21],
    'algorithm': ['auto', 'ball_tree', 'kd_tree'],
    'leaf_size': [10, 20, 30, 40],
    'metric': ['minkowski', 'euclidean', 'manhattan'],
    'weights': ['uniform', 'distance']
}

grid_search_disorder_subclass = GridSearchCV(
    knn_disorder_subclass_tuning, param_grid_disorder_subclass, cv=5, scoring='accuracy', n_jobs=-1
)
grid_search_disorder_subclass.fit(X_train_scaled, y_train['Disorder Subclass'])

# Print the best parameters and corresponding score
print("Best Parameters:", grid_search_disorder_subclass.best_params_)
print("Best Score:", grid_search_disorder_subclass.best_score_)

Best Parameters: {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'minkowski', 'n_neighbors': 21, 'weights': 'uniform'}
Best Score: 0.47939072847682124




Best Parameters: {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'manhattan', 'n_neighbors': 9, 'weights': 'distance'}
Best Score: 0.21373068432671083


In [5]:
# Use model of KNN classifier

# Best Parameters: {'algorithm': 'auto', 'leaf_size': 10, 'metric': 'minkowski', 'n_neighbors': 9, 'weights': 'distance'}
knn = MultiOutputClassifier(
    estimator=KNeighborsClassifier(
        n_neighbors=9,
        algorithm='auto',
        leaf_size=10,
        metric='minkowski',
        weights='distance'
    )
)

# Train the KNN classifier on the entire training set
knn.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = knn.predict(X_test_scaled)


In [6]:
# y_test_new is DataFrames
y_test_columns = y_test.columns[:]
# Convert y_pred_new to a DataFrame
y_pred_df = pd.DataFrame(y_pred, columns=y_test_columns)

balanced_acc_scores = {}

for column in y_test_columns:
    y_test_column = y_test[column]
    y_pred_column = y_pred_df[column]
    
    balanced_acc_scores[column] = balanced_accuracy_score(y_test_column, y_pred_column)


In [7]:
# Generate classification report for each output separately

y_test_array = y_test.to_numpy()

hamming_loss_genetic_disorder = hamming_loss(y_test_array[:, 0], y_pred[:, 0])
hamming_loss_disorder_subclass = hamming_loss(y_test_array[:, 1], y_pred[:, 1])

f1_score_genetic_disorder = f1_score(y_test_array[:, 0], y_pred[:, 0], average='weighted')
f1_score_disorder_subclass = f1_score(y_test_array[:, 1], y_pred[:, 1], average='weighted')

recall_genetic_disorder = recall_score(y_test_array[:, 0], y_pred[:, 0], average='weighted')
recall_disorder_subclass = recall_score(y_test_array[:, 1], y_pred[:, 1], average='weighted')

accuracy_genetic_disorder = accuracy_score(y_test_array[:, 0], y_pred[:, 0])
accuracy_disorder_subclass = accuracy_score(y_test_array[:, 1], y_pred[:, 1])

balanced_accuracy_genetic_disorder = balanced_accuracy_score(y_test_array[:, 0], y_pred[:, 0])
balanced_accuracy_disorder_subclass = balanced_accuracy_score(y_test_array[:, 1], y_pred[:, 1])


# Use dataset without Cancer and Alzeimer

In [8]:
# Assuming your DataFrame is named dataset
filtered_dataset = dataset_encoded[(dataset['Disorder Subclass'] != 7) & (dataset['Disorder Subclass'] != 8)]

# Update your X and y with the filtered dataset
X_filtered = filtered_dataset.drop(columns=['Genetic Disorder', 'Disorder Subclass'])
y_filtered = filtered_dataset[['Genetic Disorder', 'Disorder Subclass']]


X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_filtered, y_filtered, train_size=0.75,  random_state=1) # training as 75%

In [9]:
# Standardize features (important for KNN)

scaler = StandardScaler()
X_train_scaled_new = scaler.fit_transform(X_train_new)
X_test_scaled_new = scaler.transform(X_test_new)

# Use model of KNN classifier

#Best Parameters: {'estimator__n_neighbors': 9, 'estimator__p': 1, 'estimator__weights': 'uniform'}
knn_new = MultiOutputClassifier(
    estimator=KNeighborsClassifier(
        n_neighbors=9,
        algorithm='auto',
        leaf_size=10,
        metric='minkowski',
        weights='distance'
    )
)

# Train the KNN classifier on the entire training set
knn_new.fit(X_train_scaled_new, y_train_new)


# Make predictions on the test set
y_pred_new = knn_new.predict(X_test_scaled_new)


In [10]:
# y_test_new is DataFrames
y_test_columns_new = y_test_new.columns[-2:]
# Convert y_pred_new to a DataFrame
y_pred_df_new = pd.DataFrame(y_pred_new, columns=y_test_columns_new)

balanced_acc_scores_new = {}

for column in y_test_columns_new:
    y_test_column_new = y_test_new[column]
    y_pred_column_new = y_pred_df_new[column]
    
    balanced_acc_scores_new[column] = balanced_accuracy_score(y_test_column_new, y_pred_column_new)


In [11]:
# Generate classification report for each output separately
y_test_array_new = y_test_new.to_numpy()

hamming_loss_genetic_disorder_new = hamming_loss(y_test_array_new[:, 0], y_pred_new[:, 0])
hamming_loss_disorder_subclass_new = hamming_loss(y_test_array_new[:, 1], y_pred_new[:, 1])

f1_score_genetic_disorder_new = f1_score(y_test_array_new[:, 0], y_pred_new[:, 0], average='weighted')
f1_score_disorder_subclass_new = f1_score(y_test_array_new[:, 1], y_pred_new[:, 1], average='weighted')

recall_genetic_disorder_new = recall_score(y_test_array_new[:, 0], y_pred_new[:, 0], average='weighted')
recall_disorder_subclass_new = recall_score(y_test_array_new[:, 1], y_pred_new[:, 1], average='weighted')

balanced_accuracy_genetic_disorder_new = balanced_accuracy_score(y_test_array_new[:, 0], y_pred_new[:, 0])
balanced_accuracy_disorder_subclass_new = balanced_accuracy_score(y_test_array_new[:, 1], y_pred_new[:, 1])

accuracy_genetic_disorder_new = accuracy_score(y_test_array_new[:, 0], y_pred_new[:, 0])
accuracy_disorder_subclass_new = accuracy_score(y_test_array_new[:, 1], y_pred_new[:, 1])


In [12]:
print("Classification for Genetic Disorder:")

print("Hamming Loss for Genetic Disorder:", hamming_loss_genetic_disorder)
print("F1 Score for Genetic Disorder:", f1_score_genetic_disorder)
print("Recall for Genetic Disorder:", recall_genetic_disorder)
print("Accuracy for Genetic Disorder:", accuracy_genetic_disorder)
print("Balanced Accuracy for Genetic Disorder:", balanced_accuracy_genetic_disorder)

print("\nClassification for Disorder Subclass:")

print("Hamming Loss for Disorder Subclass:", hamming_loss_disorder_subclass)
print("F1 Score for Disorder Subclass:", f1_score_disorder_subclass)
print("Recall for Disorder Subclass:", recall_disorder_subclass)
print("Balanced Accuracy for Disorder Subclass:", balanced_accuracy_disorder_subclass)
print("Accuracy for Disorder Subclass:", accuracy_disorder_subclass)


print("\n ------------ Without Cancer and Alzeimer samples --------------")

print("Classification for Genetic Disorder:")

print("\nHamming Loss for Genetic Disorder:", hamming_loss_genetic_disorder_new)
print("F1 Score for Genetic Disorder:", f1_score_genetic_disorder_new)
print("Recall for Genetic Disorder:", recall_genetic_disorder_new)
print("Balanced Accuracy for Genetic Disorder:", balanced_accuracy_genetic_disorder_new)
print("Accuracy for Genetic Disorder:", accuracy_genetic_disorder_new)

print("\nClassification for Disorder Subclass:")

print("Hamming Loss for Disorder Subclass:", hamming_loss_disorder_subclass_new)
print("F1 Score for Disorder Subclass:", f1_score_disorder_subclass_new)
print("Recall for Disorder Subclass:", recall_disorder_subclass_new)
print("Balanced Accuracy for Disorder Subclass:", balanced_accuracy_disorder_subclass_new)
print("Accuracy for Disorder Subclass:", accuracy_disorder_subclass_new)


Classification for Genetic Disorder:
Hamming Loss for Genetic Disorder: 0.5657370517928287
F1 Score for Genetic Disorder: 0.421005181153321
Recall for Genetic Disorder: 0.4342629482071713
Accuracy for Genetic Disorder: 0.4342629482071713
Balanced Accuracy for Genetic Disorder: 0.32585223742526964

Classification for Disorder Subclass:
Hamming Loss for Disorder Subclass: 0.7888446215139442
F1 Score for Disorder Subclass: 0.20234406195567967
Recall for Disorder Subclass: 0.21115537848605578
Balanced Accuracy for Disorder Subclass: 0.11326995444642501
Accuracy for Disorder Subclass: 0.21115537848605578

 ------------ Without Cancer and Alzeimer samples --------------
Classification for Genetic Disorder:

Hamming Loss for Genetic Disorder: 0.5657370517928287
F1 Score for Genetic Disorder: 0.421005181153321
Recall for Genetic Disorder: 0.4342629482071713
Balanced Accuracy for Genetic Disorder: 0.32585223742526964
Accuracy for Genetic Disorder: 0.4342629482071713

Classification for Disorder