In [11]:
# Import necessary libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn import tree
from sklearn.metrics import balanced_accuracy_score, accuracy_score, precision_score, recall_score, f1_score, hamming_loss, classification_report
from sklearn.preprocessing import LabelEncoder

In [12]:
# Load dataset
dataset= pd.read_csv('dataset.csv')
#print(dataset.columns)

# Features to use
features = ['Patient Age', "Mother's age", "Father's age", 'No. of previous abortion',
    'White Blood cell count (thousand per microliter)']
targets = ['Genetic Disorder', 'Disorder Subclass']

# Select only the desired columns
selected_columns = features + targets
dataset = dataset[selected_columns]


# Replace 'Unknown' with NaN in Target 
dataset['Genetic Disorder'].replace('Unknown', np.nan, inplace=True)
dataset['Disorder Subclass'].replace('Unknown', np.nan, inplace=True)
# Eliminate samples of NaN targets
dataset.dropna(inplace=True)


# Encode categorical variables
label_encoder = LabelEncoder()

# List of categorical columns to encode
categorical_unordered_columns = [
        'Genetic Disorder', 'Disorder Subclass'
]

quantitative_with_unknowns_or_ordered_columns = [
    'Patient Age', "Mother's age", "Father's age", 'No. of previous abortion',
    'White Blood cell count (thousand per microliter)']

dataset_encoded=dataset.copy()

for column in quantitative_with_unknowns_or_ordered_columns:
    dataset_encoded[column] = label_encoder.fit_transform(dataset_encoded[column].astype(str))
dataset=pd.get_dummies(dataset_encoded, columns=categorical_unordered_columns, drop_first=False)


# Alocate features and labels
X = dataset.iloc[:, :-12]  # Features
y = dataset.iloc[:, -12:]  # Labels (last two columns encoded into 3+9)


# Split the data into validation, validation and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75,  random_state=1) # training as 75% 


In [13]:
# Select and train model of classifier

# Best Hyperparameters: {'estimator__criterion': 'gini', 'estimator__max_depth': 10, 'estimator__max_features': None, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 10, 'estimator__splitter': 'random'}
# Best Mean Accuracy: 0.2690801625415589
'''
dt = MultiOutputClassifier(tree.DecisionTreeClassifier(criterion= 'gini', max_depth= 10, max_features= None, min_samples_leaf= 1, min_samples_split= 10, splitter = 'random', random_state = 1))

kf = KFold(n_splits=5, shuffle=True, random_state=1)

cv_scores = cross_val_score(dt, X_train, y_train, cv=kf)

print("Cross-validated scores:", cv_scores)
print("Mean cross-validated score:", cv_scores.mean())'''

dt = tree.DecisionTreeClassifier(criterion= 'gini', max_depth= 10, max_features= None, min_samples_leaf= 1, min_samples_split= 10, splitter = 'random', random_state = 1)

dt.fit(X_train, y_train)

y_pred = dt.predict(X_test) 


In [14]:
# Get 12 targets to evaluate model
'''y_test_columns = y_test.columns[-12:] 
y_pred_columns = y_pred[:, -12:]

balanced_acc_scores_ = {}

for i, column in enumerate(y_test_columns):
    y_test_ = y_test[column]
    y_pred_ = y_pred_columns[:, i]
    
    balanced_acc_scores_[column] = balanced_accuracy_score(y_test_, y_pred_)'''
# y_test_new is DataFrames
y_test_columns = y_test.columns[-12:]
# Convert y_pred_new to a DataFrame
y_pred_df = pd.DataFrame(y_pred, columns=y_test_columns)

balanced_acc_scores = {}

for column in y_test_columns:
    y_test_column = y_test[column]
    y_pred_column = y_pred_df[column]
    
    balanced_acc_scores[column] = balanced_accuracy_score(y_test_column, y_pred_column)


# Use dataset without Unknown values

In [15]:
# Train models without unknown data

# Replace 'Unknown' with NaN
dataset.replace('Unknown', np.nan, inplace=True)

dataset.dropna(inplace=True)


In [16]:
# Alocate features and labels

X_new = dataset.iloc[:, :-12]  # Features
y_new = dataset.iloc[:, -12:]  # Labels (last two columns encoded into 3+9)

# split the data into validation, validation and test

X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X, y, train_size=0.75,  random_state=1) # training as 75%

In [17]:
# Select and train model of classifier

# Best Hyperparameters: {'estimator__criterion': 'gini', 'estimator__max_depth': 10, 'estimator__max_features': None, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 10, 'estimator__splitter': 'random'}
# Best Mean Accuracy: 0.2690801625415589

#dt_new = MultiOutputClassifier(tree.DecisionTreeClassifier(criterion= 'gini', max_depth= 10, max_features= None, min_samples_leaf= 1, min_samples_split= 10, splitter = 'random', random_state = 1))
dt_new = tree.DecisionTreeClassifier(criterion= 'gini', max_depth= 10, max_features= None, min_samples_leaf= 1, min_samples_split= 10, splitter = 'random', random_state = 1)
'''
kf = KFold(n_splits=5, shuffle=True, random_state=1)  
cv_scores = cross_val_score(dt_new, X_train_new, y_train_new, cv=kf)

print("Cross-validated scores:", cv_scores)
print("Mean cross-validated score:", cv_scores.mean())
'''
dt_new.fit(X_train_new, y_train_new)

y_pred_new = dt_new.predict(X_test_new) 


In [18]:
'''# Get 12 targets to evaluate model
y_test_columns = y_test.columns[-12:] 
y_pred_columns = y_pred[:, -12:]

balanced_acc_scores_ = {}

for i, column in enumerate(y_test_columns):
    y_test_ = y_test[column]
    y_pred_ = y_pred_columns[:, i]
    
    balanced_acc_scores_[column] = balanced_accuracy_score(y_test_, y_pred_)
'''
# y_test_new is DataFrames
y_test_columns_new = y_test_new.columns[-12:]
# Convert y_pred_new to a DataFrame
y_pred_df_new = pd.DataFrame(y_pred_new, columns=y_test_columns_new)

balanced_acc_scores_new = {}

for column in y_test_columns_new:
    y_test_column = y_test_new[column]
    y_pred_column = y_pred_df_new[column]
    
    balanced_acc_scores_new[column] = balanced_accuracy_score(y_test_column, y_pred_column)


In [19]:
# Evaluate model
y_test_array = y_test.to_numpy()
y_test_array_new = y_test_new.to_numpy()

# Before removing Unknowns
accuracy = accuracy_score(y_test_array, y_pred)
precision = precision_score(y_test_array, y_pred, average='micro') 
recall = recall_score(y_test_array, y_pred, average='micro')
f1 = f1_score(y_test_array, y_pred, average='micro')
hamming = hamming_loss(y_test_array, y_pred)

# Unknowns Removed
accuracy_new = accuracy_score(y_test_array_new, y_pred_new)
precision_new = precision_score(y_test_array_new, y_pred_new, average='micro') 
recall_new = recall_score(y_test_array_new, y_pred_new, average='micro')
f1_new = f1_score(y_test_array_new, y_pred_new, average='micro')
hamming_new = hamming_loss(y_test_array_new, y_pred_new)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("Hamming Loss:", hamming)
print("Accuracy:", balanced_acc_scores)

print('\n-------------------------REMOVING UNKNOWNS ---------------------------')

print("\nAccuracy:", accuracy_new)
print("Precision:", precision_new)
print("Recall:", recall_new)
print("F1-Score:", f1_new)
print("Hamming Loss:", hamming_new)
print("Accuracy:", balanced_acc_scores_new)




Accuracy: 0.008865248226950355
Precision: 0.4715447154471545
Recall: 0.1799645390070922
F1-Score: 0.2605068976580045
Hamming Loss: 0.17028664302600474
Accuracy: {'Genetic Disorder_Mitochondrial genetic inheritance disorders': 0.4975843998987006, 'Genetic Disorder_Multifactorial genetic inheritance disorders': 0.5011332882539953, 'Genetic Disorder_Single-gene inheritance diseases': 0.4976028489556257, "Disorder Subclass_Alzheimer's": 0.5, 'Disorder Subclass_Cancer': 0.49988864142538975, 'Disorder Subclass_Cystic fibrosis': 0.49657142653527725, 'Disorder Subclass_Diabetes': 0.5015433806800713, 'Disorder Subclass_Hemochromatosis': 0.4996426018584703, "Disorder Subclass_Leber's hereditary optic neuropathy": 0.5, 'Disorder Subclass_Leigh syndrome': 0.4995000806158033, 'Disorder Subclass_Mitochondrial myopathy': 0.5024561253355917, 'Disorder Subclass_Tay-Sachs': 0.5002953538014866}

-------------------------REMOVING UNKNOWNS ---------------------------

Accuracy: 0.008865248226950355
Precisi

In [20]:
# Metrics and information for each output column
print('\nClassification Report:')
print(classification_report(y_test, y_pred))
for column, score in balanced_acc_scores.items():
    print(f'Balanced Accuracy for {column}: {score}')

print('\n-------------------------REMOVING UNKNOWNS ---------------------------')

print('\nClassification Report:')
print(classification_report(y_test_new, y_pred_new))

for column, score in balanced_acc_scores_new.items():
    print(f'Balanced Accuracy for {column}: {score}')



Classification Report:
              precision    recall  f1-score   support

           0       0.52      0.58      0.55      2343
           1       0.20      0.00      0.01       471
           2       0.37      0.13      0.20      1698
           3       0.00      0.00      0.00        32
           4       0.00      0.00      0.00        22
           5       0.07      0.00      0.01       745
           6       0.22      0.00      0.01       417
           7       0.00      0.00      0.00       315
           8       0.00      0.00      0.00       141
           9       0.25      0.01      0.03      1186
          10       0.31      0.01      0.03      1016
          11       0.15      0.01      0.01       638

   micro avg       0.47      0.18      0.26      9024
   macro avg       0.17      0.06      0.07      9024
weighted avg       0.31      0.18      0.19      9024
 samples avg       0.35      0.18      0.24      9024

Balanced Accuracy for Genetic Disorder_Mitochondrial ge

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
