# Decision Tree

In [20]:
# Import necessary libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import balanced_accuracy_score, hamming_loss, f1_score, recall_score, roc_auc_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ParameterGrid, GridSearchCV

In [39]:
# Load dataset
dataset= pd.read_csv('dataset.csv')

# Replace 'Unknown' with NaN
dataset.replace('Unknown', np.nan, inplace=True)
# Eliminate samples with NaN
dataset.dropna(inplace=True)


# Encode categorical variables
label_encoder = LabelEncoder()

categorical_columns = [
    'Genes in mother\'s side', 'Inherited from father', 'Maternal gene', 'Paternal gene', 'Status',
    'Respiratory Rate (breaths/min)', 'Heart Rate (rates/min)', 'Follow-up',
    'Gender', 'Birth asphyxia', 'Autopsy shows birth defect (if applicable)', 'Place of birth',
    'Folic acid details (peri-conceptional)',
    'H/O serious maternal illness', 'H/O radiation exposure (x-ray)', 'H/O substance abuse', 
    'Assisted conception IVF/ART', 'History of anomalies in previous pregnancies',
    'Birth defects', 'Blood test result',
    'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5'
]

quantitative_with_unknowns_or_ordered_columns = [
    'Patient Age', "Mother's age", "Father's age", 'No. of previous abortion',
    'White Blood cell count (thousand per microliter)']


for column in categorical_columns:
    dataset[column] = label_encoder.fit_transform(dataset[column].astype(str))

# Create a copy for encoding
dataset_encoded = dataset.copy()

# Encode target variables 'Genetic Disorder' and 'Disorder Subclass'
# Create mapping
genetic_disorder_mapping = {label: i for i, label in enumerate(dataset_encoded['Genetic Disorder'].unique()) if pd.notna(label)}
disorder_subclass_mapping = {label: i for i, label in enumerate(dataset_encoded['Disorder Subclass'].unique())}

# Replace each original value with its corresponding encoded value as per the mapping
dataset_encoded['Genetic Disorder'] = dataset_encoded['Genetic Disorder'].map(genetic_disorder_mapping)
dataset_encoded['Disorder Subclass'] = dataset_encoded['Disorder Subclass'].map(disorder_subclass_mapping)


# Allocate features and labels
X = dataset_encoded.drop(columns=['Genetic Disorder', 'Disorder Subclass'])
y = dataset_encoded[['Genetic Disorder', 'Disorder Subclass']]


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=1)

In [40]:
# Correspond weights to classes

# Get values mapped
encoded_values_genetic_disorder = dataset_encoded['Genetic Disorder'].unique()
encoded_values_disorder_subclass = dataset_encoded['Disorder Subclass'].unique()

# Inverse mappings to get back the original names
inverse_genetic_disorder_mapping = {i: label for label, i in genetic_disorder_mapping.items()}
inverse_disorder_subclass_mapping = {i: label for label, i in disorder_subclass_mapping.items()}

# Map encoded values back to original names
names_genetic_disorder = [inverse_genetic_disorder_mapping[i] for i in encoded_values_genetic_disorder]
names_disorder_subclass = [inverse_disorder_subclass_mapping[i] for i in encoded_values_disorder_subclass]

# Associate weights with encoded values
weights_genetic_disorder = {encoded_value: provided_weights_genetic_disorder[name] for encoded_value, name in zip(encoded_values_genetic_disorder, names_genetic_disorder)}
weights_disorder_subclass = {encoded_value: provided_weights_subclass_disorder[name] for encoded_value, name in zip(encoded_values_disorder_subclass, names_disorder_subclass)}

class_weights = [
    weights_genetic_disorder,
    weights_disorder_subclass
]


# Tuning

In [41]:
# small min_samples_split can lead to overfit, while big values avoid learning for umbalanced data try min_weight_fraction_leaf 
# sample_weight augments the probability estimates in the probability array

param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20],
    'min_samples_leaf': [2, 5, 8],    
    'max_features': [None, 'sqrt', 'log2'],
    'class_weight': ['balanced']
}


dt_tuning = DecisionTreeClassifier()

grid_search_dt_tuning = GridSearchCV(
    dt_tuning, param_grid, scoring='accuracy', n_jobs=-1)

grid_search_dt_tuning.fit(X_train, y_train)

print('Best Parameters: ', grid_search_dt_tuning.best_params_)


Best Parameters:  {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'splitter': 'best'}


 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
 nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan]


In [42]:
# 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'splitter': 'best'}

dt_multioutput = DecisionTreeClassifier(
    class_weight= 'balanced', criterion= 'gini', max_depth= None, max_features= None, min_samples_leaf=2, splitter= 'best')


dt_multioutput.fit(X_train, y_train)  

y_pred = dt_multioutput.predict(X_test)


In [43]:
# Generate classification report for each output separately

y_test_array = y_test.to_numpy()

f1_score_genetic_disorder = f1_score(y_test_array[:, 0], y_pred[:, 0], average='weighted')
f1_score_disorder_subclass = f1_score(y_test_array[:, 1], y_pred[:, 1], average='weighted')

recall_genetic_disorder = recall_score(y_test_array[:, 0], y_pred[:, 0], average='weighted')
recall_disorder_subclass = recall_score(y_test_array[:, 1], y_pred[:, 1], average='weighted')

accuracy_genetic_disorder = accuracy_score(y_test_array[:, 0], y_pred[:, 0])
accuracy_disorder_subclass = accuracy_score(y_test_array[:, 1], y_pred[:, 1])

balanced_accuracy_genetic_disorder = balanced_accuracy_score(y_test_array[:, 0], y_pred[:, 0])
balanced_accuracy_disorder_subclass = balanced_accuracy_score(y_test_array[:, 1], y_pred[:, 1])


# Without Cancer and Alzeimer

In [45]:
# Assuming your DataFrame is named dataset
filtered_dataset = dataset_encoded[(dataset['Disorder Subclass'] != 7) & (dataset['Disorder Subclass'] != 8)]

# Update your X and y with the filtered dataset
X_filtered = filtered_dataset.drop(columns=['Genetic Disorder', 'Disorder Subclass'])
y_filtered = filtered_dataset[['Genetic Disorder', 'Disorder Subclass']]


X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_filtered, y_filtered, train_size=0.75,  random_state=1) # training as 75%

In [46]:
# 'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 2, 'splitter': 'best'}

dt_multioutput_new = DecisionTreeClassifier(
    class_weight= 'balanced', criterion= 'gini', max_depth= None, max_features= None, min_samples_leaf=2, splitter= 'best')


dt_multioutput_new.fit(X_train_new, y_train_new)  

y_pred_new = dt_multioutput_new.predict(X_test_new)


In [47]:
# Generate classification report for each output separately
y_test_array_new = y_test_new.to_numpy()

hamming_loss_genetic_disorder_new = hamming_loss(y_test_array_new[:, 0], y_pred_new[:, 0])
hamming_loss_disorder_subclass_new = hamming_loss(y_test_array_new[:, 1], y_pred_new[:, 1])

f1_score_genetic_disorder_new = f1_score(y_test_array_new[:, 0], y_pred_new[:, 0], average='weighted')
f1_score_disorder_subclass_new = f1_score(y_test_array_new[:, 1], y_pred_new[:, 1], average='weighted')

recall_genetic_disorder_new = recall_score(y_test_array_new[:, 0], y_pred_new[:, 0], average='weighted')
recall_disorder_subclass_new = recall_score(y_test_array_new[:, 1], y_pred_new[:, 1], average='weighted')

balanced_accuracy_genetic_disorder_new = balanced_accuracy_score(y_test_array_new[:, 0], y_pred_new[:, 0])
balanced_accuracy_disorder_subclass_new = balanced_accuracy_score(y_test_array_new[:, 1], y_pred_new[:, 1])

accuracy_genetic_disorder_new = accuracy_score(y_test_array_new[:, 0], y_pred_new[:, 0])
accuracy_disorder_subclass_new = accuracy_score(y_test_array_new[:, 1], y_pred_new[:, 1])


In [48]:
print("Classification for Genetic Disorder:")

print("Hamming Loss for Genetic Disorder:", hamming_loss_genetic_disorder)
print("F1 Score for Genetic Disorder:", f1_score_genetic_disorder)
print("Recall for Genetic Disorder:", recall_genetic_disorder)
print("Accuracy for Genetic Disorder:", accuracy_genetic_disorder)
print("Balanced Accuracy for Genetic Disorder:", balanced_accuracy_genetic_disorder)

print("\nClassification for Disorder Subclass:")

print("Hamming Loss for Disorder Subclass:", hamming_loss_disorder_subclass)
print("F1 Score for Disorder Subclass:", f1_score_disorder_subclass)
print("Recall for Disorder Subclass:", recall_disorder_subclass)
print("Balanced Accuracy for Disorder Subclass:", balanced_accuracy_disorder_subclass)
print("Accuracy for Disorder Subclass:", accuracy_disorder_subclass)

print("\n ------------ Without Cancer and Alzeimer samples --------------")

print("Classification for Genetic Disorder:")

print("\nHamming Loss for Genetic Disorder:", hamming_loss_genetic_disorder_new)
print("F1 Score for Genetic Disorder:", f1_score_genetic_disorder_new)
print("Recall for Genetic Disorder:", recall_genetic_disorder_new)
print("Balanced Accuracy for Genetic Disorder:", balanced_accuracy_genetic_disorder_new)
print("Accuracy for Genetic Disorder:", accuracy_genetic_disorder_new)

print("\nClassification for Disorder Subclass:")

print("Hamming Loss for Disorder Subclass:", hamming_loss_disorder_subclass_new)
print("F1 Score for Disorder Subclass:", f1_score_disorder_subclass_new)
print("Recall for Disorder Subclass:", recall_disorder_subclass_new)
print("Balanced Accuracy for Disorder Subclass:", balanced_accuracy_disorder_subclass_new)
print("Accuracy for Disorder Subclass:", accuracy_disorder_subclass_new)



Classification for Genetic Disorder:
Hamming Loss for Genetic Disorder: 0.5657370517928287
F1 Score for Genetic Disorder: 0.44855812533659495
Recall for Genetic Disorder: 0.4342629482071713
Accuracy for Genetic Disorder: 0.4342629482071713
Balanced Accuracy for Genetic Disorder: 0.34914005366322876

Classification for Disorder Subclass:
Hamming Loss for Disorder Subclass: 0.7649402390438247
F1 Score for Disorder Subclass: 0.23258436296171958
Recall for Disorder Subclass: 0.2350597609561753
Balanced Accuracy for Disorder Subclass: 0.17074420677361854
Accuracy for Disorder Subclass: 0.2350597609561753

 ------------ Without Cancer and Alzeimer samples --------------
Classification for Genetic Disorder:

Hamming Loss for Genetic Disorder: 0.5737051792828686
F1 Score for Genetic Disorder: 0.4375686089795171
Recall for Genetic Disorder: 0.4262948207171315
Balanced Accuracy for Genetic Disorder: 0.3431063921833939
Accuracy for Genetic Disorder: 0.4262948207171315

Classification for Disorder