# Decision Tree Classifier

In [265]:
# Import necessary libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import balanced_accuracy_score, hamming_loss, f1_score, recall_score, roc_auc_score
from sklearn.preprocessing import LabelEncoder

provided_weights_genetic_disorder = {
    'Mitochondrial genetic inheritance disorders': 0.48794813542417026,
    'Single-gene inheritance diseases': 0.6160580705934504,
    'Multifactorial genetic inheritance disorders': 0.8959937939823793
}

provided_weights_subclass_disorder = {
    'Leigh syndrome': 0.740510888236272,
    'Mitochondrial myopathy': 0.7799634288247355,
    'Cystic fibrosis': 0.8257328087770821,
    'Tay-Sachs': 0.8583698121571453,
    'Diabetes': 0.9084058292236937,
    'Hemochromatosis': 0.9319554496592232,
    "Leber's hereditary optic neuropathy": 0.9674738183631628,
    "Alzheimer's": 0.9926303540754696,
    'Cancer': 0.9949576106832161
}


In [255]:
# Load dataset
dataset= pd.read_csv('dataset.csv')

# Replace 'Unknown' with NaN
dataset.replace('Unknown', np.nan, inplace=True)

# Eliminate samples with
dataset.dropna(inplace=True)


# Encode categorical variables
label_encoder = LabelEncoder()

categorical_columns = [
    'Genes in mother\'s side', 'Inherited from father', 'Maternal gene', 'Paternal gene', 'Status',
    'Respiratory Rate (breaths/min)', 'Heart Rate (rates/min)', 'Follow-up',
    'Gender', 'Birth asphyxia', 'Autopsy shows birth defect (if applicable)', 'Place of birth',
    'Folic acid details (peri-conceptional)',
    'H/O serious maternal illness', 'H/O radiation exposure (x-ray)', 'H/O substance abuse', 
    'Assisted conception IVF/ART', 'History of anomalies in previous pregnancies',
    'Birth defects', 'Blood test result',
    'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5'
]

quantitative_with_unknowns_or_ordered_columns = [
    'Patient Age', "Mother's age", "Father's age", 'No. of previous abortion',
    'White Blood cell count (thousand per microliter)']


for column in categorical_columns:
    dataset[column] = label_encoder.fit_transform(dataset[column].astype(str))

# Create a copy for encoding
dataset_encoded = dataset.copy()

# Encode target variables 'Genetic Disorder' and 'Disorder Subclass'
# Create mapping
genetic_disorder_mapping = {label: i for i, label in enumerate(dataset_encoded['Genetic Disorder'].unique()) if pd.notna(label)}
disorder_subclass_mapping = {label: i for i, label in enumerate(dataset_encoded['Disorder Subclass'].unique())}

# Replace each original value with its corresponding encoded value as per the mapping
dataset_encoded['Genetic Disorder'] = dataset_encoded['Genetic Disorder'].map(genetic_disorder_mapping)
dataset_encoded['Disorder Subclass'] = dataset_encoded['Disorder Subclass'].map(disorder_subclass_mapping)


# Correspond weights to classes
# Get values mapped
encoded_values_genetic_disorder = dataset_encoded['Genetic Disorder'].unique()
encoded_values_disorder_subclass = dataset_encoded['Disorder Subclass'].unique()

# Inverse mappings to get back the original names
inverse_genetic_disorder_mapping = {i: label for label, i in genetic_disorder_mapping.items()}
inverse_disorder_subclass_mapping = {i: label for label, i in disorder_subclass_mapping.items()}

# Map encoded values back to original names
names_genetic_disorder = [inverse_genetic_disorder_mapping[i] for i in encoded_values_genetic_disorder]
names_disorder_subclass = [inverse_disorder_subclass_mapping[i] for i in encoded_values_disorder_subclass]

# Associate weights with encoded values
weights_genetic_disorder = {encoded_value: provided_weights_genetic_disorder[name] for encoded_value, name in zip(encoded_values_genetic_disorder, names_genetic_disorder)}
weights_disorder_subclass = {encoded_value: provided_weights_subclass_disorder[name] for encoded_value, name in zip(encoded_values_disorder_subclass, names_disorder_subclass)}

# Combine both class weights into a dictionary
class_weights = {'Genetic Disorder': weights_genetic_disorder, 'Disorder Subclass': weights_disorder_subclass}


# Allocate features and labels
X = dataset_encoded.drop(columns=['Genetic Disorder', 'Disorder Subclass'])
y = dataset_encoded[['Genetic Disorder', 'Disorder Subclass']]


In [256]:
# Select and train model of classifier

# Best Hyperparameters: {'estimator__criterion': 'gini', 'estimator__max_depth': 10, 'estimator__max_features': None, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 10, 'estimator__splitter': 'random'}
# Best Mean Accuracy: 0.2690801625415589

'''
dt = MultiOutputClassifier(tree.DecisionTreeClassifier(criterion= 'gini', max_depth= 10, max_features= None, min_samples_leaf= 1, min_samples_split= 10, splitter = 'random', random_state = 1, class_weight = class_weights)

kf = KFold(n_splits=5, shuffle=True, random_state=1)
cv_scores = cross_val_score(dt, X_train, y_train, cv=kf)

print("Cross-validated scores:", cv_scores)
print("Mean cross-validated score:", cv_scores.mean())

'''
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=1)


# Define the class weights for both outputs
class_weights_genetic_disorder = {0: 0.48794813542417026, 1: 0.8959937939823793, 2: 0.6160580705934504}
class_weights_disorder_subclass = {0: 0.740510888236272, 1: 0.9084058292236937, 2: 0.7799634288247355, 3: 0.8257328087770821, 4: 0.8583698121571453, 5: 0.9674738183631628, 6: 0.9319554496592232, 7: 0.9926303540754696, 8: 0.9949576106832161}

# DecisionTreeClassifier with specified parameters and class weights
dt_classifier_genetic_disorder = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    max_features=None,
    min_samples_leaf=1,
    min_samples_split=10,
    splitter='random',
    random_state=1,
    class_weight=class_weights_genetic_disorder
)

dt_classifier_disorder_subclass = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    max_features=None,
    min_samples_leaf=1,
    min_samples_split=10,
    splitter='random',
    random_state=1,
    class_weight=class_weights_disorder_subclass
)

# Create a MultiOutputClassifier
dt = MultiOutputClassifier(estimator=DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    max_features=None,
    min_samples_leaf=1,
    min_samples_split=10,
    splitter='random',
    random_state=1
), n_jobs=-1)
'''
# Create a decision tree classifier with sample weights
dt = MultiOutputClassifier(
    DecisionTreeClassifier(
        criterion='gini',
        max_depth=10,
        max_features=None,
        min_samples_leaf=1,
        min_samples_split=10,
        splitter='random',
        random_state=1,
        class_weight=class_weights
    )
)'''

# Fit the model with sample weights
dt.fit(X_train, y_train)

# Predict using the trained model
y_pred = dt.predict(X_test)
proba_pred = dt.predict_proba(X_test)


In [257]:
# Get 12 targets to evaluate model
'''y_test_columns = y_test.columns[-12:] 
y_pred_columns = y_pred[:, -12:]

balanced_acc_scores_ = {}

for i, column in enumerate(y_test_columns):
    y_test_ = y_test[column]
    y_pred_ = y_pred_columns[:, i]
    
    balanced_acc_scores_[column] = balanced_accuracy_score(y_test_, y_pred_)'''
# y_test_new is DataFrames
y_test_columns = y_test.columns[:]
# Convert y_pred_new to a DataFrame
y_pred_df = pd.DataFrame(y_pred, columns=y_test_columns)

balanced_acc_scores = {}

for column in y_test_columns:
    y_test_column = y_test[column]
    y_pred_column = y_pred_df[column]
    
    balanced_acc_scores[column] = balanced_accuracy_score(y_test_column, y_pred_column)



In [258]:
# Generate classification report for each output separately

y_test_array = y_test.to_numpy()

hamming_loss_genetic_disorder = hamming_loss(y_test_array[:, 0], y_pred[:, 0])
hamming_loss_disorder_subclass = hamming_loss(y_test_array[:, 1], y_pred[:, 1])

f1_score_genetic_disorder = f1_score(y_test_array[:, 0], y_pred[:, 0], average='weighted')
f1_score_disorder_subclass = f1_score(y_test_array[:, 1], y_pred[:, 1], average='weighted')

recall_genetic_disorder = recall_score(y_test_array[:, 0], y_pred[:, 0], average='weighted')
recall_disorder_subclass = recall_score(y_test_array[:, 1], y_pred[:, 1], average='weighted')

balanced_accuracy_genetic_disorder = balanced_accuracy_score(y_test_array[:, 0], y_pred[:, 0])
balanced_accuracy_disorder_subclass = balanced_accuracy_score(y_test_array[:, 1], y_pred[:, 1])

# Calculate Hamming loss for each output separately

print("Classification for Genetic Disorder:")

print("Hamming Loss for Genetic Disorder:", hamming_loss_genetic_disorder)
print("F1 Score for Genetic Disorder:", f1_score_genetic_disorder)
print("Recall for Genetic Disorder:", recall_genetic_disorder)
print("Balanced Accuracy for Genetic Disorder:", balanced_accuracy_genetic_disorder)

print("\nClassification for Disorder Subclass:")

print("Hamming Loss for Disorder Subclass:", hamming_loss_disorder_subclass)
print("F1 Score for Disorder Subclass:", f1_score_disorder_subclass)
print("Recall for Disorder Subclass:", recall_disorder_subclass)
print("Balanced Accuracy for Disorder Subclass:", balanced_accuracy_disorder_subclass)


Classification for Genetic Disorder:
Hamming Loss for Genetic Disorder: 0.5099601593625498
F1 Score for Genetic Disorder: 0.48561448108188154
Recall for Genetic Disorder: 0.4900398406374502
Balanced Accuracy for Genetic Disorder: 0.4161086335206483

Classification for Disorder Subclass:
Hamming Loss for Disorder Subclass: 0.6972111553784861
F1 Score for Disorder Subclass: 0.3009299784990234
Recall for Disorder Subclass: 0.30278884462151395
Balanced Accuracy for Disorder Subclass: 0.21088086749851453


# Use dataset without Cancer and Alzeimer

In [259]:
# Assuming your DataFrame is named dataset
filtered_dataset = dataset[(dataset['Disorder Subclass'] != "Cancer") & (dataset['Disorder Subclass'] != "Alzheimer's")]

# Update your X and y with the filtered dataset
X_filtered = filtered_dataset.drop(columns=['Genetic Disorder', 'Disorder Subclass'])
y_filtered = filtered_dataset[['Genetic Disorder', 'Disorder Subclass']]


X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_filtered, y_filtered, train_size=0.75,  random_state=1) # training as 75%

In [263]:
# Select and train model of classifier

# Best Hyperparameters: {'estimator__criterion': 'gini', 'estimator__max_depth': 10, 'estimator__max_features': None, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 10, 'estimator__splitter': 'random'}
# Best Mean Accuracy: 0.2690801625415589
'''
dt = MultiOutputClassifier(tree.DecisionTreeClassifier(criterion= 'gini', max_depth= 10, max_features= None, min_samples_leaf= 1, min_samples_split= 10, splitter = 'random', random_state = 1))

kf = KFold(n_splits=5, shuffle=True, random_state=1)

cv_scores = cross_val_score(dt, X_train, y_train, cv=kf)

print("Cross-validated scores:", cv_scores)
print("Mean cross-validated score:", cv_scores.mean())'''


# Define the class weights for both outputs
class_weights_genetic_disorder = {0: 0.48794813542417026, 1: 0.8959937939823793, 2: 0.6160580705934504}
class_weights_disorder_subclass = {0: 0.740510888236272, 1: 0.9084058292236937, 2: 0.7799634288247355, 3: 0.8257328087770821, 4: 0.8583698121571453, 5: 0.9674738183631628, 6: 0.9319554496592232, 7: 0.9926303540754696, 8: 0.9949576106832161}

# DecisionTreeClassifier with specified parameters and class weights
dt_classifier_genetic_disorder_new = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    max_features=None,
    min_samples_leaf=1,
    min_samples_split=10,
    splitter='random',
    random_state=1,
    class_weight=class_weights_genetic_disorder
)

dt_classifier_disorder_subclass_new = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    max_features=None,
    min_samples_leaf=1,
    min_samples_split=10,
    splitter='random',
    random_state=1,
    class_weight=class_weights_disorder_subclass
)

# Create a MultiOutputClassifier
dt_new = MultiOutputClassifier(estimator=DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    max_features=None,
    min_samples_leaf=1,
    min_samples_split=10,
    splitter='random',
    random_state=1
), n_jobs=-1)

# Fit the model with sample weights
dt_new.fit(X_train_new, y_train_new)

# Predict using the trained model
y_pred_new = dt_new.predict(X_test_new)
proba_pred_new = dt_new.predict_proba(X_test_new) 


In [264]:
# Get 12 targets to evaluate model
'''y_test_columns = y_test.columns[-12:] 
y_pred_columns = y_pred[:, -12:]

balanced_acc_scores_ = {}

for i, column in enumerate(y_test_columns):
    y_test_ = y_test[column]
    y_pred_ = y_pred_columns[:, i]
    
    balanced_acc_scores_[column] = balanced_accuracy_score(y_test_, y_pred_)'''
# y_test_new is DataFrames
y_test_columns_new = y_test_new.columns[-2:]
# Convert y_pred_new to a DataFrame
y_pred_df_new = pd.DataFrame(y_pred_new, columns=y_test_columns_new)

balanced_acc_scores_new = {}

for column in y_test_columns_new:
    y_test_column_new = y_test_new[column]
    y_pred_column_new = y_pred_df_new[column]
    
    balanced_acc_scores_new[column] = balanced_accuracy_score(y_test_column_new, y_pred_column_new)


In [266]:
# Generate classification report for each output separately

y_test_array_new = y_test_new.to_numpy()

hamming_loss_genetic_disorder = hamming_loss(y_test_array_new[:, 0], y_pred_new[:, 0])
hamming_loss_disorder_subclass = hamming_loss(y_test_array_new[:, 1], y_pred_new[:, 1])

f1_score_genetic_disorder = f1_score(y_test_array_new[:, 0], y_pred_new[:, 0], average='weighted')
f1_score_disorder_subclass = f1_score(y_test_array_new[:, 1], y_pred_new[:, 1], average='weighted')

recall_genetic_disorder = recall_score(y_test_array_new[:, 0], y_pred_new[:, 0], average='weighted')
recall_disorder_subclass = recall_score(y_test_array_new[:, 1], y_pred_new[:, 1], average='weighted')

balanced_accuracy_genetic_disorder = balanced_accuracy_score(y_test_array_new[:, 0], y_pred_new[:, 0])
balanced_accuracy_disorder_subclass = balanced_accuracy_score(y_test_array_new[:, 1], y_pred_new[:, 1])

# Calculate Hamming loss for each output separately

print("Classification for Genetic Disorder:")

print("Hamming Loss for Genetic Disorder:", hamming_loss_genetic_disorder)
print("F1 Score for Genetic Disorder:", f1_score_genetic_disorder)
print("Recall for Genetic Disorder:", recall_genetic_disorder)
print("Balanced Accuracy for Genetic Disorder:", balanced_accuracy_genetic_disorder)

print("\nClassification for Disorder Subclass:")

print("Hamming Loss for Disorder Subclass:", hamming_loss_disorder_subclass)
print("F1 Score for Disorder Subclass:", f1_score_disorder_subclass)
print("Recall for Disorder Subclass:", recall_disorder_subclass)
print("Balanced Accuracy for Disorder Subclass:", balanced_accuracy_disorder_subclass)



Classification for Genetic Disorder:
Hamming Loss for Genetic Disorder: 0.5220883534136547
F1 Score for Genetic Disorder: 0.4719688442164553
Recall for Genetic Disorder: 0.4779116465863454
Balanced Accuracy for Genetic Disorder: 0.41021505376344086

Classification for Disorder Subclass:
Hamming Loss for Disorder Subclass: 0.6867469879518072
F1 Score for Disorder Subclass: 0.29957827384538377
Recall for Disorder Subclass: 0.3132530120481928
Balanced Accuracy for Disorder Subclass: 0.2795609057019937


# Tuning Hyperparameters

In [None]:
param_grid = {
    'estimator__criterion': ['gini', 'entropy'],
    'estimator__splitter': ['best', 'random'],
    'estimator__max_depth': [None, 10, 20, 30],
    'estimator__min_samples_split': [2, 5, 10],
    'estimator__min_samples_leaf': [1, 2, 4],
    'estimator__max_features': [None, 'sqrt', 'log2']
}

dt = MultiOutputClassifier(tree.DecisionTreeClassifier())

# Generate all combinations
grid = ParameterGrid(param_grid)


all_mean_scores = []
for params in grid:
    dt.set_params(**params)

    scores = cross_val_score(dt, X_train, y_train, cv=5)
  
    # Calculate mean accuracy
    mean_accuracy = scores.mean()
    all_mean_scores.append((params, mean_accuracy))
    
    print(f'Parameters: {params}, Mean Accuracy: {mean_accuracy}')

# Find the best hyperparameters based on the highest mean accuracy
best_params, best_mean_accuracy = max(all_mean_scores, key=lambda x: x[1])

print("Best Hyperparameters:", best_params)
print("Best Mean Accuracy:", best_mean_accuracy)

NameError: name 'ParameterGrid' is not defined