# Decision Tree Classifier

In [79]:
# Import necessary libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import balanced_accuracy_score, hamming_loss, f1_score, recall_score, roc_auc_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import ParameterGrid, GridSearchCV

# Class weights obtained in an earlier analysis, keyed by class name.
# Each table is built from ordered (class name, weight) pairs.

provided_weights_genetic_disorder = dict([
    ('Mitochondrial genetic inheritance disorders', 0.48794813542417026),
    ('Single-gene inheritance diseases', 0.6160580705934504),
    ('Multifactorial genetic inheritance disorders', 0.8959937939823793),
])

provided_weights_subclass_disorder = dict([
    ('Leigh syndrome', 0.740510888236272),
    ('Mitochondrial myopathy', 0.7799634288247355),
    ('Cystic fibrosis', 0.8257328087770821),
    ('Tay-Sachs', 0.8583698121571453),
    ('Diabetes', 0.9084058292236937),
    ('Hemochromatosis', 0.9319554496592232),
    ("Leber's hereditary optic neuropathy", 0.9674738183631628),
    ("Alzheimer's", 0.9926303540754696),
    ('Cancer', 0.9949576106832161),
])

# Read the raw dataset from the CSV file in the working directory
dataset_path = 'dataset.csv'
df = pd.read_csv(dataset_path)

In [69]:
# Replace the 'Unknown' placeholder with NaN in both target columns.
# The result is assigned back to df: calling replace(..., inplace=True) on a
# column selection is chained assignment, which pandas deprecates and which
# does not modify df at all once copy-on-write is active.
target_columns = ['Genetic Disorder', 'Disorder Subclass']
df[target_columns] = df[target_columns].replace('Unknown', np.nan)

# Drop every row that has a missing value in any column
df = df.dropna()

In [77]:
# Features to use
features = ["Genes in mother's side", 'Inherited from father', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5']
targets = ['Genetic Disorder', 'Disorder Subclass']

# Select only the desired columns.
# The subset gets its own name so that `df` keeps every original column: the
# full-feature encoding cell further down reads columns such as 'Patient Age'
# from `df` and would raise KeyError if `df` were narrowed here.
selected_columns = features + targets
df_selected = df[selected_columns]


# Encode categorical variables
label_encoder = LabelEncoder()

# The encoded columns are exactly the selected features
all_x_columns = list(features)

df_encoded = df_selected.copy()
for column in all_x_columns:
    # The encoder is refitted for every column (values are cast to str first),
    # so after the loop it only holds the classes of the last feature column.
    df_encoded[column] = label_encoder.fit_transform(df_encoded[column].astype(str))


# Allocate features and labels
X = df_encoded.drop(columns=targets)
y = df_encoded[targets]


# Split the data into training and testing sets (75% train)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=1)


# One multi-output DecisionTreeClassifier predicting both targets at once.
# random_state is fixed because splitter='random' and max_features='log2'
# both draw random numbers; without a seed every run gives different metrics.
dt = DecisionTreeClassifier(
    criterion='gini', max_depth=10, max_features='log2', min_samples_leaf=1, min_samples_split=10, splitter='random',
    random_state=1,
)


dt.fit(X_train, y_train)

# Predict both targets for the held-out rows; column 0 / 1 follow the order of `targets`
y_pred = dt.predict(X_test)

# Compute the metrics for each output separately

y_test_array = y_test.to_numpy()

true_genetic_disorder, pred_genetic_disorder = y_test_array[:, 0], y_pred[:, 0]
true_disorder_subclass, pred_disorder_subclass = y_test_array[:, 1], y_pred[:, 1]

hamming_loss_genetic_disorder = hamming_loss(true_genetic_disorder, pred_genetic_disorder)
hamming_loss_disorder_subclass = hamming_loss(true_disorder_subclass, pred_disorder_subclass)

f1_score_genetic_disorder = f1_score(true_genetic_disorder, pred_genetic_disorder, average='weighted')
f1_score_disorder_subclass = f1_score(true_disorder_subclass, pred_disorder_subclass, average='weighted')

recall_genetic_disorder = recall_score(true_genetic_disorder, pred_genetic_disorder, average='weighted')
recall_disorder_subclass = recall_score(true_disorder_subclass, pred_disorder_subclass, average='weighted')

accuracy_genetic_disorder = accuracy_score(true_genetic_disorder, pred_genetic_disorder)
accuracy_disorder_subclass = accuracy_score(true_disorder_subclass, pred_disorder_subclass)

balanced_accuracy_genetic_disorder = balanced_accuracy_score(true_genetic_disorder, pred_genetic_disorder)
balanced_accuracy_disorder_subclass = balanced_accuracy_score(true_disorder_subclass, pred_disorder_subclass)



In [78]:

# Report the baseline metrics, one labelled row per metric and target.
genetic_disorder_rows = [
    ("Hamming Loss for Genetic Disorder:", hamming_loss_genetic_disorder),
    ("F1 Score for Genetic Disorder:", f1_score_genetic_disorder),
    ("Recall for Genetic Disorder:", recall_genetic_disorder),
    ("Accuracy for Genetic Disorder:", accuracy_genetic_disorder),
    ("Balanced Accuracy for Genetic Disorder:", balanced_accuracy_genetic_disorder),
]
disorder_subclass_rows = [
    ("Hamming Loss for Disorder Subclass:", hamming_loss_disorder_subclass),
    ("F1 Score for Disorder Subclass:", f1_score_disorder_subclass),
    ("Recall for Disorder Subclass:", recall_disorder_subclass),
    ("Balanced Accuracy for Disorder Subclass:", balanced_accuracy_disorder_subclass),
    ("Accuracy for Disorder Subclass:", accuracy_disorder_subclass),
]

print("Classification for Genetic Disorder:")
for label, value in genetic_disorder_rows:
    print(label, value)

print("\nClassification for Disorder Subclass:")
for label, value in disorder_subclass_rows:
    print(label, value)

Classification for Genetic Disorder:
Hamming Loss for Genetic Disorder: 0.4204343971631206
F1 Score for Genetic Disorder: 0.5593067720060263
Recall for Genetic Disorder: 0.5795656028368794
Accuracy for Genetic Disorder: 0.5795656028368794
Balanced Accuracy for Genetic Disorder: 0.4759004803818712

Classification for Disorder Subclass:
Hamming Loss for Disorder Subclass: 0.6289893617021277
F1 Score for Disorder Subclass: 0.3503376307498625
Recall for Disorder Subclass: 0.37101063829787234
Balanced Accuracy for Disorder Subclass: 0.23281969272956507
Accuracy for Disorder Subclass: 0.37101063829787234


In [None]:
# Turn every 'Unknown' placeholder in the frame into NaN (in place)
df.replace(to_replace='Unknown', value=np.nan, inplace=True)

# Discard, in place, each row that contains at least one missing value
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Encode the full feature set into df_encoded.
# NOTE(review): no later visible cell rebuilds X / y from this frame — confirm
# whether the models below are meant to use these features.
label_encoder = LabelEncoder()

all_x_columns = [
    'Patient Age', "Genes in mother's side", 'Inherited from father', 'Maternal gene', 'Paternal gene',
    'Blood cell count (mcL)', "Mother's age", "Father's age", 'Status', 'Respiratory Rate (breaths/min)',
    'Heart Rate (rates/min)', 'Follow-up', 'Gender', 'Birth asphyxia',
    'Autopsy shows birth defect (if applicable)', 'Place of birth', 'Folic acid details (peri-conceptional)',
    'H/O serious maternal illness', 'H/O radiation exposure (x-ray)', 'H/O substance abuse',
    'Assisted conception IVF/ART', 'History of anomalies in previous pregnancies', 'No. of previous abortion',
    'Birth defects', 'White Blood cell count (thousand per microliter)', 'Blood test result',
    'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5'
]

# Fail early with the complete list of absent columns instead of a KeyError on
# the first one halfway through the loop (this happens when `df` has already
# been reduced to a column subset by an earlier cell).
missing_columns = [column for column in all_x_columns if column not in df.columns]
if missing_columns:
    raise KeyError(f"df is missing expected feature columns: {missing_columns}")

df_encoded = df.copy()
for column in all_x_columns:
    numeric_values = pd.to_numeric(df_encoded[column], errors='coerce')
    if numeric_values.notna().all():
        # Every value parses as a number: keep the numeric values. Casting them
        # to str and label-encoding would rank them lexicographically
        # ('10' < '2'), which scrambles the order used by tree thresholds.
        df_encoded[column] = numeric_values
    else:
        # Categorical column: integer codes in sorted order of the string values.
        # The encoder is refitted per column, so it only remembers the last one.
        df_encoded[column] = label_encoder.fit_transform(df_encoded[column].astype(str))


In [None]:
# Get the unique class labels present in each target column.
# Only the feature columns are label-encoded in this notebook, so the target
# columns still hold the class names and can be looked up in the weight tables
# directly. (The previous version read the undefined name `dataset_encoded` and
# decoded labels with `label_encoder`, which is only ever fitted on feature
# columns, so every lookup failed.)
encoded_values_genetic_disorder = df_encoded['Genetic Disorder'].unique()
encoded_values_disorder_subclass = df_encoded['Disorder Subclass'].unique()

# Map each class found in the data to its provided weight (0 when no weight was provided)
mapping_genetic_disorder = {
    value: provided_weights_genetic_disorder.get(value, 0)
    for value in encoded_values_genetic_disorder
}
mapping_disorder_subclass = {
    value: provided_weights_subclass_disorder.get(value, 0)
    for value in encoded_values_disorder_subclass
}


# Compare the original weights with the weights from the mapping.
# A mapped weight of 0 means the class does not occur in the data.
weight_checks = [
    (provided_weights_genetic_disorder, mapping_genetic_disorder),
    (provided_weights_subclass_disorder, mapping_disorder_subclass),
]
for provided_weights, mapping in weight_checks:
    for value, original_weight in provided_weights.items():
        mapped_weight = mapping.get(value, 0)
        print(f"Original Weight for {value}: {original_weight}, Mapped Weight: {mapped_weight}")


# Tuning

In [None]:
# Define the parameter grid
param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

# Create a Decision Tree Classifier.
# Seeded because the grid contains splitter='random' and feature subsampling;
# without a seed the selected "best" parameters change from run to run.
dt_tuning = DecisionTreeClassifier(random_state=1)

# Per-sample weights: every training row gets the provided weight of its class.
# NOTE(review): sample_weights_* was not defined in any visible cell; deriving
# it from the provided class weights is an assumption — confirm the intent.
sample_weights_genetic_disorder = y_train['Genetic Disorder'].map(provided_weights_genetic_disorder)
sample_weights_disorder_subclass = y_train['Disorder Subclass'].map(provided_weights_subclass_disorder)
assert sample_weights_genetic_disorder.notna().all(), "Genetic Disorder class without a provided weight"
assert sample_weights_disorder_subclass.notna().all(), "Disorder Subclass class without a provided weight"
sample_weights_genetic_disorder = sample_weights_genetic_disorder.to_numpy()
sample_weights_disorder_subclass = sample_weights_disorder_subclass.to_numpy()


best_params = {}

grid_search_gd_dt_tuning = GridSearchCV(
        dt_tuning, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the grid search; sample_weight is forwarded to the estimator's fit and
# split per CV fold together with X and y
grid_search_gd_dt_tuning.fit(X_train, y_train['Genetic Disorder'], sample_weight=sample_weights_genetic_disorder)

# Store the best parameters for this target
best_params['Genetic Disorder'] = grid_search_gd_dt_tuning.best_params_


grid_search_ds_dt_tuning = GridSearchCV(
        dt_tuning, param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Same search for the second target, with its own sample weights
grid_search_ds_dt_tuning.fit(X_train, y_train['Disorder Subclass'], sample_weight=sample_weights_disorder_subclass)

# Store the best parameters for this target
best_params['Disorder Subclass'] = grid_search_ds_dt_tuning.best_params_



# Print the best parameters for each target
for target, params in best_params.items():
    print(f"Best Parameters for {target}: {params}")



In [None]:

# Class weights for the two targets, keyed by class name (the targets are not
# label-encoded in this notebook, so the keys match the labels in y_train).
# NOTE(review): class_weights_* was not defined in any visible cell; using the
# provided weight tables is an assumption — confirm the intent.
class_weights_genetic_disorder = dict(provided_weights_genetic_disorder)
class_weights_disorder_subclass = dict(provided_weights_subclass_disorder)

# DecisionTreeClassifier for genetic disorder.
# random_state is fixed because splitter='random' makes training stochastic.
dt_genetic_disorder = DecisionTreeClassifier(
    criterion='gini', max_depth=10, max_features='log2', min_samples_leaf=1, min_samples_split=10, splitter='random',
    class_weight=class_weights_genetic_disorder, random_state=1
)

# DecisionTreeClassifier for disorder subclass
dt_disorder_subclass = DecisionTreeClassifier(
    criterion='entropy', max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=2, splitter='random',
    class_weight=class_weights_disorder_subclass, random_state=1
)

# Fit one model per target
dt_genetic_disorder.fit(X_train, y_train['Genetic Disorder'])
dt_disorder_subclass.fit(X_train, y_train['Disorder Subclass'])

# Predict using the trained models
y_pred_genetic_disorder = dt_genetic_disorder.predict(X_test)
y_pred_disorder_subclass = dt_disorder_subclass.predict(X_test)

# Combine the predictions into one two-column array (column 0: genetic
# disorder, column 1: disorder subclass), the same layout as y_test
y_pred = np.column_stack((y_pred_genetic_disorder, y_pred_disorder_subclass))


In [None]:
# Label the prediction array with the column names of y_test
y_test_columns = y_test.columns[:]
y_pred_df = pd.DataFrame(y_pred, columns=y_test_columns)

# Balanced accuracy for each target column, keyed by column name
balanced_acc_scores = {
    column: balanced_accuracy_score(y_test[column], y_pred_df[column])
    for column in y_test_columns
}



In [None]:
# Compute each metric separately for the two outputs.
# Column 0 holds the genetic disorder, column 1 the disorder subclass.

y_test_array = y_test.to_numpy()

actual_gd, predicted_gd = y_test_array[:, 0], y_pred[:, 0]
actual_ds, predicted_ds = y_test_array[:, 1], y_pred[:, 1]

# Genetic disorder
hamming_loss_genetic_disorder = hamming_loss(actual_gd, predicted_gd)
f1_score_genetic_disorder = f1_score(actual_gd, predicted_gd, average='weighted')
recall_genetic_disorder = recall_score(actual_gd, predicted_gd, average='weighted')
accuracy_genetic_disorder = accuracy_score(actual_gd, predicted_gd)
balanced_accuracy_genetic_disorder = balanced_accuracy_score(actual_gd, predicted_gd)

# Disorder subclass
hamming_loss_disorder_subclass = hamming_loss(actual_ds, predicted_ds)
f1_score_disorder_subclass = f1_score(actual_ds, predicted_ds, average='weighted')
recall_disorder_subclass = recall_score(actual_ds, predicted_ds, average='weighted')
accuracy_disorder_subclass = accuracy_score(actual_ds, predicted_ds)
balanced_accuracy_disorder_subclass = balanced_accuracy_score(actual_ds, predicted_ds)


# Use dataset without Cancer and Alzheimer's

In [None]:
# Remove the Alzheimer's and Cancer samples.
# The previous version indexed the undefined names `dataset_encoded` and
# `dataset` and compared the subclass against 7 and 8, although the target
# columns hold class names in this notebook (only features are label-encoded).
# NOTE(review): 7 and 8 are presumably the codes these two classes had in an
# earlier encoding — confirm.
excluded_subclasses = ["Alzheimer's", 'Cancer']
keep_rows = ~y['Disorder Subclass'].isin(excluded_subclasses)

# Filter the same encoded features / labels the earlier models were trained on,
# so both experiments use an identical feature set.
filtered_dataset = X.join(y).loc[keep_rows]

# Update X and y with the filtered dataset
X_filtered = filtered_dataset.drop(columns=['Genetic Disorder', 'Disorder Subclass'])
y_filtered = filtered_dataset[['Genetic Disorder', 'Disorder Subclass']]


# 75% of the filtered rows are used for training
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X_filtered, y_filtered, train_size=0.75,  random_state=1)

In [None]:
# DecisionTreeClassifier for genetic disorder (filtered data).
# random_state is fixed because splitter='random' makes training stochastic.
dt_genetic_disorder_new = DecisionTreeClassifier(
    criterion='gini', max_depth=10, max_features='log2', min_samples_leaf=1, min_samples_split=10, splitter='random',
    class_weight='balanced', random_state=1
)

# DecisionTreeClassifier for disorder subclass (filtered data)
dt_disorder_subclass_new = DecisionTreeClassifier(
    criterion='entropy', max_depth=10, max_features=None, min_samples_leaf=4, min_samples_split=2, splitter='random',
    class_weight='balanced', random_state=1
)

# Fit the *_new models on the filtered training split. The previous version
# refitted the earlier dt_genetic_disorder / dt_disorder_subclass models on the
# unfiltered X_train, leaving the two estimators above unused.
dt_genetic_disorder_new.fit(X_train_new, y_train_new['Genetic Disorder'])
dt_disorder_subclass_new.fit(X_train_new, y_train_new['Disorder Subclass'])

# Predict on the filtered test split so the predictions line up row-for-row
# with y_test_new in the metric cells below
y_pred_genetic_disorder_new = dt_genetic_disorder_new.predict(X_test_new)
y_pred_disorder_subclass_new = dt_disorder_subclass_new.predict(X_test_new)

# Combine the predictions into one two-column array (column 0: genetic
# disorder, column 1: disorder subclass)
y_pred_new = np.column_stack((y_pred_genetic_disorder_new, y_pred_disorder_subclass_new))

In [None]:
# Label the prediction array with the column names of y_test_new
y_test_columns_new = y_test_new.columns[:]
y_pred_df_new = pd.DataFrame(y_pred_new, columns=y_test_columns_new)

# Balanced accuracy per target column on the filtered test split.
# Iterates this cell's own column list rather than `y_test_columns`, which
# belongs to the unfiltered experiment in an earlier cell.
balanced_acc_scores_new = {}

for column in y_test_columns_new:
    y_test_column_new = y_test_new[column]
    y_pred_column_new = y_pred_df_new[column]

    balanced_acc_scores_new[column] = balanced_accuracy_score(y_test_column_new, y_pred_column_new)



In [None]:
# Compute each metric separately for the two outputs of the filtered experiment.
# Column 0 holds the genetic disorder, column 1 the disorder subclass.
y_test_array_new = y_test_new.to_numpy()

actual_gd_new, predicted_gd_new = y_test_array_new[:, 0], y_pred_new[:, 0]
actual_ds_new, predicted_ds_new = y_test_array_new[:, 1], y_pred_new[:, 1]

# Genetic disorder
hamming_loss_genetic_disorder_new = hamming_loss(actual_gd_new, predicted_gd_new)
f1_score_genetic_disorder_new = f1_score(actual_gd_new, predicted_gd_new, average='weighted')
recall_genetic_disorder_new = recall_score(actual_gd_new, predicted_gd_new, average='weighted')
balanced_accuracy_genetic_disorder_new = balanced_accuracy_score(actual_gd_new, predicted_gd_new)
accuracy_genetic_disorder_new = accuracy_score(actual_gd_new, predicted_gd_new)

# Disorder subclass
hamming_loss_disorder_subclass_new = hamming_loss(actual_ds_new, predicted_ds_new)
f1_score_disorder_subclass_new = f1_score(actual_ds_new, predicted_ds_new, average='weighted')
recall_disorder_subclass_new = recall_score(actual_ds_new, predicted_ds_new, average='weighted')
balanced_accuracy_disorder_subclass_new = balanced_accuracy_score(actual_ds_new, predicted_ds_new)
accuracy_disorder_subclass_new = accuracy_score(actual_ds_new, predicted_ds_new)


In [None]:
# Report tables: each section is (heading, [(row label, metric value), ...]).
# Headings and labels are printed verbatim, embedded newlines included.
full_data_sections = [
    ("Classification for Genetic Disorder:", [
        ("Hamming Loss for Genetic Disorder:", hamming_loss_genetic_disorder),
        ("F1 Score for Genetic Disorder:", f1_score_genetic_disorder),
        ("Recall for Genetic Disorder:", recall_genetic_disorder),
        ("Accuracy for Genetic Disorder:", accuracy_genetic_disorder),
        ("Balanced Accuracy for Genetic Disorder:", balanced_accuracy_genetic_disorder),
    ]),
    ("\nClassification for Disorder Subclass:", [
        ("Hamming Loss for Disorder Subclass:", hamming_loss_disorder_subclass),
        ("F1 Score for Disorder Subclass:", f1_score_disorder_subclass),
        ("Recall for Disorder Subclass:", recall_disorder_subclass),
        ("Balanced Accuracy for Disorder Subclass:", balanced_accuracy_disorder_subclass),
        ("Accuracy for Disorder Subclass:", accuracy_disorder_subclass),
    ]),
]

filtered_data_sections = [
    ("Classification for Genetic Disorder:", [
        ("\nHamming Loss for Genetic Disorder:", hamming_loss_genetic_disorder_new),
        ("F1 Score for Genetic Disorder:", f1_score_genetic_disorder_new),
        ("Recall for Genetic Disorder:", recall_genetic_disorder_new),
        ("Balanced Accuracy for Genetic Disorder:", balanced_accuracy_genetic_disorder_new),
        ("Accuracy for Genetic Disorder:", accuracy_genetic_disorder_new),
    ]),
    ("\nClassification for Disorder Subclass:", [
        ("Hamming Loss for Disorder Subclass:", hamming_loss_disorder_subclass_new),
        ("F1 Score for Disorder Subclass:", f1_score_disorder_subclass_new),
        ("Recall for Disorder Subclass:", recall_disorder_subclass_new),
        ("Balanced Accuracy for Disorder Subclass:", balanced_accuracy_disorder_subclass_new),
        ("Accuracy for Disorder Subclass:", accuracy_disorder_subclass_new),
    ]),
]

# Metrics on the full dataset
for heading, rows in full_data_sections:
    print(heading)
    for label, value in rows:
        print(label, value)

print("\n ------------ Without Cancer and Alzeimer samples --------------")

# Metrics on the dataset without the two excluded subclasses
for heading, rows in filtered_data_sections:
    print(heading)
    for label, value in rows:
        print(label, value)
