# Decision Tree Classifier

In [31]:
# Import necessary libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn import tree
from sklearn.metrics import balanced_accuracy_score, accuracy_score, precision_score, recall_score, f1_score, hamming_loss, classification_report
from sklearn.preprocessing import LabelEncoder



In [85]:
# Load dataset
dataset= pd.read_csv('dataset.csv')

# Replace 'Unknown' with NaN in Target
dataset['Genetic Disorder'].replace('Unknown', np.nan, inplace=True)
dataset['Disorder Subclass'].replace('Unknown', np.nan, inplace=True)
# Eliminate samples of NaN targets
dataset.dropna(inplace=True)


# Encode categorical variables
label_encoder = LabelEncoder()

# List of categorical columns to encode
categorical_unordered_columns = [
    'Genes in mother\'s side', 'Inherited from father', 'Maternal gene', 'Paternal gene', 'Status',
    'Respiratory Rate (breaths/min)', 'Heart Rate (rates/min)', 'Follow-up',
    'Gender', 'Birth asphyxia', 'Autopsy shows birth defect (if applicable)', 'Place of birth',
    'Folic acid details (peri-conceptional)',
    'H/O serious maternal illness', 'H/O radiation exposure (x-ray)', 'H/O substance abuse', 
    'Assisted conception IVF/ART', 'History of anomalies in previous pregnancies',
    'Birth defects', 'Blood test result',
    'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5', 
    'Genetic Disorder', 'Disorder Subclass'
]

quantitative_with_unknowns_or_ordered_columns = [
    'Patient Age', "Mother's age", "Father's age", 'No. of previous abortion',
    'White Blood cell count (thousand per microliter)']

dataset_encoded=dataset.copy()

for column in quantitative_with_unknowns_or_ordered_columns:
    dataset_encoded[column] = label_encoder.fit_transform(dataset_encoded[column].astype(str))
dataset=pd.get_dummies(dataset_encoded, columns=categorical_unordered_columns, drop_first=False)


# Alocate features and labels
X = dataset.iloc[:, :-12]  # Features
y = dataset.iloc[:, -12:]  # Labels (last two columns encoded into 3+9)


# Split the data into validation, validation and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75,  random_state=1) # training as 75% 
    

In [87]:
# Select and train model of classifier

# Best Hyperparameters: {'estimator__criterion': 'gini', 'estimator__max_depth': 10, 'estimator__max_features': None, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 10, 'estimator__splitter': 'random'}
# Best Mean Accuracy: 0.2690801625415589
'''
dt = MultiOutputClassifier(tree.DecisionTreeClassifier(criterion= 'gini', max_depth= 10, max_features= None, min_samples_leaf= 1, min_samples_split= 10, splitter = 'random', random_state = 1))

kf = KFold(n_splits=5, shuffle=True, random_state=1)

cv_scores = cross_val_score(dt, X_train, y_train, cv=kf)

print("Cross-validated scores:", cv_scores)
print("Mean cross-validated score:", cv_scores.mean())'''

dt = tree.DecisionTreeClassifier(criterion= 'gini', max_depth= 10, max_features= None, min_samples_leaf= 1, min_samples_split= 10, splitter = 'random', random_state = 1)

dt.fit(X_train, y_train)

y_pred = dt.predict(X_test) 


In [89]:
# Get 12 targets to evaluate model
'''y_test_columns = y_test.columns[-12:] 
y_pred_columns = y_pred[:, -12:]

balanced_acc_scores_ = {}

for i, column in enumerate(y_test_columns):
    y_test_ = y_test[column]
    y_pred_ = y_pred_columns[:, i]
    
    balanced_acc_scores_[column] = balanced_accuracy_score(y_test_, y_pred_)'''
# y_test_new is DataFrames
y_test_columns = y_test.columns[-12:]
# Convert y_pred_new to a DataFrame
y_pred_df = pd.DataFrame(y_pred, columns=y_test_columns)

balanced_acc_scores = {}

for column in y_test_columns:
    y_test_column = y_test[column]
    y_pred_column = y_pred_df[column]
    
    balanced_acc_scores[column] = balanced_accuracy_score(y_test_column, y_pred_column)



# Use dataset without Unknown values

In [90]:
# Train models without unknown data

# Replace 'Unknown' with NaN
dataset.replace('Unknown', np.nan, inplace=True)

dataset.dropna(inplace=True)

In [91]:
# Alocate features and labels

X_new = dataset.iloc[:, :-12]  # Features
y_new = dataset.iloc[:, -12:]  # Labels (last two columns encoded into 3+9)

# split the data into validation, validation and test

X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(X, y, train_size=0.75,  random_state=1) # training as 75%

In [92]:
# Select and train model of classifier

# Best Hyperparameters: {'estimator__criterion': 'gini', 'estimator__max_depth': 10, 'estimator__max_features': None, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 10, 'estimator__splitter': 'random'}
# Best Mean Accuracy: 0.2690801625415589

#dt_new = MultiOutputClassifier(tree.DecisionTreeClassifier(criterion= 'gini', max_depth= 10, max_features= None, min_samples_leaf= 1, min_samples_split= 10, splitter = 'random', random_state = 1))
dt_new = tree.DecisionTreeClassifier(criterion= 'gini', max_depth= 10, max_features= None, min_samples_leaf= 1, min_samples_split= 10, splitter = 'random', random_state = 1)
'''
kf = KFold(n_splits=5, shuffle=True, random_state=1)  
cv_scores = cross_val_score(dt_new, X_train_new, y_train_new, cv=kf)

print("Cross-validated scores:", cv_scores)
print("Mean cross-validated score:", cv_scores.mean())
'''
dt_new.fit(X_train_new, y_train_new)

y_pred_new = dt_new.predict(X_test_new) 


In [93]:
'''# Get 12 targets to evaluate model
y_test_columns = y_test.columns[-12:] 
y_pred_columns = y_pred[:, -12:]

balanced_acc_scores_ = {}

for i, column in enumerate(y_test_columns):
    y_test_ = y_test[column]
    y_pred_ = y_pred_columns[:, i]
    
    balanced_acc_scores_[column] = balanced_accuracy_score(y_test_, y_pred_)
'''
# y_test_new is DataFrames
y_test_columns_new = y_test_new.columns[-12:]
# Convert y_pred_new to a DataFrame
y_pred_df_new = pd.DataFrame(y_pred_new, columns=y_test_columns_new)

balanced_acc_scores_new = {}

for column in y_test_columns_new:
    y_test_column = y_test_new[column]
    y_pred_column = y_pred_df_new[column]
    
    balanced_acc_scores_new[column] = balanced_accuracy_score(y_test_column, y_pred_column)




In [96]:
# Evaluate model
y_test_array = y_test.to_numpy()
y_test_array_new = y_test_new.to_numpy()

# Before removing Unknowns
accuracy = accuracy_score(y_test_array, y_pred)
precision = precision_score(y_test_array, y_pred, average='micro') 
recall = recall_score(y_test_array, y_pred, average='micro')
f1 = f1_score(y_test_array, y_pred, average='micro')
hamming = hamming_loss(y_test_array, y_pred)

# Unknowns Removed
accuracy_new = accuracy_score(y_test_array_new, y_pred_new)
precision_new = precision_score(y_test_array_new, y_pred_new, average='micro') 
recall_new = recall_score(y_test_array_new, y_pred_new, average='micro')
f1_new = f1_score(y_test_array_new, y_pred_new, average='micro')
hamming_new = hamming_loss(y_test_array_new, y_pred_new)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)
print("Hamming Loss:", hamming)
print("Accuracy:", balanced_acc_scores)

print('\n-------------------------REMOVING UNKNOWNS ---------------------------')

print("\nAccuracy:", accuracy_new)
print("Precision:", precision_new)
print("Recall:", recall_new)
print("F1-Score:", f1_new)
print("Hamming Loss:", hamming_new)
print("Accuracy:", balanced_acc_scores_new)




Accuracy: 0.21365248226950354
Precision: 0.5070593593441628
Recall: 0.3701241134751773
F1-Score: 0.4279034014476971
Hamming Loss: 0.16494902482269502
Accuracy: {'Genetic Disorder_Mitochondrial genetic inheritance disorders': 0.642436875327998, 'Genetic Disorder_Multifactorial genetic inheritance disorders': 0.5918633371004528, 'Genetic Disorder_Single-gene inheritance diseases': 0.5867476097553626, "Disorder Subclass_Alzheimer's": 0.49955357142857143, 'Disorder Subclass_Cancer': 0.5665114395626645, 'Disorder Subclass_Cystic fibrosis': 0.6253859461269983, 'Disorder Subclass_Diabetes': 0.5765752233378133, 'Disorder Subclass_Hemochromatosis': 0.5481307509899361, "Disorder Subclass_Leber's hereditary optic neuropathy": 0.5495309997712194, 'Disorder Subclass_Leigh syndrome': 0.5703631463080496, 'Disorder Subclass_Mitochondrial myopathy': 0.5313417809329898, 'Disorder Subclass_Tay-Sachs': 0.5471999650430569}

-------------------------REMOVING UNKNOWNS ---------------------------

Accuracy: 0

In [97]:
# Metrics and information for each output column
print('\nClassification Report:')
print(classification_report(y_test, y_pred))
for column, score in balanced_acc_scores.items():
    print(f'Balanced Accuracy for {column}: {score}')

print('\n-------------------------REMOVING UNKNOWNS ---------------------------')

print('\nClassification Report:')
print(classification_report(y_test_new, y_pred_new))

for column, score in balanced_acc_scores_new.items():
    print(f'Balanced Accuracy for {column}: {score}')



Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.66      0.66      2343
           1       0.39      0.23      0.28       471
           2       0.51      0.42      0.46      1698
           3       0.00      0.00      0.00        32
           4       0.17      0.14      0.15        22
           5       0.41      0.35      0.38       745
           6       0.37      0.18      0.25       417
           7       0.41      0.11      0.17       315
           8       0.32      0.11      0.16       141
           9       0.42      0.28      0.34      1186
          10       0.34      0.15      0.21      1016
          11       0.32      0.15      0.20       638

   micro avg       0.51      0.37      0.43      9024
   macro avg       0.36      0.23      0.27      9024
weighted avg       0.47      0.37      0.41      9024
 samples avg       0.47      0.37      0.40      9024

Balanced Accuracy for Genetic Disorder_Mitochondrial ge

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Tuning Hyperparameters

In [23]:
param_grid = {
    'estimator__criterion': ['gini', 'entropy'],
    'estimator__splitter': ['best', 'random'],
    'estimator__max_depth': [None, 10, 20, 30],
    'estimator__min_samples_split': [2, 5, 10],
    'estimator__min_samples_leaf': [1, 2, 4],
    'estimator__max_features': [None, 'sqrt', 'log2']
}

dt = MultiOutputClassifier(tree.DecisionTreeClassifier())

# Generate all combinations
grid = ParameterGrid(param_grid)


all_mean_scores = []
for params in grid:
    dt.set_params(**params)

    scores = cross_val_score(dt, X_train, y_train, cv=5)
  
    # Calculate mean accuracy
    mean_accuracy = scores.mean()
    all_mean_scores.append((params, mean_accuracy))
    
    print(f'Parameters: {params}, Mean Accuracy: {mean_accuracy}')

# Find the best hyperparameters based on the highest mean accuracy
best_params, best_mean_accuracy = max(all_mean_scores, key=lambda x: x[1])

print("Best Hyperparameters:", best_params)
print("Best Mean Accuracy:", best_mean_accuracy)

NameError: name 'ParameterGrid' is not defined