# Decision Tree Classifier

In [1]:
# Import necessary libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn import tree
from sklearn.metrics import balanced_accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder



In [2]:
# Load dataset
dataset= pd.read_csv('dataset.csv')

# Replace 'Unknown' with NaN in Target
dataset['Genetic Disorder'].replace('Unknown', np.nan, inplace=True)
dataset['Disorder Subclass'].replace('Unknown', np.nan, inplace=True)
# Eliminate samples of NaN targets
dataset.dropna(inplace=True)


# Encode categorical variables
label_encoder = LabelEncoder()

# List of categorical columns to encode
categorical_unordered_columns = [
    'Genes in mother\'s side', 'Inherited from father', 'Maternal gene', 'Paternal gene', 'Status',
    'Respiratory Rate (breaths/min)', 'Heart Rate (rates/min)', 'Follow-up',
    'Gender', 'Birth asphyxia', 'Autopsy shows birth defect (if applicable)', 'Place of birth',
    'Folic acid details (peri-conceptional)',
    'H/O serious maternal illness', 'H/O radiation exposure (x-ray)', 'H/O substance abuse', 
    'Assisted conception IVF/ART', 'History of anomalies in previous pregnancies',
    'Birth defects', 'Blood test result',
    'Symptom 1', 'Symptom 2', 'Symptom 3', 'Symptom 4', 'Symptom 5', 
    'Genetic Disorder', 'Disorder Subclass'
]

quantitative_with_unknowns_or_ordered_columns = [
    'Patient Age', "Mother's age", "Father's age", 'No. of previous abortion',
    'White Blood cell count (thousand per microliter)']

dataset_encoded=dataset.copy()

for column in quantitative_with_unknowns_or_ordered_columns:
    dataset_encoded[column] = label_encoder.fit_transform(dataset_encoded[column].astype(str))
dataset=pd.get_dummies(dataset_encoded, columns=categorical_unordered_columns, drop_first=False)


# Alocate features and labels
X = dataset.iloc[:, :-12]  # Features
y = dataset.iloc[:, -12:]  # Labels (last two columns encoded into 3+9)


# Split the data into validation, validation and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75,  random_state=1) # training as 75% 
    

In [3]:
# Select and train model of classifier

# Best Hyperparameters: {'estimator__criterion': 'gini', 'estimator__max_depth': 10, 'estimator__max_features': None, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 10, 'estimator__splitter': 'random'}
# Best Mean Accuracy: 0.2690801625415589

dt = MultiOutputClassifier(tree.DecisionTreeClassifier(criterion= 'gini', max_depth= 10, max_features= None, min_samples_leaf= 1, min_samples_split= 10, splitter = 'random', random_state = 1))

kf = KFold(n_splits=5, shuffle=True, random_state=1)

cv_scores = cross_val_score(dt, X_train, y_train, cv=kf)

print("Cross-validated scores:", cv_scores)
print("Mean cross-validated score:", cv_scores.mean())

dt.fit(X_train, y_train)

y_pred = dt.predict(X_test) 


Cross-validated scores: [0.06058367 0.06243074 0.0439601  0.04617658 0.05319542]
Mean cross-validated score: 0.053269301810121905


In [4]:
# Get 12 targets to evaluate model
y_test_columns = y_test.columns[-12:] 
y_pred_columns = y_pred[:, -12:]

balanced_acc_scores_ = {}

for i, column in enumerate(y_test_columns):
    y_test_ = y_test[column]
    y_pred_ = y_pred_columns[:, i]
    
    balanced_acc_scores_[column] = balanced_accuracy_score(y_test_, y_pred_)



# Use dataset without Unknown values

In [5]:
# Train models without unknown data

# Replace 'Unknown' with NaN
dataset.replace('Unknown', np.nan, inplace=True)

dataset.dropna(inplace=True)

In [6]:
# Alocate features and labels

X = dataset.iloc[:, :-12]  # Features
y = dataset.iloc[:, -12:]  # Labels (last two columns encoded into 3+9)

# split the data into validation, validation and test

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75,  random_state=1) # training as 75%

In [7]:
# Select and train model of classifier

# Best Hyperparameters: {'estimator__criterion': 'gini', 'estimator__max_depth': 10, 'estimator__max_features': None, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 10, 'estimator__splitter': 'random'}
# Best Mean Accuracy: 0.2690801625415589

dt = MultiOutputClassifier(tree.DecisionTreeClassifier(criterion= 'gini', max_depth= 10, max_features= None, min_samples_leaf= 1, min_samples_split= 10, splitter = 'random', random_state = 1))

kf = KFold(n_splits=5, shuffle=True, random_state=1)  

cv_scores = cross_val_score(dt, X_train, y_train, cv=kf)

print("Cross-validated scores:", cv_scores)
print("Mean cross-validated score:", cv_scores.mean())

dt.fit(X_train, y_train)

y_pred = dt.predict(X_test) 


Cross-validated scores: [0.06058367 0.06243074 0.0439601  0.04617658 0.05319542]
Mean cross-validated score: 0.053269301810121905


In [8]:
# Get 12 targets to evaluate model
y_test_columns = y_test.columns[-12:] 
y_pred_columns = y_pred[:, -12:]

balanced_acc_scores_new = {}

for i, column in enumerate(y_test_columns):
    y_test_new = y_test[column]
    y_pred_new = y_pred_columns[:, i]
    
    balanced_acc_scores_new[column] = balanced_accuracy_score(y_test_new, y_pred_new)


In [9]:
# Metrics and information for each output column
print('\nClassification Report:')
print(classification_report(y_test_, y_pred_))
for column, score in balanced_acc_scores_.items():
    print(f'Balanced Accuracy for {column}: {score}')

print('\n-------------------------REMOVING UNKNOWNS ---------------------------')

print('\nClassification Report:')
print(classification_report(y_test_new, y_pred_new))

for column, score in balanced_acc_scores_new.items():
    print(f'Balanced Accuracy for {column}: {score}')



Classification Report:
              precision    recall  f1-score   support

       False       0.88      0.93      0.90      3874
        True       0.33      0.20      0.25       638

    accuracy                           0.83      4512
   macro avg       0.60      0.56      0.57      4512
weighted avg       0.80      0.83      0.81      4512

Balanced Accuracy for Genetic Disorder_Mitochondrial genetic inheritance disorders: 0.6425133221053974
Balanced Accuracy for Genetic Disorder_Multifactorial genetic inheritance disorders: 0.5769690292337931
Balanced Accuracy for Genetic Disorder_Single-gene inheritance diseases: 0.5367640595608529
Balanced Accuracy for Disorder Subclass_Alzheimer's: 0.5131696428571428
Balanced Accuracy for Disorder Subclass_Cancer: 0.5889046365661065
Balanced Accuracy for Disorder Subclass_Cystic fibrosis: 0.6139036813871077
Balanced Accuracy for Disorder Subclass_Diabetes: 0.5541720469778024
Balanced Accuracy for Disorder Subclass_Hemochromatosis: 0.5805461

# Tuning Hyperparameters

In [10]:
param_grid = {
    'estimator__criterion': ['gini', 'entropy'],
    'estimator__splitter': ['best', 'random'],
    'estimator__max_depth': [None, 10, 20, 30],
    'estimator__min_samples_split': [2, 5, 10],
    'estimator__min_samples_leaf': [1, 2, 4],
    'estimator__max_features': [None, 'sqrt', 'log2']
}

dt = MultiOutputClassifier(tree.DecisionTreeClassifier())

# Generate all combinations
grid = ParameterGrid(param_grid)


all_mean_scores = []
for params in grid:
    dt.set_params(**params)

    scores = cross_val_score(dt, X_train, y_train, cv=5)
  
    # Calculate mean accuracy
    mean_accuracy = scores.mean()
    all_mean_scores.append((params, mean_accuracy))
    
    print(f'Parameters: {params}, Mean Accuracy: {mean_accuracy}')

# Find the best hyperparameters based on the highest mean accuracy
best_params, best_mean_accuracy = max(all_mean_scores, key=lambda x: x[1])

print("Best Hyperparameters:", best_params)
print("Best Mean Accuracy:", best_mean_accuracy)

NameError: name 'ParameterGrid' is not defined