# Decision Tree Classifier

In [1]:
# Import necessary libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn import tree
from sklearn.metrics import balanced_accuracy_score, classification_report


In [2]:
# Load the raw dataset
dataset = pd.read_csv('dataset.csv')

# Treat 'Unknown' in the two target columns as missing.
# The result is assigned back to the frame instead of calling
# `dataset[col].replace(..., inplace=True)`: that chained form mutates a
# temporary column object and leaves `dataset` unchanged when pandas
# copy-on-write is active (and emits a FutureWarning on pandas 2.x).
target_columns = ['Genetic Disorder', 'Disorder Subclass']
dataset[target_columns] = dataset[target_columns].replace('Unknown', np.nan)

# Drop every row that has a missing value in any column
dataset = dataset.dropna()

# Allocate features and labels
X = dataset.iloc[:, :-2]  # Features: every column except the last two
y = dataset.iloc[:, -2:]  # Labels: the last two columns

# One-hot encode the categorical features.
# drop_first drops one dummy column per categorical feature to avoid multicollinearity.
X = pd.get_dummies(X, drop_first=True)

# The labels are deliberately left un-encoded.
# The former guard `if y.nunique().any() > 2:` compared a boolean with 2, so it
# was always False and its `pd.get_dummies(y, ...)` branch never executed.
# The per-column evaluation further down indexes `y_test.iloc[:, -1]` and
# `y_test.iloc[:, -2]` as the two original label columns, which would no
# longer hold if the labels were expanded into dummy columns.
    

In [3]:
# Split the data into a training set (75%) and a test set (25%)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=1,
)

# Build the classifier: one randomly-split decision tree per target column
base_tree = tree.DecisionTreeClassifier(splitter='random', random_state=1)
dt = MultiOutputClassifier(base_tree)
# TODO: evaluate changing the criterion from gini to entropy?

# 5-fold cross-validation on the training set.
# No `scoring` argument is passed, so the estimator's own default scorer is used.
cv_scores = cross_val_score(dt, X_train, y_train, cv=5)

print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Score:", cv_scores.mean())

# Fit on the full training set and predict both targets for the test set
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)


Cross-Validation Scores: [0.20724049 0.19468046 0.20243812 0.20908755 0.20206871]
Mean Cross-Validation Score: 0.20310306612486148


In [4]:
# Split the two-column ground truth and the two-column prediction array
# into one series/array per target ("last" and "second to last" column)
y_test_last2, y_test_last1 = y_test.iloc[:, -2], y_test.iloc[:, -1]
y_pred_last2, y_pred_last1 = y_pred[:, -2], y_pred[:, -1]

# Balanced accuracy of each target, scored independently
balanced_acc_last1 = balanced_accuracy_score(y_test_last1, y_pred_last1)
balanced_acc_last2 = balanced_accuracy_score(y_test_last2, y_pred_last2)


# Use dataset without Unknown values

In [5]:
# Train models without unknown data

# Replace 'Unknown' in every column with NaN and drop the rows that contain it.
# Written as a non-mutating chain and rebound to `dataset` (instead of two
# `inplace=True` calls) so the cell does not modify a frame object in place.
dataset = dataset.replace('Unknown', np.nan).dropna()

In [6]:
# Allocate features and labels
X = dataset.iloc[:, :-2]  # Features: every column except the last two
y = dataset.iloc[:, -2:]  # Labels: the last two columns

# One-hot encode the categorical features.
# drop_first drops one dummy column per categorical feature to avoid multicollinearity.
X = pd.get_dummies(X, drop_first=True)

# The labels are deliberately left un-encoded.
# The former guard `if y.nunique().any() > 2:` compared a boolean with 2, so it
# was always False and its `pd.get_dummies(y, ...)` branch never executed.
# The evaluation cell indexes `y_test.iloc[:, -1]` and `y_test.iloc[:, -2]` as
# the two original label columns, which would no longer hold if the labels
# were expanded into dummy columns.

# Split the data into a training set (75%) and a test set (25%)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75, random_state=1)

In [7]:
# Build a fresh multi-output decision tree and fit it on the current training
# split; `fit` returns the fitted estimator, so it can be bound directly.
dt = MultiOutputClassifier(
    tree.DecisionTreeClassifier(splitter='random', random_state=1)
).fit(X_train, y_train)
# TODO: evaluate changing the criterion from gini to entropy?

# Predict both targets for the test split
y_pred = dt.predict(X_test)


In [8]:
# Separate the multi-output targets into one series/array per target column
y_test_last1_new = y_test.iloc[:, -1]
y_test_last2_new = y_test.iloc[:, -2]

y_pred_last1_new = y_pred[:, -1]
y_pred_last2_new = y_pred[:, -2]

# Evaluate the model: balanced accuracy of each target, scored independently.
# The last-column score must be computed from the last-column series; it was
# previously computed from the second-to-last column, which made both scores
# identical.
balanced_acc_last1_new = balanced_accuracy_score(y_test_last1_new, y_pred_last1_new)
balanced_acc_last2_new = balanced_accuracy_score(y_test_last2_new, y_pred_last2_new)


In [9]:
# Metrics and information for each output column, grouped by scenario.
# Each scenario gets its own inner dict: a single flat dict literal cannot hold
# both scenarios, because repeated keys silently keep only the last value and
# the original-scenario metrics would never reach the report.
metrics_dict = {
    # Metrics for the original dataset
    'Original': {
        'Model': 'Decision Tree - W/ Unknowns',
        'Balanced Accuracy (Last Column)': balanced_acc_last1,
        'Balanced Accuracy (Second to Last Column)': balanced_acc_last2,
        'Classification Report (Last Column)': classification_report(y_test_last1, y_pred_last1),
        'Classification Report (Second to Last Column)': classification_report(y_test_last2, y_pred_last2),
    },
    # Metrics for the dataset without samples with unknown values
    'Removed Unknowns': {
        'Model': 'Decision Tree - Removed Unknowns',
        'Balanced Accuracy (Last Column)': balanced_acc_last1_new,
        'Balanced Accuracy (Second to Last Column)': balanced_acc_last2_new,
        'Classification Report (Last Column)': classification_report(y_test_last1_new, y_pred_last1_new),
        'Classification Report (Second to Last Column)': classification_report(y_test_last2_new, y_pred_last2_new),
    },
}

# Flatten to one row per (scenario, metric) and convert to a Pandas DataFrame
metric_rows = [
    (scenario, metric, value)
    for scenario, scenario_metrics in metrics_dict.items()
    for metric, value in scenario_metrics.items()
]
metrics_df = pd.DataFrame(metric_rows, columns=['Scenario', 'Metric', 'Value'])

# Save the DataFrame to a CSV file
file_path = 'metrics_report.csv'
metrics_df.to_csv(file_path, index=False)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [10]:
# Metrics and information for each output column
def _print_target_report(heading, y_true, y_hat, balanced_acc):
    """Print a heading, the classification report and the balanced accuracy for one target column."""
    print(heading)
    print(classification_report(y_true, y_hat))
    print("balanced accuracy:", balanced_acc)


# Original dataset
_print_target_report(
    '\nClassification Report (Last Column):',
    y_test_last1, y_pred_last1, balanced_acc_last1,
)
_print_target_report(
    '\nClassification Report (Second to Last Column):',
    y_test_last2, y_pred_last2, balanced_acc_last2,
)

print('\n-------------------------REMOVING UNKNOWNS ---------------------------')

# Dataset with the unknown-value rows removed
_print_target_report(
    '\nClassification Report (Last Column):',
    y_test_last1_new, y_pred_last1_new, balanced_acc_last1_new,
)
_print_target_report(
    '\nClassification Report (Second to Last Column):',
    y_test_last2_new, y_pred_last2_new, balanced_acc_last2_new,
)


Classification Report (Last Column):
                                     precision    recall  f1-score   support

                        Alzheimer's       0.11      0.12      0.12        32
                             Cancer       0.07      0.05      0.05        22
                    Cystic fibrosis       0.36      0.39      0.38       745
                           Diabetes       0.28      0.26      0.27       417
                    Hemochromatosis       0.29      0.27      0.28       315
Leber's hereditary optic neuropathy       0.24      0.23      0.23       141
                     Leigh syndrome       0.38      0.37      0.38      1186
             Mitochondrial myopathy       0.31      0.33      0.32      1016
                          Tay-Sachs       0.27      0.25      0.26       638

                           accuracy                           0.32      4512
                          macro avg       0.26      0.25      0.25      4512
                       weighted avg 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
