## Decision Tree Classifier

In [48]:
# Import necessary libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score

In [49]:
# Load dataset (expects 'dataset.csv' in the notebook's working directory)

dataset = pd.read_csv('dataset.csv')

# Replace 'Unknown' with NaN in the two target columns.
# The result is assigned back to the frame instead of calling
# `dataset[col].replace(..., inplace=True)`: that chained in-place form operates
# on a temporary Series, triggers a pandas chained-assignment warning, and
# leaves `dataset` unchanged when Copy-on-Write is enabled.
target_columns = ['Genetic Disorder', 'Disorder Subclass']
dataset[target_columns] = dataset[target_columns].replace('Unknown', np.nan)

# Drop every row that has a missing value in ANY column (features or targets).
dataset = dataset.dropna()

# Allocate features and labels

X = dataset.iloc[:, :-2]  # Features
# Labels (last two columns).
# NOTE(review): assumes the two target columns above are the last two columns
# of dataset.csv -- confirm against the file.
y = dataset.iloc[:, -2:]

# Perform one-hot encoding on the categorical features
X = pd.get_dummies(X, drop_first=True)
# drop_first is used to drop one of the columns for each categorical feature to avoid multicollinearity.

# The targets are deliberately kept as two columns of raw class labels.
# The previous guard `if y.nunique().any() > 2:` compared a boolean with 2 and
# was therefore never true, so `y` was never one-hot encoded. The dead branch
# has been removed: the evaluation cells select the label columns with
# y.iloc[:, -1] / y.iloc[:, -2], which only makes sense for un-encoded targets.


In [50]:
# Split the data into training (50%), validation (25%) and test (25%) sets.
# The first call holds out half of the rows; the second call splits that
# hold-out evenly into validation and test.

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    train_size=0.5,
    random_state=1,
)
X_validation, X_test, y_validation, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=1,
)

# Select and train the classifier: one randomly-split decision tree per target
# column, wrapped in MultiOutputClassifier.

base_tree = tree.DecisionTreeClassifier(splitter='random', random_state=1)
clf = MultiOutputClassifier(base_tree)
# Open question: evaluate changing the criterion from gini to entropy?

clf.fit(X_train, y_train)

# Predictions on the validation split (one column per target)
y_pred = clf.predict(X_validation)


In [51]:
# Evaluate the model on the validation split

# Separate the multi-output targets: index -1 is the last label column,
# index -2 the second-to-last one.
y_validation_last1 = y_validation.iloc[:, -1]
y_validation_last2 = y_validation.iloc[:, -2]

y_pred_last1 = y_pred[:, -1]
y_pred_last2 = y_pred[:, -2]


# Apply metrics
#
# zero_division=0 is passed to the precision/recall/F1 calls: when a class is
# never predicted (or never present) its per-class score is undefined. The
# default ('warn') also scores it as 0 but emits an UndefinedMetricWarning;
# passing 0 explicitly gives the same numbers without the warning.

accuracy_last1 = accuracy_score(y_validation_last1, y_pred_last1)
precision_last1 = precision_score(y_validation_last1, y_pred_last1, average='weighted', zero_division=0)
recall_last1 = recall_score(y_validation_last1, y_pred_last1, average='weighted', zero_division=0)
f1_last1 = f1_score(y_validation_last1, y_pred_last1, average='weighted', zero_division=0)
balanced_acc_last1 = balanced_accuracy_score(y_validation_last1, y_pred_last1)


accuracy_last2 = accuracy_score(y_validation_last2, y_pred_last2)
precision_last2 = precision_score(y_validation_last2, y_pred_last2, average='weighted', zero_division=0)
recall_last2 = recall_score(y_validation_last2, y_pred_last2, average='weighted', zero_division=0)
f1_last2 = f1_score(y_validation_last2, y_pred_last2, average='weighted', zero_division=0)
balanced_acc_last2 = balanced_accuracy_score(y_validation_last2, y_pred_last2)


In [52]:
# Train model without unknown data

# Turn every remaining 'Unknown' entry (in any column) into NaN, then discard
# all rows that contain a missing value.
dataset = dataset.replace('Unknown', np.nan).dropna()

In [53]:
# Allocate features and labels

X = dataset.iloc[:, :-2]  # Features
y = dataset.iloc[:, -2:]  # Labels (last two columns)

# Perform one-hot encoding on the categorical features
X = pd.get_dummies(X, drop_first=True)
# drop_first is used to drop one of the columns for each categorical feature to avoid multicollinearity.

# The targets are deliberately kept as two columns of raw class labels.
# The previous guard `if y.nunique().any() > 2:` compared a boolean with 2 and
# was therefore never true, so `y` was never one-hot encoded. The dead branch
# has been removed: the evaluation cells select the label columns with
# y.iloc[:, -1] / y.iloc[:, -2], which only makes sense for un-encoded targets.

# Split the data into training, validation and test sets

X_train, X_temp, y_train, y_temp = train_test_split(X, y, train_size=0.5, random_state=1)  # training as 50%
X_validation, X_test, y_validation, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=1)  # test as 25% and validation as 25%


In [54]:
# Select and train the classifier on the data without any 'Unknown' values:
# one randomly-split decision tree per target column.

base_tree = tree.DecisionTreeClassifier(splitter='random', random_state=1)
clf = MultiOutputClassifier(base_tree)
# Open question: evaluate changing the criterion from gini to entropy?

clf.fit(X_train, y_train)

# Predictions on the validation split (one column per target)
y_pred = clf.predict(X_validation)


In [55]:
# Evaluate the model trained without 'Unknown' values on its validation split

# Separate the multi-output targets: index -1 is the last label column,
# index -2 the second-to-last one.
y_validation_last1 = y_validation.iloc[:, -1]
y_validation_last2 = y_validation.iloc[:, -2]

y_pred_last1 = y_pred[:, -1]
y_pred_last2 = y_pred[:, -2]


# Apply metrics (the `_2` suffix marks the results of this second experiment)
#
# zero_division=0 is passed to the precision/recall/F1 calls: when a class is
# never predicted (or never present) its per-class score is undefined. The
# default ('warn') also scores it as 0 but emits an UndefinedMetricWarning,
# which this cell used to print; passing 0 explicitly gives the same numbers
# without the warning.

accuracy_last1_2 = accuracy_score(y_validation_last1, y_pred_last1)
precision_last1_2 = precision_score(y_validation_last1, y_pred_last1, average='weighted', zero_division=0)
recall_last1_2 = recall_score(y_validation_last1, y_pred_last1, average='weighted', zero_division=0)
f1_last1_2 = f1_score(y_validation_last1, y_pred_last1, average='weighted', zero_division=0)
balanced_acc_last1_2 = balanced_accuracy_score(y_validation_last1, y_pred_last1)


accuracy_last2_2 = accuracy_score(y_validation_last2, y_pred_last2)
precision_last2_2 = precision_score(y_validation_last2, y_pred_last2, average='weighted', zero_division=0)
recall_last2_2 = recall_score(y_validation_last2, y_pred_last2, average='weighted', zero_division=0)
f1_last2_2 = f1_score(y_validation_last2, y_pred_last2, average='weighted', zero_division=0)
balanced_acc_last2_2 = balanced_accuracy_score(y_validation_last2, y_pred_last2)

  _warn_prf(average, modifier, msg_start, len(result))


In [57]:

def _print_metrics(title, accuracy, precision, recall, f1, balanced_acc):
    """Print one titled group of validation metrics for a single target column."""
    print(title)
    print("Accuracy:", accuracy)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("balanced accuracy:", balanced_acc)


# First experiment: 'Unknown' removed from the target columns only
_print_metrics("\nMetrics for the last column:",
               accuracy_last1, precision_last1, recall_last1, f1_last1, balanced_acc_last1)
_print_metrics("\nMetrics for the second-to-last column:",
               accuracy_last2, precision_last2, recall_last2, f1_last2, balanced_acc_last2)

print('\n-------------------------REMOVING UNKNOWNS ---------------------------')

# Second experiment: 'Unknown' removed from every column
_print_metrics("\nMetrics for the last column:",
               accuracy_last1_2, precision_last1_2, recall_last1_2, f1_last1_2, balanced_acc_last1_2)
_print_metrics("\nMetrics for the second-to-last column:",
               accuracy_last2_2, precision_last2_2, recall_last2_2, f1_last2_2, balanced_acc_last2_2)


Metrics for the last column:
Accuracy: 0.3167109929078014
Precision: 0.31485364374741737
Recall: 0.3167109929078014
F1 Score: 0.3155485749720203
balanced accuracy: 0.24276277477776237

Metrics for the second-to-last column:
Accuracy: 0.5113031914893617
Precision: 0.5084881300574924
Recall: 0.5113031914893617
F1 Score: 0.5097805104589799
balanced accuracy: 0.44204814221037453

-------------------------REMOVING UNKNOWNS ---------------------------

Metrics for the last column:
Accuracy: 0.32
Precision: 0.32000172135177685
Recall: 0.32
F1 Score: 0.3133084587432364
balanced accuracy: 0.24611987943997926

Metrics for the second-to-last column:
Accuracy: 0.412
Precision: 0.4075234558544903
Recall: 0.412
F1 Score: 0.4093380486790097
balanced accuracy: 0.3815946925703024
