## Decision Tree Classifier

In [256]:
# Import necessary libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, balanced_accuracy_score
from sklearn.preprocessing import LabelEncoder

In [247]:
# Load dataset

dataset = pd.read_csv('dataset.csv')

# Allocate features and labels: the last two columns are the targets,
# every column before them is a feature.

X = dataset.iloc[:, :-2]  # Features
y = dataset.iloc[:, -2:]  # Labels (last two columns)

# Perform one-hot encoding on the categorical features.
# drop_first drops one dummy column per categorical feature to avoid multicollinearity.
X = pd.get_dummies(X, drop_first=True)

# The targets are intentionally left un-encoded.
# The former guard `y.nunique().any() > 2` compared a boolean with 2, so it was
# always False and the target one-hot encoding never executed. It is removed
# rather than "repaired": decision trees accept multiclass labels directly, and
# the evaluation cell addresses the two targets by column position, which
# one-hot encoding (with drop_first) would break.



In [253]:
# Split the data into training (50%), validation (25%) and test (25%) sets

# First cut: half of the rows go to training, the other half is held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, train_size=0.5, random_state=1
)

# Second cut: the held-out half is divided evenly into validation and test.
X_validation, X_test, y_validation, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=1
)

# Select and train the classifier: a randomly-splitting decision tree wrapped
# for multi-output targets.
# TODO: evaluate changing the criterion from gini to entropy.

base_tree = tree.DecisionTreeClassifier(random_state=1, splitter='random')
clf = MultiOutputClassifier(base_tree)
clf.fit(X_train, y_train)

# Predict both targets for the validation set
y_pred = clf.predict(X_validation)


In [254]:
# Evaluate the model

def _score_target(y_true, y_hat):
    """Compute the evaluation metrics for a single target column.

    Parameters
    ----------
    y_true : array-like
        True labels of one target.
    y_hat : array-like
        Predicted labels for the same target.

    Returns
    -------
    tuple
        (accuracy, weighted precision, weighted recall, weighted F1,
        balanced accuracy), in that order.
    """
    return (
        accuracy_score(y_true, y_hat),
        precision_score(y_true, y_hat, average='weighted'),
        recall_score(y_true, y_hat, average='weighted'),
        f1_score(y_true, y_hat, average='weighted'),
        balanced_accuracy_score(y_true, y_hat),
    )


# Separate the multi output targets.
# y_validation is indexed with .iloc, y_pred with plain array slicing.
y_validation_last1 = y_validation.iloc[:, -1]
y_validation_last2 = y_validation.iloc[:, -2]

y_pred_last1 = y_pred[:, -1]
y_pred_last2 = y_pred[:, -2]


# Apply metrics (same helper for both targets so the two reports stay comparable)

(accuracy_last1, precision_last1, recall_last1,
 f1_last1, balanced_acc_last1) = _score_target(y_validation_last1, y_pred_last1)

(accuracy_last2, precision_last2, recall_last2,
 f1_last2, balanced_acc_last2) = _score_target(y_validation_last2, y_pred_last2)


In [255]:
# Report the validation metrics, one section per target column.
metric_labels = ("Accuracy:", "Precision:", "Recall:", "F1 Score:", "balanced accuracy:")

report_sections = (
    (
        "Metrics for the last column:",
        (accuracy_last1, precision_last1, recall_last1, f1_last1, balanced_acc_last1),
    ),
    (
        "\nMetrics for the second-to-last column:",
        (accuracy_last2, precision_last2, recall_last2, f1_last2, balanced_acc_last2),
    ),
)

for heading, values in report_sections:
    print(heading)
    # Labels and values are listed in the same order, so zip pairs them up.
    for label, value in zip(metric_labels, values):
        print(label, value)


Metrics for the last column:
Accuracy: 0.2724144176779569
Precision: 0.26906666136770363
Recall: 0.2724144176779569
F1 Score: 0.2702617678409767
balanced accuracy: 0.21177961361331468

Metrics for the second-to-last column:
Accuracy: 0.44883173338163374
Precision: 0.4410519740955179
Recall: 0.44883173338163374
F1 Score: 0.4442669535717083
balanced accuracy: 0.3388344059462966
