## Decision Tree Classifier

In [45]:
# Import necessary libraries

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [46]:
# Load dataset

dataset = pd.read_csv('dataset.csv')
# dataset.head()  # uncomment to inspect the first rows of the dataset


# Allocate features and labels

X = dataset.iloc[:, :-2]  # Features: every column except the last two
y = dataset.iloc[:, -2:]  # Labels: the last two columns, one classification target each
print(X.shape)

# Perform one-hot encoding on the categorical features.
# drop_first=True drops one dummy column per categorical feature so the
# remaining dummy columns are not perfectly collinear.
# NOTE(review): the recorded run produced 17429 encoded columns from 32 raw
# features, which suggests some very high-cardinality columns (ids, names?)
# are being expanded here -- confirm and consider dropping them before encoding.
X = pd.get_dummies(X, drop_first=True)

# The labels are deliberately kept as two plain label columns.
# A previous guard, `if y.nunique().any() > 2:`, was meant to one-hot encode y
# for multiclass targets, but `.any()` collapses the per-column counts to a
# single boolean and `bool > 2` is always False, so that branch never ran.
# Making the condition "work" would be wrong as well: get_dummies(y) would
# replace the two targets with many dummy columns, while the evaluation below
# reads the last two columns of y as the two targets. Decision trees accept
# multiclass labels directly, so no encoding of y is required.



(22083, 32)


In [47]:
# Split the data into train, validation and test sets

# Step 1: hold out 25% of all rows as the test set; the other 75% is kept for
# training and validation.
X_rest, X_test, y_rest, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

# Step 2: take 25% of the remaining rows as the validation set; the rest
# becomes the training set.
X_train, X_validation, y_train, y_validation = train_test_split(
    X_rest, y_rest, test_size=0.25, random_state=1
)

# Show the resulting shapes (train, validation, test) for features, then labels
print(*(part.shape for part in (X_train, X_validation, X_test)))
print(*(part.shape for part in (y_train, y_validation, y_test)))

(12421, 17429) (4141, 17429) (5521, 17429)
(12421, 2) (4141, 2) (5521, 2)


In [48]:
# Select and train the classifier

# Base estimator: a decision tree with randomized split selection and a fixed
# seed so repeated runs give the same tree.
# TODO: evaluate switching the split criterion from gini to entropy.
base_tree = tree.DecisionTreeClassifier(splitter='random', random_state=1)

# MultiOutputClassifier fits one copy of the base estimator per label column.
clf = MultiOutputClassifier(base_tree).fit(X_train, y_train)

# Predict both targets for the validation set (one output column per target)
y_pred = clf.predict(X_validation)

In [50]:
# Evaluate the model


def _weighted_scores(y_true, y_hat):
    """Return (accuracy, precision, recall, f1) for a single target.

    Precision, recall and F1 use average='weighted'.
    """
    return (
        accuracy_score(y_true, y_hat),
        precision_score(y_true, y_hat, average='weighted'),
        recall_score(y_true, y_hat, average='weighted'),
        f1_score(y_true, y_hat, average='weighted'),
    )


# Separate the multi-output targets: "last1" is the last label column,
# "last2" is the second-to-last one.
y_validation_last1, y_validation_last2 = y_validation.iloc[:, -1], y_validation.iloc[:, -2]
y_pred_last1, y_pred_last2 = y_pred[:, -1], y_pred[:, -2]

# Sanity check: ground truth and predictions should have the same length
print("Validation set shape:", y_validation_last2.shape)
print("Predictions shape:", y_pred_last2.shape)

# Apply the metrics to each target independently
accuracy_last1, precision_last1, recall_last1, f1_last1 = _weighted_scores(
    y_validation_last1, y_pred_last1
)
accuracy_last2, precision_last2, recall_last2, f1_last2 = _weighted_scores(
    y_validation_last2, y_pred_last2
)




Validation set shape: (4141,)
Predictions shape: (4141,)


In [51]:
# Report the validation metrics for each target, one section per label column
metric_labels = ("Accuracy:", "Precision:", "Recall:", "F1 Score:")
report_sections = (
    ("Metrics for the last column:",
     (accuracy_last1, precision_last1, recall_last1, f1_last1)),
    ("\nMetrics for the second-to-last column:",
     (accuracy_last2, precision_last2, recall_last2, f1_last2)),
)

for heading, scores in report_sections:
    print(heading)
    for label, score in zip(metric_labels, scores):
        print(label, score)

Metrics for the last column:
Accuracy: 0.2750543347017629
Precision: 0.2732284103025439
Recall: 0.2750543347017629
F1 Score: 0.27392248851277445

Metrics for the second-to-last column:
Accuracy: 0.4296063752716735
Precision: 0.4300198556219927
Recall: 0.4296063752716735
F1 Score: 0.42973923300440164
