# Using one-hot encoding improves performance

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tree.base import DecisionTree
from tree.utils import *
from metrics import *

# Seed NumPy's global RNG so the np.random draws in the cells below are repeatable on a fresh run.
np.random.seed(42)

In [2]:
# Test case 1: real-valued input, real-valued output

N, P = 30, 5  # N samples, P feature columns
# Draw order matters under the fixed seed: features are sampled first, then the target.
features = pd.DataFrame(np.random.randn(N, P))
X = one_hot_encoding(features)
y = pd.Series(np.random.randn(N))

# Fit one tree per split criterion and score it on the same data it was fitted on.
for criterion in ("information_gain", "gini_index"):
    model = DecisionTree(criterion=criterion)
    model.fit(X, y)
    y_pred = model.predict(X)
    model.plot()
    print("Criteria :", criterion)
    for label, metric in (("RMSE: ", rmse), ("MAE: ", mae)):
        print(label, metric(y_pred, y))

Criteria : information_gain
RMSE:  0.35058133577908407
MAE:  0.2790232592413352
Criteria : gini_index
RMSE:  0.660664966269471
MAE:  0.46294685207331393


In [3]:
# Test case 2: real-valued input, discrete output

N, P = 30, 5  # N samples; P feature columns, and class labels are drawn from 0..P-1
# Draw order matters under the fixed seed: features are sampled first, then the labels.
features = pd.DataFrame(np.random.randn(N, P))
X = one_hot_encoding(features)
labels = np.random.randint(P, size=N)
y = pd.Series(labels, dtype="category")

# Fit one tree per split criterion and score it on the same data it was fitted on.
for criterion in ("information_gain", "gini_index"):
    model = DecisionTree(criterion=criterion)
    model.fit(X, y)
    y_pred = model.predict(X)
    model.plot()
    print("Criteria :", criterion)
    print("Accuracy: ", accuracy(y_pred, y))
    # One precision/recall pair per class present in y.
    for cls in y.unique():
        for label, metric in (("Precision: ", precision), ("Recall: ", recall)):
            print(label, metric(y_pred, y, cls))


Criteria : information_gain
Accuracy:  0.8333333333333334
Precision:  1.0
Recall:  0.8
Precision:  0.75
Recall:  0.9
Precision:  1.0
Recall:  1.0
Precision:  0.6666666666666666
Recall:  1.0
Precision:  0.5
Recall:  0.3333333333333333
Criteria : gini_index
Accuracy:  0.8
Precision:  1.0
Recall:  0.8
Precision:  0.7142857142857143
Recall:  1.0
Precision:  1.0
Recall:  0.6
Precision:  0.6666666666666666
Recall:  1.0
Precision:  0.5
Recall:  0.3333333333333333


In [4]:
# Test case 3
# Discrete Input and Discrete Output

N = 30  # number of samples
P = 5   # number of feature columns; category values and class labels are drawn from 0..P-1
# One categorical column per feature. The column count is tied to P instead of a literal,
# so it follows P just as the column count of the real-valued inputs does.
X = pd.DataFrame({i: pd.Series(np.random.randint(P, size=N), dtype="category") for i in range(P)})
X = one_hot_encoding(X)
y = pd.Series(np.random.randint(P, size=N), dtype="category")

# Fit one tree per split criterion and score it on the same data it was fitted on.
for criteria in ["information_gain", "gini_index"]:
    tree = DecisionTree(criterion=criteria)  # split criterion is taken from the loop variable
    tree.fit(X, y)
    y_hat = tree.predict(X)
    tree.plot()
    print("Criteria :", criteria)
    print("Accuracy: ", accuracy(y_hat, y))
    # One precision/recall pair per class present in y.
    for cls in y.unique():
        print("Precision: ", precision(y_hat, y, cls))
        print("Recall: ", recall(y_hat, y, cls))

Criteria : information_gain
Accuracy:  0.9333333333333333
Precision:  0.8571428571428571
Recall:  0.8571428571428571
Precision:  1.0
Recall:  1.0
Precision:  1.0
Recall:  0.8888888888888888
Precision:  0.8333333333333334
Recall:  1.0
Precision:  1.0
Recall:  1.0
Criteria : gini_index
Accuracy:  0.8333333333333334
Precision:  0.6
Recall:  0.8571428571428571
Precision:  1.0
Recall:  0.75
Precision:  1.0
Recall:  0.8888888888888888
Precision:  0.8333333333333334
Recall:  1.0
Precision:  1.0
Recall:  0.6


In [5]:
# Test case 4
# Discrete Input and Real Output

N = 30  # number of samples
P = 5   # number of feature columns; category values are drawn from 0..P-1
# One categorical column per feature. The column count is tied to P instead of a literal,
# so it follows P just as the column count of the real-valued inputs does.
X = pd.DataFrame({i: pd.Series(np.random.randint(P, size=N), dtype="category") for i in range(P)})
X = one_hot_encoding(X)
y = pd.Series(np.random.randn(N))

# Fit one tree per split criterion and score it on the same data it was fitted on.
for criteria in ["information_gain", "gini_index"]:
    tree = DecisionTree(criterion=criteria)  # split criterion is taken from the loop variable
    tree.fit(X, y)
    y_hat = tree.predict(X)
    tree.plot()
    print("Criteria :", criteria)
    print("RMSE: ", rmse(y_hat, y))
    print("MAE: ", mae(y_hat, y))


Criteria : information_gain
RMSE:  0.29478475248015334
MAE:  0.20104605291276426
Criteria : gini_index
RMSE:  0.6916939503745642
MAE:  0.4018729710292463
