# Decision Tree

In [37]:
import sys
sys.path.append("../")
import numpy as np
from typing import Any

In [39]:
def _calculate_majority_vote(y : np.array) -> Any:
    """Compute Mode of a node 

    Args:
        y (np.array) (n,):
            Target value of a certain node 
            with the size on n data 

    Returns:
        Any (single value): 
            Return string or int depend on the
            target value used
    """
    # Extract output
    vals, counts = np.unique(y, return_counts = True)

    # Find the majority vote
    ind_max = np.argmax(counts)
    y_pred = vals[ind_max]

    return y_pred

In [43]:
class Tree:
    """
    A single node of a binary decision tree.

    Every node keeps its own split rule, prediction and bookkeeping
    values, together with references to its left and right child nodes.
    The constructor only stores the given arguments; it performs no
    validation.

    Parameters
    ----------
    feature : str, default=None
        Feature the node splits on

    threshold : float, default=None
        Split threshold of the node

    value : float, default=None
        Constant prediction value of the node

    impurity : float, default=None
        Impurity of the node, i.e. the value of the splitting criterion

    children_left : Tree object, default=None
        Child node for samples whose feature value is <= threshold

    children_right : Tree object, default=None
        Child node for samples whose feature value is > threshold

    is_leaf : bool, default=False
        Whether the node is a leaf

    n_samples : int, default=None
        Number of samples in the node
    """

    def __init__(
        self,
        feature: str = None,
        threshold: float = None,
        value: float = None,
        impurity: float = None,
        children_left=None,
        children_right=None,
        is_leaf=False,
        n_samples=None,
    ):
        # Split rule of this node
        self.feature = feature
        self.threshold = threshold

        # Prediction and split quality
        self.value = value
        self.impurity = impurity

        # Links to the two child nodes
        self.children_left = children_left
        self.children_right = children_right

        # Bookkeeping
        self.is_leaf = is_leaf
        self.n_samples = n_samples

In [34]:
# Seeded generator so the demo input is the same on every
# Restart Kernel -> Run All (the original draw was unseeded).
rng = np.random.default_rng(42)
X = rng.integers(0, 10, size=10)
A = np.unique(X)

# NOTE(review): `_generate_possible_split` and `_generate_possible_split2`
# are not defined in any visible cell of this notebook. Define or import
# them above before running this cell on a fresh kernel.
print(_generate_possible_split(X))
print(_generate_possible_split2(X))

[0.5 2.  4.  5.5 6.5 7.5 8.5]
[0.5 2.  4.  5.5 6.5 7.5 8.5]


In [28]:
# Midpoint between every pair of neighbouring values in A
(A[1:] + A[:-1]) / 2

array([0.5, 1.5, 3.5, 5.5])

array([0.5, 1.5, 3.5, 5.5])

In [None]:
## Regression Case

In [None]:
## Classification Case

In [4]:
import sys
sys.path.append("../")
import numpy as np

from src.tree import DecisionTreeClassifier
from src.metrics import accuracy_score

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [5]:
# ---- Data ----
# Binary target: iris class 2 -> 1, every other class -> 0
iris = load_iris()
X = iris.data
y = np.where(iris.target == 2, 1, 0)

# Stratified train/test split (30% held out) with a fixed random state
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# ---- Models ----
# Fit and report two trees in turn: first one built with the default
# settings (a very fit tree), then a simple one limited to max_depth=3.
for tree_params in ({}, {"max_depth": 3}):
    # Create and fit the decision tree classifier
    clf = DecisionTreeClassifier(**tree_params)
    clf.fit(X_train, y_train)

    # Print the fitted tree, followed by a blank line
    clf._export_tree()
    print("")

    # Predict, then report accuracy on the train and test data
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    print(f"Accuracy score Train : {accuracy_score(y_train, y_pred_train):.4f}")
    print(f"Accuracy score Test  : {accuracy_score(y_test, y_pred_test):.4f}")
    print("")
    print("")

The Decision Tree
-----------------
feature_3 <= 1.75?
|   |T: feature_0 <= 7.10?
|   |   |T: feature_1 <= 2.25?
|   |   |   |T: feature_2 <= 4.75?
|   |   |   |   |T: Pred: 0.00
|   |   |   |   |F: Pred: 1.00
|   |   |   |F: Pred: 0.00
|   |   |F: Pred: 1.00
|   |F: feature_2 <= 4.85?
|   |   |T: feature_0 <= 6.05?
|   |   |   |T: Pred: 0.00
|   |   |   |F: Pred: 1.00
|   |   |F: Pred: 1.00

Accuracy score Train : 1.0000
Accuracy score Test  : 0.9111


The Decision Tree
-----------------
feature_3 <= 1.75?
|   |T: feature_0 <= 7.10?
|   |   |T: feature_1 <= 2.25?
|   |   |   |T: Pred: 0.00
|   |   |   |F: Pred: 0.00
|   |   |F: Pred: 1.00
|   |F: feature_2 <= 4.85?
|   |   |T: feature_0 <= 6.05?
|   |   |   |T: Pred: 0.00
|   |   |   |F: Pred: 1.00
|   |   |F: Pred: 1.00

Accuracy score Train : 0.9905
Accuracy score Test  : 0.9111


