In [1]:
import numpy as np
import pandas as pd

In [2]:
class DecisionTreeClassifier:
    """Decision tree classifier for categorical features (ID3-style).

    The tree is grown greedily: at every node the feature with the lowest
    weighted entropy of the resulting partitions (i.e. the highest
    information gain) is chosen, and one child is created per distinct value
    of that feature. Growth stops when a node contains a single class or when
    no features are left, in which case the majority class is used.

    Usage:
        clf = DecisionTreeClassifier(X, y)   # X: DataFrame, y: Series
        clf.construct()
        label = clf.predict(X_test)          # X_test: one-row DataFrame
    """

    class DecisionTreeNode:
        """A node of the tree: an internal split on a feature, or a leaf."""

        def __init__(self, feature=None, class_name=None, default_class=None):
            """Create a node.

            Args:
                feature: Column name this node splits on (internal nodes).
                class_name: Predicted label; when given the node is a leaf.
                default_class: Majority label of the training rows that
                    reached this node; used as a fallback when a test value
                    has no matching child.
            """
            # Every attribute is always defined so that is_leaf() and later
            # attribute reads never fail, even for a node built without
            # arguments.
            self.feature_ = feature
            self.children_ = {}
            self.class_ = class_name
            self.default_class_ = default_class
            self.leaf_ = class_name is not None

        def is_leaf(self):
            """Return True when this node carries a class label."""
            return self.leaf_

    def __init__(self, X, y):
        """Store the training data; call construct() to build the tree.

        Args:
            X: pandas DataFrame of categorical feature columns.
            y: pandas Series of class labels sharing X's index.
        """
        self.X_ = X
        self.y_ = y
        self.head_node_ = None

    @staticmethod
    def information(y):
        """Return the Shannon entropy (in bits) of the labels in ``y``."""
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / np.sum(counts)
        # np.unique only reports values that occur, so every probability is
        # strictly positive and log2 is always finite.
        return np.sum(-probabilities * np.log2(probabilities))

    def value_entropy(self, X, y, feature, value):
        """Return entropy of the rows where ``feature == value``, weighted
        by the number of such rows."""
        y_required = y[X[feature] == value]
        return self.information(y_required) * len(y_required)

    def leftover_information(self, X, y, feature):
        """Return the weighted average entropy left after splitting on
        ``feature`` (lower is better)."""
        weighted_total = sum(
            self.value_entropy(X, y, feature, value)
            for value in np.unique(X[feature])
        )
        return weighted_total / len(X)

    def select_best_feature(self, X, y):
        """Return the column of ``X`` with the lowest leftover information."""
        features = X.columns
        features_leftover_info = [self.leftover_information(X, y, f) for f in features]
        return features[np.argmin(features_leftover_info)]

    @staticmethod
    def has_impurity(y):
        """Return the single class label if ``y`` is pure, otherwise None.

        A subset is pure when all of its labels are identical, regardless of
        how many rows it holds. An empty subset has no label and yields None.
        """
        # Purity depends on the number of distinct labels, not on the number
        # of rows: several rows sharing one label must become a leaf.
        if y.nunique(dropna=False) == 1:
            return y.iloc[0]
        return None

    @staticmethod
    def majority_class(y):
        """Return the most frequent label in ``y`` (first one on ties)."""
        return y.mode()[0]

    def construct_recursive(self, X, y, features, tuples):
        """Build and return the subtree for the rows selected by ``tuples``.

        Args:
            X: DataFrame restricted to the still-available feature columns.
            y: Series of labels aligned with ``X``.
            features: pandas Index of feature names still available.
            tuples: Boolean Series (aligned with ``X``) selecting the rows
                that belong to this node.

        Returns:
            The DecisionTreeNode at the root of the constructed subtree.
        """
        X_current = X[tuples]
        y_current = y[tuples]

        pure_class = self.has_impurity(y_current)
        if pure_class is not None:
            return self.DecisionTreeNode(class_name=pure_class)

        majority = self.majority_class(y_current)
        if features.empty:
            # Nothing left to split on: settle for the most common label.
            return self.DecisionTreeNode(class_name=majority)

        best_feature = self.select_best_feature(X_current, y_current)
        node = self.DecisionTreeNode(feature=best_feature, default_class=majority)
        remaining = features[features != best_feature]
        for val in X_current[best_feature].unique():
            child_tuples = X_current[best_feature] == val
            node.children_[val] = self.construct_recursive(
                X_current[remaining], y_current, remaining, child_tuples
            )
        return node

    def predict_recursive(self, traversing_node, X_test):
        """Walk down from ``traversing_node`` using the first row of
        ``X_test`` and return the predicted label.

        If the row holds a value that was never seen for the split feature
        during training, the majority class recorded on that node is
        returned instead of failing.
        """
        if traversing_node.is_leaf():
            return traversing_node.class_
        value = X_test[traversing_node.feature_].iloc[0]
        child = traversing_node.children_.get(value)
        if child is None:
            return traversing_node.default_class_
        return self.predict_recursive(child, X_test)

    def construct(self):
        """Build the tree from the stored training data.

        Raises:
            ValueError: If the training set is empty.
        """
        if len(self.X_) == 0:
            raise ValueError("cannot build a decision tree from an empty training set")
        features = self.X_.columns
        # The all-True root mask must share X's index; a default RangeIndex
        # would not align with frames that use any other index.
        tuples = pd.Series(True, index=self.X_.index)
        self.head_node_ = self.construct_recursive(self.X_, self.y_, features, tuples)

    def predict(self, X_test):
        """Return the predicted label for the first row of ``X_test``.

        Raises:
            RuntimeError: If construct() has not been called yet.
        """
        if self.head_node_ is None:
            raise RuntimeError("construct() must be called before predict()")
        return self.predict_recursive(self.head_node_, X_test)

In [3]:
# Load the golf dataset from 'golf.csv' (resolved relative to the working
# directory) and display the resulting frame.
df = pd.read_csv('golf.csv')
df

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Play Golf
0,Rainy,Hot,High,False,No
1,Rainy,Hot,High,True,No
2,Overcast,Hot,High,False,Yes
3,Sunny,Mild,High,False,Yes
4,Sunny,Cool,Normal,False,Yes
5,Sunny,Cool,Normal,True,No
6,Overcast,Cool,Normal,True,Yes
7,Rainy,Mild,High,False,No
8,Rainy,Cool,Normal,False,Yes
9,Sunny,Mild,Normal,False,Yes


In [4]:
# Show the column labels of the loaded frame.
df.columns

Index(['Outlook', 'Temperature', 'Humidity', 'Windy', 'Play Golf'], dtype='object')

In [5]:
# Separate the feature columns from the 'Play Golf' target column.
X = df.drop(columns=['Play Golf'])
y = df.loc[:, 'Play Golf']

In [6]:
# Display the feature frame (every column except 'Play Golf').
X

Unnamed: 0,Outlook,Temperature,Humidity,Windy
0,Rainy,Hot,High,False
1,Rainy,Hot,High,True
2,Overcast,Hot,High,False
3,Sunny,Mild,High,False
4,Sunny,Cool,Normal,False
5,Sunny,Cool,Normal,True
6,Overcast,Cool,Normal,True
7,Rainy,Mild,High,False
8,Rainy,Cool,Normal,False
9,Sunny,Mild,Normal,False


In [7]:
# Display the target labels taken from the 'Play Golf' column.
y

0      No
1      No
2     Yes
3     Yes
4     Yes
5      No
6     Yes
7      No
8     Yes
9     Yes
10    Yes
11    Yes
12    Yes
13     No
Name: Play Golf, dtype: object

In [8]:
# Create the classifier with the training data, then build the tree;
# construct() must run before any call to predict().
dtc = DecisionTreeClassifier(X, y)
dtc.construct()

In [9]:
# Single-row test frame using the same feature columns as the training data.
X_test = pd.DataFrame(
    {
        'Outlook': ['Rainy'],
        'Temperature': ['Hot'],
        'Humidity': ['High'],
        'Windy': [True],
    }
)
X_test

Unnamed: 0,Outlook,Temperature,Humidity,Windy
0,Rainy,Hot,High,True


In [10]:
# Classify the single test row with the built tree and print the predicted label.
print(dtc.predict(X_test))

No
