# <center> Implementing Decision Trees </center>
## <center> INF283 - Project 1 </center>
### <center> Sindre E. de Lange </center>

In [None]:
# Uncomment if needing to install treelib
#! pip install treelib

In [2]:
import numpy as np
import pandas as pd

import os
import sys

In [11]:
# To import ImpurityMeasure
sys.path.append("../classes/")

In [8]:
from ImpurityMeasure import *
# Utilizing the simple tree library that is "Treelib"
from treelib import Node, Tree

## Getting data in order to test the model

In [12]:
DATASET_PATH = "../csv/"
print(os.listdir(DATASET_PATH))

['mushrooms.csv', 'tennis.csv']


In [3]:
tennis_dataset = "tennis.csv"
dataset_tennis = pd.read_csv(DATASET_PATH + tennis_dataset)

In [4]:
dataset_tennis.head()

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes


In [5]:
dataset_tennis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
outlook     14 non-null object
temp        14 non-null object
humidity    14 non-null object
windy       14 non-null bool
play        14 non-null object
dtypes: bool(1), object(4)
memory usage: 542.0+ bytes


In [6]:
dataset_tennis.describe()

Unnamed: 0,outlook,temp,humidity,windy,play
count,14,14,14,14,14
unique,3,3,2,2,2
top,rainy,mild,normal,False,yes
freq,5,6,7,8,9


In [7]:
data_enc = dataset_tennis
for columns in dataset_tennis:
    data_enc[columns], unique_data = pd.factorize(dataset_tennis[columns])

In [8]:
print(data_enc.head())

   outlook  temp  humidity  windy  play
0        0     0         0      0     0
1        0     0         0      1     0
2        1     0         0      0     1
3        2     1         0      0     1
4        2     2         1      0     1


In [9]:
X_enc = data_enc.drop(['play'], axis=1)
y_enc = data_enc['play']

In [10]:
X_enc.head()

Unnamed: 0,outlook,temp,humidity,windy
0,0,0,0,0
1,0,0,0,1
2,1,0,0,0
3,2,1,0,0
4,2,2,1,0


In [11]:
y_enc.head()

0    0
1    0
2    1
3    1
4    1
Name: play, dtype: int64

In [12]:
from sklearn import tree
from sklearn.model_selection import train_test_split
clf = tree.DecisionTreeClassifier()
X_train, X_test, y_train, y_test = train_test_split(X_enc, y_enc, test_size=0.3, random_state=42)
clf = clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

0.6


## Data for verifying the model *check*

# Model implementation

# Notes:
- The best feature to pick as the one to classify on is the one with the most information (gain), i.e. highest entropy
    - After finding the best feature, re-evaluate the entropy of each feature and again pick the one with the highest entropy

# Problems:
> cannot instantiate and use imp_measure inside the **learn** method <br>
> unsure about the **Gini Index** calculations <br>
> Not able to return **classification label** from predict method

In [13]:
imp_measure = ImpurityMeasure("entropy")

In [14]:
def learn(X, y, impurity_measure_alt='entropy'):
    """Function that learns a decision tree classifier from data (=X, y).
        Default impurity measure for information gain is Entropy.
        Expects cleaned data, in particular categorical, discrete values"""
    tree = Tree()
    tree = make_tree(X, y, tree)
    return tree

In [15]:
def make_tree(X, y, tree, current_node=None):
    """Recursive method to make a tree
    X, y = the data, training- and target variables, respectively
    impurity_measer = the desired measure of impurity (entropy or gini index)
    tree = the tree object one wishes to populate
    current_node = the current node one stands one"""
    # Edge case: tree not initialized - store whole dataset in root
    # Also necessary to get a reference to root node (because of identifier)
    if current_node is None:
        # Combine data to one dataset
        data = pd.concat([X, y], axis=1)
        # Set root node name to something generic that is easy to get
        root_node_name = "root_node"
        # Get best split
        root_node_tag = imp_measure.getLargestInformationGain(X, y)
        # Make the root node, store the entire dataset
        tree.create_node(tag=root_node_tag, identifier=root_node_name, data=data)
        # Get a reference to the root node
        current_node = tree.get_node(root_node_name) 
        # Call recursive method
        return make_children(X=X, y=y, tree=tree, current_node=current_node)
    # Tree is initialized
    else:
         # Edge cases - no children to make:
        # 1. Unique values in target variable = y
        if len(set(y)) == 1:
            (element, ) = set(y)
            # Make a node of the leaf
            # and set its single unique value as its name, and value
            node_name = str(element)
            data = element
            tree.create_node(tag=node_name, data=data, parent=current_node)
            return tree
        # 2. No columns left in X, i.e. splitted on entire dataset
        elif len(X.columns) == 0:
            # Set to majority in y
            data = y.max()
            node_name = str(data)
            tree.create_node(tag=node_name, data=data, parent=current_node)
            return tree
        else:
            node_name = imp_measure.getLargestInformationGain(X, y)
            current_node.tag = node_name
            return make_children(X=X, y=y, tree=tree, current_node=current_node)

In [16]:
def make_children(X, y, tree, current_node):
    """Identify and initialize the children for a specific node
    X, y = dataset of current node
    impurity_measer = the desired measure of impurity (entropy or gini index)
    tree = the tree object one wishes to populate
    current_node = the current node one stands one, and wishes to define children"""
    # For each unique value in the parents column - make a child node
    child_list = list(set(X[current_node.tag]))
    data = pd.concat([X, y], axis=1)
    for value in child_list:
        node_name = str(value)
        # Split dataset
        data_loc = split_data(value, data, current_node.tag)
        # Remove parent column - for children nodes
        #data_loc = data_loc.drop([current_node.tag], axis=1)
        tree.create_node(tag=node_name, data=data_loc, parent=current_node)
    # Need referece to each child node, therefore new loop
    for children_node in current_node.fpointer:
        current_node = tree.get_node(children_node)
        X = current_node.data.drop([y.name], axis=1)
        y = current_node.data[y.name]
        tree = make_tree(X=X, y=y, tree=tree, current_node=current_node)
             
    return tree

In [17]:
def split_data(value, data, column):
    """Splits the dataframe such that it return the rows, where the specified
    columns value == value"""
    data_loc = data.loc[data[column] == value]
    return data_loc

In [None]:
root = tree.root

In [None]:
print(root)

In [20]:
def predict(x, tree):
    """Predict class label of some new data point x."""
    
    current_node = tree.get_node(tree.root)
    classification_value = getClassificationLabel(x=x, tree=tree, current_node=current_node)
    print(classification_value)

In [21]:
def getClassificationLabel(x, tree, current_node):
    """x = inputed data to classify
    current_node = current node to inspect (its children)"""
    # Current node's name is the name of the column that is (presumably) best to split on
    split_column = current_node.tag
    # Find the children node which has the same value, in the same column
    correct_children_node = None
    # Find the column in the inputed dataset = x, and get its value
    val_x = x.get(split_column)
    # Loop through the children of current node
    for node_children in current_node.fpointer:
        # node_children is only identifier to children - need to get the actual references to those children
        current_children_node = tree.get_node(node_children)
        # Check if children node is leaf - return its data = classification label
        if current_children_node.is_leaf():
            return current_children_node.data
        # If a node's column = parents tag contains val_x --> correct child node
        if val_x in current_children_node.data[split_column]:
            correct_children_node = current_children_node
            break
    getClassificationLabel(x=x, tree=tree, current_node=correct_children_node)

In [24]:
predict(not_optimal_test_data, tree)

None


In [18]:
tree = learn(X_enc, y_enc)
print(tree)

outlook
├── 1
│   └── 1
├── humidity
│   ├── 0
│   │   └── 0
│   └── 1
│       └── 1
└── windy
    ├── 0
    │   └── 1
    └── 1
        └── 0



In [23]:
not_optimal_test_data = X_enc.iloc[0]

In [None]:
tree.get_node("root_node")
for children in 