# <center> Implementing Decision Trees </center>
## <center> INF283 - Project 1 </center>
### <center> Sindre E. de Lange </center>

In [None]:
# Uncomment if needing to install treelib
#! pip install treelib

In [1]:
import numpy as np
import pandas as pd

import os
import sys

In [2]:
# To import ImpurityMeasure
sys.path.append("../classes/")

In [3]:
from ImpurityMeasure import *
from DataCleaning import *
# Utilizing the simple tree library that is "Treelib"
from treelib import Node, Tree

In [4]:
# Helper from the local DataCleaning module; later cells call its
# removeQmarksDf and factorizeDf methods on the loaded data frames
d_clean = DataCleaning()

## Getting data in order to test the model

In [5]:
# Directory that holds the CSV datasets, relative to this notebook
DATASET_PATH = "../csv/"
# List the directory contents to confirm the expected files are present
print(os.listdir(DATASET_PATH))

['mushrooms.csv', 'tennis.csv']


In [6]:
# Load the mushroom dataset into a DataFrame
mushrooms_dataset = 'mushrooms.csv'
dataset_mushrooms = pd.read_csv(DATASET_PATH + mushrooms_dataset)

In [7]:
# Separate the label column from the feature columns (raw mushroom data)
target_var = 'class'
y_shrooms = dataset_mushrooms[target_var]
X_shrooms = dataset_mushrooms.drop(columns=[target_var])

In [None]:
dataset_mushrooms.head()

In [19]:
# Run the raw mushroom frame through DataCleaning.removeQmarksDf
# NOTE(review): presumably this removes/handles '?' placeholder entries - confirm in DataCleaning
data_shrooms_no_qmarks = d_clean.removeQmarksDf(dataset_mushrooms)

In [None]:
data_shrooms_no_qmarks.shape

In [None]:
# Separate the label column from the feature columns ('?'-cleaned mushroom data)
target_var = 'class'
y_no_qmarks = data_shrooms_no_qmarks[target_var]
X_no_qmarks = data_shrooms_no_qmarks.drop(columns=[target_var])

In [20]:
# Run the cleaned mushroom frame through DataCleaning.factorizeDf
# NOTE(review): presumably this integer-encodes the categorical columns - confirm in DataCleaning
dataset_mushrooms_fact = d_clean.factorizeDf(data_shrooms_no_qmarks)

In [21]:
# Separate the label column from the feature columns (factorized mushroom data)
target_var = 'class'
y_shrooms_fact = dataset_mushrooms_fact[target_var]
X_shrooms_fact = dataset_mushrooms_fact.drop(columns=[target_var])

## Tennis dataset

In [None]:
# Load the tennis dataset into a DataFrame
tennis_dataset = "tennis.csv"
dataset_tennis = pd.read_csv(DATASET_PATH + tennis_dataset)

In [None]:
dataset_tennis.head()

In [None]:
dataset_tennis.info()

In [None]:
dataset_tennis.describe()

In [None]:
# Run the tennis frame through DataCleaning.factorizeDf
# NOTE(review): presumably this integer-encodes the categorical columns - confirm in DataCleaning
data_tennis_factorized = d_clean.factorizeDf(dataset_tennis)

In [None]:
data_tennis_factorized

In [None]:
# Separate the label column from the feature columns (factorized tennis data)
target_var = 'play'
y_tennis_enc = data_tennis_factorized[target_var]
X_tennis_enc = data_tennis_factorized.drop(columns=[target_var])

In [None]:
# Baseline for comparison: scikit-learn's decision tree on the factorized tennis data.
# Note that `tree` is the sklearn module here; later cells rebind `tree`
# to the result of learn().
from sklearn import tree
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for scoring; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X_tennis_enc, y_tennis_enc, test_size=0.3, random_state=42
)
# fit() returns the fitted estimator itself
clf = tree.DecisionTreeClassifier().fit(X_train, y_train)
print(clf.score(X_test, y_test))

## Data for verifying the model *check*

# Model implementation

# Notes:
- The best feature to split on is the one with the highest information gain, i.e. the one whose split gives the largest reduction in entropy (the lowest weighted entropy of the resulting subsets)
    - After splitting on the best feature, re-evaluate the information gain of each feature within every subset and again pick the one with the highest gain

# Problems:
> Cannot instantiate and use imp_measure inside the **learn** method <br>
> Unsure about the **Gini Index** calculations --> lowest gini no IG<br>
> Not able to return **classification label** from predict method <br>
> Cannot find '?' in mushroom dataset

# Q's
> Can we use scikit-learn's train_test_split to split the data into training and test sets? If so - use **stratify** to keep the class distribution even across the two sets

In [None]:
#imp_measure_alt = "entropy"
#imp_measure_alt = "gini"

In [None]:
#imp_measure = ImpurityMeasure(imp_measure_alt)

In [8]:
def learn(X, y, imp_measure_alt='entropy'):
    """Learn a decision tree classifier from the data (X, y).

    X, y = training features and target variable, respectively;
        cleaned data is expected, in particular categorical, discrete values
    imp_measure_alt = name of the impurity measure passed to ImpurityMeasure,
        'entropy' unless stated otherwise

    Returns the tree built by make_tree.
    """
    measure = ImpurityMeasure(imp_measure_alt)
    # Hand make_tree a fresh, empty tree; it creates the root and everything below
    return make_tree(X, y, Tree(), measure)

In [9]:
def make_tree(X, y, tree, imp_measure, current_node=None):
    """Recursive method to make a tree.

    X, y = the data: training features and target variable, respectively
    tree = the tree object one wishes to populate
    imp_measure = the impurity measure object; its
        getLargestInformationGain(X, y) result is used as the column to split on
    current_node = the node currently being expanded; None on the initial
        call, in which case the root node is created first

    Returns the tree that was passed in, after populating it.
    """
    # Initial call: tree not initialized - store the whole dataset in the root.
    # A fixed identifier is used so a reference to the root can be fetched back.
    if current_node is None:
        # Combine data to one dataset
        data = pd.concat([X, y], axis=1)
        root_node_name = "root_node"
        # The root is tagged with the best column to split on
        root_node_tag = imp_measure.getLargestInformationGain(X, y)
        tree.create_node(tag=root_node_tag, identifier=root_node_name, data=data)
        current_node = tree.get_node(root_node_name)
        return make_children(X=X, y=y, tree=tree, imp_measure=imp_measure, current_node=current_node)

    # Leaf case 1: only one unique value left in the target variable.
    # The leaf gets that value as both its name and its data.
    labels = set(y)
    if len(labels) == 1:
        (element, ) = labels
        tree.create_node(tag=str(element), data=element, parent=current_node)
        return tree

    # Leaf case 2: no columns left in X, i.e. every column has been split on.
    # Fall back to the most frequent label. mode() returns the most common
    # value(s) sorted, so ties resolve to the smallest label; y.max() would
    # return the largest label regardless of how often it occurs.
    if len(X.columns) == 0:
        majority = y.mode().iloc[0]
        tree.create_node(tag=str(majority), data=majority, parent=current_node)
        return tree

    # Otherwise: re-tag this node with the best column to split on and expand it
    current_node.tag = imp_measure.getLargestInformationGain(X, y)
    return make_children(X=X, y=y, tree=tree, imp_measure=imp_measure, current_node=current_node)

In [10]:
def make_children(X, y, tree, imp_measure, current_node):
    """Create the child nodes of a node and expand each of them.

    X, y = dataset of the current node (features and target variable)
    tree = the tree object one wishes to populate
    imp_measure = the impurity measure object, passed on to make_tree
    current_node = the node to define children for; its tag is the name of
        the column to split on

    Returns the tree after all children have been expanded.
    """
    split_column = current_node.tag
    target_name = y.name
    combined = pd.concat([X, y], axis=1)

    # One child per unique value in the split column, tagged with that value
    # and holding the matching rows. The split column is kept in the child's
    # data; getClassificationLabel reads it to find the matching branch.
    for value in list(set(X[split_column])):
        tree.create_node(
            tag=str(value),
            data=split_data(value, combined, split_column),
            parent=current_node,
        )

    # create_node does not hand back usable references here, so walk the
    # child identifiers in a second pass and expand each child in turn
    for child_id in current_node.fpointer:
        child = tree.get_node(child_id)
        child_X = child.data.drop([target_name], axis=1)
        child_y = child.data[target_name]
        tree = make_tree(X=child_X, y=child_y, tree=tree, imp_measure=imp_measure, current_node=child)

    return tree

In [11]:
def split_data(value, data, column):
    """Return the rows of the dataframe whose entry in the given column
    equals value.

    value = the value to keep
    data = the dataframe to filter
    column = name of the column to compare against value
    """
    matches = data[column] == value
    return data[matches]

In [12]:
def predict(x, tree):
    """Predict class label of some new data point x.

    x = the data point to classify
    tree = a tree as returned by learn

    Prints the predicted label and also returns it, so that callers can
    compare it with the true label.
    """
    # Classification starts at the root and walks down to a leaf
    root = tree.get_node(tree.root)
    classification_value = getClassificationLabel(x=x, tree=tree, current_node=root)
    print(classification_value)
    return classification_value

In [13]:
def getClassificationLabel(x, tree, current_node):
    """Walk down from current_node to a leaf and return the leaf's label.

    x = inputted data to classify; must support x.get(column)
    tree = the learned tree
    current_node = current node to inspect (its children)

    Raises ValueError if none of current_node's children matches x, e.g. when
    x holds a value for the split column that no branch was built for.
    """
    # Current node's name is the name of the column that was chosen to split on
    split_column = current_node.tag
    # Find the column in the inputted data = x, and get its value
    val_x = x.get(split_column)
    # Loop through the children of current node
    for child_id in current_node.fpointer:
        # fpointer only holds identifiers - need the actual child node
        child = tree.get_node(child_id)
        # A leaf child stores the classification label as its data
        if child.is_leaf():
            return child.data
        # A non-leaf child stores the rows that were routed to it. Compare
        # against the column's values: `val_x in series` would test the
        # index labels of the Series instead of its values.
        if (child.data[split_column] == val_x).any():
            return getClassificationLabel(x=x, tree=tree, current_node=child)

    raise ValueError(
        "No branch for value {!r} in column {!r}".format(val_x, split_column)
    )

In [24]:
# Classify the sample row with the learned tree.
# NOTE: `not_optimal_test_data` and `tree` are assigned in cells further down
# in this notebook; those cells must have been run before this one.
predict(not_optimal_test_data, tree)

0


In [14]:
# Learn a tree from the raw mushroom data (not passed through DataCleaning).
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt,
# i.e. it was stopped by hand - presumably it did not finish in reasonable time; confirm.
tree = learn(X_shrooms, y_shrooms)
print(tree)

KeyboardInterrupt: 

In [26]:
# Take the first row of the factorized features as a quick sample for predict()
not_optimal_test_data = X_shrooms_fact.iloc[0]
# Print the matching true label for comparison with the prediction
print(y_shrooms_fact.iloc[0])

0


In [None]:
# Learn a tree from the mushroom data returned by removeQmarksDf (not factorized)
tree_shrooms = learn(X_no_qmarks, y_no_qmarks)
print(tree_shrooms)

In [22]:
# Learn a tree from the factorized mushroom data, using the default
# impurity measure of learn() ('entropy'), and print its structure
tree = learn(X_shrooms_fact, y_shrooms_fact)
print(tree)

odor
├── 0
│   └── 0
├── 1
│   └── 1
├── 2
│   └── 1
├── 4
│   └── 0
├── 5
│   └── 0
├── 6
│   └── 0
├── 7
│   └── 0
├── 8
│   └── 0
└── spore-print-color
    ├── 0
    │   └── 1
    ├── 1
    │   └── 1
    ├── 3
    │   └── 1
    ├── 5
    │   └── 0
    ├── 6
    │   └── 1
    ├── 7
    │   └── 1
    ├── 8
    │   └── 1
    └── habitat
        ├── 1
        │   └── 1
        ├── 4
        │   └── 1
        ├── 5
        │   └── 1
        ├── cap-color
        │   ├── 0
        │   │   └── 1
        │   ├── 1
        │   │   └── 0
        │   ├── 2
        │   │   └── 0
        │   └── 8
        │       └── 1
        └── gill-size
            ├── 0
            │   └── 0
            └── 1
                └── 1

