# <center> Implementing Decision Trees </center>
## <center> INF283 - Project 1 </center>
### <center> Sindre E. de Lange </center>

In [None]:
# Uncomment if needing to install treelib
#! pip install treelib

In [5]:
import numpy as np
import pandas as pd

import os
import sys

In [6]:
# To import ImpurityMeasure and DataCleaning
sys.path.append("../classes/")

In [7]:
from ImpurityMeasure import *
from DataCleaning import *
# Utilizing the simple tree library that is "Treelib"
from treelib import Node, Tree

In [8]:
d_clean = DataCleaning()

# Getting data in order to test the model

In [13]:
DATASET_PATH = "../csv/"
print(os.listdir(DATASET_PATH))

['mushrooms.csv', 'tennis.csv']


## Mushrooms dataset

In [12]:
mushrooms_dataset = 'mushrooms.csv'
dataset_mushrooms = pd.read_csv(DATASET_PATH + mushrooms_dataset)

In [7]:
dataset_mushrooms.shape

(8124, 23)

In [8]:
# Data cleaning
data_shrooms_no_qmarks = d_clean.removeQmarksDf(dataset_mushrooms)
data_shrooms_fact_no_qmarks = d_clean.factorizeDf(data_shrooms_no_qmarks)

In [9]:
data_shrooms_fact_no_qmarks.shape

(5644, 23)

In [10]:
target_var = 'class'
X_no_qmarks_fact = data_shrooms_fact_no_qmarks.drop([target_var], axis=1)
y_no_qmarks_fact = data_shrooms_fact_no_qmarks[target_var]

In [126]:
X_no_qmarks_fact_train, X_no_qmarks_fact_test, y_no_qmarks_fact_train, y_no_qmarks_fact_test = train_test_split(X_no_qmarks_fact, y_no_qmarks_fact, test_size=0.3, random_state=42, stratify=y_no_qmarks_fact)

## Tennis dataset

In [14]:
tennis_dataset = "tennis.csv"
dataset_tennis = pd.read_csv(DATASET_PATH + tennis_dataset)

In [15]:
dataset_tennis.head()

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes


In [16]:
# Data cleaning
data_tennis_factorized = d_clean.factorizeDf(dataset_tennis)

In [17]:
data_tennis_factorized.head()

Unnamed: 0,outlook,temp,humidity,windy,play
0,0,0,0,0,0
1,0,0,0,1,0
2,1,0,0,0,1
3,2,1,0,0,1
4,2,2,1,0,1


In [18]:
target_var = 'play'
X_tennis_enc = data_tennis_factorized.drop([target_var], axis=1)
y_tennis_enc = data_tennis_factorized[target_var]

In [49]:
X_tennis_enc_train, X_tennis_enc_test, y_tennis_enc_train, y_tennis_enc_test = train_test_split(X_tennis_enc, y_tennis_enc, test_size=0.2, random_state=42, stratify=y_tennis_enc)

## Data for verifying the model *check*

# Model implementation

In [21]:
def findBestTree(tree0_acc, tree0, tree1_acc, tree1, origin_acc, origin):
    """ Find The Best Tree
    
    Finds the best tree, based on their acuraccies
    
    Args:
        tree0_acc: float
        tree0: treelib.tree.Tree
        tree1_acc: float
        tree1: treelib.tree.Tree
        origin_acc: float
        origin: treelib.tree.Tree
        
    Returns:
        best_copy/origin: treelib.tree.Tree
        copy_acc/origin_acc: float
        string/"origin": String
    """
    best_copy = tree0 if tree0_acc >= tree1_acc else tree1
    return best_copy if copy_acc >= origin_acc else origin

In [22]:
def learn(X, y, imp_measure_alt='entropy', pruning=False, pruning_amount=0.45):
    """ Learn
    
    Learns a decision tree classifier from data.
    NOTE: Expects cleaned data, in particular categorical, discrete values (pd.factorize)
    
    Args:
        X: pandas dataframe
        y: pandas series
        imp_measure_alt: String. How to calculate the information gain for the datasets column.
            Either 'entropy' (standard) or 'gini'
        pruning: Boolean. To use pruning, or not to use pruning - that is the question
        pruning_amount: Float. Percentage distribution of the training dataset
        
    Returns:
        treelib.tree.Tree. Tree classifier learned from data
        """
    
    # Divide into training and pruning dataset
    if pruning:
        X, X_prune, y, y_prune= train_test_split(X, y, test_size=pruning_amount, random_state=42, stratify=y)
    
    imp_measure = ImpurityMeasure(imp_measure_alt)
    tree = Tree()
    tree = make_tree(X, y, tree, imp_measure)
    if pruning:
        tree = prune(X_prune, y_prune, tree)
    return tree

In [23]:
def prune_nodes(X_prune, y_prune, tree, node):
    """ Prune Nodes
    
    Checks the accuracy gain/loss obtained by each inputed node, by seeing if the accuracy in a tree with that specific node,
    and without that specific node, when alternating its parents label. 
    
    Args:
        X_prune: pandas dataframe
        y_prune: pandas series
        tree: treelib.tree.Tree
        node: treelib.node.Node
        
    Returns:
        treelib.tree.Tree. Pruned or not pruned
    
    """
    # Get parent pointer for the node
    parent_node_ref = node.bpointer
    parent_node = tree.get_node(parent_node_ref)
    
    # Create three copies
    orig_tree = Tree(tree.subtree(tree.root), deep=True)
    copy_0 = Tree(orig_tree.subtree(orig_tree.root), deep=True)
    copy_1 = Tree(orig_tree.subtree(orig_tree.root), deep=True)
    
    # Set up the new trees
    copy_0 = edit_child_nodes(copy_0, parent_node, 0)
    copy_1 = edit_child_nodes(copy_1, parent_node, 1)
    
    # Calculate accuracy for new trees
    acc_orig = accuracy(X_prune, y_prune, orig_tree)
    acc_copy_0 = accuracy(X_prune, y_prune, copy_0)
    acc_copy_1 = accuracy(X_prune, y_prune, copy_1)

    best_tree = findBestTree(acc_copy_0, copy_0, acc_copy_1, copy_0, acc_orig, orig_tree)
    print("Best tree: \n", best_tree)
    return best_tree

In [24]:
def prune(X_prune, y_prune, tree):
    """ Prune
    
    Prunes a tree, e.g. removes seemingly unecessary nodes while maintaing accuracy
    
    Args:
        X_prune: pandas dataframe
        y_prune: pandas series
        tree: treelib.tree.Tree

    Returns:
        treelib.tree.Tree. Optimal tree after pruning
    """
    
    # Keep track of the original tree - to compare with the returned trees
    tree_orig = tree
    # A small hack in order to stop when there is no improvement
    same_tree=False
    
    while not same_tree:
        # Get an iterator of all the nodes
        nodes_iterator = tree.all_nodes_itr()
        # Need to loop through, reverse style
        nodes_list_reversed =  reversed(list(tree.all_nodes_itr()))
        for node in nodes_list_reversed:
            # Edge case, when reached the root node
            if (node.identifier == tree.root):
                break
            # When the tree is updated in prune_nodes, the node list should also be updated
            elif not node in tree.all_nodes_itr():
                break
            else:
                tree = prune_nodes(X_prune, y_prune, tree, node)
        # No change, i.e. no improvement from pruning. 
        if checkIFDictEquals(tree.nodes, tree_orig.nodes):
            same_tree = True
    return tree

In [25]:
def edit_child_nodes(tree, node, data):
    """ Edit Child Nodes
    
    Edit the data of a node in the given tree, and delete its children
    
    Args:
        tree: treelib.tree.Tree
        node: treelib.node.Node
        data: int
    
    Returns:
        treelib.tree.Tree
    
    node.data = data
    # Delete its children
    for children in node.fpointer:
        tree.remove_node(children)
    # Return the updated tree
    """
    get_node = tree.get_node(node.identifier)
    get_node.data = data
    for children in get_node.fpointer:
        tree.remove_node(children)
    # This means there is no need to delete children - implicitly deleted
    
    return tree

In [158]:
learn(X_no_qmarks_fact_train, y_no_qmarks_fact_train, imp_measure_alt="entropy", pruning=True)

NameError: name 'copy_acc' is not defined

In [26]:
def accuracy(X_prune, y_prune, this_tree):
    """ Accuracy
    
    Calculates the accuracy: Number of errors/total data length
    
    Args:
        X_prune: pandas dataframe
        y_prune: pandas series
        this_tree: treelib.tree.Tree
        
    Returns:
        float. corrected predicted labels / total number of labels
    """
    data_len = len(y_prune)
    correct = 0
    for idx, val in enumerate(y_prune):
        predicted_label = predict(X_prune.iloc[idx], this_tree)
        if predicted_label == val:
            correct += 1
    return (correct/data_len)

In [28]:
def make_tree(X, y, tree, imp_measure, current_node=None):
    """ Make Tree
    
    Recursive method to make a tree of the type treelib.tree.Tree
    
    Args:
        X: pandas dataframe. Training features
        y: pandas series. Target variable
        tree: treelib.tree.Tree. Tree object to populate
        imp_measure: String. Name of impurity measure - either 'entropy' or 'gini'
        current_node: treelib.node.Node. Current node to build subtree from. 
        
    Returns:
        treelib.tree.Tree. A (populated) decision tree based on inputed datasets X and y
    """
    # Edge case: tree not initialized - store whole dataset in root
    if current_node is None:
        # Combine data to one dataset
        data = pd.concat([X, y], axis=1)
        # Set root node name to something generic that is easy to get
        # Get best split
        root_node_tag = imp_measure.getLargestInformationGain(X, y)
        # Make the root node, store the entire dataset
        tree.create_node(tag=root_node_tag, data=data)
        # Get a reference to the root node
        current_node = tree.get_node(tree.root) 
        # Call recursive method
        return make_children(X=X, y=y, tree=tree, imp_measure=imp_measure, current_node=current_node)
    # Tree is initialized
    else:
         # Edge cases - no children to make:
        # 1. Unique values in target variable = y
        if len(set(y)) == 1:
            (element, ) = set(y)
            # Make a node of the leaf
            # and set its single unique value as its name, and value
            node_name = str(element)
            data = element
            tree.create_node(tag=node_name, data=data, parent=current_node)
            return tree
        # 2. No columns left in X, i.e. splitted on entire dataset
        elif len(X.columns) == 0:
            # Set to majority in y
            data = y.max()
            node_name = str(data)
            tree.create_node(tag=node_name, data=data, parent=current_node)
            return tree
        else:
            node_name = imp_measure.getLargestInformationGain(X, y)
            # Create a new node with the name = name of best split column, and the data of its parent
            current_node = tree.create_node(tag=node_name, data=current_node.data, parent=current_node)
            return make_children(X=X, y=y, tree=tree, imp_measure=imp_measure, current_node=current_node)

In [29]:
def make_children(X, y, tree, imp_measure, current_node):
    """ Make Children for a specific column/node in a tree
    
    Identifies the unique values, in a column, in a dataset, initialized node corresponding to that 
    value, and appends them to current node, i.e. their parent.
    
    Args:
        X: pandas dataframe.
        y: pandas series.
        imp_measure: String. Name of impurity measure - either 'entropy' or 'gini'
        current_node: Reference to a specific node, in a tree, that one wishes to populate with children
        
    Returns:
        treelib.tree.Tree. Inputed tree, with appended children of current node
    """
    # For each unique value in the parents column - make a child node
    child_list = list(set(X[current_node.tag]))
    data = pd.concat([X, y], axis=1)
    for value in child_list:
        node_name = str(value)
        # Split dataset
        data_loc = split_data(value, data, current_node.tag)
        # Remove parent column - for children nodes
        tree.create_node(tag=node_name, data=data_loc, parent=current_node)
    # Need referece to each child node, therefore new loop
    for children_node in current_node.fpointer:
        current_node = tree.get_node(children_node)
        X = current_node.data.drop([y.name], axis=1)
        y = current_node.data[y.name]
        tree = make_tree(X=X, y=y, tree=tree, imp_measure=imp_measure, current_node=current_node)
             
    return tree

In [30]:
def split_data(value, data, column):
    """ Split Data
    
    Splits a dataframe such that it return the rows, where the specified
    columns value == value
    
    Args:
        value: int/float/whatever, as long as it corresponds to det value in the column, in the dataframe
        data: pandas dataframe. 
        column: String. 
        
    Returns:
        pandas dataframe. 
    """
    data_loc = data.loc[data[column] == value]
    return data_loc

In [31]:
def predict(x, tree):
    """Predict class label of some new data point x.
    
    Takes in a row of data, runs it through a tree classifier, and outputs the classifiers predicted label. 
    
    Args:
        x: pandas series. Data row to predict on
        tree: treelib.tree.Tree. Tree classifier to predict the data row's label
    
    Returns:
        int. Assuming the dataset is factorized, otherwise it will be whatever the values are in the target variable series.
    """
    
    current_node = tree.get_node(tree.root)
    classification_value = getClassificationLabel(x=x, tree=tree, current_node=current_node)
    return classification_value

In [32]:
def getClassificationLabel(x, tree, current_node):
    """ Get Classification Label
    
    Recursive method that uses the inputed data row 'x' to traverse the decision tree,
    find the leaf that corresponds to 'x's data, and return its label/data
    
    Args:
        x: pandas series. Data row to predict on
        tree: treelib.tree.Tree. Tree classifier to predict the data row's label
        current_node: treelib.node.Node. Current node to check if one of its children is the correct leaf
    
    Returns:
        int. Assuming the dataset is factorized, otherwise it will be whatever the values are in the target variable series.
    """
    
    # Current node's name is the name of the column that is (presumably) best to split on
    # Note - after a small change to the tree setup, we have to use the parents
    # name, if one is at a node that represents a unique value from parents column
    current_node_tag = current_node.tag
    if current_node_tag.isdigit():
        # Get parents name 
        parent = tree.get_node(current_node.bpointer)
        split_column = parent.tag
    else:
        split_column = current_node.tag
    # Find the children node which has the same value, in the same column
    correct_children_node = None
    # Find the column in the inputed dataset = x, and get its value
    val_x = x.get(split_column)
    # Hacky, but does not work to return inside a for loop, and no time to change logic
    correct_label = None
    # Loop through the children of current node
    for node_children in current_node.fpointer:
        # node_children is only identifier to children - need to get the actual references to those children
        current_children_node = tree.get_node(node_children)
        # Check if children node is leaf - return its data = classification label
        if current_children_node.is_leaf():
            correct_label = current_children_node.data
            break
        # If a node's column = parents tag contains val_x --> correct child node
        elif val_x in current_children_node.data[split_column].values:
            correct_children_node = current_children_node
            break
        else:
            #There is no child node with that specific value in that column
            # Set label to majority in current node
            target_variabel = current_node.data.drop(columns=x.index, axis=1)
            # So hacky I might throw up, but when I get a df, not a series.. 
            correct_label = getMostFrequentValueFromPandasDFThatShouldBeASeries(target_variabel)
          
    if correct_label is None:
        return getClassificationLabel(x=x, tree=tree, current_node=correct_children_node)
    
    return correct_label

In [33]:
def getMostFrequentValueFromPandasDFThatShouldBeASeries(df):
    """ Get The Most Frequent Value From a Panas Dataframe, That Probably Should've Been a Pandas Series
    
    Was having trouble with accessing the target variable column in a data frame, because it was returned as 
    a pandas dataframe. This method takes a Pandas Dataframe (1D), accesses it's single column name, and 
    returns the value of the most frequent value, in that column.
    
    Args:
        df: pandas dataframe.
        
    Returns:
        int/float/whatever that is stored in the target variable, that appears most frequently. 
    
    """
    column_name = df.columns.values[0]
    return list((df[column_name].value_counts()).index)[0]

In [34]:
def isLeaf(node):
    """ Is a Leaf
    
    Check whether a specific node should be a leaf or not. 
    Currently not in use, but seemingly ok to have, if I were to continue with this project. 
    
    Args:
        node: treelib.node.Node
        
    Returns:
        boolean    
    """
    if len(node.fpointer) == 0:
        return True
    elif len(set(node.data['play'])) == 1:
        return True
    else:
        return False

In [53]:
def checkIFDictEquals(dict1, dict2):
    """ Check if two dictionaries are equal
    
    A somewhat hacky way of comparing two dictionaries. Checks "equalness" by seeing if the two dictionaries have 
    the same amount of keys. 
    This works (somewhat) for treelib, in this context, because amount of keys = amount of nodes. 
    If two trees have exactly the same amount of nodes, then there is a pretty good chance they are the same. 
    
    Args:
        dict1: dictionary
        dict1: dictionary
        
    Returns:
        Boolean.     
    """
    dict1_keys_len = len(dict1.keys())
    dict2_keys_len = len(dict2.keys())
    return dict1_keys_len == dict2_keys_len

## Model testing

### Mushrooms

In [None]:
mushroom_tree = learn(X_no_qmarks_fact_train, y_no_qmarks_fact_train)
print(accuracy(X_no_qmarks_fact_test, y_no_qmarks_fact_test, mushroom_tree))
print(mushroom_tree)

In [None]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_no_qmarks_fact_train, y_no_qmarks_fact_train)
print(clf.score(X_no_qmarks_fact_test, y_no_qmarks_fact_test))

### Tennis

In [54]:
tennis_tree = learn(X_tennis_enc_train, y_tennis_enc_train)
print(tennis_tree)

outlook
├── 0
│   └── temp
│       ├── 0
│       │   └── 0
│       └── 1
│           └── 1
├── 1
│   └── 1
└── 2
    └── windy
        ├── 0
        │   └── 1
        └── 1
            └── 0



In [55]:
tennis_acc = accuracy(X_tennis_enc_test, y_tennis_enc_test, tennis_tree)
print(tennis_acc)

0.6666666666666666


In [56]:
from sklearn import tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_tennis_enc_train, y_tennis_enc_train)
print(clf.score(X_tennis_enc_test, y_tennis_enc_test))

0.3333333333333333


NOTE: Not surprising considering the amount of data