# <center> Implementing Decision Trees </center>
## <center> INF283 - Project 1 </center>
### <center> Sindre E. de Lange </center>

# Q

> Calculate the Gini Index? Do we have to change anything more than just the calculation: entropy to Gini? Calculate the Gini for the system, etc.? <br>
> **NOTE** Lowest GINI = max IG, so do not need to change <br>
> How to compare our classifier with scikitlearns? Can we use .score, etc.?

In [None]:
# Uncomment if needing to install treelib
#! pip install treelib

In [2]:
import numpy as np
import pandas as pd

import os
import sys

In [3]:
# To import ImpurityMeasure and DataCleaning
sys.path.append("../classes/")

In [11]:
from ImpurityMeasure import *
from DataCleaning import *
# Utilizing the simple tree library that is "Treelib"
from treelib import Node, Tree
from sklearn.model_selection import train_test_split

In [5]:
d_clean = DataCleaning()

## Getting data in order to test the model

In [6]:
DATASET_PATH = "../csv/"
print(os.listdir(DATASET_PATH))

['mushrooms.csv', 'tennis.csv']


In [7]:
mushrooms_dataset = 'mushrooms.csv'
dataset_mushrooms = pd.read_csv(DATASET_PATH + mushrooms_dataset)

In [None]:
dataset_mushrooms.shape()

In [8]:
data_shrooms_no_qmarks = d_clean.removeQmarksDf(dataset_mushrooms)
data_shrooms_fact_no_qmarks = d_clean.factorizeDf(data_shrooms_no_qmarks)

In [9]:
data_shrooms_fact_no_qmarks.shape

(5644, 23)

In [10]:
target_var = 'class'
X_no_qmarks_fact = data_shrooms_fact_no_qmarks.drop([target_var], axis=1)
y_no_qmarks_fact = data_shrooms_fact_no_qmarks[target_var]

In [16]:
X_no_qmarks_fact_train, X_no_qmarks_fact_test, y_no_qmarks_fact_train, y_no_qmarks_fact_test = train_test_split(X_no_qmarks_fact, y_no_qmarks_fact, test_size=0.4, random_state=42, stratify=y_no_qmarks_fact)

## Tennis dataset

In [18]:
tennis_dataset = "tennis.csv"
dataset_tennis = pd.read_csv(DATASET_PATH + tennis_dataset)

In [None]:
dataset_tennis.head()

In [19]:
data_tennis_factorized = d_clean.factorizeDf(dataset_tennis)

In [None]:
dataset_tennis_factorized.head()

In [20]:
target_var = 'play'
X_tennis_enc = data_tennis_factorized.drop([target_var], axis=1)
y_tennis_enc = data_tennis_factorized[target_var]

In [None]:
from sklearn import tree
from sklearn.model_selection import train_test_split
clf = tree.DecisionTreeClassifier()
X_train, X_test, y_train, y_test = train_test_split(X_tennis_enc, y_tennis_enc, test_size=0.3, random_state=42)
clf = clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

## Data for verifying the model *check*

# Model implementation

In [716]:
def learn(X, y, imp_measure_alt='entropy', pruning=False, pruning_amount=0.2):
    """ Learn
    
    Learns a decision tree classifier from data.
    NOTE: Expects cleaned data, in particular categorical, discrete values (pd.factorize)
    
    Args:
        X: pandas dataframe
        y: pandas series
        imp_measure_alt: String. How to calculate the information gain for the datasets column.
            Either 'entropy' (standard) or 'gini'
        pruning: Boolean. To use pruning, or not to use pruning - that is the question
        pruning_amount: Float. Percentage distribution of the training dataset
        
        Returns:
            treelib.tree.Tree. Tree classifier learned from data
        """
    
    # Divide into training and pruning dataset
    if pruning:
        X, X_prune, y, y_prune= train_test_split(X, y, test_size=pruning_amount, random_state=42, stratify=y)
    
    imp_measure = ImpurityMeasure(imp_measure_alt)
    tree = Tree()
    tree = make_tree(X, y, tree, imp_measure)
    if pruning:
        new_tree = prune(X_prune, y_prune, tree)
    return tree

In [715]:
def prune(X_prune, y_prune, tree):
    """ Prune
    
    Prunes a tree, e.g. removes seemingly unecessary nodes while still maintaing acuraccy
    
    Args:
        X_prune: pandas dataframe
        y_prune: pandas series
        tree: treelib.tree.Tree
    
    Returns:
        treelib.tree.Tree. Optimal tree after pruning
    """

    # When to stop? When there is not improvement, i.e. when best tree = orig tree
    best_tree = None

    for leaf_nodes in tree.leaves(tree.root):
        # Get parent pointer for the nodes
        parent_node_ref = leaf_nodes.bpointer
        parent_node = tree.get_node(parent_node_ref)
        # Edge case: When the parent node is the root node
        if parent_node_ref is not tree.root:
            # Create a copy of the inputed tree
            orig_tree = Tree(tree.subtree(tree.root), deep=True)
            # Create more copies, so that one can compare acuraccy after pruning
            copy_0 = Tree(orig_tree.subtree(orig_tree.root), deep=True)
            copy_1 = Tree(orig_tree.subtree(orig_tree.root), deep=True)
            # Set up the new trees
            new_tree_0 = edit_child_nodes(copy_0, parent_node, 0)
            new_tree_1 = edit_child_nodes(copy_1, parent_node, 1)
            # Calculate acuraccy for new trees
            acc_orig = acuraccy(X_prune, y_prune, orig_tree)
            acc_copy_0 = acuraccy(X_prune, y_prune, new_tree_0)
            acc_copy_1 = acuraccy(X_prune, y_prune, new_tree_1)
        
            print("Acuraccy for original tree: ", acc_orig, 
              "\n acuraccy for edited tree, 0: ", acc_copy_0, 
              "\n acuraccy for edited tree, 1:", acc_copy_1)
            best_tree, best_acc, string = findBestTree(acc_copy_0, new_tree_0, acc_copy_1, new_tree_1, acc_orig, orig_tree)
            print("Picked therefore: ", string, ", with acc: ", best_acc)
            # If best tree == orig tree --> no improvement - return
            # But how to check if the same tree? Should be "is"
            if best_tree is orig_tree:
                break
            else:
                prune(X_prune, y_prune, best_tree)
        else:
            continue
    return best_tree

In [717]:
def findBestTree(tree0_acc, tree0, tree1_acc, tree1, origin_acc, origin):
    """ Find The Best Tree
    
    Finds the best tree, based on their acuraccies
    
    Args:
        tree0_acc: float
        tree0: treelib.tree.Tree
        tree1_acc: float
        tree1: treelib.tree.Tree
        origin_acc: float
        origin: treelib.tree.Tree
        
    Returns:
        best_copy/origin: treelib.tree.Tree
        copy_acc/origin_acc: float
        string/"origin": String
    """
    best_copy, copy_acc, tree_name = (tree0, tree0_acc, "tree0") if tree0_acc >= tree1_acc else (tree1, tree1_acc, "tree1")
    return (best_copy, copy_acc, tree_name) if copy_acc >= origin_acc else (origin, origin_acc, "origin")

In [711]:
def acuraccy(X_prune, y_prune, this_tree):
    """ Acuraccy
    
    Calculates the acuraccy: Number of errors/total data length
    
    Args:
        X_prune: pandas dataframe
        y_prune: pandas series
        this_tree: treelib.tree.Tree
        
    Returns:
        float
    """
    data_len = y_prune.size
    errors = 0
    # TODO: Change this to enumerate or something
    for i in range(data_len):
        predicted_label = predict(X_prune.iloc[i], this_tree)
        if predicted_label != y_prune.iloc[i]:
            errors += 1
    return (errors/data_len)

In [712]:
def edit_child_nodes(tree, node, data):
    # Change the parent nodes data to a label = data
    # Seems like there is no "set data" in treelib = make a new node in its place
    print(tree)
    print(node.tag)
    tree.create_node(tag=node.tag, data=data, parent=node.bpointer)
    tree.remove_node(node.identifier)
    # This means there is no need to delete children - implicitly deleted
    
    # Return the updated tree
    return tree

In [713]:
learn(X_no_qmarks_fact_train, y_no_qmarks_fact_train, imp_measure_alt="entropy", pruning=True)

odor
├── 0
│   └── 0
├── 1
│   └── 1
├── 2
│   └── 1
├── 3
│   └── spore-print-color
│       ├── 0
│       │   └── 1
│       ├── 1
│       │   └── 1
│       ├── 4
│       │   └── 0
│       └── 5
│           └── cap-color
│               ├── 0
│               │   └── 1
│               ├── 1
│               │   └── 0
│               ├── 2
│               │   └── 0
│               ├── 3
│               │   └── 1
│               ├── 5
│               │   └── 1
│               └── 7
│                   └── 1
├── 4
│   └── 0
├── 5
│   └── 0
└── 6
    └── 0

0
odor
├── 0
│   └── 0
├── 1
│   └── 1
├── 2
│   └── 1
├── 3
│   └── spore-print-color
│       ├── 0
│       │   └── 1
│       ├── 1
│       │   └── 1
│       ├── 4
│       │   └── 0
│       └── 5
│           └── cap-color
│               ├── 0
│               │   └── 1
│               ├── 1
│               │   └── 0
│               ├── 2
│               │   └── 0
│               ├── 3
│               │   └── 1
│               ├── 5
│    

Acuraccy for original tree:  0.803834808259587 
 acuraccy for edited tree, 0:  0.8200589970501475 
 acuraccy for edited tree, 1: 0.38200589970501475
Picked therefore:  tree0 , with acc:  0.8200589970501475
odor
├── 0
├── 1
├── 2
├── 3
│   └── spore-print-color
├── 4
│   └── 0
├── 5
│   └── 0
└── 6
    └── 0

3
odor
├── 0
├── 1
├── 2
├── 3
│   └── spore-print-color
├── 4
│   └── 0
├── 5
│   └── 0
└── 6
    └── 0

3
Acuraccy for original tree:  0.8200589970501475 
 acuraccy for edited tree, 0:  0.11504424778761062 
 acuraccy for edited tree, 1: 0.11504424778761062
Picked therefore:  origin , with acc:  0.8200589970501475
odor
├── 0
├── 1
├── 2
├── 3
│   └── spore-print-color
│       ├── 0
│       ├── 1
│       ├── 4
│       │   └── 0
│       └── 5
│           └── cap-color
│               ├── 0
│               │   └── 1
│               ├── 1
│               │   └── 0
│               ├── 2
│               │   └── 0
│               ├── 3
│               │   └── 1
│               ├── 5
│   

KeyboardInterrupt: 

PyDoc: <br>
Title --> to, tre ord <br>
Explanation --> enkel forklaring på hva metoden gjør <br>
Parametre: <br>
1. <br>
2. <br>
.... <br>
Return type: returnerer hva, hvilken type er det

In [526]:
def make_tree(X, y, tree, imp_measure, current_node=None):
    """Recursive method to make a tree
    X, y = the data, training- and target variables, respectively
    impurity_measer = the desired measure of impurity (entropy or gini index)
    tree = the tree object one wishes to populate
    current_node = the current node one stands one"""
    # Edge case: tree not initialized - store whole dataset in root
    # Also necessary to get a reference to root node (because of identifier)
    # Not necessary, however too little time to fix 
    if current_node is None:
        # Combine data to one dataset
        data = pd.concat([X, y], axis=1)
        # Set root node name to something generic that is easy to get
        root_node_name = "root_node"
        # Get best split
        root_node_tag = imp_measure.getLargestInformationGain(X, y)
        # Make the root node, store the entire dataset
        tree.create_node(tag=root_node_tag, identifier=root_node_name, data=data)
        # Get a reference to the root node
        current_node = tree.get_node(root_node_name) 
        # Call recursive method
        return make_children(X=X, y=y, tree=tree, imp_measure=imp_measure, current_node=current_node)
    # Tree is initialized
    else:
         # Edge cases - no children to make:
        # 1. Unique values in target variable = y
        if len(set(y)) == 1:
            (element, ) = set(y)
            # Make a node of the leaf
            # and set its single unique value as its name, and value
            node_name = str(element)
            data = element
            tree.create_node(tag=node_name, data=data, parent=current_node)
            return tree
        # 2. No columns left in X, i.e. splitted on entire dataset
        elif len(X.columns) == 0:
            # Set to majority in y
            data = y.max()
            node_name = str(data)
            tree.create_node(tag=node_name, data=data, parent=current_node)
            return tree
        else:
            node_name = imp_measure.getLargestInformationGain(X, y)
            current_node = tree.create_node(tag=node_name, data=current_node.data, parent=current_node)
            return make_children(X=X, y=y, tree=tree, imp_measure=imp_measure, current_node=current_node)

In [521]:
def make_children(X, y, tree, imp_measure, current_node):
    """Identify and initialize the children for a specific node
    X, y = dataset of current node
    impurity_measer = the desired measure of impurity (entropy or gini index)
    tree = the tree object one wishes to populate
    current_node = the current node one stands one, and wishes to define children"""
    # For each unique value in the parents column - make a child node
    child_list = list(set(X[current_node.tag]))
    data = pd.concat([X, y], axis=1)
    for value in child_list:
        node_name = str(value)
        # Split dataset
        data_loc = split_data(value, data, current_node.tag)
        # Remove parent column - for children nodes
        tree.create_node(tag=node_name, data=data_loc, parent=current_node)
    # Need referece to each child node, therefore new loop
    for children_node in current_node.fpointer:
        current_node = tree.get_node(children_node)
        X = current_node.data.drop([y.name], axis=1)
        y = current_node.data[y.name]
        tree = make_tree(X=X, y=y, tree=tree, imp_measure=imp_measure, current_node=current_node)
             
    return tree

In [522]:
def split_data(value, data, column):
    """Splits the dataframe such that it return the rows, where the specified
    columns value == value"""
    data_loc = data.loc[data[column] == value]
    return data_loc

In [523]:
def predict(x, tree):
    """Predict class label of some new data point x."""
    
    current_node = tree.get_node(tree.root)
    classification_value = getClassificationLabel(x=x, tree=tree, current_node=current_node)
    return classification_value

In [524]:
def getClassificationLabel(x, tree, current_node):
    """x = inputed data to classify
    current_node = current node to inspect (its children)"""
    # Current node's name is the name of the column that is (presumably) best to split on
    # Note - after a small change to the tree setup, we have to use the parents
    # name, if one is at a node that represents a unique value from parents column
    current_node_tag = current_node.tag
    if current_node_tag.isdigit():
        # Get parents name 
        parent = tree.get_node(current_node.bpointer)
        split_column = parent.tag
    else:
        split_column = current_node.tag
    # Find the children node which has the same value, in the same column
    correct_children_node = None
    # Find the column in the inputed dataset = x, and get its value
    val_x = x.get(split_column)
    # Hacky, but does not work to return inside a for loop, and no time to change logic
    correct_label = None
    # Loop through the children of current node
    for node_children in current_node.fpointer:
        # node_children is only identifier to children - need to get the actual references to those children
        current_children_node = tree.get_node(node_children)
        # Check if children node is leaf - return its data = classification label
        if current_children_node.is_leaf():
            correct_label = current_children_node.data
            break
        # If a node's column = parents tag contains val_x --> correct child node
        elif val_x in current_children_node.data[split_column].values:
            correct_children_node = current_children_node
            break
        else:
            #There is no child node with that specific value in that column
            # Set label to majority in current node
            target_variabel = current_node.data.drop(columns=x.index, axis=1)
            # So hacky I might throw up, but when I get a df, not a series.. 
            correct_label = getMostFrequentValueFromPandasDFThatShouldBeASeries(target_variabel)
          
    if correct_label is None:
        return getClassificationLabel(x=x, tree=tree, current_node=correct_children_node)
    
    return correct_label

In [525]:
def getMostFrequentValueFromPandasDFThatShouldBeASeries(df):
    column_name = df.columns.values[0]
    return list((df[column_name].value_counts()).index)[0]

In [313]:
def isLeaf(node):
    if len(node.fpointer) == 0:
        return True
    elif len(set(node.data['play'])) == 1:
        return True
    else:
        return False

In [319]:
predict(not_optimal_test_data, tree_tennis)

0

In [197]:
tree = learn(X_shrooms, y_shrooms)
print(tree)

NameError: name 'X_shrooms' is not defined

In [198]:
not_optimal_test_data = X_shrooms_fact.iloc[5]
print(y_shrooms_fact.iloc[5])

NameError: name 'X_shrooms_fact' is not defined

In [471]:
y = data_shrooms_fact_no_qmarks.drop(columns=X_no_qmarks_fact_train.columns, axis=1)

In [506]:
print(getMostFrequentValueFromPandasDFThatShouldBeASeries(y))

1


In [481]:
column_name = y.columns.values[0]
print(list(y[column_name].value_counts())[0])

In [504]:
print(list((y[column_name].value_counts()).index)[0])

1


In [453]:
string = (y.columns.values)
print(type(string[0]))

<class 'str'>


In [470]:
from scipy.stats import mode
print(list(mode(y[string[0]])))

[array([1], dtype=int64), array([3488])]


In [461]:
string

array(['class'], dtype=object)

In [714]:
tree_shrooms = learn(X_no_qmarks_fact_train, y_no_qmarks_fact_train)
print(type(tree_shrooms))

<class 'treelib.tree.Tree'>


In [651]:
tree_tennis_1 = learn(X_tennis_enc, y_tennis_enc, imp_measure_alt)
tree_tennis_2 = learn(X_tennis_enc, y_tennis_enc, imp_measure_alt)
print(tree_tennis_1 is tree_tennis_2)

False


In [638]:
tree_tennis.root

'root_node'

In [640]:
tree_tennis.get_node(tree_tennis.root)

Node(tag=outlook, identifier=root_node, data=    outlook  temp  humidity  windy  play
0         0     0         0      0     0
1         0     0         0      1     0
2         1     0         0      0     1
3         2     1         0      0     1
4         2     2         1      0     1
5         2     2         1      1     0
6         1     2         1      1     1
7         0     1         0      0     0
8         0     2         1      0     1
9         2     1         1      0     1
10        0     1         1      1     1
11        1     1         0      1     1
12        1     0         1      0     1
13        2     1         0      1     0)

In [411]:
not_optimal_test_data = data_tennis_factorized.iloc[0]

In [419]:
print(not_optimal_test_data.index)

Index(['outlook', 'temp', 'humidity', 'windy', 'play'], dtype='object')


In [403]:
print(data_tennis_factorized.columns)

Index(['outlook', 'temp', 'humidity', 'windy', 'play'], dtype='object')


In [405]:
y = data_tennis_factorized.drop(columns=X_tennis_enc.columns, axis=1)

In [406]:
print(y)

    play
0      0
1      0
2      1
3      1
4      1
5      0
6      1
7      0
8      1
9      1
10     1
11     1
12     1
13     0
