# <center> Implementing Decision Trees </center>
## <center> INF283 - Project 1 </center>
### <center> Sindre E. de Lange </center>

In [2]:
import numpy as np
import os
import pandas as pd

## Getting data in order to test the model

In [4]:
! git clone https://github.com/sjwhitworth/golearn.git

Cloning into 'golearn'...


In [3]:
# Directory with the example datasets inside the cloned golearn repository
DATASET_PATH = "golearn/examples/datasets/"
# List the available dataset files
print(os.listdir(DATASET_PATH))

['articles.csv', 'c45-numeric.csv', 'chim.csv', 'exam.csv', 'exams.csv', 'house-votes-84.csv', 'iris.arff', 'iris.csv', 'iris_binned.csv', 'iris_headers.csv', 'iris_headers_subset.csv', 'iris_sorted_asc.csv', 'iris_sorted_desc.csv', 'mnist_test.csv', 'mnist_train.csv', 'randomdata.csv', 'sources.txt', 'tennis.csv', 'weather.arff']


In [4]:
# Load the tennis dataset from the dataset directory into a DataFrame
tennis_dataset = "tennis.csv"
dataset_tennis = pd.read_csv(DATASET_PATH + tennis_dataset)

In [5]:
# Preview the first rows of the dataset
dataset_tennis.head()

Unnamed: 0,outlook,temp,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes


In [6]:
# Column dtypes and non-null counts
dataset_tennis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 5 columns):
outlook     14 non-null object
temp        14 non-null object
humidity    14 non-null object
windy       14 non-null bool
play        14 non-null object
dtypes: bool(1), object(4)
memory usage: 542.0+ bytes


In [7]:
# Per-column summary: count, number of unique values, most frequent value and its frequency
dataset_tennis.describe()

Unnamed: 0,outlook,temp,humidity,windy,play
count,14,14,14,14,14
unique,3,3,2,2,2
top,sunny,mild,high,False,yes
freq,5,6,7,8,9


In [8]:
# Separate the learning features (every column except 'play') from the target feature ('play')
X = dataset_tennis.drop(['play'], axis=1)
y = dataset_tennis['play']

In [9]:
# Take real copies: plain assignment would only bind a second name to the same
# objects, so encoding X_copy in place would silently overwrite X as well.
X_copy = X.copy()
y_copy = y.copy()

In [10]:
# Replace every feature column by integer codes (one code per distinct value).
# NOTE: unique_x is overwritten on every iteration, so after the loop it only
# holds the distinct values of the last column.
for columns in X:
    X_copy[columns], unique_x = pd.factorize(X[columns])

In [11]:
# Encode the target labels as integer codes; unique_y keeps the distinct original labels
y_copy, unique_y = pd.factorize(y)

In [20]:
# Inspect the encoded features and their shape
print(X_copy)
print(X_copy.shape)

    outlook  temp  humidity  windy
0         0     0         0      0
1         0     0         0      1
2         1     0         0      0
3         2     1         0      0
4         2     2         1      0
5         2     2         1      1
6         1     2         1      1
7         0     1         0      0
8         0     2         1      0
9         2     1         1      0
10        0     1         1      1
11        1     1         0      1
12        1     0         1      0
13        2     1         0      1
(14, 4)


In [21]:
# Inspect the encoded target and its shape
print(y_copy)
print(y_copy.shape)

[0 0 1 1 1 0 1 0 1 1 1 1 1 0]
(14,)


In [22]:
from sklearn import tree
from sklearn.model_selection import train_test_split

# Fit scikit-learn's decision tree on the encoded data and report its accuracy
# on a 30% hold-out split.
# random_state is fixed on the classifier as well as on the split: the
# classifier permutes the features randomly at each split, so without a seed
# the fitted tree (and the printed score) may differ between runs.
clf = tree.DecisionTreeClassifier(random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_copy, y_copy, test_size=0.3, random_state=42)
clf = clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

0.6


## Data for verifying the model *check*

# Model implementation

## 1. Implement the ID3 algorithm from scratch

In [444]:
def calc_entropy(p):
    """Return the entropy term -p * log2(p) for a single fraction p.

    A fraction of exactly 0 yields 0, which avoids evaluating log2(0).
    """
    return 0 if p == 0 else -p * np.log2(p)

In [445]:
def calc_entropy_system(data):
    """Entropy of the whole system.

    `data` is the target variable. Every distinct label contributes the
    entropy term of its share of all entries; the terms are summed up.
    """
    n_entries = len(data)
    _, label_counts = np.unique(data, return_counts=True)
    return sum(calc_entropy(count / n_entries) for count in label_counts)

In [446]:
import collections

def calc_entropy_dataset(data, target_variable):
    """Information gain of every feature column of `data`.

    `data` is a DataFrame and `target_variable` the name of its target
    column. Returns the dictionary produced by calc_entropy_all_branches,
    i.e. {feature name: information gain}.
    """
    feature_frame = data.drop([target_variable], axis=1)
    target = data[target_variable]

    # Entropy of the target before any split
    system_entropy = calc_entropy_system(target)

    # For each feature: a Series of (feature value, target value) tuples, one per row
    paired_with_target = {
        column: data[[column, target_variable]].apply(tuple, axis=1)
        for column in feature_frame
    }

    per_value_stats = calc_entropy_feature(paired_with_target, len(target))
    return calc_entropy_all_branches(per_value_stats, system_entropy)

In [447]:
def calc_entropy_feature(X_y_zip, tot_num_occurences):
    """Calculates the entropy for each unique value of each feature in a dataset.

    Parameters:
        X_y_zip: dictionary mapping each feature name to an iterable of
            (feature value, target value) pairs, one pair per row.
        tot_num_occurences: number of rows in the dataset; it is stored
            unchanged in every result list.

    The target may have any number of classes with any hashable labels: the
    entropy of a feature value is taken over the full distribution of the
    target labels that occur together with that value (no label is assumed
    to be encoded as 1).

    Returns a dictionary on the format
        {'column feature':
            {unique value: [
                number of times this value occured in the set
                number of values in the set
                entropy when this value occured in the set
            ], ... }}"""
    columns_entropy = {}
    for feature, pairs in X_y_zip.items():
        # Single pass over the rows: unique value -> Counter of target labels.
        # A plain dict keeps the unique values in first-seen order.
        target_counts_per_val = {}
        for pair in pairs:
            target_counts_per_val.setdefault(pair[0], collections.Counter())[pair[1]] += 1

        val_dict = {}
        for val, target_counts in target_counts_per_val.items():
            # Total number of rows in which this unique value occurs
            num_days_val = sum(target_counts.values())
            # Every counted label occurred at least once, so each fraction is
            # strictly positive and log2 is always defined here
            fractions = np.array(list(target_counts.values())) / num_days_val
            # "+ 0.0" turns the -0.0 of a pure value into a plain 0.0
            val_entropy = float(-np.sum(fractions * np.log2(fractions))) + 0.0
            # Relevant data for this unique value, keyed by the value itself
            val_dict[val] = [num_days_val, tot_num_occurences, val_entropy]
        # Collect the per-value data under their respective feature
        columns_entropy[feature] = val_dict
    return columns_entropy

In [448]:
def calc_entropy_all_branches(data, entropy_src):
    """Information gain of every feature, given its per-value entropies.

    `data` must have the format
        {'column feature':
            {unique value: [
                number of times this value occured in the set
                number of values in the set
                entropy when this value occured in the set
            ], ... }}
    and `entropy_src` is the entropy of the whole system.

    Returns {'column feature': information gain}.
    """
    # Weighted entropy per feature: each unique value (e.g. sunny, rainy)
    # contributes its entropy scaled by its share of the set.
    # NOTE: relies on the list layout described in the docstring.
    weighted_entropy = {
        feature: sum(
            (stats[0] / stats[1]) * stats[2] for stats in per_value.values()
        )
        for feature, per_value in data.items()
    }

    return {
        feature: randomness_reduction(entropy_src, branch_entropy)
        for feature, branch_entropy in weighted_entropy.items()
    }

In [449]:
def randomness_reduction(entropy_src, entropy_branch):
    """Information Gain, i.e. the reduction in randomness.

    Subtracts the entropy of one branch from the entropy of the entire
    system and returns the difference rounded to 3 decimals.
    """
    information_gain = entropy_src - entropy_branch
    return round(information_gain, 3)

In [450]:
def getLargestInformationGain(data, target_variable):
    """Name of the feature with the largest Information Gain in `data`.

    `data` is a Pandas DataFrame, `target_variable` the name of its target
    column. On ties the feature that comes first wins.
    """
    gain_per_feature = calc_entropy_dataset(data, target_variable)
    best_feature, _ = max(gain_per_feature.items(), key=lambda item: item[1])
    return best_feature

In [451]:
# No cell in this notebook defines dataset_tennis_copy, so on a fresh kernel
# the call has to use the DataFrame that was actually loaded: dataset_tennis.
getLargestInformationGain(dataset_tennis, 'play')

'outlook'

# Notes:
- The best feature to split on is the one with the highest information gain, i.e. the one whose split leaves the lowest weighted entropy in the resulting branches
    - After splitting on the best feature, recompute the information gain of the remaining features within each branch and again pick the one with the highest gain

In [None]:
def learn(X, y, impurity_measure='entropy'):
    """Function that learns a decision tree classifier from X and y.
        Default impurity measure for information gain is Entropy.

        NOTE: not implemented yet - the body consists of this docstring only,
        so calling the function returns None."""
    # Make a leaf for each class - true "purity"
    # Will implement pruning later

In [395]:
def predict(x, tree):
    """Predict class label of some new data point x.

    NOTE: not implemented yet - the body consists of this docstring only,
    so calling the function returns None."""

Create a tree: (1:16:11, https://www.youtube.com/watch?v=3jl2h9hSRvc&feature=youtu.be)

In [396]:
# https://stackoverflow.com/questions/41760856/most-simple-tree-data-structure-in-python-that-can-be-easily-traversed-in-both
class Tree(object):
    """Minimal tree node that can be traversed downwards and upwards.

    Attributes:
        data: payload stored in this node.
        children: list of child Tree nodes, in insertion order.
        parent: parent Tree node, or None for the root.
    """

    def __init__(self, data, children=None, parent=None):
        """Create a node.

        Parameters:
            data: payload of the node.
            children: optional iterable of Tree nodes to attach as children;
                each of them gets this node as its parent.
            parent: optional parent Tree node.
        """
        self.data = data
        # A list (not a dict): add_child() appends to it and __str__ walks it in order
        self.children = []
        self.parent = parent
        for child in children or ():
            child.parent = self
            self.children.append(child)

    def add_child(self, data):
        """Wrap `data` in a new node, attach it as the last child and return it."""
        new_child = Tree(data, parent=self)
        self.children.append(new_child)
        return new_child

    def is_root(self):
        """True when the node has no parent."""
        return self.parent is None

    def is_leaf(self):
        """True when the node has no children."""
        return not self.children

    def __str__(self):
        """ToString method: 'data' for a leaf, otherwise 'data [child, child, ...]'."""
        if self.is_leaf():
            return str(self.data)
        return '{data} [{children}]'.format(data=self.data, children=', '.join(map(str, self.children)))

# TODO
1. Check data, potentially implement categorical --> numerical categories 

In [397]:
# Smoke test: a single node without parent or children
myTree = Tree('foo')

In [398]:
# A node without children prints as its data only
print(myTree)

foo
