# Decision Tree - Regression (ID3)

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Training data: four categorical predictors plus the numeric target
# 'Hours Played'. Note that 'Windy' holds the strings 'True'/'False',
# not booleans.
dataset = {
    'Outlook': [
        'Rainy', 'Rainy', 'Overcast', 'Sunny', 'Sunny', 'Sunny', 'Overcast',
        'Rainy', 'Rainy', 'Sunny', 'Rainy', 'Overcast', 'Overcast', 'Sunny',
    ],
    'Temperature': [
        'Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool',
        'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild',
    ],
    'Humidity': [
        'High', 'High', 'High', 'High', 'Normal', 'Normal', 'Normal',
        'High', 'Normal', 'Normal', 'Normal', 'High', 'Normal', 'High',
    ],
    'Windy': [
        'False', 'True', 'False', 'False', 'False', 'True', 'True',
        'False', 'False', 'False', 'True', 'True', 'False', 'True',
    ],
    'Hours Played': [25, 30, 46, 45, 52, 23, 43, 35, 38, 46, 48, 52, 44, 30],
}

In [3]:
# Dataset to DataFrame, with an explicit column order (target column last)
df = pd.DataFrame(
    dataset,
    columns=['Outlook', 'Temperature', 'Humidity', 'Windy', 'Hours Played'],
)

In [4]:
# Display the training table
df

Unnamed: 0,Outlook,Temperature,Humidity,Windy,Hours Played
0,Rainy,Hot,High,False,25
1,Rainy,Hot,High,True,30
2,Overcast,Hot,High,False,46
3,Sunny,Mild,High,False,45
4,Sunny,Cool,Normal,False,52
5,Sunny,Cool,Normal,True,23
6,Overcast,Cool,Normal,True,43
7,Rainy,Mild,High,False,35
8,Rainy,Cool,Normal,False,38
9,Sunny,Mild,Normal,False,46


In [5]:
# Name of the target column: the numeric value the regression tree predicts
label_col = 'Hours Played'

### Standard deviation for whole label column
<img src="./Images/sd.png">

### Standard deviation for two attributes (target and predictor)
<img src="./Images/sd2.png">

In [6]:
# Standard deviation
def standard_deviation(data, label_col, label=None, feature=None):
    """Return the standard deviation of the target column and the row count.

    Parameters
    ----------
    data : pandas.DataFrame
        Table holding the predictor columns and the target column.
    label_col : str
        Name of the numeric target column.
    label : str, optional
        Name of a predictor column to filter on. When None, the whole
        target column is used.
    feature : object, optional
        Value of the ``label`` column that selects the rows to measure.
        Only used when ``label`` is given.

    Returns
    -------
    tuple
        ``(stand_dev, count)``: the standard deviation computed with
        ``np.std`` (NumPy's default ``ddof=0``) and the number of rows
        it was computed over.
    """
    if label is None:
        # Whole target column
        target = data[label_col]
    else:
        # Select only the target column of the matching rows. Taking np.std
        # of the whole filtered frame would also try to aggregate the string
        # predictor columns and would not honour label_col.
        target = data.loc[data[label] == feature, label_col]
    stand_dev = np.std(target)
    count = len(target)
    return stand_dev, count

### Standard Deviation Reduction
<img src="./Images/sdr.png">

In [7]:
# Standard Deviation Reduction
def standard_deviation_reduction(data, label_col):
    """Compute the Standard Deviation Reduction (SDR) of each predictor column.

    For every column in ``data.columns[:-1]`` the SDR is the standard
    deviation of the target over ``data`` minus the size-weighted sum of the
    target's standard deviation within each unique value of that column.

    Parameters
    ----------
    data : pandas.DataFrame
        Table (or subset of it) to evaluate. The last column is skipped, so
        the target column is expected to be the last one.
    label_col : str
        Name of the numeric target column.

    Returns
    -------
    list
        One SDR value per column of ``data.columns[:-1]``, in the same order.
    """
    # List to grab Standard Deviation Reduction
    SD_tab = []
    # Standard deviation of the target over the rows passed in. This must use
    # `data`, not the notebook-level frame, so that subsets are measured
    # against their own baseline.
    S_all, _ = standard_deviation(data, label_col)
    n_rows = len(data)
    # For each column except the last (target) column
    for column in data.columns[:-1]:
        # List to grab the weighted standard deviation of each branch
        S_tab = []
        # For each unique value in column
        for unique in data[column].unique():
            # Standard deviation and row count for this value of the column
            S, c = standard_deviation(data, label_col, label=column, feature=unique)
            # Weight the branch deviation by its share of the rows
            S_tab.append((c / n_rows) * S)
        # Standard Deviation Reduction for this column
        SD_tab.append(S_all - np.sum(S_tab))
    return SD_tab

### Coefficient of Variation
<img src="./Images/cov.png">

In [8]:
# Coefficient of Variation
def coeffeicient_of_variation(data, label_col, label, feature):
    """Return the coefficient of variation, in percent, of one branch.

    The branch is the set of rows of ``data`` whose ``label`` column equals
    ``feature``; the result is the branch's target standard deviation divided
    by the branch's target mean, times 100.
    """
    branch_std, _ = standard_deviation(data, label_col, label, feature)
    branch_target = data[data[label] == feature][label_col]
    branch_mean = np.mean(branch_target)
    return branch_std / branch_mean * 100

In [9]:
# Threshold to stop iteration: a branch whose coefficient of variation
# (in percent) is below this value becomes a leaf
threshold = 10

# Build tree
def build_tree(data, tree_dict=None):
    """Recursively build a regression tree as nested dicts.

    The column with the largest Standard Deviation Reduction is chosen as the
    split. Each of its values becomes either a leaf (a number) or a subtree.

    Reads the notebook-level names ``label_col`` (target column name) and
    ``threshold`` (coefficient-of-variation cut-off in percent).

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to split; the target column is expected to be the last column.
    tree_dict : dict, optional
        Existing dict to add this node to. A new dict is created when None.

    Returns
    -------
    dict
        ``{winner_column: {value: leaf_or_subtree, ...}}`` where a leaf is the
        single remaining target value or the mean of the branch's targets.
    """
    # Standard Deviation Reduction of every predictor column
    SD = standard_deviation_reduction(data, label_col)
    # Winner column: the predictor with the largest reduction
    winner_col = data.columns[np.argmax(SD)]
    # If tree_dict is None, create tree
    if tree_dict is None:
        tree_dict = {}
    # setdefault so that a caller-supplied dict lacking this key does not
    # raise KeyError on the assignments below
    branches = tree_dict.setdefault(winner_col, {})
    # Each unique value in winner column
    for value in np.unique(data[winner_col]):
        # Coefficient of Variation of the branch for this value
        CV = coeffeicient_of_variation(data, label_col, label=winner_col, feature=value)
        # Part of the data that falls into this branch
        grab_part = data[data[winner_col] == value]
        # Distinct target values left in this branch
        unique_labels = np.unique(grab_part[label_col])
        # The split did not remove any row (winner column is constant here):
        # recursing would repeat the same split forever
        no_progress = len(grab_part) == len(data)
        # If only one target value is left stop iteration
        if len(unique_labels) == 1:
            # Add that value to the tree as a leaf
            branches[value] = unique_labels[0]
        # If CV < threshold, at most 3 rows are left, or the split made no
        # progress, stop iteration
        elif (CV < threshold) or (len(grab_part) <= 3) or no_progress:
            # Add mean of the branch targets to the tree as a leaf
            branches[value] = np.mean(grab_part[label_col])
        # Else recursively call function
        else:
            # Add the subtree built from this branch
            branches[value] = build_tree(grab_part)
    return tree_dict

In [10]:
# Build the tree from the full training table
tree = build_tree(df)

In [11]:
# Library to pretty-print nested structures
import pprint
# Print tree
pprint.pprint(tree)

{'Outlook': {'Overcast': 46.25,
             'Rainy': {'Temperature': {'Cool': 38, 'Hot': 27.5, 'Mild': 41.5}},
             'Sunny': {'Windy': {'False': 47.666666666666664, 'True': 26.5}}}}


In [12]:
# Prediction function
def predict(ex, tree):
    """Return the tree's prediction for a single example.

    Parameters
    ----------
    ex : mapping-like
        The example, indexable by column name (e.g. a pandas Series or dict).
    tree : dict
        Nested dict of the form ``{column: {value: leaf_or_subtree}}``.

    Returns
    -------
    object
        The leaf reached by following the example's values down the tree.

    Raises
    ------
    KeyError
        If the example lacks a column used by the tree, or holds a value for
        which the tree has no branch.
    ValueError
        If an empty dict is encountered where a node is expected.
    """
    node = tree
    # Walk down until a non-dict (leaf) is reached. isinstance also accepts
    # dict subclasses, which an exact type check would mistake for leaves.
    while isinstance(node, dict):
        if not node:
            raise ValueError("cannot predict from an empty tree node")
        # Each node holds a single split column
        column = next(iter(node))
        # Follow the branch matching the example's value for that column
        node = node[column][ex[column]]
    return node

In [13]:
# Test example: the row at position 6 of the training table
ex = df.iloc[6]
ex

Outlook         Overcast
Temperature         Cool
Humidity          Normal
Windy               True
Hours Played          43
Name: 6, dtype: object

In [14]:
# Predict the target value for the test example
predict(ex, tree)

46.25