In [1]:
import numpy as np
import pandas as pd
import matplotlib as mp

In [2]:
# The Dataframe used in Decision Tree should have the label in the last column

training_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 3, 'Apple'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
]

In [34]:
training_dataframe = pd.DataFrame(training_data)
training_dataframe = training_dataframe.rename(columns = {0: "color", 1: "diameter", 2: "label"})
training_dataframe

Unnamed: 0,color,diameter,label
0,Green,3,Apple
1,Yellow,3,Apple
2,Red,1,Grape
3,Red,1,Grape
4,Yellow,3,Lemon


In [57]:
# Search Unique Value in a Column of a Pandas Dataframe
def get_column_unique_value(dataframe, col_name):
    unique_values = []
    
    for index, rows in dataframe.iterrows():
        unique_values.append(rows.loc[col_name])
    
    return set(unique_values)

In [59]:
get_column_unique_value(training_dataframe, "label")

{'Apple', 'Grape', 'Lemon'}

In [37]:
# Count the Number of Label in a Pandas Dataframe
def get_class_count(dataframe):
    class_count = {}
    
    for index, rows in dataframe.iterrows():
        label = rows[-1]
        if label not in class_count:
            class_count[label] = 0
        class_count[label] += 1
    
    return class_count

In [38]:
get_class_count(training_dataframe)

{'Apple': 2, 'Grape': 2, 'Lemon': 1}

In [39]:
# Calculate the Gini_Impurity of a Specific Node
def gini_impurity(dataframe):
    class_count = get_class_count(dataframe)
    gini_impurity = 1
    
    for label in class_count:
        label_prob = class_count[label]/float(len(dataframe))
        gini_impurity -= label_prob**2
    
    return gini_impurity

In [40]:
gini_impurity(training_dataframe)

0.6399999999999999

In [43]:
# Claculate the Information Gain of a Split
def information_gain(left_dataframe, right_dataframe, current_dataframe):
    
    current_gini = gini_impurity(current_dataframe)
    left_gini = gini_impurity(left_dataframe)
    right_gini = gini_impurity(right_dataframe)
    
    left_weight = float(len(left_dataframe))/float(len(current_dataframe))
    information_gain = current_gini - (left_weight * left_gini + (1 - left_weight) * right_gini)
    
    return information_gain

In [42]:
len(training_dataframe.columns)-1

2

In [102]:
# Split Data in Columns with Numerical Variable
def number_split(dataframe, col_name, value):

    right_dataframe = dataframe.loc[dataframe[col_name] > value]
    left_dataframe = dataframe.loc[dataframe[col_name] <= value]
    return right_dataframe, left_dataframe

In [110]:
# Split Data in Columns with Categorical Variable
def categorical_split(dataframe, col_name, value):
    
    right_dataframe = dataframe.loc[dataframe[col_name] == value]
    left_dataframe = dataframe.loc[dataframe[col_name] != value]
    return right_dataframe, left_dataframe

In [120]:
# Finding the best split for each dataframe
def find_best_split(dataframe):
    
    best_information_gain = 0
    best_label_to_split = None
    best_value_to_split = None
    current_gini = gini_impurity(dataframe)
    n_features = training_dataframe.columns.drop("label") # Finding the number of feature, -1 for label columns
    
    for col_name in n_features:
        unique_values = get_column_unique_value(dataframe, col_name)
        
        for values in unique_values:
            
            number_or_not = isinstance(values, int) or isinstance(values, float)
            
            if number_or_not == True:
                
                right_dataframe, left_dataframe = number_split(dataframe, col_name, values)
                
                infor_gain = information_gain(left_dataframe, right_dataframe, dataframe)
                
            else: 
                right_dataframe, left_dataframe = number_split(dataframe, col_name, values)
                
                infor_gain = information_gain(left_dataframe, right_dataframe, dataframe)
            
            if infor_gain > best_information_gain:
                
                best_information_gain = infor_gain
                best_label_to_split = col_name
                best_value_to_split = values
                
    
    return best_label_to_split, best_value_to_split, best_information_gain

In [121]:
find_best_split(training_dataframe)

('diameter', 1, 0.37333333333333324)

In [None]:
def build_trees(dataframe):
    