In [2]:
import numpy as np
import pandas as pd
import matplotlib as mp

In [46]:
# The Dataframe used in Decision Tree should have the label in the last column

training_data = [
    ['Green', 3, 'Apple'],
    ['Yellow', 3, 'Apple'],
    ['Red', 1, 'Grape'],
    ['Red', 1, 'Grape'],
    ['Yellow', 3, 'Lemon'],
    ['Green', 3, 'Apple'],
    ['Red', 3, 'Apple'],
    ['Green', 1, 'Grape'],
    ['Yellow', 1, 'Grape'],
    ['Red', 4, 'Apple'],
    ['Yellow', 2, 'Grape'],
    ['Green', 4, 'Apple'],
    ['Green', 1.5, 'Grape'],
    ['Yellow', 4, 'Lemon'],
    ['Yellow', 3, 'Lemon'],
]

In [47]:
training_dataframe = pd.DataFrame(training_data)
training_dataframe = training_dataframe.rename(columns = {0: "color", 1: "diameter", 2: "label"})
training_dataframe

Unnamed: 0,color,diameter,label
0,Green,3.0,Apple
1,Yellow,3.0,Apple
2,Red,1.0,Grape
3,Red,1.0,Grape
4,Yellow,3.0,Lemon
5,Green,3.0,Apple
6,Red,3.0,Apple
7,Green,1.0,Grape
8,Yellow,1.0,Grape
9,Red,4.0,Apple


In [5]:
# Search Unique Value in a Column of a Pandas Dataframe
def get_column_unique_value(dataframe, col_name):
    unique_values = []
    
    for index, rows in dataframe.iterrows():
        unique_values.append(rows.loc[col_name])
    
    return set(unique_values)

In [48]:
get_column_unique_value(training_dataframe, "label")

{'Apple', 'Grape', 'Lemon'}

In [7]:
# Count the Number of Label in a Pandas Dataframe
def get_class_count(dataframe):
    class_count = {}
    
    for index, rows in dataframe.iterrows():
        label = rows[-1]
        if label not in class_count:
            class_count[label] = 0
        class_count[label] += 1
    
    return class_count

In [49]:
get_class_count(training_dataframe)

{'Apple': 6, 'Grape': 6, 'Lemon': 3}

In [9]:
# Calculate the Gini_Impurity of a Specific Node
def gini_impurity(dataframe):
    class_count = get_class_count(dataframe)
    gini_impurity = 1
    
    for label in class_count:
        label_prob = class_count[label]/float(len(dataframe))
        gini_impurity -= label_prob**2
    
    return gini_impurity

In [10]:
gini_impurity(training_dataframe)

0.6399999999999999

In [11]:
# Claculate the Information Gain of a Split
def information_gain(left_dataframe, right_dataframe, current_dataframe):
    
    current_gini = gini_impurity(current_dataframe)
    left_gini = gini_impurity(left_dataframe)
    right_gini = gini_impurity(right_dataframe)
    
    left_weight = float(len(left_dataframe))/float(len(current_dataframe))
    information_gain = current_gini - (left_weight * left_gini + (1 - left_weight) * right_gini)
    
    return information_gain

In [12]:
len(training_dataframe.columns)-1

2

In [13]:
def dataframe_split(dataframe, col_name, value):
    
    number_or_not = isinstance(value, int) or isinstance(value, float)
    
    if number_or_not == True:
        right_dataframe = dataframe.loc[dataframe[col_name] > value]
        left_dataframe = dataframe.loc[dataframe[col_name] <= value]
        return right_dataframe, left_dataframe
    
    else:
        right_dataframe = dataframe.loc[dataframe[col_name] == value]
        left_dataframe = dataframe.loc[dataframe[col_name] != value]
        return right_dataframe, left_dataframe

In [14]:
# Finding the best split for each dataframe
def find_best_split(dataframe):
    
    best_information_gain = 0
    best_label_to_split = None
    best_value_to_split = None
    current_gini = gini_impurity(dataframe)
    n_features = training_dataframe.columns.drop("label") # Finding the number of feature, -1 for label columns
    
    for col_name in n_features:
        unique_values = get_column_unique_value(dataframe, col_name)
        
        for values in unique_values:
            right_dataframe, left_dataframe = dataframe_split(dataframe, col_name, values)
                
            infor_gain = information_gain(left_dataframe, right_dataframe, dataframe)
            
            if infor_gain > best_information_gain:
                
                best_information_gain = infor_gain
                best_label_to_split = col_name
                best_value_to_split = values
                
    
    return best_label_to_split, best_value_to_split, best_information_gain

In [50]:
find_best_split(training_dataframe)

('diameter', 2.0, 0.37333333333333324)

In [16]:
class leaf: 
    
    def __init__(self, dataframe):
        self.predictions = get_class_count(dataframe)

In [17]:
class node:
    
    def __init__(self, label, value, right_branch, left_branch):
        self.label = label
        self.value = value
        self.right_branch = right_branch
        self.left_branch = left_branch

In [31]:
def build_trees(dataframe):
    
    label_to_split, value_to_split, infor_gain = find_best_split(dataframe)
    
    if infor_gain == 0:
        return leaf(dataframe)
    
    right_dataframe, left_dataframe = dataframe_split(dataframe, label_to_split, value_to_split)
    
    right_branch = build_trees(right_dataframe)
    
    left_branch = build_trees(left_dataframe)
    
    return node(label_to_split, value_to_split, right_branch, left_branch)
    

In [19]:
def print_tree(node, spacing = " "):
    
    if isinstance(node, leaf):
        print(spacing + "Predict" + str(node.predictions))
        return
        
    print(spacing + "Feature to split: " + str(node.label) + ", Value to split: " + str(node.value))
    
    print("   " + "--> True:")
    print_tree(node.right_branch)
    
    print("   " + "--> False:")
    print_tree(node.left_branch)


In [51]:
tree = build_trees(training_dataframe)
print_tree(tree)

 Feature to split: diameter, Value to split: 2.0
   --> True:
 Feature to split: color, Value to split: Yellow
   --> True:
 Feature to split: diameter, Value to split: 3.0
   --> True:
 Predict{'Lemon': 1}
   --> False:
 Predict{'Apple': 1, 'Lemon': 2}
   --> False:
 Predict{'Apple': 5}
   --> False:
 Predict{'Grape': 6}


In [52]:
def predict(data, node):
    if isinstance(node, leaf):
        return node.predictions
    
    if data[node.label] == node.value:
        return predict(data, node.right_branch)
    else:
        return predict(data, node.left_branch)

In [53]:
a = training_dataframe.loc[0]

In [58]:
type(a)

pandas.core.series.Series

In [54]:
test_set = pd.DataFrame(
    {
        "color": ['Red', "Green"], 
        "diameter": [3, 1], 
        "label": ['Apple', "Grape"]
    }
)

In [55]:
test_set

Unnamed: 0,color,diameter,label
0,Red,3,Apple
1,Green,1,Grape


In [56]:
type(test_set.loc[0])

pandas.core.series.Series

In [36]:
predict(test_set.loc[0], tree)

{'Grape': 2}

In [57]:
for i in range(len(test_set)):
    a = predict(test_set.loc[i], tree)
    print(a)

{'Grape': 6}
{'Grape': 6}


In [37]:
len(test_set)

2