In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets

In [2]:
iris = datasets.load_iris()
iris

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [3]:
df = pd.DataFrame(iris.data)
df.columns = ["sl", "sw", 'pl', 'pw']

In [4]:
#Function to convert a continuous data into labelled data
#There are 4 lables  - a, b, c, d

def label(val, *boundaries):
    if (val < boundaries[0]):
        return 'a'
    elif (val < boundaries[1]):
        return 'b'
    elif (val < boundaries[2]):
        return 'c'
    else:
        return 'd'
    
#Function to find label for a value
#if MIN_Value <=val < (m + Mean_Value) / 2 then it is assigned label a
#if (m + Mean_Value) <=val < Mean_Value then it is assigned label b
#if (Mean_Value) <=val < (Mean_Value + MAX_Value)/2 then it is assigned label c
#if (Mean_Value + MAX_Value)/2 <=val <= MAX_Value  then it is assigned label d    

def toLabel(df, old_feature_name):
    second = df[old_feature_name].mean()
    minimum = df[old_feature_name].min()
    first = (minimum + second)/2
    maximum = df[old_feature_name].max()
    third = (maximum + second)/2
    return df[old_feature_name].apply(label, args= (first, second, third))

In [5]:
#Convert all columns to labelled data
df['sl_labeled'] = toLabel(df, 'sl')
df['sw_labeled'] = toLabel(df, 'sw')
df['pl_labeled'] = toLabel(df, 'pl')
df['pw_labeled'] = toLabel(df, 'pw')

In [6]:
#To drop old Features

df.drop(['sl', 'sw', 'pl', 'pw'], axis = 1, inplace = True)

In [7]:
set(df['sl_labeled'])

{'a', 'b', 'c', 'd'}

In [8]:
y = pd.DataFrame(iris.target)
unused_features = set(df.columns)
unused_features

{'pl_labeled', 'pw_labeled', 'sl_labeled', 'sw_labeled'}

In [9]:
def check_purity(data , y):
    
    label_column = y
    unique_classes = np.unique(label_column)

    if len(unique_classes) == 1:
        return True
    else:
        return False

In [10]:
def get_potential_splits(data , unused_features):
    
    potential_splits = {}
    for column in unused_features:          
        values = data.loc[: , column]
        unique_values = np.unique(values)
        
        potential_splits[column] = unique_values
    
    return potential_splits

In [11]:
def split_data(data, y , split_column, split_value):
    data["y"] = y
    
    split_column_values = data.loc[: , split_column]

    data_below = data[split_column_values == split_value]
    data_above = data[split_column_values != split_value]
    
    y1 = data_below.loc[: , "y"]
    del data_below["y"]
    y2 = data_above.loc[: , "y"]
    del data_above["y"]
    
    del data["y"]
    
    return data_below, data_above , y1 , y2

In [12]:
def calculate_entropy(data , y):
    
    label_column = y
    _, counts = np.unique(label_column, return_counts=True)

    probabilities = counts / counts.sum()
    entropy = sum(probabilities * -np.log2(probabilities))
     
    return entropy

In [13]:
def calculate_overall_entropy(data_below, data_above , y1 , y2):
    
    n = len(data_below) + len(data_above)
    p_data_below = len(data_below) / n
    p_data_above = len(data_above) / n

    overall_entropy =  (p_data_below * calculate_entropy(data_below , y1) 
                      + p_data_above * calculate_entropy(data_above , y2))
    
    return overall_entropy

In [14]:
def determine_best_split(data, potential_splits , y):
    
    overall_entropy = 9999
    for column in potential_splits:
        for value in potential_splits[column]:
            data_below, data_above , y1 , y2 = split_data(data, y , split_column=column, split_value=value)
            current_overall_entropy  = calculate_overall_entropy(data_below, data_above , y1 , y2)

            if current_overall_entropy <= overall_entropy:
                overall_entropy = current_overall_entropy
                best_split_column = column
                best_split_value = value
    
    return best_split_column, best_split_value

In [15]:
def build_tree(data , y , unused_features , n):
    # Base Case

    if (len(unused_features) == 0) or (check_purity(data , y)):
        print("Level" , n-(len(unused_features)))
        unique_classes, counts_unique_classes = np.unique(y, return_counts=True)
        print("Count of " , unique_classes[0] , " = " , counts_unique_classes[0])
        print("Current Entropy is = " , calculate_entropy(data , y))
        print("Reached leaf Node")
        print()
        return
    
    potential_splits = get_potential_splits(data , unused_features)
    split_column, split_value = determine_best_split(data, potential_splits , y)
    unused_features.remove(split_column)
    data_below, data_above , y1 , y2 = split_data(data, y , split_column, split_value)
    
    print("Level" , n-(len(unused_features))) 
    unique_classes, counts_unique_classes = np.unique(y, return_counts=True)
    i = 0
    for clas in unique_classes:
        print("Count of " , clas , " = " , counts_unique_classes[i])
        i += 1
    print("Current Entropy is = " , calculate_entropy(data , y))
    print("Splitting on feature " , split_column , " with gain ratio " ,
          calculate_entropy(data , y) - calculate_overall_entropy(data_below , data_above , y1 , y2))
    print()
    
    no_answer = build_tree(data_above, y2 , unused_features, n)
    yes_answer = build_tree(data_below, y1 , unused_features, n)


In [16]:
# Iris Data Impl
unused_features = set(df.columns)
build_tree(df , y , unused_features , len(unused_features))

Level 1
Count of  0  =  50
Count of  1  =  50
Count of  2  =  50
Current Entropy is =  1.584962500721156
Splitting on feature  pl_labeled  with gain ratio  0.9182958340544894



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Level 2
Count of  1  =  50
Count of  2  =  50
Current Entropy is =  1.0
Splitting on feature  pw_labeled  with gain ratio  0.472627976110783

Level 3
Count of  1  =  50
Count of  2  =  16
Current Entropy is =  0.7990485210442682
Splitting on feature  sl_labeled  with gain ratio  0.15886760473918082

Level 4
Count of  1  =  29
Count of  2  =  16
Current Entropy is =  0.9389320105807948
Splitting on feature  sw_labeled  with gain ratio  0.028593110142260403

Level 4
Count of  1  =  15
Current Entropy is =  0.8112781244591328
Reached leaf Node

Level 4
Count of  1  =  14
Current Entropy is =  0.9895875212220556
Reached leaf Node

Level 4
Count of  1  =  21
Current Entropy is =  0.0
Reached leaf Node

Level 4
Count of  2  =  34
Current Entropy is =  0.0
Reached leaf Node

Level 4
Count of  0  =  50
Current Entropy is =  0.0
Reached leaf Node

