In [126]:
from sklearn import datasets
import pandas as pd
import math
from itertools import groupby

In [43]:
iris = datasets.load_iris()

In [44]:
df = pd.DataFrame(iris.data)
df.columns = ["sl", "sw", 'pl', 'pw']

In [45]:
#Function to find label for a value
#if MIN_Value <=val < (m + Mean_Value) / 2 then it is assigned label a
#if (m + Mean_Value) <=val < Mean_Value then it is assigned label b
#if (Mean_Value) <=val < (Mean_Value + MAX_Value)/2 then it is assigned label c
#if (Mean_Value + MAX_Value)/2 <=val <= MAX_Value  then it is assigned label d

def label(val, *boundaries):
    if (val < boundaries[0]):
        return 'a'
    elif (val < boundaries[1]):
        return 'b'
    elif (val < boundaries[2]):
        return 'c'
    else:
        return 'd'

#Function to convert a continuous data into labelled data
#There are 4 lables  - a, b, c, d
def toLabel(df, old_feature_name):
    second = df[old_feature_name].mean()
    minimum = df[old_feature_name].min()
    first = (minimum + second)/2
    maximum = df[old_feature_name].max()
    third = (maximum + second)/2
    return df[old_feature_name].apply(label, args= (first, second, third))

In [46]:
#Convert all columns to labelled data
df['sl_labeled'] = toLabel(df, 'sl')
df['sw_labeled'] = toLabel(df, 'sw')
df['pl_labeled'] = toLabel(df, 'pl')
df['pw_labeled'] = toLabel(df, 'pw')
df.head()

Unnamed: 0,sl,sw,pl,pw,sl_labeled,sw_labeled,pl_labeled,pw_labeled
0,5.1,3.5,1.4,0.2,b,c,a,a
1,4.9,3.0,1.4,0.2,a,b,a,a
2,4.7,3.2,1.3,0.2,a,c,a,a
3,4.6,3.1,1.5,0.2,a,c,a,a
4,5.0,3.6,1.4,0.2,a,c,a,a


In [47]:
df.drop(['sl', 'sw', 'pl', 'pw'], axis = 1, inplace = True)

In [73]:
set(df['sl_labeled'])

{'a', 'b', 'c', 'd'}

In [190]:
iris.target[df.sw_labeled == "d"]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2])

In [197]:
def get_mistake(df, y, f):
    possibe_values = set(df[f])
    
    mistake = 0
    for value in possibe_values:
        p = y[df[f] ==value].values.tolist()
        if len(p) == 0:
            continue
        x = [len(list(group)) for key, group in groupby(p)]
        mistake += sum(x)-max(x)
    return mistake

In [214]:
def build_tree(df, y, unused_features):
    #base case
    # 1. unused is empty
    # 2. y contains only one distinct value
    if len(unused_features) == 0:
        return
    if len(set(y.values.reshape(1,-1)[0])) == 1:
        return
    
    best_feature = ""
    min_mistake = math.inf
    for f in unused_features:
        mistake = get_mistake(df, y, f)
        if mistake < min_mistake:
            min_mistake = mistake
            best_feature = f
        # loop over possible values : val
        # find subset of df & y with f == val
        # find number of mistakes in this subset 
        # if we predict the most common y as the output
        # find sum of all these mistakes
        # update best feature so that that particular feature
        # makes least number of mistakes
    
    # here you should know the best feature
    # print it out
    print("Best Feature ", best_feature)
    unused_features.remove(best_feature)
    possibe_values = set(df[f])
    for value in possibe_values: 
        p = y[df[f] ==value]
        x = df[df[f]==value]
        
        build_tree(x, p, unused_features)
    
    # remove best feature from unused features
    # loop over possible values of best feature
    # call build tree recursively

In [215]:
y = pd.DataFrame(iris.target)
unused_features = set(df.columns)
build_tree(df, y, unused_features)


(150, 1)
32 sl_labeled a
48 sl_labeled b
53 sl_labeled c
17 sl_labeled d
55 sw_labeled c
19 sw_labeled a
64 sw_labeled b
12 sw_labeled d
56 pw_labeled c
50 pw_labeled a
10 pw_labeled b
34 pw_labeled d
63 pl_labeled c
50 pl_labeled a
7 pl_labeled b
30 pl_labeled d
Best Feature  pw_labeled
end
(63, 1)
37 sl_labeled c
1 sl_labeled a
22 sl_labeled b
3 sl_labeled d
11 sw_labeled c
12 sw_labeled a
40 sw_labeled b
63 pl_labeled c
Best Feature  sl_labeled
end
(63, 1)
11 sw_labeled c
12 sw_labeled a
40 sw_labeled b
63 pl_labeled c
Best Feature  sw_labeled
end
(63, 1)
63 pl_labeled c
Best Feature  pl_labeled
end
(63, 1)
end
(50, 1)
end
(7, 1)
end
(30, 1)
