In [46]:
import pandas as pd
import numpy as np


df = pd.read_csv('iris.csv')
df.head()

Unnamed: 0,SL,SW,PL,PW,class
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1
3,4.6,3.1,1.5,0.2,1
4,5.0,3.6,1.4,0.2,1


In [47]:
# decision attribute - last column
decision = df.columns[-1]

# all other columns are features
features = df.columns[:-1]

# add all rows indexes to the 1st bin
bins = {"all": df.index.values}
bins

{'all': array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
         13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
         26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
         39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
         52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
         78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
         91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
        104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
        117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
        130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
        143, 144, 145, 146, 147, 148, 149], dtype=int64)}

In [48]:
def entropy(target_col):
    elements, counts = np.unique(target_col, return_counts = True)
    entropy = np.sum([(-counts[i]/np.sum(counts))*np.log2(counts[i]/np.sum(counts)) for i in range(len(elements))])
    return entropy

# calculate entropy for bin 1
print(entropy(df[decision].loc[bins["all"]]))

1.584962500721156


In [68]:
def conditional_entropy(df, target_col, bins):
    c_entropy = 0
    for bin in bins:
        c_entropy += entropy(df[target_col].loc[bins[bin]]) * len(df[target_col].loc[bins[bin]])
    return c_entropy/len(df)
print(conditional_entropy(df, decision, bins))

1.584962500721156


In [56]:
def possible_cuts(df, features):
    cuts = {f : [] for f in features}
    for feature in features:
        for i, v in enumerate(sorted(df[feature].unique())):
            if i == 0:
                continue
            cuts[feature].append((v + sorted(df[feature].unique())[i-1])/2)
    return cuts
print(possible_cuts(df, features))

{'SL': [4.35, 4.45, 4.55, 4.65, 4.75, 4.85, 4.95, 5.05, 5.15, 5.25, 5.35, 5.45, 5.55, 5.65, 5.75, 5.85, 5.95, 6.05, 6.15, 6.25, 6.35, 6.45, 6.55, 6.65, 6.75, 6.85, 6.95, 7.05, 7.15, 7.25, 7.35, 7.5, 7.65, 7.800000000000001], 'SW': [2.1, 2.25, 2.3499999999999996, 2.45, 2.55, 2.6500000000000004, 2.75, 2.8499999999999996, 2.95, 3.05, 3.1500000000000004, 3.25, 3.3499999999999996, 3.45, 3.55, 3.6500000000000004, 3.75, 3.8499999999999996, 3.95, 4.05, 4.15, 4.300000000000001], 'PL': [1.05, 1.15, 1.25, 1.35, 1.45, 1.55, 1.65, 1.7999999999999998, 2.45, 3.15, 3.4, 3.55, 3.6500000000000004, 3.75, 3.8499999999999996, 3.95, 4.05, 4.15, 4.25, 4.35, 4.45, 4.55, 4.65, 4.75, 4.85, 4.95, 5.05, 5.15, 5.25, 5.35, 5.45, 5.55, 5.65, 5.75, 5.85, 5.95, 6.05, 6.199999999999999, 6.35, 6.5, 6.65, 6.800000000000001], 'PW': [0.15000000000000002, 0.25, 0.35, 0.45, 0.55, 0.8, 1.05, 1.15, 1.25, 1.35, 1.45, 1.55, 1.65, 1.75, 1.85, 1.95, 2.05, 2.1500000000000004, 2.25, 2.3499999999999996, 2.45]}


In [83]:
def possible_splits(df, cuts : dict):
    splits = {}
    for feature, cuts in cuts.items():
        splits[feature] = {}
        for cut in cuts:
            splits[feature][cut] = {}
            splits[feature][cut]["left"] = df[df[feature] <= cut].index.values
            splits[feature][cut]["right"] = df[df[feature] > cut].index.values
    return splits
# print(cuts_split(df, possible_cuts(df, features)))

In [70]:
def best_cut(df, splits, bins, features, decision, curr_entropy = 1):
    best_feature = None
    best_cut = None
    curr_entropy = 1
    for feature in features:
        for cut in splits[feature]:
            c_entropy = conditional_entropy(df, decision, splits[feature][cut])
            if c_entropy < curr_entropy:
                curr_entropy = c_entropy
                best_feature = feature
                best_cut = cut
    return best_feature, best_cut, curr_entropy


('PL', 2.45, 0.6666666666666666)


In [80]:
def perform_cut(df, splits, bins, feature, value):
    bins_left = {}
    bins_right = {}
    for bin in bins:
        bins_left[f"{bin}_{feature}_{value}_left"] = list(np.intersect1d(bins[bin], splits[feature][value]["left"]))
        bins_right[f"{bin}_{feature}_{value}_right"] = list(np.intersect1d(bins[bin], splits[feature][value]["right"]))
    bins_together = {**bins_left, **bins_right}
    return bins_together

print(perform_cut(df, splits, bins, "PL", 2.45))

{'all_PL_2.45_left_PL_2.45_left': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49], 'all_PL_2.45_right_PL_2.45_left': [], 'all_PL_2.45_left_PL_2.45_right': [], 'all_PL_2.45_right_PL_2.45_right': [50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]}


In [91]:
max_cuts = 3
min_entropy = 0.1

# decision attribute - last column
decision = df.columns[-1]

# all other columns are features
features = df.columns[:-1]

# add all rows indexes to the 1st bin
bins = {"all": df.index.values}


while max_cuts > 0 and min_entropy < conditional_entropy(df, decision, bins):
    max_cuts -= 1
    best_feature, best_cut, curr_entropy = best_cut(df, splits, bins, features, decision)
    print(f"Best cut: {best_feature} <= {best_cut}")
    bins = perform_cut(df, splits, bins, best_feature, best_cut)
    print(f"Entropy: {curr_entropy}")
    print(f"Bins: {bins}")
    print()

TypeError: 'numpy.float64' object is not callable