In [1]:
#import libraries
import numpy as np 
import pandas as pd
import math 
import sys
import matplotlib.pyplot as plt

In [2]:
columns_name = ['x1', 'x2', 'y']
df = pd.read_csv('data/Druns.txt', names = columns_name, sep = ' ')
df

Unnamed: 0,x1,x2,y
0,0.1,-2,0
1,0.0,-1,1
2,0.0,0,0
3,0.0,1,0
4,0.0,2,0
5,0.0,3,0
6,0.0,4,0
7,0.0,5,0
8,0.0,6,1
9,0.0,7,0


In [3]:
def entropy(y):
    prob = y.value_counts()/y.shape[0]
    entropy = np.sum(-prob*np.log2(prob + 1e-9)) #for prob not to be 0 in log
    return entropy

In [4]:
def info_gain(y, mask, func = entropy):
    a = sum(mask)
    b = mask.shape[0] - a
    if a == 0 or b == 0:
        return 0
    else:
        return func(y) - (func(y[mask])*a/(a+b) + func(y[-mask])*b/(a+b)) 


In [5]:
def info_gain_ratio(col, y, mask, func = info_gain):
    if entropy != 0:
        return func(y, mask)/entropy(col)
    return None

In [6]:
def total_info_gain(x, y, func = entropy):
    options = x.sort_values().unique()
    res = []
    for value in options:
        mask = x >= value
        ig = info_gain_ratio(x, y, mask)
        res.append([value, ig])
    return res

In [7]:
colname = ['value', 'ig_ratio']
df1 = pd.DataFrame(total_info_gain(df["x1"], df["y"]))
print(df1)
df2 = pd.DataFrame(total_info_gain(df["x2"], df["y"]))
print(df2)


     0         1
0  0.0  0.000000
1  0.1  0.100518
    0         1
0  -2  0.000000
1  -1  0.012770
2   0  0.011064
3   1  0.001412
4   2  0.000313
5   3  0.004716
6   4  0.014295
7   5  0.030408
8   6  0.057694
9   7  0.011064
10  8  0.054648


In [8]:
colname_d3 = ['x1', 'x2', 'y']
df_d3 = pd.read_csv('data/D3leaves.txt', names= colname_d3, sep = ' ')
df_d3

Unnamed: 0,x1,x2,y
0,10,1,1
1,10,2,1
2,10,3,1
3,1,1,0
4,1,3,1


In [9]:
def max_info_gain(x, y, func = entropy):
    options = x.sort_values().unique()
    ig = []
    split_value = []
    entro_list = []
    
    for value in options:
        mask = x >= value
        ig_val = info_gain(y, mask, func)
        ig.append(ig_val)
        split_value.append(value)
        entro_list.append(entropy(mask))
    if len(ig) == 0:
        return (None, None, None, False)
    else: 
        best_ig = max(ig)
        best_ig_index = ig.index(best_ig)
        best_split = split_value[best_ig_index]
        entro = entro_list[best_ig_index]
        return(best_split, best_ig, entro, True)

In [None]:
temp = df_d3['y'] 
cu = df_d3.drop('y', axis = 1).apply(max_info_gain, y = temp)
cu = cu.loc[:, cu.loc[3, :]]
cu

In [11]:
def get_best_split(x, out):
    masks = x.drop(out, axis = 1).apply(max_info_gain, y = x[out])
    if np.sum(masks.loc[3, :]) == 0:
        return (None, None, False, None)
    else:
        masks = masks.loc[:, masks.loc[3,:]]
        masks = masks.apply(pd.to_numeric)
        
        temporary = masks.idxmin(axis = 1)[2] #column name of the lowest entropy
        #temporary2 = np.sum(masks.loc[2, :])
        #if temporary2 == 0: #box has the lowest entropy
        #    return (None, None, False, 0)
        if masks[temporary][2] == 0: #new
            #return (None, None, False)
            check = 1 #there is entropy == 0 #new
            masks = masks.loc[:, masks.loc[2, :] == 0] #new, choose columns has entropy = 0 to split
        else:
            check = 0 #there is no entropy == 0 #new
            
        #masks['Max'] = masks.idxmax(axis = 1)
        #split_vari = masks.loc[1][-1]
        split_vari = max(masks)
        split_value = masks[split_vari][0]
        split_ig = masks[split_vari][1]
        return(split_vari, split_value, split_ig, check)

In [12]:
get_best_split(df_d3, 'y')

('x2', 2.0, 0.32192809402174527, 0)

In [13]:
def make_split(vari, value, data):
    data1 = data[data[vari] >= value]
    data2 = data[data[vari] < value]
    return data1, data2

In [14]:
def make_pred(data):
    return data.value_counts().idxmax()

In [15]:
def train_tree(data, y):
    var, val, ig, check = get_best_split(data, y)
    if ig is not None and ig != 1e-20:
        left, right = make_split(var, val, data)
        split_type = ">="
        question = "{} {}  {}".format(var, split_type, val)
        subtree = {question: []}
        if check == 1: #new
            a = make_pred(left[y]) #new
            b = make_pred(right[y]) #new
            subtree[question].append(a) #new
            subtree[question].append(b) #new
            return subtree 
        yes_ans = train_tree(left, y) 
        no_ans = train_tree(right, y)
        if yes_ans == no_ans:
            subtree = yes_ans #new subtree
        else:
            subtree[question].append(yes_ans)
            subtree[question].append(no_ans)
    else:
        return make_pred(data[y])
    return subtree
        


In [16]:
train_tree(df_d3, 'y')

: 

: 

In [None]:
df_d1 = pd.read_csv('data/D1.txt', names = colname_d3, sep = " ")
train_tree(df_d1,'y')

In [None]:
#cu2 = df_d1.drop('y', axis = 1).apply(max_info_gain, y = df_d1['y'])
#cu2 = cu2.loc[:, cu2.loc[3, :]]
#cu2 = cu2.apply(pd.to_numeric)
#cu2
#cu2 = cu2.loc[:, cu2.loc[2, :] == 0]
#cu2['Max'] = cu2.idxmax(axis=1) #(max info gain)
#cu2


In [None]:
get_best_split(df_d1, 'y')

In [None]:
df_d2 = pd.read_csv('data/D2.txt', names = colname_d3, sep = " ")
train_tree(df_d2,'y')

In [None]:
fig, ax = plt.subplots()

for label in [0,1]:
    #filters this sequence of indices by keeping only those where df_d1['y'][i] is equal to label, go first with label 0
    row_ix = [i for i in range(len(df_d1['y'])) if df_d1['y'][i] == label] 
    plt.scatter([df_d1['x1'][i] for i in row_ix], [df_d1['x2'][i] for i in row_ix], c= ['red' if label == 0 else 'blue'])

ax.set_xlabel('x1')
ax.set_ylabel('x2')
ax.hlines(y= 0.201829, xmin = 0, xmax = 1, linestyles= '--', colors= 'black')
plt.show()

In [None]:
def predict_d2(x1, x2):
    if x1 >= 0.533076:
        if x1 >= 0.73682:
            if x2 >=  0.210079:
                return 1
            else:
                return 0
        elif x1 >= 0.538288:
            if x1 >=  0.542481: 
                if x1 >=  0.548306:
                    if x1 >=  0.645326:
                        if x2 >=  0.301105:
                            return 1
                        else:
                            return 0
                    elif x1 >=  0.611437:
                        if x2 >=  0.403494:
                            return 1
                        else:
                            return 0
                    elif x1 >=  0.60386:
                        return 1
                    else:
                        return 1
                else:
                    return 1
            elif x2 >=  0.62952:
                return 1
            else:
                return 0
        elif x2 >= 0.639018:
            if x1 >=  0.111076:
                if x1 >=  0.329959:
                    return 1
                else:
                    return 1
            elif x2 >=  0.964767:
                return 1
            else:
                return 0
        elif x1 >=  0.409972:
            if x2 >=  0.534979:
                if x1 >=  0.426073:
                    return 1
                return 1
            else:
                return 0
        else:
            return 0

In [None]:
fig, ax = plt.subplots()

for label in [0,1]:
    row_ind = [i for i in range (len(df_d2['y'])) if df_d2['y'][i] == label]
    plt.scatter([df_d2['x1'][row_ind]], [df_d2['x2'][row_ind]], c = 'red' if label == 0 else 'blue')

x1 = np.linspace(0, 1, 100)
x2 = np.linspace(0, 1, 100)
x1_grid, x2_grid = np.meshgrid(x1, x2)
z = np.zeros_like(x1_grid)
for i in range (x1_grid.shape[0]):
    for j in range (x2_grid.shape[1]):
        z[i][j] = predict_d2(x1_grid[i, j], x2_grid[i, j])
plt.contourf(x1_grid, x2_grid, z, cmap='RdBu', alpha=0.5)
ax.set_xlabel('x1')
ax.set_ylabel('x2')
plt.show()

In [None]:
df_q2 = pd.read_csv('data/2.2.txt', names = ['x1', 'x2', 'x3', 'y'], sep = ' ')
df_q2

In [None]:
get_best_split(df_q2, 'y')