In [266]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook

df = pd.read_csv('banknotes.txt', header = None)
df.columns = ['X_0','X_1','X_2','X_3','Y']
X = df[['X_0','X_1','X_2','X_3']].values
y = df['Y'].values

X.shape, y.shape

import warnings
warnings.simplefilter("error")
warnings.simplefilter("ignore", DeprecationWarning)

In [267]:
def gini_impurity(array, classes):
    psquared = 0
    n = len(array)
    for x in classes:
        p_temp = np.sum(array == x)/n
        psquared+=p_temp*p_temp
    return 1 - psquared

def weighted_average(array1, array2, classes, p_1, p_2):
    gini_1, gini_2 = \
    gini_impurity(array1, classes), gini_impurity(array2, classes)
    return p_1*gini_1 + p_2*gini_2




In [268]:
def gini_impurity(array, classes):
    psquared = 0
    n = len(array)
    for x in classes:
        p_temp = np.sum(array == x)/n
        psquared+=p_temp*p_temp
    return 1 - psquared

def weighted_average(array1, array2, classes, p_1, p_2):
    gini_1, gini_2 = \
    gini_impurity(array1, classes), gini_impurity(array2, classes)
    return p_1*gini_1 + p_2*gini_2
    
    
def get_weighted_impurity(feature_column, candidate, classes, target_column, input_indices):
    n = len(feature_column)
    
    idx_less, = np.where(feature_column[input_indices] <= candidate)
    idx_more, = np.where(feature_column[input_indices] > candidate)
    
    if len(idx_less) == 0 or len(idx_more) == 0:
        return [],[],np.float('inf')

    p_1 = len(idx_less)/n
    p_2 = len(idx_more)/n
    
    idx_less = input_indices[idx_less]
    idx_more = input_indices[idx_more]
    

    return idx_less, idx_more, weighted_average(target_column[idx_less], target_column[idx_more], classes, p_1, p_2)

def find_split(X, y, input_indices, verbose = False):
    rows, columns = X.shape
    min_c = None
    min_r = None
    min_gini = np.float('inf')
    idx_left, idx_right = None, None

    if verbose:
        t = tqdm_notebook(range(0,columns))
    else:
        t = range(0,columns)

    for c in t:
        for r in range(0,rows):

            column = X[:,c]
            candidate = column[r]

            idx_less, idx_more, impurity = get_weighted_impurity(column, candidate, classes, y, input_indices)
            if impurity is None:
                continue

            if impurity < min_gini:
                min_gini = impurity
                min_c = c
                min_r = r
                idx_left = idx_less
                idx_right = idx_more
                
    return min_c, min_r, min_gini, idx_left, idx_right

In [322]:
class Node:
    def __init__(self, feature_index, split_value):
        self.feature_index = feature_index
        self.split_value = split_value
        self.left = None
        self.right = None
        
    def __str__(self):
        return 'values = ' + str(self.split_value) +', index = '+ str(self.feature_index)
    
def create_node(X, y, input_indices):
    if len(input_indices) > 1:
        min_c, min_r, min_gini, idx_left, idx_right = find_split(X,y,input_indices)
        return Node(min_c,X[min_r][min_c]), idx_left, idx_right
    else:
        return None,None,None

In [325]:
root, idx_left, idx_right = create_node(X,y,np.arange(len(X)))

temp = root

while idx_left is not None:
    temp.left, idx_left, idx_right = create_node(X,y,idx_left)
    temp = temp.left

In [329]:
root, idx_left, idx_right = create_node(X,y,np.arange(len(X)))

temp = root

while idx_right is not None:
    temp.right, idx_left, idx_right = create_node(X,y,idx_right)
    temp = temp.right


In [354]:
def build(X,y,root, root_idx_left, root_idx_right):
    if root is not None and len(root_idx_left) > 1 and len(root_idx_right)> 1:
        
        root.left, idx_left, idx_right = create_node(X,y,root_idx_left)
        root = build(X,y, root.left,idx_left, idx_right)

        root.right, idx_left, idx_right = create_node(X,y,root_idx_right)
        root = build(X,y,root.right,idx_left, idx_right)
        
        return root

root, idx_left, idx_right = create_node(X,y,np.arange(len(X)))
build(X,y, root, idx_left, idx_right)

AttributeError: 'NoneType' object has no attribute 'right'

In [339]:
root

<__main__.Node at 0x11c07ab50>

In [320]:
node[0].feature_index

In [313]:
temp.split_value

array([[[[  3.6216 ,   8.6661 ,  -2.8073 ,  -0.44699],
         [  4.5459 ,   8.1674 ,  -2.4586 ,  -1.4621 ],
         [  3.866  ,  -2.6383 ,   1.9242 ,   0.10645],
         ...,
         [ -3.7503 , -13.4586 ,  17.5932 ,  -2.7771 ],
         [ -3.5637 ,  -8.3827 ,  12.393  ,  -1.2823 ],
         [ -2.5419 ,  -0.65804,   2.6842 ,   1.1952 ]]]])

In [327]:
while root:
    print(root.split_value, root.feature_index)
    root = root.left

0.31803000000000003 0


In [332]:
def build(root, root_idx_left, root_idx_right):
    if root is None:
        return
    
    if root_idx_left is not None:
        if len(root_idx_left) < 5:
            return
    elif not root_idx_left:
        return
    
    if root_idx_right is not None:
        if len(root_idx_right) < 5:
            return
    elif not root_idx_right:
        return
    
    
    root.left, idx_left, idx_right = create_node(X,y,root_idx_left)
    root = build(root.left,idx_left, idx_right)
    
    root.left, idx_left, idx_right = create_node(X,y,root_idx_right)
    root = build(root.right,idx_left, idx_right)
    
    return root

In [333]:
input_indices = np.arange(0,len(X))
root, idx_left, idx_right = create_node(X, y, input_indices)
build(root, idx_left, idx_right)

AttributeError: 'NoneType' object has no attribute 'left'

In [286]:
root.left.left

In [549]:
import pandas as pd
import numpy as np


In [550]:
class sets_and_batch:
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.size = X.shape
        self.N = self.size[0]
        X_fold = None
        y_fold = None
    
    def to_set(self, k):
        idx = np.arange(self.N)
        [np.random.shuffle(idx) for i in range(0,10)]
        sets = [i*int(self.N/k) for i in range(0,k+1)]
        
        X_kfold, y_kfold = {}, {}

        for i in range(0,k):
            x_beg = self.X[sets[0]:sets[i]]
            x_val = self.X[sets[i]:sets[i+1]]
            x_rem = self.X[sets[i+1]:]

            y_beg = self.y[sets[0]:sets[i]]
            y_val = self.y[sets[i]:sets[i+1]]
            y_rem = self.y[sets[i+1]:]

            x_train = np.concatenate((x_beg, x_rem), 0)
            X_kfold[i] = {}
            X_kfold[i]['train'] = x_train
            X_kfold[i]['val'] = x_val

            y_train = np.concatenate((y_beg, y_rem), 0)
            y_kfold[i] = {}
            y_kfold[i]['train'] = y_train
            y_kfold[i]['val'] = y_val
            
        self.X_fold = X_kfold
        self.y_fold = y_kfold
        
    def get_fold(self,i):
        return self.X_fold[i], self.y_fold[i]

In [551]:
class Node:
    def __init__(self, feature_index, split_value, groups):
        self.feature_index = feature_index
        self.split_value = split_value
        self.groups = groups
        self.left = None
        self.right = None
        
        
    def __str__(self):
        return 'values = ' + str(self.split_value) +', index = '+ str(self.feature_index)

In [552]:
def gini_index(target_groups, classes):
    N = len(target_groups[0]+target_groups[1])
    gini = 0
    for t in target_groups:
        s = 0
        if len(t)==0:
            continue
        for c in classes:
            temp = t.count(c)/len(t)
            temp = temp*temp
            s+=temp
        gini+=(1-s)*(len(t)/N)
    return gini

In [553]:
def generic_split(feature_index, value, Xdataset,Ydataset):
    lX,rX,lY,rY = [],[],[],[]
    
    for i in range(0,len(Xdataset)):
        row = Xdataset[i]
        target = Ydataset[i]
        if row[feature_index] < value:
            lX.append(row)
            lY.append(target)
        else:
            rX.append(row)
            rY.append(target)
            
    return lX,lY,rX,rY

In [554]:
#split according to smallest gini_index

In [555]:
def best_split(Xdataset, Ydataset):
    N = len(Xdataset)
    feature_space = len(Xdataset[0])
    g = np.inf
    v = None
    feature = None
    group = []
    
    for i in range(0,N):
        for j in range(0,feature_space):
            lx,ly,rx,ry = generic_split(j,Xdataset[i][j],Xdataset,Ydataset)
            g_index = gini_index([ly,ry],list(set(Ydataset)))
            #print('X%d < %.3f Gini=%.3f' % ((j+1), Xdataset[i][j], g_index))
            if g_index < g:
                g = g_index
                feature = j
                value = Xdataset[i][feature]
                group = [[lx,ly],[rx,ry]]
    result = Node(feature, value, group)
    return result

In [556]:
def to_terminal(targets):
    from scipy import stats
    return stats.mode(targets)

In [557]:
def main_split(root, max_depth, min_size, depth):
    left_X,left_Y = root.groups[0]
    right_X,right_Y = root.groups[1]
    
    if not left_Y or not right_Y:
        root.left = root.right = to_terminal(left_Y + right_Y)
        return

    if depth >= max_depth:
        root.left = to_terminal(left_Y)
        root.right = to_terminal(right_Y)
        return 

    if len(left_Y) <  min_size:
        root.left = to_terminal(left_Y)
    else:
        root.left = best_split(left_X, left_Y)
        main_split(root.left, max_depth, min_size, depth+1)

    if len(right_Y) <  min_size:
        root.right = to_terminal(right_Y)
    else:
        root.right = best_split(right_X, right_Y)
        main_split(root.right, max_depth, min_size, depth+1)



In [558]:
def build_tree(X_train,y_train, max_depth, min_size):
    root = best_split(X_train,y_train)
    main_split(root, max_depth, min_size, 1)
    return root

In [560]:
df = pd.read_csv('banknotes.txt', header = None)
df.columns = ['X_0','X_1','X_2','X_3','Y']
X = df[['X_0','X_1','X_2','X_3']].values
y = df['Y'].values

In [577]:
def build_agg_tree(X_train,y_train, min_index, num_roots,max_depth, min_size):
    N = X_train.shape
    if len(N) > 1:
        N = N[0]
    roots = []
    for i in range(num_roots):
        idx = np.random.choice(N,int(N/min_index))
        X = X_train[idx]
        Y = y_train[idx]
        root = best_split(X_train,y_train)
        main_split(root, max_depth, min_size, 1)
        roots.append(root)
    return roots

In [578]:
def predict(model, datapoint):
    value = model.split_value
    index = model.feature_index
    while True:
        if datapoint[index] < value:
            model = model.left
            if isinstance(model,Node):
                value = model.split_value
                index = model.feature_index
            else:
                result = model
                break
        else:
            model = model.right
            if isinstance(model,Node):
                value = model.split_value
                index = model.feature_index
            else:
                result = model
                break
    return result

In [548]:
df = pd.read_csv('banknotes.txt', header = None)
df.columns = ['X_0','X_1','X_2','X_3','Y']
X = df[['X_0','X_1','X_2','X_3']].values
y = df['Y'].values

snb = sets_and_batch(X,y)
snb.to_set(5)

for i in range(0,5):
    X,Y = snb.get_fold(i)
    node = build_tree(X['train'],Y['train'],10,1)
    pred = [predict(node,X['val'][i]).mode[0] for i in range(0,len(X['val']))]
    result = np.array(Y['val'] == np.array(pred),int).sum()/len(pred)
    print('fold ' + str(i),result)

fold 0 0.981751824818
fold 1 0.978102189781
fold 2 0.967153284672
fold 3 0.956204379562
fold 4 0.992700729927


In [596]:
def predict_agg(models, datapoint):
    from scipy import stats
    return stats.mode([predict(model,datapoint).mode[0] for model in models]).mode[0]

In [579]:
df = pd.read_csv('banknotes.txt', header = None)
df.columns = ['X_0','X_1','X_2','X_3','Y']
X = df[['X_0','X_1','X_2','X_3']].values
y = df['Y'].values

snb = sets_and_batch(X,y)
snb.to_set(5)

for i in range(0,5):
    X,Y = snb.get_fold(i)
    nodes = build_agg_tree(X['train'],Y['train'],4,4,10,1)
    pred = [predict_agg(node,X['val'][i]) i in range(0,len(X['val']))]
    result = np.array(Y['val'] == np.array(pred),int).sum()/len(pred)
    print('fold ' + str(i),result)

In [583]:
X['train'][0]

array([ 5.2756 ,  0.13863,  0.12138,  1.1435 ])

In [595]:
from scipy import stats
stats.mode([predict(model,X['train'][0]).mode[0] for model in node]).mode[0]

0