In [549]:
import pandas as pd
import numpy as np


In [550]:
class sets_and_batch:
    def __init__(self, X, y):
        self.X = X
        self.y = y
        self.size = X.shape
        self.N = self.size[0]
        X_fold = None
        y_fold = None
    
    def to_set(self, k):
        idx = np.arange(self.N)
        [np.random.shuffle(idx) for i in range(0,10)]
        sets = [i*int(self.N/k) for i in range(0,k+1)]
        
        X_kfold, y_kfold = {}, {}

        for i in range(0,k):
            x_beg = self.X[sets[0]:sets[i]]
            x_val = self.X[sets[i]:sets[i+1]]
            x_rem = self.X[sets[i+1]:]

            y_beg = self.y[sets[0]:sets[i]]
            y_val = self.y[sets[i]:sets[i+1]]
            y_rem = self.y[sets[i+1]:]

            x_train = np.concatenate((x_beg, x_rem), 0)
            X_kfold[i] = {}
            X_kfold[i]['train'] = x_train
            X_kfold[i]['val'] = x_val

            y_train = np.concatenate((y_beg, y_rem), 0)
            y_kfold[i] = {}
            y_kfold[i]['train'] = y_train
            y_kfold[i]['val'] = y_val
            
        self.X_fold = X_kfold
        self.y_fold = y_kfold
        
    def get_fold(self,i):
        return self.X_fold[i], self.y_fold[i]

In [551]:
class Node:
    def __init__(self, feature_index, split_value, groups):
        self.feature_index = feature_index
        self.split_value = split_value
        self.groups = groups
        self.left = None
        self.right = None
        
        
    def __str__(self):
        return 'values = ' + str(self.split_value) +', index = '+ str(self.feature_index)

In [552]:
def gini_index(target_groups, classes):
    N = len(target_groups[0]+target_groups[1])
    gini = 0
    for t in target_groups:
        s = 0
        if len(t)==0:
            continue
        for c in classes:
            temp = t.count(c)/len(t)
            temp = temp*temp
            s+=temp
        gini+=(1-s)*(len(t)/N)
    return gini

In [553]:
def generic_split(feature_index, value, Xdataset,Ydataset):
    lX,rX,lY,rY = [],[],[],[]
    
    for i in range(0,len(Xdataset)):
        row = Xdataset[i]
        target = Ydataset[i]
        if row[feature_index] < value:
            lX.append(row)
            lY.append(target)
        else:
            rX.append(row)
            rY.append(target)
            
    return lX,lY,rX,rY

In [554]:
#split according to smallest gini_index

In [555]:
def best_split(Xdataset, Ydataset):
    N = len(Xdataset)
    feature_space = len(Xdataset[0])
    g = np.inf
    v = None
    feature = None
    group = []
    
    for i in range(0,N):
        for j in range(0,feature_space):
            lx,ly,rx,ry = generic_split(j,Xdataset[i][j],Xdataset,Ydataset)
            g_index = gini_index([ly,ry],list(set(Ydataset)))
            #print('X%d < %.3f Gini=%.3f' % ((j+1), Xdataset[i][j], g_index))
            if g_index < g:
                g = g_index
                feature = j
                value = Xdataset[i][feature]
                group = [[lx,ly],[rx,ry]]
    result = Node(feature, value, group)
    return result

In [556]:
def to_terminal(targets):
    from scipy import stats
    return stats.mode(targets)

In [557]:
def main_split(root, max_depth, min_size, depth):
    left_X,left_Y = root.groups[0]
    right_X,right_Y = root.groups[1]
    
    if not left_Y or not right_Y:
        root.left = root.right = to_terminal(left_Y + right_Y)
        return

    if depth >= max_depth:
        root.left = to_terminal(left_Y)
        root.right = to_terminal(right_Y)
        return 

    if len(left_Y) <  min_size:
        root.left = to_terminal(left_Y)
    else:
        root.left = best_split(left_X, left_Y)
        main_split(root.left, max_depth, min_size, depth+1)

    if len(right_Y) <  min_size:
        root.right = to_terminal(right_Y)
    else:
        root.right = best_split(right_X, right_Y)
        main_split(root.right, max_depth, min_size, depth+1)



In [558]:
def build_tree(X_train,y_train, max_depth, min_size):
    root = best_split(X_train,y_train)
    main_split(root, max_depth, min_size, 1)
    return root

In [560]:
df = pd.read_csv('banknotes.txt', header = None)
df.columns = ['X_0','X_1','X_2','X_3','Y']
X = df[['X_0','X_1','X_2','X_3']].values
y = df['Y'].values

In [577]:
def build_agg_tree(X_train,y_train, min_index, num_roots,max_depth, min_size):
    N = X_train.shape
    if len(N) > 1:
        N = N[0]
    roots = []
    for i in range(num_roots):
        idx = np.random.choice(N,int(N/min_index))
        X = X_train[idx]
        Y = y_train[idx]
        root = best_split(X_train,y_train)
        main_split(root, max_depth, min_size, 1)
        roots.append(root)
    return roots

In [578]:
def predict(model, datapoint):
    value = model.split_value
    index = model.feature_index
    while True:
        if datapoint[index] < value:
            model = model.left
            if isinstance(model,Node):
                value = model.split_value
                index = model.feature_index
            else:
                result = model
                break
        else:
            model = model.right
            if isinstance(model,Node):
                value = model.split_value
                index = model.feature_index
            else:
                result = model
                break
    return result

In [548]:
df = pd.read_csv('banknotes.txt', header = None)
df.columns = ['X_0','X_1','X_2','X_3','Y']
X = df[['X_0','X_1','X_2','X_3']].values
y = df['Y'].values

snb = sets_and_batch(X,y)
snb.to_set(5)

for i in range(0,5):
    X,Y = snb.get_fold(i)
    node = build_tree(X['train'],Y['train'],10,1)
    pred = [predict(node,X['val'][i]).mode[0] for i in range(0,len(X['val']))]
    result = np.array(Y['val'] == np.array(pred),int).sum()/len(pred)
    print('fold ' + str(i),result)

fold 0 0.981751824818
fold 1 0.978102189781
fold 2 0.967153284672
fold 3 0.956204379562
fold 4 0.992700729927


In [596]:
def predict_agg(models, datapoint):
    from scipy import stats
    return stats.mode([predict(model,datapoint).mode[0] for model in models]).mode[0]

In [579]:
df = pd.read_csv('banknotes.txt', header = None)
df.columns = ['X_0','X_1','X_2','X_3','Y']
X = df[['X_0','X_1','X_2','X_3']].values
y = df['Y'].values

snb = sets_and_batch(X,y)
snb.to_set(5)

for i in range(0,5):
    X,Y = snb.get_fold(i)
    nodes = build_agg_tree(X['train'],Y['train'],4,4,10,1)
    pred = [predict_agg(node,X['val'][i]) i in range(0,len(X['val']))]
    result = np.array(Y['val'] == np.array(pred),int).sum()/len(pred)
    print('fold ' + str(i),result)

In [583]:
X['train'][0]

array([ 5.2756 ,  0.13863,  0.12138,  1.1435 ])

In [595]:
from scipy import stats
stats.mode([predict(model,X['train'][0]).mode[0] for model in node]).mode[0]

0