In [24]:
# Split a dataset based on an attribute and an attribute value
def test_split(index, value, dataset):
	left, right = list(), list()
	for row in dataset:
		if row[index] < value:
			left.append(row)
		else:
			right.append(row)
	return left, right

def gini_impurity(array, classes):
    psquared = 0
    n = len(array)
    for x in classes:
        p_temp = np.sum(array == x)/n
        psquared+=p_temp*p_temp
    return 1 - psquared

def entropy(array, classes):
    ent = 0
    n = len(array)
    for x in classes:
        p_temp = np.sum(array == x)/n
        ent+=-p_temp*np.log(p_temp)
    return ent

def weighted_average(array1, array2, classes, p_1, p_2):
    if len(array1) == 0:
        gini_1 = 0
    else:
        gini_1 = gini_impurity(array1, classes)
    
    if len(array2) == 0:
        gini_2 = 0
    else:
        gini_2 = gini_impurity(array2, classes)

    return p_1*gini_1 + p_2*gini_2

def gini_index(target_groups, classes):
    N = len(target_groups[0])+len(target_groups[1])
    p1,p2 = len(target_groups[0])/N, len(target_groups[1])/N
    return weighted_average( target_groups[0], target_groups[1], classes,p1,p2)

# Select the best split point for a dataset
def get_split(dataset):
	class_values = list(set(row[-1] for row in dataset))
	b_index, b_value, b_score, b_groups = 999, 999, 999, None
	for index in range(len(dataset[0])-1):
		for row in dataset:
			groups = test_split(index, row[index], dataset)
			gini = gini_index(groups, class_values)
			if gini < b_score:
				b_index, b_value, b_score, b_groups = index, row[index], gini, groups
	return {'index':b_index, 'value':b_value, 'groups':b_groups}

# Create a terminal node value
def to_terminal(group):
	outcomes = [row[-1] for row in group]
	return max(set(outcomes), key=outcomes.count)

# Create child splits for a node or make terminal
def split(node, max_depth, min_size, depth):
	left, right = node['groups']
	del(node['groups'])
	# check for a no split
	if not left or not right:
		node['left'] = node['right'] = to_terminal(left + right)
		return
	# check for max depth
	if depth >= max_depth:
		node['left'], node['right'] = to_terminal(left), to_terminal(right)
		return
	# process left child
	if len(left) <= min_size:
		node['left'] = to_terminal(left)
	else:
		node['left'] = get_split(left)
		split(node['left'], max_depth, min_size, depth+1)
	# process right child
	if len(right) <= min_size:
		node['right'] = to_terminal(right)
	else:
		node['right'] = get_split(right)
		split(node['right'], max_depth, min_size, depth+1)

# Build a decision tree
def build_tree(train, max_depth, min_size):
	root = get_split(train)
	split(root, max_depth, min_size, 1)
	return root

# Print a decision tree
def print_tree(node, depth=0):
	if isinstance(node, dict):
		print('%s[X%d < %.3f]' % ((depth*' ', (node['index']+1), node['value'])))
		print_tree(node['left'], depth+1)
		print_tree(node['right'], depth+1)
	else:
		print('%s[%s]' % ((depth*' ', node)))
        

# Make a prediction with a decision tree
def predict(node, row):
	if row[node['index']] < node['value']:
		if isinstance(node['left'], dict):
			return predict(node['left'], row)
		else:
			return node['left']
	else:
		if isinstance(node['right'], dict):
			return predict(node['right'], row)
		else:
			return node['right']

# Make a prediction with a decision tree
def predict(node, row):
	if row[node['index']] < node['value']:
		if isinstance(node['left'], dict):
			return predict(node['left'], row)
		else:
			return node['left']
	else:
		if isinstance(node['right'], dict):
			return predict(node['right'], row)
		else:
			return node['right']

dataset = [[2.771244718,1.784783929,0],
	[1.728571309,1.169761413,0],
	[3.678319846,2.81281357,0],
	[3.961043357,2.61995032,0],
	[2.999208922,2.209014212,0],
	[7.497545867,3.162953546,1],
	[9.00220326,3.339047188,1],
	[7.444542326,0.476683375,1],
	[10.12493903,3.234550982,1],
	[6.642287351,3.319983761,1]]
tree = build_tree(dataset, 1, 1)
print_tree(tree)

[X1 < 2.771]
 [0]
 [1]


In [25]:
import pandas as pd
import numpy as np

df = pd.read_csv('banknotes.txt', header =None)

df = df.sample(frac = 1)
df.columns = ['X_0','X_1','X_2','X_3','Y']
#X = df[['X_0','X_1','X_2','X_3']].values
#y = df['Y'].values

tree = build_tree(df.values[:200], 200, 1)

res = []
for row in df.values[:200]:
    prediction = predict(tree, row)
    res.append(row[-1] == prediction)
    
np.array(res,int).all()

True