In [183]:
import numpy as np
import pandas as pd
import math
import time
from xclib.data import data_utils

In [184]:
# Relative path prefix prepended to the train/test/valid data file names below.
datapath = './../virus/ass3_parta_data/'
# Number of target classes; not referenced by any other cell visible in this notebook.
num_classes = 2

In [185]:
def read_datafile(filepath):
    """Load `filepath` via `data_utils.read_sparse_file` (called with force_header=True).

    The reader's result is returned unchanged; callers in this notebook
    convert it to a dense array with `.toarray()`.
    """
    return data_utils.read_sparse_file(filepath, force_header=True)

In [186]:
class Node:
    """One node of a binary decision tree over numeric features and 0/1 labels.

    Attributes:
        split_on: index of the feature this node splits on (callers pass -1
            until a split has been chosen).
        threshold: samples with feature value <= threshold go left, others right.
        left, right: child nodes, or None.
        parent: the node this one hangs from, or None.
        depth: depth of the node as supplied by the caller.
        prediction: label reported when traversal stops at this node.
        is_valid: flag that external code may clear to mark the node as removed.
    """

    def __init__(self, split_on, prediction, depth):
        self.split_on = split_on
        self.threshold = 0.0
        self.left = None
        self.right = None
        self.parent = None
        self.depth = depth
        self.prediction = prediction
        self.is_valid = True

    def is_leaf(self, y):
        """Return True when the labels `y` are all 0 or all 1 (nothing left to split)."""
        positives = np.sum(y)
        return bool(positives == 0 or positives == len(y))

    def entropy(self, y):
        """Return the entropy (natural log) of the 0/1 label array `y`.

        A pure or empty label set has entropy 0.0.
        """
        total = len(y)
        num1 = np.sum(y)
        num0 = total - num1
        if num0 == 0 or num1 == 0:
            return 0.0
        return (num0/total)*math.log(total/num0) + (num1/total)*math.log(total/num1)

    def choose_best_attr2(self, X_tgt, y_tgt):
        """Pick the feature whose median split yields the largest information gain.

        Args:
            X_tgt: 2-D NumPy array of samples (rows) by features (columns).
            y_tgt: 0/1 labels for the rows of `X_tgt`, either 1-D or a column vector.

        Returns:
            `[feature_index, threshold]` for the best split, or `[-1, -1]` when
            no split gives a strictly positive gain (including empty input).
        """
        total = len(y_tgt)
        # An empty partition has nothing to split; the guard also avoids
        # indexing into a row that does not exist.
        if total == 0:
            return [-1, -1]

        entropy_curr = self.entropy(y_tgt)
        # A pure node cannot gain information from any split, so skip the scan.
        if entropy_curr == 0:
            return [-1, -1]

        best_diff = 0
        best_attr = 0
        best_thres = 0
        for i in range(X_tgt.shape[1]):
            column = X_tgt[:, i]
            thres = np.median(column)

            # Partition the labels around the median of this feature.
            filt_left = column <= thres
            y_left = y_tgt[filt_left]
            y_right = y_tgt[~filt_left]
            totall = len(y_left)
            totalr = len(y_right)
            if totall == 0 or totalr == 0:
                # Every sample landed on one side: the split separates nothing.
                continue

            weighted_children = (self.entropy(y_left)*(totall/total)
                                 + self.entropy(y_right)*(totalr/total))
            diff = entropy_curr - weighted_children

            # ">=" keeps the later feature on ties.
            if diff >= best_diff:
                best_diff = diff
                best_attr = i
                best_thres = thres

        if best_diff <= 0:
            return [-1, -1]
        return [best_attr, best_thres]

In [187]:
class MyDecisionTreeClassifier:
    """Binary decision tree built from `Node` objects, with validation-based pruning.

    Attributes:
        root: root node of the fitted tree, or None before `fit`.
        max_depth: depth at which growth stops; -1 means no limit.
        num_nodes, num_leaves: bookkeeping counters kept in sync by
            `grow_tree`, `prune_tree` and `invalidate_subtree`.
        tree_depth: largest depth reached while growing.
        nodes_list: every node ever created, in creation order (pruned nodes
            stay in the list with `is_valid` set to False).
    """

    def __init__(self, max_depth=-1):
        self.root = None
        self.max_depth = max_depth
        self.num_leaves = 0
        self.num_nodes = 0
        self.tree_depth = 0
        self.nodes_list = []

    def fit(self, X, y):
        """Grow a fresh tree on samples `X` and 0/1 labels `y`."""
        # Reset bookkeeping so fitting the same instance twice does not
        # accumulate counters and stale nodes from the previous tree.
        self.root = None
        self.num_leaves = 0
        self.num_nodes = 0
        self.tree_depth = 0
        self.nodes_list = []
        self.root = self.grow_tree(X, y)

    def grow_tree(self, X, y, depth=0):
        """Recursively build and return the subtree for the partition (X, y)."""
        self.tree_depth = max(self.tree_depth, depth)

        # Majority label of this partition; ties predict 0.
        num_tgt = len(y)
        prediction = 1.0 if np.sum(y) > num_tgt/2 else 0.0

        node = Node(split_on=-1, prediction=prediction, depth=depth)
        self.num_nodes += 1
        self.nodes_list.append(node)

        depth_limit_hit = self.max_depth != -1 and node.depth == self.max_depth
        if node.is_leaf(y) or depth_limit_hit:
            self.num_leaves += 1
            return node

        split_on, thres = node.choose_best_attr2(X, y)
        if split_on == -1:
            # No split improves on this node: keep it as a leaf. Checked before
            # building the masks so no work is spent indexing column -1.
            self.num_leaves += 1
            return node

        node.split_on = split_on
        node.threshold = thres
        ind_left = X[:, split_on] <= thres
        ind_right = X[:, split_on] > thres

        node.left = self.grow_tree(X[ind_left], y[ind_left], depth=depth+1)
        node.left.parent = node
        node.right = self.grow_tree(X[ind_right], y[ind_right], depth=depth+1)
        node.right.parent = node

        return node

    def predict_sample(self, x):
        """Return the prediction for one sample `x` (-1.0 if there is no tree).

        Traversal follows the thresholds until it steps onto a missing child;
        the prediction of the last node visited is returned, so a pruned
        branch falls back to its parent's majority label.
        """
        node = self.root
        pred = -1.0
        while node is not None:
            pred = node.prediction
            if x[node.split_on] <= node.threshold:
                node = node.left
            else:
                node = node.right
        return pred

    def predict(self, X_test, y_test):
        """Predict every row of `X_test` and score against `y_test`.

        Returns:
            `[y_preds, acc]` where `y_preds` is a 1-D array of predictions and
            `acc` is the accuracy in percent.
        """
        y_preds = np.array([self.predict_sample(x) for x in X_test])
        # Flatten the labels: comparing 1-D predictions with an (n, 1) column
        # would broadcast to an (n, n) matrix and inflate the accuracy.
        y_true = np.asarray(y_test).reshape(-1)
        acc = np.sum(y_preds == y_true) * 100.0 / len(y_true)
        return [y_preds, acc]

    def invalidate_subtree(self, rnode):
        """Mark `rnode` and its descendants as removed and update the counters."""
        if rnode is None or not rnode.is_valid:
            return
        rnode.is_valid = False
        self.num_nodes -= 1
        if rnode.left is None and rnode.right is None:
            self.num_leaves -= 1
        self.invalidate_subtree(rnode.left)
        self.invalidate_subtree(rnode.right)

    def prune_tree(self, X_valid, y_valid):
        """Remove the single subtree whose removal most improves validation accuracy.

        Every attached child of every valid node is detached in turn, the tree
        is re-scored on (X_valid, y_valid), and the child is re-attached. The
        detachment with the highest strictly improved accuracy is then made
        permanent.

        Returns:
            True if a subtree was pruned, False if no removal improves accuracy.
        """
        _, acc = self.predict(X_valid, y_valid)
        best_acc = acc
        best_parent = None
        best_side = None
        for node in self.nodes_list:
            if not node.is_valid:
                continue
            for side in ("left", "right"):
                child = getattr(node, side)
                if child is None:
                    # Nothing to detach, so the accuracy cannot change;
                    # skip the full validation pass.
                    continue
                setattr(node, side, None)
                try:
                    _, trial_acc = self.predict(X_valid, y_valid)
                finally:
                    # Always re-attach, even if evaluation is interrupted, so
                    # the tree is never left with a stray detached branch.
                    setattr(node, side, child)
                if trial_acc > best_acc:
                    best_acc = trial_acc
                    best_parent = node
                    best_side = side

        if best_parent is None:
            return False

        to_prune = getattr(best_parent, best_side)
        print("Found to prune: {} at depth {}".format(to_prune, to_prune.depth))
        setattr(best_parent, best_side, None)
        if best_parent.left is None and best_parent.right is None:
            # The parent has lost its last child and now counts as a leaf.
            self.num_leaves += 1
        self.invalidate_subtree(to_prune)
        return True

    def calc_maxdepth(self):
        """Return the height of the current tree (-1 when there is no root)."""
        return self.height(self.root)

    def height(self, rnode):
        """Return the height of the subtree at `rnode`; a missing node has height -1."""
        if rnode is None:
            return -1
        return max(self.height(rnode.left), self.height(rnode.right)) + 1

In [188]:
# Read the training features and convert them to a dense array.
train_data = read_datafile(datapath + 'train_x.txt')
X_train = train_data.toarray()

# Labels are reshaped to a single column so they can be joined to the features.
y_train = np.genfromtxt(datapath + 'train_y.txt', dtype=float).reshape((-1,1))

# Features with the label appended as the last column.
D_train = np.concatenate((X_train, y_train), axis=1)

print(X_train.shape, y_train.shape, sep='\n')

(64713, 482)
(64713, 1)


In [189]:
# X_train = X_train[:10]
# y_train = y_train[:10]

# Train a Decision tree classifier
# perf_counter() is a monotonic clock intended for measuring durations;
# time.time() follows the wall clock and can jump if the system time changes.
start = time.perf_counter()
clf = MyDecisionTreeClassifier(max_depth=-1)  # -1: grow without a depth limit
clf.fit(X_train, y_train)
end = time.perf_counter()

print("Tree depth = {}".format(clf.calc_maxdepth()))
print("Total number of nodes = {}".format(clf.num_nodes))
print("Total leaf nodes = {}".format(clf.num_leaves))
print("Time taken = {} s".format(end - start))

Tree depth = 54
Total number of nodes = 20003
Total leaf nodes = 10002
Time taken = 257.25054693222046 s


In [190]:
# Making predictions
def _load_split(x_file, y_file):
    """Read one data split from `datapath`.

    Returns the features as a dense array and the labels as parsed by
    `np.genfromtxt` with dtype float.
    """
    features = read_datafile(datapath + x_file).toarray()
    labels = np.genfromtxt(datapath + y_file, dtype=float)
    return features, labels


# Each split is loaded, scored and reported before the next one is read.
X_train, y_train = _load_split('train_x.txt', 'train_y.txt')
y_preds, train_accuracy = clf.predict(X_train, y_train)
print("Training accuracy = {}%".format(train_accuracy))

X_test, y_test = _load_split('test_x.txt', 'test_y.txt')
y_preds, test_accuracy = clf.predict(X_test, y_test)
print("Test accuracy = {}%".format(test_accuracy))

X_valid, y_valid = _load_split('valid_x.txt', 'valid_y.txt')
y_preds, valid_accuracy = clf.predict(X_valid, y_valid)
print("Validation accuracy = {}%".format(valid_accuracy))

Training accuracy = 90.44086968615271%
Test accuracy = 77.97969496082703%
Validation accuracy = 77.73039124791396%


In [194]:
# Making predictions after pruning

# Observations start with the unpruned tree; one entry is appended after
# every successful pruning step.
obs_num_nodes = [clf.num_nodes]
obs_num_leaves = [clf.num_leaves]
obs_accuracies = [[train_accuracy, test_accuracy, valid_accuracy]]
while True:
    # Start the clock before pruning so "Time taken" covers the pruning step
    # itself, not only the re-evaluation that follows it.
    start = time.perf_counter()
    if not clf.prune_tree(X_valid, y_valid):
        # No subtree removal improves validation accuracy any further.
        break
    print("Tree depth = {}".format(clf.calc_maxdepth()))
    print("Total number of nodes = {}".format(clf.num_nodes))
    obs_num_nodes.append(clf.num_nodes)
    print("Total leaf nodes = {}".format(clf.num_leaves))
    obs_num_leaves.append(clf.num_leaves)
    y_preds, train_accuracy = clf.predict(X_train, y_train)
    print("Training accuracy = {}%".format(train_accuracy))
    y_preds, test_accuracy = clf.predict(X_test, y_test)
    print("Test accuracy = {}%".format(test_accuracy))
    y_preds, valid_accuracy = clf.predict(X_valid, y_valid)
    print("Validation accuracy = {}%".format(valid_accuracy))
    obs_accuracies.append([train_accuracy, test_accuracy, valid_accuracy])
    end = time.perf_counter()
    print("Time taken = {} s".format(end - start))
    print("")

obs_num_nodes = np.array(obs_num_nodes)
obs_num_leaves = np.array(obs_num_leaves)
obs_accuracies = np.array(obs_accuracies)

KeyboardInterrupt: 

In [None]:
# Write the pruning observations to disk: (destination, data, np.savetxt options).
_exports = (
    ('./../bin/Q1-b/All_nodes.txt', obs_num_nodes, {'fmt': '%d'}),
    ('./../bin/Q1-b/Leaves.txt', obs_num_leaves, {'fmt': '%d'}),
    ('./../bin/Q1-b/Accuracies.txt', obs_accuracies, {'delimiter': ',', 'fmt': '%.05f'}),
)
for _path, _values, _options in _exports:
    np.savetxt(_path, _values, **_options)

In [None]:
# Scratch cell: small NumPy experiments on boolean masks and medians.
# Only the final expression (`~merafilt`) is shown as the cell's output;
# the other bare expressions are evaluated and discarded.
merax = np.arange(9).reshape((3,3))
# Plain assignment binds a second name to the same array; no copy is made.
meraxcopy = merax
# Element-wise boolean mask with the same shape as merax.
merafilt = merax[:,:] >= 4
# Boolean-mask indexing returns the selected elements as a 1-D array.
merax2 = merax[merafilt]
# Element-wise negation of the mask (result discarded).
np.bitwise_not(merafilt)
meraa = np.array([1,2,3,4])
merab = np.array([1,2,3,5])
# Median over all elements of the array (result discarded).
np.median(np.array([[4,2,1,3]]))
# Multiplying by the mask zeroes the entries where it is False (result discarded).
merax*merafilt
# `~` on a boolean array negates it element-wise.
~merafilt
# meraa == merab