In [269]:
class decision_tree(classifier):

    def __init__(self, criterion = 'entropy'):
        self.criterion = criterion

    def gini(self, Y):
        size = len(Y)
        counts = dict()
        for y in Y:
            if y not in counts:
                counts[y] = 0.
            counts[y] += 1.
        gini = 0.
        for key in counts:
            prob = counts[key] / size
            gini += prob * (1-prob)
        return gini


    def entropy(self, Y):
        from math import log

        size = len(Y)
        counts = dict()
        for y in Y:
            if y not in counts:
                counts[y] = 0.
            counts[y] += 1.
        entropy = 0.
        for key in counts:
            prob = counts[key] / size
            entropy -= prob * log(prob,2)
        return entropy


    def split_data(self, X, Y, axis, value):
        return_x = []
        return_y = []

        for x, y in (zip(X, Y)):
            if x[axis] == value:
                reduced_x = x[:axis]
                reduced_x.extend(x[axis+1:])
                return_x.append(reduced_x)
                return_y.append(y)
        return return_x, return_y


    def choose_feature(self, X, Y):
        if self.criterion == 'entropy':
#             print("Entropy")
            entropy = self.entropy(Y)
            best_information_gain = 0.
            best_feature = -1
            for i in range(len(X[0])):  # For each feature
                feature_list = [x[i] for x in X]
                values = set(feature_list)
                entropy_i = 0.
                for value in values:
                    sub_x, sub_y = self.split_data(X, Y, i, value)
                    prob = len(sub_x) / float(len(X))
                    entropy_i += prob * self.entropy(sub_y)
                info_gain = entropy - entropy_i
                if info_gain > best_information_gain:
                    best_information_gain = info_gain
                    best_feature = i
            return best_feature
        else:
#             print("Gini")
            entropy = self.gini(Y)
            best_information_gain = 0.
            best_feature = -1
            for i in range(len(X[0])):  # For each feature
                feature_list = [x[i] for x in X]
                values = set(feature_list)
                entropy_i = 0.
                for value in values:
                    sub_x, sub_y = self.split_data(X, Y, i, value)
                    prob = len(sub_x) / float(len(X))
                    entropy_i += prob * self.gini(sub_y)
                info_gain = entropy - entropy_i
                if info_gain > best_information_gain:
                    best_information_gain = info_gain
                    best_feature = i
            return best_feature


    def class_dict(self, Y):
        classes = dict()
        for y in Y:
            if y not in classes:
                classes[y] = 0
            classes[y] += 1
        return classes


    def majority(self, Y):
        from operator import itemgetter
        # Use this function if a leaf cannot be split further and
        # ... the node is not pure

        classcount = self.class_dict(Y)
        sorted_classcount = sorted(classcount.items(), key=itemgetter(1), reverse=True)
        return sorted_classcount[0][0]


    def build_tree(self, X, Y):
#         print(type(X), type(Y))
        # IF there's only one instance or one class, don't continue to split
        if len(Y) <= 1 or len(self.class_dict(Y)) == 1:
#             print(Y)
            return Y[0]

        if len(X[0]) == 1:
            return self.majority(Y)   # TODO: Fix this

        best_feature = self.choose_feature(X, Y)
        if best_feature < 0 or best_feature >= len(X[0]):
            return self.majority(Y)

        this_tree = dict()
        feature_values = [example[best_feature] for example in X]
        unique_values = set(feature_values)
        for value in unique_values:
            # Build a node with each unique value:
            subtree_x, subtree_y = self.split_data(X, Y, best_feature, value)
            if best_feature not in this_tree:
                this_tree[best_feature] = dict()
            if value not in this_tree[best_feature]:
                this_tree[best_feature][value] = 0
            this_tree[best_feature][value] = self.build_tree(subtree_x, subtree_y)
        return this_tree


    def fit(self, X, Y):
        self.fittedTree = self.build_tree(X, Y)
        return self.fittedTree

    def predict(self, X):
        lista = []
        for i in range(len(X)):
            val = self.recursivecall(X[i], self.fittedTree)
            lista.append(val)
        return lista
        
    def recursivecall(self, X, tree):
        try:
            if isinstance(tree,int):
                return tree
            if not isinstance(tree,dict):
                return tree
            keys = tree.keys()
            for k in keys:
                a = X[k]
                if isinstance(tree[k],dict):
                    newtree = tree[k][a]
                    return self.recursivecall(X, newtree)
                else:
                    return tree[k]
        except:
            return ' <=50K.'

In [305]:
import numpy as np
import pandas as pd
import math
import operator
from classifier import classifier
from collections import defaultdict

class randomforest(classifier):
    def __init__(self, trees = 10, max_depth = -1):
        self.trees = trees
        self.max_depth = max_depth
        
    def fit(self, X, Y, num_trees):
        tree_list = create_list(num_trees)  # decision_tree list
        for t in tree_list:
            subsample_x, subsample_y = subsample(X, Y) # Bagging
            feature_list = sample_of_features(X) # Random features
            t.fit(subsample_x, subsample_y, feature_list)
 
    def predict(self, X):
        hypothesis_list = [t.predict(X) for t in tree_list]
        counts = defaultdict(int)
        for h in hypothesis_list:
            counts[h] += 1
        return sorted(counts.items(), reverse=True, key=lambda tup: tup[1])[:len(tree_list)][0][0]
    
    def subsample(self, X, Y):
        x = np.random.randint(0, len(X), size = 50)
        return X[x], Y[x]
 
    def sample_of_features(self, X):
        print(len(X[1]))
        x = np.random.choice(0, len(X[1]), size = 2, replace = False)
        return x

In [188]:
df = pd.read_csv("adulttrain.csv", header = None)

In [189]:
df.columns = ['age','workclass', 'fnlwgt', 'education', 'educationNum', 'maritalStatus', 'occupation', 'relationship', 'race', 'sex', 'capitalGain', 'capitalLoss', 'hoursPerWeek', 'nativeCountry', 'Income']
df.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,educationNum,maritalStatus,occupation,relationship,race,sex,capitalGain,capitalLoss,hoursPerWeek,nativeCountry,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K


In [211]:
age = df.iloc[:,0]
pd.cut(age, 5)

fnlwgt = df.iloc[:,2]
pd.cut(fnlwgt, 5)

educationNum = df.iloc[:,4]
pd.cut(educationNum, 5)

capitalGain = df.iloc[:,10]
pd.cut(capitalGain, 5)

capitalLoss = df.iloc[:,11]
pd.cut(capitalLoss, 5)

hoursPerWeek = df.iloc[:,12]
pd.cut(hoursPerWeek, 5)

0         (20.6, 40.2]
1        (0.902, 20.6]
2         (20.6, 40.2]
3         (20.6, 40.2]
4         (20.6, 40.2]
5         (20.6, 40.2]
6        (0.902, 20.6]
7         (40.2, 59.8]
8         (40.2, 59.8]
9         (20.6, 40.2]
10        (79.4, 99.0]
11        (20.6, 40.2]
12        (20.6, 40.2]
13        (40.2, 59.8]
14        (20.6, 40.2]
15        (40.2, 59.8]
16        (20.6, 40.2]
17        (20.6, 40.2]
18        (40.2, 59.8]
19        (40.2, 59.8]
20        (59.8, 79.4]
21       (0.902, 20.6]
22        (20.6, 40.2]
23        (20.6, 40.2]
24        (20.6, 40.2]
25        (20.6, 40.2]
26        (20.6, 40.2]
27        (59.8, 79.4]
28        (79.4, 99.0]
29        (20.6, 40.2]
             ...      
32531     (79.4, 99.0]
32532     (59.8, 79.4]
32533     (40.2, 59.8]
32534     (20.6, 40.2]
32535     (20.6, 40.2]
32536     (40.2, 59.8]
32537     (40.2, 59.8]
32538     (40.2, 59.8]
32539    (0.902, 20.6]
32540     (20.6, 40.2]
32541     (20.6, 40.2]
32542     (20.6, 40.2]
32543     (

In [275]:
train_data = df.iloc[:,:-1]
train_labels = df.iloc[:,-1]

train_data = train_data.values.tolist()
train_labels = train_labels.values.tolist()

decision = decision_tree()

tree  = decision.fit(train_data, train_labels)
# print('Tree :',tree)

In [276]:
from sklearn.metrics import accuracy_score

dftest = pd.read_csv("adulttest.csv")
dftest.columns = ['age','workclass', 'fnlwgt', 'education', 'educationNum', 'maritalStatus', 'occupation', 'relationship', 'race', 'sex', 'capitalGain', 'capitalLoss', 'hoursPerWeek', 'nativeCountry', 'Income']
dftest.head(2)

test_data = dftest.iloc[:,:-1]
test_labels = dftest.iloc[:,-1]

test_data = test_data.values.tolist()
test_labels = test_labels.values.tolist()

hyp = decision.predict(test_data)
# print(test_labels)
# print(hyp)

print('Accuracy : ', accuracy_score(test_labels,hyp))

Accuracy :  0.44324324324324327


In [None]:
randomforest = randomforest()
print(randomforest.sample_of_features(test_data))