In [1]:
import numpy as np
import os

#Change to suitable path
path = r"C:\Users\thaovv\Program\MachineLearning\algorithms\decisiontree" 
os.chdir(path)

In [2]:
from decisiontree import DecisionTree, TreeNode

In [3]:
from scipy.io import arff

data = arff.loadarff(r"C:\Users\thaovv\Program\MachineLearning\Dataset\BMW.arff")[0] #arff[0] = data, arff[1] = description of data
data = np.array(data.tolist(), dtype=np.int8) #Convert complicated datatype to numpy array dtype

#Split dataset and labels
dataset = data[:, 0:7]
labels = data[:, 7] 

In [4]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test= train_test_split(dataset, labels, test_size=0.3, random_state=42)

In [5]:
class RandomForest:
    def __init__(self, n_trees=20, min_samples_split=3, max_depth=20, min_gain=1e-4, n_features = 4):
        self.n_trees = n_trees
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.min_gain = 1e-4
        self.n_features = n_features
        
    def fit(self, X, y):
        self.trees = []
        for _ in np.arange(self.n_trees):
            treei = DecisionTree(self.max_depth, self.min_gain, self.min_samples_split, n_features=self.n_features)
            Xi, yi = self.bootstrap_sample(X, y)
            treei.fit(Xi, yi)
            self.trees.append(treei)
            
    def predict(self, X):
        #Get prediction for each tree
        tree_preds = np.array([tree.predict(X) for tree in self.trees]).T
        #Majority votes
        label_type = np.unique(tree_preds)
        label_predicts = np.zeros((X.shape[0], )) 
        label_counts = np.zeros((X.shape[0], ))
        for label in label_type:
            label_count_i = np.sum(tree_preds == label, axis=1)
            label_predicts = np.where(label_counts < label_count_i, label, label_predicts)
        return label_predicts   
    
    def bootstrap_sample(self, X, y):
        n_samples = X.shape[0]
        idxs = np.random.choice(n_samples, size=n_samples, replace=True) #Get random number from 0 -> n_sample - 1, with redundance
        return X[idxs], y[idxs]

In [6]:
import time

for n_features in np.arange(1, x_train.shape[1] + 1):
    start = time.time()
    my_model = RandomForest(n_trees=5, max_depth=14, n_features=n_features)
    my_model.fit(x_train, y_train)
    end = time.time()
    design_time = end - start

    start = time.time()
    my_predict = my_model.predict(x_test)
    end = time.time()
    inference_time = end - start

    print("max_depth: {}".format(n_features))
    print("Performance Decision Tree   : {}".format(np.sum(np.equal(my_predict, y_test.reshape(y_test.shape[0],)))))
    print("Design time Decision Tree   : {}".format(design_time))
    print("Inference time Decision Tree: {}".format(inference_time))

max_depth: 1
Performance Decision Tree   : 18
Design time Decision Tree   : 0.011993408203125
Inference time Decision Tree: 0.001995563507080078
max_depth: 2
Performance Decision Tree   : 20
Design time Decision Tree   : 0.01998114585876465
Inference time Decision Tree: 0.0019979476928710938
max_depth: 3
Performance Decision Tree   : 21
Design time Decision Tree   : 0.02698373794555664
Inference time Decision Tree: 0.0010161399841308594
max_depth: 4
Performance Decision Tree   : 19
Design time Decision Tree   : 0.021987438201904297
Inference time Decision Tree: 0.0009984970092773438
max_depth: 5
Performance Decision Tree   : 20
Design time Decision Tree   : 0.029979705810546875
Inference time Decision Tree: 0.0
max_depth: 6
Performance Decision Tree   : 22
Design time Decision Tree   : 0.037978410720825195
Inference time Decision Tree: 0.00099945068359375
max_depth: 7
Performance Decision Tree   : 25
Design time Decision Tree   : 0.04054069519042969
Inference time Decision Tree: 0.0009