In [63]:
import pandas as pd
import numpy as np
import random
import math
from pandas import read_csv

stock_data = pd.read_csv(r'C:\Users\Zz\Desktop\dataset.csv', header = 0, index_col = 0)
dataset = stock_data.values
X = stock_data[['Vol-5d', 'Return']]
Y = stock_data['signal']
print(Y.head())
print(X.head())
Y= Y.apply(np.int64)
X_train = X[:3000]
y_train = Y[:3000].values
X_test = X[3000:]
y_test = Y[3000:].values

Date
2002-01-23    0.0
2002-01-24    0.0
2002-01-25    0.0
2002-01-28    0.0
2002-01-29    0.0
Name: signal, dtype: float64
              Vol-5d    Return
Date                          
2002-01-23  0.050050  0.054994
2002-01-24  0.035992  0.008254
2002-01-25  0.030221  0.001725
2002-01-28  0.004042  0.000860
2002-01-29  0.005726 -0.008596


In [64]:
def entropy(p):
    if p == 0 or p == 1:
        return 0
    else:
        return - (p * np.log2(p) + (1 - p) * math.log2(1-p))

def information_gain(l, r):
    parent = l + r
    if len(parent) > 0:
        parent_p = parent.count(1) / len(parent)
    else:
        parent_p = 0
    if len(l) > 0:
        l_p = l.count(1) / len(l)
    else:
        l_p = 0
    if len(r) > 0:
        r_p = r.count(1) / len(r)
    else:
        r_p = 0    
    
    parent_informationGain = entropy(parent_p)
    l_informationGain = entropy(l_p)
    r_informationGain = entropy(r_p)
    return parent_informationGain - len(l) / len(parent) * l_informationGain - len(r) / len(parent) * r_informationGain

In [65]:
def draw_bootstrap(X_train, y_train):
    bootstrap_indices = list(np.random.choice(range(len(X_train)), len(X_train), replace = True))
    X_bootstrap = X_train.iloc[bootstrap_indices].values
    y_bootstrap = y_train[bootstrap_indices]
    return X_bootstrap, y_bootstrap

In [66]:
def find_split_point(X_bootstrap, y_bootstrap, max_features):
    feature_ls = list()
    num_features = len(X_bootstrap[0])

    while len(feature_ls) <= max_features:
        index = random.sample(range(num_features), 1)
        if index not in feature_ls:
            feature_ls.extend(index)

    best_informationGain = -math.inf
    node = None
    for index in feature_ls:
        for split_point in X_bootstrap[:,index]:
            l = {'X_bootstrap': [], 'y_bootstrap': []}
            r = {'X_bootstrap': [], 'y_bootstrap': []}      
        if type(split_point) in [int, float]:
            for i, value in enumerate(X_bootstrap[:,index]):
                if value > split_point:
                    r['X_bootstrap'].append(X_bootstrap[i])
                    r['y_bootstrap'].append(y_bootstrap[i])
                else:
                    l['X_bootstrap'].append(X_bootstrap[i])
                    l['y_bootstrap'].append(y_bootstrap[i])
        else:
            for i, value in enumerate(X_bootstrap[:,index]):
                if value == split_point:
                    l['X_bootstrap'].append(X_bootstrap[i])
                    l['y_bootstrap'].append(y_bootstrap[i])
                else:
                    r['X_bootstrap'].append(X_bootstrap[i])
                    r['y_bootstrap'].append(y_bootstrap[i])

        split_informationGain = information_gain(l['y_bootstrap'], r['y_bootstrap'])
        if split_informationGain > best_informationGain:
            best_informationGain = split_informationGain
            l['X_bootstrap'] = np.array(l['X_bootstrap'])
            r['X_bootstrap'] = np.array(r['X_bootstrap'])
            node = {'information_gain': split_informationGain,
                    'left_child': l,
                    'right_child': r,
                    'split_point': split_point,
                    'index': index}
    return node

In [67]:
def terminal_node(node):
    y_bootstrap = node['y_bootstrap']
    pred = max(y_bootstrap, key = y_bootstrap.count)
    return pred

def split_node(node, max_features, min_samples_split, max_depth, depth):
    l = node['left_child']
    r = node['right_child']

    del(node['left_child'])
    del(node['right_child'])

    if len(l['y_bootstrap']) == 0 or len(r['y_bootstrap']) == 0:
        empty_child = {'y_bootstrap': l['y_bootstrap'] + r['y_bootstrap']}
        node['left_split'] = terminal_node(empty_child)
        node['right_split'] = terminal_node(empty_child)
        return

    if depth >= max_depth:
        node['left_split'] = terminal_node(l)
        node['right_split'] = terminal_node(r)
        return node

    if len(l['X_bootstrap']) <= min_samples_split:
        node['left_split'] = node['right_split'] = terminal_node(l)
    else:
        node['left_split'] = find_split_point(l['X_bootstrap'], l['y_bootstrap'], max_features)
        split_node(node['left_split'], max_depth, min_samples_split, max_depth, depth + 1)
    if len(r['X_bootstrap']) <= min_samples_split:
        node['right_split'] = node['left_split'] = terminal_node(r)
    else:
        node['right_split'] = find_split_point(r['X_bootstrap'], r['y_bootstrap'], max_features)
        split_node(node['right_split'], max_features, min_samples_split, max_depth, depth + 1)

In [68]:
def build_tree(X_bootstrap, y_bootstrap, max_depth, min_samples_split, max_features):
    root_node = find_split_point(X_bootstrap, y_bootstrap, max_features)
    split_node(root_node, max_features, min_samples_split, max_depth, 1)
    return root_node

def random_forest(X_train, y_train, n_estimators, max_features, max_depth, min_samples_split):
    tree_ls = list()
    oob_ls = list()
    for i in range(n_estimators):
        X_bootstrap, y_bootstrap = draw_bootstrap(X_train, y_train)
        tree = build_tree(X_bootstrap, y_bootstrap, max_features, max_depth, min_samples_split)
        tree_ls.append(tree)
    return tree_ls

In [69]:
def predict_tree(tree, X_test):
    index = tree['index']

    if X_test[index] <= tree['split_point']:
        if type(tree['left_split']) == dict:
            return predict_tree(tree['left_split'], X_test)
        else:
            value = tree['left_split']
            return value
    else:
        if type(tree['right_split']) == dict:
            return predict_tree(tree['right_split'], X_test)
        else:
            return tree['right_split']

In [70]:
def predict_rf(tree_ls, X_test):
    pred_ls = list()
    for i in range(len(X_test)):
        ensemble_preds = [predict_tree(tree, X_test.values[i]) for tree in tree_ls]
        final_pred = max(ensemble_preds, key = ensemble_preds.count)
        pred_ls.append(final_pred)
    return np.array(pred_ls)

In [71]:
n_estimators = 1000
max_features = 2
max_depth = 10
min_samples_split = 2

model = random_forest(X_train, y_train, n_estimators=1000, max_features = 2, max_depth=10, min_samples_split=2)
pred = predict_rf(model, X_test)
accuracy = sum(pred == y_test) / len(y_test)
print(accuracy)

0.5422288855572214
