In [57]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from scipy.stats import uniform, randint

from models.tools import create_x_y, create_vectors


In [58]:
def clean_cols(columns):
    """Return the column labels without angle brackets and in lower case.

    ``columns`` is expected to expose the pandas ``.str`` accessor
    (e.g. ``DataFrame.columns``); a label such as ``"<CLOSE>"`` becomes
    ``"close"``.
    """
    cleaned = columns
    # Remove the opening bracket first, then the closing one.
    for bracket in ("<", ">"):
        cleaned = cleaned.str.replace(bracket, "")
    return cleaned.str.lower()

In [68]:
# Load the raw AMZN bars; the CSV headers arrive wrapped in angle brackets
# (e.g. "<CLOSE>") and are normalised in the next cell.
# NOTE(review): a later cell reads "datasets/relevant/amzn.csv" instead —
# confirm which file this notebook is meant to start from.
df = pd.read_csv("amzn.csv")
df.head()

Unnamed: 0,<TICKER>,<PER>,<DATE>,<TIME>,<OPEN>,<HIGH>,<LOW>,<CLOSE>,<VOL>
0,US1.AMZN,60,02/01/15,09:00,312.58,314.75,312.11,313.52,404201
1,US1.AMZN,60,02/01/15,10:00,313.25,313.59,308.63,308.96,509208
2,US1.AMZN,60,02/01/15,11:00,308.84,310.12,306.9601,309.75,427854
3,US1.AMZN,60,02/01/15,12:00,309.77,309.93,308.05,308.4,209933
4,US1.AMZN,60,02/01/15,13:00,308.41,308.48,307.25,308.35,158209


In [69]:
# Preprocess the raw bars: normalise headers, rescale prices, compute the
# bar-to-bar percentage change of the close and bucket it into three classes.
# NOTE: this cell rebinds `df` and is NOT idempotent — running it twice divides
# the prices by PRICE_SCALE twice and drops another row. Re-run the load cell
# before re-running this one.
PRICE_SCALE = 20  # NOTE(review): presumably a 20-for-1 split adjustment — confirm
FLAT_BAND_PCT = 0.25  # half-width (in percent) of the band that gets label 0
PRICE_COLS = ["open", "high", "low", "close"]

df.columns = clean_cols(df.columns)
df[PRICE_COLS] = df[PRICE_COLS] / PRICE_SCALE
df["diffs"] = df.close.pct_change() * 100  # change vs. the previous row, in percent
# The first row has no previous close, so its `diffs` is NaN and it is dropped here.
df = df.dropna()
# Open-ended outer bins: with finite outer edges (-100 / 100) any change outside
# that range matched no interval and silently became a NaN label.
# Intervals are right-closed: (-inf, -0.25] -> -1, (-0.25, 0.25] -> 0, (0.25, inf) -> 1.
df["bin_3"] = pd.cut(
    df["diffs"],
    bins=[-np.inf, -FLAT_BAND_PCT, FLAT_BAND_PCT, np.inf],
    labels=[-1, 0, 1],
)
df.head()

Unnamed: 0,ticker,per,date,time,open,high,low,close,vol,diffs,bin_3
1,US1.AMZN,60,02/01/15,10:00,15.6625,15.6795,15.4315,15.448,509208,-1.454453,-1
2,US1.AMZN,60,02/01/15,11:00,15.442,15.506,15.348005,15.4875,427854,0.255697,1
3,US1.AMZN,60,02/01/15,12:00,15.4885,15.4965,15.4025,15.42,209933,-0.435835,-1
4,US1.AMZN,60,02/01/15,13:00,15.4205,15.424,15.3625,15.4175,158209,-0.016213,0
5,US1.AMZN,60,02/01/15,14:00,15.4105,15.433,15.386,15.401,187600,-0.107021,0


In [71]:
# Inspect column dtypes and non-null counts of the current `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10559 entries, 1 to 10559
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   ticker  10559 non-null  object  
 1   per     10559 non-null  int64   
 2   date    10559 non-null  object  
 3   time    10559 non-null  object  
 4   open    10559 non-null  float64 
 5   high    10559 non-null  float64 
 6   low     10559 non-null  float64 
 7   close   10559 non-null  float64 
 8   vol     10559 non-null  int64   
 9   diffs   10559 non-null  float64 
 10  bin_3   10559 non-null  category
dtypes: category(1), float64(5), int64(2), object(3)
memory usage: 917.9+ KB


In [62]:
def measure(x, y, train_frac=0.8, n_splits=5, n_jobs=4, random_state=0):
    """Grid-search a decision tree on the leading rows and score it on the rest.

    The samples are split by position, without shuffling (rows are assumed to
    be in time order — confirm against the caller): the first ``train_frac``
    of the rows feed a ``GridSearchCV`` that is cross-validated with
    ``TimeSeriesSplit``; the remaining rows are a hold-out set scored with the
    refitted best estimator.

    Parameters
    ----------
    x, y : sliceable sequences of equal length
        Feature rows and their class labels.
    train_frac : float, default 0.8
        Fraction of leading rows used for the grid search.
    n_splits : int, default 5
        Number of ``TimeSeriesSplit`` folds.
    n_jobs : int, default 4
        Parallel workers handed to ``GridSearchCV``.
    random_state : int or None, default 0
        Seed for the tree. Decision trees permute the candidate features at
        every split, so an unseeded tree can score differently on every run;
        pass ``None`` to get the unseeded behaviour back.

    Returns
    -------
    cm : ndarray
        Confusion matrix of hold-out labels (rows) vs. predictions (columns).
    acc : float
        Fraction of hold-out predictions that equal the true label.
    """
    tree = DecisionTreeClassifier(random_state=random_state)

    param_grid = {
        "max_depth": [2, 3, 5, 10, 20, 50, 100, 200],
        "min_samples_leaf": [2, 3, 5, 10],
        "criterion": ["gini", "entropy"],
    }

    tree_search = GridSearchCV(
        tree,
        param_grid=param_grid,
        cv=TimeSeriesSplit(n_splits=n_splits),
        verbose=1,
        n_jobs=n_jobs,
    )

    split = int(train_frac * len(x))
    x_train, x_test = x[:split], x[split:]
    y_train, y_test = y[:split], y[split:]

    tree_search.fit(x_train, y_train)
    y_pred = tree_search.predict(x_test)

    cm = confusion_matrix(y_test, y_pred)
    # Vectorised comparison instead of a Python-level sum over the array.
    acc = np.mean(np.asarray(y_pred) == np.asarray(y_test))

    return cm, acc

In [63]:
# Disabled experiment: tuned-tree confusion matrix / accuracy for several lags.
# acc = {}
# cm = {}
# # df = pd.read_csv("datasets/relevant/amzn.csv")
# for lag in [3, 16]:
#     # x, y = create_x_y(
#     #     df, x_cols=["open", "high", "low", "close", "vol"], y_col="bin_3", lag=lag
#     # )
#     x, y = create_vectors(df, main_col='bin_3', lag=lag)
#     cm[lag], acc[lag] = measure(x, y)

In [64]:
# Baseline: an untuned decision tree on the vectors built from the current `df`
# (which must already contain the `bin_3` column).
# NOTE(review): `lag=17` is presumably the look-back window length — confirm
# against models.tools.create_vectors.
x, y = create_vectors(df, main_col='bin_3', lag=17)
# Fixed seed: the tree permutes candidate features at every split, so an
# unseeded fit can give a different score on every run.
clf = DecisionTreeClassifier(random_state=0)
split = int(0.8 * len(x))  # positional 80/20 split, no shuffling
clf.fit(x[:split], y[:split])
y_pred = clf.predict(x[split:])
cm = confusion_matrix(y[split:], y_pred)
# Vectorised accuracy instead of a Python-level sum over the array.
acc = np.mean(np.asarray(y_pred) == np.asarray(y[split:]))
print(cm, acc)

[[166 235 164]
 [240 448 231]
 [170 275 180]] 0.3764817449027975


In [67]:
# Same untuned baseline, but on the frame stored in datasets/relevant/amzn.csv.
# NOTE: this rebinds `df`, replacing whatever frame the earlier cells built.
df = pd.read_csv("datasets/relevant/amzn.csv")
x, y = create_vectors(df, main_col='bin_3', lag=17)
# Fixed seed: the tree permutes candidate features at every split, so an
# unseeded fit can give a different score on every run.
clf = DecisionTreeClassifier(random_state=0)
split = int(0.8 * len(x))  # positional 80/20 split, no shuffling
clf.fit(x[:split], y[:split])
y_pred = clf.predict(x[split:])
cm = confusion_matrix(y[split:], y_pred)
# Vectorised accuracy instead of a Python-level sum over the array.
acc = np.mean(np.asarray(y_pred) == np.asarray(y[split:]))
print(cm, acc)

[[114 151 119]
 [225 488 222]
 [ 97 187 117]] 0.4180232558139535


In [66]:
# Peek at the first ten closing prices of the current `df`.
df["close"].head(10)

0    15.4480
1    15.4875
2    15.4200
3    15.4175
4    15.4010
5    15.4260
6    15.1915
7    15.1645
8    15.1530
9    15.0805
Name: close, dtype: float64