### Set Up

In [167]:
import pandas as pd
import numpy as np
import seaborn as sns  
from pylab import mpl, plt 

In [168]:
plt.style.use('seaborn')
mpl.rcParams['font.family'] = 'serif'
%matplotlib inline

In [169]:
bova = pd.read_csv('bova11_TA_v1.csv',index_col=0)

In [170]:
bova.head()

Unnamed: 0,open,high,low,adjusted close,volume,RSI_14,STO_14,CHO,returns
2010-01-04,68.55,69.45,68.3,69.37,168379,,,,
2010-01-05,69.46,70.03,69.32,69.9,179175,,,,0.00764
2010-01-06,69.74,70.32,69.7,70.3,187683,,,,0.005722
2010-01-07,69.52,70.13,69.52,70.0,105126,,,,-0.004267
2010-01-08,69.41,70.0,69.31,69.48,177077,,,,-0.007429


In [171]:
bova = bova.rename(columns={"adjusted close": "adj_close","returns": "return"}) 

### Creatimg  new features

In [172]:
#Close price of N day before
bova['5d_close']= bova['adj_close'].shift(5)
bova['10d_close']= bova['adj_close'].shift(10)
bova['15d_close']= bova['adj_close'].shift(15)

In [173]:
#Rolling volatility of a window  of N days
bova['SD_close_5d']= bova['return'].rolling(5).std()*np.sqrt(252)
bova['SD_close_10d']= bova['return'].rolling(10).std()*np.sqrt(252)
bova['SD_close_15d']= bova['return'].rolling(15).std()*np.sqrt(252)

In [174]:
#My goal is to predict if the return on the next day will be positive
bova['Target'] = np.where(bova['return'] > 0 ,1,0)

In [175]:
bova.head()

Unnamed: 0,open,high,low,adj_close,volume,RSI_14,STO_14,CHO,return,5d_close,10d_close,15d_close,SD_close_5d,SD_close_10d,SD_close_15d,Target
2010-01-04,68.55,69.45,68.3,69.37,168379,,,,,,,,,,,0
2010-01-05,69.46,70.03,69.32,69.9,179175,,,,0.00764,,,,,,,1
2010-01-06,69.74,70.32,69.7,70.3,187683,,,,0.005722,,,,,,,1
2010-01-07,69.52,70.13,69.52,70.0,105126,,,,-0.004267,,,,,,,0
2010-01-08,69.41,70.0,69.31,69.48,177077,,,,-0.007429,,,,,,,0


In [176]:
df = bova.dropna()

In [177]:
df.head()

Unnamed: 0,open,high,low,adj_close,volume,RSI_14,STO_14,CHO,return,5d_close,10d_close,15d_close,SD_close_5d,SD_close_10d,SD_close_15d,Target
2010-01-26,65.05,65.36,64.12,64.98,457095,24.549947,0.0,-153660.903303,-0.0093,68.9,69.99,69.37,0.139092,0.146615,0.149983,0
2010-01-27,64.99,65.0,63.9,64.9,444017,24.291395,0.0,7344.691172,-0.001231,68.12,69.7,69.9,0.157646,0.149112,0.141427,0
2010-01-28,65.06,66.49,63.98,65.2,243961,27.380053,5.882353,70378.105506,0.004622,67.7,70.0,70.3,0.188105,0.149799,0.140004,1
2010-01-29,65.5,65.99,64.7,64.9,129331,26.227741,0.0,61375.649488,-0.004601,65.9,69.5,70.0,0.081866,0.150292,0.139981,0
2010-02-01,65.4,66.16,64.79,66.05,172290,37.147644,22.54902,98129.628841,0.01772,65.59,68.5,69.48,0.16538,0.186797,0.167377,1


### Splitting data

In [178]:
df.head()

Unnamed: 0,open,high,low,adj_close,volume,RSI_14,STO_14,CHO,return,5d_close,10d_close,15d_close,SD_close_5d,SD_close_10d,SD_close_15d,Target
2010-01-26,65.05,65.36,64.12,64.98,457095,24.549947,0.0,-153660.903303,-0.0093,68.9,69.99,69.37,0.139092,0.146615,0.149983,0
2010-01-27,64.99,65.0,63.9,64.9,444017,24.291395,0.0,7344.691172,-0.001231,68.12,69.7,69.9,0.157646,0.149112,0.141427,0
2010-01-28,65.06,66.49,63.98,65.2,243961,27.380053,5.882353,70378.105506,0.004622,67.7,70.0,70.3,0.188105,0.149799,0.140004,1
2010-01-29,65.5,65.99,64.7,64.9,129331,26.227741,0.0,61375.649488,-0.004601,65.9,69.5,70.0,0.081866,0.150292,0.139981,0
2010-02-01,65.4,66.16,64.79,66.05,172290,37.147644,22.54902,98129.628841,0.01772,65.59,68.5,69.48,0.16538,0.186797,0.167377,1


In [179]:
df['Target'] = df['Target'].shift(-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Target'] = df['Target'].shift(-1)


In [180]:
df.head()

Unnamed: 0,open,high,low,adj_close,volume,RSI_14,STO_14,CHO,return,5d_close,10d_close,15d_close,SD_close_5d,SD_close_10d,SD_close_15d,Target
2010-01-26,65.05,65.36,64.12,64.98,457095,24.549947,0.0,-153660.903303,-0.0093,68.9,69.99,69.37,0.139092,0.146615,0.149983,0.0
2010-01-27,64.99,65.0,63.9,64.9,444017,24.291395,0.0,7344.691172,-0.001231,68.12,69.7,69.9,0.157646,0.149112,0.141427,1.0
2010-01-28,65.06,66.49,63.98,65.2,243961,27.380053,5.882353,70378.105506,0.004622,67.7,70.0,70.3,0.188105,0.149799,0.140004,0.0
2010-01-29,65.5,65.99,64.7,64.9,129331,26.227741,0.0,61375.649488,-0.004601,65.9,69.5,70.0,0.081866,0.150292,0.139981,1.0
2010-02-01,65.4,66.16,64.79,66.05,172290,37.147644,22.54902,98129.628841,0.01772,65.59,68.5,69.48,0.16538,0.186797,0.167377,1.0


In [181]:
df.dropna()

Unnamed: 0,open,high,low,adj_close,volume,RSI_14,STO_14,CHO,return,5d_close,10d_close,15d_close,SD_close_5d,SD_close_10d,SD_close_15d,Target
2010-01-26,65.05,65.36,64.12,64.98,457095,24.549947,0.000000,-1.536609e+05,-0.009300,68.90,69.99,69.37,0.139092,0.146615,0.149983,0.0
2010-01-27,64.99,65.00,63.90,64.90,444017,24.291395,0.000000,7.344691e+03,-0.001231,68.12,69.70,69.90,0.157646,0.149112,0.141427,1.0
2010-01-28,65.06,66.49,63.98,65.20,243961,27.380053,5.882353,7.037811e+04,0.004622,67.70,70.00,70.30,0.188105,0.149799,0.140004,0.0
2010-01-29,65.50,65.99,64.70,64.90,129331,26.227741,0.000000,6.137565e+04,-0.004601,65.90,69.50,70.00,0.081866,0.150292,0.139981,1.0
2010-02-01,65.40,66.16,64.79,66.05,172290,37.147644,22.549020,9.812963e+04,0.017720,65.59,68.50,69.48,0.165380,0.186797,0.167377,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-24,118.26,119.50,117.96,119.26,5788644,63.116431,100.000000,5.440808e+06,0.010592,118.31,117.14,114.68,0.080946,0.165261,0.172431,0.0
2021-05-25,119.89,119.97,118.05,118.40,5697654,58.676499,78.446115,4.354429e+06,-0.007211,118.19,118.13,113.27,0.102653,0.166428,0.165613,1.0
2021-05-26,118.94,119.58,118.79,119.24,4716894,61.523545,99.498747,3.723120e+06,0.007095,118.01,115.27,115.15,0.110558,0.097280,0.155872,1.0
2021-05-27,119.46,119.83,118.79,119.59,6038746,62.677378,100.000000,4.161002e+06,0.002935,117.91,116.04,115.43,0.107432,0.095556,0.155889,1.0


In [182]:
# Model Data base - where I will train and test the Decision Tree
df_ModelDB = df[:'2021']

In [183]:
df_ModelDB.tail()

Unnamed: 0,open,high,low,adj_close,volume,RSI_14,STO_14,CHO,return,5d_close,10d_close,15d_close,SD_close_5d,SD_close_10d,SD_close_15d,Target
2020-12-22,111.62,112.54,111.27,112.25,5273026,65.935933,70.205479,2793871.0,0.007359,111.76,109.44,107.05,0.200033,0.179746,0.150958,1.0
2020-12-23,112.55,113.91,112.52,113.14,4856930,68.046601,83.716475,2273467.0,0.007929,113.35,108.77,107.75,0.177391,0.174424,0.151615,1.0
2020-12-28,114.0,114.77,113.77,114.7,5433712,71.392566,100.0,3340756.0,0.013788,113.99,110.7,108.15,0.202311,0.166655,0.157612,1.0
2020-12-29,115.0,115.4,114.25,114.97,4278894,71.940228,100.0,3816996.0,0.002354,113.5,110.79,109.43,0.196114,0.166111,0.153846,0.0
2020-12-30,115.32,115.7,114.5,114.65,7741592,70.224328,94.83871,1817396.0,-0.002783,111.43,110.2,109.31,0.099231,0.162673,0.154873,0.0


In [184]:
#validation dataset
df_validation = df['2021':]

### Model implementation

In [185]:
# split out validation dataset for the end
from sklearn.model_selection import train_test_split

In [186]:
X =  df_ModelDB.iloc[:, :-1].drop('return',axis=1)

In [187]:
y = df_ModelDB.iloc[:, -1]

In [189]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y,random_state=42)

In [191]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [192]:
dt = DecisionTreeClassifier(max_depth=5,random_state=1)

In [193]:
dt.fit(X_train,y_train)

DecisionTreeClassifier(max_depth=5, random_state=1)

In [194]:
y_pred = dt.predict(X_test)

In [195]:
accuracy_score(y_test, y_pred)

0.5166461159062885