# FINTECH BOOTCAMP - PROJECT 2
## Group 2 Notebook
---
By applying machine learning models, we examine (1) if selective technical indicators could predict the stock direction with statistically significant level (2) Which model is the best (3) Whether we could optimize the model (4) Which time frame the model could generate the best result. 

In [29]:
# Initial import all libraries and dependencies
import yfinance as yf
import matplotlib.dates as mdates
import panel as pn
import datetime
import numpy as np
import pandas as pd
import hvplot.pandas
from finta import TA
# from pandas_datareader import data
import matplotlib.pyplot as plt

# Ignore wanrings
import warnings
warnings.filterwarnings("ignore")


# Machine learning libraries
from sklearn.preprocessing import StandardScaler

### I. DATA FETCHING AND CLEANING

In [30]:
# Define the instruments to download data of a stock/ETF .
tickers = ["AAPL", "TSLA", "MSFT", "SPY", "..."] # to be decided

# Fetch SPY Data from 1/1/2017 until 12/31/2021 and choosing a interval
start_date = datetime.date(2017,1,1)
end_date = datetime.date(2021,12,31)
interval = '1d'

In [31]:
#Use pandas_reader.data.DataReader to load the desired data.
yf.Tickers(tickers[0])
panel_data = yf.download(tickers[0], start = start_date, end = end_date, interval = interval)

# Checkout the data type
type(panel_data)

[*********************100%***********************]  1 of 1 completed


pandas.core.frame.DataFrame

In [32]:
# Review data
# panel_data.tail(5) # code from hanna
panel_data # view head, tail, shape

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2017-01-03,28.950001,29.082500,28.690001,29.037500,27.297691,115127600
2017-01-04,28.962500,29.127501,28.937500,29.004999,27.267141,84472400
2017-01-05,28.980000,29.215000,28.952499,29.152500,27.405802,88774400
2017-01-06,29.195000,29.540001,29.117500,29.477501,27.711325,127007600
2017-01-09,29.487499,29.857500,29.485001,29.747499,27.965151,134247600
...,...,...,...,...,...,...
2021-12-23,175.850006,176.850006,175.270004,176.279999,176.055695,68356600
2021-12-27,177.089996,180.419998,177.070007,180.330002,180.100540,74919600
2021-12-28,180.160004,181.330002,178.529999,179.289993,179.061859,79144300
2021-12-29,179.330002,180.630005,178.139999,179.380005,179.151749,62348900


In [33]:
# Data description and check if null
def data_description(df):
    print("Data Information")
    print(df.info())
    print("-"*50)

In [34]:
data_description(panel_data) # if 0 null and OHLC is floating and Volumne is int, then data is clean to proceed to part II

Data Information
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1258 entries, 2017-01-03 to 2021-12-30
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Open       1258 non-null   float64
 1   High       1258 non-null   float64
 2   Low        1258 non-null   float64
 3   Close      1258 non-null   float64
 4   Adj Close  1258 non-null   float64
 5   Volume     1258 non-null   int64  
dtypes: float64(5), int64(1)
memory usage: 68.8 KB
None
--------------------------------------------------


In [35]:
# IF data is not clean then dropping null or convert datatype
# def data_cleaning(df):
#     df.dropna()

In [36]:
# Convert to ohlcv dataframe to be ready for finta
def ohlcv(df):
    del(df['Close'])
    df = df.rename(columns = {"Open": "open",'High' : 'high', 'Low' : "low", "Adj Close": "close", 'Volume': 'volume'},inplace = True)
    return df

In [37]:
ohlcv(panel_data)
panel_data

Unnamed: 0_level_0,open,high,low,close,volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-01-03,28.950001,29.082500,28.690001,27.297691,115127600
2017-01-04,28.962500,29.127501,28.937500,27.267141,84472400
2017-01-05,28.980000,29.215000,28.952499,27.405802,88774400
2017-01-06,29.195000,29.540001,29.117500,27.711325,127007600
2017-01-09,29.487499,29.857500,29.485001,27.965151,134247600
...,...,...,...,...,...
2021-12-23,175.850006,176.850006,175.270004,176.055695,68356600
2021-12-27,177.089996,180.419998,177.070007,180.100540,74919600
2021-12-28,180.160004,181.330002,178.529999,179.061859,79144300
2021-12-29,179.330002,180.630005,178.139999,179.151749,62348900


### II. DATA PROCESSING AND PREPARATION

In [38]:
# Timeframe for prediction
time_frame = [3,5,7]

# Identify stock direction
def stock_direction(df, days):# days is time frame
    direction = (df['close'].shift(-days) > df['close'])
    direction = direction.iloc[:-days]
    return direction.astype(int) #return y values

In [39]:
stock_direction(panel_data,time_frame[0]) # y values

Date
2017-01-03    1
2017-01-04    1
2017-01-05    1
2017-01-06    1
2017-01-09    1
             ..
2021-12-20    1
2021-12-21    1
2021-12-22    1
2021-12-23    1
2021-12-27    0
Name: close, Length: 1255, dtype: int64

In [40]:
# Using Finta calculate technical indicators
# Define key window to calculate for technical analysis 
window = [5,14,21,50]
def technical_indicators (df): # https://github.com/peerchemist/finta/blob/master/finta/finta.py
    x = pd.DataFrame()
    for n in range(len(window)) :  ### LOOPING DOES NOT SHOW(?)
        a = TA.BBANDS(df,window[n])
        b = TA.RSI(df,window[n])
        c = TA.PIVOT_FIB(df)
        d = TA.OBV(df)
        e = TA.SMA(df,window[n])
        f = TA.EMA(df,window[n])
        g = TA.ROC(df,window[n])
        k = TA.WILLIAMS(df,window[n])
        temp = pd.concat([a,b,c,d,e,f,g,k],axis = 1)
        x = pd.concat([x,temp],axis=1)
    return x

In [41]:
technical_indicators(panel_data)

Unnamed: 0_level_0,BB_UPPER,BB_MIDDLE,BB_LOWER,5 period RSI,pivot,s1,s2,s3,s4,r1,...,s4,r1,r2,r3,r4,OBV,50 period SMA,50 period EMA,ROC,50 Williams %R
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-03,,,,,,,,,,,...,,,,,,,,27.297691,,
2017-01-04,,,,0.000000,28.356731,28.206796,28.114166,27.964231,27.814296,28.506666,...,27.814296,28.506666,28.599296,28.749231,28.899166,-8.447240e+07,,27.282111,,
2017-01-05,,,,85.015366,28.444047,28.371467,28.326627,28.254047,28.181467,28.516627,...,28.181467,28.516627,28.561468,28.634048,28.706628,4.302000e+06,,27.325001,,
2017-01-06,,,,95.515629,28.524434,28.424158,28.362208,28.261933,28.161658,28.624709,...,28.161658,28.624709,28.686659,28.786935,28.887210,1.313096e+08,,27.427453,,
2017-01-09,28.130022,27.529422,26.928822,97.404438,28.789609,28.628213,28.528503,28.367108,28.205713,28.951004,...,28.205713,28.951004,29.050714,29.212109,29.373504,2.655572e+08,,27.543764,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-12-23,178.555870,172.939664,167.323457,63.064880,174.475499,173.058277,172.182715,170.765493,169.348270,175.892722,...,169.348270,175.892722,176.768284,178.185506,179.602729,6.060665e+09,158.594117,161.579139,25.283699,-14.953985
2021-12-27,182.643087,174.775323,166.907560,73.632224,176.058568,175.455008,175.082127,174.478566,173.875006,176.662129,...,173.875006,176.662129,177.035009,177.638570,178.242131,6.135585e+09,159.328771,162.305468,25.621307,-5.207762
2021-12-28,182.567197,176.680896,170.794595,67.438857,179.196849,177.917152,177.126554,175.846858,174.567161,180.476545,...,174.567161,180.476545,181.267143,182.546839,183.826536,6.056440e+09,160.021111,162.962582,23.965524,-7.873096
2021-12-29,182.118037,177.957269,173.796502,67.732461,179.640620,178.571019,177.910218,176.840617,175.771016,180.710221,...,175.771016,180.710221,181.371022,182.440623,183.510224,6.118789e+09,160.681141,163.597451,22.580551,-8.337783


In [42]:
def consol_data(df,days):
    consol_data = technical_indicators(df)
    consol_data["direction"] = stock_direction(df,days)
    consol_data.dropna(inplace = True)
    return consol_data

In [43]:
data = consol_data(panel_data,time_frame[0])
data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1203 entries, 2017-03-16 to 2021-12-27
Data columns (total 73 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   BB_UPPER        1203 non-null   float64
 1   BB_MIDDLE       1203 non-null   float64
 2   BB_LOWER        1203 non-null   float64
 3   5 period RSI    1203 non-null   float64
 4   pivot           1203 non-null   float64
 5   s1              1203 non-null   float64
 6   s2              1203 non-null   float64
 7   s3              1203 non-null   float64
 8   s4              1203 non-null   float64
 9   r1              1203 non-null   float64
 10  r2              1203 non-null   float64
 11  r3              1203 non-null   float64
 12  r4              1203 non-null   float64
 13  OBV             1203 non-null   float64
 14  5 period SMA    1203 non-null   float64
 15  5 period EMA    1203 non-null   float64
 16  ROC             1203 non-null   float64
 17  5 Williams %R  

Encoding data

In [44]:
# Create our features
X = data.copy()
X.drop("direction", axis = 1, inplace = True)
X.head()

Unnamed: 0_level_0,BB_UPPER,BB_MIDDLE,BB_LOWER,5 period RSI,pivot,s1,s2,s3,s4,r1,...,s4,r1,r2,r3,r4,OBV,50 period SMA,50 period EMA,ROC,50 Williams %R
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-03-16,33.356296,32.973841,32.591386,79.089463,34.366391,34.202131,34.100651,33.936391,33.772131,34.530652,...,33.772131,34.530652,34.632132,34.796392,34.960652,2059894000.0,30.572748,31.583613,21.653033,-32.394639
2017-03-17,33.368669,33.013966,32.659264,59.714189,34.50949,34.436909,34.392068,34.319487,34.246906,34.582071,...,34.246906,34.582071,34.626911,34.699492,34.772073,1884354000.0,30.68827,31.649022,21.183338,-35.093511
2017-03-20,33.551638,33.120658,32.689678,75.4809,34.421911,34.315906,34.250416,34.14441,34.038405,34.527916,...,34.038405,34.527916,34.593406,34.699411,34.805416,1970522000.0,30.807958,31.726615,21.836293,-31.718531
2017-03-21,33.464376,33.160784,32.857192,49.041413,34.607571,34.486285,34.411355,34.29007,34.168784,34.728856,...,34.168784,34.728856,34.803786,34.925072,35.046357,1812402000.0,30.913888,31.783407,19.113149,-43.317283
2017-03-22,33.567005,33.206104,32.845202,64.2907,34.546777,34.253591,34.072461,33.779275,33.48609,34.839963,...,33.48609,34.839963,35.021093,35.314279,35.607465,1915843000.0,31.022201,31.853852,19.365606,-37.726373


In [45]:
# Create our target
y = data["direction"].copy()
y.values.reshape(-1,1)

array([[0.],
       [1.],
       [0.],
       ...,
       [1.],
       [1.],
       [0.]])

In [46]:
# Checkout the balance of our target values
y.value_counts() # It's not that imbalance, ok to proceed further

1.0    710
0.0    493
Name: direction, dtype: int64

Split the data into Training and Testing

In [47]:
# Split the X and y into X_train, X_test, y_train, y_test
# Use 70% of the data for training and the remainder for testing
split = int(0.7 * len(X))
X_train = X[: split]
X_test = X[split:]
y_train = y[: split]
y_test = y[split:]

Scale the training and testing 

In [48]:
# Create the StandardScaler instance
scaler = StandardScaler()
# Fit the Standard Scaler with the training data
# # Scale the training and testing data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### III. CHOOSING MODELS AND TRAINING MODEL

#### 5. Model 5: Random Forest Classifier #2

Determine parameters for Random Forest
- - -

In [49]:
# imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [50]:
# instantiate model, set parameter vars to determine most accurate random forest
random_forest_2_0 = RandomForestClassifier(verbose=True)
param_rf = {
    "n_estimators":[5,10,50,100,250],
    "max_depth":[2,4,8,16,32,64,None]
}

In [56]:
# GridSearchCV model selection for cross-validation
from sklearn.model_selection import GridSearchCV

cross_validation = GridSearchCV(
    random_forest_2_0,
    param_rf,
    cv=5
)
cross_validation.fit(X_train,y_train.values.ravel())

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

GridSearchCV(cv=5, estimator=RandomForestClassifier(verbose=True),
             param_grid={'max_depth': [2, 4, 8, 16, 32, 64, None],
                         'n_estimators': [5, 10, 50, 100, 250]})

In [57]:
def display_cv(results_1):
    mean_score = results_1.cv_results_['mean_test_score']
    std_score = results_1.cv_results_['std_test_score']
    params = results_1.cv_results_['params']
    for mean,std,params in zip(mean_score,std_score,params):
        print(f'{round(mean,3)} + or -{round(std,3)} for the {params}')
    print("\n")
    print(f'Best parameters for Random Forest: {results_1.best_params_}')

display_cv(cross_validation)

0.546 + or -0.08 for the {'max_depth': 2, 'n_estimators': 5}
0.57 + or -0.042 for the {'max_depth': 2, 'n_estimators': 10}
0.572 + or -0.054 for the {'max_depth': 2, 'n_estimators': 50}
0.572 + or -0.054 for the {'max_depth': 2, 'n_estimators': 100}
0.575 + or -0.049 for the {'max_depth': 2, 'n_estimators': 250}
0.488 + or -0.117 for the {'max_depth': 4, 'n_estimators': 5}
0.53 + or -0.114 for the {'max_depth': 4, 'n_estimators': 10}
0.449 + or -0.154 for the {'max_depth': 4, 'n_estimators': 50}
0.478 + or -0.134 for the {'max_depth': 4, 'n_estimators': 100}
0.464 + or -0.143 for the {'max_depth': 4, 'n_estimators': 250}
0.432 + or -0.145 for the {'max_depth': 8, 'n_estimators': 5}
0.43 + or -0.141 for the {'max_depth': 8, 'n_estimators': 10}
0.408 + or -0.159 for the {'max_depth': 8, 'n_estimators': 50}
0.417 + or -0.168 for the {'max_depth': 8, 'n_estimators': 100}
0.418 + or -0.161 for the {'max_depth': 8, 'n_estimators': 250}
0.42 + or -0.137 for the {'max_depth': 16, 'n_estimators

Use Best parameters from GridSearchCV
- - -

In [58]:
# create model based on GridSearchCV best results
random_forest_2_1 = RandomForestClassifier(
    n_estimators = 250, 
    max_depth = 2,
    bootstrap = True,
    max_features = 'sqrt',
    oob_score = True,
    criterion = "gini",
    verbose=True,
    random_state = 42
)

# fit to training data
random_forest_2_1.fit(
    X_train,
    y_train
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:    0.3s finished


RandomForestClassifier(max_depth=2, max_features='sqrt', n_estimators=250,
                       oob_score=True, random_state=42, verbose=True)

In [59]:
# X_test data set used for predictions
y_pred = random_forest_2_1.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:    0.0s finished


In [60]:
# accuracy score based on GridSearchCV best results
accuracy_score_df=accuracy_score(
    y_test,
    random_forest_2_1.predict(X_test),
    normalize = True
)*100

print(accuracy_score_df,"% Accuracy Score")

56.232686980609415 % Accuracy Score


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 250 out of 250 | elapsed:    0.0s finished


In [61]:
# feature importance as pandas series
feature_importance = pd.Series(
    random_forest_2_1.feature_importances_,
    index=X.columns
).sort_values(ascending=False)

print(feature_importance.head(60))

ROC               0.058586
14 period RSI     0.049502
ROC               0.048116
21 Williams %R    0.044497
OBV               0.043256
OBV               0.040848
OBV               0.032688
14 Williams %R    0.032129
ROC               0.031082
BB_LOWER          0.027317
21 period RSI     0.026658
OBV               0.026392
BB_LOWER          0.025680
5 period RSI      0.024810
s4                0.021787
50 period SMA     0.021088
BB_LOWER          0.020690
s4                0.020478
s4                0.020287
50 period RSI     0.019604
50 Williams %R    0.017261
BB_UPPER          0.016907
5 Williams %R     0.016466
s4                0.015054
BB_LOWER          0.014980
ROC               0.014269
BB_UPPER          0.013504
BB_MIDDLE         0.013469
BB_UPPER          0.013063
s3                0.012445
BB_MIDDLE         0.012070
21 period SMA     0.011767
BB_MIDDLE         0.011504
s1                0.010513
14 period SMA     0.009420
r1                0.008994
21 period EMA     0.008938
B

### IV. ANALYSIS AND EVALUATION (TEAM WORK)

### V. DEPLOYING MODEL (TEAM WORK)

### VI. CONCLUSION (TEAM WORK)