# Stock purchase recommendations with Machine Learning - Model Training

In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from tqdm import tqdm, tqdm_notebook # progress bar
import fastparquet
import pickle

from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier

In [3]:
# pd.set_option('display.max_columns', 1500)

## 1) Load Training and Test Data

In [3]:
# load the training and test data from the feature engineering step
X_train = fastparquet.ParquetFile('data/processed/X_train.parq').to_pandas()
X_test = fastparquet.ParquetFile('data/processed/X_test.parq').to_pandas()

# NOTE: pickle.load can execute arbitrary code - only load files produced by this project.
# The context managers guarantee that the file handles are closed again.
with open('data/processed/y_train.pkl', 'rb') as y_train_file:
    y_train = pickle.load(y_train_file)
with open('data/processed/y_test.pkl', 'rb') as y_test_file:
    y_test = pickle.load(y_test_file)

# features and labels are matched by position, so the row counts have to agree
assert len(X_train) == len(y_train), "X_train / y_train row count mismatch"
assert len(X_test) == len(y_test), "X_test / y_test row count mismatch"

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7024, 693), (1756, 693), (7024,), (1756,))

In [4]:
# quick inspection: the last rows of the training features
# (one row per date and ticker; the ticker is one-hot encoded in the trailing columns)
X_train.tail()

Unnamed: 0_level_0,AdjVolume-0,AdjVolume-1,AdjVolume-2,AdjVolume-3,AdjVolume-4,AdjVolume-5,AdjVolume-6,AdjVolume-7,AdjVolume-8,AdjVolume-9,...,weekday,day,AAPL.US,ABBV.US,AMZN.US,CSCO.US,GE.US,INTC.US,MSFT.US,NFLX.US
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-05-10,1.0,0.855512,0.860082,0.885949,0.82251,0.968459,0.814462,0.889968,1.065802,2.269711,...,2,10,0,1,0,0,0,0,0,0
2017-05-10,1.0,1.066204,0.762742,0.850528,0.79176,0.827486,0.821882,0.769241,1.147349,1.519654,...,2,10,0,0,0,1,0,0,0,0
2017-05-10,1.0,1.400148,1.883068,1.043509,0.906711,1.758551,1.548577,1.278464,0.788735,0.543387,...,2,10,1,0,0,0,0,0,0,0
2017-05-10,1.0,1.550622,1.617839,1.37109,1.14276,1.701144,1.824343,2.599446,3.493653,2.042627,...,2,10,0,0,1,0,0,0,0,0
2017-05-10,1.0,1.250876,1.033853,1.058335,1.200615,1.609998,1.289137,1.754546,2.182534,1.805805,...,2,10,0,0,0,0,0,0,1,0


In [5]:
# first rows of the test features - the displayed dates start on 2017-05-11,
# the day after the last training date shown above (2017-05-10)
X_test.head()

Unnamed: 0_level_0,AdjVolume-0,AdjVolume-1,AdjVolume-2,AdjVolume-3,AdjVolume-4,AdjVolume-5,AdjVolume-6,AdjVolume-7,AdjVolume-8,AdjVolume-9,...,weekday,day,AAPL.US,ABBV.US,AMZN.US,CSCO.US,GE.US,INTC.US,MSFT.US,NFLX.US
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-05-11,1.0,0.637548,0.797493,0.659131,0.674739,0.765449,1.02645,0.821886,1.118606,1.391469,...,3,11,0,0,0,0,0,0,1,0
2017-05-11,1.0,1.106919,0.822218,0.821698,0.847892,0.767362,1.013889,1.647503,1.138211,2.605818,...,3,11,0,0,0,0,0,1,0,0
2017-05-11,1.0,1.234871,0.702247,0.863877,0.960571,0.965947,0.698233,0.900488,1.619883,1.140485,...,3,11,0,0,0,0,0,0,0,1
2017-05-11,1.0,1.050633,1.120189,0.801362,0.893593,0.831849,0.869383,0.863496,0.80819,1.205442,...,3,11,0,0,0,1,0,0,0,0
2017-05-11,1.0,1.002882,1.404183,1.888495,1.046517,0.909324,1.763619,1.55304,1.282149,0.791008,...,3,11,1,0,0,0,0,0,0,0


In [6]:
# first five training labels: boolean setup_for_profitable_trade flag, indexed by Date
y_train[:5]

Date
2013-11-13    False
2013-11-13    False
2013-11-13    False
2013-11-13     True
2013-11-13     True
Name: setup_for_profitable_trade, dtype: bool

## 2) Build Initial Model - RandomForestClassifier

Start the ML process with a simple out-of-the-box RandomForestClassifier to get a baseline and to validate that the training data works with sklearn and can generate some predictions.

In [7]:
# baseline pipeline: standardize the features, then fit a random forest with default settings
# NOTE(review): random_state is left unset, so the forest (and every metric below) varies between runs
pipeline = Pipeline(steps=[
    ('standardScaler', StandardScaler()),
    ('randomForest', RandomForestClassifier()),
])

# list all parameters; the '<step name>__<parameter>' keys are the ones a parameter grid refers to
pipeline.get_params()

{'memory': None,
 'steps': [('standardScaler', StandardScaler()),
  ('randomForest', RandomForestClassifier())],
 'verbose': False,
 'standardScaler': StandardScaler(),
 'randomForest': RandomForestClassifier(),
 'standardScaler__copy': True,
 'standardScaler__with_mean': True,
 'standardScaler__with_std': True,
 'randomForest__bootstrap': True,
 'randomForest__ccp_alpha': 0.0,
 'randomForest__class_weight': None,
 'randomForest__criterion': 'gini',
 'randomForest__max_depth': None,
 'randomForest__max_features': 'sqrt',
 'randomForest__max_leaf_nodes': None,
 'randomForest__max_samples': None,
 'randomForest__min_impurity_decrease': 0.0,
 'randomForest__min_samples_leaf': 1,
 'randomForest__min_samples_split': 2,
 'randomForest__min_weight_fraction_leaf': 0.0,
 'randomForest__n_estimators': 100,
 'randomForest__n_jobs': None,
 'randomForest__oob_score': False,
 'randomForest__random_state': None,
 'randomForest__verbose': 0,
 'randomForest__warm_start': False}

In [8]:
# train the baseline with all default parameters; the Date index of the labels is
# dropped so that the targets are handed over purely by position
pipeline.fit(X_train, y_train.reset_index(drop=True))

In [9]:
# create prediction
y_pred_firstRF = pipeline.predict(X_test)

# save for backtesting in separate notebook
# (the path is relative to the notebook directory, like every other data path in this
#  notebook; the context manager closes - and thereby flushes - the file)
with open('data/model_predictions/y_pred_firstRF.pkl', 'wb') as pred_file:
    pickle.dump(y_pred_firstRF, pred_file)

We now have a first prediction on the dataset - let's look into the performance of the default settings. To understand the result, we will look at a few different metrics: the accuracy, precision, recall, f1-score, and the confusion matrix.

It is important to note that our data is imbalanced (the number of days with a trade signal differs from the number of days without one). Therefore, F scores are a better metric than ROC-AUC or accuracy. We cannot easily fix the imbalance since we are looking at time series data and resampling might introduce lookahead bias.

The ultimate test of the quality of the prediction is to backtest the results (i.e. simulate financial performance based on the predictions). This will be done in the next notebook. For now let's inspect some of the basic metrics.

In [10]:
# per-class precision / recall / F1 of the baseline predictions on the test set
# (print is needed here: classification_report returns a multi-line string)
print(classification_report(y_test, y_pred_firstRF))

              precision    recall  f1-score   support

       False       0.64      0.81      0.71      1153
        True       0.25      0.12      0.16       603

    accuracy                           0.57      1756
   macro avg       0.44      0.47      0.44      1756
weighted avg       0.50      0.57      0.52      1756



In [11]:
# share of correctly classified test rows for the baseline
accuracy_score(y_test, y_pred_firstRF)

0.5734624145785877

In [12]:
# rows = actual class, columns = predicted class, both ordered [False, True]
# -> [[TN, FP], [FN, TP]]
confusion_matrix(y_test, y_pred_firstRF, labels=[False, True])

array([[935, 218],
       [531,  72]], dtype=int64)

The confusion matrix indicates a large number of False Positives (bad because if we used those to trade, we would enter a trade that turns out not to be as profitable as desired) and False Negatives (bad because they mean missed opportunities for us to enter profitable trades).

The good news is that our out-of-the-box model was able to predict some True Positives but the results in terms of financial performance are most likely very bad (we saved off the prediction and will look at this later).

Even though the model seems to perform poorly, we might be able to learn something from it:

Let's look into the relative feature importances to see if anything stands out

In [13]:
def print_feature_importances(estimator, feature_names=None, top_n=None):
    """Print the features of a fitted estimator ranked by importance, highest first.

    Parameters
    ----------
    estimator : object
        Fitted estimator exposing a ``feature_importances_`` array.
    feature_names : sequence of str, optional
        Name of every feature, in the column order the estimator was fitted on.
        Defaults to ``X_train.columns`` (the notebook-level training frame).
    top_n : int, optional
        Only print the ``top_n`` most important features. ``None`` (default)
        prints all of them.

    Raises
    ------
    ValueError
        If the number of feature names differs from the number of importances.
    """
    importances = np.asarray(estimator.feature_importances_)

    if feature_names is None:
        # backward-compatible default: fall back to the notebook's global training frame
        feature_names = X_train.columns

    if len(feature_names) != len(importances):
        raise ValueError(
            "got " + str(len(feature_names)) + " feature names for "
            + str(len(importances)) + " feature importances"
        )

    # positions of the features, sorted from most to least important
    indices = np.argsort(importances)[::-1]
    if top_n is not None:
        indices = indices[:max(top_n, 0)]

    # Print the feature ranking
    print("Feature ranking:")

    for rank, idx in enumerate(indices, start=1):
        print(f"{rank} importance: {importances[idx]!s}. feature name: {feature_names[idx]!s}")

In [14]:
      # rank the features of the fitted baseline forest, most important first
      print_feature_importances(pipeline.named_steps['randomForest'])

Feature ranking:
1 importance: 0.005638246134052845. feature name: AdjCloseSMA50_to_AdjCloseSMA200_ratio-17
2 importance: 0.0049959296788903654. feature name: AdjCloseSMA50_to_AdjCloseSMA200_ratio-19
3 importance: 0.004965252889361174. feature name: AdjCloseSMA50_to_AdjCloseSMA200_ratio-13
4 importance: 0.004831041208680416. feature name: week
5 importance: 0.004782055494280791. feature name: AdjCloseSMA50_to_AdjCloseSMA200_ratio-18
6 importance: 0.00451070749694263. feature name: AdjCloseSMA10_to_AdjCloseSMA50_ratio-19
7 importance: 0.004489813139127604. feature name: AdjCloseSMA50_chg-13
8 importance: 0.00440247369123085. feature name: AdjCloseSMA50_to_AdjCloseSMA200_ratio-15
9 importance: 0.004374249826574966. feature name: AdjCloseSMA50_chg-15
10 importance: 0.004366215985586395. feature name: AdjCloseSMA50_to_AdjCloseSMA200_ratio-0
11 importance: 0.004358704385781226. feature name: AdjCloseSMA50_to_AdjCloseSMA200_ratio-16
12 importance: 0.004327183238903859. feature name: AdjClose

At the very bottom of the list are the RSI_above_80 and RSI_below_20 indicators. RSI is an indicator of extremes, so the fact that these features do not play an important role suggests that the model found few extremes with predictive power.
The top section contains many of the SMA50 to SMA200 ratios. The SMA 50 and 200 crossovers or distances are classic momentum indicators (SMA 50 over SMA 200 = uptrend, below = downtrend).
Since the time period in the training data is that of a general market uptrend, it sounds intuitively reasonable that momentum indicators are important.

## 3) Improve RandomForest model with GridSearch

Given the imbalance of our data and the desire to avoid False Positives, we will use a modified F1 score (the F-beta score) that can place stronger emphasis on precision than on recall (they have the same weight in the F1 score). Setting beta to 0.5 skews the score towards precision; setting beta to 2.0 skews it towards recall. These settings were used in separate runs of GridSearchCV and the results were saved.

In [15]:
# place higher focus on precision (ie getting most TP and minimize FP) than on recall (ie minimize FN)
# since placing a trade that was based on a FP will be costly and hurts more than missing a trade due to a FN
# NOTE: the GridSearchCV call further down passes scoring='f1'; pass one of these two
# scorers as `scoring` there to run the precision- or recall-weighted search instead
fhalf_scorer = make_scorer(fbeta_score, beta=0.5) # low beta favors precision over recall
ftwo_scorer = make_scorer(fbeta_score, beta=2) # high beta favors recall over precision

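For a GridSearch, we will define a wide range of parameters to be used in the pipeline. The scorer is swapped between runs (F0.5, F1, F2); the run recorded below optimizes the plain F1 score (beta = 1), which is why the saved files carry the `vbetaone` suffix.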

In [16]:
# hyper-parameter grid for the random forest step; keys follow '<step name>__<parameter>'
parameters = {
    'randomForest__min_samples_leaf': [2, 5, 10],
    'randomForest__n_estimators' : [10, 20, 50, 100],
    'randomForest__max_features': [5, 'sqrt', 'log2'], # with 693 features: log2 -> 9, sqrt -> 26
    'randomForest__max_depth' : [4, 5, 6, 7, 8],
    'randomForest__criterion' :['gini', 'entropy']
}

# 3 * 4 * 3 * 5 * 2 = 360 parameter combinations, each evaluated on 3 time-ordered splits
my_cv = TimeSeriesSplit(n_splits=3)
# NOTE: this run optimizes the plain F1 score (F-beta with beta = 1), not fhalf_scorer;
# replace scoring='f1' with fhalf_scorer / ftwo_scorer for the other runs
cv = GridSearchCV(pipeline, param_grid=parameters, cv=my_cv, scoring='f1', n_jobs=-1, verbose=10) # 'f1' = the "vbetaone" run
cv.fit(X_train, y_train)

Fitting 3 folds for each of 360 candidates, totalling 1080 fits


In [17]:
# report the winning parameter combination of the grid search
print(cv.best_params_)

# persist it for later reuse; the context manager closes (and thereby flushes) the file
with open('models/GridSearch_vbetaone.pkl', 'wb') as params_file:
    pickle.dump(cv.best_params_, params_file)

{'randomForest__criterion': 'gini', 'randomForest__max_depth': 8, 'randomForest__max_features': 'sqrt', 'randomForest__min_samples_leaf': 2, 'randomForest__n_estimators': 10}


In [18]:
# feature ranking of the random forest inside the best pipeline found by the grid search
print_feature_importances(cv.best_estimator_.named_steps['randomForest'])

Feature ranking:
1 importance: 0.024208221374915895. feature name: AdjCloseSMA50_chg-14
2 importance: 0.020090826869822043. feature name: AdjCloseSMA50_chg-19
3 importance: 0.016684235448935793. feature name: AdjCloseSMA50-19
4 importance: 0.011604048503764073. feature name: week
5 importance: 0.011502632634380214. feature name: AdjCloseSMA50_to_AdjClose_ratio-18
6 importance: 0.010679311421441368. feature name: AdjCloseSMA50_chg-18
7 importance: 0.010533961621291773. feature name: AdjCloseSMA200-8
8 importance: 0.010331101355136822. feature name: AdjCloseSMA50-11
9 importance: 0.009306623306070048. feature name: AdjCloseSMA10_to_AdjCloseSMA50_ratio-11
10 importance: 0.008981571963874791. feature name: AdjCloseSMA10_to_AdjCloseSMA50_ratio-12
11 importance: 0.008812749701659055. feature name: AdjCloseSMA50_to_AdjCloseSMA200_ratio-16
12 importance: 0.00874711923723079. feature name: AdjCloseSMA50_to_AdjCloseSMA200_ratio-19
13 importance: 0.008730565777321213. feature name: AdjCloseSMA50-

In [19]:
# predict the test set with the fitted grid search object
y_pred_GridSearch = cv.predict(X_test)

# save for backtesting in separate notebook
# (the context manager closes - and thereby flushes - the file)
with open('data/model_predictions/y_pred_GridSearch_vbetaone.pkl', 'wb') as pred_file:
    pickle.dump(y_pred_GridSearch, pred_file)

In [20]:
# per-class precision / recall / F1 of the grid-search predictions on the test set
# (print is needed here: classification_report returns a multi-line string)
print(classification_report(y_test, y_pred_GridSearch))

              precision    recall  f1-score   support

       False       0.64      0.86      0.74      1153
        True       0.24      0.09      0.13       603

    accuracy                           0.59      1756
   macro avg       0.44      0.47      0.43      1756
weighted avg       0.51      0.59      0.53      1756



In [21]:
# share of correctly classified test rows for the grid-search model
accuracy_score(y_test, y_pred_GridSearch)

0.5945330296127562

In [22]:
# rows = actual class, columns = predicted class, both ordered [False, True]
# -> [[TN, FP], [FN, TP]]
confusion_matrix(y_test, y_pred_GridSearch, labels=[False, True])

array([[992, 161],
       [551,  52]], dtype=int64)