# Aggregated Naive Strategy Performance Results Historical Data & Parameters From RandomizedSearchCV
### Focus is only on Annual Return, as this is the objective (Goal is to determine which lag consistently produces the highest Annual Return)

In [33]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from pathlib import Path
import time
import datetime
import hvplot.pandas

#Import SKLearn Library and CLasses
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import make_classification
from sklearn import metrics

In [2]:
# Load the manually aggregated backtest history for the naive strategy
# (ungrouped; the columns include RunDate, PeriodEndDate and FeatureLag).
# The former `infer_datetime_format=True` (deprecated since pandas 2.0) and
# `parse_dates=True` (which only targets the index, a default integer index
# here) had no effect, so they are dropped: the date columns were, and still
# are, loaded as plain strings.
strat_lag_aggregated_file = Path('ManualFiles/df_strat_lag_aggregated.csv')
df_strat_lag_aggregated = pd.read_csv(strat_lag_aggregated_file)

# Quick sanity check of the first and last rows.
df_strat_lag_aggregated.head(), df_strat_lag_aggregated.tail()

(      RunDate PeriodEndDate  FeatureLag  n_estimators  max_depth max_features  \
 0  10/22/2021     10/1/2021           1           500       5000         auto   
 1  10/22/2021     10/1/2021           2           500       5000         auto   
 2  10/22/2021     10/1/2021           3           500       5000         auto   
 3  10/22/2021     10/1/2021           4           500       5000         auto   
 4  10/22/2021     10/1/2021           5           500       5000         auto   
 
    random_state  verbose  Annual Return  Annual Volatility  Sharpe Ratio  \
 0             0        0       0.150074           0.172458      0.898254   
 1             0        0       0.077147           0.160113      0.544679   
 2             0        0       0.030820           0.200537      0.252457   
 3             0        0       0.246054           0.184453      1.285082   
 4             0        0       0.055106           0.189581      0.378981   
 
    Calmar Ratio  Max Drawdown  Sortino Ra

In [3]:
# Group by FeatureLag and calculate descriptive statistics for the NAIVE performance results

In [4]:
# Section banner: the cells that follow plot per-FeatureLag statistics of the naive-parameter backtest results.
print('\nDESCRIPTIVE STATISTICS FOR PERFORAMCE RESULTS (NIEVE MODEL PARAMETERS)\n')


DESCRIPTIVE STATISTICS FOR PERFORAMCE RESULTS (NIEVE MODEL PARAMETERS)



In [35]:
# MEAN
# Average every numeric metric per feature lag. Non-numeric columns (the date
# strings and max_features == 'auto') are dropped before grouping: pandas >= 2.0
# raises a TypeError on them instead of silently skipping them.
df_strat_lag_aggregated_grp_mean = (
    df_strat_lag_aggregated
    .select_dtypes(include="number")
    .groupby("FeatureLag")
    .mean()
)

# Bar chart of the mean Annual Return for each lag (FeatureLag is the index).
# https://hvplot.holoviz.org/user_guide/Customization.html
df_strat_lag_aggregated_grp_mean.hvplot.bar(
    title = 'Mean Annualized Return Per Feature Lag In Days (Nieve Model Parameters)', 
    y='Annual Return',
    ylabel='Mean Annual Return',
    x='FeatureLag', 
    xlabel ='Feature Lag (Days)', 
    height=500,
    width = 1000

).opts(xrotation=90)

In [6]:
# STANDARD DEVIATION
# Per-lag standard deviation of every numeric metric. Non-numeric columns are
# dropped before grouping because pandas >= 2.0 raises a TypeError on them.
# (The bare frame expression that used to sit mid-cell was never displayed and
# has been removed.)
df_strat_lag_aggregated_grp_std = (
    df_strat_lag_aggregated
    .select_dtypes(include="number")
    .groupby("FeatureLag")
    .std()
)

# Bar chart of the Annual Return standard deviation for each lag.
# https://hvplot.holoviz.org/user_guide/Customization.html
df_strat_lag_aggregated_grp_std.hvplot.bar(
    title = 'Standard Deviation of Mean Annualized Return Per Feature Lag In Days (Nieve Model Parameters)', 
    y='Annual Return',
    ylabel='Standard Deviation',
    x='FeatureLag', 
    xlabel ='Feature Lag (Days)', 
    height=500,
    width = 1500

).opts(xrotation=90)

In [7]:
# SKEW
# Per-lag skewness of every numeric metric. Non-numeric columns are dropped
# before grouping because pandas >= 2.0 raises a TypeError on them.
df_strat_lag_aggregated_grp_skew = (
    df_strat_lag_aggregated
    .select_dtypes(include="number")
    .groupby("FeatureLag")
    .skew()
)

# Bar chart of the Annual Return skew for each lag.
# Fix: this chart previously plotted the standard-deviation frame
# (df_strat_lag_aggregated_grp_std) under a "Skew" title and y-label.
# https://hvplot.holoviz.org/user_guide/Customization.html
df_strat_lag_aggregated_grp_skew.hvplot.bar(
    title = 'Skew of Mean Annualized Return Per Feature Lag In Days (Nieve Model Parameters)', 
    y='Annual Return',
    ylabel='Skew',
    x='FeatureLag', 
    xlabel ='Feature Lag (Days)', 
    height=500,
    width = 1500

).opts(xrotation=90)

In [8]:
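# Kurtosis is not computed: with the pandas version used for this run, calling .kurt() on the groupby raised
# AttributeError: 'DataFrameGroupBy' object has no attribute 'kurt'
#df_strat_lag_aggregated_grp_kurtosis = df_strat_lag_aggregated.groupby("FeatureLag").kurt()
# Possible workaround (not run here): df_strat_lag_aggregated.groupby("FeatureLag")["Annual Return"].apply(pd.Series.kurt)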

In [9]:
# Section banner: the cells below load and summarise the best parameters found by the historical RandomizedSearchCV runs.
print('********** START OF BEST PARAMETERS FROM RandomizedSearchCV USED FOR NOTEBOOK Current_rfc_model_algo_optimal_params_all_avg.ipynb **********')

********** START OF BEST PARAMETERS FROM RandomizedSearchCV USED FOR NOTEBOOK Current_rfc_model_algo_optimal_params_all_avg.ipynb **********


In [10]:
# Load the manually aggregated history of RandomizedSearchCV best parameters
# (ungrouped; the displayed columns include RunDate, PeriodEndDate, FeatureLag
# and the tuned hyper-parameters).
print('Historical Time Series for RandomizedSearchCV Runs (Historical Runs & Ungrouped):\n')
best_params_aggregated = Path('ManualFiles/df_best_params_aggregated.csv')

# `infer_datetime_format=True` (deprecated since pandas 2.0) and
# `parse_dates=True` (index-only, and the index is the default integer index)
# had no effect and are dropped; the date columns stay plain strings as before.
df_best_params_aggregated = pd.read_csv(best_params_aggregated)
df_best_params_aggregated

Historical Time Series for RandomizedSearchCV Runs (Historical Runs & Ungrouped):



Unnamed: 0,RunDate,PeriodEndDate,FeatureLag,n_estimators,min_samples_split,max_features,max_depth
0,10/22/2021,10/1/2021,1,726,21,1,3890
1,10/22/2021,10/1/2021,2,762,12,4,3729
2,10/22/2021,10/1/2021,3,204,34,1,3416
3,10/22/2021,10/1/2021,4,920,12,5,3012
4,10/22/2021,10/1/2021,5,483,14,2,195
...,...,...,...,...,...,...,...
985,10/20/2021,10/15/2021,86,359,49,1,5972
986,10/20/2021,10/15/2021,87,509,7,3,3252
987,10/20/2021,10/15/2021,88,694,46,2,2297
988,10/20/2021,10/15/2021,89,165,17,5,193


In [11]:
# Section banner: the cells that follow compute mean / std / skew of the best parameters per FeatureLag.
print('\nDESCRIPTIVE STATISTICS FOR BEST PARAMETERS FROM RandomizedSearchCV GROUPED BY FEATURE LAGS IN DAYS\n')


DESCRIPTIVE STATISTICS FOR BEST PARAMETERS FROM RandomizedSearchCV GROUPED BY FEATURE LAGS IN DAYS



In [12]:
# MEAN PARAMETER VALUES BY FEATURE LAG: group by FeatureLag, then take the mean.
# Non-numeric columns (the RunDate / PeriodEndDate strings) are dropped before
# grouping because pandas >= 2.0 raises a TypeError on them instead of
# silently skipping them.
df_best_params_aggregated_grp_mean = (
    df_best_params_aggregated
    .select_dtypes(include="number")
    .groupby("FeatureLag")
    .mean()
)

# Save aggregated optimal model mean data; create the output folder first so
# the cell also works on a fresh checkout where it does not exist yet.
fl_nm = 'algo_optimal_parameters/OptimalRandomizedSearchCvMean.csv'
Path(fl_nm).parent.mkdir(parents=True, exist_ok=True)
df_best_params_aggregated_grp_mean.to_csv(fl_nm)

print('Mean Values From Historical Runs of RandomizedSearchCV Grouped By Feature Lags:\n')
df_best_params_aggregated_grp_mean

Mean Values From Historical Runs of RandomizedSearchCV Grouped By Feature Lags:



Unnamed: 0_level_0,n_estimators,min_samples_split,max_features,max_depth
FeatureLag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,486.272727,29.454545,1.909091,3034.454545
2,542.818182,18.272727,3.363636,2214.454545
3,452.818182,36.727273,2.545455,3083.000000
4,522.545455,29.727273,2.272727,1976.727273
5,436.818182,36.818182,2.909091,3372.363636
...,...,...,...,...
86,464.909091,37.454545,1.727273,2929.727273
87,540.272727,18.090909,3.000000,2948.181818
88,673.636364,33.545455,4.000000,3156.181818
89,455.090909,29.272727,3.545455,2447.181818


In [13]:
# STANDARD DEVIATION OF PARAMETER VALUES BY FEATURE LAG: not plotted, but retained for future use.
# Non-numeric columns (the RunDate / PeriodEndDate strings) are dropped before
# grouping because pandas >= 2.0 raises a TypeError on them.
df_best_params_aggregated_grp_std = (
    df_best_params_aggregated
    .select_dtypes(include="number")
    .groupby("FeatureLag")
    .std()
)

# Save aggregated optimal model standard deviation data; create the output
# folder first so the cell also works when it does not exist yet.
fl_nm = 'algo_optimal_parameters/OptimalRandomizedSearchCvStdDeviation.csv'
Path(fl_nm).parent.mkdir(parents=True, exist_ok=True)
df_best_params_aggregated_grp_std.to_csv(fl_nm)

print('Standard Deviation of Mean Values From Historical Runs of RandomizedSearchCV Grouped By Feature Lags (For Future Use):\n')
df_best_params_aggregated_grp_std

Standard Deviation of Mean Values From Historical Runs of RandomizedSearchCV Grouped By Feature Lags (For Future Use):



Unnamed: 0_level_0,n_estimators,min_samples_split,max_features,max_depth
FeatureLag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,263.929192,13.938175,1.640399,1376.396481
2,247.980168,16.541408,1.206045,1358.097667
3,360.912404,10.845192,1.368476,2120.886466
4,303.096474,12.649829,1.489356,1835.948915
5,277.545967,11.771306,1.375103,2022.206086
...,...,...,...,...
86,236.249637,7.243806,1.009050,1890.952357
87,304.188787,16.373482,1.183216,2053.014896
88,273.028670,13.589434,1.095445,1764.791025
89,291.326434,13.439426,1.368476,1755.997541


In [14]:
# SKEW OF PARAMETER VALUES BY FEATURE LAG: not plotted, but retained for future use.
# Non-numeric columns (the RunDate / PeriodEndDate strings) are dropped before
# grouping because pandas >= 2.0 raises a TypeError on them.
df_best_params_aggregated_grp_skew = (
    df_best_params_aggregated
    .select_dtypes(include="number")
    .groupby("FeatureLag")
    .skew()
)

# Save aggregated optimal model skew data; create the output folder first so
# the cell also works when it does not exist yet.
fl_nm = 'algo_optimal_parameters/OptimalRandomizedSearchCvSkew.csv'
Path(fl_nm).parent.mkdir(parents=True, exist_ok=True)
df_best_params_aggregated_grp_skew.to_csv(fl_nm)

print('Skew of Mean Values From Historical Runs of RandomizedSearchCV Grouped By Feature Lags (For Future Use):\n')
df_best_params_aggregated_grp_skew

Skew of Mean Values From Historical Runs of RandomizedSearchCV Grouped By Feature Lags (For Future Use):



Unnamed: 0_level_0,n_estimators,min_samples_split,max_features,max_depth
FeatureLag,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0.154223,0.080199,1.504798,0.005725
2,-0.463119,1.048687,-0.445671,0.105595
3,0.248462,-1.797744,0.170269,0.226626
4,-0.342775,0.177063,0.763157,1.009745
5,0.122375,-0.753242,0.477820,-0.570354
...,...,...,...,...
86,0.003005,0.238086,1.374466,0.131289
87,0.152164,0.928082,0.885400,-0.096481
88,-1.081346,0.043262,-1.115731,0.095630
89,0.097290,-0.708993,-0.115878,-0.078488


In [15]:
# For the purpose of the project, only concerned with mean model parameters, but future work will also include analysis of 
# other moments of the return distribution (standard deviation in particular, but also skew and kurtosis)

In [16]:
# Mean n_estimators chosen by RandomizedSearchCV, one bar per feature lag
# (FeatureLag is the index of the grouped frame).
df_best_params_aggregated_grp_mean.hvplot.bar(
    x="FeatureLag",
    y="n_estimators",
    xlabel="Feature Lag (Days)",
    ylabel="Mean of n_estimators",
    title="Mean of N Estimators (RandomizedSearchCV Parameter Results)",
    width=1500,
    height=500,
).opts(xrotation=90)  # vertical tick labels

In [17]:
# Mean min_samples_split chosen by RandomizedSearchCV, one bar per feature lag
# (FeatureLag is the index of the grouped frame).
df_best_params_aggregated_grp_mean.hvplot.bar(
    x="FeatureLag",
    y="min_samples_split",
    xlabel="Feature Lag (Days)",
    ylabel="Mean of min_samples_split",
    title="Mean of Minimum Samples Split (RandomizedSearchCV Parameter Results)",
    width=1500,
    height=500,
).opts(xrotation=90)  # vertical tick labels

In [18]:
# Mean max_features chosen by RandomizedSearchCV, one bar per feature lag
# (FeatureLag is the index of the grouped frame).
df_best_params_aggregated_grp_mean.hvplot.bar(
    x="FeatureLag",
    y="max_features",
    xlabel="Feature Lag (Days)",
    ylabel="Mean of max_features",
    title="Mean of Maximum Features (RandomizedSearchCV Parameter Results)",
    width=1500,
    height=500,
).opts(xrotation=90)  # vertical tick labels

In [19]:
# Mean max_depth chosen by RandomizedSearchCV, one bar per feature lag
# (FeatureLag is the index of the grouped frame).
df_best_params_aggregated_grp_mean.hvplot.bar(
    x="FeatureLag",
    y="max_depth",
    xlabel="Feature Lag (Days)",
    ylabel="Mean of max_depth",
    title="Mean of Maximum Depth (RandomizedSearchCV Parameter Results)",
    width=1500,
    height=500,
).opts(xrotation=90)  # vertical tick labels

In [20]:
# Re-display the ungrouped RandomizedSearchCV parameter history; the next cell averages it across all lags.
df_best_params_aggregated

Unnamed: 0,RunDate,PeriodEndDate,FeatureLag,n_estimators,min_samples_split,max_features,max_depth
0,10/22/2021,10/1/2021,1,726,21,1,3890
1,10/22/2021,10/1/2021,2,762,12,4,3729
2,10/22/2021,10/1/2021,3,204,34,1,3416
3,10/22/2021,10/1/2021,4,920,12,5,3012
4,10/22/2021,10/1/2021,5,483,14,2,195
...,...,...,...,...,...,...,...
985,10/20/2021,10/15/2021,86,359,49,1,5972
986,10/20/2021,10/15/2021,87,509,7,3,3252
987,10/20/2021,10/15/2021,88,694,46,2,2297
988,10/20/2021,10/15/2021,89,165,17,5,193


In [21]:
# Hyper-parameters whose overall (lag-agnostic) mean feeds the project model.
var_list = ['n_estimators', 'min_samples_split', 'max_features', 'max_depth']

# "simple" = the mean ignores feature lags; this is what the project version
# of the model uses. Future versions will explore whether means grouped by
# lag perform better. Values are rounded to whole numbers (kept as floats).
df_best_params_aggregated_mean_ungrouped = np.round(df_best_params_aggregated[var_list].mean(), 0)

# Persist the rounded means; create the output folder first so the cell also
# works on a fresh checkout where it does not exist yet.
fl_nm = 'algo_optimal_parameters/simple_best_params_means.csv'
Path(fl_nm).parent.mkdir(parents=True, exist_ok=True)
df_best_params_aggregated_mean_ungrouped.to_csv(fl_nm)

print('These rounded mean model parameters will be used for the project version of the model:\n')
df_best_params_aggregated_mean_ungrouped

These rounded mean model parameters will be used for the project version of the model:



n_estimators          477.0
min_samples_split      31.0
max_features            3.0
max_depth            2988.0
dtype: float64

In [22]:
#### Once notebook 'Current_rfc_model_algo_optimal_params_all_avg.ipynb' has been backtested using the parameter values from the cell above,
#### manually add the results for each backtesting end period to the respective csv files:
####     ManualFiles/df_optimal_param_capture_stats_aggregated.csv &
####     ManualFiles/df_strat_optimal_param_lag_aggregated.csv

In [23]:
#### The following statistics reflect aggregated backtesting results using the above mean values resulting from RandomizedSearchCV

In [24]:
# Section banner for the backtest results obtained with the mean RandomizedSearchCV parameters.
# Plain string: the former f-prefix had no placeholders to interpolate.
print('********** START OF RESULTS OF RandomizedSearchCV PARAMETERS ABOVE USED IN NOTEBOOK Current_rfc_model_algo_optimal_params_all_avg.ipynb **********')

********** START OF RESULTS OF RandomizedSearchCV PARAMETERS ABOVE USED IN NOTEBOOK Current_rfc_model_algo_optimal_params_all_avg.ipynb **********


In [25]:
# Reminder of the manual, iterative workflow that links this notebook to the backtest notebook.
# Plain strings: the former f-prefixes had no placeholders to interpolate.
print('NB:  THIS IS AN ITERATIVE PROCESS, AS THIS NOTEBOOK IS FIRST RUN TO DETERMINE THE MEAN PARAMETER VALUES RESULTING FROM RandomizedSearchCV')
print('NB:  NOTEBOOK Current_rfc_model_algo_optimal_params_all_avg.ipynb IS THEN RUN TO BACKTEST THESE MEAN MODEL PARAMTERS')
print('NB:  RESULTS FOR EACH BACTEST PERIOD ARE THEN MANNUALLY AGGREGATED IN "ManualFiles/df_strat_optimal_param_lag_aggregated.csv"')

NB:  THIS IS AN ITERATIVE PROCESS, AS THIS NOTEBOOK IS FIRST RUN TO DETERMINE THE MEAN PARAMETER VALUES RESULTING FROM RandomizedSearchCV
NB:  NOTEBOOK Current_rfc_model_algo_optimal_params_all_avg.ipynb IS THEN RUN TO BACKTEST THESE MEAN MODEL PARAMTERS
NB:  RESULTS FOR EACH BACTEST PERIOD ARE THEN MANNUALLY AGGREGATED IN "ManualFiles/df_strat_optimal_param_lag_aggregated.csv"


In [26]:
# Load the manually aggregated backtest history produced with the mean
# RandomizedSearchCV parameters (ungrouped).
strat_optimal_param_lag_aggregated = Path('ManualFiles/df_strat_optimal_param_lag_aggregated.csv')

# `infer_datetime_format=True` (deprecated since pandas 2.0) and
# `parse_dates=True` (index-only, and the index is the default integer index)
# had no effect and are dropped; the date columns stay plain strings as before.
df_strat_optimal_param_lag_aggregated = pd.read_csv(strat_optimal_param_lag_aggregated)
df_strat_optimal_param_lag_aggregated

Unnamed: 0,RunDate,PeriodEndDate,FeatureLag,n_estimators,max_depth,max_features,random_state,verbose,Annual Return,Annual Volatility,Sharpe Ratio,Calmar Ratio,Max Drawdown,Sortino Ratio,Alpha,Beta
0,10/23/2021,10/1/2021,1,477,2988,3,0,0,0.231698,0.191530,1.184955,0.846560,-0.273694,1.665961,0.067153,0.710589
1,10/23/2021,10/1/2021,2,477,2988,3,0,0,0.057328,0.189137,0.390344,0.203873,-0.281194,0.517359,-0.080495,0.692087
2,10/23/2021,10/1/2021,3,477,2988,3,0,0,0.120314,0.203848,0.660001,0.340622,-0.353218,0.916763,-0.047534,0.803979
3,10/23/2021,10/1/2021,4,477,2988,3,0,0,0.187878,0.200221,0.960880,0.612509,-0.306734,1.343408,0.015642,0.776029
4,10/23/2021,10/1/2021,5,477,2988,3,0,0,0.125791,0.197135,0.700244,0.571768,-0.220003,0.981690,-0.032766,0.751847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
985,10/23/2021,10/15/2021,86,477,2988,3,0,0,0.240330,0.184960,1.257251,1.137880,-0.211208,1.883706,0.078901,0.668644
986,10/23/2021,10/15/2021,87,477,2988,3,0,0,0.132975,0.167298,0.829922,0.924088,-0.143898,1.197291,0.011030,0.546040
987,10/23/2021,10/15/2021,88,477,2988,3,0,0,0.173458,0.199597,0.902342,0.559332,-0.310117,1.233646,-0.002074,0.777933
988,10/23/2021,10/15/2021,89,477,2988,3,0,0,0.159422,0.186100,0.889181,0.734349,-0.217094,1.211042,0.007179,0.676059


In [27]:
# Group by FeatureLag and calculate descriptive statistics for the RandomizedSearchCV performance results

In [28]:
# Section banner: the cells that follow plot per-FeatureLag statistics of the backtests run with the mean RandomizedSearchCV parameters.
print('\nDESCRIPTIVE STATISTICS FOR PERFORAMCE RESULTS (RandomizedSearchCV MODEL PARAMETERS)\n')


DESCRIPTIVE STATISTICS FOR PERFORAMCE RESULTS (RandomizedSearchCV MODEL PARAMETERS)



In [36]:
# MEAN
# Average every numeric metric per feature lag. Non-numeric columns (the
# RunDate / PeriodEndDate strings) are dropped before grouping: pandas >= 2.0
# raises a TypeError on them instead of silently skipping them.
df_strat_optimal_param_lag_aggregated_grp_mean = (
    df_strat_optimal_param_lag_aggregated
    .select_dtypes(include="number")
    .groupby("FeatureLag")
    .mean()
)

# Bar chart of the mean Annual Return for each lag (FeatureLag is the index).
# https://hvplot.holoviz.org/user_guide/Customization.html
df_strat_optimal_param_lag_aggregated_grp_mean.hvplot.bar(
    title = 'Mean Annualized Return Per Feature Lag In Days (Using Mean RandomizedSearchCV Model Parameters)', 
    y='Annual Return',
    ylabel='Mean Annual Return',
    x='FeatureLag', 
    xlabel ='Feature Lag (Days)', 
    height=500,
    width = 1000

).opts(xrotation=90)

In [30]:
# STANDARD DEVIATION
# Per-lag standard deviation of every numeric metric. Non-numeric columns are
# dropped before grouping because pandas >= 2.0 raises a TypeError on them.
# (The bare frame expression that used to sit mid-cell was never displayed and
# has been removed.)
df_strat_optimal_param_lag_aggregated_grp_std = (
    df_strat_optimal_param_lag_aggregated
    .select_dtypes(include="number")
    .groupby("FeatureLag")
    .std()
)

# Bar chart of the Annual Return standard deviation for each lag.
# https://hvplot.holoviz.org/user_guide/Customization.html
df_strat_optimal_param_lag_aggregated_grp_std.hvplot.bar(
    title = 'Standard Deviation of Mean Annualized Return Per Feature Lag In Days (RandomizedSearchCV Model Parameters)', 
    y='Annual Return',
    ylabel='Standard Deviation',
    x='FeatureLag', 
    xlabel ='Feature Lag (Days)', 
    height=500,
    width = 1500

).opts(xrotation=90)

In [31]:
# SKEW
# Per-lag skewness of every numeric metric. Non-numeric columns are dropped
# before grouping because pandas >= 2.0 raises a TypeError on them.
# (The bare frame expression that used to sit mid-cell was never displayed and
# has been removed.)
df_strat_optimal_param_lag_aggregated_grp_skew = (
    df_strat_optimal_param_lag_aggregated
    .select_dtypes(include="number")
    .groupby("FeatureLag")
    .skew()
)

# Bar chart of the Annual Return skew for each lag.
# https://hvplot.holoviz.org/user_guide/Customization.html
df_strat_optimal_param_lag_aggregated_grp_skew.hvplot.bar(
    title = 'Skew of Mean Annualized Return Per Feature Lag In Days (RandomizedSearchCV Model Parameters)', 
    y='Annual Return',
    ylabel='Skew',
    x='FeatureLag', 
    xlabel ='Feature Lag (Days)', 
    height=500,
    width = 1500

).opts(xrotation=90)

In [32]:
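# Kurtosis is not computed: with the pandas version used for this run, calling .kurt() on the groupby raised
# AttributeError: 'DataFrameGroupBy' object has no attribute 'kurt'
#df_strat_optimal_param_lag_aggregated_grp_kurtosis = df_strat_optimal_param_lag_aggregated.groupby("FeatureLag").kurt()
# Possible workaround (not run here): df_strat_optimal_param_lag_aggregated.groupby("FeatureLag")["Annual Return"].apply(pd.Series.kurt)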