# Random Forest Classifier Manual Version To Aid Automated Development
### Determine optimal model parameters using RandomizedSearchCV

In [None]:
# Model Resource:
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
# https://www.analyticsvidhya.com/blog/2021/06/tune-hyperparameters-with-gridsearchcv/
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
# https://scikit-learn.org/stable/glossary.html
# https://scikit-learn.org/stable/glossary.html#term-CV-splitter
# https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74

In [None]:
# Feature Set Variables
# ICE BofA US High Yield Index Option-Adjusted Spread (BAMLH0A0HYM2)
# ICE BofA US Corporate Index Option-Adjusted Spread (BAMLC0A0CM)
# ICE BofA BBB US Corporate Index Option-Adjusted Spread (BAMLC0A4CBBB)
# ICE BofA BB US High Yield Index Option-Adjusted Spread (BAMLH0A1HYBB)
# ICE BofA CCC & Lower US High Yield Index Option-Adjusted Spread (BAMLH0A3HYC)

In [138]:
import warnings
import pandas as pd
import numpy as np
from pathlib import Path
import time
import datetime
import hvplot.pandas
#Import SKLearn Library and CLasses
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import make_classification
from sklearn import metrics

In [139]:
start_time = time.time()
run_date = datetime.datetime.now().strftime('%Y_%m_%d')

In [None]:
warnings.filterwarnings('ignore')

In [None]:
# Open signals file for feature set
# ALERT:  In and Out of Sample for feature lag is dependent on lag (.shift)
#         i.e. if Lag = 5 Days then feature values from T to T-4 are out of sample, however if lag = 4 Days, then only T to T-3 are out of sample
#         Future enhancement will allow for this required logic for automated approach
feature_set_pct_path = Path('AutoOutputFiles/df_key_credit_data_usa_adjusted_pct.csv')
X = pd.read_csv(feature_set_pct_path, index_col="Date", infer_datetime_format=True, parse_dates=True)
X.head()

In [None]:
# Construct a data frame for equity security (target/dependent variable)
# NB:  Control of in and out of sample is easier for target set as no lags exist
tareget_set_levels_path = Path('ManualFiles/df_key_credit_data_usa_adjusted_pct_in_sample_end.csv')
equity_data = pd.read_csv(tareget_set_levels_path, index_col="Date", infer_datetime_format=True, parse_dates=True)
equity_data.head()

In [None]:
# Concatenate the equity and credit dataframes for modelling
df_trading_signals = pd.concat([equity_data, X], axis='columns', join='inner')
print(f'The DataFrame named "df_trading_signals" constructed to hold Equity closing price levels, Equity daily returns, indication of positive, 1, and negative, 0, Equity returns.\n')
print(f'"df_trading_signals" has a shape of {df_trading_signals.shape}\n')
df_trading_signals.to_csv('AutoOutputFiles/df_trading_signals_random_forest_version_0001.csv')
df_trading_signals.head()

In [None]:
df_trading_signals.isnull().sum()

## Model

#### Assign Training and Testing Windows

In [None]:
training_start = df_trading_signals.index.min().strftime(format='%Y-%m-%d')
training_end = '2018-12-14'
testing_start = '2018-12-15'
testing_end = df_trading_signals.index.max().strftime(format='%Y-%m-%d')

#### Define X and Y Training & Test Datasets

In [None]:
# Manually split train and test datasets
# SK Learn library also exists for train test split, but following manual approach
x_variables = ['BAMLH0A0HYM2', 'BAMLC0A0CM', 'BAMLC0A4CBBB','BAMLH0A1HYBB', 'BAMLH0A3HYC']

x_train = df_trading_signals[x_variables][training_start:training_end]
y_train = df_trading_signals['PositiveReturn'][training_start:training_end]
#x_train.tail()
#y_train.tail()

In [None]:
# X and Y Testing Datasets
x_test = df_trading_signals[x_variables][testing_start:testing_end]
y_test = df_trading_signals['PositiveReturn'][testing_start:testing_end]
#x_test.tail()
#y_test.tail()

In [None]:
# Model parameters with default values for reference purposes
msg = ''
msg +=('class sklearn.ensemble.RandomForestClassifier(\n')
msg +=('    n_estimators=100, \n')
msg +=('    *, \n')
msg +=('    criterion="gini", \n')
msg +=('    max_depth=None, \n')
msg +=('    min_samples_split=2, \n')
msg +=('    min_samples_leaf=1, \n')
msg +=('    min_weight_fraction_leaf=0.0, \n')
msg +=('    max_features="auto", \n')
msg +=('    max_leaf_nodes=None, \n')
msg +=('    min_impurity_decrease=0.0, \n')
msg +=('    bootstrap=True, \n')
msg +=('    oob_score=False, \n')
msg +=('    n_jobs=None, \n')
msg +=('    random_state=None, \n')
msg +=('    verbose=0, \n')
msg +=('    warm_start=False, \n')
msg +=('    class_weight=None, \n')
msg +=('    ccp_alpha=0.0, \n')
msg +=('    max_samples=None \n')
msg +=(')')
print(msg)

In [None]:
#### RandomizedSearchCV

In [None]:
rfc = RandomForestClassifier(random_state=0)
input_cv_value = 5
input_random_state = 0
input_return_train_score = True

forest_params = [
    {
        'max_depth': list(range(9, 6001)), 
        'max_features': list(range(1,6)), 
        'n_estimators': list(range(1,1001)), 
        'min_samples_split': list(range(1,51))
    }
]

rgs = RandomizedSearchCV(rfc, forest_params, cv=input_cv_value, scoring='accuracy', return_train_score=input_return_train_score)

In [None]:
rgs.fit(x_train, y_train)

In [None]:
#### cv_results_ Attributes 
#### https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
#### cv_results_ = dict of numpy (masked) ndarrays: A dict with keys as column headers and values as columns, that can be imported into a pandas DataFrame.

In [133]:
print(rgs.scorer_)

make_scorer(accuracy_score)


In [134]:
mean_test_score = rgs.cv_results_['mean_test_score']
mean_test_score

array([0.62562008, 0.62226187, 0.63178098, 0.60658811, 0.64465988,
       0.63793249, 0.63961003, 0.64858614, 0.64130323, 0.61778164])

In [135]:
mean_train_score = rgs.cv_results_['mean_train_score']
mean_train_score

array([0.82176856, 0.82876343, 0.81043568, 0.97467778, 0.81197522,
       0.79518662, 0.78679286, 0.77504206, 0.80120266, 0.85702252])

In [123]:
std_test_score = rgs.cv_results_['std_test_score']
std_test_score

array([0.02135153, 0.01515242, 0.01376473, 0.02034687, 0.01969548,
       0.01763638, 0.01900541, 0.02524905, 0.01650664, 0.01248712])

In [124]:
# params = list of dictionaries
params = rgs.cv_results_['params']
#params

In [125]:
for i in params:
    print(i)

{'n_estimators': 917, 'min_samples_split': 31, 'max_features': 3, 'max_depth': 5807}
{'n_estimators': 864, 'min_samples_split': 29, 'max_features': 3, 'max_depth': 5360}
{'n_estimators': 423, 'min_samples_split': 37, 'max_features': 4, 'max_depth': 76}
{'n_estimators': 533, 'min_samples_split': 7, 'max_features': 3, 'max_depth': 3736}
{'n_estimators': 989, 'min_samples_split': 31, 'max_features': 1, 'max_depth': 2948}
{'n_estimators': 834, 'min_samples_split': 45, 'max_features': 5, 'max_depth': 521}
{'n_estimators': 571, 'min_samples_split': 49, 'max_features': 5, 'max_depth': 4483}
{'n_estimators': 830, 'min_samples_split': 49, 'max_features': 1, 'max_depth': 3301}
{'n_estimators': 852, 'min_samples_split': 39, 'max_features': 2, 'max_depth': 2211}
{'n_estimators': 800, 'min_samples_split': 23, 'max_features': 5, 'max_depth': 3656}


In [126]:
# The mean_fit_time, std_fit_time, mean_score_time and std_score_time are all in seconds.
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html

mean_fit_time = rgs.cv_results_['mean_fit_time']
mean_fit_time
for i in mean_fit_time:
    print(i)

2.177744483947754
2.080293321609497
1.169199562072754
1.4280962467193603
1.5714008808135986
2.5383947849273683
1.7018844604492187
1.249212121963501
1.645448875427246
2.6443511962890627


In [127]:
# best_params = dictionary
best_params = rgs.best_params_
#best_params

In [128]:
for key in best_params:
    print(key, '->', best_params[key])

n_estimators -> 830
min_samples_split -> 49
max_features -> 1
max_depth -> 3301


In [142]:
# For the purpose of this project, only best_params will be retained for multuple iterations and analysis
df_best_params = pd.DataFrame.from_dict(best_params, orient = 'index')
df_best_params = df_best_params.transpose()
df_best_params['run_date'] = run_date
fl_nm = 'df_best_params_' + run_date + '.csv'
df_best_params.to_csv('AutoOutputFiles/' + fl_nm)
df_best_params

Unnamed: 0,n_estimators,min_samples_split,max_features,max_depth,run_date
0,830,49,1,3301,2021_10_18


In [130]:
# best_estimator_: estimator
# Estimator that was chosen by the search, i.e. estimator which gave highest score (or smallest loss if specified) on the left out data. Not available if refit=False.

rgs.best_estimator_

RandomForestClassifier(max_depth=3301, max_features=1, min_samples_split=49,
                       n_estimators=830, random_state=0)

In [131]:
# best_score_: float
# Mean cross-validated score of the best_estimator.
# For multi-metric evaluation, this is not available if refit is False. See refit parameter for more information.
# This attribute is not available if refit is a function.

rgs.best_score_

0.6485861383659609

In [132]:
rgs.best_index_

7

In [None]:
# Can use best_index to return index of highest test score index or rank_test_score for sorted list
rank_test_score = rgs.cv_results_['rank_test_score']
rank_test_score

In [None]:
print(f'The best score of {rgs.best_score_} is found within the mean_test_score array at index = {rgs.best_index_}')

In [None]:
end_time = time.time()
run_time = end_time - start_time
print(f'It took {run_time} seconds to above process.')

In [None]:
# All data in cv_results
print(rgs.cv_results_)

In [None]:
#https://www.codegrepper.com/code-examples/python/how+to+improve+accuracy+of+random+forest+classifier
def display(results):
    print(f'Best parameters are: {results.best_params_}')
    print("\n")
    mean_score = results.cv_results_['mean_test_score']
    std_score = results.cv_results_['std_test_score']
    params = results.cv_results_['params']
    for mean,std,params in zip(mean_score,std_score,params):
        print(f'{round(mean,3)} + or -{round(std,3)} for the {params}')

In [None]:
#https://scikit-learn.org/stable/auto_examples/model_selection/plot_multi_metric_evaluation.html#sphx-glr-auto-examples-model-selection-plot-multi-metric-evaluation-py