# Random Forest Classifier 
### Determine optimal model parameters using RandomizedSearchCV

In [7]:
import warnings
import pandas as pd
import numpy as np
from pathlib import Path
import time
import datetime
import hvplot.pandas
#Import SKLearn Library and CLasses
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import make_classification
from sklearn import metrics

In [8]:
start_time = time.time()
run_date = datetime.datetime.now().strftime('%Y_%m_%d')

In [9]:
warnings.filterwarnings('ignore')

In [14]:
# Open signals file for feature set
feature_set_pct_path = Path('finalized_models_grid/df_key_credit_data_usa_adjusted_pct.csv')
feature_set_pct_path

WindowsPath('finalized_models_grid/df_key_credit_data_usa_adjusted_pct.csv')

In [16]:
X = pd.read_csv(feature_set_pct_path, index_col="Date", infer_datetime_format=True, parse_dates=True)
X.head()

FileNotFoundError: [Errno 2] No such file or directory: 'finalized_models_grid\\df_key_credit_data_usa_adjusted_pct.csv'

In [5]:
# Construct a data frame for equity security (target/dependent variable)
tareget_set_levels_path = Path('AutoOutputFiles/df_equity_data.csv')
equity_data = pd.read_csv(tareget_set_levels_path, index_col="Date", infer_datetime_format=True, parse_dates=True)
equity_data.head()

Unnamed: 0_level_0,Close,EquityPriceReturns,PositiveReturn
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011-10-18,100.929893,0.019546,1
2011-10-19,99.735985,-0.011829,0
2011-10-20,100.172394,0.004376,1
2011-10-21,102.074387,0.018987,1
2011-10-24,103.325928,0.012261,1


In [6]:
# Concatenate the equity and credit dataframes for modelling
df_trading_signals = pd.concat([equity_data, X], axis='columns', join='inner')
print(f'The DataFrame named "df_trading_signals" constructed to hold Equity closing price levels, Equity daily returns, indication of positive, 1, and negative, 0, Equity returns.\n')
print(f'"df_trading_signals" has a shape of {df_trading_signals.shape}\n')
df_trading_signals.to_csv('AutoOutputFiles/df_trading_signals_random_forest_version_0001.csv')
df_trading_signals.head()

The DataFrame named "df_trading_signals" constructed to hold Equity closing price levels, Equity daily returns, indication of positive, 1, and negative, 0, Equity returns.

"df_trading_signals" has a shape of (2500, 8)



Unnamed: 0_level_0,Close,EquityPriceReturns,PositiveReturn,BAMLH0A0HYM2,BAMLC0A0CM,BAMLC0A4CBBB,BAMLH0A1HYBB,BAMLH0A3HYC
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-10-18,100.929893,0.019546,1,-0.001269,0.0,-0.003344,-0.001684,0.004383
2011-10-19,99.735985,-0.011829,0,-0.021601,-0.024096,-0.016779,-0.021922,-0.024
2011-10-20,100.172394,0.004376,1,-0.007792,-0.004115,-0.006826,-0.010345,-0.005216
2011-10-21,102.074387,0.018987,1,-0.02356,-0.016529,-0.010309,-0.022648,-0.022472
2011-10-24,103.325928,0.012261,1,-0.020107,-0.016807,-0.010417,-0.023173,-0.014559


In [7]:
df_trading_signals.isnull().sum()

Close                 0
EquityPriceReturns    0
PositiveReturn        0
BAMLH0A0HYM2          0
BAMLC0A0CM            0
BAMLC0A4CBBB          0
BAMLH0A1HYBB          0
BAMLH0A3HYC           0
dtype: int64

## Model

#### Assign Training and Testing Windows

In [8]:
training_start = df_trading_signals.index.min().strftime(format='%Y-%m-%d')
training_end = '2018-12-14'
testing_start = '2018-12-15'
testing_end = df_trading_signals.index.max().strftime(format='%Y-%m-%d')

#### Define X and Y Training & Test Datasets

In [9]:
# Manually split train and test datasets
# SK Learn library also exists for train test split, but following manual approach
x_variables = ['BAMLH0A0HYM2', 'BAMLC0A0CM', 'BAMLC0A4CBBB','BAMLH0A1HYBB', 'BAMLH0A3HYC']

x_train = df_trading_signals[x_variables][training_start:training_end]
y_train = df_trading_signals['PositiveReturn'][training_start:training_end]
#x_train.tail()
#y_train.tail()

In [10]:
# X and Y Testing Datasets
x_test = df_trading_signals[x_variables][testing_start:testing_end]
y_test = df_trading_signals['PositiveReturn'][testing_start:testing_end]
#x_test.tail()
#y_test.tail()

#### RandomizedSearchCV

In [11]:
rfc = RandomForestClassifier(random_state=0)
input_cv_value = 5
input_random_state = 0
input_return_train_score = True

forest_params = [
    {
        'max_depth': list(range(9, 6001)), 
        'max_features': list(range(1,6)), 
        'n_estimators': list(range(1,1001)), 
        'min_samples_split': list(range(1,51))
    }
]

rgs = RandomizedSearchCV(rfc, forest_params, cv=input_cv_value, scoring='accuracy', return_train_score=input_return_train_score)

In [12]:
rgs.fit(x_train, y_train)

RandomizedSearchCV(cv=5, estimator=RandomForestClassifier(random_state=0),
                   param_distributions=[{'max_depth': [9, 10, 11, 12, 13, 14,
                                                       15, 16, 17, 18, 19, 20,
                                                       21, 22, 23, 24, 25, 26,
                                                       27, 28, 29, 30, 31, 32,
                                                       33, 34, 35, 36, 37, 38, ...],
                                         'max_features': [1, 2, 3, 4, 5],
                                         'min_samples_split': [1, 2, 3, 4, 5, 6,
                                                               7, 8, 9, 10, 11,
                                                               12, 13, 14, 15,
                                                               16, 17, 18, 19,
                                                               20, 21, 22, 23,
                                                     

#### cv_results_ Attributes 
#### https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html
#### cv_results_ = dict of numpy (masked) ndarrays: A dict with keys as column headers and values as columns, that can be imported into a pandas DataFrame.

In [19]:
# Rather than using best_params_, multiple iterations are run and mean values for best_params_ are used for model parameter

In [20]:
# best_params = dictionary
best_params = rgs.best_params_
#best_params

In [21]:
# For the purpose of this project, only best_params will be retained for multuple iterations and analysis
df_best_params = pd.DataFrame.from_dict(best_params, orient = 'index')
df_best_params = df_best_params.transpose()
df_best_params['run_date'] = run_date
fl_nm = 'df_best_params_' + run_date + '.csv'
df_best_params.to_csv('AutoOutputFiles/' + fl_nm)
df_best_params

Unnamed: 0,n_estimators,min_samples_split,max_features,max_depth,run_date
0,95,38,4,1029,2021_10_18


In [18]:
#https://scikit-learn.org/stable/auto_examples/model_selection/plot_multi_metric_evaluation.html#sphx-glr-auto-examples-model-selection-plot-multi-metric-evaluation-py