In [4]:
##### FRED updates its data (the feature set data) the following morning at ~ 9:30 A.M. CDT (~ 10:30 A.M. EDT)
##### Since lagged feature set values are used for the model, this does not pose any issues (e.g. Lag = 5 days; feature data from 5 days ago is available for the model)

##### yahooFinance data is available the same day after the equity market close; however, there is no point in obtaining this data the same day
##### Run full data sourcing process and model for T-1 on T at ~ 10:30 A.M Eastern Daylight Time


# Forward Testing Version 1:  

In [29]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
from pathlib import Path
import time
import datetime
import hvplot.pandas

#Import SKLearn Library and CLasses
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import make_classification
from sklearn import metrics

from joblib import dump, load
import joblib

import yfinance as yf
import pyfolio as pf
from pyfolio import timeseries 
import matplotlib.pyplot as plt
import empyrical
%matplotlib inline

In [30]:
# Feature Set
# ICE BofA US High Yield Index Option-Adjusted Spread (BAMLH0A0HYM2)
# ICE BofA US Corporate Index Option-Adjusted Spread (BAMLC0A0CM)
# ICE BofA BBB US Corporate Index Option-Adjusted Spread (BAMLC0A4CBBB)
# ICE BofA BB US High Yield Index Option-Adjusted Spread (BAMLH0A1HYBB)
# ICE BofA CCC & Lower US High Yield Index Option-Adjusted Spread (BAMLH0A3HYC)

In [31]:
# Load the credit-spread feature set (presumably daily percentage changes, per the `_pct`
# file name), indexed by date.
feature_set_pct_path = Path('AutoOutputFiles/df_key_credit_data_usa_adjusted_pct.csv')
# `infer_datetime_format` is intentionally not passed: it is deprecated in pandas 2.x, where
# consistent date-format inference is already the default behaviour of `parse_dates=True`.
df_feature_set = pd.read_csv(feature_set_pct_path, index_col="Date", parse_dates=True)

# Preview the most recent rows before the lag shift is applied
print('Feature set prior to shift:\n')
df_feature_set.tail()

Feature set prior to shift:



Unnamed: 0_level_0,BAMLH0A0HYM2,BAMLC0A0CM,BAMLC0A4CBBB,BAMLH0A1HYBB,BAMLH0A3HYC
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-10-15,-0.018868,-0.011111,-0.017857,-0.035714,0.001536
2021-10-18,-0.003205,0.0,0.0,0.0,-0.004601
2021-10-19,-0.006431,0.0,0.0,-0.013889,-0.003082
2021-10-20,-0.006472,0.0,0.0,-0.004695,-0.001546
2021-10-21,-0.013029,0.0,0.0,-0.023585,-0.004644


In [36]:
# Load the equity (target) data: close level, daily return and the binary PositiveReturn flag,
# indexed by date.
target_set_levels_path = Path('AutoOutputFiles/df_equity_data.csv')
# `infer_datetime_format` is intentionally not passed: it is deprecated in pandas 2.x, where
# consistent date-format inference is already the default behaviour of `parse_dates=True`.
equity_data = pd.read_csv(target_set_levels_path, index_col="Date", parse_dates=True)

# Preview the most recent rows of the target set as imported
print('\nTarget set on import:\n')
equity_data.tail()


Target set on import:



Unnamed: 0_level_0,Close,EquityPriceReturns,PositiveReturn
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2021-10-18,447.190002,0.002961,1
2021-10-19,450.640015,0.007715,1
2021-10-20,452.410004,0.003928,1
2021-10-21,453.589996,0.002608,1
2021-10-22,453.119995,-0.001036,0


In [37]:
# Number of rows by which the feature values are pushed forward in time
lag = 30

# Move every feature value `lag` rows later, then drop the rows left without a complete set of
# values (the leading `lag` rows, plus any other row containing a NaN).
# NOTE: this cell overwrites `df_feature_set`, so re-running it without re-loading the CSV
# shifts the data a second time.
shifted_features = df_feature_set.shift(periods=lag)
df_feature_set = shifted_features.dropna()

# Preview the most recent rows after the shift
print('\nFeature set post shift and dates will now more easily be understood for forward test\n')
df_feature_set.tail()


Feature set post shift and dates will now more easily be understood for forward test



Unnamed: 0_level_0,BAMLH0A0HYM2,BAMLC0A0CM,BAMLC0A4CBBB,BAMLH0A1HYBB,BAMLH0A3HYC
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-10-15,-0.003106,0.0,0.0,0.0,-0.006494
2021-10-18,0.018692,0.0,0.0,0.02193,0.013072
2021-10-19,-0.003058,0.0,-0.00885,-0.008584,0.009677
2021-10-20,-0.009202,-0.01087,0.0,-0.008658,-0.01278
2021-10-21,0.009288,0.0,0.0,0.004367,0.009709


In [39]:
# With a lag of 30 in use for forward testing, the feature values stored on each date are the
# values from 30 rows (trading days) earlier.
# The equity data (target set) runs one date beyond the feature set, e.g. equity data ends on
# 2021-10-22 while the feature set ends on 2021-10-21. The inner join below keeps only the dates
# present in both frames, so the combined frame ends on the last available feature-set date.
# NOTE(review): the original note also observed that the equity position must be entered/exited
# at the end of the prior period to earn the next day's return, but trailed off before stating
# the conclusion - confirm the intended point.
df_trading_signals = pd.concat(
    [equity_data, df_feature_set],
    axis=1,
    join='inner',
)

# Preview the most recent rows of the combined frame
print('\nTrading Signals DataFrame used for modelling:\n')
df_trading_signals.tail()


Trading Signals DataFrame used for modelling:



Unnamed: 0_level_0,Close,EquityPriceReturns,PositiveReturn,BAMLH0A0HYM2,BAMLC0A0CM,BAMLC0A4CBBB,BAMLH0A1HYBB,BAMLH0A3HYC
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2021-10-15,445.869995,0.007616,1,-0.003106,0.0,0.0,0.0,-0.006494
2021-10-18,447.190002,0.002961,1,0.018692,0.0,0.0,0.02193,0.013072
2021-10-19,450.640015,0.007715,1,-0.003058,0.0,-0.00885,-0.008584,0.009677
2021-10-20,452.410004,0.003928,1,-0.009202,-0.01087,0.0,-0.008658,-0.01278
2021-10-21,453.589996,0.002608,1,0.009288,0.0,0.0,0.004367,0.009709


In [40]:
# Explain why the lagged features were aligned to the prediction date
alignment_notes = (
    'The feature set is ready for forward testing',
    'This process was followed as we want the lagged feature set to align with the prediction date',
    'For example:  5 Day lag requires the data set to end at the prior 5 Day Date',
    'I find it confusing if the predicted value uses the prior 5 day (T-5) date',
)
print('\n'.join(alignment_notes))

# Feature columns handed to the model, in this fixed order
X_variables = ['BAMLH0A0HYM2', 'BAMLC0A0CM', 'BAMLC0A4CBBB', 'BAMLH0A1HYBB', 'BAMLH0A3HYC']
X = df_trading_signals.loc[:, X_variables]
X

The feature set is ready for forward testing
This process was followed as we want the lagged feature set to align with the prediction date
For example:  5 Day lag requires the data set to end at the prior 5 Day Date
I find it confusing if the predicted value uses the prior 5 day (T-5) date


Unnamed: 0_level_0,BAMLH0A0HYM2,BAMLC0A0CM,BAMLC0A4CBBB,BAMLH0A1HYBB,BAMLH0A3HYC
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011-10-25,0.030195,0.012048,0.009804,0.031175,0.030205
2011-10-26,0.012069,0.000000,0.000000,0.009302,0.018848
2011-10-27,0.049404,0.029762,0.029126,0.050691,0.050360
2011-10-28,0.012987,0.005780,0.004717,0.013158,0.018591
2011-10-31,0.096154,0.091954,0.075117,0.106061,0.091258
...,...,...,...,...,...
2021-10-15,-0.003106,0.000000,0.000000,0.000000,-0.006494
2021-10-18,0.018692,0.000000,0.000000,0.021930,0.013072
2021-10-19,-0.003058,0.000000,-0.008850,-0.008584,0.009677
2021-10-20,-0.009202,-0.010870,0.000000,-0.008658,-0.012780


In [41]:
#fl_nm = 'model_candidates/Lag_' + str(i) + '_random_forest_' + testing_end + '.joblib'


# Period-end date (YYYY-MM-DD) of the finalized model; it forms part of the saved model's file name
finalized_model_period_end_date = '2021-10-15'

# Location of the saved model for the current lag and period-end date
fl_path = (
    'algo_optimal_parameters/back_test_using_mean_grid_values/'
    f'Lag_{lag}_random_forest_{finalized_model_period_end_date}.joblib'
)
print(fl_path)

algo_optimal_parameters/back_test_using_mean_grid_values/Lag_30_random_forest_2021-10-15.joblib


In [42]:
# Load the previously saved model from `fl_path`; there is no need to instantiate a new
# estimator before loading (presumably a fitted random forest classifier, per the file name).
# NOTE(review): joblib files are pickle-based, so only load model files from a trusted source.
loaded_rfc = joblib.load(fl_path)

In [43]:
# Score every row (date) of the lagged feature set X with the loaded model.
# NOTE(review): a prediction of 1 presumably means a positive equity return (cf. the
# PositiveReturn column of the target set) - confirm against the training notebook.
predictions = loaded_rfc.predict(X)

In [44]:
# Quick look at the raw prediction array
print(predictions)

[1 1 1 ... 1 1 1]


In [45]:
# Number of predictions produced (one per row of X)
len(predictions)

2500

In [46]:
# Check which container type predict() returned
type(predictions)

numpy.ndarray

In [47]:
# Wrap the raw prediction array in a pandas Series (default integer index) so the values can be
# inspected alongside their positions
new_predictions = pd.Series(predictions)
new_predictions

0       1
1       1
2       1
3       0
4       1
       ..
2495    1
2496    0
2497    1
2498    1
2499    1
Length: 2500, dtype: int64

In [48]:
# Instantiate an empty (column-less) data frame that reuses the index from data frame "X"
# Data frame X contains the dates we want, so the predictions can later be attached to those dates
df_new_predictions = pd.DataFrame(index=X.index)
df_new_predictions

2011-10-25
2011-10-26
2011-10-27
2011-10-28
2011-10-31
...
2021-10-15
2021-10-18
2021-10-19
2021-10-20
2021-10-21


In [49]:
# Reference for the best method to add the values of a Series as a new column of a DataFrame: 
# https://stackoverflow.com/questions/12555323/adding-new-column-to-existing-dataframe-in-python-pandas

# Example code
# 'e' in the following is the column name to assign 
#df1 = df1.assign(e=pd.Series(np.random.randn(sLength)).values)

In [50]:
# Attach the model predictions as a new column; the array is matched to the date index by position
df_new_predictions = df_new_predictions.assign(ForwardTestPredictions=predictions)
df_new_predictions

Unnamed: 0_level_0,ForwardTestPredictions
Date,Unnamed: 1_level_1
2011-10-25,1
2011-10-26,1
2011-10-27,1
2011-10-28,0
2011-10-31,1
...,...
2021-10-15,1
2021-10-18,0
2021-10-19,1
2021-10-20,1


In [51]:
# Pull the actual equity price returns for the modelled dates
var_list = ['EquityPriceReturns']
equity_actual_return = df_trading_signals.loc[:, var_list]

# Place predictions and actual returns side by side, keeping only dates present in both frames
frames = [df_new_predictions, equity_actual_return]
df_forward_test_performance_results = pd.concat(frames, axis=1, join='inner')

# Values after October 15, 2021 reflect the forward test predictions & results
df_forward_test_performance_results

Unnamed: 0_level_0,ForwardTestPredictions,EquityPriceReturns
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2011-10-25,1,-0.019444
2011-10-26,1,0.010159
2011-10-27,1,0.034835
2011-10-28,0,-0.000233
2011-10-31,1,-0.024106
...,...,...
2021-10-15,1,0.007616
2021-10-18,0,0.002961
2021-10-19,1,0.007715
2021-10-20,1,0.003928


In [54]:
# Only retain the forward test dates so the code below calculates the forward test cumulative return.
# Forward-test rows are those dated strictly after the period-end date of the finalized model
# (`finalized_model_period_end_date`). Selecting by date replaces the previous hard-coded
# `.iloc[-4:]`, so the window stays correct as new data arrives and the cell gives the same
# result when it is re-run.
forward_test_start = pd.Timestamp(finalized_model_period_end_date)
is_forward_test_date = pd.to_datetime(df_forward_test_performance_results.index) > forward_test_start
# .copy() so the new columns are written to an independent frame, not to a slice of the full history
df_forward_test_performance_results = df_forward_test_performance_results.loc[is_forward_test_date].copy()

# Daily strategy return: the equity return on dates where the prediction is 1, otherwise 0
df_forward_test_performance_results['Equity_Position'] = (
    df_forward_test_performance_results['ForwardTestPredictions']
    * df_forward_test_performance_results['EquityPriceReturns']
)
# Growth of 1 unit invested in the strategy vs. always holding the equity over the forward-test window
df_forward_test_performance_results['Strategy_Cum_Rtn'] = (1 + df_forward_test_performance_results['Equity_Position']).cumprod()
df_forward_test_performance_results['Equity_Cum_Rtn'] = (1 + df_forward_test_performance_results['EquityPriceReturns']).cumprod()

In [55]:
# Display the forward-test results: predictions, actual returns, strategy returns and cumulative returns
df_forward_test_performance_results

Unnamed: 0_level_0,ForwardTestPredictions,EquityPriceReturns,Equity_Position,Strategy_Cum_Rtn,Equity_Cum_Rtn
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-10-19,1,0.007715,0.007715,1.007715,1.007715
2021-10-20,1,0.003928,0.003928,1.011673,1.011673
2021-10-21,1,0.002608,0.002608,1.014312,1.014312
