In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.dates as mdates
import pandas_datareader.data as web
from datetime import datetime, timedelta
import scipy.stats as stats
from sklearn.metrics import brier_score_loss, roc_curve, auc, log_loss
from sklearn.preprocessing import StandardScaler

| market_category | feature_name | id |
|-----------------|--------------|----|
| Bank            | bac          |  1 |
| Bank            | citi         |  2 |
| Commodity       | corn         |  3 |
| Currency        | euro         |  4 |
| Commodity       | gold         |  5 |
| Inflation       | infl5y       |  6 |
| Commodity       | iyr          |  7 |
| Currency        | pound        |  8 |
| Commodity       | silver       |  9 |
| Commodity       | soybns       | 10 |
| Equity          | sp12m        | 11 |
| Equity          | sp6m         | 12 |
| Commodity       | wheat        | 13 |
| Currency        | yen          | 14 |


Return Model (Log Price)

In [13]:
df = pd.read_csv("mpd_sp500.csv")
df['Date'] = pd.to_datetime(df['Date'])
df = df.set_index('Date')

In [14]:
# forwards filling
df = df.fillna(method='ffill')

In [15]:
# create a new df that extract the columns of SP_adj_close	SP_lg_pr	SP_lg_ret(%)	VIX
data = df[['SP_adj_close', 'SP_lg_pr', 'SP_lg_ret(%)', 'VIX']]
data

Unnamed: 0_level_0,SP_adj_close,SP_lg_pr,SP_lg_ret(%),VIX
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-01-17,1480.939941,7.300432,0.597345,0.1357
2013-01-24,1494.819946,7.309761,0.932878,0.1269
2013-01-31,1498.109985,7.311960,0.219854,0.1428
2013-02-07,1509.390015,7.319461,0.750130,0.1350
2013-02-14,1521.380005,7.327373,0.791222,0.1266
...,...,...,...,...
2024-01-10,4783.450195,8.472917,1.657668,0.1269
2024-01-17,4739.209961,8.463626,-0.929164,0.1479
2024-01-24,4868.549805,8.490551,2.692566,0.1314
2024-01-31,4845.649902,8.485837,-0.471474,0.1435


In [16]:
# keep columns that have names containing f11 and f12 only
df = df.filter(regex='f11|f12')


In [17]:
# merge data to df merge on index
df = pd.merge(df, data, left_index=True, right_index=True, how='left')

In [18]:
# drop columns that has "maturity_target" , "lg_change_decr", and "lg_change_incr" in the column name; those are irrelevant for feature selection
df = df[df.columns.drop(list(df.filter(regex='maturity_target')))]
df = df[df.columns.drop(list(df.filter(regex='lg_change_decr')))]
df = df[df.columns.drop(list(df.filter(regex='lg_change_incr')))]
df = df[df.columns.drop(list(df.filter(regex='SP_adj_close')))]

# drop SP_lg_ret(%)	
df = df.drop(['SP_lg_ret(%)'], axis=1)
# df = df.drop(['SP_lg_pr'], axis=1)
df = df.drop(['VIX'], axis=1)



In [19]:
# Generate lagged variables from f1_mu to SP_lg_pr
lags = 6
for lag in range(1, lags+1):
    # for col in df.columns[df.columns.get_loc('f1_mu'):df.columns.get_loc('SP_lg_ret_vol')+1]:
    # for col in df.columns[df.columns.get_loc('f1_mu'):df.columns.get_loc('VIX')+1]: 
    for col in df.columns[df.columns.get_loc('f11_mu'):df.columns.get_loc('SP_lg_pr')+1]: 
    #for col in df.columns[df.columns.get_loc('f11_mu'):df.columns.get_loc('SP_lg_ret(%)')+1]:    
        df[f'{col}_lag{lag}'] = df[col].shift(lag)

  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)
  df[f'{col}_lag{lag}'] = df[col].shift(lag)


In [20]:
df_lagged = df.copy()
# drop NA rows
df_lagged = df_lagged.dropna()
df_lagged.shape


(571, 133)

In [21]:
# df_lagged.to_csv('mpd_sp500_lagged_log_price.csv', index=False)

In [22]:
# Define the target variable
start_colunm = df_lagged.columns.get_loc('f11_mu_lag1')
# end_column = df_lagged.columns.get_loc('VIX_lag6')
end_column = df_lagged.columns.get_loc('SP_lg_pr_lag6')
#end_column = df_lagged.columns.get_loc('SP_lg_ret(%)_lag6')

column_index = list(range(start_colunm, end_column+1))

X = df_lagged.iloc[:, column_index]
# y = df_lagged['SP_lg_ret(%)'] 
y = df_lagged['SP_lg_pr'] 

split_index = int(len(X)*0.75)
X_train = X[:split_index]
X_test = X[split_index:]
y_train = y[:split_index]
y_test = y[split_index:]

In [23]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((428, 114), (143, 114), (428,), (143,))

In [24]:
# run a lasso regression to select features
from sklearn.linear_model import LassoCV

lassoCV = LassoCV(cv=10, random_state=12345, max_iter=10000, tol=0.0001, selection='random')
lassoCV.fit(X_train, y_train)

In [25]:
print("In Sample R^2: ", f'{lassoCV.score(X_train, y_train):.5f}')
print()
print("Out of Sample R^2: ", f'{lassoCV.score(X_test, y_test):.5f}')
print()
# lasso coefficients with corresponding feature names
lasso_coef = pd.DataFrame(lassoCV.coef_, index=X.columns, columns=['coef'])
lasso_coef = lasso_coef[lasso_coef.coef != 0]

print("Number of features selected: ", len(lasso_coef))
print(lasso_coef)

print()
# show the predicted value
lass_y_pred = lassoCV.predict(X_test)
# calculate the MSE, RMSE, and MAE
from sklearn.metrics import mean_squared_error, mean_absolute_error
lass_mse = mean_squared_error(y_test, lass_y_pred)
lass_rmse = np.sqrt(lass_mse)
lass_mae = mean_absolute_error(y_test, lass_y_pred)
lass_mape = np.mean(np.abs((y_test - lass_y_pred) / y_test)) * 100

print('Out of Sample Test set evaluation:')
print(f'MSE: {lass_mse:.5f}, RMSE: {lass_rmse:.5f}, MAE: {lass_mae:.5f}, MAPE: {lass_mape:.5f}')


In Sample R^2:  0.99314

Out of Sample R^2:  0.87785

Number of features selected:  12
                   coef
f11_kurt_lag1 -0.005614
f12_kurt_lag1  0.007945
SP_lg_pr_lag1  0.959054
f11_kurt_lag2  0.001958
SP_lg_pr_lag2  0.002003
f11_p10_lag4  -0.035641
f12_kurt_lag4 -0.005773
f11_kurt_lag5  0.007417
f12_kurt_lag5 -0.002573
f11_kurt_lag6 -0.004551
f12_kurt_lag6 -0.000052
SP_lg_pr_lag6  0.033627

Out of Sample Test set evaluation:
MSE: 0.00060, RMSE: 0.02458, MAE: 0.01869, MAPE: 0.22389


Applied StandardScaler

In [26]:
# run a lasso regression to select features
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [27]:
lassoCV2 = LassoCV(cv=10, random_state=12345, max_iter=10000, tol=0.0001, selection='random')
lassoCV2.fit(X_train_scaled, y_train)

In [28]:
print("In Sample R^2: ", f'{lassoCV2.score(X_train_scaled, y_train):.5f}')
print()
print("Out of Sample R^2: ", f'{lassoCV2.score(X_test_scaled, y_test):.5f}')
print()

# lasso coefficients with corresponding feature names
lasso_coef = pd.DataFrame(lassoCV2.coef_, index=X.columns, columns=['coef'])
lasso_coef = lasso_coef[lasso_coef.coef != 0]

print("Number of features selected: ", len(lasso_coef))
print(lasso_coef)

print()
# show the predicted value
lassCV2_y_pred = lassoCV2.predict(X_test_scaled)
# calculate the MSE, RMSE, and MAE
from sklearn.metrics import mean_squared_error, mean_absolute_error
lass_mse = mean_squared_error(y_test, lassCV2_y_pred)
lass_rmse = np.sqrt(lass_mse)
lass_mae = mean_absolute_error(y_test, lassCV2_y_pred)
lass_mape = np.mean(np.abs((y_test - lassCV2_y_pred) / y_test)) * 100

print('Test set evaluation:')
print(f'MSE: {lass_mse:.5f}, RMSE: {lass_rmse:.5f}, MAE: {lass_mae:.5f}, MAPE: {lass_mape:.5f}')


In Sample R^2:  0.99301

Out of Sample R^2:  0.87591

Number of features selected:  7
                    coef
f12_mu_lag1     0.000848
SP_lg_pr_lag1   0.213467
SP_lg_pr_lag2   0.017935
f12_p50_lag3    0.000948
f12_prInc_lag3  0.003012
f11_prInc_lag4  0.000592
f12_prInc_lag4  0.001080

Test set evaluation:
MSE: 0.00061, RMSE: 0.02478, MAE: 0.01900, MAPE: 0.22764


Using Log Price Lasso Regression, unscalered has better result (LassoCV)

In [30]:
# convert to dataframe
lassCV2_y_pred_lp = pd.DataFrame(lassCV2_y_pred)
# rename to Predicted_SP_lg_pr
lassCV2_y_pred_lp.columns = ['Predicted_SP_lg_pr']
lassCV2_y_pred_lp.to_csv('Predicted_SP_lg_pr.csv', index=False)