In [12]:
# basic
import numpy as np
import pandas as pd

# diagnostics
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score

# models
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
import statsmodels.api as sm
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

# cosmetics
import warnings
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter -- it also hides warnings raised by the
# libraries imported above (e.g. pandas' chained-assignment warnings), so real
# problems can go unnoticed. Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Preparation

In [15]:
# Load the cleaned panel from disk and turn the textual `date` column into
# real datetimes in a single chained expression (no in-place mutation).
data = (
    pd.read_csv('data/cleaned_data.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)
# Preview the first rows as the cell output.
data.head()

Unnamed: 0,permno,date,ticker,log_size,log_bm,log_pcf,mom,strev,vol,roa,roe,log_age_lb,price,log_to,rf,rm,ols_3m_d,ols_1y_d,ols_5y_m,f_ols_1y_d
0,10495,1970-02-27,A,6.47049,-0.210721,1.350408,-0.578969,0.072728,0.064539,0.165,0.08,3.788477,29.5,16.413102,0.006098,0.057959,1.374051,0.853367,1.140401,0.930236
1,10495,1970-03-31,A,6.457554,0.579978,0.988053,-0.522672,0.002921,0.064813,0.228,0.086,3.790459,29.125,16.642725,0.005685,-0.004832,0.987517,0.853597,1.139011,0.959935
2,10495,1970-04-30,A,6.386543,0.579978,0.988053,-0.536403,-0.068671,0.065203,0.228,0.086,3.792313,27.125,16.194663,0.005072,-0.104827,0.893758,0.889231,1.119059,0.971245
3,10495,1970-05-29,A,6.363201,0.579978,0.988053,-0.40425,-0.023041,0.066323,0.228,0.086,3.794103,26.5,16.513551,0.005263,-0.064084,1.008542,0.940364,1.115537,0.962626
4,10495,1970-06-30,A,6.26416,0.579978,1.494476,-0.113006,-0.077634,0.066461,0.228,-0.001,3.796074,24.0,15.603262,0.005736,-0.052295,1.02386,0.958821,1.093605,0.949843


In [16]:
# Predictors fed to the models; the target is the `f_ols_1y_d` column.
selected_features = ['log_size', 'log_pcf', 'log_to', 'vol', 'ols_3m_d', 'ols_1y_d', 'ols_5y_m']


def _zscore_clip(series, limit=3):
    """Standardize one column to zero mean / unit std, then clip to [-limit, limit].

    Parameters
    ----------
    series : pd.Series
        A single numeric feature column.
    limit : float, default 3
        Symmetric bound (in standard deviations) applied after standardizing.

    Returns
    -------
    pd.Series
        The standardized, clipped column (same index as the input).
    """
    standardized = (series - series.mean()) / series.std()
    return standardized.clip(lower=-limit, upper=limit)


y = data['f_ols_1y_d']

# Build X as a brand-new frame instead of assigning into a slice of `data`;
# the previous column-by-column assignment triggered pandas' chained-assignment
# (SettingWithCopy) warning.
# NOTE(review): mean/std are computed over ALL rows, including the validation and
# test periods (look-ahead). Left as-is because the saved model loaded later is
# presumably fitted on features scaled this way -- confirm before changing.
X = data[selected_features].apply(_zscore_clip)

# Split train, val, test by calendar time
test_time_start = data['date'].max() - pd.DateOffset(years=5)  # last 5 years are the test set
val_time_start = test_time_start - pd.DateOffset(years=2)  # the 2 years before that are validation

# Compute each boolean row mask once and reuse it for data, X and y.
is_test = data['date'] >= test_time_start
is_val = (data['date'] >= val_time_start) & (data['date'] < test_time_start)
is_train = data['date'] < val_time_start
is_pre_test = data['date'] < test_time_start  # train + validation rows

data_test, data_val, data_train = data[is_test], data[is_val], data[is_train]
X_test, X_val, X_train = X[is_test], X[is_val], X[is_train]
y_test, y_val, y_train = y[is_test], y[is_val], y[is_train]

# special train set for time-series cross validation (automanaged by sklearn):
# everything before the test period, i.e. train and validation rows together.
X = X[is_pre_test]
y = y[is_pre_test]

In [18]:
# Restore the previously saved XGBoost regressor from its JSON file, then
# generate predictions for the test-period feature matrix.
model_path = 'models/XGB.json'
best_model = xgb.XGBRegressor()
best_model.load_model(model_path)
y_pred = best_model.predict(X_test)

In [19]:
y_pred # model predictions of f_ols_1y_d for the test rows (final 5 years of the sample)

array([1.2827435, 1.2893898, 1.2072872, ..., 1.3300282, 1.3281208,
       1.3565236], dtype=float32)