In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import pandas as pd, warnings
# warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

# Directory holding the input CSV files (relative path); load_local_preprocessed_data()
# prepends it to each file name, so the trailing slash is required.
path_to_data_dir = "../../data/"

def load_local_preprocessed_data():
	"""Read the feature matrix and the S&P-adjusted target from local CSV files.

	Both files are looked up under ``path_to_data_dir`` and are indexed by the
	``date`` and ``ticker`` columns.

	Returns:
		tuple: ``(features, target)`` where ``features`` is the full feature
		DataFrame and ``target`` is the ``"change_adj_s&p"`` column of the
		target file.
	"""
	index_columns = ["date", "ticker"]

	features = pd.read_csv(path_to_data_dir + "target_adjusted_features.csv").set_index(index_columns)
	targets = pd.read_csv(path_to_data_dir + "s&p_adjusted_target.csv").set_index(index_columns)

	return features, targets["change_adj_s&p"]

def split_date(all_dates, train_size = 0.75, days_ahead = 5):
	"""Cut an ordered date sequence into train, validation and test slices.

	``days_ahead`` dates are set aside first; ``train_size`` of the remainder
	(truncated towards zero) forms the training slice. The following
	``days_ahead`` dates form the validation slice and everything after that
	is the test slice.

	Args:
		all_dates: Sliceable sequence of dates, already in the order to split.
		train_size: Fraction of ``len(all_dates) - days_ahead`` used for training.
		days_ahead: Number of dates placed in the validation slice.

	Returns:
		tuple: ``(train_dates, val_dates, test_dates)`` slices of ``all_dates``.
	"""
	train_end = int((len(all_dates) - days_ahead) * train_size)
	val_end = train_end + days_ahead

	return all_dates[:train_end], all_dates[train_end:val_end], all_dates[val_end:]

In [3]:
# Load the full feature matrix and the target series
X_df, y_series = load_local_preprocessed_data()

# Chronological split over the sorted unique dates.
# NOTE(review): the two earliest dates are skipped by the [2:] slice; the reason is not recorded here.
train_dates, val_dates, test_dates = split_date(
	X_df.index.get_level_values(0).unique().sort_values()[2:]
)
print(f"# of train dates ({len(train_dates)}), val dates ({len(val_dates)}), test dates ({len(test_dates)})")

# Select the rows belonging to each group of dates
X_train = X_df.loc[train_dates]
y_train = y_series.loc[train_dates]

X_val = X_df.loc[val_dates]
y_val = y_series.loc[val_dates]

X_test = X_df.loc[test_dates]
y_test = y_series.loc[test_dates]

print(f"Train shapes: {X_train.shape, y_train.shape}")
print(f"Val shapes: {X_val.shape, y_val.shape}")
print(f"Test shapes: {X_test.shape, y_test.shape}")

# of train dates (19), val dates (5), test dates (7)
Train shapes: ((29745, 339), (29745,))
Val shapes: ((7778, 339), (7778,))
Test shapes: ((11263, 339), (11263,))


In [4]:
# Preprocess
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# "sector" and "area" are one-hot encoded; every other column is standardised
onehot_features = ["sector", "area"]
standard_scale_features = [col for col in X_train.columns.values if col not in onehot_features]

preprocessor = ColumnTransformer(transformers = [
	("onehot", OneHotEncoder(sparse_output=False, handle_unknown="ignore"), onehot_features),
	("std", StandardScaler(), standard_scale_features),
])

In [5]:
# Assemble pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import PredefinedSplit, GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import make_pipeline

def xgb_RMSE(model_name, model_algo, preprocessor, X_other, y_other, X_test, y_test, param_grid, moving_train_window = False):
	"""Tune and score a model with a walk-forward, date-based validation scheme.

	The sorted unique dates in the first index level of ``X_other`` are split
	with ``TimeSeriesSplit(n_splits=5, test_size=1, gap=5)``, so each fold
	validates on a single date and 5 dates are excluded between the end of the
	training dates and that validation date. For every fold:

	1. a grid search over the matching ``param_grid`` entries selects the
	   parameters with the lowest validation MSE (no refit),
	2. a pipeline of ``preprocessor`` + ``model_algo`` is fitted on the fold's
	   training rows with those parameters,
	3. RMSE, R2 and the standard deviation of the predictions are printed and
	   recorded for the training rows, the validation rows and ``X_test``.

	``X_test`` / ``y_test`` are only scored for reporting; they take no part in
	the parameter selection.

	Args:
		model_name: Substring used to pick the relevant keys out of
			``param_grid`` (e.g. ``"xgbregressor"``).
		model_algo: Unfitted estimator used as the final pipeline step. It is
			cloned for every fold and is not modified.
		preprocessor: Transformer used as the first pipeline step. It is cloned
			for every fold and is not modified.
		X_other: Feature DataFrame whose first index level holds the dates.
		y_other: Target indexed like ``X_other``.
		X_test: Hold-out features scored in every fold.
		y_test: Hold-out target matching ``X_test``.
		param_grid: Mapping of pipeline parameter names to candidate values;
			only keys containing ``model_name`` are searched.
		moving_train_window: When true, each fold trains on at most its 10
			most recent training dates instead of all earlier dates.

	Returns:
		tuple: ``(rmse_scores, r2_scores, score_deviations, best_params,
		best_models)``. Each element is a dict keyed by the fold's validation
		date. The first three hold ``[train, val, test]`` lists of Python
		floats, ``best_params`` holds the selected parameter dict and
		``best_models`` holds the pipeline fitted for that fold.
	"""
	# Imported locally because the notebook's import cells do not provide it.
	from sklearn.base import clone

	rmse_scores = {}
	r2_scores = {}
	score_deviations = {}
	best_params = {}
	best_models = {}

	time_split = TimeSeriesSplit(n_splits=5, test_size = 1, gap = 5)

	all_dates = X_other.index.get_level_values(0).unique().sort_values()

	# Keep only the grid entries that belong to this model
	param_grid = {key: values for key, values in param_grid.items() if model_name in key}

	# The splitter only needs the number of dates; it yields positions into all_dates
	for train_date_idx, val_date_idx in time_split.split(all_dates):
		if moving_train_window:
			train_date_idx = train_date_idx[-10:]

		train_dates = all_dates[train_date_idx]
		val_date = all_dates[val_date_idx]
		val_key = val_date[0]

		train_set_X = X_other.loc[train_dates]
		train_set_y = y_other.loc[train_dates]

		val_set_X = X_other.loc[val_date]
		val_set_y = y_other.loc[val_date]

		other_set_X = pd.concat([train_set_X, val_set_X], axis = 0).reset_index(drop = True)
		other_set_y = pd.concat([train_set_y, val_set_y], axis = 0).reset_index(drop = True)

		# Single predefined fold: -1 rows are train-only, the remaining rows are the
		# validation fold. This stops GridSearchCV from doing its own shuffled CV.
		ps = PredefinedSplit(([-1] * train_set_X.shape[0]) + ([1] * val_set_X.shape[0]))

		# Clone the steps so every fold owns its estimators. Without this, all
		# pipelines share the same objects and every entry of best_models ends up
		# being the fit of the last fold.
		pipe = make_pipeline(clone(preprocessor), clone(model_algo))

		grid = GridSearchCV(
			pipe, cv = ps, param_grid = param_grid, scoring = "neg_mean_squared_error",
			return_train_score = True, verbose = True, refit=False)

		grid.fit(other_set_X, other_set_y)

		print(f"Best params for val date: {val_key}")
		print(grid.best_params_)

		# refit=False above, so fit the winning configuration on the training rows only
		pipe.set_params(**grid.best_params_)
		pipe.fit(train_set_X, train_set_y)

		truths = [train_set_y, val_set_y, y_test]
		preds = [pipe.predict(features) for features in (train_set_X, val_set_X, X_test)]

		# sqrt(MSE) equals the former `squared=False` result for a single target and
		# does not depend on that deprecated/removed keyword. float() turns NumPy
		# scalars (e.g. float32 from the predictions) into JSON-serialisable floats.
		rmse_triplet = [float(np.sqrt(mean_squared_error(truth, pred))) for truth, pred in zip(truths, preds)]
		r2_triplet = [float(r2_score(truth, pred)) for truth, pred in zip(truths, preds)]
		stddev_triplet = [float(pred.std()) for pred in preds]

		for label, (train_value, val_value, test_value) in (
				("RMSE", rmse_triplet), ("R2", r2_triplet), ("Stddev", stddev_triplet)):
			print(f"{label} for best estimator:")
			print(f"Train: {train_value:.6f}")
			print(f"Val: {val_value:.6f}")
			print(f"Test: {test_value:.6f}")

		rmse_scores[val_key] = rmse_triplet
		r2_scores[val_key] = r2_triplet
		score_deviations[val_key] = stddev_triplet
		best_params[val_key] = grid.best_params_
		best_models[val_key] = pipe

	return rmse_scores, r2_scores, score_deviations, best_params, best_models

In [6]:
# Pool the train and validation rows; this pool is what xgb_RMSE splits by date
X_other = pd.concat([X_train, X_val])
y_other = pd.concat([y_train, y_val])

In [8]:
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.linear_model import ElasticNet, LinearRegression, BayesianRidge
# from sklearn.neighbors import KNeighborsRegressor
from xgboost.sklearn import XGBRegressor


# (name, estimator) pairs to evaluate. The name is passed to xgb_RMSE, which uses
# it to pick the matching keys out of total_param_grid. Commented-out entries are
# alternatives that are currently disabled.
models = [
	("xgbregressor", XGBRegressor()),
	# ("elasticnet", ElasticNet()),
	# ("bayesianridge", BayesianRidge()),
	# ("kneighborsregressor", KNeighborsRegressor()),
	# ("randomforestregressor", RandomForestRegressor()),
	# ("adaboostregressor", AdaBoostRegressor())
]

# Search space for all models, keyed as "<pipeline step name>__<parameter>".
# NOTE(review): the enabled entries span 3**5 = 243 combinations per fold, but the
# recorded output below reports 27 candidates, so that output appears to come from
# a smaller grid -- re-run to refresh it.
# NOTE(review): the recorded runs selected learning_rate=1e-07 and show a prediction
# standard deviation of ~0 (near-constant predictions) -- confirm such small rates
# are intended.
total_param_grid = {
	"xgbregressor__n_estimators": [100, 500, 1000],
	'xgbregressor__learning_rate': [1e-3, 1e-5, 1e-7],
	"xgbregressor__reg_lambda": [0.1, 1.0, 10.0],
	"xgbregressor__subsample": [0.5, 0.7, None],
	"xgbregressor__max_depth": [3, 5, 7],

	# "randomforestregressor__n_estimators": [100, 250, 500],
	# "randomforestregressor__max_features": ["log2"],
	# "randomforestregressor__criterion": ["poisson", "friedman_mse"],
	# "randomforestregressor__max_samples": [0.5, 0.7],

	# "elasticnet__alpha": [0.001, .01, .1, 1.0, 10.0],
	# "elasticnet__l1_ratio": [.01, .1, .7, .9],

	# "bayesianridge__alpha_1": [1e-10, 1e-8, 1e-5, 1e-2],
	# "bayesianridge__alpha_2": [1e-10, 1e-8, 1e-5, 1e-2],
	# "bayesianridge__lambda_1": [1e-10, 1e-8, 1e-5, 1e-2],
	# "bayesianridge__lambda_2": [1e-10, 1e-8, 1e-5, 1e-2],

	# "kneighborsregressor__n_neighbors": [50, 100, 150, 200, 300, 500, 700, 1000],
	# "kneighborsregressor__weights": ["uniform", "distance"],
}

# Installs a process-wide "ignore" filter for every warning category; it stays in
# effect for later cells too, not just for the search below.
warnings.simplefilter("ignore")

# model name -> 5-tuple returned by xgb_RMSE
all_results = {}

# Run the date-based search for each model with the moving training window enabled
for model_name, model_algo in models:
	print(model_name)
	grid_result = xgb_RMSE(model_name, model_algo, preprocessor, X_other, y_other, X_test, y_test, total_param_grid, moving_train_window=True)
	all_results[model_name] = grid_result

xgbregressor
Fitting 1 folds for each of 27 candidates, totalling 27 fits
Best params for val date: 2023-10-30
{'xgbregressor__learning_rate': 1e-07, 'xgbregressor__max_depth': 5, 'xgbregressor__n_estimators': 100, 'xgbregressor__reg_lambda': 1.0, 'xgbregressor__subsample': None}
RMSE for best estimator:
Train: 0.140791
Val: 0.074975
Test: 0.065115
R2 for best estimator:
Train: 0.000009
Val: -0.093313
Test: -0.001664
Stddev for best estimator:
Train: 0.000001
Val: 0.000000
Test: 0.000000
Fitting 1 folds for each of 27 candidates, totalling 27 fits
Best params for val date: 2023-10-31
{'xgbregressor__learning_rate': 1e-07, 'xgbregressor__max_depth': 5, 'xgbregressor__n_estimators': 100, 'xgbregressor__reg_lambda': 1.0, 'xgbregressor__subsample': 0.5}
RMSE for best estimator:
Train: 0.140851
Val: 0.067793
Test: 0.065227
R2 for best estimator:
Train: 0.000004
Val: -0.015419
Test: -0.005116
Stddev for best estimator:
Train: 0.000000
Val: 0.000000
Test: 0.000000
Fitting 1 folds for each of 

In [9]:
# Gather the serialisable part of each model's results; the fitted pipelines
# (best_models) are deliberately left out of the export.
jsonData = {}

for model_name, _ in models:
	rmse_scores, r2_scores, score_deviations, best_params, best_models = all_results[model_name]
	jsonData[model_name] = dict(
		rmse_scores = rmse_scores,
		r2_scores = r2_scores,
		pred_deviations = score_deviations,
		best_params = best_params,
	)

In [11]:
# Show the collected metrics as the cell output
jsonData

{'xgbregressor': {'rmse_scores': {'2023-10-30': [0.14079103788650682,
    0.07497538867205285,
    0.06511474749598006],
   '2023-10-31': [0.14085113700586413,
    0.06779320343915794,
    0.06522685021272046],
   '2023-11-01': [0.14071438118205803,
    0.07130837016382845,
    0.06515681552319047],
   '2023-11-02': [0.14115770657086965,
    0.07571476985418321,
    0.06539711716379852],
   '2023-11-03': [0.11492555175521657,
    0.06437493169358369,
    0.06740798779487335]},
  'r2_scores': {'2023-10-30': [8.948979964173986e-06,
    -0.09331304770325155,
    -0.001664465172172891],
   '2023-10-31': [4.4174338178493144e-06,
    -0.01541891026921216,
    -0.005116401078331112],
   '2023-11-01': [4.151373070748754e-06,
    -0.019233260606708713,
    -0.002959153747312371],
   '2023-11-02': [8.938307550998559e-06,
    -0.16760843494544275,
    -0.010370724179547208],
   '2023-11-03': [0.34389243522897517,
    -0.031562367304188044,
    -0.0734610029261551]},
  'pred_deviations': {'2023-10

In [3]:
import json

# Persist the cross-validation metrics. `default=float` is the fallback for values
# the json module cannot serialise itself, such as NumPy scalars (e.g. a float32
# standard deviation of model predictions); they are written as plain floats.
with open("xgb_cv_results_moving_window.json", "w") as cv_json:
	json.dump(jsonData, cv_json, default=float)