In [45]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import pandas as pd, warnings
# warnings.simplefilter(action='ignore', category=pd.errors.SettingWithCopyWarning)

# Directory holding the preprocessed CSV files; the path is resolved relative
# to the current working directory.
path_to_data_dir = "../../data/"

def load_local_preprocessed_data():
	"""Read the preprocessed feature splits and the target column from disk.

	Every CSV found under ``path_to_data_dir`` is indexed by its ``date`` and
	``ticker`` columns.

	Returns
	-------
	tuple
		``(X_train, X_val, X_test, target)`` where the first three items are
		the feature frames read from ``prep_X_train.csv``, ``prep_X_val.csv``
		and ``prep_X_test.csv``, and ``target`` is the ``change_adj_s&p``
		column of ``s&p_adjusted_target.csv`` as a Series.
	"""
	def read_indexed(file_name):
		# Load one CSV and move the (date, ticker) pair into the index.
		frame = pd.read_csv(path_to_data_dir + file_name)
		return frame.set_index(["date", "ticker"])

	feature_files = ("prep_X_train.csv", "prep_X_val.csv", "prep_X_test.csv")
	feature_frames = [read_indexed(file_name) for file_name in feature_files]
	target_frame = read_indexed("s&p_adjusted_target.csv")
	return (*feature_frames, target_frame["change_adj_s&p"])

In [46]:
# Load data
X_train, X_val, X_test, y_series = load_local_preprocessed_data()

# Sorted unique dates (index level 0) of every split, used for the summary print below.
train_dates, val_dates, test_dates = [
	split.index.get_level_values(0).unique().sort_values()
	for split in (X_train, X_val, X_test)
]
print(f"# of train dates ({len(train_dates)}), val dates ({len(val_dates)}), test dates ({len(test_dates)})")

# Select the targets by the full (date, ticker) index of each feature frame
# instead of by date only. Selecting by date keeps the target file's own row
# order, so features and targets would only line up row-for-row if both CSVs
# happened to list the tickers identically within every date.
y_train = y_series.loc[X_train.index]
y_val = y_series.loc[X_val.index]
y_test = y_series.loc[X_test.index]

# Fail loudly if any split is not aligned one-to-one with its targets
# (e.g. duplicated (date, ticker) rows in the target file).
for split_name, split_X, split_y in [("train", X_train, y_train), ("val", X_val, y_val), ("test", X_test, y_test)]:
	assert split_y.index.equals(split_X.index), f"{split_name}: targets are not aligned with features"

print(f"Train shapes: {X_train.shape, y_train.shape}")
print(f"Val shapes: {X_val.shape, y_val.shape}")
print(f"Test shapes: {X_test.shape, y_test.shape}")

# of train dates (17), val dates (5), test dates (7)
Train shapes: ((26571, 380), (26571,))
Val shapes: ((7778, 380), (7778,))
Test shapes: ((11263, 380), (11263,))


In [81]:
# Assemble pipeline
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import PredefinedSplit, GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import make_pipeline

def imputation_RMSE(model_name, model_algo, X_other, y_other, X_test, y_test, param_grid, moving_train_window = False):
	"""Tune and score one model over walk-forward, date-based validation splits.

	The unique values of index level 0 of ``X_other`` (used as dates) are split
	with ``TimeSeriesSplit(n_splits=5, test_size=1, gap=5)``, so every split
	validates on exactly one date. For each split the hyper-parameters are
	picked by a grid search scored (negative MSE) on that validation date; a
	fresh pipeline with the winning parameters is then fitted on the training
	dates only and evaluated on the training rows, the validation rows and the
	test set.

	Parameters
	----------
	model_name : str
		Only the ``param_grid`` entries whose key contains this string are
		searched.
	model_algo : estimator
		scikit-learn estimator. It is cloned for every split, so the object
		passed in is never fitted and every entry of the returned
		``best_models`` is an independent fitted pipeline.
	X_other, y_other : pandas objects
		Features and targets used for training/validation. Rows are selected
		with ``.loc`` on the date values, so both must be indexed by date on
		level 0.
	X_test, y_test : array-like
		Hold-out features and targets scored after every split.
	param_grid : dict
		Grid in ``GridSearchCV`` format (``"<step>__<param>": [values]``); may
		contain entries for other models.
	moving_train_window : bool, default False
		If True, only the 10 most recent training dates of each split are used.

	Returns
	-------
	tuple of dict
		``(rmse_scores, r2_scores, score_deviations, best_params, best_models)``,
		each keyed by the validation date of the split. The first three map to
		``[train, val, test]`` lists; ``score_deviations`` holds the standard
		deviation of the predictions.
	"""
	# Imported locally because `clone` is not among the notebook's imports.
	from sklearn.base import clone

	# Per-split results, keyed by validation date.
	rmse_scores = {}
	r2_scores = {}
	score_deviations = {}
	best_params = {}
	best_models = {}

	time_split = TimeSeriesSplit(n_splits=5, test_size = 1, gap = 5)

	all_dates = X_other.index.get_level_values(0).unique().sort_values()

	# Keep only the grid entries that belong to this model.
	param_grid = {name: values for name, values in param_grid.items() if model_name in name}

	# The splitter only yields positions, which are mapped back onto the sorted dates.
	for train_date_idx, val_date_idx in time_split.split(all_dates):

		if moving_train_window:
			# Moving window: keep only the 10 most recent training dates.
			train_date_idx = train_date_idx[-10:]

		train_dates = all_dates[train_date_idx]
		val_date = all_dates[val_date_idx]

		train_set_X = X_other.loc[train_dates]
		train_set_y = y_other.loc[train_dates]

		val_set_X = X_other.loc[val_date]
		val_set_y = y_other.loc[val_date]

		other_set_X = pd.concat([train_set_X, val_set_X], axis = 0).reset_index(drop = True)
		other_set_y = pd.concat([train_set_y, val_set_y], axis = 0).reset_index(drop = True)

		# Prevent automatic CV with GridSearchCV: -1 keeps a row in the training
		# part, 1 assigns it to the single validation fold.
		ps = PredefinedSplit(([-1] * train_set_X.shape[0]) + ([1] * val_set_X.shape[0]))

		# Clone so each split owns its estimator; wrapping `model_algo` itself
		# would make every stored pipeline share one estimator that ends up
		# fitted on the last split only.
		pipe = make_pipeline(clone(model_algo))

		grid = GridSearchCV(
			pipe, cv = ps, param_grid = param_grid, scoring = "neg_mean_squared_error",
			return_train_score = True, verbose = True, refit=False)

		grid.fit(other_set_X, other_set_y)

		print(f"Best params for val date: {val_date[0]}")
		print(grid.best_params_)

		# refit=False above: refit manually on the training rows only, so the
		# validation date stays unseen by the evaluated model.
		pipe.set_params(**grid.best_params_)
		pipe.fit(train_set_X, train_set_y)

		truths = [train_set_y, val_set_y, y_test]
		preds = [pipe.predict(features) for features in (train_set_X, val_set_X, X_test)]

		# RMSE is taken as sqrt(MSE): the `squared=False` argument of
		# mean_squared_error is deprecated and removed in newer scikit-learn.
		train_rmse, val_rmse, test_rmse = [
			float(np.sqrt(mean_squared_error(y_true, y_pred)))
			for y_true, y_pred in zip(truths, preds)
		]

		train_r2, val_r2, test_r2 = [
			r2_score(y_true, y_pred)
			for y_true, y_pred in zip(truths, preds)
		]

		# Spread of the predictions themselves (not of the errors).
		train_stddev, val_stddev, test_stddev = [y_pred.std() for y_pred in preds]

		print(f"RMSE for best estimator:")
		print(f"Train: {train_rmse:.6f}")
		print(f"Val: {val_rmse:.6f}")
		print(f"Test: {test_rmse:.6f}")
		print(f"R2 for best estimator:")
		print(f"Train: {train_r2:.6f}")
		print(f"Val: {val_r2:.6f}")
		print(f"Test: {test_r2:.6f}")
		print(f"Stddev for best estimator:")
		print(f"Train: {train_stddev:.6f}")
		print(f"Val: {val_stddev:.6f}")
		print(f"Test: {test_stddev:.6f}")

		split_key = val_date[0]
		rmse_scores[split_key] = [train_rmse, val_rmse, test_rmse]
		r2_scores[split_key] = [train_r2, val_r2, test_r2]
		score_deviations[split_key] = [train_stddev, val_stddev, test_stddev]
		best_params[split_key] = grid.best_params_
		best_models[split_key] = pipe

	return rmse_scores, r2_scores, score_deviations, best_params, best_models

In [None]:
# Stack the train rows on top of the validation rows (row-wise concatenation)
# to form the pool that is later re-split by date.
X_other = pd.concat((X_train, X_val))
y_other = pd.concat((y_train, y_val))

In [86]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNet, LinearRegression, BayesianRidge
from sklearn.neighbors import KNeighborsRegressor


# Fixed seed so the stochastic estimator (the random forest) produces the same
# scores on every run; the results are written to disk later on.
RANDOM_STATE = 42

# (name, estimator) pairs; each name equals the step name make_pipeline derives
# from the estimator class, so it also prefixes that model's grid keys below.
models = [
	("linearregression", LinearRegression()),
	("elasticnet", ElasticNet()),
	("bayesianridge", BayesianRidge()),
	("kneighborsregressor", KNeighborsRegressor()),
	("randomforestregressor", RandomForestRegressor(random_state=RANDOM_STATE)),
	# ("adaboostregressor", AdaBoostRegressor())
]

# One shared grid for all models; imputation_RMSE keeps only the entries whose
# key contains the model name. LinearRegression has no entries, so it is
# evaluated with its defaults.
total_param_grid = {

	"randomforestregressor__n_estimators": [100, 250, 500],
	"randomforestregressor__max_features": ["log2"],
	# NOTE(review): scikit-learn's "poisson" criterion requires non-negative
	# targets. Presumably the S&P-adjusted change can be negative, in which case
	# those candidates fail to fit -- confirm against the target and consider
	# replacing "poisson" (e.g. with "squared_error").
	"randomforestregressor__criterion": ["poisson", "friedman_mse"],
	"randomforestregressor__max_samples": [0.5, 0.7],

	"elasticnet__alpha": [0.001, .01, .1, 1.0, 10.0],
	"elasticnet__l1_ratio": [.01, .1, .7, .9],

	"bayesianridge__alpha_1": [1e-10, 1e-8, 1e-5, 1e-2],
	"bayesianridge__alpha_2": [1e-10, 1e-8, 1e-5, 1e-2],
	"bayesianridge__lambda_1": [1e-10, 1e-8, 1e-5, 1e-2],
	"bayesianridge__lambda_2": [1e-10, 1e-8, 1e-5, 1e-2],

	"kneighborsregressor__n_neighbors": [50, 100, 150, 200, 300, 500, 700, 1000],
	"kneighborsregressor__weights": ["uniform", "distance"],
}

# Results per model name: the 5-tuple returned by imputation_RMSE.
all_results = {}

# Silence warnings only while the searches run, instead of switching them off
# for the rest of the kernel session.
with warnings.catch_warnings():
	warnings.simplefilter("ignore")

	for model_name, model_algo in models:
		print(model_name)
		all_results[model_name] = imputation_RMSE(
			model_name, model_algo, X_other, y_other, X_test, y_test,
			total_param_grid, moving_train_window=True)

linearregression
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Best params for val date: 2023-10-30
{}
RMSE for best estimator:
Train: 0.140035
Val: 298249212.638202
Test: 156218154.703998
R2 for best estimator:
Train: 0.010722
Val: -17300780297868990464.000000
Test: -5765361617927428096.000000
Stddev for best estimator:
Train: 0.014594
Val: 298152676.233181
Test: 156204284.058089
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Best params for val date: 2023-10-31
{}
RMSE for best estimator:
Train: 0.140142
Val: 227979692.963550
Test: 119489371.571099
R2 for best estimator:
Train: 0.010044
Val: -11483261732541892608.000000
Test: -3373044724223354880.000000
Stddev for best estimator:
Train: 0.014157
Val: 227905996.601270
Test: 119478762.081483
Fitting 1 folds for each of 1 candidates, totalling 1 fits
Best params for val date: 2023-11-01
{}
RMSE for best estimator:
Train: 0.140016
Val: 0.073182
Test: 23329550.685708
R2 for best estimator:
Train: 0.009909
Val: -0.

In [87]:
# Gather, per model, the parts of the results that are written to JSON later:
# scores, prediction deviations and best parameters. The fitted pipelines
# (best_models) are unpacked but not included.
jsonData = {}

for model_name, _ in models:
	rmse_scores, r2_scores, score_deviations, best_params, best_models = all_results[model_name]
	jsonData[model_name] = dict(
		rmse_scores = rmse_scores,
		r2_scores = r2_scores,
		pred_deviations = score_deviations,
		best_params = best_params,
	)

In [88]:
import json

# Serialise the collected results and write them to a JSON file in the
# current working directory.
with open("grid_search_cv_results_moving_window.json", "w") as cv_json:
	cv_json.write(json.dumps(jsonData))