In [1]:
import datetime as dt

import pandas as pd
from sklearn.metrics import mean_absolute_percentage_error, r2_score, mean_absolute_error
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler,
    StandardScaler
)
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.model_selection import GridSearchCV

from analytics.machine_learning.price_prediction_with_fundamentals import utils

# Lasso Hyperparameter Tuning

In [2]:
from sklearn.linear_model import Lasso

# Tune a Lasso regression (alpha, tol, max_iter) against a single fixed
# train/validation split, scored by negative mean absolute percentage error.

dataset = utils.get_dataset()

# Two successive date cutoffs carve out the test set first, then the
# validation set.
# NOTE(review): which side of each cutoff lands in which returned frame is
# decided inside utils.split_data_to_train_and_test — confirm there.
train_validation_set, test_set = utils.split_data_to_train_and_test(
    df=dataset,
    cutoff_date=dt.datetime(2023, 6, 1)
)

train_set, validation_set = utils.split_data_to_train_and_test(
    df=train_validation_set,
    cutoff_date=dt.datetime(2023, 4, 1)
)

cols_to_drop = ['symbol', 'fiscal_date_ending', 'avg_next_three_months_price']

# The y_* frames keep 'sector' next to the price column; only the price
# column is handed to the grid search below.
y_train = train_set[['avg_next_three_months_price', 'sector']]
X_train = train_set.drop(cols_to_drop, axis=1)

y_test = test_set[['avg_next_three_months_price', 'sector']]
X_test = test_set.drop(cols_to_drop, axis=1)

y_validation = validation_set[['avg_next_three_months_price', 'sector']]
X_validation = validation_set.drop(cols_to_drop, axis=1)

# One-hot encode 'sector' and standardise every other column. The boolean
# mask is built from X_train's columns, which X_validation shares because
# both frames drop the same columns from slices of the same dataset.
column_transformer = make_column_transformer(
    (
        OneHotEncoder(), ['sector']
    ),
    (
        StandardScaler(),
        ~X_train.columns.isin(['sector'])
    ),
    remainder='passthrough'
)

# make_pipeline names the final step 'lasso', which the grid keys rely on.
lasso_reg = make_pipeline(
    column_transformer,
    Lasso()
)

# Set up the parameter grid for grid search
param_grid = {
    'lasso__alpha': [0.1, 0.5, 1, 5, 10],
    'lasso__tol': [1e-4, 1e-3, 1e-5],
    'lasso__max_iter': [1000, 2000, 3000]
}

# Stack train and validation under a fresh 0..n-1 index so that index labels
# coincide with row positions. GridSearchCV interprets the (train, test)
# arrays in `cv` as positional indices into the X/y passed to fit(); the
# labels inherited from `dataset` are not guaranteed to match positions in
# the concatenated frame and could select the wrong rows or go out of bounds.
X_search = pd.concat([X_train, X_validation], ignore_index=True)
y_search = pd.concat(
    [
        y_train['avg_next_three_months_price'],
        y_validation['avg_next_three_months_price']
    ],
    ignore_index=True
)
n_train_rows = len(X_train)
single_split = [(
    X_search.index[:n_train_rows].to_numpy(),   # fit on the training rows
    X_search.index[n_train_rows:].to_numpy()    # score on the validation rows
)]

# Create GridSearchCV object with single split (validation set)
grid_search = GridSearchCV(
    lasso_reg,
    param_grid,
    scoring='neg_mean_absolute_percentage_error',
    cv=single_split
)

# Fit the grid search to the data
grid_search.fit(X_search, y_search)

# Get the best parameters and model. With GridSearchCV's default refit=True,
# best_estimator_ is refit on everything passed to fit (train + validation).
best_params = grid_search.best_params_
best_lasso_reg = grid_search.best_estimator_

print(best_params)
best_lasso_reg

{'lasso__alpha': 1, 'lasso__max_iter': 1000, 'lasso__tol': 0.001}


# XGB Hyperparameter Tuning

In [2]:
import xgboost as xgb

# Tune an XGBoost regressor (n_estimators, learning_rate, max_depth) against
# a single fixed train/validation split, scored by negative mean absolute
# percentage error.

dataset = utils.get_dataset()

# Two successive date cutoffs carve out the test set first, then the
# validation set.
# NOTE(review): which side of each cutoff lands in which returned frame is
# decided inside utils.split_data_to_train_and_test — confirm there.
train_validation_set, test_set = utils.split_data_to_train_and_test(
    df=dataset,
    cutoff_date=dt.datetime(2023, 6, 1)
)

train_set, validation_set = utils.split_data_to_train_and_test(
    df=train_validation_set,
    cutoff_date=dt.datetime(2023, 4, 1)
)

cols_to_drop = ['symbol', 'fiscal_date_ending', 'avg_next_three_months_price']

# The y_* frames keep 'sector' next to the price column; only the price
# column is handed to the grid search below.
y_train = train_set[['avg_next_three_months_price', 'sector']]
X_train = train_set.drop(cols_to_drop, axis=1)

y_test = test_set[['avg_next_three_months_price', 'sector']]
X_test = test_set.drop(cols_to_drop, axis=1)

y_validation = validation_set[['avg_next_three_months_price', 'sector']]
X_validation = validation_set.drop(cols_to_drop, axis=1)

# One-hot encode 'sector' and standardise every other column. The boolean
# mask is built from X_train's columns, which X_validation shares because
# both frames drop the same columns from slices of the same dataset.
column_transformer = make_column_transformer(
    (
        OneHotEncoder(), ['sector']
    ),
    (
        StandardScaler(),
        ~X_train.columns.isin(['sector'])
    ),
    remainder='passthrough'
)

# make_pipeline names the final step 'xgbregressor', which the grid keys
# rely on.
xgb_reg = make_pipeline(
    column_transformer,
    xgb.XGBRegressor(objective='reg:squarederror', booster='gbtree')
)

# Set up the parameter grid for grid search
param_grid = {
    'xgbregressor__n_estimators': [100, 500, 1000],
    'xgbregressor__learning_rate': [0.05, 0.1, 0.3],
    'xgbregressor__max_depth': [4, 6, 8],
}

# Stack train and validation under a fresh 0..n-1 index so that index labels
# coincide with row positions. GridSearchCV interprets the (train, test)
# arrays in `cv` as positional indices into the X/y passed to fit(); the
# labels inherited from `dataset` are not guaranteed to match positions in
# the concatenated frame and could select the wrong rows or go out of bounds.
X_search = pd.concat([X_train, X_validation], ignore_index=True)
y_search = pd.concat(
    [
        y_train['avg_next_three_months_price'],
        y_validation['avg_next_three_months_price']
    ],
    ignore_index=True
)
n_train_rows = len(X_train)
single_split = [(
    X_search.index[:n_train_rows].to_numpy(),   # fit on the training rows
    X_search.index[n_train_rows:].to_numpy()    # score on the validation rows
)]

# Create GridSearchCV object with single split (validation set)
grid_search = GridSearchCV(
    xgb_reg,
    param_grid,
    scoring='neg_mean_absolute_percentage_error',
    cv=single_split
)

# Fit the grid search to the data
grid_search.fit(X_search, y_search)

# Get the best parameters and model. With GridSearchCV's default refit=True,
# best_estimator_ is refit on everything passed to fit (train + validation).
best_params = grid_search.best_params_
best_xgb_reg = grid_search.best_estimator_

print(best_params)
best_xgb_reg

{'xgbregressor__learning_rate': 0.3, 'xgbregressor__max_depth': 8, 'xgbregressor__n_estimators': 1000}
