In [1]:
import pandas as pd 
from dotenv import load_dotenv
import os

import mlflow
from datetime import date


In [98]:
# get environment variables
load_dotenv(dotenv_path="../.env")
DATAPATH = os.getenv("DATAPATH")
MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI")

# Set other variables
train_ratio = 0.8
max_n_lags_used = 10
experiment_name = f"stock-prediction-BEL-20-{date.today()}"
model_name = f"best-model-{date.today()}"
preprocessor_name = f"preprocessor-{date.today()}"

In [54]:
data = pd.read_pickle(f"{DATAPATH}/BEL_20.pkl").sort_values("Date") # Load data

# Assuming the dataset is sorted by date, you can split by index
train_size = int(train_ratio * len(data))  
train_data = data.iloc[:train_size]
test_data = data.iloc[train_size:]

# Create features and target
y_train = train_data.sort_values(["ticker", "Date"]).filter(items=["close_growth"]).reset_index(drop=True)
complete_train = y_train.notna().to_numpy().flatten()
y_train = y_train[complete_train]
X_train = train_data.sort_values(["ticker", "Date"]).filter(regex="close_growth_lag|ticker").reset_index(drop=True)[complete_train]

y_test = test_data.sort_values(["ticker", "Date"]).filter(items=["close_growth"]).reset_index(drop=True)
complete_test = y_test.notna().to_numpy().flatten()
y_test = y_test[complete_test] # Remove NaNs from target 
X_test = test_data.sort_values(["ticker", "Date"]).filter(regex="close_growth_lag|ticker").reset_index(drop=True)[complete_test] # Remove corresponding NaNs from features

In [87]:
# Create scikit-learn pipeline and tracking with MLflow

mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(experiment_name)

2023/08/02 20:35:42 INFO mlflow.tracking.fluent: Experiment with name 'stock-prediction-BEL-20-2023-08-02' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/timcosemans/Library/CloudStorage/OneDrive-Persoonlijk/Projects/PROJ-Algorithmic-Trading/mlruns/461405004937865291', creation_time=1691001342514, experiment_id='461405004937865291', last_update_time=1691001342514, lifecycle_stage='active', name='stock-prediction-BEL-20-2023-08-02', tags={}>

In [88]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from scipy.sparse import issparse


def objective_function(params):
    n_lags_used = params["n_lags_used"]
    print(f"n_lags_used: {n_lags_used}")

    with mlflow.start_run() as run: 

        # Make dummies for categorical features
        cat_features = ["ticker"]
        cat_transformer = Pipeline(steps=[("create_dummies", OneHotEncoder(handle_unknown="ignore"))])

        num_features = [f"close_growth_lag_{i}" for i in range(1, n_lags_used + 1)]

        preprocessor = ColumnTransformer(transformers=[("cat", cat_transformer, cat_features), 
                                                    ('num', 'passthrough', num_features)], remainder="drop")

        
        X_train_reduced = preprocessor.fit_transform(X_train)

        # Convert to DataFrame (handle both dense and sparse data)
        if issparse(X_train_reduced):
            X_train_reduced = pd.DataFrame.sparse.from_spmatrix(X_train_reduced, columns=preprocessor.get_feature_names_out(input_features=X_train.columns))
        else:
            X_train_reduced = pd.DataFrame(X_train_reduced, columns=preprocessor.get_feature_names_out(input_features=X_train.columns))

        # Delete rows with missing values
        X_train_reduced = X_train_reduced.dropna()
        y_train_reduced = y_train.iloc[X_train_reduced.index, :]
    
        # Log parameters
        mlflow.log_param("model", "linear_regression")
        mlflow.log_param("features", f"close growth ({n_lags_used} lags) + ticker dummy")
        mlflow.log_param("target", "close growth")
        mlflow.log_param("n", len(X_train_reduced))
        mlflow.log_param("n_lags_used", n_lags_used)

        # Fit model
        model = LinearRegression()
        model.fit(X_train_reduced, y_train_reduced)

        # Log model and preprocessor
        mlflow.sklearn.log_model(preprocessor, "preprocessor")
        mlflow.sklearn.log_model(model, "model")

        # Make predictions
        X_test_reduced = preprocessor.transform(X_test)
        y_pred = model.predict(X_test_reduced)

        # Evaluate model
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        mape = mean_absolute_percentage_error(y_test, y_pred)

        # Log metrics
        mlflow.log_metric("mse", mse)
        mlflow.log_metric("r2", r2)
        mlflow.log_metric("mape", mape)
    
    return mape


In [89]:
from hyperopt import hp, fmin, tpe, Trials
import numpy as np

# Define the search space
space = {
    'n_lags_used': hp.choice('n_lags_used', np.arange(1, max_n_lags_used + 1, dtype=int)),
}

# Hyperparameter optimization and registration of experiments
num_evals = 5
trials = Trials()
fmin(fn=objective_function, space=space, algo=tpe.suggest, max_evals=num_evals, trials=trials, rstate=np.random.default_rng(42))

n_lags_used: 1                                       
  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]






n_lags_used: 9                                                                 
 20%|██        | 1/5 [00:07<00:28,  7.17s/trial, best loss: 498265285629.76294]







n_lags_used: 2                                                                 
 40%|████      | 2/5 [00:13<00:19,  6.46s/trial, best loss: 498265285629.76294]







n_lags_used: 7                                                                 
 60%|██████    | 3/5 [00:19<00:12,  6.46s/trial, best loss: 498265285629.76294]







n_lags_used: 4                                                                 
 80%|████████  | 4/5 [00:25<00:06,  6.28s/trial, best loss: 498265285629.76294]







100%|██████████| 5/5 [00:32<00:00,  6.42s/trial, best loss: 498265285629.76294]





{'n_lags_used': 0}

In [93]:
from mlflow.entities import ViewType

# Search for the best model in terms of MAPE
run = mlflow.search_runs(
    experiment_names=[experiment_name],
    filter_string="",
    run_view_type=ViewType.ACTIVE_ONLY,
    max_results=1,
    order_by=["metrics.mape ASC"],
)

In [97]:
# Get run ID and model/preprocessor URIs
run_id = run.run_id[0]
model_uri = f"runs:/{run_id}/model"
preprocessor_uri = f"runs:/{run_id}/preprocessor"

In [99]:
# Register the model and preprocessor
model_details = mlflow.register_model(model_uri=model_uri, name=model_name)
preprocessor_details = mlflow.register_model(model_uri=preprocessor_uri, name=preprocessor_name)

Successfully registered model 'best-model-2023-08-02'.
2023/08/02 20:49:34 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: best-model-2023-08-02, version 1
Created version '1' of model 'best-model-2023-08-02'.
Successfully registered model 'preprocessor-2023-08-02'.
2023/08/02 20:49:34 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: preprocessor-2023-08-02, version 1
Created version '1' of model 'preprocessor-2023-08-02'.


In [103]:
from mlflow.tracking import MlflowClient
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

# Transition to production stage
client.transition_model_version_stage(
    name=model_details.name,
    version=model_details.version,
    stage="Production", 
    archive_existing_versions=True
)

client.transition_model_version_stage(
    name=preprocessor_details.name,
    version=preprocessor_details.version,
    stage="Production", 
    archive_existing_versions=True
)

<ModelVersion: aliases=[], creation_timestamp=1691002174569, current_stage='Production', description=('Preprocessor for predicting the stock price change of a BEL 20 company for '
 'the next day.'), last_updated_timestamp=1691002581187, name='preprocessor-2023-08-02', run_id='f1743578ef5c4ff4bac40f5e4d7d1a14', run_link=None, source='/Users/timcosemans/Library/CloudStorage/OneDrive-Persoonlijk/Projects/PROJ-Algorithmic-Trading/mlruns/461405004937865291/f1743578ef5c4ff4bac40f5e4d7d1a14/artifacts/preprocessor', status='READY', status_message=None, tags={}, user_id=None, version=1>