In [1]:
import pandas as pd 
from dotenv import load_dotenv
import os

import mlflow

In [20]:
# Load environment variables from the .env file one directory above the notebook.
load_dotenv(dotenv_path="../.env")

# Directory containing the pickled dataset. os.getenv returns None for an unset
# variable, which would otherwise be interpolated into the data path as the
# literal text "None"; fail fast with a clear message instead.
DATAPATH = os.getenv("DATAPATH")
if not DATAPATH:
    raise RuntimeError(
        "Environment variable DATAPATH is not set; define it in ../.env or in the shell."
    )

# Tracking location handed to mlflow.set_tracking_uri further down.
MLFLOW_TRACKING_URI = os.getenv("MLFLOW_TRACKING_URI")

# Modelling settings
train_ratio = 0.8  # share of rows (in date order) assigned to the training split
n_lags_used = 3    # number of close_growth_lag_<i> columns used as numeric features

In [40]:
# Load the pickled dataset and order it chronologically, so the positional
# split below sends the earliest rows to train and the latest rows to test.
# NOTE(review): read_pickle can execute arbitrary code; only load trusted files.
data = pd.read_pickle(f"{DATAPATH}/BEL_20.pkl").sort_values("Date")

# Positional split: the first `train_ratio` share of rows is train, the rest is test.
train_size = int(train_ratio * len(data))
train_data, test_data = data.iloc[:train_size], data.iloc[train_size:]

# --- Training split ---
# Order by ticker then date and renumber the rows once, so that the target and
# the features derived from this frame line up position by position.
train_panel = train_data.sort_values(["ticker", "Date"]).reset_index(drop=True)
y_train = train_panel.filter(items=["close_growth"])
complete_train = y_train.notna().to_numpy().ravel()  # True where the target is present
y_train = y_train[complete_train]
X_train = train_panel.filter(regex="close_growth_lag|ticker")[complete_train]

# --- Test split (same recipe) ---
test_panel = test_data.sort_values(["ticker", "Date"]).reset_index(drop=True)
y_test = test_panel.filter(items=["close_growth"])
complete_test = y_test.notna().to_numpy().ravel()  # True where the target is present
y_test = y_test[complete_test]  # keep rows with an observed target
X_test = test_panel.filter(regex="close_growth_lag|ticker")[complete_test]  # matching feature rows

In [4]:
# Configure MLflow tracking: point the client at the configured tracking
# location and select the experiment that runs are recorded under
# (MLflow creates the experiment when it does not exist yet).
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Trailing semicolon: keep the returned Experiment object out of the cell
# output, because its repr embeds the absolute local artifact path.
mlflow.set_experiment("stock-prediction-BEL-20");

2023/08/02 14:41:34 INFO mlflow.tracking.fluent: Experiment with name 'stock-prediction-BEL-20' does not exist. Creating a new experiment.


<Experiment: artifact_location='/Users/timcosemans/Library/CloudStorage/OneDrive-Persoonlijk/Projects/PROJ-Algorithmic-Trading/mlruns/228838766029681492', creation_time=1690980094130, experiment_id='228838766029681492', last_update_time=1690980094130, lifecycle_stage='active', name='stock-prediction-BEL-20', tags={}>

In [43]:
# Rows with missing lag features are dropped inside the MLflow run cell below,
# which is where `X_train_reduced` and `y_train_reduced` are first created.
# The two scratch cells that used to sit here re-ran that step on leftover
# kernel state and raised NameError under Restart Kernel -> Run All, so their
# code was removed.

In [48]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error

with mlflow.start_run() as run:

    # --- Preprocessing definition ---
    # One-hot encode the ticker; handle_unknown="ignore" encodes tickers that
    # were not seen during fit as an all-zero row instead of raising.
    cat_features = ["ticker"]
    cat_transformer = Pipeline(steps=[("create_dummies", OneHotEncoder(handle_unknown="ignore"))])

    # The first `n_lags_used` lagged growth columns are passed through unchanged.
    num_features = [f"close_growth_lag_{i}" for i in range(1, n_lags_used + 1)]

    # sparse_threshold=0 makes the transformer always return a dense array, so
    # the DataFrame construction below does not depend on whether the stacked
    # output happened to be sparse.
    preprocessor = ColumnTransformer(
        transformers=[("cat", cat_transformer, cat_features),
                      ("num", "passthrough", num_features)],
        remainder="drop",
        sparse_threshold=0,
    )

    # --- Training features: transform, then delete rows with missing values ---
    X_train_all = pd.DataFrame(
        preprocessor.fit_transform(X_train),
        columns=preprocessor.get_feature_names_out(),
    )
    X_train_reduced = X_train_all.dropna()
    # X_train_all has a fresh 0..n-1 index, so the surviving labels are row
    # positions and can be used positionally on the equally long y_train.
    y_train_reduced = y_train.iloc[X_train_reduced.index, :]

    # --- Test features: same treatment, so no NaN reaches predict/metrics ---
    X_test_all = pd.DataFrame(
        preprocessor.transform(X_test),
        columns=X_train_all.columns,
    )
    X_test_reduced = X_test_all.dropna()
    y_test_reduced = y_test.iloc[X_test_reduced.index, :]

    # Log parameters
    mlflow.log_param("model", "linear_regression")
    mlflow.log_param("features", f"close growth ({n_lags_used} lags) + ticker dummy")
    mlflow.log_param("target", "close growth")
    mlflow.log_param("n", len(X_train_reduced))

    # Fit model
    model = LinearRegression()
    model.fit(X_train_reduced, y_train_reduced)

    # Log model
    # NOTE(review): only the regressor is logged; the fitted preprocessor is not
    # part of the artifact, so consumers must supply already-encoded features.
    mlflow.sklearn.log_model(model, "model")

    # Make predictions (named columns match the ones the model was fitted on)
    y_pred = model.predict(X_test_reduced)

    # Evaluate model on the test rows that have complete features
    mse = mean_squared_error(y_test_reduced, y_pred)
    r2 = r2_score(y_test_reduced, y_pred)
    # NOTE(review): MAPE divides by the true value; if close_growth is often
    # near zero this metric may be dominated by a few rows - confirm it is useful.
    mape = mean_absolute_percentage_error(y_test_reduced, y_pred)

    # Log metrics
    mlflow.log_metric("mse", mse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mape", mape)


