# Load necessary libraries

In [None]:
from mlforecast import MLForecast
import lightgbm as lgb
from window_ops.expanding import expanding_mean
from window_ops.rolling import rolling_mean
import pandas as pd

import mlflow
from mlflow.tracking import MlflowClient
from utils import get_smape

# Load data
The objective is to predict 3 months of item-level sales data at different store locations.

In [None]:
# Read the raw sales table from the working directory; the following cells
# rely on its `store`, `item`, `date` and `sales` columns.
df = pd.read_csv("data.csv")
# Preview the first rows as a quick sanity check of the load.
df.head()

# Preprocess

In [None]:
# One series key per (store, item) pair, formatted "store<store>_item<item>".
df['unique_id'] = 'store' + df['store'].astype(str) + '_item' + df['item'].astype(str)

# Rename to the ds / y column names, parse the dates and keep only the
# three columns used from here on.
df = (
    df.rename(columns={'date': 'ds', 'sales': 'y'})
    .assign(ds=lambda frame: pd.to_datetime(frame['ds']))
    .loc[:, ['ds', 'unique_id', 'y']]
)

df.head()

In [None]:
# Check for missing dates.
#
# An inner merge of df with the full calendar can never drop a row (every
# date in df lies inside the calendar built from df's own min/max), so
# comparing the lengths before and after the merge always reported True.
# Instead, compare the number of distinct dates of every series with the
# number of days in the calendar.
df_check = pd.DataFrame()
df_check['ds'] = pd.date_range(df.ds.min(), df.ds.max(), freq='D')

# distinct days observed per series
days_per_series = df.groupby('unique_id')['ds'].nunique()

# series that cover fewer days than the calendar have at least one gap
incomplete_series = days_per_series[days_per_series < len(df_check)]

print(f"There are no missing dates in df: {incomplete_series.empty}")

## Modelling

In [None]:
# Forecaster used only to preview the engineered features: a default
# LightGBM regressor on daily data with lag, window and calendar features.
fcst = MLForecast(
    models=lgb.LGBMRegressor(),
    freq='D',
    date_features=['dayofweek'],
    # lagged features shorten every series (the earliest rows have no lag history)
    lags=[1, 3],
    # NOTE(review): the transforms keyed on lag 0 are computed on the
    # un-shifted target, so they presumably include the value being
    # predicted (target leakage) -- confirm that this is intended.
    lag_transforms={
        0: [expanding_mean, (rolling_mean, 2, 0)],
        1: [expanding_mean],
        3: [(rolling_mean, 2, 0)]
    },
)

# Compute the feature table for inspection.
preprocess_df = fcst.preprocess(
    df,
    id_col='unique_id',
    time_col='ds',
    target_col='y',
)
preprocess_df.head()

## Model Tracking

In [None]:
# Track one MLflow run per `num_leaves` candidate: cross-validate, log the
# parameter, the sMAPE and the model.
#
# Original author's note: the folder with this code should be in your root
# directory.
EXPERIMENT_NAME = "mlflow_example"

# create_experiment raises when the name is already taken, which made this
# cell fail on every re-run. Reuse the existing experiment instead, and bring
# it back if it was soft-deleted (e.g. by the clean-up cell of this notebook).
existing_experiment = mlflow.get_experiment_by_name(EXPERIMENT_NAME)
if existing_experiment is None:
    EXPERIMENT_ID = mlflow.create_experiment(EXPERIMENT_NAME)
else:
    EXPERIMENT_ID = existing_experiment.experiment_id
    if existing_experiment.lifecycle_stage == "deleted":
        MlflowClient().restore_experiment(EXPERIMENT_ID)

for idx, param_num_leaves in enumerate([10, 100, 200]):

    # define the model
    # NOTE(review): the transforms keyed on lag 0 are computed on the
    # un-shifted target, so they presumably include the value being
    # predicted (target leakage) -- confirm that this is intended.
    model = MLForecast(
        models=lgb.LGBMRegressor(num_leaves=param_num_leaves),
        freq='D',
        lags=[1, 3],
        lag_transforms={
            0: [expanding_mean, (rolling_mean, 2, 0)],
            1: [expanding_mean],
            3: [(rolling_mean, 2, 0)]
        },
        date_features=['dayofweek'],
    )

    # perform cross-validation: 2 windows of 8 days each
    crossvalidation_df = model.cross_validation(
        n_windows=2,
        window_size=8,
        data=df,
        id_col='unique_id',
        time_col='ds',
        target_col='y',
    )

    # evaluate
    error = get_smape(crossvalidation_df, model='LGBMRegressor')

    # Cross-validation trains on truncated windows only. Refit on the full
    # history so that the logged model forecasts from the last observed date.
    model.fit(df, id_col='unique_id', time_col='ds', target_col='y')

    RUN_NAME = f"run_{idx}"
    with mlflow.start_run(experiment_id=EXPERIMENT_ID, run_name=RUN_NAME) as run:

        # track parameters
        mlflow.log_param("num_leaves", param_num_leaves)

        # track metrics
        mlflow.log_metric("smape_error", error)

        # track model (the whole MLForecast object, stored under "LGBMRegressor")
        mlflow.sklearn.log_model(model, "LGBMRegressor")

In [None]:
client = MlflowClient()

# Retrieve Experiment information.
# get_experiment_by_name returns None for an unknown name; fail with a clear
# message instead of an AttributeError on `.experiment_id`.
experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
if experiment is None:
    raise ValueError(
        f"MLflow experiment {EXPERIMENT_NAME!r} does not exist; "
        "run the model tracking cell first."
    )
EXPERIMENT_ID = experiment.experiment_id

# One row per run, lowest sMAPE first. `experiment_ids` is documented as a
# list of ids, so pass it as one.
df_experiment_info = mlflow.search_runs(
    experiment_ids=[EXPERIMENT_ID],
    order_by=["metrics.smape_error ASC"],
)

# view
df_experiment_info

In [None]:
# The runs table was requested in ascending smape_error order, so the row
# labelled 0 belongs to the run with the lowest error.
best_run_id = df_experiment_info.at[0, 'run_id']
best_run_id

In [None]:
# Load the model of the best run.
# MlflowClient.download_artifacts is deprecated in recent MLflow releases;
# a "runs:/<run_id>/<artifact_path>" URI lets load_model fetch the artifact
# itself. `best_model_path` therefore now holds that URI, not a local path.
best_model_path = f"runs:/{best_run_id}/LGBMRegressor"
best_model = mlflow.sklearn.load_model(best_model_path)

In [None]:
# view the best model (the notebook displays the loaded object's repr)
best_model

## Predict

In [None]:
# Forecast with the model loaded from the best run.
# NOTE(review): 20 is presumably the horizon in `freq` steps (20 days with
# freq='D'), while the stated objective is 3 months -- confirm the horizon.
best_model.predict(20)

## Clean-up (if necessary)

In [None]:
# Delete runs (DO NOT USE UNLESS CERTAIN)
# Removes every run listed in the experiment overview table.
for run_id in df_experiment_info['run_id'].tolist():
    client.delete_run(run_id)

In [None]:
# Delete experiment (DO NOT USE UNLESS CERTAIN)
# Removes the experiment whose id was resolved from EXPERIMENT_NAME.
client.delete_experiment(EXPERIMENT_ID)