In [6]:
import os
from getpass import getpass
from pathlib import Path

import mlflow
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression

from utils import *
pd.options.plotting.backend = "plotly"

In [7]:
# Connection settings for the MLflow tracking server and the GCP credentials,
# exported as environment variables for the rest of the notebook.
os.environ["MLFLOW_TRACKING_URI"] = "https://mlflow-gcp-2-r5wfb53nxq-uc.a.run.app/"
os.environ["MLFLOW_TRACKING_USERNAME"] = "mlflow"
os.environ["MLFLOW_EXPERIMENT_NAME"] = "my_first_experiment"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "/Users/tsfelg/repos/mlflow-for-gcp/secrets/credentials.json"

# The password is a secret and must not live in the notebook source or its
# saved output. Reuse a value that is already exported; otherwise prompt for it
# (getpass does not echo the input).
# NOTE(review): the password that used to be committed here should be rotated.
if not os.environ.get("MLFLOW_TRACKING_PASSWORD"):
    os.environ["MLFLOW_TRACKING_PASSWORD"] = getpass("MLflow tracking password: ")

env: MLFLOW_TRACKING_URI=https://mlflow-gcp-2-r5wfb53nxq-uc.a.run.app/
env: MLFLOW_TRACKING_USERNAME=mlflow
env: MLFLOW_TRACKING_PASSWORD=<redacted>
env: MLFLOW_EXPERIMENT_NAME=my_first_experiment
env: GOOGLE_APPLICATION_CREDENTIALS=/Users/tsfelg/repos/mlflow-for-gcp/secrets/credentials.json


# Data

In [9]:
# Directory containing the yearly Region_Mobility_Report CSV files.
# Set the MOBILITY_DATA_DIR environment variable to run the notebook on another
# machine; the default is the location the notebook was originally written for.
DATA_DIR = Path(
    os.environ.get("MOBILITY_DATA_DIR", "/Users/tsfelg/repos/mlops_zoomcamp_project/data")
)

df_1 = pd.read_csv(DATA_DIR / "2021_PT_Region_Mobility_Report.csv")
df_2 = pd.read_csv(DATA_DIR / "2022_PT_Region_Mobility_Report.csv")

# Stack the 2021 and 2022 reports row-wise into one frame.
df = pd.concat([df_1, df_2])

In [10]:
# Run the raw frame through the project helpers (clean_data, then
# feature_extraction) and discard every row that still contains a missing value.
df = (
    df
    .pipe(clean_data)
    .pipe(feature_extraction)
    .dropna()
)

In [11]:
# Chronological split on the index: everything from 2021 through January 2022
# is used for training, everything from February 2022 onwards for testing.
df_train, df_test = df.loc["2021":"2022-01"], df.loc["2022-2":]

# Column "y" is the target; all remaining columns are features.
X_train, y_train = df_train.drop(columns=["y"]), df_train["y"]
X_test, y_test = df_test.drop(columns=["y"]), df_test["y"]

# Training

In [12]:
# Name of the regressor that the training cells instantiate. The values those
# cells check for are "LinearRegression" and "LGBMRegressor"; swap the
# commented line below to switch models.
model_ctx = "LinearRegression"
#model_ctx = "LGBMRegressor"

# No Retrain

In [13]:
# Train once on the training window, predict the whole test window, and record
# the run (metric, params, model artifact) in MLflow.
mlflow.set_experiment('retail_forecasting_dev')

# Resolve the estimator before opening a run so that a typo in model_ctx fails
# immediately instead of silently reusing a `model` left over from an earlier
# cell (or raising a NameError inside the run).
if model_ctx == "LinearRegression":
    model = LinearRegression()
elif model_ctx == "LGBMRegressor":
    model = LGBMRegressor()
else:
    raise ValueError(f"Unsupported model_ctx: {model_ctx!r}")

# The context manager ends the run on exit, so no explicit mlflow.end_run()
# is needed afterwards.
with mlflow.start_run():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Keep the predictions next to the actuals for the evaluation cells.
    df.loc[X_test.index, "y_pred"] = y_pred

    # Mean absolute error over the test window.
    mae = np.abs(y_pred - df_test.y).mean()
    mlflow.log_metric("test_mae", mae)

    # Log the model name as well, matching the params of the monthly-retrain run
    # so the two runs can be compared side by side.
    params = {"Retrain": False, "Model": model_ctx}
    mlflow.log_params(params)

    mlflow.sklearn.log_model(model, artifact_path="model")

# Monthly Retrain

In [None]:
# Walk-forward evaluation: for each calendar month of the test window, refit the
# model on all data before that month and predict only that month. The
# predictions are collected in df["y_pred_retrain"].
mlflow.set_experiment('retail_forecasting_dev')  # loop-invariant, set once

test_start = "2022-2"
df_test_full = df.loc[test_start:]

# Target and prediction columns must never be used as features; the prediction
# columns may or may not exist yet, hence errors="ignore" below.
non_feature_cols = ["y", "y_pred", "y_pred_retrain"]

# Iterate over year-month periods rather than bare month numbers so the
# "previous month" is still correct if the test window crosses a year boundary.
for period in df_test_full.index.to_period("M").unique():
    df_train = df.loc["2021":str(period - 1)]
    df_test = df_test_full.loc[str(period):str(period)]
    X_train = df_train.drop(columns=non_feature_cols, errors="ignore")
    X_test = df_test.drop(columns=non_feature_cols, errors="ignore")

    y_train = df_train.y
    y_test = df_test.y

    if model_ctx == "LinearRegression":
        model = LinearRegression()
    elif model_ctx == "LGBMRegressor":
        model = LGBMRegressor()
    else:
        raise ValueError(f"Unsupported model_ctx: {model_ctx!r}")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    df.loc[X_test.index, "y_pred_retrain"] = y_pred

# The context manager ends the run on exit, so no explicit mlflow.end_run()
# is needed afterwards.
with mlflow.start_run():
    params = {"Retrain": True, "Model": model_ctx}
    mlflow.log_params(params)

    # Score the whole test window. Using df_test / X_test here would only cover
    # the last month, because those names still hold the final loop iteration.
    scored = df.loc[test_start:, ["y", "y_pred_retrain"]]
    mae = np.abs(scored["y"] - scored["y_pred_retrain"]).mean()
    mlflow.log_metric("test_mae", mae)

    # Only the model fitted in the last iteration is logged as the artifact.
    mlflow.sklearn.log_model(model, artifact_path="model")

# Evaluation

In [None]:
# Mean absolute error of the train-once predictions. Rows without a y_pred
# value are NaN and are skipped by mean().
print(df["y"].sub(df["y_pred"]).abs().mean())

In [None]:
# Mean absolute error of the monthly-retrain predictions. Rows without a
# y_pred_retrain value are NaN and are skipped by mean().
print(df["y"].sub(df["y_pred_retrain"]).abs().mean())

In [None]:
# Naive baseline: predict each row's y with the value 7 rows earlier.
# NOTE(review): this is "same weekday last week" only if the index is daily
# with no gaps — confirm, since rows were dropped by dropna() earlier.
df["baseline"] = df.y.shift(7)

# Score the baseline on the same test window as the models (February 2022
# onwards). Averaging over every row would mix in the training period and make
# the number incomparable with the two model MAEs.
baseline_test = df.loc["2022-2":]
print(np.abs(baseline_test.y - baseline_test.baseline).mean())

In [None]:
# Plot every column of df against its index, using the plotting backend
# configured in the imports cell ("plotly").
df.plot()

# Model Registry

In [3]:
from mlflow.tracking import MlflowClient

client = MlflowClient()

# search_runs expects a list of experiment ids. A bare string only works by
# accident for a single-character id, because it is iterated character by
# character.
runs = client.search_runs(experiment_ids=['5'])
if not runs:
    # Fail with a clear message instead of an IndexError on runs[0].
    raise RuntimeError("No runs found in MLflow experiment '5'")

# MLflow documents the default ordering as start_time descending, so the first
# entry should be the most recent run.
run = runs[0]
RUN_ID = run.info.run_id

In [4]:
# Address the model by run id and let MLflow resolve the artifact location,
# instead of hard-coding the bucket and experiment id in a gs:// path.
# "model" is the artifact_path used when the training cells logged the model.
logged_model = f"runs:/{RUN_ID}/model"
model = mlflow.pyfunc.load_model(logged_model)

In [5]:
# Display the repr of the loaded pyfunc model (artifact path, flavor, run id).
model

mlflow.pyfunc.loaded_model:
  artifact_path: model
  flavor: mlflow.sklearn
  run_id: e97ea62ced9f4da293ccb7f2156a0980