
## MLflow

MLflow is used to:

- Track experiments (e.g., different model versions, parameters, and metrics).
- Log parameters, metrics, and artifacts (like plots or model files).
- Compare models, reproduce runs, or deploy them later.

## Add MLflow to XGBoost Notebook

In [1]:
# Install MLflow 
!pip install mlflow

Collecting mlflow
  Using cached mlflow-3.1.0-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==3.1.0 (from mlflow)
  Using cached mlflow_skinny-3.1.0-py3-none-any.whl.metadata (30 kB)
Collecting Flask<4 (from mlflow)
  Downloading flask-3.1.1-py3-none-any.whl.metadata (3.0 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Using cached alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Using cached docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Using cached graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Using cached gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting pyarrow<21,>=4.0.0 (from mlflow)
  Downloading pyarrow-20.0.0-cp311-cp311-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting sqlalchemy<3,>=1.4.0 (from mlflow)
  Downloading sqlalchemy-2.0.41-cp311-cp311-macosx_11_0_arm64.whl.metadata (9.6 kB)
Collecting cachetools<7,>=5.0.0 (f

### Import libraries

In [4]:
import mlflow
import mlflow.xgboost
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [5]:
# Read the Guayas training data and convert the `date` column to datetimes
df_train = pd.read_csv('../Data/train_Guayas_final.csv')
df_train = df_train.assign(date=pd.to_datetime(df_train['date']))

# Target is `unit_sales`; every other column except `date` is used as a feature
y = df_train['unit_sales']
X = df_train.drop(['unit_sales', 'date'], axis=1)

# Hold out 20% of the rows for evaluation (seeded so the split is repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:

# Enable MLflow autologging for XGBoost before training
mlflow.xgboost.autolog()

with mlflow.start_run():

    # Define and train model
    model = xgb.XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    model.fit(X_train, y_train)

    # Predictions on the held-out split
    y_pred = model.predict(X_test)

    # Manually log metrics computed on the held-out split.
    # RMSE is taken as the square root of the MSE: the `squared=False` argument
    # of `mean_squared_error` was deprecated in scikit-learn 1.4 and removed in
    # 1.6, where passing it raises a TypeError. This form works on all versions.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    r2 = r2_score(y_test, y_pred)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)

    # Log model manually (optional; autologging may already have logged it)
    mlflow.xgboost.log_model(model, "model")


  self.get_booster().save_model(fname)


In [None]:
# View the MLflow UI locally.
#
# This opens a web UI at http://localhost:5000, where you can:
#   - View experiment runs
#   - Compare performance
#   - Download artifacts
#
# NOTE(review): the server appears to run in the foreground, so this cell
# presumably keeps running until the kernel is interrupted — confirm before
# relying on "Run All".

!mlflow ui

[2025-06-19 09:14:04 +0200] [71467] [INFO] Starting gunicorn 23.0.0
[2025-06-19 09:14:04 +0200] [71467] [INFO] Listening at: http://127.0.0.1:5000 (71467)
[2025-06-19 09:14:04 +0200] [71467] [INFO] Using worker: sync
[2025-06-19 09:14:04 +0200] [71468] [INFO] Booting worker with pid: 71468
[2025-06-19 09:14:04 +0200] [71469] [INFO] Booting worker with pid: 71469
[2025-06-19 09:14:04 +0200] [71470] [INFO] Booting worker with pid: 71470
[2025-06-19 09:14:04 +0200] [71471] [INFO] Booting worker with pid: 71471


### Save and Push to GitHub

In [None]:
# Push code from the local machine.
# The lines below are shell commands, kept commented out so that running the
# notebook does not execute them; run them in a terminal instead.

# Initialize Git if not done already
#git init
#git add .
#git commit -m "Initial commit with XGBoost + MLflow tracking"

# Link to GitHub repo (replace with your URL)
#git remote add origin https://github.com/YOUR_USERNAME/xgboost-mlflow-sales.git

# Push the code
#git branch -M main
#git push -u origin main

In [None]:
# (Optional) Add a .gitignore
# Create a .gitignore file containing the entries below so that local MLflow
# logs, Python bytecode caches and notebook checkpoints are not committed:
#mlruns/
#__pycache__/
#.ipynb_checkpoints/