# MLflow Start Guide

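* CLI: run `mlflow ui` before executing the cells below to start the MLflow tracking server and its UI; this notebook connects to it at `http://127.0.0.1:5000`.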

In [1]:
from mlflow import MlflowClient
from pprint import pprint
from sklearn.ensemble import RandomForestClassifier

# Address of the MLflow tracking server this notebook talks to.
tracking_server_uri = "http://127.0.0.1:5000"
client = MlflowClient(tracking_uri=tracking_server_uri)

In [2]:
# Run an unfiltered experiment search against the tracking server and keep the
# result; `all_experiments` is reused by the summary cell further down.
all_experiments = client.search_experiments()

# print() emits the raw repr of the returned experiments.
print(all_experiments)

[<Experiment: artifact_location='mlflow-artifacts:/319390409706919501', creation_time=1729085340486, experiment_id='319390409706919501', last_update_time=1729085340486, lifecycle_stage='active', name='Diabetes_Models', tags={}>, <Experiment: artifact_location='mlflow-artifacts:/597897158150876732', creation_time=1729077649624, experiment_id='597897158150876732', last_update_time=1729077649624, lifecycle_stage='active', name='Apple_Models', tags={'mlflow.note.content': 'This is the grocery forecasting project. This '
                        'experiment contains the produce models for apples.',
 'project_name': 'grocery-forecasting',
 'project_quarter': 'Q3-2023',
 'store_dept': 'produce',
 'team': 'stores-ml'}>, <Experiment: artifact_location='mlflow-artifacts:/0', creation_time=1729074293877, experiment_id='0', last_update_time=1729074293877, lifecycle_stage='active', name='Default', tags={}>]


In [6]:
# Reduce every experiment to a small dict holding only its name and lifecycle stage.
# To keep a single experiment, append a condition such as
# `if exp.name == "Default"` to the comprehension.
existing_exp = [
    dict(name=exp.name, lifecycle_stage=exp.lifecycle_stage)
    for exp in all_experiments
]

pprint(existing_exp)

[{'lifecycle_stage': 'active', 'name': 'Diabetes_Models'},
 {'lifecycle_stage': 'active', 'name': 'Apple_Models'},
 {'lifecycle_stage': 'active', 'name': 'Default'}]


### Creating Experiments

```python
    # Provide an Experiment description that will appear in the UI
    experiment_description = (
        "This is the grocery forecasting project. "
        "This experiment contains the produce models for apples."
    )

    # Provide searchable tags that define characteristics of the Runs that
    # will be in this Experiment
    experiment_tags = {
        "project_name": "grocery-forecasting",
        "store_dept": "produce",
        "team": "stores-ml",
        "project_quarter": "Q3-2023",
        "mlflow.note.content": experiment_description,
    }

    # Create the Experiment, providing a unique name
    produce_apples_experiment = client.create_experiment(
        name="Apple_Models", tags=experiment_tags
    )
```

### Searching Experiments

In [7]:
# Find experiments through their tags: match on the `project_name` tag key.
project_name_filter = "tags.`project_name` = 'grocery-forecasting'"
apples_experiment = client.search_experiments(filter_string=project_name_filter)

# Show the attributes of the first match.
vars(apples_experiment[0])

{'_experiment_id': '597897158150876732',
 '_name': 'Apple_Models',
 '_artifact_location': 'mlflow-artifacts:/597897158150876732',
 '_lifecycle_stage': 'active',
 '_tags': {'mlflow.note.content': 'This is the grocery forecasting project. This experiment contains the produce models for apples.',
  'project_name': 'grocery-forecasting',
  'project_quarter': 'Q3-2023',
  'store_dept': 'produce',
  'team': 'stores-ml'},
 '_creation_time': 1729077649624,
 '_last_update_time': 1729077649624}

In [25]:
# Find an experiment by its name (here the filter is on `name`, not on a tag).
experiment_name_filter = "name = 'Diabetes_Models'"
diabetes_experiment = client.search_experiments(filter_string=experiment_name_filter)

# Show the attributes of the first match.
vars(diabetes_experiment[0])

{'_experiment_id': '319390409706919501',
 '_name': 'Diabetes_Models',
 '_artifact_location': 'mlflow-artifacts:/319390409706919501',
 '_lifecycle_stage': 'active',
 '_tags': {},
 '_creation_time': 1729085340486,
 '_last_update_time': 1729085340486}

### Create a synthetic apple sales dataset

In [28]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta


def generate_apple_sales_data_with_promo_adjustment(
    base_demand: int = 1000,
    n_rows: int = 5000,
    end_date: "datetime | None" = None,
):
    """
    Generates a synthetic dataset for predicting apple sales demand with seasonality,
    inflation and harvest-aligned promotions.

    This function creates a pandas DataFrame with one row per day. The returned
    columns are date, average_temperature, rainfall, weekend, holiday, price_per_kg,
    promo, demand and previous_days_demand. The target variable, 'demand', is
    generated from the price, the harvest seasonality, the promo flag and the weekend
    flag, with Gaussian noise added and the total scaled by a yearly inflation
    multiplier.

    Args:
        base_demand (int, optional): Base demand for apples. Defaults to 1000.
        n_rows (int, optional): Number of rows (days) of data to generate. Defaults to 5000.
        end_date (datetime, optional): Timestamp of the last (most recent) row; earlier
            rows step back one day at a time. Defaults to None, which uses
            ``datetime.now()``. The weekend, promo, seasonality and inflation terms are
            all derived from the dates, so pass a fixed value to get identical output
            on every run.

    Returns:
        pd.DataFrame: DataFrame with features and target variable for apple sales prediction.

    Example:
        >>> df = generate_apple_sales_data_with_promo_adjustment(base_demand=1200, n_rows=6000)
        >>> df.head()
    """

    # Seed NumPy's global RNG so the random draws below are repeatable.
    # The order of the draws is part of the output, so keep it stable.
    np.random.seed(9999)

    # Resolve the anchor once so every row shares the same time of day.
    if end_date is None:
        end_date = datetime.now()

    # Create the date range, oldest day first
    dates = [end_date - timedelta(days=offset) for offset in range(n_rows - 1, -1, -1)]

    # Generate features
    df = pd.DataFrame(
        {
            "date": dates,
            "average_temperature": np.random.uniform(10, 35, n_rows),
            "rainfall": np.random.exponential(5, n_rows),
            "weekend": [(date.weekday() >= 5) * 1 for date in dates],
            "holiday": np.random.choice([0, 1], n_rows, p=[0.97, 0.03]),
            "price_per_kg": np.random.uniform(0.5, 3, n_rows),
            "month": [date.month for date in dates],
        }
    )

    # Introduce inflation over time: 3% per calendar year since the earliest year
    df["inflation_multiplier"] = (
        1 + (df["date"].dt.year - df["date"].dt.year.min()) * 0.03
    )

    # Incorporate seasonality due to apple harvests
    df["harvest_effect"] = np.sin(2 * np.pi * (df["month"] - 3) / 12) + np.sin(
        2 * np.pi * (df["month"] - 9) / 12
    )

    # Modify the price_per_kg based on harvest effect
    df["price_per_kg"] = df["price_per_kg"] - df["harvest_effect"] * 0.5

    # Adjust promo periods to coincide with periods lagging peak harvest by 1 month;
    # outside those months a promo is drawn at random with probability 0.15
    peak_months = [4, 10]  # months following the peak availability
    df["promo"] = np.where(
        df["month"].isin(peak_months),
        1,
        np.random.choice([0, 1], n_rows, p=[0.85, 0.15]),
    )

    # Generate target variable based on features
    base_price_effect = -df["price_per_kg"] * 50
    seasonality_effect = df["harvest_effect"] * 50
    promo_effect = df["promo"] * 200
    noise = np.random.normal(0, 50, n_rows)  # adding random noise

    df["demand"] = (
        base_demand
        + base_price_effect
        + seasonality_effect
        + promo_effect
        + df["weekend"] * 300
        + noise
    ) * df["inflation_multiplier"]

    # Add previous day's demand; the first row has no predecessor, so it is
    # back-filled with the next available value
    df["previous_days_demand"] = df["demand"].shift(1).bfill()

    # Drop temporary columns
    return df.drop(columns=["inflation_multiplier", "harvest_effect", "month"])

In [29]:
# Build the dataset used for training below: 1,000 daily rows around a base demand of 1,000.
data = generate_apple_sales_data_with_promo_adjustment(base_demand=1_000, n_rows=1_000)

### Logging runs with MLflow

In [30]:
import mlflow
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [31]:
# Point the fluent `mlflow` API at the tracking server.
tracking_server_uri = "http://127.0.0.1:5000"
mlflow.set_tracking_uri(tracking_server_uri)

In [32]:
# Name recorded for this training iteration.
# MLflow auto-generates a unique run name when none is supplied.
run_name = "apples_rf_test"

# Artifact path under which the model will be saved.
artifact_path = "rf_apples"

# Make "Apple_Models" the active experiment; the call returns the
# Experiment metadata.
apple_experiment = mlflow.set_experiment(experiment_name="Apple_Models")

In [33]:
# Target is the "demand" column; features are everything except the target
# and the date stamp.
y = data["demand"]
X = data.drop(columns=["date", "demand"])

# Hold out 20% of the rows for validation, with a fixed seed so the split is repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Hyperparameters for the random forest
params = dict(
    n_estimators=150,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    bootstrap=True,
    oob_score=False,
    random_state=42,
)

# Build the RandomForestRegressor and fit it on the training split
# (fit() hands back the fitted estimator)
rf = RandomForestRegressor(**params).fit(X_train, y_train)

# Score the held-out validation rows
y_pred = rf.predict(X_val)

# Error metrics on the validation predictions
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

# Collect the metrics that will be written to MLflow
metrics = dict(mae=mae, mse=mse, rmse=rmse, r2=r2)

In [36]:
# Manual (non-autolog) logging example. The code is wrapped in a string literal, so
# nothing below runs; as the cell's only expression, the string itself is echoed as
# the cell output.
# NOTE(review): as written, log_param(key='params', value=params) would store the
# whole dict under one parameter named "params"; mlflow.log_params(params) logs each
# entry separately. Confirm which is intended before re-enabling.
# NOTE(review): input_example = X_val passes the full validation frame as the input
# example; presumably a few rows would do. Confirm before re-enabling.
"""
with mlflow.start_run(run_name=run_name) as run:
    mlflow.log_param(key='params', value=params)
    mlflow.log_metrics(metrics)

    mlflow.sklearn.log_model(
        sk_model = rf,
        input_example = X_val,
        artifact_path=artifact_path
    )
"""


"\nwith mlflow.start_run(run_name=run_name) as run:\n    mlflow.log_param(key='params', value=params)\n    mlflow.log_metrics(metrics)\n\n    mlflow.sklearn.log_model(\n        sk_model = rf,\n        input_example = X_val,\n        artifact_path=artifact_path\n    )\n"

### Autologging

* See: [Autolog Example](autolog/diabetes.py)

In [37]:
# Uncomment the next line to run the autologging example script from this notebook:
#!python autolog/diabetes.py

### Comparing runs, choosing a model and deploying it to a REST API

* [Reference](https://mlflow.org/docs/latest/getting-started/quickstart-2/index.html)