# [install mlflow locally]
# Start a local MLflow server
If MLflow runs inside a Docker container, the server port must be mapped to the host. The server must also be started with host 0.0.0.0 so that it accepts external (host) connections. Any available port can be used.

Run the following command in a terminal before using MLflow:

$ mlflow server --host 0.0.0.0  --port 8080

In [54]:
from datetime import datetime, timedelta
from pprint import pprint

import mlflow
import numpy as np
import pandas as pd
from mlflow import MlflowClient
from mlflow.models import infer_signature
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split


In [55]:
# Use a single tracking URI for both the fluent API (mlflow.*) and the explicit
# client so that every cell talks to the same tracking server.
# NOTE(review): 0.0.0.0 is the address the server binds to; if connecting to it
# fails on your platform, http://127.0.0.1:8080 is the usual alternative -- confirm.
tracking_uri = "http://0.0.0.0:8080"
mlflow.set_tracking_uri(uri=tracking_uri)

# `client` is used by the experiment search/create cells, so it has to be
# defined here rather than left commented out.
client = MlflowClient(tracking_uri=tracking_uri)

In [56]:
# Query the tracking server for experiments (no filter applied).
all_experiments = client.search_experiments()

# Plain print: shows the raw repr of every returned Experiment object.
print(all_experiments)


[<Experiment: artifact_location='mlflow-artifacts:/615508268430851982', creation_time=1740679369005, experiment_id='615508268430851982', last_update_time=1740679369005, lifecycle_stage='active', name='Apple_Models3', tags={'mlflow.note.content': 'This is the grocery forecasting project. This '
                        'experiment contains the produce models for apples.',
 'project_name': 'grocery-forecasting',
 'project_quarter': 'Q3-2023',
 'store_dept': 'produce',
 'team': 'stores-ml'}>, <Experiment: artifact_location='mlflow-artifacts:/580045222851405243', creation_time=1740678981377, experiment_id='580045222851405243', last_update_time=1740678981377, lifecycle_stage='active', name='MLflow Quickstart2', tags={}>, <Experiment: artifact_location='mlflow-artifacts:/191745933568060576', creation_time=1740675270407, experiment_id='191745933568060576', last_update_time=1740675270407, lifecycle_stage='active', name='MLflow Quickstart', tags={}>, <Experiment: artifact_location='mlflow-artifa

In [60]:
# Keep only the experiments named "Default" and summarise the first one.
# Indexing with [0] raises IndexError when no such experiment was returned.
default_matches = [exp for exp in all_experiments if exp.name == "Default"]
first_default = default_matches[0]
default_experiment = {
    "name": first_default.name,
    "lifecycle_stage": first_default.lifecycle_stage,
}

pprint(default_experiment)


{'lifecycle_stage': 'active', 'name': 'Default'}


In [62]:
# Provide an Experiment description that will appear in the UI
experiment_description = (
    "This is a test of mlflow itself. "
    "This experiment contains the produce models for apples."
)

# Provide searchable tags that define characteristics of the Runs that
# will be in this Experiment
experiment_tags = {
    "project_name": "grocery-forecasting",
    "store_dept": "produce",
    "team": "stores-ml",
    "project_quarter": "Q2-2025",
    "mlflow.note.content": experiment_description,
}

# Create the Experiment only if it does not exist yet. create_experiment()
# fails when the name is already taken, which would make this cell impossible
# to re-run without inventing a new experiment name every time.
experiment_name = "Apple_Models4"
existing_experiment = client.get_experiment_by_name(experiment_name)
if existing_experiment is None:
    # create_experiment() returns the ID of the newly created experiment
    produce_apples_experiment = client.create_experiment(
        name=experiment_name, tags=experiment_tags
    )
else:
    # Reuse the existing ID so the variable holds the same kind of value
    # (an experiment ID) on every run of this cell
    produce_apples_experiment = existing_experiment.experiment_id


In [63]:
# Look experiments up by the value of their `project_name` tag
project_filter = "tags.`project_name` = 'grocery-forecasting'"
apples_experiment = client.search_experiments(filter_string=project_filter)

# Show the attributes of the first experiment that matched the filter
first_match = apples_experiment[0]
print(vars(first_match))


{'_experiment_id': '596675269500456292', '_name': 'Apple_Models4', '_artifact_location': 'mlflow-artifacts:/596675269500456292', '_lifecycle_stage': 'active', '_tags': {'project_name': 'grocery-forecasting', 'store_dept': 'produce', 'team': 'stores-ml', 'project_quarter': 'Q2-2025', 'mlflow.note.content': 'This is a test of mlflow itself. This experiment contains the produce models for apples.'}, '_creation_time': 1740686248100, '_last_update_time': 1740686248100}


In [64]:

def generate_apple_sales_data_with_promo_adjustment(
    base_demand: int = 1000, n_rows: int = 5000
) -> pd.DataFrame:
    """
    Generates a synthetic dataset for predicting apple sales demand with seasonality,
    inflation and promotions aligned to the months following peak harvest.

    This function creates a pandas DataFrame with one row per day, ending at the
    current time. The returned columns are: date, average_temperature, rainfall,
    weekend, holiday, price_per_kg, promo, demand and previous_days_demand. The
    target variable, 'demand', is generated from price, promo and weekend effects
    plus Gaussian noise, scaled by a yearly inflation multiplier.

    Note:
        NumPy's global random generator is re-seeded with 9999 on every call, so
        the random draws are reproducible. The calendar-derived columns (weekend,
        promo months, inflation year) depend on the day the function is run.

    Args:
        base_demand (int, optional): Base demand for apples. Defaults to 1000.
        n_rows (int, optional): Number of rows (days) of data to generate. Defaults to 5000.

    Returns:
        pd.DataFrame: DataFrame with features and target variable for apple sales prediction.

    Example:
        >>> df = generate_apple_sales_data_with_promo_adjustment(base_demand=1200, n_rows=6000)
        >>> df.head()
    """

    # Set seed for reproducibility (this re-seeds NumPy's global RNG)
    np.random.seed(9999)

    # Create the date range. The end timestamp is captured once so that
    # consecutive rows are exactly one day apart.
    end = datetime.now()
    dates = [end - timedelta(days=i) for i in range(n_rows)]
    dates.reverse()

    # Generate features
    df = pd.DataFrame(
        {
            "date": dates,
            "average_temperature": np.random.uniform(10, 35, n_rows),
            "rainfall": np.random.exponential(5, n_rows),
            "weekend": [(date.weekday() >= 5) * 1 for date in dates],
            "holiday": np.random.choice([0, 1], n_rows, p=[0.97, 0.03]),
            "price_per_kg": np.random.uniform(0.5, 3, n_rows),
            "month": [date.month for date in dates],
        }
    )

    # Introduce inflation over time (3% per year since the earliest year)
    df["inflation_multiplier"] = (
        1 + (df["date"].dt.year - df["date"].dt.year.min()) * 0.03
    )

    # Incorporate seasonality due to apple harvests
    # NOTE(review): the two sine terms are half a period apart (months 3 and 9),
    # so they cancel and harvest_effect is ~0 for every month. Left unchanged to
    # keep the generated data identical -- confirm the intended formula.
    df["harvest_effect"] = np.sin(2 * np.pi * (df["month"] - 3) / 12) + np.sin(
        2 * np.pi * (df["month"] - 9) / 12
    )

    # Modify the price_per_kg based on harvest effect
    df["price_per_kg"] = df["price_per_kg"] - df["harvest_effect"] * 0.5

    # Adjust promo periods to coincide with periods lagging peak harvest by 1 month
    peak_months = [4, 10]  # months following the peak availability
    df["promo"] = np.where(
        df["month"].isin(peak_months),
        1,
        np.random.choice([0, 1], n_rows, p=[0.85, 0.15]),
    )

    # Generate target variable based on features
    base_price_effect = -df["price_per_kg"] * 50
    seasonality_effect = df["harvest_effect"] * 50
    promo_effect = df["promo"] * 200

    # Random noise is added before the inflation multiplier scales the total
    df["demand"] = (
        base_demand
        + base_price_effect
        + seasonality_effect
        + promo_effect
        + df["weekend"] * 300
        + np.random.normal(0, 50, n_rows)
    ) * df["inflation_multiplier"]

    # Add previous day's demand. The first row has no predecessor, so it is
    # back-filled from the second row. The result is assigned back to the frame
    # (rather than using an in-place fillna on a column selection) so that df
    # itself is guaranteed to be updated.
    df["previous_days_demand"] = df["demand"].shift(1).bfill()

    # Drop temporary columns
    return df.drop(columns=["inflation_multiplier", "harvest_effect", "month"])


In [65]:
# Build a 1,000-day synthetic dataset and preview its last 20 rows
data = generate_apple_sales_data_with_promo_adjustment(n_rows=1_000, base_demand=1_000)

data.tail(20)


Unnamed: 0,date,average_temperature,rainfall,weekend,holiday,price_per_kg,promo,demand,previous_days_demand
980,2025-02-08 19:57:38.524951,34.130183,1.454065,1,0,1.449177,0,1326.30629,1029.418398
981,2025-02-09 19:57:38.524947,32.353643,9.462859,1,0,2.856503,0,1169.129427,1326.30629
982,2025-02-10 19:57:38.524944,18.816833,0.39147,0,0,1.326429,0,990.616709,1169.129427
983,2025-02-11 19:57:38.524940,34.533012,2.120477,0,0,0.970131,0,1068.802075,990.616709
984,2025-02-12 19:57:38.524937,23.057202,2.365705,0,0,1.049931,0,1019.486305,1068.802075
985,2025-02-13 19:57:38.524933,34.810165,3.089005,0,0,2.035149,0,1002.564672,1019.486305
986,2025-02-14 19:57:38.524930,29.208905,3.673292,0,0,2.518098,0,1086.143402,1002.564672
987,2025-02-15 19:57:38.524927,16.428676,4.077782,1,0,1.268979,0,1420.207186,1086.143402
988,2025-02-16 19:57:38.524923,32.067512,2.734454,1,0,0.762317,0,1396.939894,1420.207186
989,2025-02-17 19:57:38.524920,31.938203,13.883486,0,0,1.153301,0,994.40954,1396.939894


In [70]:
# Sets the current active experiment to the "Apple_Models4" experiment and
# returns the Experiment metadata
apple_experiment = mlflow.set_experiment("Apple_Models4")

# Define a run name for this iteration of training.
# If this is not set, a unique name will be auto-generated for your run.
run_name = "apples_rf_testJWH3"

# Define an artifact path that the model will be saved to.
artifact_path = "rf_apples"


In [71]:

# Split the data into features and target and drop irrelevant date field and target field
X = data.drop(columns=["date", "demand"])
y = data["demand"]

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameters for the forest; also logged to MLflow below
params = {
    "n_estimators": 8,
    "max_depth": 4,
    "min_samples_split": 10,
    "min_samples_leaf": 4,
    "bootstrap": True,
    "oob_score": False,
    "random_state": 888,
}

# Train the RandomForestRegressor
rf = RandomForestRegressor(**params)

# Fit the model on the training data
rf.fit(X_train, y_train)

# Predict on the validation set
y_pred = rf.predict(X_val)

# Calculate error metrics
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred)

# Assemble the metrics we're going to write into a collection
metrics = {"mae": mae, "mse": mse, "rmse": rmse, "r2": r2}

# Initiate the MLflow run context; `run` is kept so later cells can refer
# back to this run
with mlflow.start_run(run_name=run_name) as run:
    # Log the parameters used for the model fit
    mlflow.log_params(params)

    # Log the error metrics that were calculated during validation
    mlflow.log_metrics(metrics)

    # Log an instance of the trained model for later use. Only a few rows are
    # passed as the input example: it is stored with the model as a sample of
    # valid input, so there is no reason to serialize the whole validation split.
    mlflow.sklearn.log_model(
        sk_model=rf, input_example=X_val.head(5), artifact_path=artifact_path
    )
   





🏃 View run apples_rf_testJWH3 at: http://0.0.0.0:8080/#/experiments/596675269500456292/runs/739df777708a493d95b8858831910819
🧪 View experiment at: http://0.0.0.0:8080/#/experiments/596675269500456292


In [75]:
# Infer the model signature (input schema and output schema) from the training
# features and the model's predictions on them
signature = infer_signature(X_train, rf.predict(X_train))

# Re-open the run that already holds this model's params and metrics. Calling
# log_model() with no active run would implicitly start a new run and leave it
# open; resuming `run` keeps the registered version attached to the training run
# and the context manager closes it again.
with mlflow.start_run(run_id=run.info.run_id):
    model_info = mlflow.sklearn.log_model(
        sk_model=rf,
        artifact_path="rf_apples",
        signature=signature,
        # A handful of rows is enough as a sample of valid model input
        input_example=X_train.head(5),
        registered_model_name="applesDemo",
    )


Successfully registered model 'applesDemo'.
2025/02/27 20:06:47 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: applesDemo, version 1
Created version '1' of model 'applesDemo'.
