## Desactivar Warnings

In [2]:
import warnings
warnings.filterwarnings('ignore')

## Generación de dataset sintético

In [3]:
from datetime import datetime, timedelta

import numpy as np
import pandas as pd


def generate_apple_sales_data_with_promo_adjustment(
    base_demand: int = 1000,
    n_rows: int = 5000,
    end_date: "datetime | None" = None,
):
    """
    Generates a synthetic daily dataset for predicting apple sales demand.

    The returned DataFrame has one row per day (oldest first) with the columns
    date, average_temperature, rainfall, weekend, holiday, price_per_kg, promo,
    demand and previous_days_demand.

    'demand' is computed as
    (base_demand - 50 * price_per_kg + 50 * harvest_effect + 200 * promo
    + 300 * weekend + N(0, 50) noise) * inflation_multiplier,
    where inflation_multiplier grows by 3% per calendar year since the first
    year in the data. average_temperature, rainfall and holiday are generated
    but do not enter the demand formula.

    The global NumPy RNG is re-seeded with 9999 on every call, so the random
    columns are identical across calls with the same n_rows. The calendar
    columns (date, weekend, promo months, inflation) depend on `end_date`.

    Args:
        base_demand (int, optional): Base demand for apples. Defaults to 1000.
        n_rows (int, optional): Number of rows (days) of data to generate. Defaults to 5000.
        end_date (datetime, optional): Timestamp of the last (most recent) row.
            Defaults to None, meaning the current time (`datetime.now()`).
            Pass a fixed value to get a fully reproducible dataset.

    Returns:
        pd.DataFrame: DataFrame with features and target variable for apple sales prediction.

    Example:
        >>> df = generate_apple_sales_data_with_promo_adjustment(base_demand=1200, n_rows=6000)
        >>> df.head()
    """

    # Set seed for reproducibility (note: this reseeds NumPy's *global* RNG)
    np.random.seed(9999)

    # Resolve the anchor timestamp once so every row shares the same
    # time-of-day and consecutive rows are exactly one day apart.
    if end_date is None:
        end_date = datetime.now()

    # Create date range, oldest day first and `end_date` last
    dates = [end_date - timedelta(days=offset) for offset in range(n_rows - 1, -1, -1)]

    # Generate features.
    # The order of the np.random calls below determines the generated values;
    # keep it unchanged to preserve the dataset.
    df = pd.DataFrame(
        {
            "date": dates,
            "average_temperature": np.random.uniform(10, 35, n_rows),
            "rainfall": np.random.exponential(5, n_rows),
            "weekend": [int(date.weekday() >= 5) for date in dates],
            "holiday": np.random.choice([0, 1], n_rows, p=[0.97, 0.03]),
            "price_per_kg": np.random.uniform(0.5, 3, n_rows),
            "month": [date.month for date in dates],
        }
    )

    # Introduce inflation over time (3% per calendar year since the first year)
    df["inflation_multiplier"] = 1 + (df["date"].dt.year - df["date"].dt.year.min()) * 0.03

    # Incorporate seasonality due to apple harvests.
    # NOTE(review): the two sine terms are exactly half a period apart
    # (month - 9 == (month - 3) - 6), so they cancel and harvest_effect is ~0
    # for every month. Left unchanged to keep the generated data stable —
    # confirm the intended formula before relying on seasonality.
    df["harvest_effect"] = np.sin(2 * np.pi * (df["month"] - 3) / 12) + np.sin(
        2 * np.pi * (df["month"] - 9) / 12
    )

    # Modify the price_per_kg based on harvest effect
    df["price_per_kg"] = df["price_per_kg"] - df["harvest_effect"] * 0.5

    # Adjust promo periods to coincide with periods lagging peak harvest by 1 month:
    # always on promo in April/October, otherwise on promo with 15% probability.
    peak_months = [4, 10]  # months following the peak availability
    df["promo"] = np.where(
        df["month"].isin(peak_months),
        1,
        np.random.choice([0, 1], n_rows, p=[0.85, 0.15]),
    )

    # Generate target variable based on features
    base_price_effect = -df["price_per_kg"] * 50
    seasonality_effect = df["harvest_effect"] * 50
    promo_effect = df["promo"] * 200

    df["demand"] = (
        base_demand
        + base_price_effect
        + seasonality_effect
        + promo_effect
        + df["weekend"] * 300
        + np.random.normal(0, 50, n_rows)  # adding random noise
    ) * df["inflation_multiplier"]

    # Add previous day's demand; the first row has no predecessor, so it is
    # back-filled with its own demand. Assigning the result back (instead of
    # an in-place fillna on the column) also works under pandas Copy-on-Write.
    df["previous_days_demand"] = df["demand"].shift(1).bfill()

    # Drop temporary columns
    df = df.drop(columns=["inflation_multiplier", "harvest_effect", "month"])

    return df

In [4]:
# Build the synthetic dataset with the generator's defaults
# (base_demand=1000, n_rows=5000).
data = generate_apple_sales_data_with_promo_adjustment()

In [5]:
# Preview the first rows of the generated dataset.
data.head()

Unnamed: 0,date,average_temperature,rainfall,weekend,holiday,price_per_kg,promo,demand,previous_days_demand
0,2011-12-07 20:30:35.690099,30.584727,1.199291,0,0,1.726258,0,851.276659,851.276659
1,2011-12-08 20:30:35.690097,15.465069,1.037626,0,0,0.576471,0,906.836626,851.276659
2,2011-12-09 20:30:35.690096,10.786525,5.656089,0,0,2.513328,0,857.895424,906.836626
3,2011-12-10 20:30:35.690095,23.648154,12.030937,1,0,1.839225,0,1148.961007,857.895424
4,2011-12-11 20:30:35.690093,13.861391,4.303812,1,0,1.531772,0,1283.128282,1148.961007


In [6]:
# Tracking server setup: point the MLflow client at the tracking server URI.
# NOTE(review): this only configures the client; presumably the server at
# 127.0.0.1:8080 has to be started separately before running this — confirm.
import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:8080")

### Términos comunes

* Run (ejecución): Un entrenamiento de un modelo
* Experimentos: Un conjunto de ejecuciones con las cuales haré comparaciones
* Artefactos: Todo archivo que se genere a partir de una ejecución

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Sets the current active experiment to the "Apple_Models" experiment and returns the Experiment metadata.
# The recorded cell output shows MLflow creating the experiment when it does not exist yet.
mlflow.set_experiment("Apple_Models")

# Define an artifact path that the model will be saved to
# (passed as `artifact_path` to `mlflow.sklearn.log_model` in the training cell).
artifact_path = "rf_apples"

2025/08/14 20:36:12 INFO mlflow.tracking.fluent: Experiment with name 'Apple_Models' does not exist. Creating a new experiment.


In [9]:
# Split the data into features and target: the raw `date` field is dropped as
# irrelevant, and `demand` is the target the model must predict.
X = data.drop(columns=["date", "demand"])
y = data["demand"]

# Split the data into training and validation sets (80/20, fixed seed)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameters for the forest; the same dict is logged to MLflow below so
# the run records exactly what the model was built with.
params = {
    "n_estimators": 100,
    "max_depth": 6,
    "min_samples_split": 10,
    "min_samples_leaf": 4,
    "bootstrap": True,
    "oob_score": False,
    "random_state": 888,
}

# Train the RandomForestRegressor
rf = RandomForestRegressor(**params)

# Fit the model on the training data
rf.fit(X_train, y_train)

# Predict on the validation set
y_pred = rf.predict(X_val)

# Calculate error metrics
mae = mean_absolute_error(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred)

# Assemble the metrics we're going to write into a collection
metrics = {"mae": mae, "mse": mse, "rmse": rmse, "r2": r2}

# Initiate the MLflow run context; everything logged inside the `with` block
# is attached to this run.
with mlflow.start_run() as run:

    mlflow.log_params(params=params)

    mlflow.log_metrics(metrics=metrics)

    # Log the fitted model. The input example only needs a handful of
    # representative rows, so a small slice is passed instead of the whole
    # training frame, which would be stored in full alongside the model.
    mlflow.sklearn.log_model(sk_model=rf,
                             input_example=X_train.head(5),
                             artifact_path=artifact_path)

Downloading artifacts: 100%|██████████| 7/7 [00:00<00:00, 655.23it/s]  


🏃 View run resilient-crane-145 at: http://127.0.0.1:8080/#/experiments/867092846214093260/runs/722c6361171d4792844e731625b8ab95
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/867092846214093260


## Ajuste de Hiperparámetros y Autologging

In [None]:
# Enable scikit-learn autologging with default settings.
# NOTE(review): this cell has no execution count, and the next code cell calls
# `mlflow.sklearn.autolog(max_tuning_runs=10)`; this call looks redundant — confirm.
mlflow.sklearn.autolog()

In [10]:
import mlflow
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Turn on scikit-learn autologging with hyperparameter-tuning support:
# child runs are recorded for at most the 10 best parameter combinations.
mlflow.sklearn.autolog(max_tuning_runs=10)

# Search space: 6 alpha values x 2 intercept settings = 12 candidates.
param_grid = dict(
    alpha=[0.001, 0.01, 0.1, 1, 10, 100],
    fit_intercept=[True, False],
)

with mlflow.start_run(run_name="Ridge Regression Tuning"):
    # Create and fit the 5-fold cross-validated grid search over Ridge.
    ridge = Ridge(random_state=42)
    grid_search = GridSearchCV(
        estimator=ridge,
        param_grid=param_grid,
        scoring="neg_mean_squared_error",
        cv=5,
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train, y_train)

    # Evaluate the selected model on the held-out validation split; the score
    # uses the search's own scoring metric (negative mean squared error).
    best_score = grid_search.score(X_val, y_val)

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Best cross-validation score: {grid_search.best_score_:.3f}")
    print(f"Test score: {best_score:.3f}")

Fitting 5 folds for each of 12 candidates, totalling 60 fits


2025/08/14 20:49:13 INFO mlflow.sklearn.utils: Logging the 10 best runs, 2 runs will be omitted.


🏃 View run capricious-robin-341 at: http://127.0.0.1:8080/#/experiments/867092846214093260/runs/d313b06c5a014cb0bc4cee7deb48e6ea
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/867092846214093260
🏃 View run bouncy-pug-22 at: http://127.0.0.1:8080/#/experiments/867092846214093260/runs/77016e4ac1a04590933408df7e38ebd9
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/867092846214093260
🏃 View run mercurial-cow-347 at: http://127.0.0.1:8080/#/experiments/867092846214093260/runs/dc7442ffd6f84c619cfe5a56da355d1e
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/867092846214093260
🏃 View run handsome-lamb-134 at: http://127.0.0.1:8080/#/experiments/867092846214093260/runs/88b985b7af424d478af7a450cd72cba7
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/867092846214093260
🏃 View run gaudy-moth-38 at: http://127.0.0.1:8080/#/experiments/867092846214093260/runs/009a9ec7598b46329755c79178e7d5e0
🧪 View experiment at: http://127.0.0.1:8080/#/experiments/867092846

## Registro de modelos y Despliegue
El **registro de modelos** es una base de datos que sirve para el control de \
versiones y el seguimiento de mis mejores modelos, candidatos a pasar a producción.

In [12]:
import requests
import json

# Prepare test data in the "dataframe_split" layout: column names plus a list
# of rows, in the same feature order the model was trained on.
apples_data = {
    "dataframe_split": {
        "columns": [
            'average_temperature',
            'rainfall',
            'weekend',
            'holiday',
            'price_per_kg',
            'promo',
            'previous_days_demand'],
        "data": [[30.584727, 1.199291, 1, 0, 1.726258, 0,  851.276659]],
    }
}

# Make prediction request against the locally served model.
# A timeout keeps the cell from hanging forever if the server is not up.
response = requests.post(
    "http://localhost:5002/invocations",
    headers={"Content-Type": "application/json"},
    data=json.dumps(apples_data),
    timeout=10,
)

# Fail loudly with the HTTP status (4xx/5xx) instead of an obscure
# KeyError/JSON error further down when the server rejects the request.
response.raise_for_status()

prediction = response.json()
print(f"Predicted apples demand: {prediction['predictions'][0]:.2f}")

Predicted apples demand: 1339.87


### DVC Pipelines

#### Ejemplo de Pipeline
- Stage 1: prepare.py
- Stage 2: featurization.py
- Stage 3: train.py
- Stage 4: evaluate.py

#### Flags:
* -n: Nombre de la etapa
* -p: Parámetros del script
* -d: Dependencias (de qué archivos depende mi script)
* -o: La salida del script