In [None]:
import pandas as pd
import numpy as np
import autogluon
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.tree import DecisionTreeClassifier



  from .autonotebook import tqdm as notebook_tqdm


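## 1. Create different datasets, varying in size and distribution

**TODO:** no cell in this section builds the datasets; later cells expect `big_df`, `medium_df`, `small_df`, `promo_heavy_df` and `storetype_c_df` to exist. Add the cells that create them here so the notebook survives *Restart Kernel → Run All*.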

## 2. Compute metadata info

In [None]:
def _shannon_entropy(column):
    """Return the Shannon entropy (natural log, i.e. in nats) of a column's value distribution."""
    probs = column.value_counts(normalize=True)
    # Drop zero-probability entries (e.g. unused categories of a categorical
    # column) so that 0 * log(0) does not turn the result into NaN.
    probs = probs[probs > 0]
    entropy = -(probs * np.log(probs)).sum()
    # Adding 0.0 turns the -0.0 produced by a single-valued column into 0.0.
    return float(entropy) + 0.0


def compute_metadata(df):
    """Summarise a dataset as a flat dict of meta-features.

    pandas Series objects have no ``.entropy()`` method, so the entropy
    features are computed explicitly from the normalised value counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``StoreType``, ``Promo``, ``SchoolHoliday``,
        ``Store`` and ``Month``.

    Returns
    -------
    dict
        Keys, in order: ``num_rows``, ``storetype_entropy``,
        ``promo_fraction``, ``schoolholiday_fraction``, ``unique_stores``,
        ``month_entropy``. Entropies are in nats.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.
    """
    return {
        "num_rows": len(df),
        "storetype_entropy": _shannon_entropy(df["StoreType"]),
        "promo_fraction": df["Promo"].mean(),
        "schoolholiday_fraction": df["SchoolHoliday"].mean(),
        "unique_stores": df["Store"].nunique(),
        "month_entropy": _shannon_entropy(df["Month"]),
    }

## 3. Build a dataset of datasets

In [None]:
# Registry of candidate datasets; each key doubles as the group name that the
# meta-classifier is trained to predict.
datasets = dict(
    big=big_df,
    medium=medium_df,
    small=small_df,
    promo_heavy=promo_heavy_df,
    storetype_c=storetype_c_df,
)

# One metadata record per dataset, tagged with the dataset's group name.
meta_rows = []
for name, df in datasets.items():
    meta_rows.append({**compute_metadata(df), "label": name})

# A "dataset of datasets": one row per entry in `datasets`.
meta_df = pd.DataFrame(meta_rows)

## 4. Train a decision tree to group datasets 

In [None]:
# Meta-features are the inputs; the dataset's group name is the target class.
X = meta_df.drop(columns=["label"])
y = meta_df["label"]

# Pin the seed: scikit-learn permutes the features at every split and breaks
# ties between equally good splits at random, so without `random_state` the
# fitted tree (and the group a new dataset is routed to) can change between
# runs.
tree = DecisionTreeClassifier(max_depth=3, random_state=42)
tree.fit(X, y)

## 5. Train an AutoGluon model for each group

In [None]:
# Fitted AutoGluon predictors, keyed by group name.
group_models = {}

for name, df in datasets.items():
    print(f"Training group model: {name}")
    # Each group writes its artifacts to its own directory under Models/.
    model_dir = f"Models/{name}/"
    trainer = TabularPredictor(label="Sales", path=model_dir)
    group_models[name] = trainer.fit(df, presets="best", time_limit=300)

## 6. Final Pipeline

In [2]:
def predict_for_dataset(new_df):
    """Route a dataset to its group model and return that model's predictions.

    The dataset is summarised with ``compute_metadata``, the fitted ``tree``
    picks a group from that summary, and the AutoGluon predictor stored under
    ``Models/<group>/`` is loaded and applied to ``new_df``.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Dataset to predict on; it must carry the columns that
        ``compute_metadata`` reads.

    Returns
    -------
    The output of ``TabularPredictor.predict(new_df)`` for the chosen group.
    """
    # Describe the whole dataset as a single-row frame of meta-features.
    meta_features = pd.DataFrame([compute_metadata(new_df)])

    # Ask the meta-classifier which group this dataset belongs to.
    group = tree.predict(meta_features)[0]
    print("Dataset assigned to group:", group)

    # Load that group's predictor from disk and run it on the dataset.
    model_dir = f"Models/{group}/"
    return TabularPredictor.load(model_dir).predict(new_df)

## 7. Usage

In [None]:
# Run the full pipeline (metadata -> group -> group model) on a new dataset.
# NOTE(review): `incoming_df` is not defined in the visible cells; confirm it
# is loaded before this cell runs.
preds = predict_for_dataset(incoming_df)