In [19]:
import numpy as np
import pandas as pd
from autogluon.tabular import TabularPredictor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
import os


In [None]:
df = pd.read_csv("meta_df.csv")


In [None]:
df_train = df.iloc[0:1110]
df_train

Unnamed: 0,num_rows,sales_mean,sales_std,sales_cv,sales_skew,sales_kurtosis,closed_fraction,weekend_ratio,customers_mean,customers_std,pca_var_first,pca_components_90,store_type,assortment,competition_mean,competition_std,label,leaf_id
0,743,4593.7140,2435.7651,0.530239,-0.506928,-0.041492,0.160162,0.829754,537.504711,265.109817,0.997754,1,0,0,540.0,0.0,store_44,0
1,741,5661.2210,2928.6194,0.517312,-0.786354,0.092722,0.172740,0.939018,514.367072,258.919846,0.000000,18,0,2,8090.0,0.0,store_346,1
2,754,5204.5977,2756.3560,0.529600,-0.568414,-0.033909,0.157825,0.800631,691.066313,344.848645,0.000000,18,0,2,670.0,0.0,store_331,2
3,761,5704.5786,2961.0244,0.519061,-0.701369,0.099789,0.168200,1.059587,474.249671,231.376864,0.997550,1,3,2,9230.0,0.0,store_572,1
4,760,10132.8430,6128.6270,0.604828,-0.221403,-0.675996,0.171053,0.553850,1120.869565,638.839728,0.996584,1,0,2,210.0,0.0,store_1014,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1105,743,6398.4560,3802.4944,0.594283,-0.231925,-0.632953,0.164199,0.543334,655.640646,365.431059,0.998150,1,0,0,19370.0,0.0,store_892,4
1106,749,5404.1533,3000.7166,0.555261,-0.424574,-0.241628,0.172230,1.237494,465.543391,239.973048,0.996873,1,3,2,970.0,0.0,store_785,1
1107,765,7827.6973,3453.6255,0.441206,-0.537519,0.061153,0.048366,0.977242,666.751634,264.603756,0.000000,18,0,2,2290.0,0.0,store_310,4
1108,772,6629.6490,3733.9258,0.563216,-0.706442,-0.315641,0.202073,1.164527,779.966321,419.274426,0.000000,18,0,0,180.0,0.0,store_699,6


In [51]:
TARGET = "sales_mean"
cart_features = [
    "num_rows",
    "customers_mean",
    "customers_std",
    "pca_var_first",
    "store_type",
    "assortment",
]

cart_df = df_train[cart_features + [TARGET]]


In [52]:
def variance(rows):
    return np.var(rows[:, -1])

def variance_reduction(left, right, current_var):
    n = len(left) + len(right)
    return current_var - (
        len(left)/n * variance(left) + len(right)/n * variance(right)
    )

In [53]:
class Question:
    def __init__(self, column, value, feature_names):
        self.column = column
        self.value = value
        self.feature_names = feature_names

    def match(self, row):
        return row[self.column] >= self.value

    def __repr__(self):
        return f"Is {self.feature_names[self.column]} >= {self.value}?"


In [54]:
def partition(rows, question):
    mask = rows[:, question.column] >= question.value
    return rows[mask], rows[~mask]


In [55]:
def find_best_split(rows, feature_names):
    best_gain = 0
    best_question = None
    current_var = variance(rows)
    n_features = rows.shape[1] - 1

    for col in range(n_features):
        values = np.unique(rows[:, col])
        for val in values:
            q = Question(col, val, feature_names)
            left, right = partition(rows, q)
            if len(left) == 0 or len(right) == 0:
                continue
            gain = variance_reduction(left, right, current_var)
            if gain > best_gain:
                best_gain, best_question = gain, q

    return best_gain, best_question


In [56]:
class Leaf:
    def __init__(self, rows):
        self.value = rows[:, -1].mean()
        self.n_samples = len(rows)

class DecisionNode:
    def __init__(self, question, left, right):
        self.question = question
        self.left = left
        self.right = right


In [57]:
class LeafCounter:
    def __init__(self, max_leaves):
        self.max_leaves = max_leaves
        self.count = 0


In [58]:
def build_tree(rows, feature_names, counter, min_samples_leaf=1000):
    gain, question = find_best_split(rows, feature_names)

    if gain == 0 or len(rows) <= min_samples_leaf or counter.count >= counter.max_leaves:
        counter.count += 1
        return Leaf(rows)

    left, right = partition(rows, question)

    if counter.count + 2 > counter.max_leaves:
        counter.count += 1
        return Leaf(rows)

    left_branch = build_tree(left, feature_names, counter, min_samples_leaf)
    right_branch = build_tree(right, feature_names, counter, min_samples_leaf)

    return DecisionNode(question, left_branch, right_branch)


In [59]:
cart_array = cart_df.to_numpy()
counter = LeafCounter(max_leaves=10)

cart_tree = build_tree(
    cart_array,
    cart_features,
    counter,
    min_samples_leaf=100  
)


In [60]:
def print_tree(node, spacing=""):
    if isinstance(node, Leaf):
        print(spacing + f"Leaf: value={node.value:.2f}, samples={node.n_samples}")
        return

    print(spacing + str(node.question))
    print(spacing + "--> True:")
    print_tree(node.left, spacing + "  ")
    print(spacing + "--> False:")
    print_tree(node.right, spacing + "  ")

print_tree(cart_tree)


Is customers_mean >= 823.8724226804123?
--> True:
  Is customers_mean >= 1357.2453333333333?
  --> True:
    Leaf: value=12654.71, samples=30
  --> False:
    Is customers_mean >= 1044.448700410397?
    --> True:
      Leaf: value=9180.52, samples=41
    --> False:
      Leaf: value=7545.39, samples=100
--> False:
  Is customers_mean >= 533.8319738988581?
  --> True:
    Is store_type >= 3.0?
    --> True:
      Is customers_mean >= 654.5797101449275?
      --> True:
        Leaf: value=7608.62, samples=34
      --> False:
        Leaf: value=6386.37, samples=86
    --> False:
      Is customers_mean >= 672.8653594771242?
      --> True:
        Is pca_var_first >= 0.9979759721210184?
        --> True:
          Leaf: value=7005.87, samples=19
        --> False:
          Is assortment >= 2.0?
          --> True:
            Leaf: value=6410.56, samples=51
          --> False:
            Leaf: value=6046.64, samples=69
      --> False:
        Is customers_mean >= 610.6013071895425?
 

In [61]:
leaf_id_map = {}

def get_leaf_id(node, row):
    if isinstance(node, Leaf):
        if node not in leaf_id_map:
            leaf_id_map[node] = len(leaf_id_map)  # 0,1,2,3,4
        return leaf_id_map[node]
    if node.question.match(row):
        return get_leaf_id(node.left, row)
    else:
        return get_leaf_id(node.right, row)

In [62]:
df_train["leaf_id"] = [
    get_leaf_id(cart_tree, row)
    for row in cart_array
]

print(df_train["leaf_id"].unique())

[ 0  1  2  3  4  5  6  7  8  9 10]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["leaf_id"] = [


In [63]:
df_train[df_train["leaf_id"] == 0]

Unnamed: 0,num_rows,sales_mean,sales_std,sales_cv,sales_skew,sales_kurtosis,closed_fraction,weekend_ratio,customers_mean,customers_std,pca_var_first,pca_components_90,store_type,assortment,competition_mean,competition_std,label,leaf_id
0,743,4593.7140,2435.7651,0.530239,-0.506928,-0.041492,0.160162,0.829754,537.504711,265.109817,0.997754,1,0,0,540.0,0.0,store_44,0
8,760,4617.4546,2419.8123,0.524058,-0.525530,0.149822,0.155263,1.028929,534.827632,254.053573,0.000000,18,0,2,1130.0,0.0,store_1096,0
11,747,4553.5690,2397.7578,0.526567,-0.670128,0.312879,0.171352,1.187217,580.425703,289.974471,0.996778,1,0,0,700.0,0.0,store_563,0
30,610,5257.4067,2655.3198,0.505063,-1.114039,0.021207,0.181967,1.314789,544.652459,267.965988,0.997654,1,0,0,1420.0,0.0,store_650,0
33,757,6346.3145,4069.1433,0.641182,0.111423,-0.501225,0.180978,0.683672,566.253633,315.540259,0.998026,1,0,0,4260.0,0.0,store_552,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1085,736,5590.7847,3073.3752,0.549722,-0.553525,-0.217831,0.177989,0.870962,560.369565,287.880293,0.000000,18,0,2,5460.0,0.0,store_934,0
1087,737,5786.1406,3580.9856,0.618890,-0.173092,-0.808705,0.170963,0.547431,594.278155,335.908267,0.997656,1,0,0,2020.0,0.0,store_156,0
1091,756,4812.8270,3019.6497,0.627417,-0.205477,-0.784308,0.179894,0.517286,579.829365,341.142807,0.998239,1,0,0,2720.0,0.0,store_836,0
1097,611,5271.2964,3210.9730,0.609143,-0.200020,-0.441517,0.183306,0.687835,547.859247,302.365331,0.997829,1,0,0,12610.0,0.0,store_181,0


In [67]:
df_train

Unnamed: 0,num_rows,sales_mean,sales_std,sales_cv,sales_skew,sales_kurtosis,closed_fraction,weekend_ratio,customers_mean,customers_std,pca_var_first,pca_components_90,store_type,assortment,competition_mean,competition_std,label,leaf_id
0,743,4593.7140,2435.7651,0.530239,-0.506928,-0.041492,0.160162,0.829754,537.504711,265.109817,0.997754,1,0,0,540.0,0.0,store_44,0
1,741,5661.2210,2928.6194,0.517312,-0.786354,0.092722,0.172740,0.939018,514.367072,258.919846,0.000000,18,0,2,8090.0,0.0,store_346,1
2,754,5204.5977,2756.3560,0.529600,-0.568414,-0.033909,0.157825,0.800631,691.066313,344.848645,0.000000,18,0,2,670.0,0.0,store_331,2
3,761,5704.5786,2961.0244,0.519061,-0.701369,0.099789,0.168200,1.059587,474.249671,231.376864,0.997550,1,3,2,9230.0,0.0,store_572,1
4,760,10132.8430,6128.6270,0.604828,-0.221403,-0.675996,0.171053,0.553850,1120.869565,638.839728,0.996584,1,0,2,210.0,0.0,store_1014,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1105,743,6398.4560,3802.4944,0.594283,-0.231925,-0.632953,0.164199,0.543334,655.640646,365.431059,0.998150,1,0,0,19370.0,0.0,store_892,4
1106,749,5404.1533,3000.7166,0.555261,-0.424574,-0.241628,0.172230,1.237494,465.543391,239.973048,0.996873,1,3,2,970.0,0.0,store_785,1
1107,765,7827.6973,3453.6255,0.441206,-0.537519,0.061153,0.048366,0.977242,666.751634,264.603756,0.000000,18,0,2,2290.0,0.0,store_310,4
1108,772,6629.6490,3733.9258,0.563216,-0.706442,-0.315641,0.202073,1.164527,779.966321,419.274426,0.000000,18,0,0,180.0,0.0,store_699,6


In [71]:
df_train["label"] = df_train["label"].str.replace("store_", "", regex=False).astype(int)
df_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train["label"] = df_train["label"].str.replace("store_", "", regex=False).astype(int)


Unnamed: 0,num_rows,sales_mean,sales_std,sales_cv,sales_skew,sales_kurtosis,closed_fraction,weekend_ratio,customers_mean,customers_std,pca_var_first,pca_components_90,store_type,assortment,competition_mean,competition_std,label,leaf_id
0,743,4593.7140,2435.7651,0.530239,-0.506928,-0.041492,0.160162,0.829754,537.504711,265.109817,0.997754,1,0,0,540.0,0.0,44,0
1,741,5661.2210,2928.6194,0.517312,-0.786354,0.092722,0.172740,0.939018,514.367072,258.919846,0.000000,18,0,2,8090.0,0.0,346,1
2,754,5204.5977,2756.3560,0.529600,-0.568414,-0.033909,0.157825,0.800631,691.066313,344.848645,0.000000,18,0,2,670.0,0.0,331,2
3,761,5704.5786,2961.0244,0.519061,-0.701369,0.099789,0.168200,1.059587,474.249671,231.376864,0.997550,1,3,2,9230.0,0.0,572,1
4,760,10132.8430,6128.6270,0.604828,-0.221403,-0.675996,0.171053,0.553850,1120.869565,638.839728,0.996584,1,0,2,210.0,0.0,1014,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1105,743,6398.4560,3802.4944,0.594283,-0.231925,-0.632953,0.164199,0.543334,655.640646,365.431059,0.998150,1,0,0,19370.0,0.0,892,4
1106,749,5404.1533,3000.7166,0.555261,-0.424574,-0.241628,0.172230,1.237494,465.543391,239.973048,0.996873,1,3,2,970.0,0.0,785,1
1107,765,7827.6973,3453.6255,0.441206,-0.537519,0.061153,0.048366,0.977242,666.751634,264.603756,0.000000,18,0,2,2290.0,0.0,310,4
1108,772,6629.6490,3733.9258,0.563216,-0.706442,-0.315641,0.202073,1.164527,779.966321,419.274426,0.000000,18,0,0,180.0,0.0,699,6


In [70]:
raw_data = pd.read_csv("main_train_df.csv")
raw_data

Unnamed: 0,Store,DayOfWeek,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,Year,Month,Day,WeekOfYear,Sales
0,44,2,763,1,1,0,0,a,a,540.0,6.0,2011.0,0,,,2014,5,6,19,7076
1,346,2,663,1,1,0,1,a,c,8090.0,,,0,,,2014,7,29,31,8129
2,331,7,0,0,0,0,0,a,c,670.0,,,1,14.0,2015.0,2014,9,28,39,0
3,572,7,0,0,0,0,0,d,c,9230.0,4.0,2004.0,1,37.0,2009.0,2013,11,17,46,0
4,1014,3,1234,1,1,0,1,a,c,210.0,,,1,31.0,2013.0,2015,7,15,29,12288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813759,932,6,535,1,0,0,0,a,a,15700.0,,,1,13.0,2010.0,2013,10,5,40,4994
813760,25,2,1909,1,0,0,0,c,a,430.0,4.0,2003.0,0,,,2013,5,7,19,13145
813761,135,2,703,1,1,0,1,d,a,5190.0,,,1,1.0,2013.0,2015,3,31,14,9776
813762,923,4,609,1,0,0,0,a,a,280.0,9.0,2008.0,0,,,2015,3,12,11,4790


In [None]:
# make a dictionary of all the raw data related to each store of each leaf
raw_data_leaves = {}

for leaf_id, leaf_df in df_train.groupby("leaf_id"):
    # Stores that belong to this leaf
    store_ids = leaf_df["label"].unique()

    # Select raw data for these stores
    raw_subset = raw_data[raw_data["Store"].isin(store_ids)]

    raw_data_leaves[leaf_id] = raw_subset


In [82]:
raw_data_leaves[1]

Unnamed: 0,Store,DayOfWeek,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,Year,Month,Day,WeekOfYear,Sales
1,346,2,663,1,1,0,1,a,c,8090.0,,,0,,,2014,7,29,31,8129
3,572,7,0,0,0,0,0,d,c,9230.0,4.0,2004.0,1,37.0,2009.0,2013,11,17,46,0
7,201,3,488,1,1,0,1,d,a,20260.0,,,1,18.0,2014.0,2013,7,31,31,6203
9,1111,1,746,1,1,0,0,a,a,1900.0,6.0,2014.0,1,31.0,2013.0,2014,6,30,27,11408
10,918,4,568,1,0,0,0,a,c,18710.0,4.0,2015.0,0,,,2013,7,11,28,5159
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
813756,422,4,464,1,0,0,0,a,c,2880.0,,,0,,,2014,12,11,50,4873
813757,457,2,426,1,0,0,0,d,c,13140.0,,,1,31.0,2013.0,2015,6,9,24,4672
813761,135,2,703,1,1,0,1,d,a,5190.0,,,1,1.0,2013.0,2015,3,31,14,9776
813762,923,4,609,1,0,0,0,a,a,280.0,9.0,2008.0,0,,,2015,3,12,11,4790


In [83]:
import pandas as pd

summary = pd.DataFrame([
    {
        "leaf_id": leaf_id,
        "n_rows": len(df),
        "n_stores": df["Store"].nunique()
    }
    for leaf_id, df in raw_data_leaves.items()
])

summary.sort_values("leaf_id")

Unnamed: 0,leaf_id,n_rows,n_stores
0,0,83044,114
1,1,354539,492
2,2,38320,51
3,3,30593,41
4,4,54704,74
5,5,73850,100
6,6,51058,69
7,7,62469,86
8,8,24793,34
9,9,14255,19


In [84]:
# train a model on each leaf for the raw data
TARGET = "Sales"
base_save_path = "leaf_models_metadata"
os.makedirs(base_save_path, exist_ok=True)

leaf_models = {}

for leaf_id, raw_leaf_df in raw_data_leaves.items():
    print(f"Training AutoGluon for leaf {leaf_id}, samples = {len(raw_leaf_df)}")

    # Skip tiny leaves (VERY important)
    if len(raw_leaf_df) < 500:
        print(f"Skipping leaf {leaf_id} (too few samples)")
        continue

    leaf_folder = os.path.join(base_save_path, f"leaf_{leaf_id}")

    predictor = TabularPredictor(
        label=TARGET,
        eval_metric="rmse",
        path=leaf_folder
    ).fit(
        raw_leaf_df.drop(columns=["leaf_id"], errors="ignore"),
        presets="best_quality",
        time_limit=600
    )

    leaf_models[leaf_id] = predictor


Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.11.11
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.4.0: Fri Mar 15 00:19:22 PDT 2024; root:xnu-10063.101.17~1/RELEASE_ARM64_T8112
CPU Count:          8
Memory Avail:       1.68 GB / 8.00 GB (21.0%)
Disk Space Avail:   27.75 GB / 228.27 GB (12.2%)
Presets specified: ['best_quality']
Using hyperparameters preset: hyperparameters='zeroshot'
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the dat

Training AutoGluon for leaf 0, samples = 83044


Leaderboard on holdout data (DyStack):
                 model  score_holdout   score_val              eval_metric  pred_time_test  pred_time_val    fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0    LightGBMXT_BAG_L1    -303.731466 -318.058227  root_mean_squared_error       12.964498     231.752966   82.588748                12.964498              231.752966          82.588748            1       True          1
1  WeightedEnsemble_L3    -303.731466 -318.058227  root_mean_squared_error       12.966466     231.753982   82.609278                 0.001968                0.001016           0.020530            3       True          4
2  WeightedEnsemble_L2    -303.731466 -318.058227  root_mean_squared_error       12.967669     231.754448   82.596955                 0.003171                0.001482           0.008207            2       True          2
3    LightGBMXT_BAG_L2    -306.997054 -334.851714  root_mean_squared_error   

Training AutoGluon for leaf 1, samples = 354539


2026-01-09 11:53:54,056	ERROR worker.py:409 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2026-01-09 11:53:54,057	ERROR worker.py:409 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2026-01-09 11:53:54,058	ERROR worker.py:409 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2026-01-09 11:53:54,058	ERROR worker.py:409 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
2026-01-09 11:53:54,060	ERROR worker.py:409 -- Unhandled error (suppress with 'RAY_IGNORE_UN

Training AutoGluon for leaf 2, samples = 38320


Leaderboard on holdout data (DyStack):
                     model  score_holdout   score_val              eval_metric  pred_time_test  pred_time_val   fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L3    -316.092661 -342.063682  root_mean_squared_error        7.493605      76.030474  88.163705                 0.001761                0.000550           0.034596            3       True         11
1          LightGBM_BAG_L2    -317.288597 -350.323890  root_mean_squared_error        7.169060      75.303412  83.484571                 0.086731                0.611553           2.857025            2       True          8
2      WeightedEnsemble_L2    -317.913559 -342.899024  root_mean_squared_error        5.930279      72.297505  61.077967                 0.001256                0.001582           0.039777            2       True          6
3     ExtraTreesMSE_BAG_L2    -319.849767 -349.125879  root_mean_

Training AutoGluon for leaf 3, samples = 30593


Leaderboard on holdout data (DyStack):
                     model  score_holdout   score_val              eval_metric  pred_time_test  pred_time_val    fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L3    -409.256857 -449.422692  root_mean_squared_error        8.654821      44.969810  101.412855                 0.003117                0.001809           0.207180            3       True         13
1      WeightedEnsemble_L2    -410.333074 -451.296659  root_mean_squared_error        7.444723      42.159084   73.170268                 0.001808                0.001437           0.022374            2       True          7
2     ExtraTreesMSE_BAG_L2    -417.177291 -458.061469  root_mean_squared_error        8.222645      43.771513   84.898436                 0.246086                0.508981           3.744860            2       True         11
3          LightGBM_BAG_L1    -420.668207 -465.843258  root_m

Training AutoGluon for leaf 4, samples = 54704


Leaderboard on holdout data (DyStack):
                    model  score_holdout   score_val              eval_metric  pred_time_test  pred_time_val    fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0     WeightedEnsemble_L3    -370.074527 -347.157181  root_mean_squared_error        7.697029      73.185077   92.346683                 0.002517                0.001157           0.041077            3       True          9
1     WeightedEnsemble_L2    -371.117549 -347.414201  root_mean_squared_error        7.449846      72.113898   82.910752                 0.002378                0.001415           0.028433            2       True          5
2         LightGBM_BAG_L1    -371.583997 -353.698696  root_mean_squared_error        2.439841      22.759310   25.624219                 2.439841               22.759310          25.624219            1       True          2
3         LightGBM_BAG_L2    -373.449022 -355.604618  root_mean_s

Training AutoGluon for leaf 5, samples = 73850


Leaderboard on holdout data (DyStack):
                 model  score_holdout   score_val              eval_metric  pred_time_test  pred_time_val    fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0    LightGBMXT_BAG_L1    -385.826664 -403.848167  root_mean_squared_error       15.822081     298.210096   74.433121                15.822081              298.210096          74.433121            1       True          1
1  WeightedEnsemble_L3    -385.826664 -403.848167  root_mean_squared_error       15.823706     298.211010   74.448658                 0.001625                0.000914           0.015537            3       True          4
2  WeightedEnsemble_L2    -385.826664 -403.848167  root_mean_squared_error       15.824875     298.211088   74.437447                 0.002794                0.000992           0.004326            2       True          2
3    LightGBMXT_BAG_L2    -394.094040 -424.721776  root_mean_squared_error   

Training AutoGluon for leaf 6, samples = 51058


Leaderboard on holdout data (DyStack):
                    model  score_holdout   score_val              eval_metric  pred_time_test  pred_time_val    fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0     WeightedEnsemble_L3    -349.817305 -379.429277  root_mean_squared_error        9.911628      78.692785   90.723291                 0.002815                0.000686           0.045201            3       True          8
1     WeightedEnsemble_L2    -350.202613 -380.167087  root_mean_squared_error        9.615922      77.615468   78.932209                 0.004033                0.000732           0.016861            2       True          4
2         LightGBM_BAG_L1    -355.519209 -386.623358  root_mean_squared_error        3.506776      30.253178   26.776278                 3.506776               30.253178          26.776278            1       True          2
3         LightGBM_BAG_L2    -362.739936 -392.145354  root_mean_s

Training AutoGluon for leaf 7, samples = 62469


Leaderboard on holdout data (DyStack):
                    model  score_holdout   score_val              eval_metric  pred_time_test  pred_time_val    fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0     WeightedEnsemble_L3    -328.276189 -327.462021  root_mean_squared_error        9.501860     142.122755  100.983098                 0.002960                0.000933           0.035529            3       True          8
1         LightGBM_BAG_L2    -328.961111 -335.366391  root_mean_squared_error        9.218057     141.247949   85.707407                 0.124470                0.791411           3.519169            2       True          5
2     WeightedEnsemble_L2    -329.444310 -327.622733  root_mean_squared_error        9.095801     140.457330   82.203987                 0.002214                0.000793           0.015749            2       True          3
3         LightGBM_BAG_L1    -330.221152 -333.624306  root_mean_s

Training AutoGluon for leaf 8, samples = 24793


Leaderboard on holdout data (DyStack):
                     model  score_holdout   score_val              eval_metric  pred_time_test  pred_time_val    fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L3    -351.256826 -371.573880  root_mean_squared_error        5.036418      23.683197  100.981408                 0.005872                0.000840           0.085071            3       True         11
1   RandomForestMSE_BAG_L2    -353.965890 -381.843976  root_mean_squared_error        4.674668      21.808533   91.082449                 0.740494                0.866410          14.284852            2       True         10
2      WeightedEnsemble_L2    -354.459951 -373.825209  root_mean_squared_error        3.207437      19.585722   68.231682                 0.002229                0.000410           0.030270            2       True          7
3          LightGBM_BAG_L2    -355.105122 -392.500168  root_m

Training AutoGluon for leaf 9, samples = 14255


Leaderboard on holdout data (DyStack):
                     model  score_holdout   score_val              eval_metric  pred_time_test  pred_time_val   fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L2    -305.025293 -317.308718  root_mean_squared_error        2.232792      10.088833  43.929411                 0.001847                0.006710           0.093078            2       True          8
1      WeightedEnsemble_L3    -305.226292 -317.188559  root_mean_squared_error        3.288179      12.350785  88.988254                 0.003251                0.001239           0.105138            3       True         14
2          LightGBM_BAG_L2    -306.599685 -335.471161  root_mean_squared_error        2.845433      11.701522  78.238784                 0.047978                0.296203           2.673915            2       True         10
3     ExtraTreesMSE_BAG_L2    -311.173248 -325.751044  root_mean_

Training AutoGluon for leaf 10, samples = 22518


Leaderboard on holdout data (DyStack):
                     model  score_holdout   score_val              eval_metric  pred_time_test  pred_time_val    fit_time  pred_time_test_marginal  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0      WeightedEnsemble_L3    -547.988559 -541.449871  root_mean_squared_error        6.699146      45.597932  100.908371                 0.002166                0.001228           0.045088            3       True         12
1      WeightedEnsemble_L2    -549.901976 -542.448856  root_mean_squared_error        5.298653      42.284604   69.301219                 0.002193                0.000725           0.044682            2       True          7
2          LightGBM_BAG_L1    -554.522739 -557.414493  root_mean_squared_error        1.449346      10.072030   11.581484                 1.449346               10.072030          11.581484            1       True          2
3     ExtraTreesMSE_BAG_L2    -555.356543 -559.569910  root_m

In [87]:
#reload the dictionary of all the leaf models

leaf_models = {}
base_save_path = "leaf_models_metadata"

for leaf_id in range(11):
    leaf_folder = os.path.join(base_save_path, f"leaf_{leaf_id}")
    leaf_models[leaf_id] = TabularPredictor.load(leaf_folder)


In [88]:
leaf_models

{0: <autogluon.tabular.predictor.predictor.TabularPredictor at 0x37fcb0710>,
 1: <autogluon.tabular.predictor.predictor.TabularPredictor at 0x37fc6ff90>,
 2: <autogluon.tabular.predictor.predictor.TabularPredictor at 0x37fc75b50>,
 3: <autogluon.tabular.predictor.predictor.TabularPredictor at 0x37f96b310>,
 4: <autogluon.tabular.predictor.predictor.TabularPredictor at 0x37f757190>,
 5: <autogluon.tabular.predictor.predictor.TabularPredictor at 0x37f6fcf50>,
 6: <autogluon.tabular.predictor.predictor.TabularPredictor at 0x37f9034d0>,
 7: <autogluon.tabular.predictor.predictor.TabularPredictor at 0x37f91e8d0>,
 8: <autogluon.tabular.predictor.predictor.TabularPredictor at 0x37fcd6f10>,
 9: <autogluon.tabular.predictor.predictor.TabularPredictor at 0x37fcb52d0>,
 10: <autogluon.tabular.predictor.predictor.TabularPredictor at 0x37f74ef50>}

In [92]:
df

Unnamed: 0,num_rows,sales_mean,sales_std,sales_cv,sales_skew,sales_kurtosis,closed_fraction,weekend_ratio,customers_mean,customers_std,pca_var_first,pca_components_90,store_type,assortment,competition_mean,competition_std,label,leaf_id
0,743,4593.7140,2435.7651,0.530239,-0.506928,-0.041492,0.160162,0.829754,537.504711,265.109817,0.997754,1,0,0,540.0,0.0,store_44,0
1,741,5661.2210,2928.6194,0.517312,-0.786354,0.092722,0.172740,0.939018,514.367072,258.919846,0.000000,18,0,2,8090.0,0.0,store_346,1
2,754,5204.5977,2756.3560,0.529600,-0.568414,-0.033909,0.157825,0.800631,691.066313,344.848645,0.000000,18,0,2,670.0,0.0,store_331,2
3,761,5704.5786,2961.0244,0.519061,-0.701369,0.099789,0.168200,1.059587,474.249671,231.376864,0.997550,1,3,2,9230.0,0.0,store_572,1
4,760,10132.8430,6128.6270,0.604828,-0.221403,-0.675996,0.171053,0.553850,1120.869565,638.839728,0.996584,1,0,2,210.0,0.0,store_1014,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,743,5103.3135,2652.5212,0.519764,-0.792899,0.236228,0.170929,0.962292,546.318977,267.520106,0.996651,1,3,2,15720.0,0.0,store_588,7
1111,739,3603.9182,2032.6263,0.564005,-0.292446,0.045020,0.162382,0.755790,437.604871,222.133675,0.000000,18,0,0,2320.0,0.0,store_486,1
1112,770,5511.5947,3165.9922,0.574424,-0.402313,-0.381137,0.181818,0.772786,607.420779,327.400547,0.998231,1,0,0,7240.0,0.0,store_917,0
1113,751,7614.8384,3988.4440,0.523773,-0.681120,-0.194850,0.162450,0.844251,733.103862,365.779886,0.997582,1,0,0,2850.0,0.0,store_683,6


## Performance comparison

In [95]:
def compute_metadata(df):
    meta = {}

    # 1. Dataset size
  
    meta["num_rows"] = len(df)


    # 2. Predicted Sales statistics

    if "PredictedSales" in df:
        sales = df["PredictedSales"].replace(0, np.nan).dropna()

        meta["sales_mean"] = sales.mean()
        meta["sales_std"] = sales.std()
        meta["sales_cv"] = sales.std() / (sales.mean() + 1e-9)

        # Skew and kurtosis tell you whether the store has spikes
        meta["sales_skew"] = sales.skew()
        meta["sales_kurtosis"] = sales.kurtosis()

        # How often the store is closed
        meta["closed_fraction"] = (df["Open"] == 0).mean() if "Open" in df else 0.0
    else:
        meta["sales_mean"] = None
        meta["sales_std"] = None
        meta["sales_cv"] = None
        meta["sales_skew"] = None
        meta["sales_kurtosis"] = None
        meta["closed_fraction"] = None


    # 3. Weekend vs weekday pattern

    if "Sales" in df and "DayOfWeek" in df:
        weekday_sales = df.groupby("DayOfWeek")["Sales"].mean()
        meta["weekend_ratio"] = weekday_sales.get(6, 0) / (weekday_sales.mean() + 1e-9)
    else:
        meta["weekend_ratio"] = None



    #5. Customers Mean and std
    meta["customers_mean"] = df["Customers"].mean()
    meta["customers_std"] = df["Customers"].std()



    # 6. Intrinsic dimensionality (PCA)
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    try:
        scaled = df[numeric_cols].fillna(df[numeric_cols].median())
        pca = PCA()
        pca.fit(scaled)
        meta["pca_var_first"] = pca.explained_variance_ratio_[0]
        meta["pca_components_90"] = int(np.searchsorted(np.cumsum(pca.explained_variance_ratio_), 0.90) + 1)
    except Exception:
        meta["pca_var_first"] = 0.0
        meta["pca_components_90"] = len(numeric_cols)

    
    # 7. Categorical store features (ordinal encoding)
   
    store_type_map = {"a": 0, "b": 1, "c": 2, "d": 3}
    assortment_map = {"a": 0, "b": 1, "c": 2}

    if "StoreType" in df:
        meta["store_type"] = store_type_map.get(df["StoreType"].iloc[0], -1)
    else:
        meta["store_type"] = -1

    if "Assortment" in df:
        meta["assortment"] = assortment_map.get(df["Assortment"].iloc[0], -1)
    else:
        meta["assortment"] = -1



    # 8. Competition distance stats
    
    if "CompetitionDistance" in df:
        comp = df["CompetitionDistance"].fillna(df["CompetitionDistance"].median())
        meta["competition_mean"] = comp.mean()
        meta["competition_std"] = comp.std()
    else:
        meta["competition_mean"] = None
        meta["competition_std"] = None

    return meta


In [163]:
from autogluon.tabular import TabularPredictor
import os
import pandas as pd

def predict_for_dataset(df):
    """
    Predict sales for a full dataset using metadata-based CART routing
    """

    # 1️⃣ Compute metadata (dataset-level)
    metadata = compute_metadata(df)
    meta_df = pd.DataFrame([metadata])

    # 2️⃣ Determine CART leaf
    cart_array = meta_df[cart_features].to_numpy()
    leaf_id = get_leaf_id(cart_tree, cart_array[0])

    print(f"Predicted leaf: {leaf_id}")

    # 3️⃣ Load leaf-specific predictor
    leaf_path = os.path.join("leaf_models_metadata", f"leaf_{leaf_id}")

    if not os.path.exists(leaf_path):
        raise ValueError(f"No trained model found for leaf {leaf_id}")

    predictor = TabularPredictor.load(leaf_path)

    # 4️⃣ Predict all rows
    preds = predictor.predict(df)

    return preds


In [164]:
df_test = pd.read_csv("main_test_df.csv")
df_test

Unnamed: 0,Store,DayOfWeek,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,Year,Month,Day,WeekOfYear,Sales
0,448,5,698,1,0,0,0,a,c,3970.0,9.0,2009.0,0,,,2014,9,26,39,7418
1,1113,2,606,1,0,0,1,a,c,9260.0,,,0,,,2013,7,9,28,5258
2,408,7,0,0,0,0,0,c,a,1560.0,,,1,45.0,2009.0,2013,2,10,6,0
3,410,1,1206,1,1,0,0,c,a,40.0,11.0,2011.0,1,22.0,2012.0,2014,2,3,6,11920
4,193,1,764,1,0,0,0,a,a,520.0,,,0,,,2013,11,11,46,4371
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203440,282,4,753,1,1,0,0,a,a,1220.0,12.0,2010.0,0,,,2013,8,15,33,5525
203441,1111,2,559,1,1,0,0,a,a,1900.0,6.0,2014.0,1,31.0,2013.0,2014,11,25,48,6679
203442,531,2,680,1,1,0,0,a,c,4030.0,,,0,,,2015,1,27,5,6042
203443,904,7,0,0,0,0,0,d,c,570.0,7.0,2013.0,1,14.0,2011.0,2015,2,22,8,0


In [169]:
temp = df_test[df_test["Store"]== 448]
temp

Unnamed: 0,Store,DayOfWeek,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,Year,Month,Day,WeekOfYear,Sales
0,448,5,698,1,0,0,0,a,c,3970.0,9.0,2009.0,0,,,2014,9,26,39,7418
889,448,2,857,1,1,0,0,a,c,3970.0,9.0,2009.0,0,,,2013,3,19,12,9772
2009,448,6,341,1,0,0,0,a,c,3970.0,9.0,2009.0,0,,,2015,1,24,4,4011
2743,448,3,724,1,0,0,0,a,c,3970.0,9.0,2009.0,0,,,2015,6,24,26,7901
3604,448,5,848,1,1,0,0,a,c,3970.0,9.0,2009.0,0,,,2013,1,11,2,9128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200428,448,3,870,1,1,0,0,a,c,3970.0,9.0,2009.0,0,,,2014,11,5,45,11059
201197,448,2,745,1,1,0,0,a,c,3970.0,9.0,2009.0,0,,,2014,9,16,38,9075
201292,448,5,834,1,1,0,0,a,c,3970.0,9.0,2009.0,0,,,2013,9,27,39,9070
201493,448,6,453,1,0,0,0,a,c,3970.0,9.0,2009.0,0,,,2013,12,28,52,5159


In [None]:
# CART algo
df_new = temp.copy()
preds_new = predict_for_dataset(df_new)
df_new["predicted_sales"] = preds_new
df_new

Predicted leaf: 0


Unnamed: 0,Store,DayOfWeek,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,...,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,Year,Month,Day,WeekOfYear,Sales,predicted_sales
0,448,5,698,1,0,0,0,a,c,3970.0,...,2009.0,0,,,2014,9,26,39,7418,5724.328125
889,448,2,857,1,1,0,0,a,c,3970.0,...,2009.0,0,,,2013,3,19,12,9772,8125.454590
2009,448,6,341,1,0,0,0,a,c,3970.0,...,2009.0,0,,,2015,1,24,4,4011,3125.678223
2743,448,3,724,1,0,0,0,a,c,3970.0,...,2009.0,0,,,2015,6,24,26,7901,6106.313477
3604,448,5,848,1,1,0,0,a,c,3970.0,...,2009.0,0,,,2013,1,11,2,9128,7625.145020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200428,448,3,870,1,1,0,0,a,c,3970.0,...,2009.0,0,,,2014,11,5,45,11059,8496.915039
201197,448,2,745,1,1,0,0,a,c,3970.0,...,2009.0,0,,,2014,9,16,38,9075,6783.205078
201292,448,5,834,1,1,0,0,a,c,3970.0,...,2009.0,0,,,2013,9,27,39,9070,7558.923340
201493,448,6,453,1,0,0,0,a,c,3970.0,...,2009.0,0,,,2013,12,28,52,5159,4042.080078


In [175]:
y_true = df_new["Sales"]
y_pred = df_new["predicted_sales"]
rmse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)
print(f"rmse: {rmse}")
print(f"mae: {mae}")

rmse: 2577910.5
mae: 1366.41259765625


In [176]:
# Global model 
global_predictor = TabularPredictor.load(f"new_global/")
y_pred = global_predictor.predict(df_new)
y_true = df_new["Sales"]
rmse = mean_squared_error(y_true, y_pred)
mae = mean_absolute_error(y_true , y_pred)
print(f"rmse: {rmse}")
print(f"mae: {mae}")

rmse: 443204.3125
mae: 496.8134765625
