In [74]:
import pandas as pd
import numpy as np
import autogluon
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.tree import DecisionTreeClassifier



## 1. Create datasets of different sizes and distributions

In [75]:
# Load the full training table; the path is relative to the notebook's
# working directory, so the CSV must sit next to where the kernel was started.
main_train_df = pd.read_csv("main_train_df.csv")

In [76]:
# A dict of all the store branches, keyed "store_<id>" -> that store's rows.
# One groupby pass replaces re-scanning the whole frame once per store id;
# sort=False keeps the stores in order of first appearance, which is the
# order .unique() produced.
stores = {
    f"store_{store_id}": df_store
    for store_id, df_store in main_train_df.groupby("Store", sort=False)
}

In [77]:
stores["store_2"].columns

Index(['Store', 'DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'Year', 'Month', 'Day',
       'WeekOfYear', 'Sales'],
      dtype='object')

## 2. Compute metadata info

In [78]:
def compute_metadata(df):
    """Summarise one dataset as a flat dict of meta-features.

    Keys, in insertion order: "num_rows" (row count), "promo_fraction",
    "promo2_fraction" and "schoolholiday_fraction" (means of the "Promo",
    "Promo2" and "SchoolHoliday" columns), then "mean_sales" and "std_sales"
    (mean and pandas' default ``.std()`` of the "Sales" column).
    """
    summary = {"num_rows": len(df)}

    # Column means, inserted in a fixed order because the resulting dict
    # order becomes the column order of any DataFrame built from it.
    for key, column in (
        ("promo_fraction", "Promo"),
        ("promo2_fraction", "Promo2"),
        ("schoolholiday_fraction", "SchoolHoliday"),
    ):
        summary[key] = df[column].mean()

    sales = df["Sales"]
    summary["mean_sales"] = sales.mean()
    summary["std_sales"] = sales.std()
    return summary

## 3. Build a dataset of datasets

In [79]:
# One metadata record per store; "label" carries the store key so each
# record can be traced back to its dataset in `stores`.
meta_rows = [
    {**compute_metadata(store_df), "label": store_key}
    for store_key, store_df in stores.items()
]

meta_df = pd.DataFrame(meta_rows)

In [80]:
meta_df["num_rows"].unique()

array([743, 741, 754, 761, 759, 747, 756, 755, 760, 742, 749, 757, 615,
       715, 729, 765, 733, 779, 744, 750, 598, 611, 610, 622, 762, 748,
       751, 768, 774, 621, 776, 737, 766, 606, 745, 728, 740, 730, 727,
       592, 731, 746, 613, 773, 739, 769, 618, 617, 758, 752, 732, 632,
       607, 767, 597, 763, 589, 738, 601, 778, 605, 736, 753, 780, 775,
       787, 604, 600, 599, 725, 764, 624, 614, 612, 591, 596, 735, 772,
       602, 585, 608, 603, 722, 734, 770, 724, 609, 594, 777, 625, 590,
       588, 619, 620, 616, 771, 726, 636, 781, 631, 595, 623, 789, 577,
       578, 582, 723, 791, 782, 785, 784, 587, 716, 626, 580, 579])

## 4. Train a decision tree to group datasets 

In [81]:
def assign_group(row):
    """Bucket a metadata row by its "num_rows" value.

    Returns "small" below 600 rows, "medium" from 600 up to (but not
    including) 700, and "large" for everything else.
    """
    n_rows = row["num_rows"]
    if n_rows < 600:
        return "small"
    if n_rows < 700:
        return "medium"
    return "large"

# Tag every store's metadata row with its size bucket; axis=1 passes each
# row to assign_group, which reads the row's "num_rows" entry.
meta_df["group"] = meta_df.apply(assign_group, axis=1)

In [82]:
# Features are the numeric metadata columns; the store key ("label") and the
# target ("group") are excluded.
# NOTE(review): "group" is derived from "num_rows", which is also a feature,
# so the tree can recover the grouping from that one column — confirm this
# is the intended setup.
X = meta_df.drop(columns=["group", "label"])
y = meta_df["group"]

# Fix random_state so a Restart & Run All refits the same tree: scikit-learn
# permutes the candidate features at each split even with splitter="best",
# so equally good splits may otherwise be chosen differently between runs.
tree = DecisionTreeClassifier(max_depth=3, random_state=42)
tree.fit(X, y)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


## 5. Train an AutoGluon model for each group

In [83]:
meta_df

Unnamed: 0,num_rows,promo_fraction,promo2_fraction,schoolholiday_fraction,mean_sales,std_sales,label,group
0,743,0.382234,0.0,0.177658,4606.578735,2482.882693,store_44,large
1,741,0.387314,0.0,0.174089,5672.014845,2996.270525,store_346,large
2,754,0.393899,1.0,0.156499,5193.153846,2745.463917,store_331,large
3,761,0.383706,1.0,0.173456,5701.437582,2998.231622,store_572,large
4,759,0.396574,1.0,0.183136,10139.573123,6176.087416,store_1014,large
...,...,...,...,...,...,...,...,...
1110,743,0.379542,0.0,0.165545,5091.839838,2641.866692,store_588,large
1111,739,0.400541,1.0,0.181326,3597.783491,2024.721289,store_486,large
1112,770,0.385714,0.0,0.189610,5498.275325,3153.566946,store_917,large
1113,751,0.379494,0.0,0.177097,7627.969374,4057.975425,store_683,large


In [85]:
group_models = {}

for group in meta_df["group"].unique():
    print("Training model for group:", group)

    # Store keys whose metadata row falls in this group.
    in_group = meta_df["group"] == group
    store_names = meta_df.loc[in_group, "label"].tolist()

    # Stack the rows of every store in the group into one training frame
    # with a fresh 0..n-1 index.
    group_frames = [stores[name] for name in store_names]
    group_df = pd.concat(group_frames, ignore_index=True)

    # One predictor per group, each saved under its own directory.
    untrained = TabularPredictor(label="Sales", path=f"Models_group/{group}/")
    predictor = untrained.fit(
        group_df,
        presets="best",
        time_limit=600,
        dynamic_stacking=False,
    )

    group_models[group] = predictor

Preset alias specified: 'best' maps to 'best_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.11.11
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.4.0: Fri Mar 15 00:19:22 PDT 2024; root:xnu-10063.101.17~1/RELEASE_ARM64_T8112
CPU Count:          8
Memory Avail:       2.13 GB / 8.00 GB (26.6%)
Disk Space Avail:   59.01 GB / 228.27 GB (25.9%)
Presets specified: ['best']
Using hyperparameters preset: hyperparameters='zeroshot'
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1


Training model for group: large


Beginning AutoGluon training ... Time limit = 600s
AutoGluon will save models to "/Users/sepideghorbanian/Documents/Semester_5/Research_Project/Models_group/large"
Train Data Rows:    704549
Train Data Columns: 18
Label Column:       Sales
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == int and many unique label-values observed).
	Label info (max, min, mean, stddev): (38484, 0, 5825.31651, 3899.08323)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during Predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression', 'quantile'])
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    2262.98 MB
	Train Data (Original)  Memory Usage: 197.54 MB (8.7% of available memory)
	Inferring data type of each feature based on column values. Se

Training model for group: medium


	Available Memory:                    2503.48 MB
	Train Data (Original)  Memory Usage: 23.31 MB (0.9% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 4 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting CategoryFeatureGenerator...
			Fitting CategoryMemoryMinimizeFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', [])  :  5 | ['CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2SinceWeek', 'Promo2SinceYear']
		('int', [])    : 10 | ['Store', 'DayOfWeek', 'Open', '

Training model for group: small


	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 5 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting CategoryFeatureGenerator...
			Fitting CategoryMemoryMinimizeFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', [])  :  5 | ['CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2SinceWeek', 'Promo2SinceYear']
		('int', [])    : 10 | ['Store', 'DayOfWeek', 'Open', 'Promo', 'SchoolHoliday', ...]
		('object', []) :  3 | ['StateHoliday', 'StoreType', 'Assortment']
	Types of features in processed data (raw dtype, special dtypes):
		('category', [])  : 2 | ['StateHoliday', 'StoreType']
		('float', [])     : 5 | ['CompetitionD

In [86]:
group_models

{'large': <autogluon.tabular.predictor.predictor.TabularPredictor at 0x3980c5150>,
 'medium': <autogluon.tabular.predictor.predictor.TabularPredictor at 0x31104a1d0>,
 'small': <autogluon.tabular.predictor.predictor.TabularPredictor at 0x179240990>}

In [87]:
tree

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,3
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


## 6. Final Pipeline

In [93]:
def predict_for_dataset(new_df, predictors=None):
    """Route a dataset to its group model and return that model's predictions.

    Steps: compute the dataset's metadata, let the fitted decision tree pick
    a group, then predict with that group's predictor.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Dataset to predict on. It must contain every column that
        ``compute_metadata`` reads, which includes "Sales".
        NOTE(review): because the metadata uses "Sales", this pipeline cannot
        route data whose sales are unknown — confirm that is acceptable.
    predictors : dict, optional
        Mapping of group name -> already-loaded predictor. When omitted, the
        notebook-level ``group_models`` dict is used if it exists. Any group
        missing from the mapping is loaded from ``Models_group/<group>/``.

    Returns
    -------
    Whatever ``predictor.predict(new_df)`` returns for the selected group.
    """
    # 1. Compute metadata
    metadata = compute_metadata(new_df)
    meta_vec = pd.DataFrame([metadata])

    # Put the columns in the exact order the tree was fitted on instead of
    # relying on dict insertion order; a missing feature raises KeyError here
    # rather than being silently mis-aligned.
    feature_names = getattr(tree, "feature_names_in_", None)
    if feature_names is not None:
        meta_vec = meta_vec[list(feature_names)]

    # 2. Use decision tree to decide group
    group = tree.predict(meta_vec)[0]
    print("Dataset assigned to group:", group)

    # 3. Get the right model: prefer a predictor that is already in memory,
    #    and only fall back to reloading from disk (a cwd-relative path) when
    #    none is available, e.g. in a fresh kernel that skipped training.
    if predictors is None:
        predictors = globals().get("group_models") or {}
    predictor = predictors.get(group)
    if predictor is None:
        predictor = TabularPredictor.load(f"Models_group/{group}/")

    # 4. Predict
    return predictor.predict(new_df)

## 7. Usage

In [127]:
stores["store_306"]

Unnamed: 0,Store,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,Year,Month,Day,WeekOfYear,Sales
1133,306,1,1,1,0,0,a,a,5100.0,4.0,2007.0,1,40.0,2014.0,2015,3,2,10,5744
1570,306,3,1,1,0,0,a,a,5100.0,4.0,2007.0,1,40.0,2014.0,2013,1,9,2,5083
2056,306,2,1,0,0,1,a,a,5100.0,4.0,2007.0,1,40.0,2014.0,2013,12,24,52,4018
5080,306,6,1,0,0,0,a,a,5100.0,4.0,2007.0,1,40.0,2014.0,2013,3,30,13,3279
5379,306,4,1,0,0,0,a,a,5100.0,4.0,2007.0,1,40.0,2014.0,2015,1,22,4,3593
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
808626,306,5,1,1,0,0,a,a,5100.0,4.0,2007.0,1,40.0,2014.0,2013,5,3,18,5425
809985,306,7,0,0,0,0,a,a,5100.0,4.0,2007.0,1,40.0,2014.0,2015,4,19,16,0
810129,306,4,1,1,0,0,a,a,5100.0,4.0,2007.0,1,40.0,2014.0,2013,1,10,2,4766
811998,306,4,1,0,0,0,a,a,5100.0,4.0,2007.0,1,40.0,2014.0,2013,2,28,9,4336


In [128]:
# NOTE(review): store_306 comes from `stores`, i.e. from the same training
# table the group models were fitted on, so the predictions below are
# in-sample and say nothing about accuracy on unseen data.
incoming_df = stores["store_306"]

In [129]:
preds = predict_for_dataset(incoming_df)

Dataset assigned to group: small


In [130]:
preds

1133      5906.905273
1570      4972.027832
2056      3622.307861
5080      3706.457275
5379      3660.002441
             ...     
808626    5334.231445
809985      17.506905
810129    4495.095215
811998    4100.255859
812127    4237.681152
Name: Sales, Length: 596, dtype: float32