In [1]:
import pandas as pd
import numpy as np
import autogluon
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score



  from .autonotebook import tqdm as notebook_tqdm


## 1. Create different datasets that differ in size and distribution.

In [2]:
# Load the full training table from a CSV file; the path is relative to the working directory.
main_train_df = pd.read_csv("main_train_df.csv")

In [3]:
# A dict of all the store branches: "store_<id>" -> the rows of that store.
# One groupby pass splits the table once, instead of re-scanning the whole
# frame with a boolean mask for every store id. sort=False keeps the keys in
# order of first appearance, which is the order `unique()` would give.
# Note: groupby skips rows whose "Store" is NaN, so no empty "store_nan"
# entry is created for them.
stores = {}
for store_id, df_store in main_train_df.groupby("Store", sort=False):
    stores[f"store_{store_id}"] = df_store

In [4]:
# Inspect the column names of one per-store frame.
stores["store_2"].columns

Index(['Store', 'DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'Year', 'Month', 'Day',
       'WeekOfYear', 'Sales'],
      dtype='object')

## 2. Compute metadata info

In [5]:
def compute_metadata(df):
    """Summarise one dataset as a flat dict of meta-features.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns "Promo", "Promo2", "SchoolHoliday" and "Sales".

    Returns
    -------
    dict
        In insertion order: "num_rows" (``len(df)``), the mean of each of
        "Promo", "Promo2" and "SchoolHoliday", then the mean and the standard
        deviation (pandas default for ``Series.std``) of "Sales".
    """
    # (output key, source column) pairs whose value is the column mean.
    mean_features = (
        ("promo_fraction", "Promo"),
        ("promo2_fraction", "Promo2"),
        ("schoolholiday_fraction", "SchoolHoliday"),
    )

    summary = {"num_rows": len(df)}
    for key, column in mean_features:
        summary[key] = df[column].mean()
    summary["mean_sales"] = df["Sales"].mean()
    summary["std_sales"] = df["Sales"].std()
    return summary

## 3. Build a dataset of datasets

In [6]:
# One metadata record per store; "label" carries the store's key in `stores`.
meta_rows = [
    {**compute_metadata(store_df), "label": store_name}
    for store_name, store_df in stores.items()
]

meta_df = pd.DataFrame(meta_rows)

In [7]:
# Distinct row counts across the per-store datasets.
meta_df["num_rows"].unique()

array([743, 741, 754, 761, 759, 747, 756, 755, 760, 742, 749, 757, 615,
       715, 729, 765, 733, 779, 744, 750, 598, 611, 610, 622, 762, 748,
       751, 768, 774, 621, 776, 737, 766, 606, 745, 728, 740, 730, 727,
       592, 731, 746, 613, 773, 739, 769, 618, 617, 758, 752, 732, 632,
       607, 767, 597, 763, 589, 738, 601, 778, 605, 736, 753, 780, 775,
       787, 604, 600, 599, 725, 764, 624, 614, 612, 591, 596, 735, 772,
       602, 585, 608, 603, 722, 734, 770, 724, 609, 594, 777, 625, 590,
       588, 619, 620, 616, 771, 726, 636, 781, 631, 595, 623, 789, 577,
       578, 582, 723, 791, 782, 785, 784, 587, 716, 626, 580, 579])

## 4. Train a decision tree to group datasets 

In [8]:
# To create the labels for the decision tree, evaluate the single global model (trained on all stores) on each store, compute the per-store RMSE, and group the stores by that RMSE:
# Stores with bad RMSE → “hard” group
# Stores with average RMSE → “medium” group
# Stores with good RMSE → “easy” group

In [9]:
# Load an already-trained predictor from the local "AutogluonModels_5min/" directory;
# it is the single global model that the per-store RMSEs are computed with.
global_predictor = TabularPredictor.load("AutogluonModels_5min/")

In [10]:
# Per-store RMSE of the global model. The value is wrapped in abs() because
# evaluate() reports this metric as a negative number (see the evaluate outputs
# further down), so the stored numbers are positive magnitudes.
store_rmses = {
    store_name: abs(global_predictor.evaluate(store_df)["root_mean_squared_error"])
    for store_name, store_df in stores.items()
}

In [11]:
# Display the per-store RMSE dict.
store_rmses

{'store_44': np.float64(427.03360952640526),
 'store_346': np.float64(616.0141636042241),
 'store_331': np.float64(585.7557191763498),
 'store_572': np.float64(496.19500154528595),
 'store_1014': np.float64(1172.7844553164641),
 'store_630': np.float64(527.3283023811134),
 'store_545': np.float64(611.9078797703415),
 'store_201': np.float64(461.2834438514096),
 'store_1096': np.float64(435.2329064714001),
 'store_1111': np.float64(501.838246002304),
 'store_918': np.float64(421.03967884933223),
 'store_563': np.float64(491.76494007260885),
 'store_731': np.float64(584.2620050975702),
 'store_406': np.float64(479.52061758359474),
 'store_1040': np.float64(633.5579906504358),
 'store_232': np.float64(539.8962895168451),
 'store_134': np.float64(358.65287362622064),
 'store_155': np.float64(495.7663021096319),
 'store_221': np.float64(442.1392740704735),
 'store_250': np.float64(625.1845422574472),
 'store_94': np.float64(532.3873351593719),
 'store_653': np.float64(435.4116964003884),
 '

In [12]:
def assign_group_by_rmse(rmse, hard_threshold=800, medium_threshold=650):
    """Bucket an RMSE value into a difficulty group.

    Parameters
    ----------
    rmse : float
        Error magnitude to classify (expected to be non-negative).
    hard_threshold : float, default 800
        Values strictly greater than this are "hard".
    medium_threshold : float, default 650
        Values strictly greater than this (and not "hard") are "medium".

    Returns
    -------
    str
        "hard", "medium" or "easy".

    Notes
    -----
    Both comparisons are strict, so a value equal to a threshold falls into
    the lower bucket. A NaN compares False against both thresholds and is
    therefore returned as "easy".
    """
    if rmse > hard_threshold:
        return "hard"
    if rmse > medium_threshold:
        return "medium"
    return "easy"

# Label every store with its difficulty bucket, looked up through the store's RMSE.
meta_df["group"] = [
    assign_group_by_rmse(store_rmses[store_label])
    for store_label in meta_df["label"]
]


In [13]:
# Number of stores that fall into each group.
meta_df["group"].value_counts()

group
easy      860
medium    151
hard      104
Name: count, dtype: int64

In [14]:
# Train the decision tree that maps store-level metadata to a difficulty group.
# "label" is only the store identifier and "group" is the target, so neither
# is used as a feature.
X = meta_df.drop(columns=["group", "label"])
y = meta_df["group"]

# random_state pins the estimator's internal randomness so that a
# Restart & Run All rebuilds the same tree; without it, ties between equally
# good splits may be resolved differently from run to run.
# NOTE(review): the groups are imbalanced (860 easy / 151 medium / 104 hard in
# the value_counts output); class_weight="balanced" may be worth trying —
# TODO confirm that it actually improves the routing.
tree = DecisionTreeClassifier(max_depth=4, random_state=42)
tree.fit(X, y)

0,1,2
,criterion,'gini'
,splitter,'best'
,max_depth,4
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


## 5. Train an AutoGluon model for each group

In [15]:

# Fit one AutoGluon predictor per difficulty group.
group_models = {}

for group in meta_df["group"].unique():
    print("Training model for group:", group)

    # Keys of the stores whose metadata row carries this group label.
    in_group = meta_df["group"] == group
    store_names = meta_df.loc[in_group, "label"].tolist()

    # Stack the raw rows of every store in the group into one training frame.
    group_frames = [stores[name] for name in store_names]
    group_df = pd.concat(group_frames, ignore_index=True)

    # Each group's predictor is saved under its own directory, which is
    # where it can later be loaded from again.
    untrained = TabularPredictor(label="Sales", path=f"Models_group/{group}/")
    predictor = untrained.fit(
        group_df,
        presets="best",
        time_limit=600,
        dynamic_stacking=False,
    )

    group_models[group] = predictor
    

Preset alias specified: 'best' maps to 'best_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.11.11
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.4.0: Fri Mar 15 00:19:22 PDT 2024; root:xnu-10063.101.17~1/RELEASE_ARM64_T8112
CPU Count:          8
Memory Avail:       2.41 GB / 8.00 GB (30.1%)
Disk Space Avail:   53.24 GB / 228.27 GB (23.3%)
Presets specified: ['best']
Using hyperparameters preset: hyperparameters='zeroshot'
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1


Training model for group: easy


Beginning AutoGluon training ... Time limit = 600s
AutoGluon will save models to "/Users/sepideghorbanian/Documents/Semester_5/Research_Project/Models_group/easy"
Train Data Rows:    626941
Train Data Columns: 18
Label Column:       Sales
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == int and many unique label-values observed).
	Label info (max, min, mean, stddev): (26807, 0, 5153.42366, 3159.48365)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during Predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression', 'quantile'])
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    2547.53 MB
	Train Data (Original)  Memory Usage: 175.78 MB (6.9% of available memory)
	Inferring data type of each feature based on column values. Set

Training model for group: hard


	Available Memory:                    2618.51 MB
	Train Data (Original)  Memory Usage: 21.57 MB (0.8% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 4 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting CategoryFeatureGenerator...
			Fitting CategoryMemoryMinimizeFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', [])  :  5 | ['CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2SinceWeek', 'Promo2SinceYear']
		('int', [])    : 10 | ['Store', 'DayOfWeek', 'Open', '

Training model for group: medium


Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    2215.41 MB
	Train Data (Original)  Memory Usage: 30.81 MB (1.4% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 4 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting CategoryFeatureGenerator...
			Fitting CategoryMemoryMinimizeFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', [])  :  5 | ['CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 

In [16]:
# Show the predictor object stored for each group.
group_models

{'easy': <autogluon.tabular.predictor.predictor.TabularPredictor at 0x17f1a5310>,
 'hard': <autogluon.tabular.predictor.predictor.TabularPredictor at 0x17b51db10>,
 'medium': <autogluon.tabular.predictor.predictor.TabularPredictor at 0x308a113d0>}

## 6. Final Pipeline

In [17]:
def predict_for_dataset(new_df):
    """Route a dataset to its group model and return predictions plus metrics.

    The dataset is summarised with `compute_metadata`, the fitted `tree`
    picks a group from that one-row summary, and the predictor saved under
    ``Models_group/<group>/`` is loaded and run on `new_df`.

    Parameters
    ----------
    new_df : pandas.DataFrame
        Dataset to score. It is passed to `compute_metadata` and to the
        predictor's `predict` and `evaluate`, so it must contain every column
        those calls need.

    Returns
    -------
    tuple
        ``(predictor.predict(new_df), predictor.evaluate(new_df))``.
    """
    # 1. Compute metadata as a single-row frame
    meta_vec = pd.DataFrame([compute_metadata(new_df)])

    # 2. Use decision tree to decide group (one input row -> one prediction)
    predicted_groups = tree.predict(meta_vec)
    group = predicted_groups[0]
    print("Dataset assigned to group:", group)

    # 3. Load the right model from disk
    predictor = TabularPredictor.load(f"Models_group/{group}/")

    # 4. Predict, then evaluate on the same frame
    predictions = predictor.predict(new_df)
    scores = predictor.evaluate(new_df)
    return predictions, scores

## 7. Usage

In [18]:
# Display the rows that belong to store 264.
stores["store_264"]

Unnamed: 0,Store,DayOfWeek,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,Year,Month,Day,WeekOfYear,Sales
53,264,4,1,1,0,0,a,a,180.0,3.0,2014.0,0,,,2015,7,2,27,7747
880,264,2,1,0,0,0,a,a,180.0,3.0,2014.0,0,,,2014,11,18,47,5122
3495,264,3,1,1,0,0,a,a,180.0,3.0,2014.0,0,,,2014,3,19,12,7664
3905,264,3,1,0,0,0,a,a,180.0,3.0,2014.0,0,,,2013,8,7,32,5469
7490,264,3,0,0,a,1,a,a,180.0,3.0,2014.0,0,,,2014,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
808835,264,7,0,0,0,0,a,a,180.0,3.0,2014.0,0,,,2015,7,26,30,0
810525,264,7,0,0,0,0,a,a,180.0,3.0,2014.0,0,,,2014,10,19,42,0
811813,264,5,1,1,0,0,a,a,180.0,3.0,2014.0,0,,,2013,4,12,15,7871
812389,264,7,0,0,0,0,a,a,180.0,3.0,2014.0,0,,,2015,4,19,16,0


In [19]:
# Use store 264 as the example incoming dataset (this binds the same frame object; no copy is made).
incoming_df = stores["store_264"]

In [20]:
# predict_for_dataset returns a (predictions, metrics) tuple, so `preds` holds both.
preds = predict_for_dataset(incoming_df)
preds

Dataset assigned to group: easy


(53        5583.647461
 880       3914.870117
 3495      4815.408203
 3905      4075.936523
 7490      -113.602386
              ...     
 808835     -72.632935
 810525     -56.828606
 811813    5162.210449
 812389     -17.395947
 812847    4628.046387
 Name: Sales, Length: 740, dtype: float32,
 {'root_mean_squared_error': np.float64(-2539.691398079609),
  'mean_squared_error': -6450032.0,
  'mean_absolute_error': -1907.52392578125,
  'r2': 0.46888065338134766,
  'pearsonr': 0.9270856465380595,
  'median_absolute_error': -1849.822998046875})

In [21]:
# Evaluate the global model on the same frame, for comparison with the group model's metrics.
global_predictor.evaluate(incoming_df)


{'root_mean_squared_error': np.float64(-1095.2462418374596),
 'mean_squared_error': -1199564.25,
 'mean_absolute_error': -445.232421875,
 'r2': 0.9012234807014465,
 'pearsonr': 0.9494240345732199,
 'median_absolute_error': -258.593505859375}

In [22]:
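# So far the decision tree does not seem to be doing well. For store_264 the global model's RMSE (~1095) is in the "hard" range (> 800), yet the tree assigned the store to the "easy" group, and the easy-group model scored far worse (RMSE ~2540) than the global model.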

In [23]:
# Check the results again and try to fix the decision tree