In [252]:
import pandas as pd
import numpy as np
import autogluon
from autogluon.tabular import TabularDataset, TabularPredictor
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import skew, kurtosis
from sklearn.feature_selection import mutual_info_regression
from statsmodels.tsa.stattools import acf
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## 1. Create different datasets based on the store. (Different in sizes and distributions)

In [279]:
main_train_df = pd.read_csv("main_train_df.csv")

In [280]:
# A dict of all the store branches
stores = {}
for store_id in main_train_df["Store"].unique():
    df_store = main_train_df[main_train_df["Store"] == store_id]
    stores[f"store_{store_id}"] = df_store

In [281]:
stores["store_2"].columns

Index(['Store', 'DayOfWeek', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday',
       'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'Year', 'Month', 'Day',
       'WeekOfYear', 'Sales'],
      dtype='object')

In [282]:
len(stores)

1115

## 2. Compute metadata info

In [257]:
''' Prevoius simple meta data calculation which did not capture everything fully 
def compute_metadata(df):
    return {
        "num_rows": len(df),
        "promo_fraction": df["Promo"].mean(),
        "promo2_fraction": df["Promo2"].mean(),
        "schoolholiday_fraction": df["SchoolHoliday"].mean(),
        "mean_sales": df["Sales"].mean(),
        "std_sales": df["Sales"].std()
    }
'''

' Prevoius simple meta data calculation which did not capture everything fully \ndef compute_metadata(df):\n    return {\n        "num_rows": len(df),\n        "promo_fraction": df["Promo"].mean(),\n        "promo2_fraction": df["Promo2"].mean(),\n        "schoolholiday_fraction": df["SchoolHoliday"].mean(),\n        "mean_sales": df["Sales"].mean(),\n        "std_sales": df["Sales"].std()\n    }\n'

In [258]:
def compute_metadata(df):
    meta = {}

    # ----------------------------------
    # 1. Basic size features
    # ----------------------------------
    meta["num_rows"] = len(df)

    # ----------------------------------
    # 2. Sales distribution features
    # ----------------------------------
    sales = df["Sales"].values

    meta["mean_sales"] = np.mean(sales)
    meta["std_sales"] = np.std(sales)
    meta["cv_sales"] = meta["std_sales"] / (meta["mean_sales"] + 1e-9)
    meta["skew_sales"] = skew(sales)
    meta["kurtosis_sales"] = kurtosis(sales)
    meta["entropy_sales"] = (
        pd.Series(sales).value_counts(normalize=True)  # probability distrib
        .pipe(lambda p: -(p * np.log2(p + 1e-12))).sum()
    )

    # ----------------------------------
    # 3. Autocorrelation structure
    # ----------------------------------
    # lag-1, lag-7, lag-30 autocorrelation
    try:
        ac = acf(sales, nlags=30, fft=True)
        meta["acf_lag1"] = ac[1]
        meta["acf_lag7"] = ac[7] if len(ac) > 7 else 0
        meta["acf_lag30"] = ac[30] if len(ac) > 30 else 0
    except:
        meta["acf_lag1"] = meta["acf_lag7"] = meta["acf_lag30"] = 0

    # ----------------------------------
    # 4. Weekly seasonality strength
    # ----------------------------------
    # variance explained by grouping by weekday
    weekday_means = df.groupby("DayOfWeek")["Sales"].mean()
    overall_mean = meta["mean_sales"]
    ss_between = np.sum((weekday_means - overall_mean) ** 2)
    ss_total = np.sum((sales - overall_mean) ** 2)
    meta["weekday_seasonality_strength"] = ss_between / (ss_total + 1e-9)

    # ----------------------------------
    # 5. Volatility & noise
    # ----------------------------------
    diffs = np.diff(sales)
    meta["volatility"] = np.std(diffs)
    meta["jump_fraction"] = np.mean(np.abs(diffs) > (2 * np.std(diffs)))

    # ----------------------------------
    # 6. Promotion behaviour features
    # ----------------------------------
    if "Promo" in df:
        meta["promo_fraction"] = df["Promo"].mean()
        meta["promo_sales_corr"] = np.corrcoef(df["Promo"], sales)[0, 1]
    else:
        meta["promo_fraction"] = 0
        meta["promo_sales_corr"] = 0

    if "Promo2" in df:
        meta["promo2_fraction"] = df["Promo2"].mean()
    else:
        meta["promo2_fraction"] = 0

    # ----------------------------------
    # 7. Mutual information from key features
    # ----------------------------------
    important_features = []
    for col in ["Promo", "Promo2", "SchoolHoliday", "DayOfWeek", "Month", "WeekOfYear"]:
        if col in df:
            important_features.append(col)

    if len(important_features) > 0:
        mi = mutual_info_regression(df[important_features].fillna(0), sales)
        meta["mean_mutual_info"] = np.mean(mi)
        meta["max_mutual_info"] = np.max(mi)
    else:
        meta["mean_mutual_info"] = 0
        meta["max_mutual_info"] = 0

    return meta


## 3. Build a dataset of datasets

In [259]:
meta_rows = []
for name, df in stores.items():
    m = compute_metadata(df)
    m["label"] = name      # group name
    meta_rows.append(m)

meta_df = pd.DataFrame(meta_rows)

In [260]:
meta_df

Unnamed: 0,num_rows,mean_sales,std_sales,cv_sales,skew_sales,kurtosis_sales,entropy_sales,acf_lag1,acf_lag7,acf_lag30,weekday_seasonality_strength,volatility,jump_fraction,promo_fraction,promo_sales_corr,promo2_fraction,mean_mutual_info,max_mutual_info,label
0,743,4606.578735,2481.211280,0.538623,-0.441794,-0.092137,8.295296,0.001982,-0.087699,-0.012871,0.006367,3502.579659,0.047170,0.382234,0.681371,0.0,0.205151,0.682002,store_44
1,741,5672.014845,2994.248067,0.527898,-0.705748,0.045183,8.229686,0.022688,0.011633,0.004933,0.006094,4187.874189,0.051351,0.387314,0.534914,0.0,0.154713,0.523504,store_346
2,754,5193.153846,2743.642714,0.528319,-0.557344,-0.093521,8.383364,-0.004893,0.022848,0.022079,0.006443,3885.520866,0.049137,0.393899,0.607224,1.0,0.162247,0.602030,store_331
3,761,5701.437582,2996.261045,0.525527,-0.643381,0.109564,8.311891,0.003614,0.018470,0.015027,0.005693,4227.378102,0.050000,0.383706,0.568730,1.0,0.165873,0.520040,store_572
4,759,10139.573123,6172.017506,0.608706,-0.139795,-0.469621,8.323018,0.044858,-0.025300,-0.035447,0.006025,8533.371109,0.036939,0.396574,0.658192,1.0,0.200843,0.762374,store_1014
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1110,743,5091.839838,2640.088256,0.518494,-0.779477,0.081218,8.221165,0.047158,0.031503,0.005638,0.006209,3647.001267,0.047170,0.379542,0.553360,0.0,0.154943,0.570303,store_588
1111,739,3597.783491,2023.350920,0.562388,-0.230648,0.013594,8.306722,-0.046617,0.008881,-0.009285,0.006318,2929.142004,0.048780,0.400541,0.639159,1.0,0.209718,0.680331,store_486
1112,770,5498.275325,3151.518510,0.573183,-0.405057,-0.352738,8.210001,0.002826,0.054237,-0.016821,0.005557,4451.183365,0.036411,0.385714,0.640810,0.0,0.194054,0.691120,store_917
1113,751,7627.969374,4055.272810,0.531632,-0.596036,-0.223879,8.349188,0.018526,0.031322,-0.029155,0.006250,5678.596849,0.044000,0.379494,0.638681,0.0,0.162047,0.584901,store_683


## 4. Train a decision tree to group datasets 

In [261]:
# For the purpose of grouping (training the decision tree),  use the one giant model for all the stores and Compute per-store RMSE and group them. 
# Stores with bad RMSE → “hard” group
# Stores with average RMSE → “medium”
# Stores with good RMSE → “easy”

In [262]:
global_predictor = TabularPredictor.load("AutogluonModels_5min/")

In [263]:
store_rmses = {}

for name, df in stores.items():
    perf = global_predictor.evaluate(df)
    store_rmses[name] = abs(perf["root_mean_squared_error"])

In [264]:
store_rmses

{'store_44': np.float64(427.03360952640526),
 'store_346': np.float64(616.0141636042241),
 'store_331': np.float64(585.7557191763498),
 'store_572': np.float64(496.19500154528595),
 'store_1014': np.float64(1172.7844553164641),
 'store_630': np.float64(527.3283023811134),
 'store_545': np.float64(611.9078797703415),
 'store_201': np.float64(461.2834438514096),
 'store_1096': np.float64(435.2329064714001),
 'store_1111': np.float64(501.838246002304),
 'store_918': np.float64(421.03967884933223),
 'store_563': np.float64(491.76494007260885),
 'store_731': np.float64(584.2620050975702),
 'store_406': np.float64(479.52061758359474),
 'store_1040': np.float64(633.5579906504358),
 'store_232': np.float64(539.8962895168451),
 'store_134': np.float64(358.65287362622064),
 'store_155': np.float64(495.7663021096319),
 'store_221': np.float64(442.1392740704735),
 'store_250': np.float64(625.1845422574472),
 'store_94': np.float64(532.3873351593719),
 'store_653': np.float64(435.4116964003884),
 '

In [265]:
def assign_group_by_rmse(rmse):
    if rmse > 800:
        return "hard"
    elif rmse > 650:
        return "medium"
    else:
        return "easy"

meta_df["group"] = meta_df["label"].apply(lambda s: assign_group_by_rmse(store_rmses[s]))


In [266]:
meta_df["group"].value_counts()

group
easy      860
medium    151
hard      104
Name: count, dtype: int64

In [267]:
clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_leaf=2,
    random_state=42
)

X = meta_df.drop(columns=["group" , "label"])
y = meta_df["group"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

clf.fit(X_train, y_train)

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,2
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [268]:
pred = clf.predict(X_test)
acc = accuracy_score(y_test, pred)
print("Accuracy:", acc)

Accuracy: 0.8243727598566308


## 5. Train an AutoGluon model for each group

In [269]:

# Train a model for each group
group_models = {}

for group in meta_df["group"].unique():
    print("Training model for group:", group)
    
    # get store names in this group
    store_names = meta_df[meta_df["group"] == group]["label"].tolist()
    
    # merge datasets from all stores in this group
    group_df = pd.concat([stores[name] for name in store_names], ignore_index=True)
    
    # train model for this group
    predictor = TabularPredictor(
        label="Sales",
        path=f"Models_group/{group}/"
    ).fit(
        group_df,
        presets="best",
        time_limit=600,
        dynamic_stacking=False
    )

    group_models[group] = predictor
    

Preset alias specified: 'best' maps to 'best_quality'.
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.4.0
Python Version:     3.11.11
Operating System:   Darwin
Platform Machine:   arm64
Platform Version:   Darwin Kernel Version 23.4.0: Fri Mar 15 00:19:22 PDT 2024; root:xnu-10063.101.17~1/RELEASE_ARM64_T8112
CPU Count:          8
Memory Avail:       1.33 GB / 8.00 GB (16.6%)
Disk Space Avail:   53.29 GB / 228.27 GB (23.3%)
Presets specified: ['best']
Using hyperparameters preset: hyperparameters='zeroshot'
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1


Training model for group: easy


Beginning AutoGluon training ... Time limit = 600s
AutoGluon will save models to "/Users/sepideghorbanian/Documents/Semester_5/Research_Project/Models_group/easy"
Train Data Rows:    626941
Train Data Columns: 18
Label Column:       Sales
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == int and many unique label-values observed).
	Label info (max, min, mean, stddev): (26807, 0, 5153.42366, 3159.48365)
	If 'regression' is not the correct problem_type, please manually specify the problem_type parameter during Predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression', 'quantile'])
Problem Type:       regression
Preprocessing data ...
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    1509.33 MB
	Train Data (Original)  Memory Usage: 175.78 MB (11.6% of available memory)
	Inferring data type of each feature based on column values. Se

Training model for group: hard


Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    1721.41 MB
	Train Data (Original)  Memory Usage: 21.57 MB (1.3% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 4 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting CategoryFeatureGenerator...
			Fitting CategoryMemoryMinimizeFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', [])  :  5 | ['CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 

Training model for group: medium


Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    2714.25 MB
	Train Data (Original)  Memory Usage: 30.81 MB (1.1% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
			Note: Converting 4 features to boolean dtype as they only contain 2 unique values.
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting CategoryFeatureGenerator...
			Fitting CategoryMemoryMinimizeFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Stage 5 Generators:
		Fitting DropDuplicatesFeatureGenerator...
	Types of features in original data (raw dtype, special dtypes):
		('float', [])  :  5 | ['CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 

In [271]:
group_models

{'easy': <autogluon.tabular.predictor.predictor.TabularPredictor at 0x17d44ba50>,
 'hard': <autogluon.tabular.predictor.predictor.TabularPredictor at 0x17d442ad0>,
 'medium': <autogluon.tabular.predictor.predictor.TabularPredictor at 0x152728e10>}

## 6. Final Pipeline

In [None]:
def predict_for_dataset(new_df):
    
    # 1. Compute metadata
    metadata = compute_metadata(new_df)
    meta_vec = pd.DataFrame([metadata])
    
    # 2. Predict group with probabilities*
    probs = clf.predict_proba(meta_vec)[0]
    confidence = max(probs)
    pred_group = clf.classes_[np.argmax(probs)]
    
    print("Predicted group:", pred_group)
    print("Prediction confidence:", confidence)
    
    # 3. Fallback: if confidence is too low → use global model instead
    if confidence < 0.6:
        print("Low confidence — using global model instead.")
        return global_predictor.predict(new_df), global_predictor.evaluate(new_df)
    
    # 4. Load the correct group model
    predictor = TabularPredictor.load(f"Models_group/{pred_group}/")
    
    # 5. Predict using selected group model
    preds = predictor.predict(new_df)
    evals = predictor.evaluate(new_df)
    
    return preds, evals


## 7. Usage

In [273]:
meta_df.iloc[718]

num_rows                                743
mean_sales                      5844.975774
std_sales                        2904.64343
cv_sales                           0.496947
skew_sales                         -1.02155
kurtosis_sales                     0.171965
entropy_sales                      8.281397
acf_lag1                          -0.006378
acf_lag7                          -0.074628
acf_lag30                           -0.0136
weekday_seasonality_strength       0.006606
volatility                      4123.158961
jump_fraction                      0.052561
promo_fraction                     0.384926
promo_sales_corr                    0.46406
promo2_fraction                         0.0
mean_mutual_info                   0.130918
max_mutual_info                    0.442137
label                             store_632
group                                  easy
Name: 718, dtype: object

In [274]:
incoming_df = stores["store_718"]

In [275]:
preds = predict_for_dataset(incoming_df)
preds

Predicted group: easy
Prediction confidence: 0.9754722222222222


(2250       7332.744629
 2712         -4.130377
 4306       6068.934082
 4474       8644.786133
 5042      12396.359375
               ...     
 809820     5829.947266
 810422     6641.406738
 810436     7836.470215
 812348       -2.487826
 812377     6731.804199
 Name: Sales, Length: 725, dtype: float32,
 {'root_mean_squared_error': np.float64(-536.9945376970946),
  'mean_squared_error': -288363.125,
  'mean_absolute_error': -372.951904296875,
  'r2': 0.9751595854759216,
  'pearsonr': 0.9877379406725663,
  'median_absolute_error': -282.46875})

In [276]:
global_predictor.evaluate(incoming_df)


{'root_mean_squared_error': np.float64(-483.2252021512836),
 'mean_squared_error': -233506.609375,
 'mean_absolute_error': -344.7164306640625,
 'r2': 0.9798851013183594,
 'pearsonr': 0.9898985073192784,
 'median_absolute_error': -254.26953125}

In [277]:
# seems like the small models work better on the hard datasets but worse on the easy datasets?! And for medium is almost the same.

In [278]:
# The provlem is when the decison tree predicts incorrectly and use the wrong model for it. We need to make sure that this does not happen.
# store 264 was a good example of that. With using the simple meta data and decision tree, it was predicted an easy dataset however it was a hard one, therefore it used the easy model on it which worked awfully. (RMSE=2590)
# But with the use of random forest and the new metadata calculation it was predicted as a hard dataset and by using the hard model the evaluation results were also much better than the global model.
# That is why we use a threshod for the prediction score now. 

In [None]:
# What meta features would you suggest to capture the actual difficulty of the dataset?
# Randomforest or decision tree?
