In [1]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.5


In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split,cross_val_score, KFold,RandomizedSearchCV
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.ensemble import VotingRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor, RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, SGDRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor, BaggingRegressor
from sklearn.neural_network import MLPRegressor
from catboost import CatBoostRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from math import pi
from sklearn.cluster import KMeans

# Ignore all warnings
import warnings
warnings.simplefilter("ignore")

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



### Get the raw data

This step make sure that the train and test data is in the same page.

In [2]:
# Load the datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Base features
base_features = test_df.drop(columns=['id']).columns
test_id = test_df['id']

# Concatenate train and test datasets
train_df = pd.concat([train_df[base_features], train_df['orders']], axis=1)

test_df=test_df[base_features]

train_test_df = pd.concat([train_df, test_df], sort=False).reset_index(drop=True)

date_col = 'date'

### feature extraction and processing

In [3]:
def base_features_processing(df):

    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

    df["year"] = df[date_col].dt.year.fillna(-1)
    df["month"] = df[date_col].dt.month.fillna(-1)
    df["day"] = df[date_col].dt.day.fillna(-1)
    df["day_of_week"] = df[date_col].dt.dayofweek.fillna(-1)

    df["week_of_year"] = df[date_col].dt.isocalendar().week.fillna(-1)


    df["quarter"] = df[date_col].dt.quarter.fillna(-1)
    df["is_month_start"] = df[date_col].dt.is_month_start.astype(int).fillna(-1)
    df["is_month_end"] = df[date_col].dt.is_month_end.astype(int).fillna(-1)
    df["is_quarter_start"] = df[date_col].dt.is_quarter_start.astype(int).fillna(-1)
    df["is_quarter_end"] = df[date_col].dt.is_quarter_end.astype(int).fillna(-1)


    # check if the holiday is close.
    df['holiday_before'] = df['holiday'].shift(1).fillna(0).astype(int)
    df['holiday_after'] = df['holiday'].shift(-1).fillna(0).astype(int)

    # total number of holidays in the corresponding month of that row
    df['total_holidays_month'] = df.groupby(['year', 'month'])['holiday'].transform('sum')
    # the total number of days that shops were closed in the corresponding week of that row
    df['total_shops_closed_week'] = df.groupby(['year', 'week_of_year'])['shops_closed'].transform('sum')

    df.drop(date_col, axis=1, inplace=True)

    # Replace null values in holiday_name with 'None'
    df['holiday_name'].fillna('None', inplace=True)

    # OneHotEncoding for holiday_name

    enc = OneHotEncoder(sparse=False)
    holiday_encoded = enc.fit_transform(df[['holiday_name']])

    encoded_df = pd.DataFrame(holiday_encoded, columns=enc.get_feature_names_out(['holiday_name']))
    df = pd.concat([df, encoded_df], axis=1)
    df.drop('holiday_name', axis=1, inplace=True)

    # LabelEncoding for warehouse column;

    le = LabelEncoder()
    df['warehouse'] = le.fit_transform(df['warehouse'])

    return df

In [4]:
# extract some basic features
train_test_df=base_features_processing(train_test_df)

### Work with seasonality

In [5]:
# Apply sine and cosine transformations
# The reason we do this is that we want all cyclical patterns captured
# capture seasonality
def add_fourier_terms(df, year_k, week_k, day_k):
    for k in range(1, year_k+1):
        df['year_sin_'+str(k)] = df['year'] * np.sin(2 * pi * df['year'])
        df['year_cos_'+str(k)] = df['year'] * np.cos(2 * pi * df['year'])
    for k in range(1, week_k+1):
        df['month_sin_'+str(k)] = df['month'] * np.sin(2 * pi * df['month'])
        df['month_cos_'+str(k)] = df['month'] * np.cos(2 * pi * df['month'])
    for k in range(1, day_k+1):
        df['day_sin_'+str(k)] = df['day'] * np.sin(2 * pi * df['day'])
        df['day_cos_'+str(k)] = df['day'] * np.cos(2 * pi * df['day'])
    for k in range(1, day_k+1):
        df['quarter'+str(k)] = df['quarter'] * np.sin(2 * pi * df['quarter'])
        df['quarter'+str(k)] = df['quarter'] * np.cos(2 * pi * df['quarter'])

add_fourier_terms(train_test_df, year_k= 5, week_k=5, day_k=5)

### Go back to where we start
Convert the train_test_df to seperate dfs since we have done some basic feature transformation

In [6]:
groupby_columns=['warehouse', 'holiday', 'shops_closed']
print('groupby_columns: ', groupby_columns)

train_test_df_2=train_test_df.copy()

# Convert the data back to train_df and test_df
train_df_processed = train_test_df_2[~train_test_df_2['orders'].isnull()]

#train_df_processed.dropna(inplace=True)

test_df_processed = train_test_df_2[train_test_df_2['orders'].isnull()]


test_df_processed = test_df_processed.drop(columns=['orders'])

test_data_len=len(test_df_processed)

groupby_columns:  ['warehouse', 'holiday', 'shops_closed']


In [7]:
# Fill Na to make sure
train_df_processed=train_df_processed.fillna(train_df_processed.mean())
test_df_processed=test_df_processed.fillna(test_df_processed.mean())

In [8]:
# Move target to the last column
column_to_move = train_df_processed['orders']
train_df_processed = train_df_processed.drop('orders', axis=1)
train_df_processed = pd.concat([train_df_processed, column_to_move], axis=1)

### Build up more features
	•    Interaction Features: Created to capture complex relationships between orders and other binary indicators (like holidays or shop closures).
	•	Cumulative Features: Added to understand the accumulation of orders over time within specific groups, which can help in capturing trends.
	•	Handling Missing Data: Ensures that the dataset is clean and ready for modeling by filling or dropping missing values.
	•	Sorting: Organizes the data chronologically, which is especially important for time-sensitive models.
	•	Warehouse Analysis: Counts the occurrences of each warehouse in the test set to potentially guide further processing or analysis.

In [9]:
train_df_processed['orders_holiday'] = train_df_processed['orders'] * train_df_processed['holiday']
train_df_processed['orders_wsh'] = train_df_processed['orders'] * train_df_processed['winter_school_holidays']

train_df_processed['orders_sh'] = train_df_processed['orders'] * train_df_processed['school_holidays']

train_df_processed['orders_shops_closed'] = train_df_processed['orders'] * train_df_processed['shops_closed']

#train_df_processed['daily_avg']  = train_df_processed.groupby(['warehouse','day_of_week'])['orders'].transform('mean')
#train_df_processed['monthly_avg'] = train_df_processed.groupby(['warehouse','month'])['orders'].transform('mean')

train_df_processed['cumulative_orders'] = train_df_processed.groupby(groupby_columns)['orders'].cumsum()
holiday_names=['holiday_name_1848 Revolution Memorial Day (Extra holiday)', 'holiday_name_2nd Christmas Day', "holiday_name_All Saints' Day Holiday", 'holiday_name_Christmas Eve', 'holiday_name_Cyrila a Metodej', 'holiday_name_Day of National Unity', 'holiday_name_Den boje za svobodu a demokracii', 'holiday_name_Den ceske statnosti', 'holiday_name_Den osvobozeni', 'holiday_name_Den vzniku samostatneho ceskoslovenskeho statu', 'holiday_name_Easter Monday', 'holiday_name_Good Friday', 'holiday_name_Independent Hungary Day', 'holiday_name_International womens day', 'holiday_name_Jan Hus', 'holiday_name_Labour Day', 'holiday_name_Memorial Day for the Martyrs of Arad', 'holiday_name_Memorial Day for the Victims of the Communist Dictatorships', 'holiday_name_Memorial Day for the Victims of the Holocaust', 'holiday_name_Memorial Day of the Republic', 'holiday_name_National Defense Day', 'holiday_name_New Years Day', 'holiday_name_None', 'holiday_name_Peace Festival in Augsburg', 'holiday_name_Reformation Day']
train_df_processed=train_df_processed.fillna(train_df_processed.mean())
train_df_processed.dropna(inplace=True)
train_df_processed.sort_values(by=['year','month','day'])

Unnamed: 0,warehouse,holiday,shops_closed,winter_school_holidays,school_holidays,year,month,day,day_of_week,week_of_year,...,quarter2,quarter3,quarter4,quarter5,orders,orders_holiday,orders_wsh,orders_sh,orders_shops_closed,cumulative_orders
0,4,0,0,0,0,2020,12,5,5,49,...,4.0,4.0,4.0,4.0,6895.0,0.0,0.0,0.0,0.0,6895.0
1193,0,0,0,0,0,2020,12,5,5,49,...,4.0,4.0,4.0,4.0,6447.0,0.0,0.0,0.0,0.0,6447.0
2386,5,0,0,0,0,2020,12,5,5,49,...,4.0,4.0,4.0,4.0,4154.0,0.0,0.0,0.0,0.0,4154.0
3579,6,0,0,0,0,2020,12,5,5,49,...,4.0,4.0,4.0,4.0,4091.0,0.0,0.0,0.0,0.0,4091.0
6186,1,0,0,0,0,2020,12,5,5,49,...,4.0,4.0,4.0,4.0,4623.0,0.0,0.0,0.0,0.0,4623.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2385,0,0,0,0,0,2024,3,15,4,11,...,1.0,1.0,1.0,1.0,10777.0,0.0,0.0,0.0,0.0,8364673.0
3578,5,0,0,0,0,2024,3,15,4,11,...,1.0,1.0,1.0,1.0,7140.0,0.0,0.0,0.0,0.0,5916900.0
4771,6,0,0,0,0,2024,3,15,4,11,...,1.0,1.0,1.0,1.0,6408.0,0.0,0.0,0.0,0.0,5416577.0
5556,3,0,0,0,0,2024,3,15,4,11,...,1.0,1.0,1.0,1.0,6512.0,0.0,0.0,0.0,0.0,2646950.0


In [10]:
# frequncy at which each warehouse appears in our test dataset
warehouse_counts = test_df_processed['warehouse'].value_counts().reset_index()
warehouse_counts.columns = ['warehouse', 'count']
#val=warehouse_counts['warehouse'][0]

wr_count = warehouse_counts['count'][warehouse_counts['warehouse'] == 0].item()
print(wr_count)

61


In [11]:
warehouse_counts

Unnamed: 0,warehouse,count
0,4,61
1,0,61
2,5,61
3,6,61
4,1,57
5,3,48
6,2,48


### The most important part: map the known to the unknown:
	•	The function is intended to fill in missing or unknown values in the test data (test_df) for a specific feature by using the most recent (latest) values of that feature from the training data (train_df) for each warehouse.
	•	This is particularly useful when you want to leverage the most recent patterns or trends from the training data to make better predictions on the test data.

In [12]:
# Extract the records for extra features for each warehouse and insert into test_df_processed
def get_latest_matching_record(train_df, test_df, feature):
    # Create a copy of the test dataframe
    result_df = test_df.copy()
    # Process each warehouse separately
    for warehouse in test_df['warehouse'].unique():
        # Extract the records for the current warehouse
        wr_count = warehouse_counts['count'][warehouse_counts['warehouse'] == warehouse].item()
        #print(f'wharehouse {warehouse} occurances in test df: ', wr_count)
        last_values = train_df[train_df['warehouse'] == warehouse].tail(wr_count)[feature].values
        # Get the rows corresponding to the current warehouse in the result dataframe
        warehouse_rows = result_df[result_df['warehouse'] == warehouse].index
        # Assign the last wr_count values to the corresponding rows in the result dataframe
        for i in range(wr_count):          #(min(wr_count, len(warehouse_rows))):
            result_df.loc[warehouse_rows[i], feature] = last_values[i]

    return result_df

In [13]:
test_df_processed= get_latest_matching_record(train_df_processed, test_df_processed,  'orders_holiday')
test_df_processed= get_latest_matching_record(train_df_processed, test_df_processed,  'orders_wsh')

test_df_processed= get_latest_matching_record(train_df_processed, test_df_processed,  'orders_sh')
test_df_processed= get_latest_matching_record(train_df_processed, test_df_processed,  'orders_shops_closed')
#test_df_processed= get_latest_matching_record(train_df_processed, test_df_processed, 'daily_avg')
#test_df_processed= get_latest_matching_record(train_df_processed, test_df_processed, 'monthly_avg')
test_df_processed= get_latest_matching_record(train_df_processed, test_df_processed, 'cumulative_orders')
test_df_processed=test_df_processed.fillna(test_df_processed.mean())
X = train_df_processed.drop(columns=['orders'])
y = train_df_processed['orders']
# Show the first few rows of the updated dataset
print('train_df_processed.head()', train_df_processed.head())
print('test_df_processed.head()', test_df_processed.head())
train_df_processed.to_csv('train_data.csv')
test_df_processed.to_csv('test_data.csv')

train_df_processed.head()    warehouse  holiday  shops_closed  winter_school_holidays  school_holidays  \
0          4        0             0                       0                0   
1          4        0             0                       0                0   
2          4        0             0                       0                0   
3          4        0             0                       0                0   
4          4        0             0                       0                0   

   year  month  day  day_of_week  week_of_year  ...  quarter2  quarter3  \
0  2020     12    5            5            49  ...       4.0       4.0   
1  2020     12    6            6            49  ...       4.0       4.0   
2  2020     12    7            0            50  ...       4.0       4.0   
3  2020     12    8            1            50  ...       4.0       4.0   
4  2020     12    9            2            50  ...       4.0       4.0   

   quarter4  quarter5  orders  orders_holi

### The machine learning part: get the strong baseline.
In this section, we will get a very powerful baseline. Then, we will do hyperparameters tuning to further optimize the result

In [90]:
# Set random seed
random_seed = 777
# Split train data into features and target
X = train_df_processed.drop(columns=['orders'])
y = train_df_processed['orders']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)

In [91]:
# Initialize individual models
models = {
    'XGBRegressor': XGBRegressor(),
    'LGBMRegressor': LGBMRegressor(),
    'CatBoostRegressor': CatBoostRegressor(silent=True),
    'HistGradientBoostingRegressor': HistGradientBoostingRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'SVR': SVR(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'ExtraTreesRegressor': ExtraTreesRegressor(),
    'BaggingRegressor': BaggingRegressor(),
    'SGDRegressor': SGDRegressor(),
    'MLPRegressor': MLPRegressor(max_iter=500)
}

# Dictionary to store evaluation results
results = {}

# Loop over the models
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)

    # Predict on the test set
    y_pred = model.predict(X_test)

    mape = mean_absolute_percentage_error(y_test, y_pred)

    # Store the results
    results[name] = mape

# Identifying top 5 models
# Sort the dictionary by values (scores) in ascending order and select the top 5
top_five_models = dict(sorted(results.items(), key=lambda item: item[1])[:5])

# Print the top five models and their scores
print("Best Performers:")
for model, score in top_five_models.items():
    print(f"{model}: {score}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001710 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1135
[LightGBM] [Info] Number of data points in the train set: 6973, number of used features: 62
[LightGBM] [Info] Start training from score 5540.710455
Best Performers:
RandomForestRegressor: 0.03407636143179316
XGBRegressor: 0.036126134356093756
ExtraTreesRegressor: 0.03727163636335777
BaggingRegressor: 0.037687897723861954
CatBoostRegressor: 0.03901749516021265


In [92]:
# Initialize individual models
xgb_model = XGBRegressor(random_state=random_seed)
cat_model = CatBoostRegressor(random_state=random_seed, silent=True)
hgb_model = HistGradientBoostingRegressor(random_state=random_seed)
lgb_model = LGBMRegressor(random_state=random_seed)
rf_model = RandomForestRegressor(random_state=random_seed)
et_model=ExtraTreesRegressor(random_state=random_seed)
br_model=BaggingRegressor(random_state=random_seed)
dt_model=DecisionTreeRegressor(random_state=random_seed)


# Create a VotingRegressor with the models
voting_reg = VotingRegressor(
    estimators=[
        ('xgb', xgb_model),
        ('cat', cat_model),
        ('hgb', hgb_model),
        ('lgb', lgb_model),
        ('rf', rf_model),
        ('et', et_model),
        ('br', br_model),
        ('dt', dt_model)
    ]
)

# Train the VotingRegressor
voting_reg.fit(X_train, y_train)

# Predict and evaluate on test data
y_pred = voting_reg.predict(X_test)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f'Voting Regressor MAPE: {mape:.4f}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002603 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1135
[LightGBM] [Info] Number of data points in the train set: 6973, number of used features: 62
[LightGBM] [Info] Start training from score 5540.710455
Voting Regressor MAPE: 0.0332


In [93]:
# Predict on the actual test set

submit_pred = voting_reg.predict(test_df_processed)

# Create submission file
submission = pd.DataFrame({
    'id': test_id,
    'Target': submit_pred
})

# Save submission file
submission.to_csv('submission_new1.csv', index=False)
print(submission.head(397))

                        id        Target
0      Prague_1_2024-03-16  10664.245934
1      Prague_1_2024-03-17  10389.879176
2      Prague_1_2024-03-18   9702.237360
3      Prague_1_2024-03-19   9518.588516
4      Prague_1_2024-03-20   9553.971586
..                     ...           ...
392  Budapest_1_2024-05-11   7014.795140
393  Budapest_1_2024-05-12   6622.288632
394  Budapest_1_2024-05-13   6629.828123
395  Budapest_1_2024-05-14   6830.661320
396  Budapest_1_2024-05-15   6940.213890

[397 rows x 2 columns]


### A completely different approach: AutoML

In [17]:
pip install autogluon

Collecting autogluon
  Downloading autogluon-1.1.1-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.core==1.1.1 (from autogluon.core[all]==1.1.1->autogluon)
  Downloading autogluon.core-1.1.1-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.features==1.1.1 (from autogluon)
  Downloading autogluon.features-1.1.1-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.tabular==1.1.1 (from autogluon.tabular[all]==1.1.1->autogluon)
  Downloading autogluon.tabular-1.1.1-py3-none-any.whl.metadata (13 kB)
Collecting autogluon.multimodal==1.1.1 (from autogluon)
  Downloading autogluon.multimodal-1.1.1-py3-none-any.whl.metadata (12 kB)
Collecting autogluon.timeseries==1.1.1 (from autogluon.timeseries[all]==1.1.1->autogluon)
  Downloading autogluon.timeseries-1.1.1-py3-none-any.whl.metadata (12 kB)
Collecting scipy<1.13,>=1.5.4 (from autogluon.core==1.1.1->autogluon.core[all]==1.1.1->autogluon)
  Downloading scipy-1.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

In [16]:
from autogluon.tabular import TabularPredictor, TabularDataset

In [17]:
label='orders'
train_data =train_df_processed

In [94]:
predictor = TabularPredictor(label=label, eval_metric='mean_absolute_percentage_error').fit(
    train_data=train_data,
    time_limit=6000,  # Train for up to 10 minutes
    presets='best_quality',  # Use a good balance of quality and speed
    ag_args_fit={'num_gpus': 1},  # Use 1 GPU for training
)

No path specified. Models will be saved in: "AutogluonModels/ag-20240821_104205"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.1.1
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          2
Memory Avail:       9.88 GB / 12.67 GB (77.9%)
Disk Space Avail:   41.42 GB / 78.19 GB (53.0%)
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
DyStack is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
	This is used to identify the optimal `num_stack_levels` value. Copies of AutoGluon will be fit on subsets of the data. Then holdout validation d

In [95]:
results = predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                          model  score_val                     eval_metric  pred_time_val     fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0           WeightedEnsemble_L3  -0.028206  mean_absolute_percentage_error      30.802607  3423.919656                0.000781           0.316911            3       True         38
1          ExtraTreesMSE_BAG_L2  -0.028633  mean_absolute_percentage_error      29.450267  2952.920506                0.454630          15.407227            2       True         24
2         ExtraTrees_r42_BAG_L2  -0.028708  mean_absolute_percentage_error      29.446817  2949.871347                0.451180          12.358068            2       True         37
3           WeightedEnsemble_L2  -0.028875  mean_absolute_percentage_error      19.764074  1352.443174                0.001132           0.369147            2       True         19
4        RandomForestMSE_BAG_L2  

In [99]:
predictions = predictor.predict(test_df_processed,model="ExtraTrees_r42_BAG_L2")
# Create submission file
submission = pd.DataFrame({
    'id': test_id,
    'Target': predictions.reset_index(drop="index")
})
submission.to_csv('submission_f3.csv', index=False)

### We didn't see much of the performance grow. Maybe we shrink the dataset dimensions to remove the noise.

In [66]:
train_data.shape

(7340, 85)

In [83]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
import pandas as pd

# Assuming `train_df` is your DataFrame containing the training data
X = train_data.drop(columns=['orders'])  # Replace 'orders' with your target column name
y = train_data['orders']  # Target variable

# Initialize and train the RandomForest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X, y)

# Use SelectFromModel to select the most important features
selector = SelectFromModel(rf, threshold='2*median', prefit=True)
selected_features = selector.transform(X)

# Get the names of the selected features
selected_feature_names = X.columns[selector.get_support()]

# Create a DataFrame with the selected features
important_features_df = pd.DataFrame(selected_features, columns=selected_feature_names)

# Optionally, combine the selected features with the target variable
final_df = pd.concat([important_features_df, y.reset_index(drop=True)], axis=1)

# Output the selected features and their corresponding DataFrame
print("Selected Features:")
print(selected_feature_names)

print("\nFinal DataFrame with Selected Features:")
print(final_df.head())

Selected Features:
Index(['warehouse', 'holiday', 'year', 'month', 'day', 'day_of_week',
       'week_of_year', 'quarter', 'holiday_before', 'holiday_after',
       'total_holidays_month', 'total_shops_closed_week',
       'holiday_name_Christmas Eve', 'holiday_name_New Years Day',
       'year_cos_1', 'year_cos_2', 'year_cos_3', 'year_cos_4', 'year_cos_5',
       'month_cos_1', 'month_cos_2', 'month_cos_3', 'month_cos_4',
       'month_cos_5', 'day_cos_1', 'day_cos_2', 'day_cos_3', 'day_cos_4',
       'day_cos_5', 'quarter5', 'orders_holiday', 'orders_wsh',
       'cumulative_orders'],
      dtype='object')

Final DataFrame with Selected Features:
   warehouse  holiday    year  month  day  day_of_week  week_of_year  quarter  \
0        4.0      0.0  2020.0   12.0  5.0          5.0          49.0      4.0   
1        4.0      0.0  2020.0   12.0  6.0          6.0          49.0      4.0   
2        4.0      0.0  2020.0   12.0  7.0          0.0          50.0      4.0   
3        4.0      0

In [84]:
# Transform the test data using the already fitted selector
selected_test_features = selector.transform(test_df_processed)

# Create a DataFrame with the selected features from the test set
important_test_features_df = pd.DataFrame(selected_test_features, columns=selected_feature_names)

# Output the selected features and their corresponding DataFrame
print("Selected Features in Test Set:")
print(selected_feature_names)

print("\nFinal Test DataFrame with Selected Features:")
print(important_test_features_df.head())

Selected Features in Test Set:
Index(['warehouse', 'holiday', 'year', 'month', 'day', 'day_of_week',
       'week_of_year', 'quarter', 'holiday_before', 'holiday_after',
       'total_holidays_month', 'total_shops_closed_week',
       'holiday_name_Christmas Eve', 'holiday_name_New Years Day',
       'year_cos_1', 'year_cos_2', 'year_cos_3', 'year_cos_4', 'year_cos_5',
       'month_cos_1', 'month_cos_2', 'month_cos_3', 'month_cos_4',
       'month_cos_5', 'day_cos_1', 'day_cos_2', 'day_cos_3', 'day_cos_4',
       'day_cos_5', 'quarter5', 'orders_holiday', 'orders_wsh',
       'cumulative_orders'],
      dtype='object')

Final Test DataFrame with Selected Features:
   warehouse  holiday    year  month   day  day_of_week  week_of_year  \
0        4.0      0.0  2024.0    3.0  16.0          5.0          11.0   
1        4.0      0.0  2024.0    3.0  17.0          6.0          11.0   
2        4.0      0.0  2024.0    3.0  18.0          0.0          12.0   
3        4.0      0.0  2024.0    3

In [85]:
# Set random seed
random_seed = 777
# Split train data into features and target
X = final_df.drop(columns=['orders'])
y = final_df['orders']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_seed)

In [86]:
# Initialize individual models
xgb_model = XGBRegressor(random_state=random_seed)
cat_model = CatBoostRegressor(random_state=random_seed, silent=True)
hgb_model = HistGradientBoostingRegressor(random_state=random_seed)
lgb_model = LGBMRegressor(random_state=random_seed)
rf_model = RandomForestRegressor(random_state=random_seed)
et_model=ExtraTreesRegressor(random_state=random_seed)
br_model=BaggingRegressor(random_state=random_seed)
dt_model=DecisionTreeRegressor(random_state=random_seed)


# Create a VotingRegressor with the models
voting_reg = VotingRegressor(
    estimators=[
        ('xgb', xgb_model),
        ('cat', cat_model),
        ('hgb', hgb_model),
        ('lgb', lgb_model),
        ('rf', rf_model),
        ('et', et_model),
        ('br', br_model),
        ('dt', dt_model)
    ]
)

# Train the VotingRegressor
voting_reg.fit(X_train, y_train)

# Predict and evaluate on test data
y_pred = voting_reg.predict(X_test)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f'Voting Regressor MAPE: {mape:.4f}')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000761 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 772
[LightGBM] [Info] Number of data points in the train set: 5872, number of used features: 31
[LightGBM] [Info] Start training from score 5542.194142
Voting Regressor MAPE: 0.0339


In [88]:
# Predict on the actual test set

submit_pred = voting_reg.predict(important_test_features_df)

# Create submission file
submission = pd.DataFrame({
    'id': test_id,
    'Target': submit_pred
})

# Save submission file
submission.to_csv('submission5.csv', index=False)
print(submission.head(397))

                        id        Target
0      Prague_1_2024-03-16  10734.446381
1      Prague_1_2024-03-17  10419.622697
2      Prague_1_2024-03-18   9871.772180
3      Prague_1_2024-03-19   9663.365009
4      Prague_1_2024-03-20   9648.252073
..                     ...           ...
392  Budapest_1_2024-05-11   6904.956695
393  Budapest_1_2024-05-12   6334.302450
394  Budapest_1_2024-05-13   6512.297098
395  Budapest_1_2024-05-14   6550.312603
396  Budapest_1_2024-05-15   6549.551096

[397 rows x 2 columns]


# Let's try finetuning one single model. See how good it can get

In [109]:
pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-24.7.0-py3-none-any.whl.metadata (11 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-24.7.0-py3-none-any.whl (24 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-24.7.0 scikit-optimize-0.10.2


In [111]:
# Set random seed
random_seed = 777
# Split train data into features and target
X = train_df_processed.drop(columns=['orders'])
y = train_df_processed['orders']

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=random_seed)

In [112]:
import xgboost as xgb
from skopt import BayesSearchCV
from sklearn.metrics import make_scorer, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

# Initialize the XGBRegressor
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42,tree_method='gpu_hist')

# Define the parameter space for Bayesian optimization
param_space = {
    'n_estimators': (100, 1000),
    'learning_rate': (0.01, 0.3, 'log-uniform'),
    'max_depth': (3, 10),
    'min_child_weight': (1, 10),
    'gamma': (0.0, 1.0, 'uniform'),
    'subsample': (0.6, 1.0, 'uniform'),
    'colsample_bytree': (0.6, 1.0, 'uniform'),
    'reg_alpha': (1e-9, 1.0, 'log-uniform'),
    'reg_lambda': (1e-9, 1.0, 'log-uniform')
}

# Create a custom scorer for MAPE
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

# Initialize BayesSearchCV
bayes_search = BayesSearchCV(
    estimator=xgb_model,
    search_spaces=param_space,
    n_iter=50,  # Number of parameter settings to sample
    scoring=mape_scorer,  # Use MAPE as the scoring metric
    cv=5,  # Number of cross-validation folds
    verbose=1,
    random_state=42,
    n_jobs=-1  # Use all available cores
)

# Fit BayesSearchCV to find the best parameters
bayes_search.fit(X_train, y_train)

# Print the best parameters and the best score
print(f"Best parameters found: {bayes_search.best_params_}")
print(f"Best score (MAPE): {abs(bayes_search.best_score_):.4f}")

# Evaluate the model with the best parameters on the validation set
best_model = bayes_search.best_estimator_
y_pred = best_model.predict(X_val)
mape = mean_absolute_percentage_error(y_val, y_pred)
print(f"Validation MAPE: {mape:.4f}")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

In [113]:
# Predict on the actual test set
submit_pred = best_model.predict(test_df_processed)

# Create submission file
submission = pd.DataFrame({
    'id': test_id,
    'Target': submit_pred
})

# Save submission file
submission.to_csv('submission_single_best_model.csv', index=False)
print(submission.head(397))

                        id        Target
0      Prague_1_2024-03-16  10539.750000
1      Prague_1_2024-03-17  10352.125000
2      Prague_1_2024-03-18   9857.996094
3      Prague_1_2024-03-19   9647.023438
4      Prague_1_2024-03-20   9597.051758
..                     ...           ...
392  Budapest_1_2024-05-11   7199.659180
393  Budapest_1_2024-05-12   6539.994629
394  Budapest_1_2024-05-13   6582.928223
395  Budapest_1_2024-05-14   6618.795898
396  Budapest_1_2024-05-15   6464.374023

[397 rows x 2 columns]


# let's try tune the parameters of our stacking model. (mannully)

In [None]:
from skopt import BayesSearchCV
from sklearn.model_selection import KFold
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import HistGradientBoostingRegressor, VotingRegressor, RandomForestRegressor, ExtraTreesRegressor, BaggingRegressor
from lightgbm import LGBMRegressor
from sklearn.tree import DecisionTreeRegressor  # Correct import
import numpy as np

# Initialize individual models with GPU support where applicable
xgb_model = xgb.XGBRegressor(random_state=random_seed, tree_method='gpu_hist')
cat_model = CatBoostRegressor(random_state=random_seed, silent=True, task_type="GPU")
hgb_model = HistGradientBoostingRegressor(random_state=random_seed)
lgb_model = LGBMRegressor(random_state=random_seed, n_jobs=-1)
rf_model = RandomForestRegressor(random_state=random_seed)
et_model = ExtraTreesRegressor(random_state=random_seed)
br_model = BaggingRegressor(random_state=random_seed)
dt_model = DecisionTreeRegressor(random_state=random_seed)

# Voting Regressor without tuning
voting_reg = VotingRegressor(
    estimators=[
        ('xgb', xgb_model),
        ('cat', cat_model),
        ('hgb', hgb_model),
        ('lgb', lgb_model),
        ('rf', rf_model),
        ('et', et_model),
        ('br', br_model),
        ('dt', dt_model),
    ]
)

# Define parameter grid for Bayesian Optimization
param_grid = {
    'xgb__n_estimators': (100, 1000),
    'xgb__max_depth': (3, 10),
    'xgb__learning_rate': (0.01, 0.3, 'log-uniform'),
    'xgb__subsample': (0.6, 1.0),
    'cat__depth': (3, 10),
    'cat__learning_rate': (0.01, 0.3, 'log-uniform'),
    'cat__l2_leaf_reg': (1, 10),
    'hgb__max_iter': (100, 1000),
    'hgb__learning_rate': (0.01, 0.3, 'log-uniform'),
    'hgb__max_depth': (3, 10),
    'lgb__n_estimators': (100, 1000),
    'lgb__max_depth': (-1, 10),
    'lgb__learning_rate': (0.01, 0.3, 'log-uniform'),
    'lgb__subsample': (0.6, 1.0),
    'rf__n_estimators': (100, 1000),
    'rf__max_depth': (3, 20),
    'et__n_estimators': (100, 1000),
    'et__max_depth': (3, 20),
    'br__n_estimators': (10, 100),
    'dt__max_depth': (3, 20),
}

# Bayesian search
opt = BayesSearchCV(
    estimator=voting_reg,
    search_spaces=param_grid,
    n_iter=50,  # Increased number of iterations for more thorough search
    cv=KFold(n_splits=5),
    scoring='neg_mean_absolute_percentage_error',
    n_jobs=-1,
    verbose=1,
    random_state=random_seed
)

# Fit the model
opt.fit(X_train, y_train)

# Best model evaluation
best_model = opt.best_estimator_
y_pred = best_model.predict(X_test)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f'Optimized Voting Regressor MAPE: {mape:.4f}')
print(f'Best parameters found: {opt.best_params_}')

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [None]:
# Predict on the actual test set
submit_pred = best_model.predict(test_df_processed)

# Create submission file
submission = pd.DataFrame({
    'id': test_id,
    'Target': submit_pred
})

# Save submission file
submission.to_csv('submission_stacking_best_model.csv', index=False)
print(submission.head(397))