In [192]:
# Import necessary libraries for data manipulation and analysis
import pandas as pd
import numpy as np

# Import mathematical constant for potential circular calculations
from math import pi

# Import plotting library for data visualization
import matplotlib.pyplot as plt

# Import preprocessing tools from scikit-learn
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder

# Import tools for splitting data and model evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error

# Import LightGBM for gradient boosting framework
import lightgbm as lgb
from lightgbm import LGBMRegressor

# Import XGBoost for another gradient boosting framework
import xgboost as xgb
from xgboost import XGBRegressor

# Import CatBoost for a different gradient boosting algorithm
from catboost import CatBoostRegressor

# Import ensemble methods from scikit-learn
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

# Import linear and logistic regression models
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

# Import boosting and decision tree algorithms
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

# Import tools for hyperparameter tuning
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import KFold

from skopt import BayesSearchCV
from skopt.space import Real, Integer
from scipy.optimize import minimize

from sklearn.metrics import mean_squared_error
# Suppress all warnings to keep output clean
import warnings
warnings.simplefilter("ignore")
from sklearn.preprocessing import LabelEncoder

In [193]:
from sklearn.metrics import make_scorer, mean_absolute_percentage_error
from skopt.space import Real, Integer, Categorical
mape_scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)

In [194]:
# Load the training dataset from the specified Kaggle input path
base_train_df = pd.read_csv('train.csv')
# Display the first 5 rows of the training dataset for a quick overview
base_train_df.head(5)

Unnamed: 0,warehouse,date,orders,holiday_name,holiday,shutdown,mini_shutdown,shops_closed,winter_school_holidays,school_holidays,blackout,mov_change,frankfurt_shutdown,precipitation,snow,user_activity_1,user_activity_2,id
0,Prague_1,2020-12-05,6895.0,,0,0,0,0,0,0,0,0.0,0,0.0,0.0,1722.0,32575.0,Prague_1_2020-12-05
1,Prague_1,2020-12-06,6584.0,,0,0,0,0,0,0,0,0.0,0,0.0,0.0,1688.0,32507.0,Prague_1_2020-12-06
2,Prague_1,2020-12-07,7030.0,,0,0,0,0,0,0,0,0.0,0,0.0,0.0,1696.0,32552.0,Prague_1_2020-12-07
3,Prague_1,2020-12-08,6550.0,,0,0,0,0,0,0,0,0.0,0,0.8,0.0,1681.0,32423.0,Prague_1_2020-12-08
4,Prague_1,2020-12-09,6910.0,,0,0,0,0,0,0,0,0.0,0,0.5,0.0,1704.0,32410.0,Prague_1_2020-12-09


In [195]:
# Load the test dataset from the specified Kaggle input path
base_test_df = pd.read_csv('test.csv')
# Display the first 5 rows of the test dataset for a quick overview
base_test_df.head(5)

Unnamed: 0,warehouse,date,holiday_name,holiday,shops_closed,winter_school_holidays,school_holidays,id
0,Prague_1,2024-03-16,,0,0,0,0,Prague_1_2024-03-16
1,Prague_1,2024-03-17,,0,0,0,0,Prague_1_2024-03-17
2,Prague_1,2024-03-18,,0,0,0,0,Prague_1_2024-03-18
3,Prague_1,2024-03-19,,0,0,0,0,Prague_1_2024-03-19
4,Prague_1,2024-03-20,,0,0,0,0,Prague_1_2024-03-20


In [196]:
# Define base features by excluding the 'id' column from the test dataset
base_features = base_test_df.drop(columns=['id']).columns
# Extract the 'id' column from the test dataset for later use in predictions
test_id = base_test_df['id']

In [197]:
# Concatenate base features with the 'orders' column from the training dataset
# This creates a new dataframe with only the features and target variable for training
train_df = pd.concat([base_train_df[base_features], base_train_df['orders']], axis=1)

# Prepare the test dataset by selecting only the base features
test_df = base_test_df[base_features]

> ## `Preprocessing`

In [198]:
# Display information about the structure of the training and test datasets
print(train_df.info())
print('='*60)
print(test_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7340 entries, 0 to 7339
Data columns (total 8 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   warehouse               7340 non-null   object 
 1   date                    7340 non-null   object 
 2   holiday_name            218 non-null    object 
 3   holiday                 7340 non-null   int64  
 4   shops_closed            7340 non-null   int64  
 5   winter_school_holidays  7340 non-null   int64  
 6   school_holidays         7340 non-null   int64  
 7   orders                  7340 non-null   float64
dtypes: float64(1), int64(4), object(3)
memory usage: 458.9+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 397 entries, 0 to 396
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   warehouse               397 non-null    object
 1   date                    397 no

In [199]:
# Combine training and test datasets for feature engineering
all_df = pd.concat([train_df, test_df], sort=False).reset_index(drop=True)

In [200]:
# Convert 'date' column to datetime for easier manipulation
date_start = pd.to_datetime(all_df['date'], errors='coerce').min()

# Extract various time-based features from the 'date' column
date_col = ['date']
for _col in date_col:
    date_col = pd.to_datetime(all_df[_col], errors='coerce')
    # Extract year, month, day, etc., from the date
    all_df[_col + "_year"] = date_col.dt.year.fillna(-1)
    all_df[_col + "_month"] = date_col.dt.month.fillna(-1)
    all_df[_col + "_day"] = date_col.dt.day.fillna(-1)
    all_df[_col + "_day_of_week"] = date_col.dt.dayofweek.fillna(-1)
    all_df[_col + "_week_of_year"] = date_col.dt.isocalendar().week.fillna(-1)

    # Calculate number of days since the start date
    all_df[_col + "_num"] = (date_col-date_start).dt.days.fillna(-1)
    
    # Adjust day of year for leap years
    all_df[_col + "_day_of_year"] = date_col.dt.dayofyear.fillna(-1)
    all_df[_col + "_day_of_year"] = np.where( (all_df[_col + "_year"]%4==0)&(all_df[_col + "_month"]>2), 
                                              all_df[_col + "_day_of_year"]-1, 
                                              all_df[_col + "_day_of_year"])

    # Extract quarter information
    all_df[_col + "_quarter"] = date_col.dt.quarter.fillna(-1)
    
    # Create boolean features for start and end of various time periods
    all_df[_col + "_is_month_start"] = date_col.dt.is_month_start.astype(int).fillna(-1)
    all_df[_col + "_is_month_end"] = date_col.dt.is_month_end.astype(int).fillna(-1)
    all_df[_col + "_is_quarter_start"] = date_col.dt.is_quarter_start.astype(int).fillna(-1)
    all_df[_col + "_is_quarter_end"] = date_col.dt.is_quarter_end.astype(int).fillna(-1)

# Reconvert 'date' column to datetime type for consistency
all_df['date'] = pd.to_datetime(all_df['date'])

# Display the resulting dataframe with all engineered features
all_df

Unnamed: 0,warehouse,date,holiday_name,holiday,shops_closed,winter_school_holidays,school_holidays,orders,date_year,date_month,date_day,date_day_of_week,date_week_of_year,date_num,date_day_of_year,date_quarter,date_is_month_start,date_is_month_end,date_is_quarter_start,date_is_quarter_end
0,Prague_1,2020-12-05,,0,0,0,0,6895.0,2020,12,5,5,49,0,339,4,0,0,0,0
1,Prague_1,2020-12-06,,0,0,0,0,6584.0,2020,12,6,6,49,1,340,4,0,0,0,0
2,Prague_1,2020-12-07,,0,0,0,0,7030.0,2020,12,7,0,50,2,341,4,0,0,0,0
3,Prague_1,2020-12-08,,0,0,0,0,6550.0,2020,12,8,1,50,3,342,4,0,0,0,0
4,Prague_1,2020-12-09,,0,0,0,0,6910.0,2020,12,9,2,50,4,343,4,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7732,Budapest_1,2024-05-11,,0,0,0,0,,2024,5,11,5,19,1253,131,2,0,0,0,0
7733,Budapest_1,2024-05-12,,0,0,0,0,,2024,5,12,6,19,1254,132,2,0,0,0,0
7734,Budapest_1,2024-05-13,,0,0,0,0,,2024,5,13,0,20,1255,133,2,0,0,0,0
7735,Budapest_1,2024-05-14,,0,0,0,0,,2024,5,14,1,20,1256,134,2,0,0,0,0


In [201]:
# Apply sine and cosine transformations to capture cyclical patterns in time

# Commented out: Yearly sine and cosine transformations based on the year itself
# Import the math library for pi
from math import pi

all_df['month_sin'] = all_df['date_month'] * np.sin(2 * pi * all_df['date_month'])
all_df['month_cos'] = all_df['date_month'] * np.cos(2 * pi * all_df['date_month'])
all_df['day_sin'] = all_df['date_day'] * np.sin(2 * pi * all_df['date_day'])
all_df['day_cos'] = all_df['date_day'] * np.cos(2 * pi * all_df['date_day'])

all_df['year_sin'] = np.sin(2 * pi * all_df["date_day_of_year"])
all_df['year_cos'] = np.cos(2 * pi * all_df['date_day_of_year'])


In [202]:
# Handle missing values in the 'holiday_name' column by replacing them with 'None'
all_df['holiday_name'].fillna('None', inplace=True)

In [203]:
# Perform One-Hot Encoding on the 'holiday_name' column
enc = OneHotEncoder(sparse_output=False)

# Fit and transform the 'holiday_name' column into a new encoded dataframe
holiday_encoded = enc.fit_transform(all_df[['holiday_name']])
# Create a dataframe from the encoded data with appropriate column names
encoded_df = pd.DataFrame(holiday_encoded, columns=enc.get_feature_names_out(['holiday_name']))
# Concatenate the original dataframe with the encoded holiday features
all_df = pd.concat([all_df, encoded_df], axis=1)

In [204]:
# Remove the original 'holiday_name' column after encoding
all_df = all_df.drop('holiday_name', axis=1)

# Apply Label Encoding to the 'warehouse' column
le = preprocessing.LabelEncoder()
# Transform categorical 'warehouse' data into numerical labels
all_df['warehouse'] = le.fit_transform(all_df['warehouse'])

# Commented out: Label Encoding for 'holiday_name' (not used due to One-Hot Encoding)
# all_df['holiday_name'] = le.fit_transform(all_df['holiday_name'])

In [205]:
# Feature engineering for holidays: create features for the day before and after holidays
all_df['holiday_before'] = all_df['holiday'].shift(1).fillna(0).astype(int)
all_df['holiday_after'] = all_df['holiday'].shift(-1).fillna(0).astype(int)

In [206]:
# Split the data back into training and test sets based on the presence of 'orders'
train_df_le = all_df[~all_df['orders'].isnull()]
test_df_le = all_df[all_df['orders'].isnull()]

# Remove the 'date' column from both datasets as it's no longer needed for modeling
train_df_le = train_df_le.drop(columns=['date'], axis=1)
test_df_le = test_df_le.drop(columns=['date'], axis=1)

> ## `Modeling (Ensemble + Stacking)`

**Ensemble**
* LightGBM 
* XGBoost 
* RandomForest 
* CatBoost
* Logistic Regression
* Ada Boost
* Decision Tree
* Gradient Boost

In [207]:
# Data Splitting
# Set a random seed for reproducibility
random_seed = 777 

# Prepare features and target variable
X = train_df_le.drop(columns=['orders'])
y = train_df_le['orders']

# buggy part
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1, random_state=random_seed)

In [164]:
# Define search spaces for Bayesian Optimization
search_spaces = {
    'lgb': {
        'n_estimators': Integer(100, 1000),
        'num_leaves': Integer(20, 100),
        'learning_rate': Real(0.001, 0.1, prior='log-uniform'),
        'colsample_bytree': Real(0.4, 1.0),
        'subsample': Real(0.5, 1.0),
        'min_child_weight': Integer(1, 10),
        'lambda_l1': Real(0, 1),
        'lambda_l2': Real(0, 1)
    },
    'xgb': {
        'n_estimators': Integer(100, 1000),
        'max_depth': Integer(3, 15),
        'learning_rate': Real(0.001, 0.1, prior='log-uniform'),
        'colsample_bytree': Real(0.4, 1.0),
        'subsample': Real(0.5, 1.0),
        'min_child_weight': Integer(1, 10),
        'gamma': Real(0, 5),
        'lambda': Real(0, 1),
        'alpha': Real(0, 1)
    },
    'cat': {
        'iterations': Integer(100, 1000),
        'depth': Integer(4, 12),
        'learning_rate': Real(0.001, 0.1, prior='log-uniform'),
        'l2_leaf_reg': Real(1, 10),
        'border_count': Integer(32, 255),
        'bagging_temperature': Real(0.0, 1.0),
        'random_strength': Real(0.0, 1.0)
    },
    'rf': {
        'n_estimators': Integer(100, 1000),
        'max_depth': Integer(5, 50),
        'min_samples_split': Integer(2, 20),
        'min_samples_leaf': Integer(1, 10),
        'max_features': Real(0.1, 1.0)
    }
}

# Initialize the models with default parameters and GPU/multicore settings
models = {
    'lgb': LGBMRegressor(device='gpu',random_state=random_seed),
    'xgb': XGBRegressor(tree_method='gpu_hist', random_state=random_seed),
    'cat': CatBoostRegressor(task_type='GPU', random_state=random_seed, verbose=0),
    'rf': RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=random_seed)
}


# models which won't be tuned
lr_model = LogisticRegression(random_state=random_seed)
ad_model = AdaBoostRegressor(random_state=random_seed)
dt_model = DecisionTreeRegressor(random_state=random_seed)
gb_model = GradientBoostingRegressor(random_state=random_seed)


lr_model.fit(X_train, y_train)
ad_model.fit(X_train, y_train)
dt_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)

# Bayesian Optimization for each model
opt_models = {}
for model_name, model in models.items():
    opt = BayesSearchCV(estimator=model, search_spaces=search_spaces.get(model_name, {}), n_iter=30, cv=5, random_state=random_seed,verbose=1,scoring=mape_scorer)
    opt.fit(X_train, y_train)
    opt_models[model_name] = opt.best_estimator_
    print(model_name, opt.best_score_, opt.best_params_)

# untuned model; doing this for time saving
opt_models['lr'] =lr_model
opt_models['ad'] =ad_model
opt_models['dt'] =dt_model
opt_models['gb'] =gb_model

# Train optimized models on full training data
stacking_train = np.zeros((X_train.shape[0], len(opt_models)))

for i, (model_name, model) in enumerate(opt_models.items()):
    stacking_train[:, i] = model.predict(X_train)


# Define search spaces for the meta-models
meta_search_spaces = {
    'meta_model_1': {  # LGBMRegressor
        'n_estimators': Integer(100, 1000),
        'num_leaves': Integer(20, 100),
        'learning_rate': Real(0.001, 0.1, prior='log-uniform'),
        'colsample_bytree': Real(0.4, 1.0),
    },
    'meta_model_2': {  # CatBoostRegressor
        'iterations': Integer(100, 1000),
        'depth': Integer(4, 12),
        'learning_rate': Real(0.001, 0.1, prior='log-uniform')
    },
    'meta_model_3': {  # XGBRegressor
        'n_estimators': Integer(100, 1000),
        'max_depth': Integer(3, 15),
        'learning_rate': Real(0.001, 0.1, prior='log-uniform'),
        'colsample_bytree': Real(0.4, 1.0),
        'subsample': Real(0.5, 1.0)
    },
    'meta_model_4': {  # RandomForestRegressor
        'n_estimators': Integer(100, 1000),
        'max_depth': Integer(5, 50),
        'min_samples_split': Integer(2, 20),
        'min_samples_leaf': Integer(1, 10),
        'max_features': Real(0.1, 1.0)
    },
    'meta_model_5': {  # GradientBoostingRegressor
        'n_estimators': Integer(100, 1000),
        'max_depth': Integer(3, 15),
        'learning_rate': Real(0.001, 0.1, prior='log-uniform'),
        'subsample': Real(0.5, 1.0),
        'min_samples_split': Integer(2, 20),
        'min_samples_leaf': Integer(1, 10),
        'max_features': Real(0.1, 1.0)
    },
    'meta_model_6': {  # DecisionTreeRegressor
        'max_depth': Integer(3, 15),
        'min_samples_split': Integer(2, 20),
        'min_samples_leaf': Integer(1, 10),
        'max_features': Categorical(['sqrt', 'log2']),
        'ccp_alpha': Real(0.0, 0.1)  # Cost-complexity pruning parameter
    },
    'meta_model_7': {  # AdaBoostRegressor
        'n_estimators': Integer(50, 500),
        'learning_rate': Real(0.001, 1.0, prior='log-uniform'),
        'loss': Categorical(['linear', 'square', 'exponential'])
    },
    'meta_model_8': {  # LinearRegression
        'fit_intercept': Categorical([True, False])
    }
}


# Initialize and optimize meta-models using Bayesian Optimization
meta_models = {
    'meta_model_1': LGBMRegressor(device='gpu', random_state=random_seed),
    'meta_model_2': CatBoostRegressor(task_type='GPU', random_state=random_seed, verbose=0),
    'meta_model_3': XGBRegressor(tree_method='gpu_hist', random_state=random_seed),
    'meta_model_4': RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=random_seed),
    'meta_model_5': GradientBoostingRegressor(random_state=random_seed),
    'meta_model_6': DecisionTreeRegressor(random_state=random_seed),
    'meta_model_7': AdaBoostRegressor(random_state=random_seed),
    'meta_model_8': LinearRegression(n_jobs=-1)
}

opt_meta_models = {}
for model_name, model in meta_models.items():
    opt_meta = BayesSearchCV(estimator=model, search_spaces=meta_search_spaces.get(model_name, {}), n_iter=10, cv=5, random_state=random_seed, verbose=1, scoring=mape_scorer)
    opt_meta.fit(stacking_train, y_train)
    opt_meta_models[model_name] = opt_meta.best_estimator_
    print(model_name, opt_meta.best_score_, opt_meta.best_params_)

# drop the order column 
test_df_le=test_df_le.drop(columns=['orders'])

level_2_train = np.zeros((X_train.shape[0], len(opt_meta_models)))
for i, (model_name, model) in enumerate(opt_meta_models.items()):
    print(model_name)
    level_2_train[:, i] = model.predict(stacking_train)

# the model first pass through original layer
stacking_t=np.zeros((test_df_le.shape[0], len(opt_models)))
for i, (model_name, model) in enumerate(opt_models.items()):
    stacking_t[:, i] = model.predict(test_df_le)
# pass the second layer. Which is meta-layer
meta_stack=np.zeros((test_df_le.shape[0], len(opt_meta_models)))
for i, (model_name, model) in enumerate(opt_meta_models.items()):
    meta_stack[:, i] = model.predict(stacking_t)


initial_weights = np.ones(len(meta_models)) / len(meta_models)

# Final prediction
submit_pred = np.dot(meta_stack, initial_weights)


Fitting 5 folds for each of 1 candidates, totalling 5 fits
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 975
[LightGBM] [Info] Number of data points in the train set: 5871, number of used features: 25
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4090, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 14 dense feature groups (0.09 MB) transferred to GPU in 0.001371 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 5542.428717
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 976
[LightGBM] [Info] Number of data points in the train set: 5871, number of used features: 26
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4090, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been bui

In [None]:
'colsample_bytree': 0.9090704690562509, 'lambda_l1': 0.5123226527180421, 'lambda_l2': 0.17762444404308067, 'learning_rate': 0.04939196011309037, 'min_child_weight': 8, 'n_estimators': 501, 'num_leaves': 53, 'subsample': 0.5052004980567765}
alpha': 0.0, 'colsample_bytree': 1.0, 'gamma': 5.0, 'lambda': 0.8926060892693777, 'learning_rate': 0.025678181688152554, 'max_depth': 15, 'min_child_weight': 10, 'n_estimators': 686, 'subsample': 0.5
bagging_temperature': 1.0, 'border_count': 133, 'depth': 12, 'iterations': 1000, 'l2_leaf_reg': 10.0, 'learning_rate': 0.1, 'random_strength': 0.0
max_depth': 31, 'max_features': 1.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 1000

> ## `Submit`

In [165]:
# Create a DataFrame for submission with 'id' and 'Target'
submission = pd.DataFrame({
    'id': test_id,         # Test IDs for each prediction
    'Target': submit_pred  # Final predictions for submission
})

# Save the submission DataFrame to a CSV file
submission.to_csv('submission_ultimate1.csv', index=False)  # Save without including row indices

# Print the submission DataFrame to verify the results
print(submission)

                        id        Target
0      Prague_1_2024-03-16  10354.786501
1      Prague_1_2024-03-17  10283.978299
2      Prague_1_2024-03-18  10004.316190
3      Prague_1_2024-03-19   9983.068800
4      Prague_1_2024-03-20   9983.252076
..                     ...           ...
392  Budapest_1_2024-05-11   7123.248690
393  Budapest_1_2024-05-12   6724.833991
394  Budapest_1_2024-05-13   7992.263321
395  Budapest_1_2024-05-14   7997.529854
396  Budapest_1_2024-05-15   7996.436605

[397 rows x 2 columns]


###  Different data preprocessing
Let's use different data scheme but same model

In [180]:
# Load the datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Base features
base_features = test_df.drop(columns=['id']).columns
test_id = test_df['id']

# Concatenate train and test datasets
train_df = pd.concat([train_df[base_features], train_df['orders']], axis=1)

test_df=test_df[base_features]

train_test_df = pd.concat([train_df, test_df], sort=False).reset_index(drop=True)

date_col = 'date'


def base_features_processing(df):

    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

    df["year"] = df[date_col].dt.year.fillna(-1)
    df["month"] = df[date_col].dt.month.fillna(-1)
    df["day"] = df[date_col].dt.day.fillna(-1)
    df["day_of_week"] = df[date_col].dt.dayofweek.fillna(-1)

    df["week_of_year"] = df[date_col].dt.isocalendar().week.fillna(-1)


    df["quarter"] = df[date_col].dt.quarter.fillna(-1)
    df["is_month_start"] = df[date_col].dt.is_month_start.astype(int).fillna(-1)
    df["is_month_end"] = df[date_col].dt.is_month_end.astype(int).fillna(-1)
    df["is_quarter_start"] = df[date_col].dt.is_quarter_start.astype(int).fillna(-1)
    df["is_quarter_end"] = df[date_col].dt.is_quarter_end.astype(int).fillna(-1)


    # check if the holiday is close.
    df['holiday_before'] = df['holiday'].shift(1).fillna(0).astype(int)
    df['holiday_after'] = df['holiday'].shift(-1).fillna(0).astype(int)

    # total number of holidays in the corresponding month of that row
    df['total_holidays_month'] = df.groupby(['year', 'month'])['holiday'].transform('sum')
    # the total number of days that shops were closed in the corresponding week of that row
    df['total_shops_closed_week'] = df.groupby(['year', 'week_of_year'])['shops_closed'].transform('sum')

    df.drop(date_col, axis=1, inplace=True)

    # Replace null values in holiday_name with 'None'
    df['holiday_name'].fillna('None', inplace=True)

    # OneHotEncoding for holiday_name

    enc = OneHotEncoder(sparse_output=False)
    holiday_encoded = enc.fit_transform(df[['holiday_name']])

    encoded_df = pd.DataFrame(holiday_encoded, columns=enc.get_feature_names_out(['holiday_name']))
    df = pd.concat([df, encoded_df], axis=1)
    df.drop('holiday_name', axis=1, inplace=True)

    # LabelEncoding for warehouse column;

    le = LabelEncoder()
    df['warehouse'] = le.fit_transform(df['warehouse'])

    return df
train_test_df=base_features_processing(train_test_df)
# Apply sine and cosine transformations
# The reason we do this is that we want all cyclical patterns captured
# capture seasonality
def add_fourier_terms(df, year_k, week_k, day_k):
    for k in range(1, year_k+1):
        df['year_sin_'+str(k)] = df['year'] * np.sin(2 * pi * df['year'])
        df['year_cos_'+str(k)] = df['year'] * np.cos(2 * pi * df['year'])
    for k in range(1, week_k+1):
        df['month_sin_'+str(k)] = df['month'] * np.sin(2 * pi * df['month'])
        df['month_cos_'+str(k)] = df['month'] * np.cos(2 * pi * df['month'])
    for k in range(1, day_k+1):
        df['day_sin_'+str(k)] = df['day'] * np.sin(2 * pi * df['day'])
        df['day_cos_'+str(k)] = df['day'] * np.cos(2 * pi * df['day'])
    for k in range(1, day_k+1):
        df['quarter'+str(k)] = df['quarter'] * np.sin(2 * pi * df['quarter'])
        df['quarter'+str(k)] = df['quarter'] * np.cos(2 * pi * df['quarter'])

add_fourier_terms(train_test_df, year_k= 5, week_k=5, day_k=5)
groupby_columns=['warehouse', 'holiday', 'shops_closed']
print('groupby_columns: ', groupby_columns)

train_test_df_2=train_test_df.copy()

# Convert the data back to train_df and test_df
train_df_processed = train_test_df_2[~train_test_df_2['orders'].isnull()]

#train_df_processed.dropna(inplace=True)

test_df_processed = train_test_df_2[train_test_df_2['orders'].isnull()]


test_df_processed = test_df_processed.drop(columns=['orders'])

test_data_len=len(test_df_processed)
# Fill Na to make sure
train_df_processed=train_df_processed.fillna(train_df_processed.mean())
test_df_processed=test_df_processed.fillna(test_df_processed.mean())
# Move target to the last column
column_to_move = train_df_processed['orders']
train_df_processed = train_df_processed.drop('orders', axis=1)
train_df_processed = pd.concat([train_df_processed, column_to_move], axis=1)
train_df_processed['orders_holiday'] = train_df_processed['orders'] * train_df_processed['holiday']
train_df_processed['orders_wsh'] = train_df_processed['orders'] * train_df_processed['winter_school_holidays']

train_df_processed['orders_sh'] = train_df_processed['orders'] * train_df_processed['school_holidays']

train_df_processed['orders_shops_closed'] = train_df_processed['orders'] * train_df_processed['shops_closed']

#train_df_processed['daily_avg']  = train_df_processed.groupby(['warehouse','day_of_week'])['orders'].transform('mean')
#train_df_processed['monthly_avg'] = train_df_processed.groupby(['warehouse','month'])['orders'].transform('mean')

train_df_processed['cumulative_orders'] = train_df_processed.groupby(groupby_columns)['orders'].cumsum()
holiday_names=['holiday_name_1848 Revolution Memorial Day (Extra holiday)', 'holiday_name_2nd Christmas Day', "holiday_name_All Saints' Day Holiday", 'holiday_name_Christmas Eve', 'holiday_name_Cyrila a Metodej', 'holiday_name_Day of National Unity', 'holiday_name_Den boje za svobodu a demokracii', 'holiday_name_Den ceske statnosti', 'holiday_name_Den osvobozeni', 'holiday_name_Den vzniku samostatneho ceskoslovenskeho statu', 'holiday_name_Easter Monday', 'holiday_name_Good Friday', 'holiday_name_Independent Hungary Day', 'holiday_name_International womens day', 'holiday_name_Jan Hus', 'holiday_name_Labour Day', 'holiday_name_Memorial Day for the Martyrs of Arad', 'holiday_name_Memorial Day for the Victims of the Communist Dictatorships', 'holiday_name_Memorial Day for the Victims of the Holocaust', 'holiday_name_Memorial Day of the Republic', 'holiday_name_National Defense Day', 'holiday_name_New Years Day', 'holiday_name_None', 'holiday_name_Peace Festival in Augsburg', 'holiday_name_Reformation Day']
train_df_processed=train_df_processed.fillna(train_df_processed.mean())
train_df_processed.dropna(inplace=True)
train_df_processed.sort_values(by=['year','month','day'])
# frequncy at which each warehouse appears in our test dataset
warehouse_counts = test_df_processed['warehouse'].value_counts().reset_index()
warehouse_counts.columns = ['warehouse', 'count']
#val=warehouse_counts['warehouse'][0]

wr_count = warehouse_counts['count'][warehouse_counts['warehouse'] == 0].item()
print(wr_count)
# Extract the records for extra features for each warehouse and insert into test_df_processed
def get_latest_matching_record(train_df, test_df, feature):
    # Create a copy of the test dataframe
    result_df = test_df.copy()
    # Process each warehouse separately
    for warehouse in test_df['warehouse'].unique():
        # Extract the records for the current warehouse
        wr_count = warehouse_counts['count'][warehouse_counts['warehouse'] == warehouse].item()
        #print(f'wharehouse {warehouse} occurances in test df: ', wr_count)
        last_values = train_df[train_df['warehouse'] == warehouse].tail(wr_count)[feature].values
        # Get the rows corresponding to the current warehouse in the result dataframe
        warehouse_rows = result_df[result_df['warehouse'] == warehouse].index
        # Assign the last wr_count values to the corresponding rows in the result dataframe
        for i in range(wr_count):          #(min(wr_count, len(warehouse_rows))):
            result_df.loc[warehouse_rows[i], feature] = last_values[i]

    return result_df
test_df_processed= get_latest_matching_record(train_df_processed, test_df_processed,  'orders_holiday')
test_df_processed= get_latest_matching_record(train_df_processed, test_df_processed,  'orders_wsh')

test_df_processed= get_latest_matching_record(train_df_processed, test_df_processed,  'orders_sh')
test_df_processed= get_latest_matching_record(train_df_processed, test_df_processed,  'orders_shops_closed')
#test_df_processed= get_latest_matching_record(train_df_processed, test_df_processed, 'daily_avg')
#test_df_processed= get_latest_matching_record(train_df_processed, test_df_processed, 'monthly_avg')
test_df_processed= get_latest_matching_record(train_df_processed, test_df_processed, 'cumulative_orders')
test_df_processed=test_df_processed.fillna(test_df_processed.mean())
X = train_df_processed.drop(columns=['orders'])
y = train_df_processed['orders']
# Show the first few rows of the updated dataset
print('train_df_processed.head()', train_df_processed.head())
print('test_df_processed.head()', test_df_processed.head())

groupby_columns:  ['warehouse', 'holiday', 'shops_closed']
61
train_df_processed.head()    warehouse  holiday  shops_closed  winter_school_holidays  school_holidays  \
0          4        0             0                       0                0   
1          4        0             0                       0                0   
2          4        0             0                       0                0   
3          4        0             0                       0                0   
4          4        0             0                       0                0   

   year  month  day  day_of_week  week_of_year  ...  quarter2  quarter3  \
0  2020     12    5            5            49  ...       4.0       4.0   
1  2020     12    6            6            49  ...       4.0       4.0   
2  2020     12    7            0            50  ...       4.0       4.0   
3  2020     12    8            1            50  ...       4.0       4.0   
4  2020     12    9            2            50  ...     

In [181]:
# Set random seed
random_seed = 777
# Split train data into features and target
X = train_df_processed.drop(columns=['orders'])
y = train_df_processed['orders']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1, random_state=random_seed)

In [170]:
# Define search spaces for Bayesian Optimization
search_spaces = {
    'lgb': {
        'n_estimators': Integer(100, 1000),
        'num_leaves': Integer(20, 100),
        'learning_rate': Real(0.001, 0.1, prior='log-uniform'),
        'colsample_bytree': Real(0.4, 1.0),
        'subsample': Real(0.5, 1.0),
        'min_child_weight': Integer(1, 10),
        'lambda_l1': Real(0, 1),
        'lambda_l2': Real(0, 1)
    },
    'xgb': {
        'n_estimators': Integer(100, 1000),
        'max_depth': Integer(3, 15),
        'learning_rate': Real(0.001, 0.1, prior='log-uniform'),
        'colsample_bytree': Real(0.4, 1.0),
        'subsample': Real(0.5, 1.0),
        'min_child_weight': Integer(1, 10),
        'gamma': Real(0, 5),
        'lambda': Real(0, 1),
        'alpha': Real(0, 1)
    },
    'cat': {
        'iterations': Integer(100, 1000),
        'depth': Integer(4, 12),
        'learning_rate': Real(0.001, 0.1, prior='log-uniform'),
        'l2_leaf_reg': Real(1, 10),
        'border_count': Integer(32, 255),
        'bagging_temperature': Real(0.0, 1.0),
        'random_strength': Real(0.0, 1.0)
    },
    'rf': {
        'n_estimators': Integer(100, 1000),
        'max_depth': Integer(5, 50),
        'min_samples_split': Integer(2, 20),
        'min_samples_leaf': Integer(1, 10),
        'max_features': Real(0.1, 1.0)
    }
}

# Initialize the models with default parameters and GPU/multicore settings
models = {
    'lgb': LGBMRegressor(device='gpu',random_state=random_seed),
    'xgb': XGBRegressor(tree_method='gpu_hist', random_state=random_seed),
    'cat': CatBoostRegressor(task_type='GPU', random_state=random_seed, verbose=0),
    'rf': RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=random_seed)
}


# models which won't be tuned
lr_model = LogisticRegression(random_state=random_seed)
ad_model = AdaBoostRegressor(random_state=random_seed)
dt_model = DecisionTreeRegressor(random_state=random_seed)
gb_model = GradientBoostingRegressor(random_state=random_seed)


lr_model.fit(X_train, y_train)
ad_model.fit(X_train, y_train)
dt_model.fit(X_train, y_train)
gb_model.fit(X_train, y_train)

# Bayesian Optimization for each model
opt_models = {}
for model_name, model in models.items():
    opt = BayesSearchCV(estimator=model, search_spaces=search_spaces.get(model_name, {}), n_iter=30, cv=5, random_state=random_seed,verbose=1,scoring=mape_scorer)
    opt.fit(X_train, y_train)
    opt_models[model_name] = opt.best_estimator_
    print(model_name, opt.best_score_, opt.best_params_)

# untuned model; doing this for time saving
opt_models['lr'] =lr_model
opt_models['ad'] =ad_model
opt_models['dt'] =dt_model
opt_models['gb'] =gb_model

# Train optimized models on full training data
stacking_train = np.zeros((X_train.shape[0], len(opt_models)))

for i, (model_name, model) in enumerate(opt_models.items()):
    stacking_train[:, i] = model.predict(X_train)


# Define search spaces for the meta-models
meta_search_spaces = {
    'meta_model_1': {  # LGBMRegressor
        'n_estimators': Integer(100, 1000),
        'num_leaves': Integer(20, 100),
        'learning_rate': Real(0.001, 0.1, prior='log-uniform'),
        'colsample_bytree': Real(0.4, 1.0),
    },
    'meta_model_2': {  # CatBoostRegressor
        'iterations': Integer(100, 1000),
        'depth': Integer(4, 12),
        'learning_rate': Real(0.001, 0.1, prior='log-uniform')
    },
    'meta_model_3': {  # XGBRegressor
        'n_estimators': Integer(100, 1000),
        'max_depth': Integer(3, 15),
        'learning_rate': Real(0.001, 0.1, prior='log-uniform'),
        'colsample_bytree': Real(0.4, 1.0),
        'subsample': Real(0.5, 1.0)
    },
    'meta_model_4': {  # RandomForestRegressor
        'n_estimators': Integer(100, 1000),
        'max_depth': Integer(5, 50),
        'min_samples_split': Integer(2, 20),
        'min_samples_leaf': Integer(1, 10),
        'max_features': Real(0.1, 1.0)
    },
    'meta_model_5': {  # GradientBoostingRegressor
        'n_estimators': Integer(100, 1000),
        'max_depth': Integer(3, 15),
        'learning_rate': Real(0.001, 0.1, prior='log-uniform'),
        'subsample': Real(0.5, 1.0),
        'min_samples_split': Integer(2, 20),
        'min_samples_leaf': Integer(1, 10),
        'max_features': Real(0.1, 1.0)
    },
    'meta_model_6': {  # DecisionTreeRegressor
        'max_depth': Integer(3, 15),
        'min_samples_split': Integer(2, 20),
        'min_samples_leaf': Integer(1, 10),
        'max_features': Categorical([ 'sqrt', 'log2']),
        'ccp_alpha': Real(0.0, 0.1)  # Cost-complexity pruning parameter
    },
    'meta_model_7': {  # AdaBoostRegressor
        'n_estimators': Integer(50, 500),
        'learning_rate': Real(0.001, 1.0, prior='log-uniform'),
        'loss': Categorical(['linear', 'square', 'exponential'])
    },
    'meta_model_8': {  # LinearRegression
        'fit_intercept': Categorical([True, False])
    }
}


# Initialize and optimize meta-models using Bayesian Optimization
meta_models = {
    'meta_model_1': LGBMRegressor(device='gpu', random_state=random_seed),
    'meta_model_2': CatBoostRegressor(task_type='GPU', random_state=random_seed, verbose=0),
    'meta_model_3': XGBRegressor(tree_method='gpu_hist', random_state=random_seed),
    'meta_model_4': RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=random_seed),
    'meta_model_5': GradientBoostingRegressor(random_state=random_seed),
    'meta_model_6': DecisionTreeRegressor(random_state=random_seed),
    'meta_model_7': AdaBoostRegressor(random_state=random_seed),
    'meta_model_8': LinearRegression(n_jobs=-1)
}

opt_meta_models = {}
for model_name, model in meta_models.items():
    opt_meta = BayesSearchCV(estimator=model, search_spaces=meta_search_spaces.get(model_name, {}), n_iter=10, cv=5, random_state=random_seed, verbose=1, scoring=mape_scorer)
    opt_meta.fit(stacking_train, y_train)
    opt_meta_models[model_name] = opt_meta.best_estimator_
    print(model_name, opt_meta.best_score_, opt_meta.best_params_)

# drop the order column 
test_df_le=test_df.copy()

level_2_train = np.zeros((X_train.shape[0], len(opt_meta_models)))
for i, (model_name, model) in enumerate(opt_meta_models.items()):
    print(model_name)
    level_2_train[:, i] = model.predict(stacking_train)

# the model first pass through original layer
stacking_t=np.zeros((test_df_le.shape[0], len(opt_models)))
for i, (model_name, model) in enumerate(opt_models.items()):
    stacking_t[:, i] = model.predict(test_df_le)
# pass the second layer. Which is meta-layer
meta_stack=np.zeros((test_df_le.shape[0], len(opt_meta_models)))
for i, (model_name, model) in enumerate(opt_meta_models.items()):
    meta_stack[:, i] = model.predict(stacking_t)


initial_weights = np.ones(len(meta_models)) / len(meta_models)

# Final prediction
submit_pred = np.dot(meta_stack, initial_weights)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1099
[LightGBM] [Info] Number of data points in the train set: 5871, number of used features: 60
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4090, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 44 dense feature groups (0.25 MB) transferred to GPU in 0.005766 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 5542.428717
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 1112
[LightGBM] [Info] Number of data points in the train set: 5871, number of used features: 61
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4090, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been b

KeyError: "['orders'] not found in axis"

In [172]:
# drop the order column 
test_df_le=test_df_processed

level_2_train = np.zeros((X_train.shape[0], len(opt_meta_models)))
for i, (model_name, model) in enumerate(opt_meta_models.items()):
    print(model_name)
    level_2_train[:, i] = model.predict(stacking_train)

# the model first pass through original layer
stacking_t=np.zeros((test_df_le.shape[0], len(opt_models)))
for i, (model_name, model) in enumerate(opt_models.items()):
    stacking_t[:, i] = model.predict(test_df_le)
# pass the second layer. Which is meta-layer
meta_stack=np.zeros((test_df_le.shape[0], len(opt_meta_models)))
for i, (model_name, model) in enumerate(opt_meta_models.items()):
    meta_stack[:, i] = model.predict(stacking_t)


initial_weights = np.ones(len(meta_models)) / len(meta_models)

# Final prediction
submit_pred = np.dot(meta_stack, initial_weights)

meta_model_1
meta_model_2
meta_model_3
meta_model_4
meta_model_5
meta_model_6
meta_model_7
meta_model_8


In [173]:
# Create a DataFrame for submission with 'id' and 'Target'
submission = pd.DataFrame({
    'id': test_id,         # Test IDs for each prediction
    'Target': submit_pred  # Final predictions for submission
})

# Save the submission DataFrame to a CSV file
submission.to_csv('submission_ultimate2.csv', index=False)  # Save without including row indices

# Print the submission DataFrame to verify the results
print(submission)

                        id        Target
0      Prague_1_2024-03-16  10906.720675
1      Prague_1_2024-03-17  10543.092249
2      Prague_1_2024-03-18   9599.860299
3      Prague_1_2024-03-19   9291.414551
4      Prague_1_2024-03-20   9315.107073
..                     ...           ...
392  Budapest_1_2024-05-11   7073.880302
393  Budapest_1_2024-05-12   6720.785400
394  Budapest_1_2024-05-13   6704.118498
395  Budapest_1_2024-05-14   6716.275756
396  Budapest_1_2024-05-15   7926.960434

[397 rows x 2 columns]


>### New attempt

In [208]:
# Cross-validation Setup
# Define the number of splits for k-fold cross-validation
n_splits = 10
kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_seed)

# Initialize arrays to store predictions from base models for stacking
stacking_train = np.zeros((X_train.shape[0], 8))
stacking_test = np.zeros((X_test.shape[0], 8))

# Initialize base models for stacking
lgb_model = LGBMRegressor(
colsample_bytree=0.9090704690562509,
lambda_l1=0.5123226527180421,
lambda_l2=0.17762444404308067,
learning_rate=0.04939196011309037,
min_child_weight=8,
n_estimators=501,
num_leaves=53,
subsample=0.5052004980567765,
random_state=random_seed,
device='gpu'  # Assuming GPU usage as in previous examples
)

xgb_model = XGBRegressor(
alpha=0.0,
colsample_bytree=1.0,
gamma=5.0,learning_rate=0.025678181688152554,max_depth=15,min_child_weight=10,n_estimators=686,subsample=0.5,random_state=random_seed,tree_method='gpu_hist') # Assuming GPU usage as in previous examples)


cat_model = CatBoostRegressor(
    bagging_temperature=1.0,
    border_count=133,
    depth=12,
    iterations=1000,
    l2_leaf_reg=10.0,
    learning_rate=0.1,
    random_strength=0.0,
    random_state=random_seed,
    task_type='GPU',  # Assuming GPU usage as in previous examples
    verbose=0  # Suppress training output
)

rf_model = RandomForestRegressor(
    max_depth=31,
    max_features=1.0,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=1000,
    random_state=random_seed,
    n_jobs=-1  # Utilize all available CPU cores
)




lr_model = LogisticRegression(random_state=random_seed)
ad_model = AdaBoostRegressor(random_state=random_seed)
dt_model = DecisionTreeRegressor(random_state=random_seed)
gb_model = GradientBoostingRegressor(random_state=random_seed)

# Train base models with cross-validation and generate stacking features
for train_idx, val_idx in kf.split(X_train):
    X_tr, X_val = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_tr, y_val = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # Fit each base model on training data
    lgb_model.fit(X_tr, y_tr)
    xgb_model.fit(X_tr, y_tr)
    cat_model.fit(X_tr, y_tr)
    rf_model.fit(X_tr, y_tr)
    lr_model.fit(X_tr, y_tr)
    ad_model.fit(X_tr, y_tr)
    dt_model.fit(X_tr, y_tr)
    gb_model.fit(X_tr, y_tr)

    # Predict on validation set for stacking features
    stacking_train[val_idx, 0] = lgb_model.predict(X_val)
    stacking_train[val_idx, 1] = xgb_model.predict(X_val)
    stacking_train[val_idx, 2] = cat_model.predict(X_val)
    stacking_train[val_idx, 3] = rf_model.predict(X_val)
    stacking_train[val_idx, 4] = lr_model.predict(X_val)
    stacking_train[val_idx, 5] = ad_model.predict(X_val)
    stacking_train[val_idx, 6] = dt_model.predict(X_val)
    stacking_train[val_idx, 7] = gb_model.predict(X_val)

    # Predict on test set and average predictions over all folds
    stacking_test[:, 0] += lgb_model.predict(X_test) / n_splits
    stacking_test[:, 1] += xgb_model.predict(X_test) / n_splits
    stacking_test[:, 2] += cat_model.predict(X_test) / n_splits
    stacking_test[:, 3] += rf_model.predict(X_test) / n_splits
    stacking_test[:, 4] += lr_model.predict(X_test) / n_splits
    stacking_test[:, 5] += ad_model.predict(X_test) / n_splits
    stacking_test[:, 6] += dt_model.predict(X_test) / n_splits
    stacking_test[:, 7] += gb_model.predict(X_test) / n_splits

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 980
[LightGBM] [Info] Number of data points in the train set: 6605, number of used features: 26
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4090, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 14 dense feature groups (0.10 MB) transferred to GPU in 0.001349 secs. 1 sparse feature groups
[LightGBM] [Info] Start training from score 5540.911128
[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Info] Total Bins 983
[LightGBM] [Info] Number of data points in the train set: 6605, number of used features: 27
[LightGBM] [Info] Using GPU Device: NVIDIA GeForce RTX 4090, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightG

In [209]:
# Train meta-models
meta_model_1 = LGBMRegressor(n_estimators=150, num_leaves=15, learning_rate=0.05, colsample_bytree=0.6, lambda_l1=0.2, lambda_l2=0.2, random_state=random_seed)
meta_model_2 = CatBoostRegressor(verbose=0, random_state = random_seed)
meta_model_3 = XGBRegressor(random_state = random_seed)
meta_model_4 = RandomForestRegressor(n_estimators=100, random_state=random_seed)
meta_model_5 = GradientBoostingRegressor(random_state=random_seed)
meta_model_6 = DecisionTreeRegressor(random_state=random_seed)
meta_model_7 = AdaBoostRegressor(random_state=random_seed)
meta_model_8 = LinearRegression()

meta_model_1.fit(stacking_train, y_train)
meta_model_2.fit(stacking_train, y_train)
meta_model_3.fit(stacking_train, y_train)
meta_model_4.fit(stacking_train, y_train)
meta_model_5.fit(stacking_train, y_train)
meta_model_6.fit(stacking_train, y_train)
meta_model_7.fit(stacking_train, y_train)
meta_model_8.fit(stacking_train, y_train)

best_iteration_1 = meta_model_1.best_iteration_
best_iteration_2 = meta_model_2.best_iteration_

# Predict on test set using meta-models
meta_pred_1 = meta_model_1.predict(stacking_test)
meta_pred_2 = meta_model_2.predict(stacking_test)
meta_pred_3 = meta_model_3.predict(stacking_test)
meta_pred_4 = meta_model_4.predict(stacking_test)
meta_pred_5 = meta_model_5.predict(stacking_test)
meta_pred_6 = meta_model_6.predict(stacking_test)
meta_pred_7 = meta_model_7.predict(stacking_test)
meta_pred_8 = meta_model_8.predict(stacking_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000502 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1909
[LightGBM] [Info] Number of data points in the train set: 7339, number of used features: 8
[LightGBM] [Info] Start training from score 5535.446110


In [210]:
# Prediction
# Drop the 'orders' column from the test DataFrame as it is not needed for predictions
# test_df_le = test_df_le.drop(columns=['date', 'orders'])  # Original line commented out
test_df_le=test_df_le.drop(columns=["orders"])# Drop 'orders' column

# Generate predictions using each of the base models
lgb_pred_test = lgb_model.predict(test_df_le)        # Predictions from LightGBM model
xgb_pred_test = xgb_model.predict(test_df_le)        # Predictions from XGBoost model
cat_pred_test = cat_model.predict(test_df_le)        # Predictions from CatBoost model
rf_pred_test = rf_model.predict(test_df_le)          # Predictions from Random Forest model
lr_pred_test = lr_model.predict(test_df_le)          # Predictions from Logistic Regression model
ad_pred_test = ad_model.predict(test_df_le)          # Predictions from AdaBoost model
dt_pred_test = dt_model.predict(test_df_le)          # Predictions from Decision Tree model
gb_pred_test = gb_model.predict(test_df_le)          # Predictions from Gradient Boosting model

# Stack predictions from the various models into a single array for stacking
# stacking_test_df_le = np.vstack([lgb_pred_test, xgb_pred_test, cat_pred_test, rf_pred_test]).T  # Original line commented out
stacking_test_df_le = np.vstack([lgb_pred_test, xgb_pred_test, cat_pred_test, rf_pred_test, lr_pred_test, ad_pred_test, dt_pred_test, gb_pred_test]).T


# submit_pred = (
#     meta_pred_1 * weights['meta_model_1'] +
#     meta_pred_2 * weights['meta_model_2'] +
#     meta_pred_3 * weights['meta_model_3'] +
#     meta_pred_4 * weights['meta_model_4'] +
#     meta_pred_5 * weights['meta_model_5'] +
#     meta_pred_6 * weights['meta_model_6'] +
#     meta_pred_7 * weights['meta_model_7'] +
#     meta_pred_8 * weights['meta_model_8']
# )

# Generate predictions using the meta-models on the stacked test predictions
submit_pred_1 = meta_model_1.predict(stacking_test_df_le)  # Predictions from the first meta-model
submit_pred_2 = meta_model_2.predict(stacking_test_df_le)  # Predictions from the second meta-model
submit_pred_3 = meta_model_3.predict(stacking_test_df_le)  # Predictions from the third meta-model
submit_pred_4 = meta_model_4.predict(stacking_test_df_le)  # Predictions from the fourth meta-model
submit_pred_5 = meta_model_5.predict(stacking_test_df_le)  # Predictions from the fifth meta-model
submit_pred_6 = meta_model_6.predict(stacking_test_df_le)  # Predictions from the sixth meta-model
submit_pred_7 = meta_model_7.predict(stacking_test_df_le)  # Predictions from the seventh meta-model
submit_pred_8 = meta_model_8.predict(stacking_test_df_le)  # Predictions from the eight meta-model


# # Define weights for each model's predictions in the final ensemble
# weights = {
#     'cat_test_preds': 0.10,  # Weight for CatBoost predictions
#     'lgb_test_preds': 0.10,   # Weight for LightGBM predictions
#     'xgb_test_preds': 0.40,   # Weight for XGBoost predictions
#     'rf_test_preds': 0.10,     # Weight for Random Forest predictions
#     'xtr_test_preds': 0.30,    # Weight for the final model's predictions
# }

weights = {
    'meta_model_1': 0.125,
    'meta_model_2': 0.125,
    'meta_model_3': 0.125,
    'meta_model_4': 0.125,
    'meta_model_5': 0.125,
    'meta_model_6': 0.125,
    'meta_model_7': 0.125,
    'meta_model_8': 0.125,
}

# Calculate the weighted predictions for each model
meta_model_1_weighted = submit_pred_2 * weights['meta_model_1']  # Weighted predictions for CatBoost
meta_model_2_weighted = submit_pred_1 * weights['meta_model_2']  # Weighted predictions for LightGBM
meta_model_3_weighted = submit_pred_3 * weights['meta_model_3']  # Weighted predictions for XGBoost
meta_model_4_weighted = submit_pred_4 * weights['meta_model_4']  # Weighted predictions for Random Forest
meta_model_5_weighted = submit_pred_5 * weights['meta_model_5']
meta_model_6_weighted = submit_pred_6 * weights['meta_model_6']
meta_model_7_weighted = submit_pred_7 * weights['meta_model_7']
meta_model_8_weighted = submit_pred_8 * weights['meta_model_8']

# Combine all weighted predictions to obtain the final submission prediction
submit_pred = meta_model_1_weighted + meta_model_2_weighted + meta_model_3_weighted + meta_model_4_weighted + meta_model_5_weighted + meta_model_6_weighted + meta_model_7_weighted + meta_model_8_weighted



In [211]:
# Create a DataFrame for submission with 'id' and 'Target'
submission = pd.DataFrame({
    'id': test_id,         # Test IDs for each prediction
    'Target': submit_pred  # Final predictions for submission
})

# Save the submission DataFrame to a CSV file
submission.to_csv('submission_ultimate5.csv', index=False)  # Save without including row indices

# Print the submission DataFrame to verify the results
print(submission)

                        id        Target
0      Prague_1_2024-03-16  10468.963923
1      Prague_1_2024-03-17  10295.205614
2      Prague_1_2024-03-18   9832.579028
3      Prague_1_2024-03-19   9568.869854
4      Prague_1_2024-03-20   9578.687128
..                     ...           ...
392  Budapest_1_2024-05-11   6816.031763
393  Budapest_1_2024-05-12   6367.650992
394  Budapest_1_2024-05-13   6451.108525
395  Budapest_1_2024-05-14   6406.536346
396  Budapest_1_2024-05-15   6380.408212

[397 rows x 2 columns]
