In [37]:
import pandas as pd
from typing import List

In [38]:
df_train = pd.read_csv("../data/train.csv")
df_test = pd.read_csv("../data/test.csv")

In [57]:
df_test.columns.to_list()

['warehouse',
 'date',
 'holiday_name',
 'holiday',
 'shops_closed',
 'winter_school_holidays',
 'school_holidays',
 'id']

In [39]:
df = df_train[df_test.columns.to_list() + ["orders"]]
df.head()

Unnamed: 0,warehouse,date,holiday_name,holiday,shops_closed,winter_school_holidays,school_holidays,id,orders
0,Prague_1,2020-12-05,,0,0,0,0,Prague_1_2020-12-05,6895.0
1,Prague_1,2020-12-06,,0,0,0,0,Prague_1_2020-12-06,6584.0
2,Prague_1,2020-12-07,,0,0,0,0,Prague_1_2020-12-07,7030.0
3,Prague_1,2020-12-08,,0,0,0,0,Prague_1_2020-12-08,6550.0
4,Prague_1,2020-12-09,,0,0,0,0,Prague_1_2020-12-09,6910.0


In [40]:
def preprocess_pipeline(df: pd.DataFrame, steps: list) -> pd.DataFrame:
    """
    Applies a series of preprocessing steps to a DataFrame.

    Args:
        df (pd.DataFrame): The DataFrame to be processed.
        steps (list): A list of tuples where each tuple contains a function and a dictionary of 
        keyword arguments for that function. Each function should take a DataFrame as its first 
        argument.

    Returns:
        pd.DataFrame: The processed DataFrame after all steps have been applied.
    """
    for step, kwargs in steps:
        # Apply each preprocessing step to the DataFrame with the provided arguments
        df = step(df, **kwargs)
    return df

In [44]:
def drop_columns(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    return df.drop(columns=columns)

def encode_holiday_name(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    df[column_name] = df[column_name].apply(lambda x: 0 if pd.isna(x) else 1)
    return df

def create_dummies(df: pd.DataFrame, column_name: str) -> pd.DataFrame:
    dummies: pd.DataFrame = pd.get_dummies(df[column_name], prefix=column_name)
    df: pd.DataFrame = df.drop(columns=[column_name])
    return pd.concat([df, dummies], axis=1)

def replace_bool(df: pd.DataFrame, values: dict) -> pd.DataFrame:
    return df.replace(to_replace=values)

In [45]:
steps = [
    (drop_columns, {"columns": ["id", "date"]}), 
    (encode_holiday_name, {"column_name": "holiday_name"}), 
    (create_dummies, {"column_name": "warehouse"}), 
    (replace_bool, {"values": {True: 1, False: 0}})
]

In [46]:
df_processed = preprocess_pipeline(df=df, steps=steps)
df_processed

  return df.replace(to_replace=values)


Unnamed: 0,holiday_name,holiday,shops_closed,winter_school_holidays,school_holidays,orders,warehouse_Brno_1,warehouse_Budapest_1,warehouse_Frankfurt_1,warehouse_Munich_1,warehouse_Prague_1,warehouse_Prague_2,warehouse_Prague_3
0,0,0,0,0,0,6895.0,0,0,0,0,1,0,0
1,0,0,0,0,0,6584.0,0,0,0,0,1,0,0
2,0,0,0,0,0,7030.0,0,0,0,0,1,0,0
3,0,0,0,0,0,6550.0,0,0,0,0,1,0,0
4,0,0,0,0,0,6910.0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7335,0,0,0,0,0,6733.0,0,1,0,0,0,0,0
7336,0,0,0,0,0,6492.0,0,1,0,0,0,0,0
7337,0,0,0,0,0,6661.0,0,1,0,0,0,0,0
7338,0,0,0,0,0,6843.0,0,1,0,0,0,0,0


In [47]:
X = df_processed.drop(columns=["orders"])
y = df_processed["orders"]

In [48]:
from sklearn.model_selection import train_test_split

In [49]:
x_train, x_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [50]:
from xgboost import XGBRegressor

In [51]:
xgb = XGBRegressor()

In [52]:
xgb.fit(x_train, y_train)

In [53]:
y_pred = xgb.predict(x_test)

In [54]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [55]:
mean_squared_error(y_true=y_test, y_pred=y_pred)

np.float64(926646.6732077951)

In [56]:
r2_score(y_true=y_test, y_pred=y_pred)

0.8062007784483891

In [31]:
df_processed = preprocess_pipeline(df=df_test, steps=steps)
df_processed

  return df.replace(to_replace={True: 1, False: -1})


Unnamed: 0,holiday_name,holiday,shops_closed,winter_school_holidays,school_holidays,warehouse_Brno_1,warehouse_Budapest_1,warehouse_Frankfurt_1,warehouse_Munich_1,warehouse_Prague_1,warehouse_Prague_2,warehouse_Prague_3
0,-1,0,0,0,0,-1,-1,-1,-1,1,-1,-1
1,-1,0,0,0,0,-1,-1,-1,-1,1,-1,-1
2,-1,0,0,0,0,-1,-1,-1,-1,1,-1,-1
3,-1,0,0,0,0,-1,-1,-1,-1,1,-1,-1
4,-1,0,0,0,0,-1,-1,-1,-1,1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
392,-1,0,0,0,0,-1,1,-1,-1,-1,-1,-1
393,-1,0,0,0,0,-1,1,-1,-1,-1,-1,-1
394,-1,0,0,0,0,-1,1,-1,-1,-1,-1,-1
395,-1,0,0,0,0,-1,1,-1,-1,-1,-1,-1


In [32]:
res = xgb.predict(df_processed)

In [33]:
res

array([8534.58  , 8534.58  , 8534.58  , 8534.58  , 8534.58  , 8534.58  ,
       8534.58  , 8534.58  , 8534.58  , 8534.58  , 8534.58  , 8534.58  ,
       8534.58  , 7564.142 , 7590.501 , 7590.501 , 7837.942 , 8534.58  ,
       8534.58  , 8534.58  , 8534.58  , 8534.58  , 8534.58  , 8534.58  ,
       8534.58  , 8534.58  , 8534.58  , 8534.58  , 8534.58  , 8534.58  ,
       8534.58  , 8534.58  , 8534.58  , 8534.58  , 8534.58  , 8534.58  ,
       8534.58  , 8534.58  , 8534.58  , 8534.58  , 8534.58  , 8534.58  ,
       8534.58  , 8534.58  , 8534.58  , 8534.58  , 7564.142 , 8534.58  ,
       8534.58  , 8534.58  , 8534.58  , 8534.58  , 8534.58  , 7837.942 ,
       8534.58  , 8534.58  , 8534.58  , 8534.58  , 8534.58  , 8534.58  ,
       8534.58  , 7275.8013, 7275.8013, 7275.8013, 7275.8013, 7275.8013,
       7275.8013, 7275.8013, 7275.8013, 7275.8013, 7275.8013, 7275.8013,
       7275.8013, 7275.8013, 6629.    , 6970.0015, 6970.0015, 7422.945 ,
       7275.8013, 7275.8013, 7275.8013, 7275.8013, 

In [34]:
# Create submission
submission = pd.DataFrame()
submission["id"] = df_test["id"].to_list()
submission["orders"] = res.tolist()
submission.to_csv("submission.csv", index=False)