In [1]:
# data #
import numpy as np
import pandas as pd

# visualisation #
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go 

from plotly.subplots import make_subplots

# date #
from datetime import date
import holidays 

# Load the training data

In [2]:
train_data = pd.read_csv("/kaggle/input/playground-series-s3e19/train.csv")

# Preprocessing

In [3]:
# show the first 5 rows
train_data.head()

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding,63
1,1,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Train More LLMs,66
2,2,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win Friends and Influence People,9
3,3,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win More Kaggle Competitions,59
4,4,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Write Better,49


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136950 entries, 0 to 136949
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        136950 non-null  int64 
 1   date      136950 non-null  object
 2   country   136950 non-null  object
 3   store     136950 non-null  object
 4   product   136950 non-null  object
 5   num_sold  136950 non-null  int64 
dtypes: int64(2), object(4)
memory usage: 6.3+ MB


In [5]:
train_data.describe()

Unnamed: 0,id,num_sold
count,136950.0,136950.0
mean,68474.5,165.522636
std,39534.20402,183.691575
min,0.0,2.0
25%,34237.25,46.0
50%,68474.5,98.0
75%,102711.75,184.0
max,136949.0,1380.0


In [6]:
AR_seasons = np.array([2, 2, 3, 3, 3, 4, 4, 4, 1, 1, 1, 2]).astype(int)
AR_seasons_dict = dict(zip(range(1, 13), AR_seasons))

other_seasons = np.array([4, 4, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4]).astype(int)
other_seasons_dict = dict(zip(range(1, 13), other_seasons))

def preprocess(df):
    # create day, month, year column
    df["date"] = pd.to_datetime(df["date"])
    df["day"] = df["date"].dt.day
    df["month"] = df["date"].dt.month
    df["year"] = df["date"].dt.year
    
    # check if date is a weekday or weekend
    df["day_of_week"] = df["date"].dt.dayofweek
    df.loc[df['day_of_week'] > 4, 'weekend'] = True
    df.loc[df['day_of_week'] <= 4, 'weekend'] = False
    
    # check if date is a holiday
    country_holidays = {
        "Argentina": holidays.country_holidays("AR"),
        "Japan": holidays.country_holidays("JP"),
        "Estonia": holidays.country_holidays("EE"),
        "Spain": holidays.country_holidays("ES"),
        "Canada": holidays.country_holidays("CA")
    }
    df["holiday"] = df.apply(lambda row: row["date"] in country_holidays[row["country"]], axis=1)
    
    df['month_sin'] = np.sin(2*np.pi*df.month/12)
    df['month_cos'] = np.cos(2*np.pi*df.month/12)
    df['day_sin'] = np.sin(2*np.pi*df.day/31)
    df['day_cos'] = np.cos(2*np.pi*df.day/31)
    
    # check the season
    df['AR_season'] = df.loc[df['country'] == 'Argentina', 'month'].map(AR_seasons_dict)
    df['season'] = df.loc[df['country'] != 'Argentina', 'month'].map(other_seasons_dict)
    df['season'] = df['season'].fillna(df['AR_season'])
    df.drop('AR_season', axis=1, inplace=True)
    
    return df

In [7]:
prepared_train_data = preprocess(train_data)
prepared_train_data.head()

Unnamed: 0,id,date,country,store,product,num_sold,day,month,year,day_of_week,weekend,holiday,month_sin,month_cos,day_sin,day_cos,season
0,0,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding,63,1,1,2017,6,True,True,0.5,0.866025,0.201299,0.97953,2.0
1,1,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Train More LLMs,66,1,1,2017,6,True,True,0.5,0.866025,0.201299,0.97953,2.0
2,2,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win Friends and Influence People,9,1,1,2017,6,True,True,0.5,0.866025,0.201299,0.97953,2.0
3,3,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win More Kaggle Competitions,59,1,1,2017,6,True,True,0.5,0.866025,0.201299,0.97953,2.0
4,4,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Write Better,49,1,1,2017,6,True,True,0.5,0.866025,0.201299,0.97953,2.0


# Analyse the data (EDA)
## Distribiution of county, store, product

In [8]:
fig = make_subplots(rows=1, cols=3)

col = 1
for column in ["country", "store", "product"]:
    name = column + " count"
    count = prepared_train_data[column].value_counts()
    fig.add_trace(go.Bar(x=count.index,
                         y=count.values,
                         text=count.values,
                         name=name), row=1, col=col)
    col += 1

fig.update_layout(title_text="Distribiution of county, store, product",
                  title_font_size=22,
                  yaxis_title="Count",
                  template="ggplot2")
fig.show()

## Proportion of products sold

In [9]:
stores = ["Kaggle Learn", "Kaggle Store", "Kagglazon"]

fig = make_subplots(rows=1, cols=3, subplot_titles=stores, specs=[[{'type':'pie'}, {'type':'pie'}, {'type':'pie'}]])

col = 1
for store in stores:
    rows_of_store = prepared_train_data.loc[prepared_train_data["store"] == store]
    sum_of_sold_per_product = rows_of_store.groupby("product")["num_sold"].sum()
    fig.add_trace(go.Pie(name=store,
                         labels=sum_of_sold_per_product.index,
                         values=sum_of_sold_per_product.values), row=1, col=col)
    col += 1

fig.update_traces(hole=.4, hoverinfo="label+percent+value+name")
fig.update_layout(title_text="Proportion of products sold",
                  title_font_size=22,
                  template="ggplot2",
                  legend=dict(orientation="h", yanchor="top", xanchor="right", x=1))
fig.show()

## Average number of sales per country on weekday and weekend

In [10]:
fig = go.Figure()

for country in ["Argentina", "Canada", "Estonia", "Japan", "Spain"]:
    rows_of_country = prepared_train_data.loc[prepared_train_data["country"] == country]
    num_of_sold_per_country = rows_of_country.groupby("weekend")["num_sold"].mean()
    fig.add_trace(go.Bar(x=["Weekday", "Weekend"],
                         y=num_of_sold_per_country.values,
                         text=num_of_sold_per_country.values,
                         name=country))
    
fig.update_layout(title_text="Average number of sales per country on weekday and weekend",
                  title_font_size=22,
                  yaxis_title="Average number",
                  template="ggplot2")
fig.show()

## Average number of sales per country on workday and holiday

In [11]:
fig = go.Figure()

for country in ["Argentina", "Canada", "Estonia", "Japan", "Spain"]:
    rows_of_country = prepared_train_data.loc[prepared_train_data["country"] == country]
    num_of_sold_per_country = rows_of_country.groupby("holiday")["num_sold"].mean()
    fig.add_trace(go.Bar(x=["Workday", "Holiday"],
                         y=num_of_sold_per_country.values,
                         text=num_of_sold_per_country.values,     
                         name=country))
    
fig.update_layout(title_text="Average number of sales per country on workday and holiday",
                  title_font_size=22,
                  yaxis_title="Average number",
                  template="ggplot2")
fig.show()

## Average sales in time

In [12]:
def number_to_weekday(number):
    days = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    return days[number]

fig = make_subplots(rows=2, cols=2)

row = 0
col = 0
for time in ["day", "month", "day_of_week", "year"]:
    mean_sales = prepared_train_data.groupby(time)["num_sold"].mean()
    fig.add_trace(go.Scatter(x=([number_to_weekday(num) for num in mean_sales.index] if time == "day_of_week" else mean_sales.index),
                             y=mean_sales.values,
                             mode='lines',
                             name=time), row=(row%2)+1, col=(col%2)+1)
    row += 1
    col += (row%2)

fig.update_xaxes(title_text="Time", row=2, col=1)
fig.update_layout(title_text="Average sales in time", 
                  title_font_size=22,
                  yaxis_title="Average sales",
                  template="ggplot2")
fig.show()

## Weekly sales by store

In [13]:
def show_fig_weekly_sales_by(column):
    weekly_sold_mean = prepared_train_data.groupby([prepared_train_data["date"].dt.to_period("W"), column]).mean()["num_sold"].reset_index()
    weekly_sold_mean["date"] = weekly_sold_mean["date"].dt.to_timestamp()

    fig = px.line(x=weekly_sold_mean["date"], y=weekly_sold_mean["num_sold"], color=weekly_sold_mean[column])
    fig.update_layout(title_text="Weekly sale by " + column, 
                      title_font_size=22,
                      xaxis_title="Time",
                      yaxis_title="Average sales",
                      template="ggplot2")
    fig.show()
    
show_fig_weekly_sales_by("store")


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



## Weekly sales by country

In [14]:
show_fig_weekly_sales_by("country")


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



## Weekly sales by product

In [15]:
show_fig_weekly_sales_by("product")


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



# Preparation of the dataset for the machine learning algorithms
## Label encoding

In [16]:
# show the columns
prepared_train_data.columns

Index(['id', 'date', 'country', 'store', 'product', 'num_sold', 'day', 'month',
       'year', 'day_of_week', 'weekend', 'holiday', 'month_sin', 'month_cos',
       'day_sin', 'day_cos', 'season'],
      dtype='object')

In [17]:
from sklearn.preprocessing import LabelEncoder

def label_encoding(df):
    le = LabelEncoder()
    df = df.drop(['date', 'id'], axis=1, inplace=False)
    df['country'] = le.fit_transform(df['country'])
    df['store'] = le.fit_transform(df['store'])
    df['product'] = le.fit_transform(df['product'])
    df['holiday'] = le.fit_transform(df['holiday'])
    df['weekend'] = le.fit_transform(df['weekend'])
    return df

X = label_encoding(prepared_train_data)
X = X.drop('num_sold', axis=1, inplace=False)
y = prepared_train_data["num_sold"]
X.head()


A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.23.5



Unnamed: 0,country,store,product,day,month,year,day_of_week,weekend,holiday,month_sin,month_cos,day_sin,day_cos,season
0,0,1,0,1,1,2017,6,1,1,0.5,0.866025,0.201299,0.97953,2.0
1,0,1,1,1,1,2017,6,1,1,0.5,0.866025,0.201299,0.97953,2.0
2,0,1,2,1,1,2017,6,1,1,0.5,0.866025,0.201299,0.97953,2.0
3,0,1,3,1,1,2017,6,1,1,0.5,0.866025,0.201299,0.97953,2.0
4,0,1,4,1,1,2017,6,1,1,0.5,0.866025,0.201299,0.97953,2.0


In [18]:
y

0          63
1          66
2           9
3          59
4          49
         ... 
136945    700
136946    752
136947    111
136948    641
136949    539
Name: num_sold, Length: 136950, dtype: int64

# Train the CatBoostRegressor

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2, shuffle=True)

In [20]:
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from catboost import CatBoostRegressor
from sklearn.metrics import make_scorer

def calc_smape_score(y_true, y_pred):
    return 100 / len(y_true) * np.sum(2 * np.abs(y_pred - y_true) / (np.abs(y_pred) + np.abs(y_true)))

smape_score = make_scorer(calc_smape_score, greater_is_better=False)

# configure the cross-validation procedure
cv = KFold(n_splits=2, shuffle=True, random_state=42)

hyperparameters_grid = {
    'iterations': [100,1000],
    'l2_leaf_reg': [1,10],
    'learning_rate': [0.01, 0.04],
    'depth': [3, 11],
    'bootstrap_type': ['Bernoulli', 'Bayesian']
}
    
# define gridsearch and find best hyperparameters
search = GridSearchCV(CatBoostRegressor(objective='MAE', random_state=42, verbose=False), param_grid=hyperparameters_grid, scoring=smape_score, cv=cv, verbose=10, refit=True, n_jobs=-1)
result = search.fit(X_train, y_train)

y_test_predicted = result.predict(X_test)

test_SMAPE = calc_smape_score(y_test, y_test_predicted)

Fitting 2 folds for each of 32 candidates, totalling 64 fits




In [21]:
print("Grid search cross validation SMAPE: " + str(result.best_score_))
print("Grid search cross validation best hyperparameters: " + str(result.best_params_))
print("SMAPE on test data: " + str(test_SMAPE))

Grid search cross validation SMAPE: -5.112674261742752
Grid search cross validation best hyperparameters: {'bootstrap_type': 'Bayesian', 'depth': 11, 'iterations': 1000, 'l2_leaf_reg': 10, 'learning_rate': 0.01}
SMAPE on test data: 4.9957097115290265


In [22]:
best_model = CatBoostRegressor(iterations=result.best_params_["iterations"],
                               l2_leaf_reg=result.best_params_["l2_leaf_reg"],
                               learning_rate=result.best_params_["learning_rate"],
                               depth=result.best_params_["depth"],
                               bootstrap_type=result.best_params_["bootstrap_type"], objective='MAE', random_state=42, verbose=False)
                
best_model.fit(X, y)

<catboost.core.CatBoostRegressor at 0x7a43def03dc0>

# Submission

In [23]:
# prepare test data
test_data = pd.read_csv("/kaggle/input/playground-series-s3e19/test.csv")
prepared_test_data = preprocess(test_data)
X_encoded_test_data = label_encoding(prepared_test_data)
X_encoded_test_data.head()

Unnamed: 0,country,store,product,day,month,year,day_of_week,weekend,holiday,month_sin,month_cos,day_sin,day_cos,season
0,0,1,0,1,1,2022,5,1,1,0.5,0.866025,0.201299,0.97953,2.0
1,0,1,1,1,1,2022,5,1,1,0.5,0.866025,0.201299,0.97953,2.0
2,0,1,2,1,1,2022,5,1,1,0.5,0.866025,0.201299,0.97953,2.0
3,0,1,3,1,1,2022,5,1,1,0.5,0.866025,0.201299,0.97953,2.0
4,0,1,4,1,1,2022,5,1,1,0.5,0.866025,0.201299,0.97953,2.0


In [24]:
test_data.head()

Unnamed: 0,id,date,country,store,product,day,month,year,day_of_week,weekend,holiday,month_sin,month_cos,day_sin,day_cos,season
0,136950,2022-01-01,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding,1,1,2022,5,True,True,0.5,0.866025,0.201299,0.97953,2.0
1,136951,2022-01-01,Argentina,Kaggle Learn,Using LLMs to Train More LLMs,1,1,2022,5,True,True,0.5,0.866025,0.201299,0.97953,2.0
2,136952,2022-01-01,Argentina,Kaggle Learn,Using LLMs to Win Friends and Influence People,1,1,2022,5,True,True,0.5,0.866025,0.201299,0.97953,2.0
3,136953,2022-01-01,Argentina,Kaggle Learn,Using LLMs to Win More Kaggle Competitions,1,1,2022,5,True,True,0.5,0.866025,0.201299,0.97953,2.0
4,136954,2022-01-01,Argentina,Kaggle Learn,Using LLMs to Write Better,1,1,2022,5,True,True,0.5,0.866025,0.201299,0.97953,2.0


In [25]:
# predict test data with the beast model which was drained on the whole training dataset
probed_map = {
    "Argentina": 3.5,
    "Spain": 1.5,
    "Japan": 1.4,
    "Estonia": 1.7,
    "Canada": 0.8,
}

train_data_predictions = best_model.predict(X_encoded_test_data)
train_data_predictions = pd.Series(train_data_predictions, index=test_data.index, copy=True)
train_data_predictions = np.round(train_data_predictions * test_data["country"].replace(probed_map), 0).astype(int)
train_data_predictions

0        127
1        122
2         21
3        116
4         96
        ... 
27370    944
27371    902
27372    172
27373    837
27374    688
Length: 27375, dtype: int64

In [26]:
# create submission csv
submission = pd.read_csv('/kaggle/input/playground-series-s3e19/sample_submission.csv')
submission['num_sold'] = train_data_predictions
submission.head()

Unnamed: 0,id,num_sold
0,136950,127
1,136951,122
2,136952,21
3,136953,116
4,136954,96


In [27]:
submission.to_csv('cbr4_sales_submission.csv', index=False)