In [1]:
# import standard libraries
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

# import models and metrics
from sklearn.model_selection import train_test_split, GroupKFold
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import TransformedTargetRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
import optuna

In [2]:
# Read the competition data; 'date' is parsed to datetime so the `.dt`
# accessor can be used on it later.
train = pd.read_csv('/kaggle/input/tabular-playground-series-sep-2022/train.csv', parse_dates=['date'])
test = pd.read_csv('/kaggle/input/tabular-playground-series-sep-2022/test.csv', parse_dates=['date'])

# Replace spaces and colons in product names with underscores.
# regex=False states explicitly that these are literal replacements; the
# default value of `regex` is not the same across pandas versions.
for df in [train, test]:
    df['product'] = (
        df['product']
        .str.replace(' ', '_', regex=False)
        .str.replace(':', '_', regex=False)
    )

# Keep handles to the row-level frames: `train` and `test` are rebound to
# per-date aggregates further down.
original_train = train
original_test = test

In [3]:
# Total units sold per date.
train_sum = train.groupby(['date'])['num_sold'].sum().reset_index()
train = train_sum.copy()
# Split off March-May 2020 (2020-03-01 inclusive to 2020-06-01 exclusive):
# `outlier` holds those days, `train` keeps every other day.
is_outlier_period = (train['date'] >= '2020-03-01') & (train['date'] < '2020-06-01')
outlier = train.loc[is_outlier_period]
train = train.loc[~is_outlier_period]
# Get the dates to forecast for (one row per distinct test date).
test_sum = test.groupby(['date'])['row_id'].first().reset_index().drop(columns='row_id')
test = test_sum.copy()
# Keep the dates for later. An explicit copy is taken because a column is
# added to `test_dates` further down; without it that assignment is made on
# a slice of `test`.
test_dates = test[['date']].copy()

In [4]:
def feature_engineer(df, outlier_fe = True):
    """Build calendar features from the ``date`` column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``date`` column. All other columns are passed
        through unchanged.
    outlier_fe : bool, default True
        Selects which day-of-year values get a dedicated ``important_dates``
        indicator: a short list when True, an extended list when False.

    Returns
    -------
    pandas.DataFrame
        A new frame (``df`` itself is not modified) without the ``date``
        column. Column order: the pass-through columns, ``month_sin``,
        ``month_cos``, ``is_month_start``, ``is_month_end``, ``day_of_year``,
        ``year``, one ``easter_<offset>_True`` indicator per Easter offset,
        then the one-hot columns of ``important_dates``, ``day_of_week`` and
        ``day``.

    Notes
    -----
    The Easter indicators are always emitted for every offset, even when no
    row of ``df`` falls on that offset, so that frames covering different
    date ranges end up with the same Easter columns. The one-hot columns of
    ``important_dates``, ``day_of_week`` and ``day`` still depend on the
    values present in ``df``.
    """
    # Function-scope import, as in the original cell: nothing else needs it.
    import dateutil.easter as easter

    # Day-of-year values that receive their own indicator column.
    short_important_days = [124, 125, 126, 127, 140, 141]
    extended_important_days = [
        1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
        124, 125, 126, 127, 140, 141,
        167, 168, 169,
        170, 171, 173, 174, 175, 176, 177, 178, 179,
        180, 181,
        203, 230, 231, 232, 233, 234, 282, 289,
        290, 307, 308, 309, 310, 311, 312, 313, 317, 318, 319,
        320, 360, 361, 362, 363, 364, 365,
    ]

    new_df = df.copy()
    dates = df['date']

    # Cyclical encoding of the month; the raw 'month' column is dropped below.
    new_df['month'] = dates.dt.month
    new_df['month_sin'] = np.sin(new_df['month'] * (2 * np.pi / 12))
    new_df['month_cos'] = np.cos(new_df['month'] * (2 * np.pi / 12))

    new_df['is_month_start'] = dates.dt.is_month_start.astype(np.int8)
    new_df['is_month_end'] = dates.dt.is_month_end.astype(np.int8)

    new_df['day_of_week'] = dates.dt.dayofweek

    # For dates after 2020-02-29 and before 2021-01-01, shift day_of_year back
    # by one so a calendar date in 2020 gets the same value as in a non-leap
    # year. Only 2020 is handled, exactly as before.
    after_leap_day = (dates > pd.Timestamp('2020-02-29')) & (dates < pd.Timestamp('2021-01-01'))
    new_df['day_of_year'] = dates.dt.dayofyear.astype(np.int64) - after_leap_day.astype(np.int64)

    # Keep the day_of_year value for the selected days, 0 for every other day.
    important_days = short_important_days if outlier_fe else extended_important_days
    day_of_year = new_df['day_of_year']
    new_df['important_dates'] = day_of_year.where(day_of_year.isin(important_days), 0)

    new_df['year'] = dates.dt.year
    new_df['day'] = dates.dt.day

    # Easter Sunday is computed once per distinct year instead of once per row.
    years = dates.dt.year
    easter_by_year = {int(year): pd.Timestamp(easter.easter(int(year))) for year in years.unique()}
    easter_date = pd.to_datetime(years.map(easter_by_year))
    days_from_easter = (dates - easter_date).dt.days

    # One indicator per offset (in days) relative to Easter Sunday. Declaring
    # both categories up front makes get_dummies(drop_first=True) always emit
    # the '<name>_True' column; with a plain boolean column that is constant,
    # drop_first would drop the only category and the column would vanish.
    easter_offsets = list(range(-5, 5)) + list(range(30, 50))
    easter_cols = [f'easter_{offset}' for offset in easter_offsets]
    for offset, col in zip(easter_offsets, easter_cols):
        new_df[col] = pd.Categorical(days_from_easter.eq(offset), categories=[False, True])
    new_df = pd.get_dummies(new_df, columns = easter_cols, drop_first=True)

    new_df = new_df.drop(columns=['date','month'])

    new_df = pd.get_dummies(new_df, columns = ['important_dates','day_of_week', 'day'])

    return new_df

# First pass: build features with the short important-dates list for the
# retained training days, the forecast dates and the removed 2020 days.
outlier_fe = True
train, test, outlier = (
    feature_engineer(frame, outlier_fe) for frame in (train, test, outlier)
)

In [5]:
# Fit on `train`; the rows to predict are the `outlier` rows, with the
# target column removed from both feature matrices.
X = train.drop(columns=['num_sold'])
y = train['num_sold']
X_Test = outlier.drop(columns=['num_sold'])

In [6]:
# Standardise the features: the scaler is fitted on X only and the same
# fitted scaler is then applied to X_Test.
sc = StandardScaler()
X = sc.fit_transform(X)
X_Test = sc.transform(X_Test)

In [7]:
# Ridge regression fitted on log(y); predictions are mapped back with exp.
# (TransformedTargetRegressor is already imported in the notebook's import
# cell, so it is not re-imported here.)
model = TransformedTargetRegressor(
    regressor=Ridge(alpha=0.2, tol=0.00001, max_iter=100000),
    func=np.log,
    inverse_func=np.exp,
)
model.fit(X, y)
pred = model.predict(X_Test)

In [8]:
# Overwrite the 2020-03-01 .. 2020-05-31 daily totals with the model output.
# The column is cast to float first: `pred` holds floats, and writing them
# into an integer column through .loc relies on implicit upcasting, which
# newer pandas versions deprecate. (Assumes num_sold is integer-typed after
# the groupby sum; the cast is a no-op if it is already float.)
impute_mask = (train_sum['date'] >= '2020-03-01') & (train_sum['date'] < '2020-06-01')
train_sum['num_sold'] = train_sum['num_sold'].astype('float64')
train_sum.loc[impute_mask, 'num_sold'] = pred

In [9]:
# f,ax = plt.subplots(figsize=(20,10))
# sns.lineplot(data = train_sum, x='date', y='num_sold');

In [10]:
# Second pass: rebuild the features from the per-date totals and the forecast
# dates, this time with the extended important-dates list.
outlier_fe = False
train = feature_engineer(train_sum, outlier_fe=outlier_fe)
test = feature_engineer(test_sum, outlier_fe=outlier_fe)

In [11]:
# Features / target for the forecasting model; `test` carries no target
# column, so it is used as the prediction matrix as-is.
X = train.drop(columns=['num_sold'])
y = train['num_sold']
X_Test = test

In [12]:
# Grouped cross-validation with `year` as the group: each fold validates on
# rows whose year does not appear in that fold's training rows. Every fold's
# model also predicts X_Test; the fold predictions are averaged below.
kf = GroupKFold(n_splits=4)
preds_lst = []
scores = []
for fold, (train_idx, val_idx) in enumerate(kf.split(X, groups=X.year)):
    X_fit, y_fit = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

    # Ridge on log(y) with exp as the inverse, behind a MinMaxScaler.
    regressor = TransformedTargetRegressor(
        regressor=Ridge(alpha=0.2, tol=0.00001, max_iter=100000),
        func=np.log,
        inverse_func=np.exp,
    )
    model = make_pipeline(MinMaxScaler(), regressor)
    model.fit(X_fit, y_fit)
    preds_lst.append(model.predict(X_Test))

    # Named `fold_score` rather than `sc`, so the scaler object that an
    # earlier cell bound to `sc` is not overwritten by a float.
    fold_score = model.score(X_val, y_val)
    scores.append(fold_score)
    print(fold, X_fit['year'].unique(), X_val['year'].unique(), fold_score)

print('Mean score', np.mean(scores))
# One column per fold, then their average as 'num_sold'.
preds_df = pd.DataFrame(np.column_stack(preds_lst))
preds_df['num_sold'] = preds_df.sum(axis = 1)/len(preds_lst)
# assign() returns a new frame (values are aligned on the index, as with a
# plain column assignment), so no column is set on a slice and the cell can
# be re-run safely.
test_dates = test_dates.assign(num_sold=preds_df['num_sold'])

0 [2017 2018 2019] [2020] -0.5460784812957906
1 [2017 2018 2020] [2019] -0.24517552130909048
2 [2017 2019 2020] [2018] 0.7487230661921057
3 [2018 2019 2020] [2017] 0.8936907819852213
Mean score 0.2127899613931115


In [13]:
# Share of each product in the total units sold on each date.
product_df = original_train.groupby(['date','product'])['num_sold'].sum().reset_index()
product_ratio_df = product_df.pivot(index='date', columns='product', values='num_sold')
# Divide every row by its row total (vectorised form of x / x.sum() per row).
product_ratio_df = product_ratio_df.div(product_ratio_df.sum(axis=1), axis=0)
product_ratio_df = product_ratio_df.stack().rename('ratios').reset_index()

# make prediction to ratio in 2021
# Training rows are the ratios before 2020-01-01.
X = product_ratio_df[product_ratio_df['date'] < '2020-01-01']
y = X['ratios']
# One row per (date, product) pair of the test set. Only `row_id` is
# aggregated: summing every column would also try to sum the string columns,
# whose handling differs between pandas versions.
X_Test = original_test.groupby(['date','product'])['row_id'].first().reset_index().drop(columns='row_id')
X_Test_original = X_Test
X = X.drop(columns='ratios')
X = pd.get_dummies(X, columns = ['product'])
X_Test = pd.get_dummies(X_Test, columns = ['product'])
X = feature_engineer(X)
X_Test = feature_engineer(X_Test)

# CatBoost regressor behind a RobustScaler, fitted on the product ratios.
model = CatBoostRegressor(eval_metric= 'MAPE',
                          n_estimators= 799,
                          learning_rate= 0.02,
                          depth= 9,
                          min_child_samples= 34,
                          l2_leaf_reg= 1.5,
                          logging_level='Silent')
model = make_pipeline(RobustScaler(), model)
model.fit(X,y)
pred = model.predict(X_Test)

# Attach the predicted ratios to the (date, product) rows and key them by
# month-day. Note that this adds columns to X_Test_original in place.
product_ratio = X_Test_original
product_ratio['ratios'] = pred
product_ratio['mm-dd'] = product_ratio['date'].dt.strftime('%m-%d')
product_ratio = product_ratio.drop(columns='date')

# Left-join the ratios onto every test row by (month-day, product).
test_product_ratio_df = original_test.copy()
test_product_ratio_df['mm-dd'] = test_product_ratio_df['date'].dt.strftime('%m-%d')
test_product_ratio_df = pd.merge(test_product_ratio_df,product_ratio, how='left', on = ['mm-dd','product'])

# Left-join the per-date total forecast onto every test row (the merge uses
# the columns the two frames share), then add the ratios; the assignment is
# aligned on the row index.
test_sub = pd.merge(original_test, test_dates, how='left')
test_sub['ratios'] = test_product_ratio_df['ratios']

In [14]:
def disaggregate_forecast(df, train_df=None, country_weight=1/6):
    """Split a total forecast into per-row forecasts.

    Each row's ``num_sold`` is multiplied by ``country_weight``, by the share
    of its store in the historical sales of ``train_df``, and by the row's
    ``ratios`` value; the result is rounded.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``country``, ``store``, ``num_sold`` and ``ratios``.
        It is not modified.
    train_df : pandas.DataFrame, optional
        Frame with ``store`` and ``num_sold`` columns used to compute the
        store shares. Defaults to the notebook-level ``original_train``.
    country_weight : float, default 1/6
        Factor applied to every row for the country split (the same value for
        all countries).

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the disaggregated, rounded ``num_sold`` and
        without the ``ratios`` column.
    """
    if train_df is None:
        # Fall back to the notebook-level training frame, as before.
        train_df = original_train

    new_df = df.copy()

    # Share of each store in the total historical sales.
    store_weights = train_df.groupby('store')['num_sold'].sum() / train_df['num_sold'].sum()
    # Rows whose store does not occur in train_df keep a factor of 1, i.e. they
    # are left unscaled by store, matching the previous per-store loop.
    store_factor = new_df['store'].map(store_weights).fillna(1.0)

    # Same multiplication order as before: country, then store, then ratio.
    new_df['num_sold'] = new_df['num_sold'] * country_weight * store_factor * new_df['ratios']
    new_df['num_sold'] = new_df['num_sold'].round()
    new_df = new_df.drop(columns=['ratios'])

    return new_df

# Turn the per-date total forecast in `test_sub` into per-row predictions.
final_df = disaggregate_forecast(test_sub)

In [15]:
# Same absolute input directory as the train/test files read at the top of
# the notebook, so the cell does not depend on the working directory.
submission = pd.read_csv('/kaggle/input/tabular-playground-series-sep-2022/sample_submission.csv')
# Match predictions to submission rows by row_id instead of by row position.
# (Assumes row_id is unique in final_df; verify if the test set changes.)
submission['num_sold'] = submission['row_id'].map(final_df.set_index('row_id')['num_sold'])
submission.to_csv('submission.csv', index = False)
submission

Unnamed: 0,row_id,num_sold
0,70128,513.0
1,70129,411.0
2,70130,343.0
3,70131,542.0
4,70132,178.0
...,...,...
17515,87643,657.0
17516,87644,208.0
17517,87645,170.0
17518,87646,142.0


In [16]:
# def smape_loss(y_true, y_pred):
#     '''SMAPE Loss'''
#     return np.abs(y_true - y_pred) / (y_true + np.abs(y_pred)) * 200

# np.mean(smape_loss(best.num_sold, final_df.num_sold))

In [17]:
# best = submission