In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('./data/sales_train.csv').dropna()[['unique_id', 'date', 'sales', 'warehouse', 'total_orders', 'sell_price_main'] \
                                                    + [f'type_{i}_discount' for i in range(7)]]
df['date'] = pd.to_datetime(df['date'])
df = df.loc[df['date'] >= '06-01-2022']

df['city'] = df['warehouse'].apply(lambda x: x.split('_')[0])
df['dayofweek'] = df['date'].dt.dayofweek
df['year'] = df['date'].dt.year

df['sales'] = np.sqrt(df['sell_price_main'] * df['sales'])

df['discount_amount'] = df['sell_price_main'] * df[[f'type_{i}_discount' for i in range(7)]].max(axis=1)

In [2]:
t_cols = ['sales', 'date', 'unique_id']
data = df[t_cols].copy()
data['date'] = pd.to_datetime(data['date'])
data['month'] = data['date'].dt.to_period('M')

# prev_month average
monthly_avg = (
        data.groupby(['unique_id', 'month'])['sales']
        .mean()
        .groupby('unique_id')
        .shift(1)
        .reset_index()
        .rename(columns={'sales': 'prev_month_avg'})
    )
data = data.merge(monthly_avg, on=['unique_id', 'month'], how='left').fillna(0)
data.drop(columns=['month', 'sales'], inplace=True)

df = df.merge(data, how='left', on=['unique_id', 'date'])

# robust week encoding
week_encoding = df[['dayofweek', 'sales', 'city', 'year']].groupby(
        ['year', 'dayofweek', 'city']).mean().reset_index()
week_mean = df[['sales', 'city', 'year']].groupby(
        ['year', 'city']).mean().reset_index()
week_mean.rename(columns={'sales': 'mean_sales'}, inplace=True)
week_encoding.rename(columns={'sales': 'weekday_avg_sales'}, inplace=True)
week_encoding = week_encoding.merge(
        week_mean, how='left', on=['city', 'year'])
week_encoding['weekday_frac_sales'] = week_encoding['weekday_avg_sales'] / \
        week_encoding['mean_sales']

week_encoding.drop('mean_sales', axis=1, inplace=True)

df = df.merge(week_encoding, how='left', on=[
                        'year', 'dayofweek', 'city'])

df['week_trend'] = df['weekday_frac_sales'] * df['prev_month_avg']

In [3]:
# lags
PERIODS = [14, 21, 28, 35]

df = df.sort_values(['unique_id', 'date'])
for shift in PERIODS:
    df[f'product_sales_{shift}'] = (df.groupby('unique_id', observed=True)['sales']
                                    .transform(lambda x: x.shift(shift).fillna(0))
                                    )

df['moving'] = (df.groupby('unique_id')['sales']
                    .transform(lambda x: x.shift(14).rolling(window=14, min_periods=1).mean().fillna(0))
                    )
df['week_moving_trend'] = df['weekday_frac_sales'] * df['moving']

# lags
DAY_PERIODS = [14, 21, 28, 35]
OFF_PERIODS = [15, 16, 17, 18, 19, 20]
orders = df['total_orders']
for shift in DAY_PERIODS:
    grouped = df[['unique_id', 'sales', 'total_orders']].groupby('unique_id')
    sales = grouped['sales'].transform(lambda x: x.shift(shift))
    s_orders = grouped['total_orders'].transform(lambda x: x.shift(shift))
    df[f'lag_{shift}'] = (orders * sales / s_orders).fillna(0)

numerator = df['weekday_frac_sales'].shift(periods=14)
for shift in OFF_PERIODS:
    grouped = df[['unique_id', 'sales', 'weekday_frac_sales']].groupby('unique_id')
    sales = grouped['sales'].transform(lambda x: x.shift(shift))
    frac = grouped['weekday_frac_sales'].transform(
        lambda x: x.shift(shift))
    df[f'lag_{shift}'] = (numerator * sales / frac).fillna(0)

df['normed_week_mean'] = df[['lag_14'] + [f'lag_{i}' for i in DAY_PERIODS]].mean(axis=1)
df['normed_week_median'] = df[['lag_14'] + [f'lag_{i}' for i in DAY_PERIODS]].median(axis=1)

df = df.drop([f'lag_{i}' for i in OFF_PERIODS], axis=1)

In [6]:
import cvxpy as cp
from tqdm import tqdm

nec_cols = ['product_sales_14', 'product_sales_21', 'product_sales_28', 'product_sales_35',
            'weekday_avg_sales', 'week_trend', 'week_moving_trend', 'moving', 'normed_week_mean',
            'normed_week_median', 'discount_amount'] + [f'lag_{i}' for i in [14, 21, 28, 35]]

def mae_reg(X: pd.DataFrame, y: pd.Series):
    X_np = X.to_numpy()
    y_np = y.to_numpy()

    _, d = X_np.shape
    beta = cp.Variable(d)
    intercept = cp.Variable()

    residuals = y_np - (X_np @ beta + intercept)
    objective = cp.Minimize(cp.sum(cp.abs(residuals)))
    problem = cp.Problem(objective)
    problem.solve()

    return beta.value, intercept.value


size = df[['unique_id', 'date']].groupby('unique_id', observed=True).count(
).reset_index().rename(columns={'date': 'n_many'})
small = size.loc[size['n_many'] < 60]
big = size.loc[size['n_many'] >= 60]
lil_df = df.merge(small, how='inner', on='unique_id')[
    ['unique_id', 'sales'] + nec_cols]
big_df = df.merge(big, how='inner', on='unique_id')[
    ['unique_id', 'sales'] + nec_cols]

disc_values = big_df[['unique_id', 'discount_amount']].groupby('unique_id', observed=True).count(
).reset_index().rename(columns={'discount_amount' : 'n_many'})

no_disc_ids = disc_values.loc[disc_values['n_many'] <= 1, 'unique_id'].to_list()
disc_ids = disc_values.loc[disc_values['n_many'] > 1, 'unique_id'].to_list()

lil_ids = lil_df['unique_id'].unique().tolist()

dict = {}
for id in tqdm(disc_ids):
    beta, intercept = mae_reg(
        big_df.loc[big_df['unique_id'] == id, nec_cols], big_df.loc[big_df['unique_id'] == id, 'sales'])
    dict[id] = {
        'unique_id': id,
        **{'coef_' + col: coef for col, coef in zip(nec_cols, beta)},
        'intercept': intercept
    }

no_disc = nec_cols.copy()
no_disc.remove('discount_amount')

for id in tqdm(no_disc_ids):
    beta, intercept = mae_reg(
        big_df.loc[big_df['unique_id'] == id, no_disc], big_df.loc[big_df['unique_id'] == id, 'sales'])
    dict[id] = {
        'unique_id': id,
        **{'coef_' + col: coef for col, coef in zip(no_disc, beta)},
        'coef_discount_amount': 0,
        'intercept': intercept
    }

smol_cols = ['weekday_avg_sales']
non_cols = nec_cols.copy()
for col in smol_cols:
    non_cols.remove(col)
for id in tqdm(lil_ids):
    beta, intercept = mae_reg(
        lil_df.loc[lil_df['unique_id'] == id, smol_cols], lil_df.loc[lil_df['unique_id'] == id, 'sales'])
    dict[id] = {
        'unique_id': id,
        **{'coef_' + col: coef for col, coef in zip(smol_cols, beta)},
        **{'coef_' + col: 0 for col in non_cols},
        'intercept': intercept
    }

dataframe = pd.DataFrame(dict).T
dataframe['unique_id'] = dataframe['unique_id'].astype(int)
display(dataframe)
dataframe.to_csv('./trend_coefs.csv', index=False)

100%|██████████| 4709/4709 [09:28<00:00,  8.29it/s]
0it [00:00, ?it/s]
100%|██████████| 535/535 [00:03<00:00, 174.14it/s]


Unnamed: 0,unique_id,coef_product_sales_14,coef_product_sales_21,coef_product_sales_28,coef_product_sales_35,coef_weekday_avg_sales,coef_week_trend,coef_week_moving_trend,coef_moving,coef_normed_week_mean,coef_normed_week_median,coef_discount_amount,coef_lag_14,coef_lag_21,coef_lag_28,coef_lag_35,intercept
1,1,0.242304,-0.31373,0.014351,0.154618,4.29074,0.185922,-2.567638,2.558439,-0.03848,0.014081,2.228295,-0.222865,0.349924,-0.008591,-0.035623,-28.19660296063067
3,3,-0.026581,0.01252,0.053885,0.028307,-0.996998,-0.051817,0.848852,-0.524355,-0.026239,0.597826,0.440145,-0.037067,0.0083,-0.015248,-0.032509,278.4051336748718
5,5,0.043001,-0.112441,0.18371,-0.206968,0.410454,-0.114343,0.161709,-0.156046,-0.029311,0.289712,0.0,-0.037808,0.013164,-0.000317,0.150543,5.059322127748809
6,6,0.082615,-0.069819,0.007505,0.089304,0.280042,-0.377588,0.599775,-0.4372,0.063221,0.130405,4.990035,0.067553,0.017068,0.080541,-0.027574,4.485701924651151
7,7,0.009806,0.073666,-0.032609,-0.052556,0.108722,-0.15933,0.471327,-0.176894,0.033387,0.263108,0.37124,-0.02806,-0.062234,0.030575,0.13702,0.945756021738211
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5372,5372,0,0,0,0,0.636647,0,0,0,0,0,0,0,0,0,0,59.92805731313763
5390,5390,0,0,0,0,2.421462,0,0,0,0,0,0,0,0,0,0,-277.07144988110304
5416,5416,0,0,0,0,14.680294,0,0,0,0,0,0,0,0,0,0,-529.0288731901177
5417,5417,0,0,0,0,2.890143,0,0,0,0,0,0,0,0,0,0,-29.320323376233052
