In [None]:
import os
# Set the CUDA device mask before any GPU library is imported further down in the
# notebook. NOTE(review): the index "5" is machine-specific — confirm before running elsewhere.
os.environ["CUDA_VISIBLE_DEVICES"]="5"


import numpy as np
import pandas as pd
from datetime import datetime, timedelta
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the pre-processed train / test frames.
train = pd.read_parquet('../data/processed/df_train.parquet')
test = pd.read_parquet('../data/processed/df_test.parquet')

# On the earliest test date both price-change columns are overwritten with 0.
first_test_day = test['date'] == test['date'].min()
for price_col in ('price_change_perc', 'price_change_logdiff'):
    test.loc[first_test_day, price_col] = 0

# Sample submission file (read again right before the submission is assembled).
sub = pd.read_csv('../data/raw/sample_final.csv')

In [None]:
def transform_df(df, max_days=100):
    """Keep the most recent `max_days` days and reduce the frame to the modelling columns.

    Parameters
    ----------
    df : DataFrame with at least 'geo_cluster_id', 'sku_id', 'date',
        'price_change_perc' and 'sales' columns.
    max_days : size of the trailing window, counted back from the latest
        date found in `df` (the cutoff day itself is excluded).

    Returns
    -------
    A new DataFrame with columns 'geo_cluster_id', 'sku_id', 'date', 'sales'
    and a boolean 'price_change' flag (True where 'price_change_perc' != 0).
    The input frame is not modified.
    """
    cutoff = df['date'].max() - timedelta(days=max_days)
    wanted = ['geo_cluster_id', 'sku_id', 'date', 'price_change_perc', 'sales']
    recent = df.loc[df['date'] > cutoff, wanted].copy()
    recent['price_change'] = recent['price_change_perc'].ne(0)
    return recent.drop(columns='price_change_perc')

# Rebind both frames to their trimmed versions. transform_df drops
# 'price_change_perc', so running this cell a second time raises a KeyError.
train = transform_df(train)
test = transform_df(test)

In [None]:
# Fit one LabelEncoder per id column on the union of train and test values,
# then replace the raw ids in both frames with the encoded integer codes.
_fitted_encoders = {}
for _id_col in ('geo_cluster_id', 'sku_id'):
    _all_ids = pd.concat([train[_id_col], test[_id_col]], axis=0)
    _encoder = LabelEncoder().fit(_all_ids)
    train[_id_col] = _encoder.transform(train[_id_col])
    test[_id_col] = _encoder.transform(test[_id_col])
    _fitted_encoders[_id_col] = _encoder

# Keep the original module-level names: the model reads `.classes_` from them.
geo_cluster_id_encoder = _fitted_encoders['geo_cluster_id']
sku_id_encoder = _fitted_encoders['sku_id']

# Features

In [None]:
def get_target(df, target_date, n_target=14):
    """Build the multi-horizon sales targets for every series starting at `target_date`.

    Parameters
    ----------
    df : DataFrame with 'geo_cluster_id', 'sku_id', 'date' and 'sales'.
        Assumes rows are ordered by (geo_cluster_id, sku_id, date) with one row
        per day per series, because horizons are built with positional shifts
        — TODO confirm against the caller.
    target_date : first day of the window (inclusive).
    n_target : number of future days to expose as 'sales_1' .. 'sales_{n_target}'.

    Returns
    -------
    DataFrame with 'geo_cluster_id', 'sku_id', 'date' and the
    'sales_1' .. 'sales_{n_target}' columns. A row is kept only when the row
    `n_target` positions below it belongs to the same (geo_cluster_id, sku_id)
    pair and none of its horizon values is missing.
    """
    window_end = target_date + timedelta(days=n_target)
    in_window = (window_end >= df['date']) & (df['date'] >= target_date)
    target = df.loc[in_window, ['geo_cluster_id', 'sku_id', 'date', 'sales']].copy()

    for i in range(1, n_target + 1):
        target[f'sales_{i}'] = target['sales'].shift(-i)

    # The look-ahead distance must follow n_target (it used to be a hard-coded 14,
    # which silently produced an empty/incorrect frame for any other horizon).
    geo_cluster_mask = target['geo_cluster_id'] == target['geo_cluster_id'].shift(-n_target)
    sku_id_mask = target['sku_id'] == target['sku_id'].shift(-n_target)
    target = target[geo_cluster_mask & sku_id_mask].dropna()
    return target.drop(columns='sales')

def get_price_change(df, target_date, n_target=14):
    """Build the future price-change flags for every series starting at `target_date`.

    Parameters
    ----------
    df : DataFrame with 'geo_cluster_id', 'sku_id', 'date' and 'price_change'.
        Assumes rows are ordered by (geo_cluster_id, sku_id, date) with one row
        per day per series, because horizons are built with positional shifts
        — TODO confirm against the caller.
    target_date : first day of the window (inclusive).
    n_target : number of future days to expose as
        'price_change_1' .. 'price_change_{n_target}'.

    Returns
    -------
    DataFrame with 'geo_cluster_id', 'sku_id', 'date' and integer (0/1)
    'price_change_1' .. 'price_change_{n_target}' columns. A row is kept only
    when the row `n_target` positions below it belongs to the same
    (geo_cluster_id, sku_id) pair and none of its horizon values is missing.
    """
    window_end = target_date + timedelta(days=n_target)
    in_window = (window_end >= df['date']) & (df['date'] >= target_date)
    target = df.loc[in_window, ['geo_cluster_id', 'sku_id', 'date', 'price_change']].copy()

    horizon_cols = [f'price_change_{i}' for i in range(1, n_target + 1)]
    for i, col in enumerate(horizon_cols, start=1):
        target[col] = target['price_change'].shift(-i)

    # The look-ahead distance must follow n_target (it used to be a hard-coded 14).
    geo_cluster_mask = target['geo_cluster_id'] == target['geo_cluster_id'].shift(-n_target)
    sku_id_mask = target['sku_id'] == target['sku_id'].shift(-n_target)
    target = target[geo_cluster_mask & sku_id_mask].dropna()

    # Cast through astype on the frame itself: assigning via `.loc[:, cols] = ...`
    # writes in place on pandas >= 2.0 and would keep the shifted columns' dtype.
    target = target.astype({col: int for col in horizon_cols})
    return target.drop(columns='price_change')

def get_feature(df, target_date, timdelta_train_days=30):
    """Compute per-series median-sales features over a trailing window ending at `target_date`.

    Parameters
    ----------
    df : DataFrame with 'geo_cluster_id', 'sku_id', 'date', 'price_change', 'sales'.
    target_date : last day of the window (inclusive).
    timdelta_train_days : window length in days; the day exactly
        `timdelta_train_days` before `target_date` is excluded.

    Returns
    -------
    One row per (geo_cluster_id, sku_id) seen in the window with four medians of
    'sales' (all rows, non-zero sales, price-change days, no-price-change days;
    missing medians are filled with 0), the 'date' set to `target_date`, and that
    day's 'price_change' / 'sales' left-joined from the window.
    """
    series_keys = ['geo_cluster_id', 'sku_id']
    window_start = target_date - timedelta(days=timdelta_train_days)
    df = df[(df['date'] <= target_date) & (df['date'] > window_start)]

    def _median_sales_of(rows, name):
        # Median of 'sales' per series for the given subset of window rows.
        return rows.groupby(series_keys)['sales'].median().rename(name)

    medians = [
        _median_sales_of(df, 'median_sales'),
        _median_sales_of(df[df['sales'] != 0], 'median_non_zero_sales'),
        _median_sales_of(df[df['price_change'] != 0], 'median_price_change_sales'),
        _median_sales_of(df[df['price_change'] == 0], 'median_price_no_change_sales'),
    ]

    result = pd.concat(medians, axis=1).fillna(0)
    result['date'] = target_date

    # Attach the target-day observation; series without a row on that day get NaN.
    same_day = df[['geo_cluster_id', 'sku_id', 'date', 'price_change', 'sales']]
    result = result.merge(same_day, on=['geo_cluster_id', 'sku_id', 'date'], how='left')
    return result

def get_df(df, target_date):
    """Assemble one modelling frame anchored at `target_date`.

    Inner-joins, on ('geo_cluster_id', 'sku_id', 'date'), the trailing-window
    features from get_feature, the future sales from get_target and the future
    price-change flags from get_price_change (all with their default horizons).
    """
    join_keys = ['geo_cluster_id', 'sku_id', 'date']
    target = get_target(df, target_date)
    feature = get_feature(df, target_date)
    price_change = get_price_change(df, target_date)
    return feature.merge(target, on=join_keys).merge(price_change, on=join_keys)

# CV

In [None]:
# Column groups shared by the feature builders, the dataset and the model.
_horizon_days = range(1, 15)  # forecast horizons 1..14

# Future price-change flags fed to the model as inputs.
leak_cols = [f'price_change_{day}' for day in _horizon_days]
# Future sales, one target per horizon day.
target_cols = [f'sales_{day}' for day in _horizon_days]

# Integer id features.
int_cols = ['geo_cluster_id', 'sku_id']
# Continuous features.
float_cols = [
    'sales',
    'price_change',
    'median_sales',
    'median_non_zero_sales',
    'median_price_change_sales',
    'median_price_no_change_sales',
]
# All non-leak input columns: ids first, then continuous features.
train_cols = int_cols + float_cols

In [None]:
def edit_valid_price_change(df):
    """Return a copy of a validation frame with adjusted future price-change flags.

    For every row, the flag at the position of the first horizon with positive
    sales is switched on, 'price_change_1' is then forced to False for all rows,
    and rows whose target columns are all falsy get every flag reset to False.
    The input frame is left untouched.
    """
    out = df.copy()

    flags = np.array(out[leak_cols])
    sold = np.array(out[target_cols] > 0)
    # argmax yields the first True per row (0 when the row has no positive sales;
    # that position is overwritten just below anyway).
    first_sold_pos = sold.argmax(axis=1)
    row_pos = np.arange(sold.shape[0])
    flags[row_pos, first_sold_pos] = True

    out[leak_cols] = flags
    out['price_change_1'] = False

    rows_without_sales = ~out[target_cols].any(axis=1)
    out.loc[rows_without_sales, leak_cols] = False

    return out

In [None]:
timdelta_train_days = 30
timdelta_valid_days = 14

# Validation anchors: 14, 21, 28, 35 and 42 days before the last training date.
last_train_date = train['date'].max()
valid_date_list = [last_train_date - timedelta(days=offset) for offset in (14, 21, 28, 35, 42)]

# For every fold the training frame is anchored `timdelta_valid_days` earlier
# than the validation frame.
train_list, valid_list = [], []
for fold_valid_date in tqdm(valid_date_list):
    fold_train_date = fold_valid_date - timedelta(days=timdelta_valid_days)
    fold_train = get_df(train, fold_train_date)
    fold_val = edit_valid_price_change(get_df(train, fold_valid_date))
    train_list.append(fold_train)
    valid_list.append(fold_val)

# NN model and Catalyst setup

In [None]:
import torch
import torch.nn as nn
import catalyst

from catalyst import dl, metrics
from os.path import join as pjoin
from catalyst import callbacks

from nn_utils import MetricWrapper, TableDataset, SAE, CustomRunner 

In [None]:
class ForecastingFeedForward(nn.Module):
    """Feed-forward forecaster over two id embeddings plus batch-normalised continuous inputs.

    Embedding table sizes come from the notebook-level label encoders; the
    continuous input width is len(float_cols) + len(leak_cols) and the output
    width is len(target_cols).
    """

    def __init__(self):
        super().__init__()

        sku_emb_dim, geo_emb_dim = 128, 64
        n_cont_features = len(float_cols) + len(leak_cols)

        # Creation order of the sub-modules is kept as-is (embeddings, BN, MLP).
        self.sku_id_emb = nn.Embedding(len(sku_id_encoder.classes_), sku_emb_dim)
        self.geo_cluster_id_emb = nn.Embedding(len(geo_cluster_id_encoder.classes_), geo_emb_dim)
        self.cont_features_bn = nn.BatchNorm1d(n_cont_features)

        # Linear -> BatchNorm -> PReLU blocks followed by a plain linear head.
        layer_widths = [sku_emb_dim + geo_emb_dim + n_cont_features, 256, 512, 128]
        layers = []
        for width_in, width_out in zip(layer_widths[:-1], layer_widths[1:]):
            layers.append(nn.Linear(width_in, width_out))
            layers.append(nn.BatchNorm1d(width_out))
            layers.append(nn.PReLU(width_out))
        layers.append(nn.Linear(layer_widths[-1], len(target_cols)))
        self.feed_forward = nn.Sequential(*layers)

    def forward(self, int_f, float_f):
        """Run the network.

        int_f   : integer tensor; column 0 indexes the geo-cluster embedding and
                  column 1 the sku embedding (presumably the `int_cols` order —
                  verify against TableDataset).
        float_f : continuous features, batch-normalised before concatenation.
        """
        geo_vec = self.geo_cluster_id_emb(int_f[:, 0])
        sku_vec = self.sku_id_emb(int_f[:, 1])
        cont_vec = self.cont_features_bn(float_f)
        joined = torch.cat((geo_vec, sku_vec, cont_vec), dim=-1)
        return self.feed_forward(joined)

# Test on fold

In [None]:
# Experiment configuration.
EXP_NAME = "baseline_tmax50_milr1e6_emb10xLR_lr3e4_bs256_nodrop_ver2"  # used in log dirs and the submission file name
BS = 256   # DataLoader batch size
LR = 3e-4  # base learning rate; the embedding tables are trained at 10x this value

In [None]:
def all_metric(y_true, y_pred):
    """Mean over days of (summed absolute error / summed true value).

    Both inputs are 2-D: axis 0 indexes items, axis 1 indexes days.
    """
    abs_error_per_day = np.abs(y_true - y_pred).sum(0)
    true_total_per_day = y_true.sum(0)
    return np.mean(abs_error_per_day / true_total_per_day)


def public_metric(y_true, y_pred):
    """all_metric restricted to the first seven day columns."""
    first_week = slice(None, 7)
    return all_metric(y_true[:, first_week], y_pred[:, first_week])


def private_metric(y_true, y_pred):
    """all_metric restricted to the day columns from index 7 onwards."""
    later_days = slice(7, None)
    return all_metric(y_true[:, later_days], y_pred[:, later_days])

def metric(target, predict):
    """Summed absolute error divided by the summed target, for a single horizon.

    `target` must support `.abs()` on the difference (a pandas Series in this notebook).
    """
    abs_error = (target - predict).abs()
    return abs_error.sum() / target.sum()

In [None]:
def _make_loader(frame, shuffle, drop_last):
    """Wrap a modelling frame in a TableDataset-backed DataLoader with the notebook's column groups."""
    dataset = TableDataset(frame, float_cols=float_cols + leak_cols, int_cols=int_cols, target_cols=target_cols)
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=BS,
        shuffle=shuffle,
        drop_last=drop_last
    )


def train_one_fold(input_train_fold, input_val_fold, fold_id, is_test=False, epoch=None, seed=42):
    """Train one ForecastingFeedForward model and return its predictions for `input_val_fold`.

    Parameters
    ----------
    input_train_fold : frame used for the "train" loader (shuffled, last partial batch dropped).
    input_val_fold : frame that is predicted at the end; when `is_test` is False
        it is also used as the "valid" loader for model selection and early stopping.
    fold_id : only used to name the log directory in CV mode.
    is_test : True trains without a validation loader (no early stopping, the
        scheduler callback is keyed on the train loader) and logs under
        "test_{seed}_{epoch}".
    epoch : number of epochs; 50 when None.
    seed : passed to numpy's global RNG and to `runner.train`.

    Returns
    -------
    np.ndarray — the per-batch outputs of `runner.predict_loader` concatenated on axis 0.
    """
    # NOTE(review): only numpy is seeded here and the model is instantiated before
    # runner.train receives `seed`; confirm whether weight init needs to be reproducible.
    np.random.seed(seed)

    loaders = {"train": _make_loader(input_train_fold, shuffle=True, drop_last=True)}
    if not is_test:
        loaders["valid"] = _make_loader(input_val_fold, shuffle=False, drop_last=False)

    model = ForecastingFeedForward()
    criterion = SAE()
    # Embedding tables are trained with a 10x larger learning rate than the rest.
    optimizer = torch.optim.Adam([
        {"params": model.sku_id_emb.parameters(), "lr": LR*10},
        {"params": model.geo_cluster_id_emb.parameters(), "lr": LR*10},
        {"params": model.cont_features_bn.parameters(), "lr": LR},
        {"params": model.feed_forward.parameters(), "lr": LR},
    ])
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=50, eta_min=1e-6
    )

    runner = CustomRunner()
    cat_callbacks = [
        callbacks.OptimizerCallback(
            metric_key="loss", accumulation_steps=1
        ),
        callbacks.SchedulerCallback(
            loader_key="train" if is_test else "valid", metric_key="tgt_metric", mode="epoch"
        ),
        MetricWrapper(
            metric_name="tgt_metric",
            metric_func=all_metric
        ),
        MetricWrapper(
            metric_name="public_metric",
            metric_func=public_metric
        ),
        MetricWrapper(
            metric_name="private_metric",
            metric_func=private_metric
        )
    ]
    if not is_test:
        cat_callbacks.append(callbacks.EarlyStoppingCallback(patience=7, loader_key="valid", metric_key="tgt_metric", minimize=True,))

    if is_test:
        logdir = pjoin("logdir", EXP_NAME, f"test_{seed}_{epoch}")
    else:
        logdir = pjoin("logdir", EXP_NAME, f"fold_{fold_id}")

    runner.train(
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        scheduler=scheduler,
        loaders=loaders,
        logdir=logdir,
        valid_loader=None if is_test else "valid",
        valid_metric=None if is_test else "tgt_metric",
        minimize_valid_metric=True,
        num_epochs=50 if epoch is None else epoch,
        verbose=True,
        load_best_on_end=True,
        timeit=True,
        callbacks=cat_callbacks,
        seed=seed
    )

    # In test mode no "valid" loader was built for training, so create it now for inference.
    if is_test:
        loaders = {"valid": _make_loader(input_val_fold, shuffle=False, drop_last=False)}

    val_preds = [prediction for prediction in runner.predict_loader(loader=loaders["valid"])]
    return np.concatenate(val_preds, axis=0)

# Train

In [None]:
# One row per fold, one column per forecast horizon; each cell holds `metric`
# computed for that fold and horizon.
results = pd.DataFrame(index=range(len(train_list)), columns=target_cols)
for i, (fold_valid, fold_train) in enumerate(zip(valid_list, train_list)):
    val_forecast = train_one_fold(fold_train, fold_valid, fold_id=i)
    for target_col_id, target_col in enumerate(target_cols):
        fold_score = metric(fold_valid[target_col], val_forecast[:, target_col_id])
        results.loc[i, target_col] = fold_score

# Horizons 1-7 are reported as "public", horizons 8-14 as "private".
public_cols, private_cols = target_cols[:7], target_cols[7:]
print('public:', results[public_cols].mean().mean())
print('private:', results[private_cols].mean().mean())
print(results.mean(axis=1).to_frame().T, '\n')  # per-fold average
print(results.mean(axis=0).to_frame().T, '\n')  # per-horizon average
results

In [None]:
# Same summary as printed after the CV loop: horizons 1-7 ("public"),
# horizons 8-14 ("private"), then per-fold and per-horizon averages.
public_cols, private_cols = target_cols[:7], target_cols[7:]
print('public:', results[public_cols].mean().mean())
print('private:', results[private_cols].mean().mean())
print(results.mean(axis=1).to_frame().T, '\n')
print(results.mean(axis=0).to_frame().T, '\n')

# predict

In [None]:
# Final training frame: anchored 14 days before the last training date.
last_train_date = train['date'].max()
test_train = get_df(train, last_train_date - timedelta(days=14))

# Inference frame: features come from train only, future price-change flags
# come from the test period stacked on top of train.
test_test_feature = get_feature(train, last_train_date)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the same rows in the same order.
price_change_history = pd.concat([test, train]).sort_values(['geo_cluster_id', 'sku_id', 'date'])
test_test_price_change = get_price_change(price_change_history, last_train_date)
test_test = test_test_feature.merge(test_test_price_change, on=['geo_cluster_id', 'sku_id', 'date'])
# Placeholder targets so the frame has the same columns as a training frame.
test_test[target_cols] = 0.0

In [None]:
# Train one model per (seed, epoch budget) pair — 3 x 3 = 9 models — and
# collect each model's forecast for the inference frame.
ensemble_seeds = [42, 2021, 12345]
ensemble_epochs = [16, 18, 19]

test_forecasts = []
for s_v in ensemble_seeds:
    for e_v in ensemble_epochs:
        test_forecast = train_one_fold(
            test_train, test_test, fold_id=0, is_test=True, epoch=e_v, seed=s_v
        )
        test_forecasts.append(test_forecast)

In [None]:
# Average the ensemble members element-wise. This rebinds `test_forecasts` from
# a list of arrays to a single array, so the cell is not safe to run twice.
test_forecasts = np.mean(np.stack(test_forecasts, axis=0), axis=0)

In [None]:
# Turn the (series x horizon) forecast matrix into long format: forecast column
# k is paired with the k-th sorted test date.
test_dates = np.sort(test['date'].unique())
per_day_frames = []
for target_col_id, (target_col, date) in enumerate(zip(tqdm(target_cols), test_dates)):
    day_frame = test_test[['geo_cluster_id', 'sku_id']].assign(
        date=date,
        pred=test_forecasts[:, target_col_id],
    )
    per_day_frames.append(day_frame)

test_predictions = pd.concat(per_day_frames).reset_index(drop=True)

In [None]:
sub = pd.read_csv('../data/raw/sample_final.csv')

# Left-join predictions onto the test rows; rows without a prediction get 0.
test_predicted = (
    test.reset_index()
    .merge(test_predictions, on=['geo_cluster_id', 'sku_id', 'date'], how='left')
    .fillna(0)
)

# Predictions are copied positionally, then negative values are floored at 0.
# NOTE(review): this relies on `sub` and `test` having the same row order — confirm.
sub['sales'] = test_predicted['pred'].tolist()
negative_rows = sub['sales'] < 0
sub.loc[negative_rows, 'sales'] = 0
sub['sales'].mean()

In [None]:
# Write the submission; the file name embeds the experiment name.
submission_path = f'../submissions/{EXP_NAME}_9models_leshapreproc.csv'
sub.to_csv(submission_path, index=False)