In [1]:
from pathlib import Path
import warnings
from loader import DatasetLoader
from loader.transforms import (
    LagTransform, 
    Transform, 
    MovingAvgTransform,
    HoltWintersPredictTransform,
    IsWeekendTransform,
    WeekdayTransform
)
from loader.ts_dataset import TsDataset

# Silence every warning for the rest of the session (no category, message or
# module filter is given, so the "ignore" action applies to all of them).
# NOTE(review): this also hides library warnings that may point at real
# problems in later cells — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [41]:
# Which slice of the processed data to load.
root = Path('../data/processed/')
product_name = 'balance'
city_id = 1

loader = DatasetLoader(root, product_name=product_name, city_id=city_id)
df = loader.get_dataset()

# Drop older history: the Holt-Winters transform applied later is slow on a
# long series, so only rows after this date are kept.
last_year_date = '2021-10-06'
is_recent = df['date'] > last_year_date
df = df.loc[is_recent]
print(len(df))

df.head(4)

df length 1946
df length 240
240


Unnamed: 0,date,target,new_orders_count,custom_orders_rate
0,2022-03-02,68.0,8.0,0.4718
1,2022-03-03,71.0,5.0,0.5841
2,2022-03-04,42.0,5.0,0.45
3,2022-03-05,72.0,5.0,0.5195


In [42]:
# Forecast horizon in rows: each row's features are paired with the target
# value found HORIZON rows later, and the last HORIZON rows (which have no
# future target) are dropped.
HORIZON = 3

# Build a new frame with `assign` instead of writing through `.loc` into an
# `iloc` slice: assigning into a slice is the pattern that triggers
# SettingWithCopyWarning (hidden by the notebook's blanket warnings filter)
# and relies on pandas view/copy semantics. The values are passed as a plain
# list so they are matched by position, not by index label.
# NOTE: this cell is not idempotent — re-running it shifts the target again.
lagged_target = df.target.iloc[HORIZON:].tolist()
df = df.iloc[:-HORIZON].assign(target=lagged_target)
df.head(4)

Unnamed: 0,date,target,new_orders_count,custom_orders_rate
0,2022-03-02,72.0,8.0,0.4718
1,2022-03-03,65.0,5.0,0.5841
2,2022-03-04,78.0,5.0,0.45
3,2022-03-05,96.0,5.0,0.5195


In [43]:
# Re-run shortcut kept from the original (disabled) — presumably used to reuse
# an already computed Holt-Winters column instead of recomputing it:
#   hw_col = df.hws_1d__target               # before rebuilding the features
#   df = df.assign(hws_1d__target=hw_col)    # after defining `trs`

# Forecast-style feature derived from the target column.
forecast_transforms = [
    HoltWintersPredictTransform(name='hws_1d', column='target'),
]

# Calendar features derived from the date column.
calendar_transforms = [
    WeekdayTransform(name='weekday', column='date'),
    IsWeekendTransform(name='weekend', column='date'),
]

# Moving averages of the target over 7 and 14 rows.
# Disabled experiments (not applied):
#   - LagTransform on new_orders_count with value 2, 7, 9
#   - LagTransform on orders_count with value 5, 6, 7, 12
#   - MovingAvgTransform (value 7 and 14) on orders_count and new_orders_count
rolling_transforms = [
    MovingAvgTransform(name=label, column='target', value=window)
    for label, window in (('ma_7', 7), ('ma_14', 14))
]

# Order matters for the resulting column order: forecast, calendar, rolling.
trs = forecast_transforms + calendar_transforms + rolling_transforms

# Each transform takes the frame and returns the frame to use next.
for tr in trs:
    df = tr.transform(df)

df.head(4)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 187/187 [00:15<00:00, 12.08it/s]

cols Index(['date', 'target', 'new_orders_count', 'custom_orders_rate',
       'hws_1d__target'],
      dtype='object')





Unnamed: 0,date,target,new_orders_count,custom_orders_rate,hws_1d__target,weekday__date_0,weekday__date_1,weekday__date_2,weekday__date_3,weekday__date_4,weekday__date_5,weekday__date_6,weekend__date_False,weekend__date_True,ma_7_7_target,ma_14_14_target
0,2022-03-02,72.0,8.0,0.4718,72.0,0,0,1,0,0,0,0,0,1,72.0,72.0
1,2022-03-03,65.0,5.0,0.5841,72.0,0,0,0,1,0,0,0,0,1,72.0,72.0
2,2022-03-04,78.0,5.0,0.45,72.0,0,0,0,0,1,0,0,0,1,72.0,72.0
3,2022-03-05,96.0,5.0,0.5195,72.0,0,0,0,0,0,1,0,1,0,72.0,72.0


In [45]:


import torch
from torch import nn
from torch.utils.data import DataLoader

from statistics import mean
from sklearn.model_selection import train_test_split
from tensorboardX import SummaryWriter

from models import LstmMlp

In [48]:
# Seed torch so that weight initialisation is repeatable between runs.
SEED = 42
torch.manual_seed(SEED)

# Chronological split. `train_test_split` shuffles rows by default, which
# would scramble the row order before TsDataset windows it and place test rows
# in between training rows in time. With shuffle=False the first 80% of rows
# are used for training and the last 20% for testing.
# Assumes `df` is already ordered by date — TODO confirm against DatasetLoader.
train_df, test_df = train_test_split(df, test_size=0.2, shuffle=False)

# Window length handed to TsDataset (3 * 7 rows).
window_size = 7*3
train = DataLoader(
    TsDataset(train_df, window_size),
    batch_size=64,
    collate_fn=TsDataset.collate_fn,
    drop_last=True,
)
test = DataLoader(
    TsDataset(test_df, window_size),
    batch_size=4,
    collate_fn=TsDataset.collate_fn,
    drop_last=True,
)

loss = nn.L1Loss()
# Model input size is the column count minus two — presumably `date` and
# `target` are excluded from the features; verify against TsDataset/LstmMlp.
model = LstmMlp(df.shape[1]-2)
opt = torch.optim.Adadelta(model.parameters())
writer = SummaryWriter()

# Global batch counters used as the x-axis of the per-step TensorBoard scalars.
train_step = 0
test_step = 0

In [49]:
def _batch_smape(pred, y):
    """Return the symmetric MAPE of one batch as a Python float.

    Computes mean(|pred - y| / ((|pred| + |y|) / 2)), which lies in [0, 2].
    The original expression `a / b / 2` parsed as `a / (2 * b)` and therefore
    reported a quarter of this value. An element where both `pred` and `y`
    are zero yields nan, as it did before.
    """
    half_sum = (pred.abs() + y.abs()) / 2
    return ((pred - y).abs() / half_sum).mean().item()


# Start from +inf so the first evaluated epoch always becomes the best one,
# whatever the scale of the metric.
best_test_smape = float('inf')
best_test_epoch = None

for epoch in range(300):
    if epoch % 10 == 0:
        print(f'epoch {epoch}')

    # ---- training pass ----
    model.train()
    metrics = dict(loss=[], smape=[])

    for x, y in train:
        opt.zero_grad()

        pred = model(x)
        # nn.L1Loss takes (input, target); its default reduction already
        # returns the batch mean, so no extra .mean() is needed.
        err = loss(pred, y)
        err.backward()
        opt.step()

        err = err.item()
        err_smape = _batch_smape(pred.detach(), y)

        metrics['loss'].append(err)
        metrics['smape'].append(err_smape)

        writer.add_scalar('loss/step/train', err, train_step)
        writer.add_scalar('smape/step/train', err_smape, train_step)
        # Advance once per batch; advancing once per epoch made every batch
        # of an epoch log to the same step.
        train_step += 1

    writer.add_scalar('loss/epoch/train', mean(metrics['loss']), epoch)
    writer.add_scalar('smape/epoch/train', mean(metrics['smape']), epoch)

    # ---- evaluation pass ----
    model.eval()
    metrics = dict(loss=[], smape=[])

    with torch.no_grad():
        for x, y in test:
            pred = model(x)
            err = loss(pred, y).item()
            err_smape = _batch_smape(pred, y)

            metrics['loss'].append(err)
            metrics['smape'].append(err_smape)

            writer.add_scalar('loss/step/test', err, test_step)
            writer.add_scalar('smape/step/test', err_smape, test_step)
            test_step += 1

    epoch_smape = mean(metrics['smape'])
    writer.add_scalar('loss/epoch/test', mean(metrics['loss']), epoch)
    writer.add_scalar('smape/epoch/test', epoch_smape, epoch)

    # Rank epochs by their mean test SMAPE. Taking the minimum over batches
    # would select an epoch on the strength of one lucky mini-batch.
    if epoch_smape < best_test_smape:
        best_test_smape = epoch_smape
        best_test_epoch = epoch

# Disabled result reporting (values are formatted as text because
# file.write() only accepts str):
# print(f'achieve smape {best_test_smape:.3f} on epoch {best_test_epoch}')
# with open(f'res/{product_name}_{city_id}.txt', 'w') as f:
#     f.write(f'{best_test_smape}\n')
#     f.write(f'{best_test_epoch}')


epoch 0
epoch 10
epoch 20
epoch 30
epoch 40
epoch 50
epoch 60
epoch 70
epoch 80
epoch 90
epoch 100
epoch 110
epoch 120
epoch 130
epoch 140
epoch 150
epoch 160
epoch 170
epoch 180
epoch 190
epoch 200
epoch 210
epoch 220
epoch 230
epoch 240
epoch 250
epoch 260
epoch 270
epoch 280
epoch 290


In [77]:
# Product names, one entry per product.
s = [
    'balance',
    'basic',
    'breakfasts_2x',
    'daily',
    'detox',
    'fit',
    'fit_express',
    'm_fit',
    'power',
    'priem',
    'priem_plus',
    'super_fit',
]

# Pair every product name with the constant 2 and display the result.
# NOTE(review): the meaning of 2 is not shown in this notebook — confirm.
[[product, 2] for product in s]

[['balance', 2],
 ['basic', 2],
 ['breakfasts_2x', 2],
 ['daily', 2],
 ['detox', 2],
 ['fit', 2],
 ['fit_express', 2],
 ['m_fit', 2],
 ['power', 2],
 ['priem', 2],
 ['priem_plus', 2],
 ['super_fit', 2]]