## Тройное экспоненциальное сглаживание

In [1]:
import numpy as np
import pandas as pd
from statsmodels.tsa.api import ExponentialSmoothing
from datetime import timedelta

from sklearn.metrics import mean_absolute_error as mae

from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings('ignore')

In [2]:
traindf = pd.read_csv('data/train.csv', sep=',', parse_dates=['date'],index_col='date')
traindf = traindf.drop(columns='id')

In [3]:
testdf = pd.read_csv('data/test.csv', sep=',', parse_dates=['date'],index_col='date')
testdf = testdf.drop(columns='id')

In [4]:
traindf['label'] = 'train'
testdf['label'] = 'test'

df = pd.concat((traindf,testdf), axis = 0)
df['quantity'] = np.where(df.label == 'train', df.quantity, np.nan)

In [5]:
train_dates = pd.date_range(min(traindf.index), min(testdf.index) - timedelta(1))
test_dates  = pd.date_range(min(testdf.index), max(testdf.index))

In [6]:
week = 7

In [7]:
from sklearn.model_selection import TimeSeriesSplit
tss = TimeSeriesSplit(n_splits=3)

In [8]:
def do_validation(train):
    errors = dict()
    for nperiods in range(7, 31, 7):
        for train_idx, test_idx in tss.split(train):
            model = ExponentialSmoothing(np.asarray(train), seasonal_periods=nperiods, seasonal='add', trend='add').fit()
            forecast = pd.Series(model.forecast(len(test_idx)))
            errors.setdefault(nperiods, []).append(mae(train.iloc[test_idx].values, forecast.values))
    return min(errors, key=lambda k: np.mean(errors.get(k)))

In [9]:
def make_prediction(frame):
    mask = (df.product_id == frame.product_id[0]) & (df.warehouse_id == frame.warehouse_id[0])
    quantity = df[mask]['quantity'][:-week]
    if (len(quantity) == 0):
        df.loc[mask & (df.label == 'test'), 'quantity'] = pd.Series(np.zeros(week), index = test_dates)
        return

    train = pd.Series(index=train_dates, dtype='float64').combine_first(quantity).fillna(0)

    nperiods = do_validation(train)

    fit = ExponentialSmoothing(np.asarray(train), seasonal_periods=nperiods,trend='add', seasonal='add').fit()
    df.loc[mask & (df.label == 'test'), 'quantity'] = pd.Series(fit.forecast(week), index = test_dates)


testdf.groupby(['warehouse_id','product_id', 'label']).agg(make_prediction)

In [10]:
res = df[(df.label == 'test')].copy()
res

Unnamed: 0_level_0,warehouse_id,product_id,quantity,label
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-04-09,0,71165,2.466150,test
2021-04-09,1,71165,6.857162,test
2021-04-09,0,71170,-0.224253,test
2021-04-09,1,71170,0.054411,test
2021-04-09,0,71185,0.663822,test
...,...,...,...,...
2021-04-15,1,98615,0.020406,test
2021-04-15,0,98620,0.000000,test
2021-04-15,1,98620,0.020406,test
2021-04-15,0,98635,0.000000,test


In [11]:
res.reset_index(inplace=True)
res

Unnamed: 0,date,warehouse_id,product_id,quantity,label
0,2021-04-09,0,71165,2.466150,test
1,2021-04-09,1,71165,6.857162,test
2,2021-04-09,0,71170,-0.224253,test
3,2021-04-09,1,71170,0.054411,test
4,2021-04-09,0,71185,0.663822,test
...,...,...,...,...,...
46027,2021-04-15,1,98615,0.020406,test
46028,2021-04-15,0,98620,0.000000,test
46029,2021-04-15,1,98620,0.020406,test
46030,2021-04-15,0,98635,0.000000,test


In [20]:
testdf['product_id'].to_numpy()

array([71165, 71165, 71170, ..., 98620, 98635, 98635])

In [27]:
finish = res.drop(columns=['date', 'warehouse_id', 'product_id', 'label'])

In [29]:
finish.index.names = ['id']

In [32]:
finish

Unnamed: 0_level_0,quantity
id,Unnamed: 1_level_1
0,2.466150
1,6.857162
2,-0.224253
3,0.054411
4,0.663822
...,...
46027,0.020406
46028,0.000000
46029,0.020406
46030,0.000000


In [31]:
finish.to_csv('sub3.csv')

In [33]:
finish['quantity'] = finish['quantity'].astype(int)

In [37]:
finish.to_csv('sub4.csv')

## Линейная регрессия

In [None]:
import numpy as np
import pandas as pd

from datetime import datetime, timedelta
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt 

In [None]:
traindf = pd.read_csv('data/train.csv', sep=',', parse_dates=['date'],index_col='date')
traindf = traindf.drop(columns='id')

In [None]:
testdf = pd.read_csv('data/test.csv', sep=',', parse_dates=['date'],index_col='date')
testdf = testdf.drop(columns='id')

In [None]:
traindf['label'] = 'train'
testdf['label'] = 'test'

df = pd.concat((traindf,testdf), axis = 0)
df['quantity'] = np.where(df.label == 'train', df.quantity, np.nan)

In [None]:
df.sort_index(inplace=True)
df.sort_values(['product_id','warehouse_id'], inplace=True)

In [None]:
all_dates = pd.date_range(min(df.index), max(df.index))

In [None]:
lag_period = 1
nperiods = 28

features = []
for period in range(1, nperiods + 1, 1):
    df[f"lperiod_{period}"] = 0
    features.append(f"lperiod_{period}")

In [None]:
model = LinearRegression()

In [None]:
def lag_periods(quantities, mask):
    quantities = pd.Series(index=all_dates, dtype='float64').combine_first(quantities).fillna(0)
    for period in range(1, nperiods + 1, 1):
        df.loc[mask, f"lperiod_{period}"] = quantities.shift(period*lag_period)
        
def add_features(frame):
    print(frame)
    mask = (df.product_id == frame.product_id[0]) & (df.warehouse_id == frame.warehouse_id[0])
    lag_periods(df.loc[mask, 'quantity'], mask)

testdf.groupby(['warehouse_id','product_id', 'label']).agg(add_features)

In [None]:
df['lmean'] = df[features].mean(axis = 1)
features.extend(['lmean'])

In [None]:
def make_prediction(frame):
    mask = (df.product_id == frame.product_id[0]) & (df.warehouse_id == frame.warehouse_id[0])
    train_df = df[mask & (df.label == 'train')][features + ['quantity']].dropna()
    test_df = df[mask & (df.label == 'test')][features]
    
    if (train_df.shape[0] == 0):
        df.loc[mask & (df.label == 'test'), 'quantity'] = 0
        return

    model.fit(train_df.drop('quantity', axis = 1), train_df['quantity'])
    df.loc[mask & (df.label == 'test'), 'quantity'] = model.predict(test_df)

testdf.groupby(['warehouse_id','product_id', 'label']).agg(make_prediction)

In [None]:
test_df = df[(df.label == 'test')].copy()
test_df.drop(columns=features + ['label'], inplace=True)
test_df.reset_index(inplace=True)

In [None]:
test_df = test_df.sort_values(['date', 'product_id', 'warehouse_id'])
test_df.reset_index(inplace=True, drop=True)
test_df.head(5)

In [None]:
testdf.reset_index(inplace=True)
testdf.head(5)

In [None]:
test_df

In [None]:
# test_df = test_df.drop(columns=['date', 'product_id', 'warehouse_id'])
test_df.index.names = ['id']
test_df.to_csv('sub2.csv')

In [None]:
test_df

In [None]:
lin = read_csv('sub2.csv')

## Среднее

In [None]:
def pred(row):
    mask = (traindf.product_id == row.product_id) & (traindf.warehouse_id == row.warehouse_id)
    return traindf[mask].quantity.mean()

testdf['quantity'] = testdf.apply(pred, axis=1)

In [None]:
testdf.fillna(0, inplace=True)

In [None]:
testdf.drop(columns=['date', 'product_id', 'warehouse_id']).to_csv('sub1.csv')