In [1]:
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import sys

import math
import sklearn.metrics as skl_metrics

from datetime import timedelta
from logging import StreamHandler, DEBUG, Formatter, FileHandler, getLogger

# Notebook-wide logger: INFO and above go to the notebook output, DEBUG and
# above are appended to a log file under DIR.
logger = getLogger(__name__)
pd.options.mode.chained_assignment = None  # default='warn'

DIR = '../logs/'

log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s ')

# getLogger() hands back the same logger object every time this cell runs, so
# without this cleanup each re-run would attach another pair of handlers and
# every message would be emitted (and written to the file) multiple times.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Console handler: only INFO and above.
handler = StreamHandler()
handler.setLevel('INFO')
handler.setFormatter(log_fmt)
logger.addHandler(handler)

# File handler: everything from DEBUG upwards, opened in append mode.
# NOTE(review): FileHandler raises if the DIR directory does not exist.
handler = FileHandler(DIR + 'train.py.log', 'a')
handler.setLevel(DEBUG)
handler.setFormatter(log_fmt)
logger.setLevel(DEBUG)
logger.addHandler(handler)

logger.info('start')

#------------------------------------------------------------------------------------#

def _log1p_sales(raw_value):
    """CSV converter for unit_sales: log1p of positive values, 0 otherwise."""
    sales = float(raw_value)
    return np.log1p(sales) if sales > 0 else 0


# Training rows: columns 1-5 of the file, with unit_sales transformed by the
# converter above while reading and "date" parsed into datetimes.
df_train = pd.read_csv(
    '../input/train_2s.csv',
    usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': _log1p_sales},
    parse_dates=["date"],
)

# Test rows: columns 0-4 of the file, indexed by (store_nbr, item_nbr, date).
df_test = pd.read_csv(
    "../input/test_2s.csv",
    usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"],
)
df_test = df_test.set_index(['store_nbr', 'item_nbr', 'date'])




2018-01-04 11:52:32,747 __main__ 32 [INFO][<module>] start 
  interactivity=interactivity, compiler=compiler, result=result)


In [33]:
# Total of the unit_sales column as loaded, i.e. after the log1p converter
# applied by read_csv (non-positive values were mapped to 0).
df_train['unit_sales'].sum()

6889023.974129723

In [38]:
# (rows, columns) of the loaded training frame.
df_train.shape

(4619173, 5)

In [28]:
# Collapse the training rows to one row per (item_nbr, date):
#   unit_sales -> sum over the grouped rows
#   store_nbr  -> number of grouped rows with a store_nbr value
df_train_item = (
    df_train[['item_nbr', 'date', 'store_nbr', 'unit_sales']]
    .groupby(['item_nbr', 'date'])
    .agg({'unit_sales': 'sum', 'store_nbr': 'count'})
    .reset_index()
)


In [37]:
# Peek at the last two rows of the per-(item, date) aggregate.
# NOTE(review): the saved output already shows item_avg_sales, which is only
# added by a cell further down; on a fresh top-to-bottom run this preview
# will not contain that column.
df_train_item.tail(2)

Unnamed: 0,item_nbr,date,unit_sales,store_nbr,item_avg_sales
3117097,2124052,2017-08-15,0.693147,1,0.693147
3117098,2127114,2017-08-08,0.693147,1,0.693147


In [35]:
# (rows, columns) of the per-(item, date) aggregate.
# NOTE(review): the saved column count of 5 includes item_avg_sales, which is
# created in a later cell.
df_train_item.shape

(3117099, 5)

In [29]:
    
# Per-(item, date) average: summed unit_sales divided by the store_nbr count
# from the aggregation. unit_sales was log1p-transformed on load, so this is
# an average of transformed values.
df_train_item["item_avg_sales"] = df_train_item["unit_sales"].div(df_train_item["store_nbr"])


In [30]:
# Total unit_sales after aggregation; presumably a sanity check against the
# same total on df_train (the two should agree up to float rounding).
df_train_item['unit_sales'].sum()

6889023.974014039

In [41]:
# Preview the three columns that feed the wide per-item table built later.
df_train_item[["item_nbr", "date", "item_avg_sales"]].head(2)

Unnamed: 0,item_nbr,date,item_avg_sales
0,96995,2013-01-10,0.693147
1,96995,2013-01-11,0.693147


In [39]:

###############################################################################
# Functions


def get_timespan(df, dt, minus, periods, freq='D'):
    """Select the columns of ``df`` that fall on a regular run of dates.

    The run starts ``minus`` days before ``dt`` and contains ``periods``
    dates spaced ``freq`` apart; those dates are used as column labels.

    Parameters
    ----------
    df : DataFrame whose column labels are dates.
    dt : reference date; ``timedelta`` is subtracted from it.
    minus : number of days before ``dt`` at which the run starts.
    periods : number of dates (columns) to select.
    freq : spacing between consecutive dates, passed to ``pd.date_range``.

    Returns
    -------
    DataFrame with one column per date in the run.
    """
    first_day = dt - timedelta(days=minus)
    wanted_columns = pd.date_range(first_day, periods=periods, freq=freq)
    return df[wanted_columns]


def prepare_dataset(t2017, is_train=True):
    """Build the feature frame (and optionally the targets) for date ``t2017``.

    Reads the notebook-level frames ``df_2017`` (its columns are selected by
    date through ``get_timespan``) and ``df_2017_nbr`` (supplies ``item_nbr``).

    Features produced:
      * ``item_nbr`` and the reference ``date``;
      * ``day_1_2017``: the value one day before ``t2017``;
      * ``mean_<n>_2017`` for n in 7, 21, 42, 91, 182, 364: row-wise mean over
        the ``n`` days immediately before ``t2017``;
      * ``dow_<w>_<i>_mean`` for w in 4, 13, 26, 52 and i in 0..6: row-wise
        mean over ``w`` dates spaced 7 days apart, starting ``7*w - i`` days
        before ``t2017``.

    Parameters
    ----------
    t2017 : reference date; must support subtracting a ``timedelta``.
    is_train : when True, also return the target matrix.

    Returns
    -------
    ``X`` alone, or ``(X, y)`` when ``is_train`` is True, where ``y`` holds
    the ``df_2017`` values for the 16 days starting at ``t2017``.
    """
    def trailing_mean(days):
        # Mean over the `days` consecutive days that end the day before t2017.
        return get_timespan(df_2017, t2017, days, days).mean(axis=1).values

    feature_columns = {
        "item_nbr": df_2017_nbr.item_nbr,
        "date": t2017,
        "day_1_2017": get_timespan(df_2017, t2017, 1, 1).values.ravel(),
    }
    for days in (7, 21, 42, 91, 182, 364):
        feature_columns["mean_{}_2017".format(days)] = trailing_mean(days)
    X = pd.DataFrame(feature_columns)

    # Same-weekday averages: offset i shifts the weekly sampling grid by i days.
    for i in range(7):
        for weeks in (4, 13, 26, 52):
            weekly_samples = get_timespan(
                df_2017, t2017, 7 * weeks - i, weeks, freq='7D')
            X['dow_{}_{}_mean'.format(weeks, i)] = weekly_samples.mean(axis=1).values

    if not is_train:
        return X

    # Targets: the 16 days starting at the reference date itself.
    target_days = pd.date_range(t2017, periods=16)
    y = df_2017[target_days].values
    return X, y



In [None]:
# NOTE(review): this repeats a preview shown in an earlier cell and was never
# executed (no execution count); candidate for removal.
df_train_item[["item_nbr", "date", "item_avg_sales"]].head(2)

In [57]:
# Pivot the long per-(item, date) table into a wide one: one row per item_nbr,
# one column per date, missing combinations filled with 0. Selecting with a
# list keeps a DataFrame, so the unstacked columns have two levels
# (item_avg_sales, date).
df_2017 = (
    df_train_item
    .set_index(["item_nbr", "date"])
    .loc[:, ["item_avg_sales"]]
    .unstack(level="date")
    .fillna(0)
)



In [51]:
# Keep only the date level of the two-level columns produced by the unstack.
# The guard makes the cell safe to re-run: once the columns are flat,
# get_level_values(1) would raise because there is no second level.
if df_2017.columns.nlevels > 1:
    df_2017.columns = df_2017.columns.get_level_values(1)

In [52]:
# Inspect the column labels of the wide table (expected: one entry per date).
df_2017.columns

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06', '2013-01-07', '2013-01-08',
               '2013-01-09', '2013-01-10',
               ...
               '2017-08-06', '2017-08-07', '2017-08-08', '2017-08-09',
               '2017-08-10', '2017-08-11', '2017-08-12', '2017-08-13',
               '2017-08-14', '2017-08-15'],
              dtype='datetime64[ns]', name='date', length=1684, freq=None)

In [53]:
# Independent copy of the wide table with the row index moved into a regular
# column, so that item_nbr can be read as a column (see prepare_dataset).
df_2017_nbr = df_2017.reset_index()

In [56]:

# Set an all-zero column for each of these dates (presumably days with no rows
# in the training data -- TODO confirm) so that date-window lookups on
# df_2017 find them. If a column for one of these dates already exists it is
# overwritten with zeros.
# pd.Timestamp replaces pd.datetime, which was deprecated in pandas 1.0 and
# removed in pandas 2.0.
for filler_day in ("2017-01-01", "2016-01-01", "2015-01-01", "2015-07-07"):
    df_2017[pd.Timestamp(filler_day)] = 0
