In [118]:
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import sys

import math
import sklearn.metrics as skl_metrics

from datetime import timedelta
from logging import StreamHandler, DEBUG, Formatter, FileHandler, getLogger

logger = getLogger(__name__)
pd.options.mode.chained_assignment = None  # default='warn'

# Directory that receives the log file. FileHandler opens the file directly,
# so this directory has to exist before the cell is run.
DIR = '../logs/'

log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s][%(funcName)s] %(message)s ')

# getLogger(__name__) hands back the same logger object on every execution of
# this cell, so handlers attached by earlier runs are still present. Detach and
# close them first; otherwise each message is emitted once per previous run.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# Console handler: INFO and above.
handler = StreamHandler()
handler.setLevel('INFO')
handler.setFormatter(log_fmt)
logger.addHandler(handler)

# File handler: DEBUG and above, appended to the existing log file.
handler = FileHandler(DIR + 'train.py.log', 'a')
handler.setLevel(DEBUG)
handler.setFormatter(log_fmt)
logger.setLevel(DEBUG)
logger.addHandler(handler)

logger.info('start')

#------------------------------------------------------------------------------------#

# Training rows. Columns are picked by position (1-5); the displayed frame
# further down shows them as date, store_nbr, item_nbr, unit_sales,
# onpromotion. unit_sales stays on its raw scale: the log1p converter that
# used to be applied at load time is disabled.
df_train = pd.read_csv(
    '../input/train_1s.csv',
    usecols=[1, 2, 3, 4, 5],
    parse_dates=["date"],
    dtype={'onpromotion': bool},
)

# Test rows, columns 0-4 by position, keyed by (store_nbr, item_nbr, date) so
# the promotion flags can be unstacked into one column per date later on.
df_test = pd.read_csv(
    "../input/test_1s.csv",
    usecols=[0, 1, 2, 3, 4],
    parse_dates=["date"],
    dtype={'onpromotion': bool},
)
df_test = df_test.set_index(['store_nbr', 'item_nbr', 'date'])

# Item metadata, keyed by item number.
items = pd.read_csv("../input/items.csv")
items = items.set_index("item_nbr")



# Keep the history from 2016-05-01 onwards. pd.Timestamp is used instead of
# the pd.datetime alias, which was deprecated in pandas 1.0 and removed in 2.0.
df_2017 = df_train.loc[df_train.date >= pd.Timestamp(2016, 5, 1)]
#del df_train

# Promotion flags pivoted to one row per (store_nbr, item_nbr) and one column
# per date; combinations with no row on a given day count as "not on promotion".
promo_2017_train = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)
# Drop the outer "onpromotion" column level so the columns are plain dates.
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
# Align the test rows to the training rows (same order, same set of pairs) so
# the two frames can be placed side by side.
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

# Unit sales pivoted the same way; days without a row count as zero sales.
df_2017 = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1)

# One items row per row of df_2017, in the same order.
items = items.reindex(df_2017.index.get_level_values(1))

# Zero-filled columns for 2017-01-01 and 2016-12-25 so that date windows
# spanning them can be selected by label.
# NOTE(review): presumably these days have no rows in the training data
# (store closed) - confirm; if a column already exists it is overwritten.
df_2017[pd.Timestamp(2017, 1, 1)] = 0
df_2017[pd.Timestamp(2016, 12, 25)] = 0

2017-12-26 19:36:56,280 __main__ 32 [INFO][<module>] start 
2017-12-26 19:36:56,280 __main__ 32 [INFO][<module>] start 
2017-12-26 19:36:56,280 __main__ 32 [INFO][<module>] start 
2017-12-26 19:36:56,280 __main__ 32 [INFO][<module>] start 
2017-12-26 19:36:56,280 __main__ 32 [INFO][<module>] start 
2017-12-26 19:36:56,280 __main__ 32 [INFO][<module>] start 
2017-12-26 19:36:56,280 __main__ 32 [INFO][<module>] start 
2017-12-26 19:36:56,280 __main__ 32 [INFO][<module>] start 
2017-12-26 19:36:56,280 __main__ 32 [INFO][<module>] start 
2017-12-26 19:36:56,280 __main__ 32 [INFO][<module>] start 
2017-12-26 19:36:56,280 __main__ 32 [INFO][<module>] start 
2017-12-26 19:36:56,280 __main__ 32 [INFO][<module>] start 
  interactivity=interactivity, compiler=compiler, result=result)


In [119]:
def get_timespan(df, dt, minus, periods, freq='D'):
    """Select the date columns of ``df`` for a window anchored before ``dt``.

    The window starts ``minus`` days before ``dt`` and contains ``periods``
    dates spaced ``freq`` apart. Columns are looked up by label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are dates.
    dt : date-like
        Anchor date; must support subtraction of a ``timedelta``.
    minus : int
        Number of days before ``dt`` at which the window starts.
    periods : int
        Number of dates in the window.
    freq : str, default 'D'
        Spacing between consecutive dates, passed to ``pd.date_range``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` for the generated dates, in date order.
    """
    window_start = dt - timedelta(days=minus)
    window_dates = pd.date_range(window_start, periods=periods, freq=freq)
    return df[window_dates]

def prepare_dataset(t2017, is_train=True):
    """Build the feature frame (and optionally the targets) for one forecast window.

    Reads the module-level wide tables ``df_2017`` (unit sales) and
    ``promo_2017`` (promotion flags), which have one column per calendar day.
    Values are taken positionally, so both tables are assumed to list their
    rows in the same order.

    Parameters
    ----------
    t2017 : datetime.date, datetime.datetime or pandas.Timestamp
        First day of the 16-day forecast window. Every sales/promotion history
        feature uses only days strictly before this date.
    is_train : bool, default True
        When True the targets are returned as well.

    Returns
    -------
    X : pandas.DataFrame
        One row per row of ``df_2017`` with a fresh positional index.
    y : numpy.ndarray
        Only when ``is_train`` is True: the ``df_2017`` values for the 16 days
        starting at ``t2017`` (one column per day).
    """
    # Normalise once. Scalar label lookup on date columns with a plain
    # datetime.date key is not reliably accepted across pandas versions,
    # whereas a Timestamp always is; date arithmetic below is unaffected.
    t2017 = pd.Timestamp(t2017)

    def trailing(table, days):
        # Columns of `table` for the `days` days that end the day before t2017.
        return get_timespan(table, t2017, days, days)

    # Insertion order matches the original literal so column order is stable.
    features = {"day_1_2017": trailing(df_2017, 1).values.ravel()}
    for days in (3, 7, 14, 30, 60, 140, 21, 42, 91, 182, 364):
        features["mean_{}_2017".format(days)] = trailing(df_2017, days).mean(axis=1).values
    for days in (14, 60, 140):
        features["promo_{}_2017".format(days)] = trailing(promo_2017, days).sum(axis=1).values
    X = pd.DataFrame(features)

    # Means over weekly-spaced days: 4 (resp. 20) dates, seven days apart,
    # starting 28-i (resp. 140-i) days before t2017.
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df_2017, t2017, 140-i, 20, freq='7D').mean(axis=1).values

    # Promotion flag for each of the 16 days of the forecast window itself.
    for i in range(16):
        X["promo_{}".format(i)] = promo_2017[
            t2017 + timedelta(days=i)].values.astype(np.uint8)

    if is_train:
        y = df_2017[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    return X


In [120]:
logger.info('Preparing dataset...')

# Six training windows, one week apart, the first starting on 2017-05-31.
t2017 = date(2017, 5, 31)
X_l, y_l = [], []
for i in range(6):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    y_l.append(y_tmp)
# Stack the windows row-wise; X_train keeps each window's own 0..n-1 index.
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
# del X_l, y_l
# Validation window starts 2017-07-26; the test window starts 2017-08-16 and
# has no targets.
X_val, y_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

2017-12-26 19:37:01,508 __main__ 1 [INFO][<module>] Preparing datasetn... 
2017-12-26 19:37:01,508 __main__ 1 [INFO][<module>] Preparing datasetn... 
2017-12-26 19:37:01,508 __main__ 1 [INFO][<module>] Preparing datasetn... 
2017-12-26 19:37:01,508 __main__ 1 [INFO][<module>] Preparing datasetn... 
2017-12-26 19:37:01,508 __main__ 1 [INFO][<module>] Preparing datasetn... 
2017-12-26 19:37:01,508 __main__ 1 [INFO][<module>] Preparing datasetn... 
2017-12-26 19:37:01,508 __main__ 1 [INFO][<module>] Preparing datasetn... 
2017-12-26 19:37:01,508 __main__ 1 [INFO][<module>] Preparing datasetn... 
2017-12-26 19:37:01,508 __main__ 1 [INFO][<module>] Preparing datasetn... 
2017-12-26 19:37:01,508 __main__ 1 [INFO][<module>] Preparing datasetn... 
2017-12-26 19:37:01,508 __main__ 1 [INFO][<module>] Preparing datasetn... 
2017-12-26 19:37:01,508 __main__ 1 [INFO][<module>] Preparing datasetn... 


In [121]:
# X_tmp is the variable left over from the training-window loop, so this is
# the shape of the feature frame for the last window only.
X_tmp.shape

(3532, 45)

In [122]:
# First feature row of the stacked training windows.
X_train.head(1)

Unnamed: 0,day_1_2017,mean_140_2017,mean_14_2017,mean_182_2017,mean_21_2017,mean_30_2017,mean_364_2017,mean_3_2017,mean_42_2017,mean_60_2017,...,promo_6,promo_7,promo_8,promo_9,promo_10,promo_11,promo_12,promo_13,promo_14,promo_15
0,0.0,0.114286,0.214286,0.087912,0.190476,0.2,0.043956,0.333333,0.309524,0.266667,...,0,0,0,0,0,0,0,0,0,0


In [125]:
# First (store_nbr, item_nbr) row of the wide sales table, one column per date.
df_2017.head(1)

Unnamed: 0_level_0,date,2016-05-01 00:00:00,2016-05-02 00:00:00,2016-05-03 00:00:00,2016-05-04 00:00:00,2016-05-05 00:00:00,2016-05-06 00:00:00,2016-05-07 00:00:00,2016-05-08 00:00:00,2016-05-09 00:00:00,2016-05-10 00:00:00,...,2017-08-08 00:00:00,2017-08-09 00:00:00,2017-08-10 00:00:00,2017-08-11 00:00:00,2017-08-12 00:00:00,2017-08-13 00:00:00,2017-08-14 00:00:00,2017-08-15 00:00:00,2017-01-01 00:00:00,2016-12-25 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0


In [126]:
# Row/column counts and dtypes of the wide sales table. The two columns that
# were assigned the integer 0 after pivoting show up as int64.
df_2017.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3532 entries, (1, 96995) to (1, 2127114)
Columns: 472 entries, 2016-05-01 to 2016-12-25
dtypes: float64(470), int64(2)
memory usage: 12.8 MB


In [127]:

# Scratch state for stepping through one window by hand. Note that this
# rebinds X_l and y_l to empty lists, discarding the per-window results
# collected by the training loop.
t2017 = date(2017, 5, 31)
X_l = []
y_l = []
# Pinned to a single weekly offset instead of iterating over range(6).
i = 1
delta = timedelta(weeks=i)




In [128]:
# Trailing 7-day and 3-day mean sales before t2017, one row per row of df_2017.
X = pd.DataFrame({
    "mean_{}_2017".format(days): get_timespan(df_2017, t2017, days, days).mean(axis=1).values
    for days in (7, 3)
})

In [129]:
# First three (store_nbr, item_nbr) rows of the wide sales table.
df_2017.head(3)

Unnamed: 0_level_0,date,2016-05-01 00:00:00,2016-05-02 00:00:00,2016-05-03 00:00:00,2016-05-04 00:00:00,2016-05-05 00:00:00,2016-05-06 00:00:00,2016-05-07 00:00:00,2016-05-08 00:00:00,2016-05-09 00:00:00,2016-05-10 00:00:00,...,2017-08-08 00:00:00,2017-08-09 00:00:00,2017-08-10 00:00:00,2017-08-11 00:00:00,2017-08-12 00:00:00,2017-08-13 00:00:00,2017-08-14 00:00:00,2017-08-15 00:00:00,2017-01-01 00:00:00,2016-12-25 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0
1,99197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
1,103520,0.0,2.0,1.0,2.0,3.0,3.0,2.0,0.0,3.0,0.0,...,3.0,0.0,3.0,1.0,1.0,1.0,0.0,0.0,0,0


In [149]:
# Raw (long-format) training rows for item 96995 dated after 2016-05-01, to
# cross-check the pivoted values in df_2017 against the source data.
is_item = df_train['item_nbr'] == 96995
is_recent = df_train['date'] > '20160501'
df_train.loc[is_item & is_recent]

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
2307014,2017-04-07,1,96995,2.0,False
2325574,2017-04-17,1,96995,1.0,False
2333835,2017-04-21,1,96995,2.0,False
2336005,2017-04-22,1,96995,3.0,False
2343675,2017-04-26,1,96995,1.0,False
2347775,2017-04-28,1,96995,1.0,False
2366782,2017-05-08,1,96995,1.0,False
2368889,2017-05-09,1,96995,1.0,False
2374923,2017-05-12,1,96995,1.0,False
2400570,2017-05-25,1,96995,1.0,False


In [171]:
def get_timespan(df, dt, minus, periods, freq='D'):
    """Select the date columns of ``df`` for a window anchored before ``dt``.

    The window starts ``minus`` days before ``dt`` and contains ``periods``
    dates spaced ``freq`` apart. Columns are looked up by label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose column labels are dates.
    dt : date-like
        Anchor date; must support subtraction of a ``timedelta``.
    minus : int
        Number of days before ``dt`` at which the window starts.
    periods : int
        Number of dates in the window.
    freq : str, default 'D'
        Spacing between consecutive dates, passed to ``pd.date_range``.

    Returns
    -------
    pandas.DataFrame
        The columns of ``df`` for the generated dates, in date order.
    """
    window_start = dt - timedelta(days=minus)
    window_dates = pd.date_range(window_start, periods=periods, freq=freq)
    return df[window_dates]

In [175]:
# The dates a 15-day window starting 30 days before t2017 covers
# (timedelta's first positional argument is days).
pd.date_range(t2017 - timedelta(30), periods=15, freq='D')

DatetimeIndex(['2017-05-01', '2017-05-02', '2017-05-03', '2017-05-04',
               '2017-05-05', '2017-05-06', '2017-05-07', '2017-05-08',
               '2017-05-09', '2017-05-10', '2017-05-11', '2017-05-12',
               '2017-05-13', '2017-05-14', '2017-05-15'],
              dtype='datetime64[ns]', freq='D')

In [178]:
# Sales columns selected by that same window (30 days back, 15 daily periods),
# first row only.
get_timespan(df_2017, t2017, 30, 15,freq='D').head(1)

Unnamed: 0_level_0,date,2017-05-01 00:00:00,2017-05-02 00:00:00,2017-05-03 00:00:00,2017-05-04 00:00:00,2017-05-05 00:00:00,2017-05-06 00:00:00,2017-05-07 00:00:00,2017-05-08 00:00:00,2017-05-09 00:00:00,2017-05-10 00:00:00,2017-05-11 00:00:00,2017-05-12 00:00:00,2017-05-13 00:00:00,2017-05-14 00:00:00,2017-05-15 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,96995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0


In [176]:
# Mean sales over several windows before t2017, to compare a 15-day window
# that starts 30 days back against the full trailing 30/14/7-day windows.
# Each entry: (feature name, days back from t2017, number of daily periods).
window_specs = [
    ("mean_30_15_2017", 30, 15),
    ("mean_30_30_2017", 30, 30),
    ("mean_14_2017", 14, 14),
    ("mean_7_2017", 7, 7),
]
X = pd.DataFrame({
    name: get_timespan(df_2017, t2017, minus, periods, freq='D').mean(axis=1).values
    for name, minus, periods in window_specs
})
X.head(1)

Unnamed: 0,mean_14_2017,mean_30_15_2017,mean_30_30_2017,mean_7_2017
0,0.214286,0.2,0.2,0.428571


In [170]:
# NOTE(review): this cell's execution count (170) is lower than that of the
# cell defining X above (176), so the stored output appears to come from an
# earlier definition of X - re-run to refresh.
X.head(1)

Unnamed: 0,mean_14_2017,mean_30_15_2017,mean_30_30_2017,mean_7_2017
0,0.214286,0.0,0.2,0.428571


In [160]:
# Current anchor date used by the scratch cells.
t2017

datetime.date(2017, 5, 31)

In [131]:
# Targets for a single window: the df_2017 values for the 16 consecutive days
# starting at t2017, as a 2-D array with one row per row of df_2017.
target_dates = pd.date_range(t2017, periods=16)
y = df_2017[target_dates].values

In [132]:
# (number of df_2017 rows, 16 target days)
y.shape

(3532, 16)