In [14]:
"""
This is an upgraded version of Ceshine's LGBM starter script, simply adding
more average features and weekly average features on it.
"""

from datetime import date, datetime, timedelta
from logging import StreamHandler, DEBUG, Formatter, FileHandler, getLogger
import gc
import math
import sys

import lightgbm as lgb
import numpy as np
import pandas as pd
import sklearn.metrics as skl_metrics

# Module logger: INFO and above to the console, DEBUG and above to a log file.
logger = getLogger(__name__)

pd.options.mode.chained_assignment = None  # default='warn'

DIR = '../logs/'

# getLogger() returns the same logger object every time this cell runs in a
# live kernel, so handlers from earlier runs are still attached. Detach and
# close them first; otherwise each record is emitted once per accumulated
# handler (the captured output shows every line twice).
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

# NOTE: the backslash continuation keeps the next line's leading spaces inside
# the format string, which is why the output has a gap after "[LEVEL]".
log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s]\
    [%(funcName)s] %(message)s ')

# Console handler.
handler = StreamHandler()
handler.setLevel('INFO')
handler.setFormatter(log_fmt)
logger.addHandler(handler)

# File handler, opened in append mode. FileHandler does not create DIR.
handler = FileHandler(DIR + 'train.py.log', 'a')
handler.setLevel(DEBUG)
handler.setFormatter(log_fmt)
logger.setLevel(DEBUG)
logger.addHandler(handler)

logger.info('start')

# Read the three weather measurements as float32.
dtype_weather = {
    "TEMP": np.float32,
    "VISIB": np.float32,
    "PRCP": np.float32,
}

weather = pd.read_csv('../input/Weather_20180107.csv', dtype=dtype_weather, parse_dates=["YEARMODA"],)
# Plain datetime.date objects; these become the column labels of the pivoted
# weather frames built further down.
weather["date"] = pd.to_datetime(weather['YEARMODA'], format='%Y%m%d').dt.date
# Constant key shared by every row.
weather['ID'] = 1

# Reference dates used when building the feature frames. t2014 is not
# referenced anywhere in the visible cells.
t2014 = date(2014, 8, 6)
t2015 = date(2015, 8, 5)
t2016 = date(2016, 8, 3)
t2017 = date(2017, 5, 31)
# Number of consecutive weekly reference dates generated starting at t2017.
train_week_2017 = 9

logger.info('Load data successful')

###############################################################################
# Functions


def get_timespan(df, dt, minus, periods, freq='D'):
    """Select the columns of ``df`` labelled by a run of consecutive dates.

    The run starts ``minus`` days before ``dt`` (a negative ``minus`` starts
    after ``dt``) and contains ``periods`` labels spaced ``freq`` apart.

    Parameters
    ----------
    df : DataFrame whose column labels can be looked up with date labels.
    dt : reference date.
    minus : number of days subtracted from ``dt`` to get the first label.
    periods : number of labels to select.
    freq : pandas frequency string for the spacing between labels.

    Returns
    -------
    The result of indexing ``df`` with the generated ``pd.date_range``.
    """
    first_day = dt - timedelta(days=minus)
    wanted_labels = pd.date_range(first_day, periods=periods, freq=freq)
    return df[wanted_labels]



def prepare_dataset(t2017):
    """Build the weather feature frame for one reference date.

    Reads the module-level wide frames ``weather_temp``, ``weather_visib`` and
    ``weather_prcp``. For each of them and for every ``i`` in 0..15 and ``j``
    in 0..6, one column is added holding the value recorded on
    ``t2017 - (i + j - 3)`` days, i.e. days ranging from 3 days after the
    reference date back to 18 days before it. Pairs with the same ``i + j``
    therefore hold the same day's value.

    Parameters
    ----------
    t2017 : reference date; also stored in the ``date`` column.

    Returns
    -------
    DataFrame with ``ID``, ``date`` and the ``TEMP_*``, ``VISIB_*`` and
    ``PRCP_*`` columns, in that order.
    """
    features = pd.DataFrame({
        "ID": weather_temp.ID,
        "date": (t2017),
    })

    # (column-name template, wide source frame), in output column order.
    sources = (
        ("TEMP_{}_d{}", weather_temp),
        ("VISIB_{}_d{}", weather_visib),
        ("PRCP_{}_d{}", weather_prcp),
    )

    for name_template, wide_frame in sources:
        for i in range(16):
            for j in range(7):
                one_day = get_timespan(wide_frame, t2017, i + j - 3, 1)
                features[name_template.format(i, j)] = one_day.values.ravel()

    return features

###############################################################################

def _pivot_weather(measure):
    """Return ``weather[measure]`` reshaped to one row per ID, one column per date.

    (ID, date) combinations with no value are filled with 0. The column level
    holding the measure name is dropped, so the columns are ``ID`` followed by
    one column per date label.
    """
    wide = weather[['ID', 'date', measure]].set_index(
        ['ID', 'date'])[[measure]].unstack(
            level=-1).fillna(0)
    wide.columns = wide.columns.get_level_values(1)
    wide.reset_index(inplace=True)
    return wide


weather_temp = _pivot_weather('TEMP')
weather_visib = _pivot_weather('VISIB')
weather_prcp = _pivot_weather('PRCP')

# Days that receive an explicit all-zero column in every wide frame.
# Presumably these days have no observation in the weather file -- TODO confirm.
# The labels are datetime.datetime objects: this is the type `pd.datetime`
# used to alias. `pd.datetime` itself was deprecated in pandas 1.0 and removed
# in pandas 2.0, so the standard-library class is used directly.
ZERO_FILLED_DAYS = (
    datetime(2017, 6, 28),
    datetime(2016, 7, 15),
    datetime(2016, 7, 16),
    datetime(2016, 7, 17),
    datetime(2016, 7, 18),
)

for _wide_frame in (weather_temp, weather_visib, weather_prcp):
    for _day in ZERO_FILLED_DAYS:
        _wide_frame[_day] = 0
##########################################################################

logger.info('Preparing traing dataset...')

X_l = []

# One entry per training period, processed in this order:
# (log message, first reference date, number of weekly reference dates).
training_plan = (
    ('Preparing 2015 training dataset...', t2015, 4),
    ('Preparing 2017 training dataset...', t2017, train_week_2017),
    ('Preparing 2016 training dataset...', t2016, 4),
)

for announcement, first_day, n_weeks in training_plan:
    logger.info(announcement)
    for i in range(n_weeks):
        # Reference dates are spaced exactly seven days apart.
        X_l.append(prepare_dataset(first_day + timedelta(days=7 * i)))

# Stack the per-date frames; each frame keeps its own row index.
X_train = pd.concat(X_l, axis=0)

del X_l

delta = timedelta(0)

# Single reference dates for the validation and test frames.
X_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16))

##########################################################################
# output

for frame, destination in (
    (X_train, '../data/weather_train.p'),
    (X_val, '../data/weather_val.p'),
    (X_test, '../data/weather_test.p'),
):
    frame.to_pickle(destination)
 


2018-01-09 19:30:59,861 __main__ 36 [INFO]    [<module>] start 
2018-01-09 19:30:59,861 __main__ 36 [INFO]    [<module>] start 
2018-01-09 19:30:59,870 __main__ 53 [INFO]    [<module>] Load data successful 
2018-01-09 19:30:59,870 __main__ 53 [INFO]    [<module>] Load data successful 
2018-01-09 19:30:59,913 __main__ 131 [INFO]    [<module>] Preparing traing dataset... 
2018-01-09 19:30:59,913 __main__ 131 [INFO]    [<module>] Preparing traing dataset... 
2018-01-09 19:30:59,916 __main__ 138 [INFO]    [<module>] Preparing 2015 training dataset... 
2018-01-09 19:30:59,916 __main__ 138 [INFO]    [<module>] Preparing 2015 training dataset... 
2018-01-09 19:31:02,245 __main__ 146 [INFO]    [<module>] Preparing 2017 training dataset... 
2018-01-09 19:31:02,245 __main__ 146 [INFO]    [<module>] Preparing 2017 training dataset... 
2018-01-09 19:31:07,533 __main__ 155 [INFO]    [<module>] Preparing 2016 training dataset... 
2018-01-09 19:31:07,533 __main__ 155 [INFO]    [<module>] Preparing 20

In [15]:
# Preview the first row of the stacked training weather frame.
X_train.head(1)

Unnamed: 0,ID,date,TEMP_0_d0,TEMP_0_d1,TEMP_0_d2,TEMP_0_d3,TEMP_0_d4,TEMP_0_d5,TEMP_0_d6,TEMP_1_d0,...,PRCP_14_d4,PRCP_14_d5,PRCP_14_d6,PRCP_15_d0,PRCP_15_d1,PRCP_15_d2,PRCP_15_d3,PRCP_15_d4,PRCP_15_d5,PRCP_15_d6
0,1,2015-08-05,60.700001,59.599998,62.400002,59.099998,59.400002,61.900002,58.900002,59.599998,...,0.03,0.12,0.16,0.12,0.04,0.0,0.03,0.12,0.16,0.39


In [16]:
# (rows, columns) of the training weather frame: one row per reference date.
X_train.shape

(17, 338)

In [40]:

# Start from every column of X_train, then drop the weather columns of all
# offsets except `i`, leaving ID, date and the *_{i}_d* columns.
features_weather = X_train.columns.tolist()

i = 0  # the only offset whose columns are kept

column_templates = ("TEMP_{}_d{}", "VISIB_{}_d{}", "PRCP_{}_d{}")

for j in range(16):
    if j == i:
        continue
    for k in range(7):
        for template in column_templates:
            # list.remove raises ValueError if the column is not present.
            features_weather.remove(template.format(j, k))

In [41]:
# Display the training weather frame restricted to the selected columns
# (shows every row, not just a preview).
X_train[features_weather]

Unnamed: 0,ID,date,TEMP_0_d0,TEMP_0_d1,TEMP_0_d2,TEMP_0_d3,TEMP_0_d4,TEMP_0_d5,TEMP_0_d6,VISIB_0_d0,...,VISIB_0_d4,VISIB_0_d5,VISIB_0_d6,PRCP_0_d0,PRCP_0_d1,PRCP_0_d2,PRCP_0_d3,PRCP_0_d4,PRCP_0_d5,PRCP_0_d6
0,1,2015-08-05,60.700001,59.599998,62.400002,59.099998,59.400002,61.900002,58.900002,6.9,...,7.1,6.7,7.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,1,2015-08-12,57.900002,60.099998,61.400002,59.099998,59.0,63.0,60.799999,8.1,...,7.8,7.3,5.2,0.0,0.04,0.0,0.0,0.0,0.0,0.0
0,1,2015-08-19,62.099998,59.5,60.5,58.700001,60.900002,59.5,58.200001,6.2,...,7.6,7.4,6.8,0.0,0.0,0.02,0.0,0.0,0.0,0.0
0,1,2015-08-26,58.599998,59.0,59.900002,61.200001,58.700001,60.0,60.900002,6.5,...,6.7,6.5,6.6,0.0,0.01,0.0,0.0,0.04,0.02,0.0
0,1,2017-05-31,60.900002,63.0,63.200001,60.400002,61.200001,59.200001,60.0,6.0,...,6.1,8.7,7.6,0.01,0.0,0.01,0.01,0.0,0.0,0.0
0,1,2017-06-07,60.299999,61.5,66.099998,64.199997,60.700001,60.599998,61.200001,6.5,...,6.3,6.0,6.0,0.0,0.08,0.0,0.0,0.02,0.04,0.0
0,1,2017-06-14,61.400002,58.799999,57.200001,55.599998,60.0,65.0,63.599998,7.6,...,6.6,7.3,6.5,0.01,0.04,0.08,0.55,0.31,0.0,0.16
0,1,2017-06-21,60.799999,62.5,61.599998,58.599998,57.799999,62.099998,61.700001,7.9,...,5.0,6.2,6.5,0.0,0.0,0.0,0.0,0.75,0.0,0.0
0,1,2017-06-28,59.099998,61.0,58.799999,0.0,60.799999,60.200001,59.799999,6.5,...,6.8,6.3,7.8,0.0,0.04,0.08,0.0,0.0,0.02,0.0
0,1,2017-07-05,60.400002,57.299999,61.099998,59.5,57.5,57.700001,58.799999,7.9,...,5.4,5.9,5.9,0.04,0.04,0.0,0.0,0.0,0.0,0.0


In [26]:
# Load a pickled training frame; it is not produced by any cell visible here.
# NOTE(review): unpickling can execute arbitrary code -- only load trusted files.
train_out = pd.read_pickle('../data/storeitem_train_1s.p')

In [27]:
# Number of rows per reference date in the loaded training frame.
train_out.groupby(['date']).size()

date
2014-08-06    3564
2014-08-13    3564
2014-08-20    3564
2014-08-27    3564
2015-08-05    3564
2015-08-12    3564
2015-08-19    3564
2015-08-26    3564
2016-08-03    3564
2016-08-10    3564
2016-08-17    3564
2016-08-24    3564
2017-05-31    3564
2017-06-07    3564
2017-06-14    3564
2017-06-21    3564
2017-06-28    3564
2017-07-05    3564
2017-07-12    3564
2017-07-19    3564
2017-07-26    3564
dtype: int64

In [42]:
# Attach the selected weather columns to every training row of the same date.
# validate='many_to_one' makes pandas raise a MergeError if a date appears
# more than once in the weather frame, instead of silently multiplying the
# training rows.
# NOTE(review): fillna(0) zero-fills every NaN in the merged frame. That
# includes the weather columns for dates with no weather row: the 2014 dates
# exist in train_out but no 2014 weather frame is built above. Confirm this
# is intended.
train = pd.merge(
    train_out,
    X_train[features_weather],
    on=['date'],
    how='left',
    validate='many_to_one',
).fillna(0)

In [39]:
# (rows, columns) of the merged frame.
# NOTE(review): the recorded execution count of this cell (39) is lower than
# that of the merge cell above it (42), so the saved output may be stale.
train.shape

(74844, 150)

In [43]:
# Spot-check the merged rows for the first 2017 reference date.
train.loc[train['date'] == date(2017, 5, 31), :]


Unnamed: 0,index,date,day_1,item_nbr,mean_14,mean_140,mean_182,mean_21,mean_3,mean_30,...,VISIB_0_d4,VISIB_0_d5,VISIB_0_d6,PRCP_0_d0,PRCP_0_d1,PRCP_0_d2,PRCP_0_d3,PRCP_0_d4,PRCP_0_d5,PRCP_0_d6
42768,0,2017-05-31,0.000000,96995,0.148532,0.070156,0.053966,0.132028,0.231049,0.138629,...,6.1,8.7,7.6,0.01,0.0,0.01,0.01,0.0,0.0,0.0
42769,1,2017-05-31,0.000000,99197,0.511931,0.134989,0.181302,0.544938,0.597253,0.381457,...,6.1,8.7,7.6,0.01,0.0,0.01,0.01,0.0,0.0,0.0
42770,2,2017-05-31,0.000000,103520,0.667989,0.712362,0.786856,0.785272,0.000000,0.868856,...,6.1,8.7,7.6,0.01,0.0,0.01,0.01,0.0,0.0,0.0
42771,3,2017-05-31,0.000000,103665,0.866918,1.015355,1.003911,0.957257,0.366204,0.956552,...,6.1,8.7,7.6,0.01,0.0,0.01,0.01,0.0,0.0,0.0
42772,4,2017-05-31,1.098612,105574,1.556041,1.805308,1.741335,1.703183,1.059351,1.774012,...,6.1,8.7,7.6,0.01,0.0,0.01,0.01,0.0,0.0,0.0
42773,5,2017-05-31,2.302585,105575,2.066668,2.272804,2.282490,2.119502,1.878263,2.151082,...,6.1,8.7,7.6,0.01,0.0,0.01,0.01,0.0,0.0,0.0
42774,6,2017-05-31,0.000000,105577,0.375535,0.547424,0.538586,0.572337,0.462098,0.608580,...,6.1,8.7,7.6,0.01,0.0,0.01,0.01,0.0,0.0,0.0
42775,7,2017-05-31,0.000000,105693,0.000000,0.117239,0.179591,0.000000,0.000000,0.000000,...,6.1,8.7,7.6,0.01,0.0,0.01,0.01,0.0,0.0,0.0
42776,8,2017-05-31,0.693147,105737,1.034304,0.835684,0.874266,1.046255,1.133732,0.905477,...,6.1,8.7,7.6,0.01,0.0,0.01,0.01,0.0,0.0,0.0
42777,9,2017-05-31,1.791759,105857,1.904952,1.305548,1.209307,1.930835,1.707988,1.634912,...,6.1,8.7,7.6,0.01,0.0,0.01,0.01,0.0,0.0,0.0


In [44]:
# Spot-check the merged rows for the 2017-07-19 reference date.
train.loc[train['date'] == date(2017, 7, 19), :]

Unnamed: 0,index,date,day_1,item_nbr,mean_14,mean_140,mean_182,mean_21,mean_3,mean_30,...,VISIB_0_d4,VISIB_0_d5,VISIB_0_d6,PRCP_0_d0,PRCP_0_d1,PRCP_0_d2,PRCP_0_d3,PRCP_0_d4,PRCP_0_d5,PRCP_0_d6
67716,0,2017-07-19,0.000000,96995,0.000000,0.094911,0.073009,0.033007,0.000000,0.046210,...,6.2,5.2,6.2,0.0,0.0,0.01,0.04,0.2,0.43,0.0
67717,1,2017-07-19,1.386294,99197,0.652050,0.320388,0.277567,0.586036,0.693147,0.604129,...,6.2,5.2,6.2,0.0,0.0,0.01,0.04,0.2,0.43,0.0
67718,2,2017-07-19,0.000000,103520,0.831016,0.823375,0.759052,0.790668,0.366204,0.811548,...,6.2,5.2,6.2,0.0,0.0,0.01,0.04,0.2,0.43,0.0
67719,3,2017-07-19,1.791759,103665,1.026778,1.018449,1.043892,0.994531,0.597253,0.984795,...,6.2,5.2,6.2,0.0,0.0,0.01,0.04,0.2,0.43,0.0
67720,4,2017-07-19,2.079442,105574,1.776992,1.808192,1.786186,1.692449,1.656604,1.734055,...,6.2,5.2,6.2,0.0,0.0,0.01,0.04,0.2,0.43,0.0
67721,5,2017-07-19,3.332205,105575,2.363525,2.269667,2.274376,2.304497,2.265980,2.334852,...,6.2,5.2,6.2,0.0,0.0,0.01,0.04,0.2,0.43,0.0
67722,6,2017-07-19,0.000000,105577,0.506434,0.596708,0.584314,0.565599,0.000000,0.565091,...,6.2,5.2,6.2,0.0,0.0,0.01,0.04,0.2,0.43,0.0
67723,7,2017-07-19,0.000000,105693,0.177493,0.143208,0.153348,0.217350,0.000000,0.198355,...,6.2,5.2,6.2,0.0,0.0,0.01,0.04,0.2,0.43,0.0
67724,8,2017-07-19,2.397895,105737,0.683210,0.854353,0.846975,0.626117,1.030347,0.682846,...,6.2,5.2,6.2,0.0,0.0,0.01,0.04,0.2,0.43,0.0
67725,9,2017-07-19,1.791759,105857,1.608451,1.544144,1.420982,1.640494,1.521449,1.660476,...,6.2,5.2,6.2,0.0,0.0,0.01,0.04,0.2,0.43,0.0
