In [64]:

"""
This is an upgraded version of Ceshine's LGBM starter script, simply adding
more average features and weekly average features on it.
"""

from datetime import date, timedelta

import pandas as pd
import numpy as np
import lightgbm as lgb
import sys
import math
import gc
import sklearn.metrics as skl_metrics

from logging import StreamHandler, DEBUG, Formatter, FileHandler, getLogger

logger = getLogger(__name__)

pd.options.mode.chained_assignment = None  # default='warn'

DIR = '../logs/'

log_fmt = Formatter('%(asctime)s %(name)s %(lineno)d [%(levelname)s]\
    [%(funcName)s] %(message)s ')
handler = StreamHandler()
handler.setLevel('INFO')
handler.setFormatter(log_fmt)
logger.addHandler(handler)

handler = FileHandler(DIR + 'train.py.log', 'a')
handler.setLevel(DEBUG)
handler.setFormatter(log_fmt)
logger.setLevel(DEBUG)
logger.addHandler(handler)

logger.info('start')

dtype_weather={"TEMP":np.float32,
               "VISIB":np.float32,
               "PRCP": np.float32
}
    
weather = pd.read_csv('../input/Weather_20180107.csv',dtype=dtype_weather,parse_dates=["YEARMODA"],)
weather["date"] = pd.to_datetime(weather['YEARMODA'],format='%Y%m%d').dt.date
weather['ID'] = 1

t2014 = date(2014, 8, 6)
t2015 = date(2015, 8, 5)
t2016 = date(2016, 8, 3)
t2017 = date(2017, 5, 31)
train_week_2017 = 9

logger.info('Load data successful')


2018-01-09 16:30:24,603 __main__ 37 [INFO]    [<module>] start 
2018-01-09 16:30:24,603 __main__ 37 [INFO]    [<module>] start 
2018-01-09 16:30:24,603 __main__ 37 [INFO]    [<module>] start 
2018-01-09 16:30:24,603 __main__ 37 [INFO]    [<module>] start 
2018-01-09 16:30:24,603 __main__ 37 [INFO]    [<module>] start 
2018-01-09 16:30:24,603 __main__ 37 [INFO]    [<module>] start 
2018-01-09 16:30:24,603 __main__ 37 [INFO]    [<module>] start 
2018-01-09 16:30:24,603 __main__ 37 [INFO]    [<module>] start 
2018-01-09 16:30:24,616 __main__ 54 [INFO]    [<module>] Load data successful 
2018-01-09 16:30:24,616 __main__ 54 [INFO]    [<module>] Load data successful 
2018-01-09 16:30:24,616 __main__ 54 [INFO]    [<module>] Load data successful 
2018-01-09 16:30:24,616 __main__ 54 [INFO]    [<module>] Load data successful 
2018-01-09 16:30:24,616 __main__ 54 [INFO]    [<module>] Load data successful 
2018-01-09 16:30:24,616 __main__ 54 [INFO]    [<module>] Load data successful 
2018-01-09 16:

In [66]:
weather.head(1)

Unnamed: 0,YEARMODA,TEMP,VISIB,PRCP,date,ID
0,2014-01-02,61.099998,7.8,0.0,2014-01-02,1


In [85]:

###############################################################################
# Functions


def get_timespan(df, dt, minus, periods, freq='D'):
    return df[pd.date_range
              (dt - timedelta(days=minus), periods=periods, freq=freq)]



def prepare_dataset(t2017):
    X = pd.DataFrame({
         "ID": weather_temp.ID,
        "date": (t2017),
    })

    for i in range(7):
        X['TEMP_{}'.format(i)] = get_timespan(weather_temp, t2017, 3-i, 1).values.ravel()

    for i in range(7):
        X['VISIB_{}'.format(i)] = get_timespan(weather_visib, t2017, 3-i, 1).values.ravel()
        
    for i in range(7):
        X['PRCP_{}'.format(i)] = get_timespan(weather_prcp, t2017, 3-i, 1).values.ravel()
        
    return X


In [106]:
weather_temp = weather[['ID','date','TEMP']].set_index(
    ['ID','date'])[["TEMP"]].unstack(
        level=-1).fillna(0)
weather_temp.columns = weather_temp.columns.get_level_values(1)
weather_temp.reset_index(inplace = True)

weather_visib = weather[['ID','date','VISIB']].set_index(
    ['ID','date'])[["VISIB"]].unstack(
        level=-1).fillna(0)
weather_visib.columns = weather_visib.columns.get_level_values(1)
weather_visib.reset_index(inplace = True)


weather_prcp = weather[['ID','date','PRCP']].set_index(
    ['ID','date'])[["PRCP"]].unstack(
        level=-1).fillna(0)
weather_prcp.columns = weather_prcp.columns.get_level_values(1)
weather_prcp.reset_index(inplace = True)



In [68]:
weather_temp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Columns: 1090 entries, ID to 2017-12-30 00:00:00
dtypes: float32(1089), int64(1)
memory usage: 4.3 KB


In [69]:
weather_temp.head(1)

date,ID,2014-01-02 00:00:00,2014-01-05 00:00:00,2014-01-09 00:00:00,2014-01-10 00:00:00,2014-01-11 00:00:00,2014-01-17 00:00:00,2014-01-19 00:00:00,2014-01-20 00:00:00,2014-01-22 00:00:00,...,2017-12-21 00:00:00,2017-12-22 00:00:00,2017-12-23 00:00:00,2017-12-24 00:00:00,2017-12-25 00:00:00,2017-12-26 00:00:00,2017-12-27 00:00:00,2017-12-28 00:00:00,2017-12-29 00:00:00,2017-12-30 00:00:00
0,1,61.099998,58.099998,61.5,60.400002,62.700001,60.599998,60.099998,60.0,65.0,...,63.0,63.400002,66.400002,67.199997,65.900002,61.700001,59.700001,59.5,60.400002,64.5


In [123]:
weather_temp[pd.datetime(2017, 6, 28)] = 0
weather_visib[pd.datetime(2017, 6, 28)] = 0
weather_prcp[pd.datetime(2017, 6, 28)] = 0


KeyError: datetime.datetime(2014, 8, 9, 0, 0)

In [124]:
logger.info('Preparing traing dataset...')

X_l = []

# Add train data on Aug 2014 and Aug 2015


logger.info('Preparing 2016 training dataset...')
for i in range(4):
    delta = timedelta(days=7 * i)
    X_tmp = prepare_dataset(
        t2016 + delta
    )
    X_l.append(X_tmp)

logger.info('Preparing 2015 training dataset...')
for i in range(4):
    delta = timedelta(days=7 * i)
    X_tmp = prepare_dataset(
        t2015 + delta
    )
    X_l.append(X_tmp)
 
logger.info('Preparing 2017 training dataset...')
for i in range(train_week_2017):
    delta = timedelta(days=7 * i)
    X_tmp = prepare_dataset(
        t2017 + delta
    )
    X_l.append(X_tmp)
    
#logger.info('Preparing 2014 training dataset...')
#for i in range(4):
#    delta = timedelta(days=7 * i)
#    X_tmp = prepare_dataset(
#        t2014 + delta
#    )
#    X_l.append(X_tmp)


X_train = pd.concat(X_l, axis=0)

del X_l

delta = timedelta(0)

X_val = prepare_dataset(date(2017, 7, 26))
X_test = prepare_dataset(date(2017, 8, 16))


2018-01-09 16:52:31,701 __main__ 1 [INFO]    [<module>] Preparing traing dataset... 
2018-01-09 16:52:31,701 __main__ 1 [INFO]    [<module>] Preparing traing dataset... 
2018-01-09 16:52:31,701 __main__ 1 [INFO]    [<module>] Preparing traing dataset... 
2018-01-09 16:52:31,701 __main__ 1 [INFO]    [<module>] Preparing traing dataset... 
2018-01-09 16:52:31,701 __main__ 1 [INFO]    [<module>] Preparing traing dataset... 
2018-01-09 16:52:31,701 __main__ 1 [INFO]    [<module>] Preparing traing dataset... 
2018-01-09 16:52:31,701 __main__ 1 [INFO]    [<module>] Preparing traing dataset... 
2018-01-09 16:52:31,701 __main__ 1 [INFO]    [<module>] Preparing traing dataset... 
2018-01-09 16:52:31,714 __main__ 9 [INFO]    [<module>] Preparing 2016 training dataset... 
2018-01-09 16:52:31,714 __main__ 9 [INFO]    [<module>] Preparing 2016 training dataset... 
2018-01-09 16:52:31,714 __main__ 9 [INFO]    [<module>] Preparing 2016 training dataset... 
2018-01-09 16:52:31,714 __main__ 9 [INFO]   

In [125]:
X_train

Unnamed: 0,ID,date,TEMP_0,TEMP_1,TEMP_2,TEMP_3,TEMP_4,TEMP_5,TEMP_6,VISIB_0,...,VISIB_4,VISIB_5,VISIB_6,PRCP_0,PRCP_1,PRCP_2,PRCP_3,PRCP_4,PRCP_5,PRCP_6
0,1,2016-08-03,60.200001,57.799999,57.299999,58.599998,60.200001,62.5,65.300003,6.6,...,6.4,6.1,7.5,0.0,0.0,0.0,0.02,0.04,0.0,0.0
0,1,2016-08-10,59.900002,60.0,60.0,62.0,57.900002,61.900002,62.400002,5.4,...,5.7,8.0,7.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,1,2016-08-17,60.400002,60.400002,59.700001,59.799999,59.900002,60.0,58.0,7.6,...,7.2,7.2,7.2,0.0,0.12,0.0,0.0,0.0,0.0,0.0
0,1,2016-08-24,63.200001,59.200001,61.0,59.0,63.5,57.900002,58.799999,7.1,...,9.9,6.4,6.9,0.0,0.2,0.0,0.0,0.0,0.03,0.0
0,1,2015-08-05,58.900002,61.900002,59.400002,59.099998,62.400002,59.599998,60.700001,7.8,...,7.9,8.6,6.9,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,1,2015-08-12,60.799999,63.0,59.0,59.099998,61.400002,60.099998,57.900002,5.2,...,7.2,7.3,8.1,0.0,0.0,0.0,0.0,0.0,0.04,0.0
0,1,2015-08-19,58.200001,59.5,60.900002,58.700001,60.5,59.5,62.099998,6.8,...,7.2,6.5,6.2,0.0,0.0,0.0,0.0,0.02,0.0,0.0
0,1,2015-08-26,60.900002,60.0,58.700001,61.200001,59.900002,59.0,58.599998,6.6,...,7.5,6.0,6.5,0.0,0.02,0.04,0.0,0.0,0.01,0.0
0,1,2017-05-31,60.0,59.200001,61.200001,60.400002,63.200001,63.0,60.900002,7.6,...,5.8,5.7,6.0,0.0,0.0,0.0,0.01,0.01,0.0,0.01
0,1,2017-06-07,61.200001,60.599998,60.700001,64.199997,66.099998,61.5,60.299999,6.0,...,10.4,5.7,6.5,0.0,0.04,0.02,0.0,0.0,0.08,0.0
