In [1]:
import numpy as np 
import pandas as pd 

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from tqdm import tqdm_notebook
import os
import gc

import lightgbm as lgb
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import warnings
warnings.simplefilter("ignore")
pd.set_option('display.max_rows', 200)

In [2]:
directory = '/kaggle/input/ashrae-energy-prediction'
os.listdir(directory)

['sample_submission.csv',
 'building_metadata.csv',
 'weather_test.csv',
 'train.csv',
 'test.csv',
 'weather_train.csv']

In [3]:
weather_train = pd.read_csv(directory+'/weather_train.csv', parse_dates=['timestamp'])
weather_test = pd.read_csv(directory+'/weather_test.csv', parse_dates=['timestamp'])
weather = pd.concat([weather_train, weather_test])
del weather_train, weather_test
gc.collect()
site_ids_offsets = pd.DataFrame({'site_id': 
                                 {0: 5,
                                  1: 0,
                                  2: 9,
                                  3: 6,
                                  4: 8,
                                  5: 0,
                                  6: 6,
                                  7: 6,
                                  8: 5,
                                  9: 7,
                                  10: 8,
                                  11: 6,
                                  12: 0,
                                  13: 7,
                                  14: 6,
                                  15: 6}})

weather['offset'] = weather['site_id'].map(site_ids_offsets['site_id'])
weather['timestamp'] = (weather['timestamp'] - pd.to_timedelta(weather['offset'], unit='H'))
del weather['offset']

In [4]:
building_metadata = pd.read_csv(directory+'/building_metadata.csv')

sample_submission = pd.read_csv(directory+'/sample_submission.csv')
df_train = pd.read_csv(directory+'/train.csv', parse_dates=['timestamp'])

# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16 or not. feather format does not support float16.
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        if is_datetime(df[col]) or is_categorical_dtype(df[col]):
            # skip datetime type or categorical type
            continue
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if use_float16 and c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [5]:
weather = reduce_mem_usage(weather)
building_metadata = reduce_mem_usage(building_metadata)

df_train = df_train.merge(building_metadata, on='building_id', how='left')
df_train = df_train.merge(weather, on=['site_id', 'timestamp'], how='left')
df_train = df_train.query('not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")')

df_train = df_train[df_train['air_temperature'].notnull()|df_train['cloud_coverage'].notnull()|df_train['dew_temperature'].notnull()|df_train['precip_depth_1_hr'].notnull()].reset_index(drop=True)

Memory usage of dataframe is 31.82 MB
Memory usage after optimization is: 17.90 MB
Decreased by 43.8%
Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 65.4%


# feature engneering

In [6]:
df_train["hour"] = df_train["timestamp"].dt.hour
df_train["weekend"] = df_train["timestamp"].dt.weekday
df_train['year_built'] = df_train['year_built']-1900
df_train['square_feet'] = np.log1p(df_train['square_feet'])

dates_range = pd.date_range(start='2015-12-31', end='2019-01-01')
us_holidays = calendar().holidays(start=dates_range.min(), end=dates_range.max())
df_train['is_holiday'] = (df_train['timestamp'].dt.date.astype('datetime64').isin(us_holidays)).astype(np.int8)
del df_train["timestamp"]

df_train['meter_reading'] = np.log1p(df_train['meter_reading'])

In [7]:
df_train = reduce_mem_usage(df_train)
category = ['hour','meter','weekend','primary_use','site_id','building_id']
for col in category :
    df_train[col] = df_train[col].astype('category')

gc.collect()

Memory usage of dataframe is 1641.12 MB
Memory usage after optimization is: 980.90 MB
Decreased by 40.2%


15

# Model

In [8]:
# force the model to use the weather data instead of dates, to avoid overfitting to the past history
features = [col for col in df_train.columns if col not in ['meter_reading', 'year', 'month', 'day','row_id','group', 
                                                           'sea_level_pressure', 'wind_direction', 'wind_speed']]
target = 'meter_reading'

df_train = df_train[~df_train['meter_reading'].isnull()].reset_index(drop=True)
df_train['meter_reading'].isnull().sum()

0

In [9]:
NFOLDS = 2
kf = KFold(n_splits=2, shuffle=False, random_state=1024)
models = []
oof = np.zeros(len(df_train))
print('Light GBM Model')
for fold_, (trn_idx, val_idx)  in enumerate(kf.split(df_train)):
        tr_x, tr_y = df_train[features].iloc[trn_idx], df_train[target][trn_idx]
        vl_x, vl_y = df_train[features].iloc[val_idx], df_train[target][val_idx]
        print({'train size':len(tr_x), 'eval size':len(vl_x)})
        print('fold : ',fold_)
        tr_data = lgb.Dataset(tr_x, label=tr_y)
        vl_data = lgb.Dataset(vl_x, label=vl_y) 
        reg= lgb.LGBMRegressor(n_estimators=10000,
                               reg_alpha=0.0, 
                               reg_lambda=2.0,
                                learning_rate=0.05,
                                feature_fraction=0.8,
                                subsample=0.3,
                                num_leaves=40,
                                metric='rmse',
                              )
        reg.fit(tr_x, tr_y, eval_set=[(vl_x, vl_y)], early_stopping_rounds=200, verbose=200)
        oof[val_idx] = reg.predict(df_train.iloc[val_idx][features])
        models.append(reg)
        del tr_x, tr_y, vl_x, vl_y, tr_data, vl_data
        gc.collect()
    
print('oof_RMSE : ' ,np.sqrt(mean_squared_error(oof, df_train['meter_reading'])))


Light GBM Model
{'train size': 9889878, 'eval size': 9889879}
fold :  0
Training until validation scores don't improve for 200 rounds
[200]	valid_0's rmse: 1.16313
[400]	valid_0's rmse: 1.15078
[600]	valid_0's rmse: 1.14872
[800]	valid_0's rmse: 1.14542
[1000]	valid_0's rmse: 1.14474
Early stopping, best iteration is:
[893]	valid_0's rmse: 1.14448
{'train size': 9889879, 'eval size': 9889878}
fold :  1
Training until validation scores don't improve for 200 rounds
[200]	valid_0's rmse: 1.15163
[400]	valid_0's rmse: 1.13768
[600]	valid_0's rmse: 1.13489
[800]	valid_0's rmse: 1.13443
[1000]	valid_0's rmse: 1.13364
Early stopping, best iteration is:
[980]	valid_0's rmse: 1.13347
oof_RMSE :  1.1389904197400698


In [10]:
df_train['oof'] = oof
df_train[['oof']].to_csv('oof_l1.csv', index=False)
del df_train
gc.collect()

22

# predict

In [11]:
df_test = pd.read_csv('/kaggle/input/ashrae-energy-prediction/test.csv', parse_dates=['timestamp'])
df_test = reduce_mem_usage(df_test)

Memory usage of dataframe is 1272.51 MB
Memory usage after optimization is: 596.49 MB
Decreased by 53.1%


In [12]:
df_test = df_test.merge(building_metadata, on='building_id', how='left')
df_test = df_test.merge(weather, on=['site_id', 'timestamp'], how='left')
del building_metadata, weather
gc.collect()
df_test = reduce_mem_usage(df_test)

Memory usage of dataframe is 2584.79 MB
Memory usage after optimization is: 2584.79 MB
Decreased by 0.0%


In [13]:
#time variable
df_test["timestamp"] = pd.to_datetime(df_test["timestamp"])
df_test["hour"] = df_test["timestamp"].dt.hour
df_test["weekend"] = df_test["timestamp"].dt.weekday
df_test['year_built'] = df_test['year_built']-1900
df_test['square_feet'] = np.log1p(df_test['square_feet'])

df_test['is_holiday'] = (df_test['timestamp'].dt.date.astype('datetime64').isin(us_holidays)).astype(np.int8)
del df_test["timestamp"]

category = ['hour','meter','weekend','primary_use','site_id','building_id']
for col in category :
    df_test[col] = df_test[col].astype('category')

In [14]:
# split test data into batches
set_size = len(df_test)
iterations = 50
batch_size = set_size // iterations

print(set_size, iterations, batch_size)
assert set_size == iterations * batch_size

meter_reading = []
for i in tqdm_notebook(range(iterations)):
    pos = i*batch_size
    fold_preds = [np.expm1(model.predict(df_test[features].iloc[pos : pos+batch_size])) for model in models]
    meter_reading.extend(np.mean(fold_preds, axis=0))

print(len(meter_reading))
assert len(meter_reading) == set_size

41697600 50 833952


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


41697600


In [15]:
sample_submission['meter_reading'] = np.clip(meter_reading, a_min=0, a_max=None) # clip min at zero
sample_submission.to_csv('submission.csv', index=False)
sample_submission.head(20)

Unnamed: 0,row_id,meter_reading
0,0,155.322415
1,1,78.743679
2,2,12.913539
3,3,224.273439
4,4,955.097866
5,5,23.635308
6,6,105.654618
7,7,309.475125
8,8,487.691107
9,9,317.209244
