In [1]:
import pandas as pd 
import numpy as np 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from tqdm import tqdm_notebook
import os
import gc

import lightgbm as lgb
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import warnings
warnings.simplefilter("ignore")
pd.set_option('display.max_rows', 200)
from catboost import CatBoostRegressor, Pool

In [2]:
# Root folder of the competition files; used as the prefix for the read_csv
# calls that load the weather, building and train tables.
directory = '/kaggle/input/ashrae-energy-prediction'
# Last expression of the cell: displays the file names found in that folder.
os.listdir(directory)


['test.csv',
 'building_metadata.csv',
 'train.csv',
 'weather_test.csv',
 'sample_submission.csv',
 'weather_train.csv']

In [3]:
# Join keys plus the weather measurements that are read from the weather files.
usecols = ['site_id', 'timestamp', 'air_temperature', 'cloud_coverage',
       'dew_temperature', 'precip_depth_1_hr']
weather_train = pd.read_csv(directory+'/weather_train.csv', parse_dates=['timestamp'], usecols=usecols)
weather_test = pd.read_csv(directory+'/weather_test.csv', parse_dates=['timestamp'], usecols=usecols)
# Train and test weather are stacked into a single frame.
weather = pd.concat([weather_train, weather_test])
del weather_train, weather_test
gc.collect()


# Number of hours subtracted from the weather timestamps of each site.
# NOTE(review): presumably these shift the weather data onto the same clock as
# the meter readings -- confirm against the source of these values.
site_hour_offsets = {0: 5,
                     1: 0,
                     2: 9,
                     3: 6,
                     4: 8,
                     5: 0,
                     6: 6,
                     7: 6,
                     8: 5,
                     9: 7,
                     10: 8,
                     11: 6,
                     12: 0,
                     13: 7,
                     14: 6,
                     15: 6}
# Same one-column frame the notebook has always exposed under this name.
site_ids_offsets = pd.DataFrame({'site_id': site_hour_offsets})

weather['offset'] = weather['site_id'].map(site_ids_offsets['site_id'])
# Lower-case 'h': the upper-case 'H' unit alias is deprecated in recent pandas.
weather['timestamp'] = weather['timestamp'] - pd.to_timedelta(weather['offset'], unit='h')
del weather['offset']

In [4]:
building_metadata = pd.read_csv(directory+'/building_metadata.csv')
# Label-encode primary_use with integer codes in order of first appearance.
# The code range is sized from the data: a fixed range(16) would make zip()
# drop any further category, which map() would then turn into NaN.
primary_use_values = building_metadata['primary_use'].unique()
primary_use_codes = dict(zip(primary_use_values, range(len(primary_use_values))))
building_metadata['primary_use'] = building_metadata['primary_use'].map(primary_use_codes)

In [5]:
# Training rows; 'timestamp' is parsed to datetime because it is later used as
# a merge key against the weather frame and through the .dt accessor.
df_train = pd.read_csv(directory+'/train.csv', parse_dates=['timestamp'])


In [6]:
# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16 or not. feather format does not support float16.
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """Downcast the columns of ``df`` in place to smaller dtypes and return it.

    Per column:
      * datetime and categorical columns are skipped;
      * object columns are converted to ``category``;
      * signed / unsigned integer columns are cast to the smallest integer
        dtype of the same signedness that holds their min and max;
      * float columns are cast to float16 (only if ``use_float16``), else
        float32 when the range fits, else float64;
      * every other dtype (bool, timedelta, extension dtypes, ...) is left
        untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default False
        Allow float16 as a target dtype for float columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.

    Memory usage before and after is printed.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    def _fits(lo, hi, info):
        # Inclusive bounds: a dtype's own min and max are representable.
        # Any comparison with NaN is False, so NaN min/max never "fits".
        return info.min <= lo and hi <= info.max

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype
        if is_datetime(df[col]) or isinstance(col_type, pd.api.types.CategoricalDtype):
            # skip datetime type or categorical type
            continue

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer / unsigned / float columns are downcast.
        # Keying on dtype.kind (rather than the dtype's name) keeps bool and
        # unsigned columns out of the float branch, where they would be cast
        # to float32.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            if use_float16 and _fits(c_min, c_max, np.finfo(np.float16)):
                new_type = np.float16
            elif _fits(c_min, c_max, np.finfo(np.float32)):
                new_type = np.float32
            else:
                # Also reached when min/max are NaN or infinite.
                new_type = np.float64
        else:
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            new_type = next(
                (t for t in candidates if _fits(c_min, c_max, np.iinfo(t))),
                None,
            )
            if new_type is None:
                # No candidate matched (e.g. empty column -> NaN min/max).
                continue

        df[col] = df[col].astype(new_type)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [7]:
# Shrink the two lookup tables before they are joined onto the training rows.
weather = reduce_mem_usage(weather)
building_metadata = reduce_mem_usage(building_metadata)

# Attach building attributes first (this brings in site_id), then the weather
# observed at that site and timestamp.
df_train = pd.merge(df_train, building_metadata, how='left', on='building_id')
df_train = pd.merge(df_train, weather, how='left', on=['site_id', 'timestamp'])

# Discard meter-0 rows of buildings 0..104 dated up to 2016-05-20.
df_train = df_train.query(
    'not (building_id <= 104 & meter == 0 & timestamp <= "2016-05-20")'
)

Memory usage of dataframe is 22.27 MB
Memory usage after optimization is: 13.12 MB
Decreased by 41.1%
Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 66.5%


In [8]:
# Persist the two lookup tables to the working directory before dropping them.
weather.to_pickle('weather.pkl')
building_metadata.to_pickle('building.pkl')
# After this line the names `weather` and `building_metadata` no longer exist;
# any later cell that needs them has to read the pickles back.
del weather, building_metadata
# Last expression of the cell: its return value is what the cell displays.
gc.collect()

22

In [9]:
# Keep a row when at least one of the weather measurements is present.
weather_cols = ['air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr']
df_train = df_train[df_train[weather_cols].notnull().any(axis=1)].reset_index(drop=True)

In [10]:
df_train["hour"] = df_train["timestamp"].dt.hour
# NOTE: despite its name this column holds the day of the week
# (0 = Monday ... 6 = Sunday), not a weekend flag.
df_train["weekend"] = df_train["timestamp"].dt.weekday
df_train['year_built'] = df_train['year_built']-1900
df_train['square_feet'] = np.log1p(df_train['square_feet'])

# US federal holidays over a window defined by dates_range.
dates_range = pd.date_range(start='2015-12-31', end='2019-01-01')
us_holidays = calendar().holidays(start=dates_range.min(), end=dates_range.max())
# normalize() floors each timestamp to midnight so it can be matched against
# the holiday dates. It replaces `.dt.date.astype('datetime64')`, which builds
# a Python date object per row and is rejected by pandas >= 2 (unit-less
# datetime64 cast).
df_train['is_holiday'] = df_train['timestamp'].dt.normalize().isin(us_holidays).astype(np.int8)
del df_train["timestamp"]

# The model is trained on log1p(meter_reading).
df_train['meter_reading'] = np.log1p(df_train['meter_reading'])

In [11]:
# Downcast the assembled training frame (the merges and the derived columns
# were added after the lookup tables had been shrunk).
df_train = reduce_mem_usage(df_train)
# Columns handed to CatBoost as categorical features when the Pools are built.
category = ['hour','meter','weekend','primary_use','site_id','building_id']
# Last expression of the cell: its return value is what the cell displays.
gc.collect()

Memory usage of dataframe is 1414.76 MB
Memory usage after optimization is: 754.54 MB
Decreased by 46.7%


46

In [12]:
# force the model to use the weather data instead of dates, to avoid overfitting to the past history
features = [col for col in df_train.columns if col not in ['meter_reading']]
target = 'meter_reading'

df_train = df_train[~df_train['meter_reading'].isnull()].reset_index(drop=True)
df_train['meter_reading'].isnull().sum()

0

In [13]:
NFOLDS = 2
# Not used for the split below, which is done by hand on the row order.
# random_state is omitted: it has no effect without shuffling and current
# scikit-learn raises ValueError when it is combined with shuffle=False.
kf = KFold(n_splits=NFOLDS, shuffle=False)

# Hyper-parameters are the same for both folds, so they are defined once.
cat_params = {
    'n_estimators': 1500,
    'learning_rate': 0.09,
    'eval_metric': 'RMSE',
    'loss_function': 'RMSE',
    'random_seed': 100,
    'metric_period': 10,
    'task_type': 'GPU',
    'depth': 10,
}

models = []
oof = np.zeros(len(df_train))
samples = len(df_train)//2
# Positional halves of the frame: fold 0 trains on the first half and validates
# on the second, fold 1 does the opposite.
first_half, second_half = slice(None, samples), slice(samples, None)
for i in range(2):
    print('fold :', i)
    if i == 0:
        train_rows, valid_rows = first_half, second_half
    else:
        train_rows, valid_rows = second_half, first_half
    tr_x, tr_y = df_train[features].iloc[train_rows], df_train[target].iloc[train_rows]
    vl_x, vl_y = df_train[features].iloc[valid_rows], df_train[target].iloc[valid_rows]
    print({'train size':len(tr_x), 'eval size':len(vl_x)})

    dtrain = Pool(tr_x, label=tr_y, cat_features=category)
    dvalid = Pool(vl_x, label=vl_y, cat_features=category)
    # The Pools now hold the data; vl_x is kept for the out-of-fold prediction.
    del tr_x, tr_y, vl_y
    gc.collect()

    model = CatBoostRegressor(**cat_params)
    model = model.fit(
        dtrain, eval_set=dvalid,
        use_best_model=True,
        verbose=20,
        early_stopping_rounds=20)
    # Saved so the inference cell can reload each fold's model from disk.
    model.save_model('catboost_fold_%s.bin'%i)
    # Out-of-fold predictions go into the slots of the rows this fold validated on.
    oof[valid_rows] = model.predict(vl_x)
    models.append(model)
    del vl_x, dtrain, dvalid
    gc.collect()

# RMSE on the log1p scale, since meter_reading was log-transformed earlier.
print('oof_RMSE : ' ,np.sqrt(mean_squared_error(df_train['meter_reading'], oof)))

fold : 0
{'train size': 9889878, 'eval size': 9889879}
0:	learn: 1.9835591	test: 1.9793084	best: 1.9793084 (0)	total: 1.07s	remaining: 26m 45s
20:	learn: 1.1776762	test: 1.3186642	best: 1.3186642 (20)	total: 21.8s	remaining: 25m 34s
40:	learn: 1.0809903	test: 1.2535898	best: 1.2535898 (40)	total: 41.7s	remaining: 24m 43s
60:	learn: 1.0374483	test: 1.2260811	best: 1.2260811 (60)	total: 1m 3s	remaining: 24m 46s
80:	learn: 1.0032288	test: 1.2072036	best: 1.2072036 (80)	total: 1m 24s	remaining: 24m 37s
100:	learn: 0.9813128	test: 1.1964487	best: 1.1957657 (99)	total: 1m 45s	remaining: 24m 25s
120:	learn: 0.9619407	test: 1.1874266	best: 1.1874266 (120)	total: 2m 8s	remaining: 24m 20s
140:	learn: 0.9462669	test: 1.1799867	best: 1.1799867 (140)	total: 2m 32s	remaining: 24m 25s
160:	learn: 0.9323376	test: 1.1748415	best: 1.1748415 (160)	total: 2m 54s	remaining: 24m 10s
180:	learn: 0.9191027	test: 1.1703283	best: 1.1701577 (179)	total: 3m 17s	remaining: 23m 59s
200:	learn: 0.9101628	test: 1.167

In [14]:
# Write the out-of-fold predictions as a single-column CSV, then release the
# training frame, which is no longer needed.
pd.DataFrame({'oof': oof}).to_csv('oof_c1.csv', index=False)
del df_train
gc.collect()

0

# Reload the fold models that the training loop saved to disk.
models = []
for i in range(2):
    model = CatBoostRegressor()
    model.load_model('catboost_fold_%s.bin'%i)
    models.append(model)

# `weather` and `building_metadata` were deleted right after being pickled, so
# they are read back here. These pickles are written by this notebook itself;
# never unpickle files from an untrusted source.
weather = pd.read_pickle('weather.pkl')
building_metadata = pd.read_pickle('building.pkl')

df_test = pd.read_csv(directory + '/test.csv', parse_dates=['timestamp'])
df_test = reduce_mem_usage(df_test)

df_test = df_test.merge(building_metadata, on='building_id', how='left')
df_test = df_test.merge(weather, on=['site_id', 'timestamp'], how='left')
del building_metadata, weather
gc.collect()
df_test = reduce_mem_usage(df_test)

# Same time-derived features as the training frame.
df_test["timestamp"] = pd.to_datetime(df_test["timestamp"])
df_test["hour"] = df_test["timestamp"].dt.hour
# Day of week (0 = Monday ... 6 = Sunday), under the training column's name.
df_test["weekend"] = df_test["timestamp"].dt.weekday
df_test['year_built'] = df_test['year_built']-1900
df_test['square_feet'] = np.log1p(df_test['square_feet'])

# normalize() floors to midnight for the holiday lookup; the previous
# `.dt.date.astype('datetime64')` is rejected by pandas >= 2.
# `us_holidays` comes from the training feature cell.
df_test['is_holiday'] = df_test['timestamp'].dt.normalize().isin(us_holidays).astype(np.int8)
del df_test["timestamp"]

# Predict in at most `iterations` batches. The batch size is rounded up so
# that a row count not divisible by `iterations` is still fully covered.
set_size = len(df_test)
iterations = 50
batch_size = max(1, -(-set_size // iterations))

print(set_size, iterations, batch_size)

meter_reading = []
for pos in tqdm_notebook(range(0, set_size, batch_size)):
    # Slice the rows first and select the columns second, so only one batch is
    # copied instead of the whole frame on every iteration.
    batch = Pool(df_test.iloc[pos : pos+batch_size][features], cat_features=category)
    # Models were trained on log1p(target): undo it per model, then average.
    fold_preds = [np.expm1(model.predict(batch)) for model in models]
    meter_reading.extend(np.mean(fold_preds, axis=0))

print(len(meter_reading))
assert len(meter_reading) == set_size

# NOTE(review): assumes sample_submission.csv lists its rows in the same order
# as test.csv -- confirm before submitting.
sample_submission = pd.read_csv(directory + '/sample_submission.csv')
assert len(sample_submission) == set_size
sample_submission['meter_reading'] = np.clip(meter_reading, a_min=0, a_max=None) # clip min at zero
sample_submission.to_csv('submission.csv', index=False)
sample_submission.head(20)