In [1]:
import pandas as pd 
import numpy as np 
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold
from tqdm import tqdm_notebook
import os
import gc

import lightgbm as lgb
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
import warnings
warnings.simplefilter("ignore")
pd.set_option('display.max_rows', 200)
from catboost import CatBoostRegressor, Pool

In [2]:
# Folder holding the competition CSVs; reused further down to locate sample_submission.csv.
directory = '/kaggle/input/ashrae-energy-prediction'
# Show the folder contents as the cell output (quick check that the dataset is mounted).
os.listdir(directory)


['train.csv',
 'building_metadata.csv',
 'sample_submission.csv',
 'weather_test.csv',
 'weather_train.csv',
 'test.csv']

In [3]:
# Load the building and weather tables from pickles stored in the 'ashrae-catboost' dataset
# rather than from the raw competition CSVs.
# NOTE(review): presumably these were pre-processed alongside model training - confirm.
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
building_metadata = pd.read_pickle('/kaggle/input/ashrae-catboost/building.pkl')
weather = pd.read_pickle('/kaggle/input/ashrae-catboost/weather.pkl')

In [4]:
# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16 or not. feather format does not support float16.
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """Downcast the columns of ``df`` in place to reduce its memory footprint.

    Per column:
      * datetime and categorical columns are left untouched;
      * ``object`` columns are converted to ``category``;
      * columns whose dtype name starts with ``int`` are cast to the narrowest
        of int8/int16/int32/int64 whose range contains the column's min and max
        (bounds are inclusive, so e.g. -128..127 fits int8);
      * every other column is treated as floating point and cast to float16
        (only when ``use_float16`` is True and the values fit), otherwise to
        float32 when the values fit, otherwise to float64.

    Memory usage before and after, and the percentage saved, are printed.

    Args:
        df: DataFrame to optimize. It is modified in place.
        use_float16: Allow float16 as a target dtype. Off by default because
            the feather format does not support float16.

    Returns:
        The same ``df`` object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        # Datetime and categorical columns have no cheaper representation here.
        if is_datetime(df[col]) or isinstance(col_type, pd.CategoricalDtype):
            continue

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if str(col_type)[:3] == 'int':
            # Try the candidates from narrowest to widest; the first one whose
            # inclusive range covers the data wins. If min/max are NaN (empty
            # column) no candidate matches and the column is left as it is.
            for int_type in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(int_type)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(int_type)
                    break
        else:
            f16 = np.finfo(np.float16)
            f32 = np.finfo(np.float32)
            if use_float16 and f16.min <= c_min and c_max <= f16.max:
                df[col] = df[col].astype(np.float16)
            elif f32.min <= c_min and c_max <= f32.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Reached for out-of-range values, and also when min/max are
                # NaN or infinite (the comparisons above are then False).
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes prints 0.0% instead of nan.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [5]:
# Columns passed to CatBoost as categorical features (cat_features of the prediction Pool).
# Note: 'weekend' is filled from timestamp.dt.weekday, i.e. it is a day-of-week number,
# not a weekend flag.
category = [
    'hour',
    'meter',
    'weekend',
    'primary_use',
    'site_id',
    'building_id',
]


In [6]:
# Restore one CatBoostRegressor per saved fold file (folds 0 and 1).
models = []
for fold in range(2):
    regressor = CatBoostRegressor()
    regressor.load_model('/kaggle/input/ashrae-catboost/catboost_fold_%s.bin'%fold)
    models.append(regressor)

In [7]:
# Daily calendar spanning 2015-12-31 through 2019-01-01 (both inclusive).
dates_range = pd.date_range('2015-12-31', '2019-01-01')
# US federal holidays falling between the first and last day of that calendar.
us_holidays = calendar().holidays(start=dates_range[0], end=dates_range[-1])

In [8]:
# Read the competition test rows, parsing `timestamp` as datetimes, then downcast dtypes.
df_test = pd.read_csv('/kaggle/input/ashrae-energy-prediction/test.csv', parse_dates=['timestamp'])
df_test = reduce_mem_usage(df_test)

# Left-join the building table on building_id, then the weather table on (site_id, timestamp).
# how='left' keeps every test row even when there is no match on the right side.
df_test = df_test.merge(building_metadata, on='building_id', how='left')
df_test = df_test.merge(weather, on=['site_id', 'timestamp'], how='left')
# The lookup tables are not used again below; release them before the second downcast pass.
# NOTE: because of this `del`, re-running this cell requires re-running the pickle-loading cell first.
del building_metadata, weather
gc.collect()
df_test = reduce_mem_usage(df_test)

# Time-derived features.
# NOTE(review): `timestamp` was already requested as a parsed date in read_csv, so this
# conversion looks redundant - confirm before removing.
df_test["timestamp"] = pd.to_datetime(df_test["timestamp"])
df_test["hour"] = df_test["timestamp"].dt.hour
# Despite the column name, this stores the day-of-week number from dt.weekday, not a weekend flag.
df_test["weekend"] = df_test["timestamp"].dt.weekday
# Express year_built as an offset from 1900.
df_test['year_built'] = df_test['year_built']-1900
# Replace square_feet with log(1 + square_feet).
df_test['square_feet'] = np.log1p(df_test['square_feet'])

Memory usage of dataframe is 1272.51 MB
Memory usage after optimization is: 596.49 MB
Decreased by 53.1%
Memory usage of dataframe is 2107.59 MB
Memory usage after optimization is: 2107.59 MB
Decreased by 0.0%


In [9]:
# Flag rows whose calendar day is a US federal holiday (1) or not (0).
# dt.normalize() truncates each timestamp to midnight while staying datetime64, so the values
# can be matched directly against the midnight-valued holiday index. It replaces the former
# dt.date.astype('datetime64') round-trip, which built one Python date object per row and
# relied on a unit-less 'datetime64' cast that pandas 2.x no longer accepts.
df_test['is_holiday'] = df_test['timestamp'].dt.normalize().isin(us_holidays).astype(np.int8)
# Drop the raw timestamp so it is not picked up when the feature list is built from the columns.
del df_test["timestamp"]

# Cut the test set into `iterations` consecutive batches of equal size.
iterations = 50
set_size = len(df_test)
batch_size = set_size // iterations

print(set_size, iterations, batch_size)
# The row count must divide evenly, otherwise trailing rows would never be predicted.
assert set_size % iterations == 0

41697600 50 833952


In [10]:
# Model inputs: every column of df_test, in frame order, except the row_id key.
features = [name for name in df_test.columns if name != 'row_id']

In [11]:
# Predict the test set batch by batch and average the fold models' predictions.
meter_reading = []
for i in tqdm_notebook(range(iterations)):
    pos = i*batch_size
    # Slice the rows first and pick the feature columns second: selecting df_test[features]
    # on the full frame would copy every row again for each model in each batch.
    batch = df_test.iloc[pos : pos+batch_size][features]
    # Build the Pool once per batch and share it between the fold models.
    batch_pool = Pool(batch, cat_features=category)
    # expm1 maps each model's raw output back to the meter_reading scale.
    # NOTE(review): presumably the models were fit on log1p(meter_reading) - confirm.
    fold_preds = [np.expm1(model.predict(batch_pool)) for model in models]
    # Element-wise mean over the fold models.
    meter_reading.extend(np.mean(fold_preds, axis=0))

print(len(meter_reading))
# Exactly one prediction must exist for every test row.
assert len(meter_reading) == set_size

HBox(children=(IntProgress(value=0, max=50), HTML(value='')))


41697600


In [12]:
# Fill the sample submission with the predictions, floored at zero (no upper bound).
# Predictions are assigned by position, so their order must match the file's row order.
sample_submission = (
    pd.read_csv(directory+'/sample_submission.csv')
    .assign(meter_reading=np.clip(meter_reading, 0, None))
)
sample_submission.to_csv('submission.csv', index=False)
# Preview the first rows as the cell output.
sample_submission.head(20)

Unnamed: 0,row_id,meter_reading
0,0,268.049476
1,1,86.483316
2,2,11.24553
3,3,323.685274
4,4,900.124096
5,5,12.070984
6,6,103.566558
7,7,605.493424
8,8,608.26871
9,9,375.387351
