In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ashrae-energy-prediction/sample_submission.csv
/kaggle/input/ashrae-energy-prediction/building_metadata.csv
/kaggle/input/ashrae-energy-prediction/weather_train.csv
/kaggle/input/ashrae-energy-prediction/weather_test.csv
/kaggle/input/ashrae-energy-prediction/train.csv
/kaggle/input/ashrae-energy-prediction/test.csv


**Energy Consumption Prediction in Smart Buildings using Ensemble Learning**

Data Reading

In [5]:
import os
import gc
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O
import datetime

In [6]:
path = '../input/ashrae-energy-prediction'

for dirname, _, filenames in os.walk(path):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../input/ashrae-energy-prediction/sample_submission.csv
../input/ashrae-energy-prediction/building_metadata.csv
../input/ashrae-energy-prediction/weather_train.csv
../input/ashrae-energy-prediction/weather_test.csv
../input/ashrae-energy-prediction/train.csv
../input/ashrae-energy-prediction/test.csv


Data Preparation

In [7]:
def reduce_mem(df):
    result = df.copy()
    for col in result.columns:
        col_data = result[col]
        dn = col_data.dtype.name
        if not dn.startswith("datetime"):
            if dn == "object":  # only object feature has low cardinality
                result[col] = pd.to_numeric(col_data.astype("category").cat.codes, downcast="unsigned")
            elif dn.startswith("int") | dn.startswith("uint"):
                if col_data.min() >= 0:
                    result[col] = pd.to_numeric(col_data, downcast="unsigned")
                else:
                    result[col] = pd.to_numeric(col_data, downcast='integer')
            else:
                result[col] = pd.to_numeric(col_data, downcast='float')
    return result

def _delete_bad_sitezero(X, y):
    cond = (X.timestamp > '2016-05-20') | (X.site_id != 0) | (X.meter != 0)
    X = X[cond]
    y = y.reindex_like(X)
    return X.reset_index(drop=True), y.reset_index(drop=True)

def _extract_temporal(X, train=True):
    X['hour'] = X.timestamp.dt.hour
    X['weekday'] = X.timestamp.dt.weekday
    if train:
        # include month to create validation set, to be deleted before training
        X['month'] = X.timestamp.dt.month 
    # month and year cause overfit, could try other (holiday, business, etc.)
    return reduce_mem(X)
def load_data(source='train'):
    assert source in ['train','test']
    df = pd.read_csv(f'{path}/{source}.csv', parse_dates=['timestamp'])
    return reduce_mem(df)

def load_building():
    df = pd.read_csv(f'{path}/building_metadata.csv').fillna(-1)
    return reduce_mem(df)

def load_weather(source='train', fix_timezone=True, impute=True, add_lag=True):
    assert source in ['train','test']
    df = pd.read_csv(f'{path}/weather_{source}.csv', parse_dates=['timestamp'])
    if fix_timezone:
        offsets = [5,0,9,6,8,0,6,6,5,7,8,6,0,7,6,6]
        offset_map = {site: offset for site, offset in enumerate(offsets)}
        df.timestamp = df.timestamp - pd.to_timedelta(df.site_id.map(offset_map), unit='h')
    if impute:
        site_dfs = []
        for site in df.site_id.unique():
            if source == 'train':
                new_idx = pd.date_range(start='2016-1-1', end='2016-12-31-23', freq='H')
            else:
                new_idx = pd.date_range(start='2017-1-1', end='2018-12-31-23', freq='H')
            site_df = df[df.site_id == site].set_index('timestamp').reindex(new_idx)
            site_df.site_id = site
            for col in [c for c in site_df.columns if c != 'site_id']:
                site_df[col] = site_df[col].interpolate(limit_direction='both', method='linear')
                site_df[col] = site_df[col].fillna(df[col].median())
            site_dfs.append(site_df)
        df = pd.concat(site_dfs)
        df['timestamp'] = df.index
        df = df.reset_index(drop=True)
        
    if add_lag:
        df = add_lag_features(df, window=3)
    
    return reduce_mem(df)

def merged_dfs(source='train', fix_timezone=True, impute=True, add_lag=False):
    df = load_data(source=source).merge(load_building(), on='building_id', how='left')
    df = df.merge(load_weather(source=source, fix_timezone=fix_timezone, impute=impute, add_lag=add_lag),
                 on=['site_id','timestamp'], how='left')
    if source == 'train':
        X = df.drop('meter_reading', axis=1)  
        y = np.log1p(df.meter_reading)  # log-transform of target
        return X, y
    elif source == 'test':
        return df

In [8]:
X_train, y_train = merged_dfs(add_lag=False)
X_train.head()

Unnamed: 0,building_id,meter,timestamp,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed
0,0,0,2016-01-01,0,0,7432,2008.0,-1.0,19.4,6.0,19.4,0.0,1019.400024,0.0,0.0
1,1,0,2016-01-01,0,0,2720,2004.0,-1.0,19.4,6.0,19.4,0.0,1019.400024,0.0,0.0
2,2,0,2016-01-01,0,0,5376,1991.0,-1.0,19.4,6.0,19.4,0.0,1019.400024,0.0,0.0
3,3,0,2016-01-01,0,0,23685,2002.0,-1.0,19.4,6.0,19.4,0.0,1019.400024,0.0,0.0
4,4,0,2016-01-01,0,0,116607,1975.0,-1.0,19.4,6.0,19.4,0.0,1019.400024,0.0,0.0


In [9]:
# preprocessing
X_train, y_train = _delete_bad_sitezero(X_train, y_train)
X_train = _extract_temporal(X_train)

# remove timestamp and other unimportant features
to_drop = ['timestamp','sea_level_pressure','wind_direction','wind_speed', 'precip_depth_1_hr', 'year_built', 'square_feet']
X_train.drop(to_drop, axis=1, inplace=True)

gc.collect()

70

In [10]:
X_train

Unnamed: 0,building_id,meter,site_id,primary_use,floor_count,air_temperature,cloud_coverage,dew_temperature,hour,weekday,month
0,105,0,1,0,5.0,3.8,0.0,2.4,0,4,1
1,106,0,1,0,4.0,3.8,0.0,2.4,0,4,1
2,106,3,1,0,4.0,3.8,0.0,2.4,0,4,1
3,107,0,1,0,10.0,3.8,0.0,2.4,0,4,1
4,108,0,1,0,5.0,3.8,0.0,2.4,0,4,1
...,...,...,...,...,...,...,...,...,...,...,...
19869881,1444,0,15,1,-1.0,1.7,2.0,-5.6,23,5,12
19869882,1445,0,15,0,-1.0,1.7,2.0,-5.6,23,5,12
19869883,1446,0,15,1,-1.0,1.7,2.0,-5.6,23,5,12
19869884,1447,0,15,4,-1.0,1.7,2.0,-5.6,23,5,12


In [11]:
list(X_train.columns)

['building_id',
 'meter',
 'site_id',
 'primary_use',
 'floor_count',
 'air_temperature',
 'cloud_coverage',
 'dew_temperature',
 'hour',
 'weekday',
 'month']

In [12]:
df_train = pd.concat([X_train, y_train], axis=1)

del X_train, y_train
gc.collect()

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19869886 entries, 0 to 19869885
Data columns (total 12 columns):
 #   Column           Dtype  
---  ------           -----  
 0   building_id      uint16 
 1   meter            uint8  
 2   site_id          uint8  
 3   primary_use      uint8  
 4   floor_count      float32
 5   air_temperature  float32
 6   cloud_coverage   float32
 7   dew_temperature  float32
 8   hour             uint8  
 9   weekday          uint8  
 10  month            uint8  
 11  meter_reading    float32
dtypes: float32(5), uint16(1), uint8(6)
memory usage: 530.6 MB


In [13]:
df_train

Unnamed: 0,building_id,meter,site_id,primary_use,floor_count,air_temperature,cloud_coverage,dew_temperature,hour,weekday,month,meter_reading
0,105,0,1,0,5.0,3.8,0.0,2.4,0,4,1,3.190624
1,106,0,1,0,4.0,3.8,0.0,2.4,0,4,1,0.318163
2,106,3,1,0,4.0,3.8,0.0,2.4,0,4,1,0.000000
3,107,0,1,0,10.0,3.8,0.0,2.4,0,4,1,5.171529
4,108,0,1,0,5.0,3.8,0.0,2.4,0,4,1,4.524668
...,...,...,...,...,...,...,...,...,...,...,...,...
19869881,1444,0,15,1,-1.0,1.7,2.0,-5.6,23,5,12,2.277267
19869882,1445,0,15,0,-1.0,1.7,2.0,-5.6,23,5,12,1.762159
19869883,1446,0,15,1,-1.0,1.7,2.0,-5.6,23,5,12,0.000000
19869884,1447,0,15,4,-1.0,1.7,2.0,-5.6,23,5,12,5.078761


In [15]:
df_targets = df_train['meter_reading']

In [16]:
df_targets.head()

0    3.190624
1    0.318163
2    0.000000
3    5.171529
4    4.524668
Name: meter_reading, dtype: float32

**Spliting Data**

In [18]:
from sklearn.model_selection import train_test_split

train_data,test_data,train_target,test_target = train_test_split(df_train,df_targets, test_size=0.2)

In [19]:
train_data.head()

Unnamed: 0,building_id,meter,site_id,primary_use,floor_count,air_temperature,cloud_coverage,dew_temperature,hour,weekday,month,meter_reading
19057950,117,0,1,0,4.0,7.3,8.161017,7.3,16,5,12,3.78419
8691858,610,0,4,1,4.0,17.799999,4.0,7.8,17,2,6,3.617652
11170925,250,3,2,0,-1.0,26.0,7.2,20.0,20,4,7,0.0
7569799,291,0,3,9,-1.0,26.1,4.0,17.799999,19,3,5,5.382245
3921099,973,1,9,6,-1.0,12.8,1.0,5.0,1,5,3,3.726033


In [20]:
train_data.shape

(15895908, 12)

In [21]:
test_data.shape

(3973978, 12)

In [22]:
train_target.shape

(15895908,)

In [23]:
test_target.shape

(3973978,)

**Machine Models**

In [27]:
from sklearn.linear_model import LinearRegression,SGDRegressor,ElasticNet,Ridge
from sklearn.svm import SVC
from sklearn import linear_model
from sklearn.ensemble import StackingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

In [29]:
def checkModelPerformane(model):
    X=train_data
    y=train_target
    model.fit(X,y)
    pred=model.predict(test_data)
    print("mean_squared_error: ",np.sqrt(mean_squared_error(test_target, pred))) 
    print("mean_absolute_error: ", np.sqrt(mean_absolute_error(test_target, pred)))

Linear Regression

In [30]:
print("LinearRegression")
checkModelPerformane(LinearRegression())

LinearRegression
mean_squared_error:  2.0073492e-06
mean_absolute_error:  0.0012559247


In [31]:
print("LinearRegression with lasso regualrizion")
checkModelPerformane(linear_model.Lasso(alpha=0.1))

LinearRegression with lasso regualrizion
mean_squared_error:  0.048779067
mean_absolute_error:  0.19565679


Lasso, Ridge and Elastic Regression

In [33]:
print("lasso regression ")
checkModelPerformane(linear_model.Lasso(alpha=0.1))

lasso regression 
mean_squared_error:  0.048779067
mean_absolute_error:  0.19565679


In [34]:
print("ElasticNet regression ")
checkModelPerformane(ElasticNet())

ElasticNet regression 
mean_squared_error:  0.4296076
mean_absolute_error:  0.58064497


In [35]:
print("Ridge regression ")
checkModelPerformane(Ridge(alpha=1.0))

Ridge regression 
mean_squared_error:  1.1636168e-05
mean_absolute_error:  0.0030065794


Decision Tree

In [36]:
print("Decision tree regression")
checkModelPerformane(DecisionTreeRegressor(random_state=42))

Decision tree regression
mean_squared_error:  5.5630510852342654e-05
mean_absolute_error:  0.000981215102035194


3 Layers

In [39]:
from keras import models
from keras import layers

def build_model():
    # Because we will need to instantiate
    # the same model multiple times,
    # we use a function to construct it.
    model=models.Sequential()
    model.add(layers.Flatten())
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(128, activation='relu'))# Add the proper output and activation
    #model.add(layers.Dense(1, activation='relu'))# Add the proper output and activation
    return model

In [None]:
print("SVM")
checkModelPerformane(SVR())

SVM


SGDRegressor

In [None]:
print("SGDRegressor")
checkModelPerformane(SGDRegressor())