In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn import utils
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor,Ridge,ElasticNet
from sklearn.model_selection import train_test_split,KFold,GroupKFold
import lightgbm as lgb
import gc
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn. linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.layers as layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.optimizers import *

from prettytable import PrettyTable

import warnings
warnings.filterwarnings('ignore')

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/ashrae-energy-prediction/sample_submission.csv
/kaggle/input/ashrae-energy-prediction/building_metadata.csv
/kaggle/input/ashrae-energy-prediction/weather_train.csv
/kaggle/input/ashrae-energy-prediction/weather_test.csv
/kaggle/input/ashrae-energy-prediction/train.csv
/kaggle/input/ashrae-energy-prediction/test.csv


## - Loading and Reducing Memory Usage of Data

In [2]:
data_path = "/kaggle/input/ashrae-energy-prediction/"

train_path = data_path + "train.csv"

building_path = data_path + "building_metadata.csv"

weather_train_path = data_path + "weather_train.csv"

In [3]:
train_data = pd.read_csv(train_path)

building_data = pd.read_csv(building_path)

weather_train_data = pd.read_csv(weather_train_path)

In [4]:
train_data = train_data.loc[train_data['meter'] == 0]
train_data = train_data.reset_index()

In [5]:
train_data.to_feather('train_data.feather')

building_data.to_feather('building_data.feather')

weather_train_data.to_feather('weather_train_data.feather')

In [6]:
train_data = pd.read_feather('train_data.feather')

building_data = pd.read_feather('building_data.feather')

weather_train_data = pd.read_feather('weather_train_data.feather')

In [7]:
def reduce_mem_usage(df, df_name):

    start_mem = df.memory_usage().sum() / 1024**2
    
    for col in df.columns:

        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2

    print('Memory usage of {} is reduced by {:.2f} %. Usage dropped from {:.2f} MB to {:.2f} MB.'.format(df_name, (100 * (start_mem - end_mem) / start_mem), start_mem, end_mem))
    
    return df


In [8]:
train_data = reduce_mem_usage(train_data, 'Train Data')

building_data = reduce_mem_usage(building_data, 'Building Data')

weather_train_data = reduce_mem_usage(weather_train_data, 'Weather Train Data')

Memory usage of Train Data is reduced by 67.43 %. Usage dropped from 460.09 MB to 149.85 MB.
Memory usage of Building Data is reduced by 73.88 %. Usage dropped from 0.07 MB to 0.02 MB.
Memory usage of Weather Train Data is reduced by 73.06 %. Usage dropped from 9.60 MB to 2.59 MB.


In [9]:
# merge data

train = train_data.merge(building_data, on='building_id', how='left')
train = train.merge(weather_train_data, on=['site_id', 'timestamp'], how='left')

In [10]:
def breakdown_timestamp(dataframe):
    
    dataframe['timestamp']= pd.to_datetime(dataframe['timestamp'])

    dataframe['hour']= np.uint8(dataframe['timestamp'].dt.hour)
    
    dataframe['day']= np.uint16(dataframe['timestamp'].dt.day)
    dataframe['dayofweek']= np.uint8(dataframe['timestamp'].dt.dayofweek)
    dataframe['dayofyear']= np.uint16(dataframe['timestamp'].dt.dayofyear)

    dataframe['month']= np.uint8(dataframe['timestamp'].dt.month)

    dataframe['year']= np.uint16(dataframe['timestamp'].dt.year)
    
    return dataframe

In [11]:
train = breakdown_timestamp(train)

## - Applying Log Transformation to 'Meter Reading and 'Square Feet'

In [12]:
train['meter_reading'] = np.log1p(train['meter_reading'])

In [13]:
train['square_feet'] = np.log1p(train['square_feet'])

## - Data Preperation and Feature Engineering

In [14]:
zero_meter_readings = list(train[train['meter_reading'] == 0].index)
train.drop(zero_meter_readings, axis = 0, inplace = True)

In [15]:
# Drop Columns with More than 50% Missing Values
threshold = len(train) * 0.5
train.dropna(axis=1, thresh = threshold, inplace = True)

In [16]:
# Fill Missing Values
train['cloud_coverage'].fillna(train['cloud_coverage'].median(), inplace=True)
train['sea_level_pressure'].fillna(train['sea_level_pressure'].median(), inplace=True)
train['precip_depth_1_hr'].fillna(train['precip_depth_1_hr'].median(), inplace=True)
train['wind_direction'].fillna(train['wind_direction'].median(), inplace=True)
train['wind_speed'].fillna(train['wind_speed'].median(), inplace=True)
train['dew_temperature'].fillna(train['dew_temperature'].median(), inplace=True)
train['air_temperature'].fillna(train['air_temperature'].median(), inplace=True)

In [17]:
# Add new feature from existing ones to get better results

train['season'] = train['timestamp'].apply(lambda x: 'Spring' if x.month==3 or x.month==4 or x.month==5 else 
                                                  'Summer' if x.month==6 or x.month==7 or x.month==8 else 
                                                  'Autumn' if x.month==9 or x.month==10 or x.month==11 else 
                                                  'Winter')

train['isDayTime'] = train['timestamp'].apply(lambda x: 1 if x.hour >=6 and x.hour <=18 else 0)

In [18]:
# Encoding categorical data

categorical_features = ['primary_use', 'season']

encoder = preprocessing.LabelEncoder()

for i in categorical_features:
    
    train[i] = encoder.fit_transform(train[i])
    

## - Drop Timestamp and Reduce Memory Usage Again

In [19]:
train = train.drop(['timestamp'],axis=1)

In [20]:
reduced_train_data = reduce_mem_usage(train, 'Train Data')

Memory usage of Train Data is reduced by 34.67 %. Usage dropped from 824.74 MB to 538.83 MB.


## - Removing Least Important Features Generated by our Feature Selection Method

In [21]:
new_data = reduced_train_data[['square_feet','building_id','primary_use','site_id','hour','air_temperature','index','dayofyear','dayofweek','isDayTime','dew_temperature','meter_reading']]

In [22]:
new_data.drop(['site_id','dew_temperature'],axis=1, inplace = True)

# Models and Fine Tuning

In [23]:
X_train = new_data.drop(['meter_reading'],axis = 1)

Y_train = new_data['meter_reading'].values

## 1 Basline Model

In [24]:
def baselineModel(y_actual,y_pred):

    rmsle_score = np.sqrt(np.mean((y_actual - y_pred) * (y_actual-y_pred)))
    
    print("The RMSLE Score of the Baseline Model is :",rmsle_score)

baselineModel(Y_train, np.median(Y_train))

The RMSLE Score of the Baseline Model is : 1.513


The baseline score can be computed is median value of labels. The baseline score for 50% of the data is 1.772. So it can be expected that the baseline score could be more.

**Splitting the data for training and testing**

In [25]:
train_x, test_x, train_y, test_y = train_test_split(X_train,Y_train, test_size=0.2, random_state=42)

**Calculating RMSLE**

RMSLE is the suggested evaluation metric for the models performance. Since we already applied log1p transformation to the target value 'Meter Reading', therefore, Root Mean Squared Error (RMSE) can be used directly. And can be computed as follows:

In [26]:
def RMSLE(y_actual, y_pred):
    
    return np.sqrt(mean_squared_error(y_actual, y_pred))

## 2 Linear Regression

In [27]:
linear_Regression = LinearRegression()
linear_Regression.fit(train_x, train_y)

LinearRegression()

**Calculating Prediction Score**

In [28]:
print('Linear Regression Traininig RMSLE = ', RMSLE((train_y) , (linear_Regression.predict(train_x))))
print('Linear Regression Testing RMSLE = ',RMSLE((test_y) ,(linear_Regression.predict(test_x))))

Linear Regression Traininig RMSLE =  1.0090694012886698
Linear Regression Testing RMSLE =  1.0085413165280115


**Linear Regresssion Cross Validation**

In [29]:
lin_scores = cross_val_score(linear_Regression, train_x, train_y,scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)

print("Linear Regression CV Scores:", lin_rmse_scores)
print("==========================================================================================")
print("Mean CV Score:", lin_rmse_scores.mean())

Linear Regression CV Scores: [1.00872785 1.00851953 1.00626184 1.00933602 1.00932143 1.01150728
 1.00856741 1.01168035 1.00824321 1.00853313]
Mean CV Score: 1.0090698041714536


## 3 ElasticNet

**Hyperparameter Tuning**

In [30]:
parameters = {'alpha':[0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'fit_intercept' : [False],
              'l1_ratio':[0.5]}

elasticnet = GridSearchCV(estimator = ElasticNet(),
                        param_grid = parameters,
                        cv = 3, 
                        scoring = 'neg_mean_squared_error',
                        verbose = 1,
                        return_train_score = True,
                        n_jobs = -1)

elasticnet.fit(train_x, train_y)

Fitting 3 folds for each of 7 candidates, totalling 21 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  21 out of  21 | elapsed: 23.1min finished
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


GridSearchCV(cv=3, estimator=ElasticNet(), n_jobs=-1,
             param_grid={'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'fit_intercept': [False], 'l1_ratio': [0.5]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=1)

In [31]:
print("Best ElasticNet Estimator is : ", elasticnet.best_estimator_)
print("Best ElasticNet Paramteres are : ", elasticnet.best_params_)

Best ElasticNet Estimator is :  ElasticNet(alpha=0.001, fit_intercept=False)
Best ElasticNet Paramteres are :  {'alpha': 0.001, 'fit_intercept': False, 'l1_ratio': 0.5}


**Building Best ElasticNet Estimator**

In [32]:
elastic_model = ElasticNet(alpha = 0.001, fit_intercept = False, l1_ratio = 0.5)
elastic_model.fit(train_x, train_y)

  positive)
  positive)
  positive)


ElasticNet(alpha=0.001, fit_intercept=False)

**Calculating Prediction Score**

In [33]:
print('ElasticNet Training RMSLE = ',RMSLE((train_y) , (elastic_model.predict(train_x))))
print('ElasticNet Testing RMSLE = ',RMSLE((test_y) , (elastic_model.predict(test_x))))

ElasticNet Training RMSLE =  1.1601119761684195
ElasticNet Testing RMSLE =  1.1594681989166413


## 4 Ridge

**Hyperparameters Tuning**

In [34]:
parameters = {'alpha':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'fit_intercept' : [True],
              'solver' : ['lsqr']}

ridge = GridSearchCV(estimator = Ridge(),
                        param_grid = parameters,
                        cv = 3, 
                        scoring = 'neg_mean_squared_error',
                        verbose = 1,
                        return_train_score = True,
                        n_jobs = -1)

ridge.fit(train_x, train_y)

Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   24.6s finished


GridSearchCV(cv=3, estimator=Ridge(), n_jobs=-1,
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
                         'fit_intercept': [True], 'solver': ['lsqr']},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=1)

In [35]:
print("Best Ridge Estimator is :",ridge.best_estimator_)
print("Best Ridge Paramteres are : ", ridge.best_params_)

Best Ridge Estimator is : Ridge(alpha=0.0001, solver='lsqr')
Best Ridge Paramteres are :  {'alpha': 0.0001, 'fit_intercept': True, 'solver': 'lsqr'}


**Best Ridge Estimator**

In [36]:
Ridge = Ridge(alpha = 0.0001, fit_intercept = True, solver = "lsqr")
Ridge.fit(train_x, train_y)

Ridge(alpha=0.0001, solver='lsqr')

**Calculating Prediction Score**

In [37]:
print('Ridge Training RMSLE = ',RMSLE((train_y) , (Ridge.predict(train_x))))
print('Ridge Testing RMSLE = ',RMSLE(test_y , Ridge.predict(test_x)))

Ridge Training RMSLE =  1.5101599351525472
Ridge Testing RMSLE =  1.5091990174524632


## 5 Lasso

**Hyperparameter Tuninig**

In [38]:
parameters = {'alpha':[0.001, 0.01, 0.1, 1, 10]}

lasso = GridSearchCV(estimator = Lasso(),
                        param_grid = parameters,
                        cv = 3, 
                        scoring = 'neg_mean_squared_error',
                        verbose = 1,
                        return_train_score = True,
                        n_jobs = -1)

lasso.fit(train_x, train_y)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed: 14.5min finished
  positive)
  positive)


GridSearchCV(cv=3, estimator=Lasso(), n_jobs=-1,
             param_grid={'alpha': [0.001, 0.01, 0.1, 1, 10]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=1)

In [39]:
print("Best Lasso Estimator ",lasso.best_estimator_)
print("Best Lasso Paramteres are : ", lasso.best_params_)

Best Lasso Estimator  Lasso(alpha=0.001)
Best Lasso Paramteres are :  {'alpha': 0.001}


**Best Lasso Estimator**

In [40]:
Lasso = Lasso(alpha = 0.001)
Lasso.fit(train_x, train_y)

  positive)
  positive)
  positive)
  positive)


Lasso(alpha=0.001)

**Calculating Prediction Score**

In [41]:
print('Lasso Traininig RMSLE = ',RMSLE((train_y) , (Lasso.predict(train_x))))
print('Lasso Testing RMSLE = ',RMSLE(test_y , Lasso.predict(test_x)))

Lasso Traininig RMSLE =  1.0097581856294302
Lasso Testing RMSLE =  1.0092308664196366


## 6 Decision Tree

**Hyperparameters Tuning**

In [42]:
parameters = {'max_depth': [3,5,7,9,11,15]}


decission_tree = GridSearchCV(estimator = DecisionTreeRegressor(),
                        param_grid = parameters,
                        cv = 3, 
                        scoring = 'neg_mean_squared_error',
                        verbose = 1,
                        return_train_score = True,
                        n_jobs = -1)

decission_tree.fit(train_x, train_y)

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:  5.1min finished


GridSearchCV(cv=3, estimator=DecisionTreeRegressor(), n_jobs=-1,
             param_grid={'max_depth': [3, 5, 7, 9, 11, 15]},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=1)

In [43]:
print("Best Decision Tree Estimator ",decission_tree.best_estimator_)
print("Best Decision Tree Paramteres are : ", decission_tree.best_params_)

Best Decision Tree Estimator  DecisionTreeRegressor(max_depth=15)
Best Decision Tree Paramteres are :  {'max_depth': 15}


**Best Decision Tree Estimator**

In [44]:
DecissionTree = DecisionTreeRegressor(max_depth=15)
DecissionTree.fit(train_x, train_y)

DecisionTreeRegressor(max_depth=15)

**Calculating Prediction Score**

In [45]:
print('Decision Tree Training RMSLE = ',RMSLE((train_y) , (DecissionTree.predict(train_x))))
print('Decision Tree Testing RMSLE = ',RMSLE(test_y , DecissionTree.predict(test_x)))

Decision Tree Training RMSLE =  0.42242750334370155
Decision Tree Testing RMSLE =  0.4237460031710356


## 7 RandomForestRegressor

**Hyperparameters Tuning**

In [46]:
# parameters = { 'n_estimators': [60,80,100],
#               'max_depth':[5,7,9]}

# forest_reg = GridSearchCV(estimator = RandomForestRegressor(),
#                         param_grid = parameters,
#                         cv = 3, 
#                         scoring = 'neg_mean_squared_error',
#                         verbose = 1,
#                         return_train_score = True,
#                         n_jobs = -1)
# forest_reg.fit(train_x, train_y)

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


KeyboardInterrupt: 

In [None]:
# print("Best Random Forest Estimator ",forest_reg.best_estimator_)
# print("Best Random Forest Paramteres are : ", forest_reg.best_params_)

**Best RandomForest Estimator**

In [None]:
# RandomForest = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth)
# RandomForest.fit(train_x, train_y)

**Calculating Prediction Score**

In [None]:
# print('Train RMSLE = ',RMSLE((train_y) , (RandomForest.predict(train_x))))
# print('Test RMSLE = ',RMSLE(test_y , RandomForest.predict(test_x)))

## 8 SGD Regressor

In [47]:
# Scaling the Data Prior to Passing it to the Model

x_train_scaled = preprocessing.scale(train_x)
x_test_scaled = preprocessing.scale(test_x)

**Hyperparameters Tuning**

In [48]:
parameters = {'alpha':[0.0001, 0.001],
             'eta0': [0.001],
             'penalty': ['l2'],
             'learning_rate': ['adaptive', 'invscaling'],
             'early_stopping': [True]}

sgd = GridSearchCV(estimator = SGDRegressor(),
                        param_grid = parameters,
                        cv = 3, 
                        scoring = 'neg_mean_squared_error',
                        verbose = 10,
                        return_train_score = True,
                        n_jobs = -1)

sgd.fit(x_train_scaled, train_y)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:  2.6min remaining:  1.9min
[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:  2.9min remaining:   58.3s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  3.2min finished


GridSearchCV(cv=3, estimator=SGDRegressor(), n_jobs=-1,
             param_grid={'alpha': [0.0001, 0.001], 'early_stopping': [True],
                         'eta0': [0.001],
                         'learning_rate': ['adaptive', 'invscaling'],
                         'penalty': ['l2']},
             return_train_score=True, scoring='neg_mean_squared_error',
             verbose=10)

In [49]:
print("Best SGDRegressor Estimator ",sgd.best_estimator_)
print("Best SGDRegressor Paramteres are : ", sgd.best_params_)

Best SGDRegressor Estimator  SGDRegressor(early_stopping=True, eta0=0.001, learning_rate='adaptive')
Best SGDRegressor Paramteres are :  {'alpha': 0.0001, 'early_stopping': True, 'eta0': 0.001, 'learning_rate': 'adaptive', 'penalty': 'l2'}


**Best SGDRegressor**

In [50]:
sgd_regressor = SGDRegressor(early_stopping=True, eta0=0.001, learning_rate='adaptive')
sgd_regressor.fit(x_train_scaled, train_y)

SGDRegressor(early_stopping=True, eta0=0.001, learning_rate='adaptive')

**Calculating Prediction Score**

In [51]:
print('SGDRegressor Training RMSLE = ',RMSLE((train_y) , (sgd_regressor.predict(x_train_scaled))))
print('SGDRegressor Testing RMSLE = ',RMSLE(test_y , sgd_regressor.predict(x_test_scaled)))

SGDRegressor Training RMSLE =  1.00940761908817
SGDRegressor Testing RMSLE =  1.008881559274521
