In [1]:
import pandas as pd
pd.options.display.max_columns=200
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
import warnings 
warnings.filterwarnings('ignore')
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.compose import TransformedTargetRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor

In [2]:
data = pd.read_csv('data.csv')

In [3]:
data.set_index('OSEBuildingID', inplace = True)

In [4]:
data.loc[data['TotalGHGEmissions'].isnull(), 'TotalGHGEmissions'] = np.mean(data['TotalGHGEmissions'])

In [5]:
data.loc[data['GHGEmissionsIntensity'].isnull(), 'GHGEmissionsIntensity'] = np.mean(data['GHGEmissionsIntensity'])

In [6]:
data.loc[data['SiteEUI(kBtu/sf)'].isnull()]

Unnamed: 0_level_0,NumberofBuildings,NumberofFloors,PropertyGFATotal,LargestPropertyUseTypeGFA,ENERGYSTARScore,EnergyStarNaN,SiteEUI(kBtu/sf),SiteEUIWN(kBtu/sf),SourceEUI(kBtu/sf),SourceEUIWN(kBtu/sf),SiteEnergyUse(kBtu),SiteEnergyUseWN(kBtu),TotalGHGEmissions,GHGEmissionsIntensity,Age,PropertyGFAParking(%),PropertyGFABuilding(s)(%),SteamUse(%),Electricity(%),NaturalGas(%),PrimaryPropertyType_Distribution Center,PrimaryPropertyType_Hotel,PrimaryPropertyType_K-12 School,PrimaryPropertyType_Large Office,PrimaryPropertyType_Medical Office,PrimaryPropertyType_Mixed Use Property,PrimaryPropertyType_Other,PrimaryPropertyType_Retail Store,PrimaryPropertyType_Self-Storage Facility,PrimaryPropertyType_Small- and Mid-Sized Office,PrimaryPropertyType_Supermarket / Grocery Store,PrimaryPropertyType_University,PrimaryPropertyType_Warehouse,PrimaryPropertyType_Worship Facility,CouncilDistrictCode_1,CouncilDistrictCode_2,CouncilDistrictCode_3,CouncilDistrictCode_4,CouncilDistrictCode_5,CouncilDistrictCode_6,CouncilDistrictCode_7
OSEBuildingID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1


In [7]:
data.loc[data['SiteEUI(kBtu/sf)'].isnull(), 'SiteEUI(kBtu/sf)'] = np.mean(data['SiteEUI(kBtu/sf)'])

In [8]:
X = data[['Age','NumberofBuildings','NumberofFloors','PropertyGFATotal','PropertyGFAParking(%)',
        'PropertyGFABuilding(s)(%)','LargestPropertyUseTypeGFA','ENERGYSTARScore','EnergyStarNaN',
        'TotalGHGEmissions','GHGEmissionsIntensity',
        'PrimaryPropertyType_Distribution Center', 'PrimaryPropertyType_Hotel',
       'PrimaryPropertyType_K-12 School', 
       'PrimaryPropertyType_Large Office',
       'PrimaryPropertyType_Medical Office',
       'PrimaryPropertyType_Mixed Use Property',
       'PrimaryPropertyType_Other',
       'PrimaryPropertyType_Retail Store',
       'PrimaryPropertyType_Self-Storage Facility',
       'PrimaryPropertyType_Small- and Mid-Sized Office',
       'PrimaryPropertyType_Supermarket / Grocery Store',
       'PrimaryPropertyType_University', 'PrimaryPropertyType_Warehouse',
       'PrimaryPropertyType_Worship Facility','CouncilDistrictCode_1',
       'CouncilDistrictCode_2', 'CouncilDistrictCode_3',
       'CouncilDistrictCode_4', 'CouncilDistrictCode_5',
       'CouncilDistrictCode_6', 'CouncilDistrictCode_7']]

y = data['SiteEUI(kBtu/sf)']

In [9]:
X.head()

Unnamed: 0_level_0,Age,NumberofBuildings,NumberofFloors,PropertyGFATotal,PropertyGFAParking(%),PropertyGFABuilding(s)(%),LargestPropertyUseTypeGFA,ENERGYSTARScore,EnergyStarNaN,TotalGHGEmissions,GHGEmissionsIntensity,PrimaryPropertyType_Distribution Center,PrimaryPropertyType_Hotel,PrimaryPropertyType_K-12 School,PrimaryPropertyType_Large Office,PrimaryPropertyType_Medical Office,PrimaryPropertyType_Mixed Use Property,PrimaryPropertyType_Other,PrimaryPropertyType_Retail Store,PrimaryPropertyType_Self-Storage Facility,PrimaryPropertyType_Small- and Mid-Sized Office,PrimaryPropertyType_Supermarket / Grocery Store,PrimaryPropertyType_University,PrimaryPropertyType_Warehouse,PrimaryPropertyType_Worship Facility,CouncilDistrictCode_1,CouncilDistrictCode_2,CouncilDistrictCode_3,CouncilDistrictCode_4,CouncilDistrictCode_5,CouncilDistrictCode_6,CouncilDistrictCode_7
OSEBuildingID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
1,89.0,1.0,12.0,88434.0,0.0,100.0,88434.0,60.0,0.0,249.98,2.83,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,20.0,1.0,11.0,103566.0,14.545314,85.454686,83880.0,61.0,0.0,295.86,2.86,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,47.0,1.0,41.0,956110.0,20.574829,79.425171,756493.0,43.0,0.0,2089.28,2.19,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5,90.0,1.0,10.0,61320.0,0.0,100.0,61320.0,56.0,0.0,286.43,4.67,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,36.0,1.0,18.0,175580.0,35.311539,64.688461,123445.0,75.0,0.0,505.01,2.88,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [10]:
y.head()

OSEBuildingID
1     81.699997
2     94.800003
3     96.000000
5    110.800003
8    114.800003
Name: SiteEUI(kBtu/sf), dtype: float64

In [11]:
std_scaler = StandardScaler().fit(X)
X_scale = std_scaler.transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_scale, y, test_size=0.2, random_state=7)

In [12]:
X_test.shape

(330, 32)

In [13]:
dr = DummyRegressor(strategy='mean')
dr.fit(X_train, y_train)
y_pred = dr.predict(X_test)
baseline_error = mean_squared_error(y_test, y_pred, squared=False)
baseline_error

71.58877563770794

In [14]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
lr_error = mean_squared_error(y_test, y_pred, squared=False)
lr_error

53.22305073230883

In [15]:
def resultize_RMSE(grid): 
      results = pd.DataFrame(grid.cv_results_)
      results = results.sort_values(by = 'rank_test_score')
      results = results[['params', 'mean_test_score', 'rank_test_score']]
      results['mean_test_score'] = mean_squared_error(y_test, grid.predict(X_test), squared=False)
      return results

In [16]:
params = {'alpha': np.logspace(-3, 10, 20)}
ridge = Ridge()
grid_ridge = GridSearchCV(ridge, params, cv=10)
grid_ridge.fit(X_train, y_train)
grid_ridge_best_params = grid_ridge.best_params_
resultize_RMSE(grid_ridge).head()

Unnamed: 0,params,mean_test_score,rank_test_score
7,{'alpha': 61.58482110660267},53.054887,1
8,{'alpha': 297.63514416313194},53.054887,2
6,{'alpha': 12.742749857031347},53.054887,3
5,{'alpha': 2.636650898730358},53.054887,4
4,{'alpha': 0.545559478116852},53.054887,5


In [17]:
ridge = Ridge(**grid_ridge_best_params)
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)
ridge_error = mean_squared_error(y_test, y_pred, squared=False)
ridge_error

53.054887217220106

In [18]:
params = {'alpha': np.logspace(-3, 10, 20)}
grid_lasso = GridSearchCV(Lasso(), params, cv=10, scoring='neg_root_mean_squared_error')
grid_lasso.fit(X_train, y_train)
grid_lasso_best_params = grid_lasso.best_params_
resultize_RMSE(grid_lasso).head()

Unnamed: 0,params,mean_test_score,rank_test_score
4,{'alpha': 0.545559478116852},53.451754,1
5,{'alpha': 2.636650898730358},53.451754,2
3,{'alpha': 0.11288378916846895},53.451754,3
2,{'alpha': 0.023357214690901226},53.451754,4
1,{'alpha': 0.004832930238571752},53.451754,5


In [19]:
lasso = Lasso(**grid_lasso_best_params)
lasso.fit(X_train, y_train)
y_pred = lasso.predict(X_test)
lasso_error = mean_squared_error(y_test, y_pred, squared=False)
lasso_error

53.45175430691234

In [20]:
tt_ridge = TransformedTargetRegressor(regressor=Ridge(), 
                                    func=np.log1p, 
                                    inverse_func=np.expm1)
tt_ridge.fit(X_train, y_train)
y_pred = tt_ridge.predict(X_test)
tt_ridge_error = mean_squared_error(y_test, y_pred, squared=False)
tt_ridge_error

4774.3508638096955

In [21]:
params = {'regressor__alpha': np.logspace(-3, 10, 20)}
grid_tt_ridge = GridSearchCV(tt_ridge, params, cv=10)
grid_tt_ridge.fit(X_train, y_train)
grid_tt_ridge_best_params = grid_tt_ridge.best_params_
resultize_RMSE(grid_tt_ridge).head()

Unnamed: 0,params,mean_test_score,rank_test_score
9,{'regressor__alpha': 1438.449888287663},192.855663,1
10,{'regressor__alpha': 6951.927961775606},192.855663,2
8,{'regressor__alpha': 297.63514416313194},192.855663,3
11,{'regressor__alpha': 33598.18286283781},192.855663,4
12,{'regressor__alpha': 162377.67391887243},192.855663,5


In [22]:
grid_tt_ridge_best_params = {'alpha': 1438.449888287663}

In [23]:
tt_ridge = TransformedTargetRegressor(
      regressor=Ridge(**grid_tt_ridge_best_params),
      func=np.log1p,
        inverse_func=np.expm1)
tt_ridge.fit(X_train, y_train)
y_pred = tt_ridge.predict(X_test)
tt_ridge_error = mean_squared_error(y_test, y_pred, squared=False)
tt_ridge_error

192.85566265066424

In [24]:
tt_lasso = TransformedTargetRegressor(regressor=Lasso(),
                                    func=np.log1p,
                                    inverse_func=np.expm1)
tt_lasso.fit(X_train, y_train)
y_pred = tt_lasso.predict(X_test)
tt_lasso_error = mean_squared_error(y_test, y_pred, squared=False)
tt_lasso_error

75.16043487691316

In [25]:
params = {'regressor__alpha': np.logspace(-3, 10, 20)}
grid_tt_lasso = GridSearchCV(tt_lasso, params, cv=10)
grid_tt_lasso.fit(X_train, y_train)
grid_tt_lasso_best_params = grid_tt_lasso.best_params_
resultize_RMSE(grid_tt_lasso).head()

Unnamed: 0,params,mean_test_score,rank_test_score
9,{'regressor__alpha': 1438.449888287663},75.160435,1
17,{'regressor__alpha': 428133239.8719396},75.160435,1
16,{'regressor__alpha': 88586679.04100832},75.160435,1
15,{'regressor__alpha': 18329807.108324375},75.160435,1
14,{'regressor__alpha': 3792690.1907322537},75.160435,1


In [26]:
svr = SVR()
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)
svr_error = mean_squared_error(y_test, y_pred, squared=False)
svr_error

67.72442721659307

In [27]:
'''params = {'kernel': ['sigmoid','rbf','poly','sigmoid'],
         'C': [0.1,1,100,1000],
         'degree': [1,2,3,4,5,6]}
grid_svr = GridSearchCV(SVR(), params, cv=10)
grid_svr.fit(X_train, y_train)
grid_svr_best_params = grid_svr.best_params_
resultize_RMSE(grid_svr)'''

"params = {'kernel': ['sigmoid','rbf','poly','sigmoid'],\n         'C': [0.1,1,100,1000],\n         'degree': [1,2,3,4,5,6]}\ngrid_svr = GridSearchCV(SVR(), params, cv=10)\ngrid_svr.fit(X_train, y_train)\ngrid_svr_best_params = grid_svr.best_params_\nresultize_RMSE(grid_svr)"

In [28]:
grid_svr_best_params = {'C': 1000, 'degree': 1, 'kernel': 'poly'}

In [29]:
tt_svr = SVR(**grid_svr_best_params)
tt_svr.fit(X_train, y_train)
y_pred = tt_svr.predict(X_test)
tt_svr_error = mean_squared_error(y_test, y_pred, squared=False)
tt_svr_error

52.75023761124778

In [30]:
tt_svr = TransformedTargetRegressor(regressor=SVR(),
                                    func=np.log1p,
                                    inverse_func=np.expm1)
tt_svr.fit(X_train, y_train)
y_pred = tt_svr.predict(X_test)
tt_svr_error = mean_squared_error(y_test, y_pred, squared=False) 
tt_svr_error

50.573491867545094

In [31]:
'''params = {'regressor__kernel': ['sigmoid','rbf','poly','sigmoid'],
            'regressor__C': [0.1,1,100,1000],
            'regressor__degree': [1,2,3,4,5,6]}
grid_tt_svr = GridSearchCV(tt_svr, params, cv=10)
grid_tt_svr.fit(X_train, y_train)
grid_tt_svr_best_params = grid_tt_svr.best_params_
resultize_RMSE(grid_tt_svr)'''

"params = {'regressor__kernel': ['sigmoid','rbf','poly','sigmoid'],\n            'regressor__C': [0.1,1,100,1000],\n            'regressor__degree': [1,2,3,4,5,6]}\ngrid_tt_svr = GridSearchCV(tt_svr, params, cv=10)\ngrid_tt_svr.fit(X_train, y_train)\ngrid_tt_svr_best_params = grid_tt_svr.best_params_\nresultize_RMSE(grid_tt_svr)"

In [32]:
grid_tt_svr_best_params = {'C': 1, 'degree': 1, 'kernel': 'rbf'}

In [33]:
tt_svr = TransformedTargetRegressor(regressor=SVR(**grid_tt_svr_best_params),
                                    func=np.log1p,
                                    inverse_func=np.expm1)
tt_svr.fit(X_train, y_train)
y_pred = tt_svr.predict(X_test)
tt_svr_error = mean_squared_error(y_test, y_pred, squared=False)
tt_svr_error

50.573491867545094

In [34]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
rf_error = mean_squared_error(y_test, y_pred, squared=False)
rf_error

37.65827196557304

In [35]:
"""params = {'n_estimators': [10, 100, 1000],
            'max_depth': [1, 10, 100, 1000],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]}
grid_rf = GridSearchCV(RandomForestRegressor(), params, cv=10)
grid_rf.fit(X_train, y_train)
grid_rf_best_params = grid_rf.best_params_
resultize_RMSE(grid_rf)"""

"params = {'n_estimators': [10, 100, 1000],\n            'max_depth': [1, 10, 100, 1000],\n            'min_samples_split': [2, 5, 10],\n            'min_samples_leaf': [1, 2, 4]}\ngrid_rf = GridSearchCV(RandomForestRegressor(), params, cv=10)\ngrid_rf.fit(X_train, y_train)\ngrid_rf_best_params = grid_rf.best_params_\nresultize_RMSE(grid_rf)"

In [36]:
grid_rf_best_params = {'max_depth': 1000,
      'min_samples_leaf': 1,
      'min_samples_split': 10,
      'n_estimators': 100}

In [37]:
rf = RandomForestRegressor(**grid_rf_best_params)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
rf_error = mean_squared_error(y_test, y_pred, squared=False)
rf_error

38.62092843551049

In [38]:
tt_rf = TransformedTargetRegressor(regressor=RandomForestRegressor(),
                                    func=np.log1p,
                                    inverse_func=np.expm1)
tt_rf.fit(X_train, y_train)
y_pred = tt_rf.predict(X_test)
tt_rf_error = mean_squared_error(y_test, y_pred, squared=False)
tt_rf_error

38.94287548969048

In [39]:
'''params = {'regressor__n_estimators': [10, 100, 1000],
            'regressor__max_depth': [1, 10, 100, 1000],
            'regressor__min_samples_split': [2, 5, 10],    
            'regressor__min_samples_leaf': [1, 2, 4]}
grid_tt_rf = GridSearchCV(tt_rf, params, cv=10)
grid_tt_rf.fit(X_train, y_train)
grid_tt_rf_best_params = grid_tt_rf.best_params_
resultize_RMSE(grid_tt_rf)'''

"params = {'regressor__n_estimators': [10, 100, 1000],\n            'regressor__max_depth': [1, 10, 100, 1000],\n            'regressor__min_samples_split': [2, 5, 10],    \n            'regressor__min_samples_leaf': [1, 2, 4]}\ngrid_tt_rf = GridSearchCV(tt_rf, params, cv=10)\ngrid_tt_rf.fit(X_train, y_train)\ngrid_tt_rf_best_params = grid_tt_rf.best_params_\nresultize_RMSE(grid_tt_rf)"

In [40]:
grid_tt_rf_best_params = {'max_depth': 1000,
      'min_samples_leaf': 1,
      'min_samples_split': 10,
      'n_estimators': 100}

In [41]:
tt_rf = TransformedTargetRegressor(regressor=RandomForestRegressor(**grid_tt_rf_best_params),
                                    func=np.log1p,
                                    inverse_func=np.expm1)
tt_rf.fit(X_train, y_train)
y_pred = tt_rf.predict(X_test)
tt_rf_error = mean_squared_error(y_test, y_pred, squared=False)
tt_rf_error

40.71217025005807