In [1]:
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
import random
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from imblearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler, QuantileTransformer, RobustScaler
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split, cross_validate
from imblearn.under_sampling import TomekLinks
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from yellowbrick.classifier.rocauc import roc_auc
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
from category_encoders import CatBoostEncoder
from yellowbrick.classifier import ROCAUC
from imblearn.over_sampling import SMOTENC
from sklearn.metrics import roc_curve, auc
from imblearn.under_sampling import RandomUnderSampler
import category_encoders as ce
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
import warnings
# Silence every warning for the rest of the notebook (library warnings are hidden too).
warnings.filterwarnings("ignore")
# Seed Python's built-in `random` module. This does NOT seed NumPy's global RNG,
# which is a separate generator.
random.seed(2814)

In [2]:
# Load the dataset whose missing values were already imputed (per the file name)
imputed_csv = '../data/processed/Define/seai_missingforest_data_imputed.csv'
df = pd.read_csv(imputed_csv)

In [3]:
# Feature Engineering: derive an EnergyCost column from the main heating fuel.
# Unit price per MainSpaceHeatingFuel label. The keys must match the labels in the data
# exactly, including the ones that arrive truncated.
# NOTE(review): the division by 100 below suggests prices are in cent per unit — confirm.
fuel_unit_price = {
    'Heating Oil': 15.57,
    'Mains Gas': 10.00,
    'Electricity': 7.93,
    'Solid Multi-Fuel': 9.51,
    'Sod Peat': 9.51,
    'Bulk LPG (propane or butane)': 15.00,
    'House Coal': 8.75,
    'Wood Logs': 13.50,
    'Bottled LPG': 24.21,
    'Peat Briquettes': 9.51,
    'Wood Pellets (bulk supply for': 9.07,
    'Electricity - Standard Domesti': 7.93,
    'Wood Pellets (in bags for seco': 9.52,
    'Manufactured Smokeless Fuel': 8.80,
    'Anthracite': 8.40,
    'Wood Chips': 5.92,
    'Electricity - Off-peak Night-R': 11.54,
    'Electricity - On-peak Night-Ra': 11.54,
    'Bioethanol from renewable sour': 12.50,
    'Biodiesel from renewable sourc': 12.50,
}

# One boolean mask per fuel label, with the matching price in the same position
conditions = [df['MainSpaceHeatingFuel'] == fuel for fuel in fuel_unit_price]
values = list(fuel_unit_price.values())

# Pick the unit price per row; any fuel not listed above falls back to 12.50
df['EnergyCost'] = np.select(conditions, values, 12.50)
# Total cost = delivered energy x unit price, scaled by 1/100 and rounded to 2 decimals
df['EnergyCost'] = (df['TotalDeliveredEnergy'] * df['EnergyCost'] / 100).round(2)
# Preview the updated DataFrame
df.head()

Unnamed: 0,CountyName,DwellingTypeDescr,YearofConstruction,GroundFloorArea(sq m),MainSpaceHeatingFuel,MainWaterHeatingFuel,VentilationMethod,StructureType,NoOfSidesSheltered,InsulationType,InsulationThickness,TotalDeliveredEnergy,EnergyRating,EnergyCost
0,Donegal,Detached house,1997,171.19,Heating Oil,Heating Oil,Natural vent.,Masonry,one,Factory Insulated,20.0,25474.522,C2,3966.38
1,Kildare,Detached house,2010,242.93,Heating Oil,Heating Oil,Natural vent.,Masonry,two,Factory Insulated,50.0,27654.474,B3,4305.8
2,Dublin,Semi-detached house,1999,99.38,Mains Gas,Mains Gas,Natural vent.,Masonry,three,Loose Jacket,20.0,17000.038,C3,1700.0
3,Dublin,Semi-detached house,1965,138.41,Mains Gas,Mains Gas,Natural vent.,Masonry,two,Factory Insulated,39.35,22708.484,C2,2270.85
4,Dublin,Semi-detached house,1985,127.16,Mains Gas,Mains Gas,Natural vent.,Masonry,two,Loose Jacket,100.0,28182.863,D2,2818.29


In [4]:
# Reorder columns so the target (EnergyRating, i.e. the BER) is the last one
feature_cols = df.columns[df.columns != 'EnergyRating'].tolist()
df = df[feature_cols + ['EnergyRating']]
del feature_cols

In [5]:
# Make sure year is not counted as a numerical feature: cast the column to object dtype.
# NOTE(review): presumably this is what makes the CatBoost encoder further down treat the
# year as a categorical column — confirm against the encoder's column selection.
df['YearofConstruction'] = df['YearofConstruction'].astype('object')

In [8]:
# Number of dwellings per energy rating, ordered by rating label
df['EnergyRating'].value_counts().sort_index()

A1       981
A2     25973
A3     34284
B1     13866
B2     31155
B3     75200
C1    110384
C2    121622
C3    116075
D1    112599
D2     96922
E1     55989
E2     44367
F      45956
G      66448
Name: EnergyRating, dtype: int64

##### Outline of steps for data to go through in a pipe:

- Data Ingestion
- Data Cleaning
- Data Imputation
- Data Sampling
- Data Splitting of Categorical and Numerical pipes within main pipe
- Run MinMaxScaler on Numerical data
- Run OneHotEncoding on the Categorical data

Now that imputation is complete, we need to sample the dataset.

In [8]:
# Split the frame positionally: every column but the last is a feature (independent
# variables); the last column is the target (dependent variable)
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [9]:
# Randomly distribute the data so CatBoost can work.
# The shuffle uses its own seeded NumPy Generator: `random.seed(...)` in the setup cell
# only seeds Python's `random` module, so the previous `np.random.permutation` call drew
# from an unseeded RNG and produced a different row order (and different downstream
# encodings, splits and scores) on every kernel restart.
shuffle_rng = np.random.default_rng(2814)
perm = shuffle_rng.permutation(len(X))
# Apply the same permutation to features and target so rows stay aligned,
# then rebuild a clean 0..n-1 index
X = X.iloc[perm].reset_index(drop=True)
y = y.iloc[perm].reset_index(drop=True)

In [10]:
# Hold out 20% of the rows for testing; stratify on y so both parts keep the same
# class proportions, and fix the seed so the split is repeatable
split_options = dict(test_size=0.2, stratify=y, random_state=2814)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [12]:
# Display the held-out feature rows (still un-encoded at this point)
X_test

Unnamed: 0,CountyName,DwellingTypeDescr,YearofConstruction,GroundFloorArea(sq m),MainSpaceHeatingFuel,MainWaterHeatingFuel,VentilationMethod,StructureType,NoOfSidesSheltered,InsulationType,InsulationThickness,TotalDeliveredEnergy,EnergyCost
394550,Meath,Detached house,1998,244.45,Heating Oil,Heating Oil,Natural vent.,Masonry,zero,Factory Insulated,35.00,38683.684000,6023.05
123360,Donegal,Mid-terrace house,1900,188.05,Heating Oil,Heating Oil,Natural vent.,Masonry,two,Factory Insulated,35.00,33837.484640,5268.50
405727,Kerry,Detached house,2002,205.63,Heating Oil,Heating Oil,Natural vent.,Masonry,two,Factory Insulated,20.00,28406.311000,4422.86
298857,Dublin,Semi-detached house,1996,103.03,Mains Gas,Mains Gas,Natural vent.,Masonry,four,Loose Jacket,50.00,15651.053610,1565.11
934723,Roscommon,Semi-detached house,2007,142.24,Mains Gas,Mains Gas,Natural vent.,Masonry,two,Factory Insulated,25.00,16670.568460,1667.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...
152182,Cork,Mid-terrace house,1950,68.80,Heating Oil,Heating Oil,Natural vent.,Masonry,two,Factory Insulated,50.00,19159.346000,2983.11
222677,Westmeath,Mid-terrace house,1970,78.00,Solid Multi-Fuel,Solid Multi-Fuel,Natural vent.,Masonry,two,Loose Jacket,51.50,43046.256000,4093.70
949027,Waterford,Detached house,2007,326.22,Heating Oil,Heating Oil,Natural vent.,Timber or Steel Frame,one,Factory Insulated,20.00,35739.267000,5564.60
273713,Kildare,Detached house,1993,203.78,Heating Oil,Heating Oil,Natural vent.,Masonry,zero,Factory Insulated,35.00,27741.764790,4319.39


In [28]:
# Label Encoding of Target for CatBoost: learn the rating -> integer mapping from the
# training labels only, then apply that same mapping to both label sets
yle = LabelEncoder().fit(y_train)
y_train = yle.transform(y_train)
y_test = yle.transform(y_test)

In [None]:
# Hyper-parameter search over the whole encode -> scale -> resample -> classify pipeline.
# An imblearn Pipeline is required here because it accepts a sampler (SMOTETomek) as a step.
from imblearn.pipeline import Pipeline as Pipe
pipe = Pipe([
    ('scaler1', CatBoostEncoder()),                  # target-encode categorical columns
    ('scaler2', MinMaxScaler()),                     # rescale every feature to [0, 1]
    ('smote', SMOTETomek(random_state=2814)),        # seeded so the resampling is repeatable
    ('rf', RandomForestClassifier(random_state=2814, criterion='entropy')),
])

# NOTE(review): `param_grid` and `stratified_kfold` were referenced but never defined in
# this notebook, so the cell raised NameError on a fresh kernel. The definitions below are
# placeholders (175 trees is the value used by the final model further down) — replace
# them with the search space / CV scheme that was actually intended.
param_grid = {'rf__n_estimators': [100, 175, 250]}
stratified_kfold = RepeatedStratifiedKFold(n_splits=3, n_repeats=1, random_state=2814)

# The target has 15 classes; the plain 'roc_auc' scorer does not support a multiclass
# target, so the one-vs-rest variant is used instead.
grid_search = GridSearchCV(estimator=pipe,
                           param_grid=param_grid,
                           scoring='roc_auc_ovr',
                           cv=stratified_kfold,
                           n_jobs=-1)

grid_search.fit(X_train, y_train)
cv_score = grid_search.best_score_
# Score the refit best pipeline on the held-out set with the same metric
test_score = grid_search.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')

In [29]:
# CatBoost Encoding: the encoder is fitted on the training features and labels only,
# then the fitted encoder is applied to the test features (no test labels are used).
# NOTE(review): `fit_transform(X, y)` is kept as a single call on purpose — for this
# encoder it is presumably not equivalent to `fit(X, y)` followed by `transform(X)`; confirm
# against the category_encoders documentation before refactoring.
cbe_encoder = ce.cat_boost.CatBoostEncoder()
X_train = cbe_encoder.fit_transform(X_train, y_train)
X_test = cbe_encoder.transform(X_test)

In [13]:
# Inspect the raw row at position 209132 of the original (un-shuffled) frame.
# NOTE(review): X was shuffled and re-indexed before the split, so this position does not
# correspond to the row labelled 209132 in X_train.
df.iloc[209132]

CountyName                       Meath
DwellingTypeDescr                House
YearofConstruction                1978
GroundFloorArea(sq m)            96.21
MainSpaceHeatingFuel       Heating Oil
MainWaterHeatingFuel       Heating Oil
VentilationMethod        Natural vent.
StructureType                  Masonry
NoOfSidesSheltered                 two
InsulationType            Loose Jacket
InsulationThickness               50.0
TotalDeliveredEnergy         31994.857
EnergyCost                      4981.6
EnergyRating                         F
Name: 209132, dtype: object

In [30]:
# Taking a look at the encoding: display the training features after CatBoost encoding
pd.DataFrame(X_train)

Unnamed: 0,CountyName,DwellingTypeDescr,YearofConstruction,GroundFloorArea(sq m),MainSpaceHeatingFuel,MainWaterHeatingFuel,VentilationMethod,StructureType,NoOfSidesSheltered,InsulationType,InsulationThickness,TotalDeliveredEnergy,EnergyCost
209132,8.129275,8.129275,8.129275,128.66,8.129275,8.129275,8.129275,8.129275,8.129275,8.129275,33.600000,30202.508690,4702.53
578817,8.129275,8.129275,8.129275,111.69,9.064637,9.064637,9.064637,9.064637,9.064637,8.129275,25.000000,20162.983000,3139.38
321728,8.129275,8.129275,8.129275,256.87,8.129275,8.129275,8.709758,8.709758,8.129275,9.064637,50.000000,80066.133000,8006.61
129181,9.064637,8.064637,8.129275,70.08,8.709758,8.709758,9.532319,9.532319,8.709758,10.043092,25.000000,12560.068000,1955.60
756243,8.129275,10.064637,8.129275,65.24,8.532319,8.532319,9.225855,9.225855,8.532319,8.064637,28.724286,25958.259000,4041.70
...,...,...,...,...,...,...,...,...,...,...,...,...,...
848836,8.276301,8.153602,8.580915,84.43,8.479040,8.466121,8.404010,8.250801,7.949606,9.920093,30.000000,22973.034000,3576.90
727640,8.599192,8.153614,8.809947,88.88,8.479049,8.466130,8.404014,8.250805,8.063390,9.920099,80.000000,31396.103000,4888.37
467866,8.020865,8.380834,10.563166,71.78,7.431547,7.414135,8.404020,8.250812,8.063406,7.388765,26.707500,7934.413970,793.44
209946,8.020852,8.153635,9.526677,140.40,8.479065,8.466146,8.404016,8.250807,8.063396,7.388760,50.000000,36910.841430,5747.02


In [31]:
# Scaling the dataset. Not required for tree models but good practice.
# The min/max ranges are learned from the training features only and then reused
# unchanged on the test features.
scaler = MinMaxScaler()
scaler.fit(X_train, y_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [32]:
# Use value counts as reference for under sampler: rows per (label-encoded) class in
# the training target, before any resampling
pd.DataFrame(y_train).value_counts().sort_index()

0       785
1     20778
2     27427
3     11093
4     24924
5     60160
6     88307
7     97297
8     92860
9     90079
10    77538
11    44791
12    35494
13    36765
14    53158
dtype: int64

In [33]:
# Rebalance the training data only (the test set is left untouched) with SMOTETomek,
# seeded so the resampled set is repeatable. X_train / y_train are overwritten.
resample=SMOTETomek(random_state=2814)
X_train, y_train = resample.fit_resample(X_train, y_train)

In [34]:
# Final model: a 175-tree random forest (entropy split criterion) trained on the
# resampled training data and evaluated on the untouched test set
rf_settings = dict(random_state=2814, criterion='entropy', n_estimators=175)
model = RandomForestClassifier(**rf_settings)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
rf_report = classification_report(y_test, y_pred)
print(rf_report)

              precision    recall  f1-score   support

           0       0.56      0.37      0.45       196
           1       0.89      0.88      0.88      5195
           2       0.87      0.87      0.87      6857
           3       0.65      0.74      0.69      2773
           4       0.75      0.78      0.77      6231
           5       0.82      0.84      0.83     15040
           6       0.85      0.82      0.84     22077
           7       0.84      0.80      0.82     24325
           8       0.81      0.77      0.79     23215
           9       0.79      0.77      0.78     22520
          10       0.78      0.79      0.78     19384
          11       0.70      0.77      0.73     11198
          12       0.72      0.77      0.75      8873
          13       0.78      0.83      0.80      9191
          14       0.95      0.93      0.94     13290

    accuracy                           0.81    190365
   macro avg       0.78      0.78      0.78    190365
weighted avg       0.81   

In [35]:
from sklearn.metrics import confusion_matrix
# Per-class recall: correctly predicted rows of each true class (the diagonal)
# divided by the number of rows that truly belong to that class (the row sums)
matrix = confusion_matrix(y_test, y_pred)
correct_per_class = np.diag(matrix)
support_per_class = matrix.sum(axis=1)
correct_per_class / support_per_class

array([0.37244898, 0.87892204, 0.87137232, 0.74107465, 0.77948965,
       0.83949468, 0.82366264, 0.80480987, 0.77337928, 0.77162522,
       0.7868345 , 0.76504733, 0.77313197, 0.83070395, 0.93273138])

In [36]:
from numpy import save
# Persist the processed train/test arrays as .npy files, one file per array
interim_arrays = {
    '../data/interim/Design/X_train.npy': X_train,
    '../data/interim/Design/X_test.npy': X_test,
    '../data/interim/Design/y_train.npy': y_train,
    '../data/interim/Design/y_test.npy': y_test,
}
for npy_path, array in interim_arrays.items():
    save(npy_path, array)

In [37]:
# Save our final model
import pickle
filename = '../models/final_model.pkl'
# Use a context manager so the file handle is flushed and closed even if pickling fails;
# the previous bare `open(...)` left the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

In [27]:
# Alternative model for comparison: a multiclass XGBoost classifier trained and evaluated
# on the same arrays as the random forest.
# NOTE(review): this cell overwrites `y_pred`, and its execution count is out of sequence
# with the cells above — re-run the notebook top to bottom before trusting the comparison.
import xgboost as xgb
xgb_model = xgb.XGBClassifier(objective="multi:softprob", random_state=2814)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.23      0.62      0.34       196
           1       0.88      0.86      0.87      5195
           2       0.87      0.84      0.85      6857
           3       0.66      0.71      0.68      2773
           4       0.73      0.76      0.75      6231
           5       0.81      0.83      0.82     15040
           6       0.83      0.82      0.82     22077
           7       0.83      0.78      0.80     24325
           8       0.79      0.77      0.78     23215
           9       0.80      0.75      0.77     22520
          10       0.78      0.76      0.77     19384
          11       0.68      0.76      0.72     11198
          12       0.69      0.78      0.73      8873
          13       0.77      0.83      0.80      9191
          14       0.95      0.93      0.94     13290

    accuracy                           0.80    190365
   macro avg       0.75      0.79      0.76    190365
weighted avg       0.80   