# Design

The purpose of this notebook is to design our solution to attain a model that can predict a customer BER rating through classification

#### 1. Pre Model Work

In [1]:
import pandas as pd
pd.set_option('display.float_format', lambda x: '%.5f' % x)
from sklearn.linear_model import LinearRegression
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import Lasso
from sklearn.tree import DecisionTreeClassifier
from sklearn.inspection import permutation_importance
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.pipeline import make_pipeline
import graphviz
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics
import random
from sklearn.preprocessing import QuantileTransformer

random.seed(2814)

In [2]:
# Read in our cleaned up SEAI data
seai_dropped_na = pd.read_csv('../data/interim/1_seai_dropped_na.csv')
seai_dropped_na = seai_dropped_na.drop('BerRating', axis = 1)
seai_dropped_na = seai_dropped_na.drop('CO2Rating', axis = 1)
seai_dropped_na['NoOfSidesSheltered'] = seai_dropped_na['NoOfSidesSheltered'].astype('category')
new_cols = [col for col in seai_dropped_na.columns if col != 'EnergyRating'] + ['EnergyRating']
seai_dropped_na = seai_dropped_na[new_cols]

del(new_cols)

In [3]:
seai_dropped_na.groupby('EnergyRating').count()

Unnamed: 0_level_0,CountyName,DwellingTypeDescr,YearofConstruction,GroundFloorArea(sq m),MainSpaceHeatingFuel,MainWaterHeatingFuel,VentilationMethod,StructureType,NoOfSidesSheltered,InsulationType,InsulationThickness,TotalDeliveredEnergy
EnergyRating,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A1,3,3,3,3,3,3,3,3,3,3,3,3
A2,71,71,71,71,71,71,71,71,71,71,71,71
A3,865,865,865,865,865,865,865,865,865,865,865,865
B1,2608,2608,2608,2608,2608,2608,2608,2608,2608,2608,2608,2608
B2,7385,7385,7385,7385,7385,7385,7385,7385,7385,7385,7385,7385
B3,19648,19648,19648,19648,19648,19648,19648,19648,19648,19648,19648,19648
C1,34849,34849,34849,34849,34849,34849,34849,34849,34849,34849,34849,34849
C2,47825,47825,47825,47825,47825,47825,47825,47825,47825,47825,47825,47825
C3,53588,53588,53588,53588,53588,53588,53588,53588,53588,53588,53588,53588
D1,55284,55284,55284,55284,55284,55284,55284,55284,55284,55284,55284,55284


We have a very imbalanced dataset for our classes which needs to be fixed or or model will be useless

---

### 2 Modelling Approach

The goal now is to be able to model BER ratings

I will break this down into steps:
- Chooses algorithms to try
    - Logistic Regression
    - kNN
    - Random Forest
    - SVMs
    - NNs
- Check the preprocessing requirements of each
    - Dummy variable encoding (like in the define phase).
    - Standardisation of variables .
    - Train/Test Split - 70/30.

##### 2.1 Scaling the Data

In [4]:
scaler = QuantileTransformer()
num_cols = ['YearofConstruction', 'GroundFloorArea(sq m)', 'TotalDeliveredEnergy', 'InsulationThickness']
seai_dropped_na_scaled = seai_dropped_na.copy()
seai_dropped_na_scaled[num_cols] = scaler.fit_transform(seai_dropped_na[num_cols])

##### 2.2 Encoding the Data

In [5]:
# https://stackoverflow.com/a/52935270/5923619
# One Hot Encodes our categorical feature and binds it to the original dataset
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    return(res)

In [6]:
# One Hot Encode all of our categorical features
seai_dropped_na_scaled = encode_and_bind(seai_dropped_na_scaled, 'CountyName')
seai_dropped_na_scaled = encode_and_bind(seai_dropped_na_scaled, 'DwellingTypeDescr')
seai_dropped_na_scaled = encode_and_bind(seai_dropped_na_scaled, 'MainSpaceHeatingFuel')
seai_dropped_na_scaled = encode_and_bind(seai_dropped_na_scaled, 'MainWaterHeatingFuel')
seai_dropped_na_scaled = encode_and_bind(seai_dropped_na_scaled, 'VentilationMethod')
seai_dropped_na_scaled = encode_and_bind(seai_dropped_na_scaled, 'StructureType')
seai_dropped_na_scaled = encode_and_bind(seai_dropped_na_scaled, 'InsulationType')
seai_dropped_na_scaled = encode_and_bind(seai_dropped_na_scaled, 'MainSpaceHeatingFuel')
seai_dropped_na_scaled = encode_and_bind(seai_dropped_na_scaled, 'MainWaterHeatingFuel')
seai_dropped_na_scaled = encode_and_bind(seai_dropped_na_scaled, 'NoOfSidesSheltered')

seai_dropped_na_scaled
# Dropping the unencoded columns for now
seai_dropped_na_scaled = seai_dropped_na_scaled.drop(['CountyName', 'DwellingTypeDescr', 'VentilationMethod', 'StructureType', 'InsulationType', 'MainSpaceHeatingFuel',	'MainWaterHeatingFuel',	'NoOfSidesSheltered'], axis = 1)

In [7]:
new_cols = [col for col in seai_dropped_na_scaled.columns if col != 'EnergyRating'] + ['EnergyRating']
seai_dropped_na_scaled = seai_dropped_na_scaled[new_cols]

del(new_cols)

In [8]:
X = seai_dropped_na_scaled.iloc[:, :-1] # Independent Variables
y = seai_dropped_na_scaled.iloc[:, -1] # Dependent Variables

In [9]:
y.value_counts()

D1    55284
C3    53588
D2    48914
C2    47825
C1    34849
G     30239
E1    28575
F     24954
E2    23701
B3    19648
B2     7385
B1     2608
A3      865
A2       71
A1        3
Name: EnergyRating, dtype: int64

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 3)

In [11]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE(k_neighbors=1)
X_train, y_train = oversample.fit_resample(X_train, y_train)

KeyboardInterrupt: 

In [None]:
y_train.value_counts()

C2    38758
D2    38758
D1    38758
G     38758
B3    38758
E1    38758
C1    38758
C3    38758
F     38758
E2    38758
B2    38758
B1    38758
A3    38758
A2    38758
A1    38758
Name: EnergyRating, dtype: int64

In [None]:
y_test.value_counts()

D1    16526
C3    16099
D2    14679
C2    14249
C1    10427
G      8955
E1     8807
F      7441
E2     7154
B3     5989
B2     2191
B1      752
A3      262
A2       22
Name: EnergyRating, dtype: int64

In [None]:
clf = RandomForestClassifier()
x = clf.fit(X_train, y_train)

In [None]:
# Use the forest's predict method on the test data
y_pred = clf.predict(X_test)

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred)) #classification report from sklearn

              precision    recall  f1-score   support

          A2       0.91      0.45      0.61        22
          A3       0.72      0.76      0.74       262
          B1       0.64      0.67      0.65       752
          B2       0.61      0.59      0.60      2191
          B3       0.59      0.60      0.59      5989
          C1       0.57      0.56      0.57     10427
          C2       0.56      0.56      0.56     14249
          C3       0.54      0.54      0.54     16099
          D1       0.56      0.55      0.56     16526
          D2       0.56      0.57      0.56     14679
          E1       0.46      0.45      0.46      8807
          E2       0.45      0.45      0.45      7154
           F       0.55      0.56      0.55      7441
           G       0.79      0.84      0.81      8955

    accuracy                           0.57    113553
   macro avg       0.61      0.58      0.59    113553
weighted avg       0.57      0.57      0.57    113553



In [None]:
x = pd.DataFrame(clf.feature_importances_, index=X_train.columns).sort_values(by=0, ascending=False)

In [None]:
x.iloc[0:10, :]

Unnamed: 0,0
TotalDeliveredEnergy,0.19592
YearofConstruction,0.17473
GroundFloorArea(sq m),0.15796
InsulationThickness,0.09556
NoOfSidesSheltered_2.0,0.01861
VentilationMethod_Natural vent.,0.01682
VentilationMethod_Bal.whole mech.vent heat recvr,0.01456
StructureType_Masonry,0.01414
NoOfSidesSheltered_3.0,0.01343
DwellingTypeDescr_Detached house,0.01242


In [None]:
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred))

0.5670832122445025


In [None]:
y_test.value_counts()

D1    2162
C3    2081
D2    1966
C2    1890
C1    1342
G     1239
E1    1167
E2     990
F      930
B3     786
B2     306
B1      92
A3      46
A2       3
Name: EnergyRating, dtype: int64

In [None]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
criterion = ['gini', 'entropy', 'log_loss']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'criterion': criterion,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

In [None]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 10, cv = 3, random_state=2814, verbose=2, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

In [None]:
rf_random.best_params_

In [None]:
rf = RandomForestClassifier(n_estimators= 1200,
min_samples_split=2,
min_samples_leaf=1,
max_depth=70,
criterion='entropy',
bootstrap=True)

rf.fit(X_train,y_train)

In [None]:
y_pred = rf.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
"""
Use your classification model to predict some labels
Then, plot confusion matrix and classification report using below code
y_test: real labels
y_pred: predicted model labels
"""
labels = ['A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'C1', 'C2', 'C3', 'D1', 'D2', 'E1', 'E2', 'F', 'G'] 
print(classification_report(y_test, y_pred)) #classification report from sklearn
cnf_matrix = confusion_matrix(y_test, y_pred, labels=labels)
plt.imshow(cnf_matrix, cmap=plt.cm.Blues) #plot confusion matrix grid
threshold = cnf_matrix.max() / 2 #threshold to define text color
for i in range(cnf_matrix.shape[0]): #print text in grid
    for j in range(cnf_matrix.shape[1]): 
        plt.text(j, i, cnf_matrix[i,j], color="w" if cnf_matrix[i,j] > threshold else 'black')
tick_marks = np.arange(len(labels)) #define labeling spacing based on number of classes
plt.xticks(tick_marks, labels, rotation=45)
plt.yticks(tick_marks, labels)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.colorbar()
plt.tight_layout()

In [None]:
importance = clf.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
    print('Feature: {}, Score: {}'.format(i,v))
# plot feature importance
plt.bar([x for x in range(len(importance))], importance)
plt.show()

In [None]:
from sklearn import metrics
print(metrics.accuracy_score(y_test, y_pred))

In [None]:
knn = KNeighborsClassifier(n_neighbors=10)
knn.fit(X_train, y_train)

In [None]:
# Use the forest's predict method on the test data
y_pred = knn.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
"""
Use your classification model to predict some labels
Then, plot confusion matrix and classification report using below code
y_test: real labels
y_pred: predicted model labels
"""
labels = ['A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'C1', 'C2', 'C3', 'D1', 'D2', 'E1', 'E2', 'F', 'G'] 
print(classification_report(y_test, y_pred)) #classification report from sklearn

---

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_classification, load_breast_cancer
import pandas as pd

In [None]:
# Read in our cleaned up SEAI data
seai_dropped_na = pd.read_csv('../data/interim/1_seai_dropped_na.csv')
seai_dropped_na = seai_dropped_na.drop('BerRating', axis = 1)
seai_dropped_na = seai_dropped_na.drop('CO2Rating', axis = 1)
seai_dropped_na['NoOfSidesSheltered'] = seai_dropped_na['NoOfSidesSheltered'].astype('category')
new_cols = [col for col in seai_dropped_na.columns if col != 'EnergyRating'] + ['EnergyRating']
seai_dropped_na = seai_dropped_na[new_cols]

del(new_cols)

In [None]:
from rpy2 import robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()
from rpy2.robjects.packages import importr # import R's "base" package
base = importr('base')

In [None]:
seai_dropped_na_r = robjects.conversion.py2rpy(seai_dropped_na)
robjects.globalenv["seai_dropped_na_r"] = seai_dropped_na_r

In [None]:
base.summary(seai_dropped_na_r)

In [None]:
robjects.r('''
sample_size = 20000
set.seed(1)
idxs = sample(1:nrow(seai_dropped_na_r),sample_size,replace=F)
subsample = seai_dropped_na_r[idxs,]
pvalues = list()
for (col in names(seai_dropped_na_r)) {
  if (class(seai_dropped_na_r[,col]) %in% c("numeric","integer")) {
    # Numeric variable. Using Kolmogorov-Smirnov test
    
    pvalues[[col]] = ks.test(subsample[[col]],seai_dropped_na_r[[col]])$p.value
    
  } else {
    
    
  }
}

pvalues''')

In [None]:
# https://stackoverflow.com/a/52935270/5923619
# One Hot Encodes our categorical feature and binds it to the original dataset
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    return(res)

In [None]:
# One Hot Encode all of our categorical features
seai_dropped_na_scaled = encode_and_bind(seai_dropped_na, 'CountyName')
seai_dropped_na_scaled = encode_and_bind(seai_dropped_na_scaled, 'DwellingTypeDescr')
seai_dropped_na_scaled = encode_and_bind(seai_dropped_na_scaled, 'MainSpaceHeatingFuel')
seai_dropped_na_scaled = encode_and_bind(seai_dropped_na_scaled, 'MainWaterHeatingFuel')
seai_dropped_na_scaled = encode_and_bind(seai_dropped_na_scaled, 'VentilationMethod')
seai_dropped_na_scaled = encode_and_bind(seai_dropped_na_scaled, 'StructureType')
seai_dropped_na_scaled = encode_and_bind(seai_dropped_na_scaled, 'InsulationType')
seai_dropped_na_scaled = encode_and_bind(seai_dropped_na_scaled, 'MainSpaceHeatingFuel')
seai_dropped_na_scaled = encode_and_bind(seai_dropped_na_scaled, 'MainWaterHeatingFuel')
seai_dropped_na_scaled = encode_and_bind(seai_dropped_na_scaled, 'NoOfSidesSheltered')

seai_dropped_na_scaled
# Dropping the unencoded columns for now
seai_dropped_na_scaled = seai_dropped_na_scaled.drop(['CountyName', 'DwellingTypeDescr', 'VentilationMethod', 'StructureType', 'InsulationType', 'MainSpaceHeatingFuel',	'MainWaterHeatingFuel',	'NoOfSidesSheltered'], axis = 1)

In [None]:
X = seai_dropped_na_scaled.iloc[:, :-1] # Independent Variables
y = seai_dropped_na_scaled.iloc[:, -1] # Dependent Variables

In [None]:
# 1. Split into train test split
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=3)

# 3. Perform the scaling on the data
# 4. Fit a model
# 5. Do stratified k fold sampling
# 6. GridSearchCV

In [None]:
scaler = QuantileTransformer()
num_cols = ['YearofConstruction', 'GroundFloorArea(sq m)', 'TotalDeliveredEnergy', 'InsulationThickness']
seai_dropped_na_scaled = seai_dropped_na.copy()
seai_dropped_na_scaled[num_cols] = scaler.fit_transform(seai_dropped_na[num_cols])

In [None]:
new_cols = [col for col in seai_dropped_na_scaled.columns if col != 'EnergyRating'] + ['EnergyRating']
seai_dropped_na_scaled = seai_dropped_na_scaled[new_cols]

del(new_cols)

In [None]:
from imblearn.over_sampling import SMOTE
oversample = SMOTE(k_neighbors=1)
X, y = oversample.fit_resample(X, y)

In [None]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [None]:
y_pred = lr.predict(X_test)

In [None]:
lr.score(y_test, y_pred)

In [None]:
stratified_kfold = StratifiedKFold(n_splits=2,
                                       shuffle=True,
                                       random_state=1)
    
param_grid = {'classifier__C':[0.001, 0.01, 0.1, 1, 10, 100, 1000]}
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           scoring='roc_auc',
                           cv=stratified_kfold,
                           n_jobs=-1)

grid_search.fit(X_train, y_train)
cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)

In [None]:
y_test.value_counts()