In [2]:
import pandas as pd
import random
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import LinearSVC
from imblearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler, QuantileTransformer, RobustScaler
from imblearn.combine import SMOTETomek
from sklearn.model_selection import train_test_split, cross_validate
from imblearn.under_sampling import TomekLinks
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from yellowbrick.classifier.rocauc import roc_auc
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from yellowbrick.classifier import ROCAUC
from sklearn.metrics import roc_curve, auc
random.seed(2814)

In [3]:
df_dropped_na = pd.read_csv('../data/interim/1_seai_dropped_na.csv')
df_mean_mode_impute = pd.read_csv('../data/interim/2_seai_mean_mode_imputation.csv')
df_miss_forest_impute = pd.read_csv('../data/processed/3_seai_miss_forest_imputation.csv')

In [4]:
df_dropped_na = df_dropped_na.drop('BerRating', axis = 1)
df_dropped_na = df_dropped_na.drop('CO2Rating', axis = 1)
df_dropped_na['NoOfSidesSheltered'] = df_dropped_na['NoOfSidesSheltered'].astype('category')

df_dropped_na = df_dropped_na[['YearofConstruction', 'GroundFloorArea(sq m)', 'TotalDeliveredEnergy', 'EnergyRating']]
df_mean_mode_impute = df_mean_mode_impute[['YearofConstruction', 'GroundFloorArea(sq m)', 'TotalDeliveredEnergy', 'EnergyRating']]
df_miss_forest_impute = df_miss_forest_impute[['YearofConstruction', 'GroundFloorArea(sq m)', 'TotalDeliveredEnergy', 'EnergyRating']]

In [48]:
# https://stackoverflow.com/a/52935270/5923619
# One Hot Encodes our categorical feature and binds it to the original dataset
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    return(res)


# One Hot Encode all of our categorical features
df_dropped_na = encode_and_bind(df_dropped_na, 'CountyName')
df_dropped_na = encode_and_bind(df_dropped_na, 'DwellingTypeDescr')
df_dropped_na = encode_and_bind(df_dropped_na, 'MainSpaceHeatingFuel')
df_dropped_na = encode_and_bind(df_dropped_na, 'MainWaterHeatingFuel')
df_dropped_na = encode_and_bind(df_dropped_na, 'VentilationMethod')
df_dropped_na = encode_and_bind(df_dropped_na, 'StructureType')
df_dropped_na = encode_and_bind(df_dropped_na, 'InsulationType')
df_dropped_na = encode_and_bind(df_dropped_na, 'NoOfSidesSheltered')

# Dropping the unencoded columns for now
df_dropped_na = df_dropped_na.drop(['CountyName', 'NoOfSidesSheltered','DwellingTypeDescr', 'MainSpaceHeatingFuel', 'MainWaterHeatingFuel', 'VentilationMethod', 'StructureType','InsulationType'], axis = 1)

In [44]:
df_dropped_na

Unnamed: 0,YearofConstruction,EnergyRating,GroundFloorArea(sq m),CO2Rating,InsulationThickness,TotalDeliveredEnergy,CountyName_Carlow,CountyName_Cavan,CountyName_Clare,CountyName_Cork,...,StructureType_Masonry,StructureType_Timber or Steel Frame,InsulationType_Factory Insulated,InsulationType_Loose Jacket,InsulationType_None,NoOfSidesSheltered_0.0,NoOfSidesSheltered_1.0,NoOfSidesSheltered_2.0,NoOfSidesSheltered_3.0,NoOfSidesSheltered_4.0
0,1997,C2,171.19,45.53,20.0,25474.522,0,0,0,0,...,1,0,1,0,0,0,1,0,0,0
1,2010,B3,242.93,35.66,50.0,27654.474,0,0,0,0,...,1,0,1,0,0,0,0,1,0,0
2,1999,C3,99.38,44.65,20.0,17000.038,0,0,0,0,...,1,0,0,1,0,0,0,0,1,0
3,1985,D2,127.16,55.07,100.0,28182.863,0,0,0,0,...,1,0,0,1,0,0,0,1,0,0
4,1975,D1,88.57,62.68,0.0,18470.029,0,0,0,0,...,1,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
378504,1979,D1,81.74,51.64,0.0,16038.841,0,0,0,0,...,1,0,0,0,1,0,0,1,0,0
378505,1996,C3,78.81,50.29,35.0,13208.402,0,0,0,0,...,1,0,1,0,0,0,0,1,0,0
378506,1990,D1,118.09,62.88,35.0,25420.303,0,0,0,0,...,1,0,1,0,0,1,0,0,0,0
378507,1998,D2,114.30,69.24,60.0,26317.076,0,0,0,0,...,1,0,0,1,0,0,1,0,0,0


In [52]:
scaler = QuantileTransformer()
scale_vars = ['YearofConstruction', 'GroundFloorArea(sq m)', 'InsulationThickness', 'TotalDeliveredEnergy']
df_dropped_na[scale_vars]=scaler.fit_transform(df_dropped_na[scale_vars])

In [54]:
new_cols = [col for col in df_dropped_na.columns if col != 'EnergyRating'] + ['EnergyRating']
df_dropped_na = df_dropped_na[new_cols]

del(new_cols)

In [7]:
df_dropped_na
df_dropped_na[['CountyName', 'DwellingTypeDescr', 'MainSpaceHeatingFuel', 'MainWaterHeatingFuel', 'VentilationMethod', 'StructureType', 'NoOfSidesSheltered', 'InsulationType']] = df_dropped_na[['CountyName', 'DwellingTypeDescr', 'MainSpaceHeatingFuel', 'MainWaterHeatingFuel', 'VentilationMethod', 'StructureType', 'NoOfSidesSheltered', 'InsulationType']].astype('object')

In [8]:
df_dropped_na.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378509 entries, 0 to 378508
Data columns (total 13 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   CountyName             378509 non-null  object 
 1   DwellingTypeDescr      378509 non-null  object 
 2   YearofConstruction     378509 non-null  int64  
 3   EnergyRating           378509 non-null  object 
 4   GroundFloorArea(sq m)  378509 non-null  float64
 5   MainSpaceHeatingFuel   378509 non-null  object 
 6   MainWaterHeatingFuel   378509 non-null  object 
 7   VentilationMethod      378509 non-null  object 
 8   StructureType          378509 non-null  object 
 9   NoOfSidesSheltered     378509 non-null  object 
 10  InsulationType         378509 non-null  object 
 11  InsulationThickness    378509 non-null  float64
 12  TotalDeliveredEnergy   378509 non-null  float64
dtypes: float64(3), int64(1), object(9)
memory usage: 37.5+ MB


In [9]:
new_cols = [col for col in df_dropped_na.columns if col != 'EnergyRating'] + ['EnergyRating']
df_dropped_na = df_dropped_na[new_cols]

del(new_cols)

In [10]:
import prince

In [12]:
mfa = prince.MFA(
    groups=groups,
    n_components=2,
    n_iter=3,
    copy=True,
    check_input=True,
    engine='auto',
    random_state=42
... )

famd.fit(df_dropped_na.drop('EnergyRating', axis =1))

  uniques = Index(uniques)


: 

: 

In [5]:
from light_famd import FAMD
famd = FAMD(n_components=2)
famd.fit(df_dropped_na)

: 

: 

---

### 1. Dropped dataset model build

In [11]:
X = df_dropped_na.iloc[:, :-1] # Independent Variables
y = df_dropped_na.iloc[:, -1] # Dependent Variables

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y,  random_state=2814)

# Define SMOTE-Tomek Links
resample=SMOTETomek(tomek=TomekLinks(sampling_strategy='all'), smote=SMOTE(k_neighbors=1), random_state=2814)
X_train, y_train = resample.fit_resample(X_train, y_train)

In [12]:
# Modelling
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          A1       0.00      1.00      0.00         1
          A2       0.00      0.17      0.00        35
          A3       0.00      0.06      0.00       432
          B1       0.00      0.04      0.01      1304
          B2       0.01      0.09      0.02      3692
          B3       0.01      0.02      0.01      9824
          C1       0.01      0.02      0.02     17425
          C2       0.02      0.01      0.02     23913
          C3       0.03      0.01      0.02     26794
          D1       0.04      0.02      0.03     27642
          D2       0.03      0.02      0.02     24457
          E1       0.03      0.01      0.02     14288
          E2       0.03      0.02      0.02     11851
           F       0.13      0.06      0.08     12477
           G       0.95      0.47      0.63     15120

    accuracy                           0.06    189255
   macro avg       0.09      0.13      0.06    189255
weighted avg       0.11   

rf = RandomForestClassifier(criterion='entropy', random_state=2814)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))

In [14]:
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          A1       0.00      0.00      0.00         1
          A2       0.01      0.66      0.02        35
          A3       0.01      0.26      0.01       432
          B1       0.00      0.06      0.01      1304
          B2       0.00      0.03      0.01      3692
          B3       0.00      0.01      0.01      9824
          C1       0.01      0.01      0.01     17425
          C2       0.00      0.00      0.00     23913
          C3       0.00      0.00      0.00     26794
          D1       0.01      0.00      0.00     27642
          D2       0.02      0.01      0.01     24457
          E1       0.03      0.02      0.02     14288
          E2       0.02      0.01      0.01     11851
           F       0.10      0.05      0.06     12477
           G       0.98      0.43      0.60     15120

    accuracy                           0.04    189255
   macro avg       0.08      0.10      0.05    189255
weighted avg       0.09   

---

## 2. Mean/Mode Imputed Modelling

In [15]:
X = df_mean_mode_impute.iloc[:, :-1] # Independent Variables
y = df_mean_mode_impute.iloc[:, -1] # Dependent Variables

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y,  random_state=2814)

# Define SMOTE-Tomek Links
resample=SMOTETomek(tomek=TomekLinks(sampling_strategy='all'), smote=SMOTE(k_neighbors=300), random_state=2814)
X_train, y_train = resample.fit_resample(X_train, y_train)

# Quantile Transform Variables
X_train[['YearofConstruction', 'GroundFloorArea(sq m)', 'TotalDeliveredEnergy']] = QuantileTransformer(random_state=2814).fit_transform(X_train[['YearofConstruction', 'GroundFloorArea(sq m)', 'TotalDeliveredEnergy']])
X_test[['YearofConstruction', 'GroundFloorArea(sq m)', 'TotalDeliveredEnergy']] = QuantileTransformer(random_state=2814).fit_transform(X_test[['YearofConstruction', 'GroundFloorArea(sq m)', 'TotalDeliveredEnergy']])

In [16]:
# Modelling
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          A1       0.01      0.94      0.02       152
          A2       0.00      0.00      0.00      8627
          A3       0.22      0.32      0.26     10237
          B1       0.02      0.10      0.03      3050
          B2       0.02      0.03      0.02      6561
          B3       0.07      0.05      0.06     15590
          C1       0.34      0.14      0.19     22755
          C2       0.17      0.15      0.16     24905
          C3       0.21      0.12      0.15     23636
          D1       0.36      0.20      0.26     22870
          D2       0.66      0.63      0.64     19637
          E1       0.51      0.48      0.49     11326
          E2       0.56      0.68      0.62      8956
           F       0.58      0.72      0.64      9269
           G       0.76      0.85      0.80     13221

    accuracy                           0.30    200792
   macro avg       0.30      0.36      0.29    200792
weighted avg       0.34   

In [17]:
rf = RandomForestClassifier(criterion='entropy', random_state=2814)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          A1       0.01      0.98      0.03       152
          A2       0.00      0.00      0.00      8627
          A3       0.01      0.01      0.01     10237
          B1       0.01      0.07      0.02      3050
          B2       0.02      0.04      0.02      6561
          B3       0.07      0.06      0.07     15590
          C1       0.25      0.12      0.16     22755
          C2       0.16      0.14      0.15     24905
          C3       0.24      0.12      0.16     23636
          D1       0.50      0.28      0.36     22870
          D2       0.67      0.64      0.65     19637
          E1       0.50      0.44      0.47     11326
          E2       0.55      0.53      0.54      8956
           F       0.66      0.75      0.70      9269
           G       0.79      0.86      0.82     13221

    accuracy                           0.29    200792
   macro avg       0.30      0.34      0.28    200792
weighted avg       0.34   

In [None]:
cross_val_score(rf, X_train, y_train, cv=5) 

from sklearn.model_selection import RandomizedSearchCV

param_space = {"bootstrap": [True, False],
        "max_depth": [6, 8, 10, 12, 14, None],
        "max_features": ['sqrt','log2'],
        "min_samples_leaf": [2, 3, 4, None],
        "min_samples_split": [2, 3, 4, 5, None],
        "n_estimators": [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
}

forest_rand_search = RandomizedSearchCV(rf, param_space, n_iter=32,
                                        scoring="accuracy", verbose=True, cv=5,
                                        n_jobs=-1, random_state=2814)

forest_rand_search.fit(X_train, y_train)

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

search_space = {"bootstrap": Categorical([True, False]), # values for boostrap can be either True or False
        "max_depth": Integer(6, 50), # values of max_depth are integers from 6 to 20
        "max_features": Categorical(['sqrt','log2']), 
        "min_samples_leaf": Integer(2, 10),
        "min_samples_split": Integer(2, 10),
        "n_estimators": Integer(100, 500)
    }

def on_step(optim_result):
    """
    Callback meant to view scores after
    each iteration while performing Bayesian
    Optimization in Skopt"""
    score = forest_bayes_search.best_score_
    print("best score: %s" % score)
    if score >= 0.98:
        print('Interrupting!')
        return True

forest_bayes_search = BayesSearchCV(rf, search_space, n_iter=32, # specify how many iterations
                                    scoring="accuracy", n_jobs=-1, cv=5)

forest_bayes_search.fit(X_train, y_train, callback=on_step) # callback=on_step will print score after each iteration

forest_bayes_search.best_params_

roc_auc(rf, X_train, y_train, X_test=X_test, y_test=y_test, classes=['A1','A2','A3','B1','B2','B3','C1','C2','C3','D1','D2','E1','E2','F','G'])    

d = y_test.unique()

class_name = list(d.flatten())

class_name

pr, tpr, thresholds = metrics.roc_curve(y_test,  
                     LRE.predict_proba(X_test)[:,1], pos_label = p) 

      auroc = round(metrics.auc(fpr, tpr),2)
      print('LRE',p,'--AUC--->',auroc)

In [18]:
knn = KNeighborsClassifier(n_neighbors=8)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          A1       0.01      0.98      0.02       152
          A2       0.00      0.00      0.00      8627
          A3       0.02      0.02      0.02     10237
          B1       0.01      0.06      0.02      3050
          B2       0.02      0.04      0.03      6561
          B3       0.06      0.06      0.06     15590
          C1       0.21      0.12      0.15     22755
          C2       0.16      0.14      0.15     24905
          C3       0.22      0.12      0.16     23636
          D1       0.56      0.35      0.43     22870
          D2       0.71      0.62      0.66     19637
          E1       0.56      0.52      0.54     11326
          E2       0.58      0.61      0.59      8956
           F       0.68      0.72      0.70      9269
           G       0.90      0.82      0.86     13221

    accuracy                           0.30    200792
   macro avg       0.31      0.34      0.29    200792
weighted avg       0.36   

---

### 3. missForest Imptation Modelling

In [None]:
X = df_miss_forest_impute.iloc[:, :-1] # Independent Variables
y = df_miss_forest_impute.iloc[:, -1] # Dependent Variables

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y,  random_state=2814)

# Define SMOTE-Tomek Links
resample=SMOTETomek(tomek=TomekLinks(sampling_strategy='all'), smote=SMOTE(k_neighbors=300), random_state=2814)
X_train, y_train = resample.fit_resample(X_train, y_train)

# Quantile Transform Variables
X_train[['YearofConstruction', 'GroundFloorArea(sq m)', 'TotalDeliveredEnergy']] = QuantileTransformer(random_state=2814).fit_transform(X_train[['YearofConstruction', 'GroundFloorArea(sq m)', 'TotalDeliveredEnergy']])
X_test[['YearofConstruction', 'GroundFloorArea(sq m)', 'TotalDeliveredEnergy']] = QuantileTransformer(random_state=2814).fit_transform(X_test[['YearofConstruction', 'GroundFloorArea(sq m)', 'TotalDeliveredEnergy']])

In [None]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(classification_report(y_test,y_pred))

In [None]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(classification_report(y_test,y_pred))