In [120]:
import pandas as pd
import random
from sklearn.tree import ExtraTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OutputCodeClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import GaussianNB
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB  
from sklearn.neighbors import NearestCentroid
from sklearn.svm import NuSVC
from sklearn.linear_model import Perceptron
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler, QuantileTransformer, RobustScaler
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from sklearn.metrics import classification_report
random.seed(2814)

In [123]:
df = pd.read_csv('../data/processed/3_seai_miss_forest_imputation.csv')
# Read in our cleaned up SEAI data
df = df.drop('BerRating', axis = 1)
df = df.drop('CO2Rating', axis = 1)
df['NoOfSidesSheltered'] = df['NoOfSidesSheltered'].astype('category')

In [124]:
df = df[['YearofConstruction', 'GroundFloorArea(sq m)', 'TotalDeliveredEnergy', 'EnergyRating']]

In [125]:
df.EnergyRating.value_counts()

C2    124530
C3    118185
D1    114352
C1    113782
D2     98187
B3     77970
G      66815
E1     56631
A3     51193
F      46347
E2     44780
A2     43242
B2     32814
B1     15261
A1      1258
Name: EnergyRating, dtype: int64

In [126]:
X = df.iloc[:, :-1] # Independent Variables
y = df.iloc[:, -1] # Dependent Variables

In [127]:
# Define SMOTE-Tomek Links
resample=SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))
X, y = resample.fit_resample(X, y)

In [130]:
X[['YearofConstruction', 'GroundFloorArea(sq m)', 'TotalDeliveredEnergy']] = QuantileTransformer().fit_transform(X[['YearofConstruction', 'GroundFloorArea(sq m)', 'TotalDeliveredEnergy']])

In [131]:
X

Unnamed: 0,YearofConstruction,GroundFloorArea(sq m),TotalDeliveredEnergy
0,0.474975,0.834647,0.760827
1,0.804805,0.939870,0.800123
2,0.509510,0.460497,0.557582
3,0.206206,0.739348,0.702097
4,0.374875,0.685611,0.809193
...,...,...,...
1867527,0.145646,0.226685,0.852013
1867528,0.124124,0.507119,0.962652
1867529,0.193694,0.581634,0.951299
1867530,0.070571,0.454537,0.956715


In [132]:
from sklearn.model_selection import train_test_split, cross_validate

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2814)

In [133]:
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)
from sklearn.metrics import classification_report

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          A1       0.97      0.97      0.97     37197
          A2       0.92      0.93      0.93     37439
          A3       0.90      0.88      0.89     37622
          B1       0.83      0.84      0.84     37188
          B2       0.75      0.76      0.75     37580
          B3       0.67      0.67      0.67     37519
          C1       0.66      0.66      0.66     37182
          C2       0.65      0.65      0.65     37260
          C3       0.66      0.65      0.66     37263
          D1       0.66      0.65      0.66     37164
          D2       0.69      0.69      0.69     37439
          E1       0.73      0.74      0.73     37441
          E2       0.76      0.77      0.76     37432
           F       0.78      0.79      0.79     37192
           G       0.83      0.82      0.82     37342

    accuracy                           0.76    560260
   macro avg       0.76      0.76      0.76    560260
weighted avg       0.76   

In [134]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          A1       0.89      0.94      0.92     37197
          A2       0.88      0.87      0.88     37439
          A3       0.86      0.83      0.84     37622
          B1       0.77      0.79      0.78     37188
          B2       0.71      0.73      0.72     37580
          B3       0.67      0.70      0.69     37519
          C1       0.70      0.74      0.72     37182
          C2       0.70      0.72      0.71     37260
          C3       0.70      0.72      0.71     37263
          D1       0.71      0.70      0.70     37164
          D2       0.75      0.73      0.74     37439
          E1       0.75      0.75      0.75     37441
          E2       0.77      0.77      0.77     37432
           F       0.80      0.78      0.79     37192
           G       0.90      0.78      0.84     37342

    accuracy                           0.77    560260
   macro avg       0.77      0.77      0.77    560260
weighted avg       0.77   

In [135]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          A1       0.96      0.98      0.97     37197
          A2       0.95      0.92      0.93     37439
          A3       0.91      0.90      0.90     37622
          B1       0.85      0.85      0.85     37188
          B2       0.78      0.79      0.78     37580
          B3       0.71      0.75      0.73     37519
          C1       0.72      0.76      0.74     37182
          C2       0.73      0.74      0.73     37260
          C3       0.73      0.74      0.73     37263
          D1       0.74      0.73      0.73     37164
          D2       0.77      0.76      0.77     37439
          E1       0.79      0.80      0.79     37441
          E2       0.82      0.82      0.82     37432
           F       0.84      0.83      0.84     37192
           G       0.92      0.82      0.87     37342

    accuracy                           0.81    560260
   macro avg       0.81      0.81      0.81    560260
weighted avg       0.81   

In [136]:
lr = LogisticRegression(max_iter = 10000)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          A1       0.49      0.53      0.51     37197
          A2       0.48      0.46      0.47     37439
          A3       0.53      0.56      0.55     37622
          B1       0.50      0.46      0.48     37188
          B2       0.33      0.32      0.32     37580
          B3       0.27      0.26      0.27     37519
          C1       0.25      0.28      0.27     37182
          C2       0.30      0.33      0.31     37260
          C3       0.33      0.35      0.34     37263
          D1       0.24      0.28      0.26     37164
          D2       0.23      0.23      0.23     37439
          E1       0.19      0.14      0.16     37441
          E2       0.20      0.14      0.16     37432
           F       0.31      0.28      0.29     37192
           G       0.49      0.67      0.56     37342

    accuracy                           0.35    560260
   macro avg       0.34      0.35      0.35    560260
weighted avg       0.34   

In [137]:
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          A1       0.47      0.12      0.20     37197
          A2       0.39      0.87      0.54     37439
          A3       0.21      0.31      0.25     37622
          B1       0.19      0.18      0.18     37188
          B2       0.31      0.12      0.18     37580
          B3       0.29      0.23      0.26     37519
          C1       0.22      0.24      0.23     37182
          C2       0.20      0.16      0.18     37260
          C3       0.19      0.18      0.18     37263
          D1       0.14      0.12      0.13     37164
          D2       0.15      0.10      0.12     37439
          E1       0.09      0.04      0.06     37441
          E2       0.15      0.06      0.08     37432
           F       0.17      0.19      0.18     37192
           G       0.31      0.81      0.45     37342

    accuracy                           0.25    560260
   macro avg       0.23      0.25      0.21    560260
weighted avg       0.23   

In [138]:
et = ExtraTreeClassifier()
et.fit(X_train, y_train)
y_pred = et.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          A1       0.91      0.92      0.91     37197
          A2       0.88      0.88      0.88     37439
          A3       0.84      0.83      0.84     37622
          B1       0.74      0.74      0.74     37188
          B2       0.65      0.65      0.65     37580
          B3       0.60      0.60      0.60     37519
          C1       0.61      0.61      0.61     37182
          C2       0.60      0.60      0.60     37260
          C3       0.60      0.60      0.60     37263
          D1       0.60      0.60      0.60     37164
          D2       0.63      0.63      0.63     37439
          E1       0.65      0.65      0.65     37441
          E2       0.67      0.67      0.67     37432
           F       0.70      0.70      0.70     37192
           G       0.79      0.77      0.78     37342

    accuracy                           0.70    560260
   macro avg       0.70      0.70      0.70    560260
weighted avg       0.70   

In [139]:
svc = LinearSVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          A1       0.41      0.71      0.52     37197
          A2       0.36      0.55      0.44     37439
          A3       0.05      0.07      0.06     37622
          B1       0.15      0.16      0.16     37188
          B2       0.21      0.08      0.12     37580
          B3       0.18      0.17      0.18     37519
          C1       0.30      0.13      0.18     37182
          C2       0.19      0.11      0.14     37260
          C3       0.39      0.05      0.09     37263
          D1       0.07      0.01      0.01     37164
          D2       0.11      0.08      0.09     37439
          E1       0.13      0.14      0.14     37441
          E2       0.04      0.03      0.04     37432
           F       0.14      0.24      0.17     37192
           G       0.36      0.82      0.50     37342

    accuracy                           0.22    560260
   macro avg       0.21      0.22      0.19    560260
weighted avg       0.21   