In [83]:
import pandas as pd
import random
from sklearn.tree import ExtraTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import ClassifierChain
from sklearn.multioutput import MultiOutputClassifier
from sklearn.multiclass import OutputCodeClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.calibration import CalibratedClassifierCV
from sklearn.naive_bayes import GaussianNB
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.naive_bayes import MultinomialNB  
from sklearn.neighbors import NearestCentroid
from sklearn.svm import NuSVC
from sklearn.linear_model import Perceptron
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler, StandardScaler, QuantileTransformer, RobustScaler
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from sklearn.metrics import classification_report
random.seed(2814)

In [101]:
df = pd.read_csv('../data/processed/3_seai_miss_forest_imputation.csv')
# Read in our cleaned up SEAI data
df = df.drop('BerRating', axis = 1)
df = df.drop('CO2Rating', axis = 1)
df['NoOfSidesSheltered'] = df['NoOfSidesSheltered'].astype('category')

df = df.sample(50000, random_state=2814)

In [102]:
#df = df[['YearofConstruction', 'GroundFloorArea(sq m)', 'TotalDeliveredEnergy', 'EnergyRating']]

In [103]:
# https://stackoverflow.com/a/52935270/5923619
# One Hot Encodes our categorical feature and binds it to the original dataset
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    return(res)


# One Hot Encode all of our categorical features
df = encode_and_bind(df, 'CountyName')
df = encode_and_bind(df, 'DwellingTypeDescr')
df = encode_and_bind(df, 'MainSpaceHeatingFuel')
df = encode_and_bind(df, 'MainWaterHeatingFuel')
df = encode_and_bind(df, 'VentilationMethod')
df = encode_and_bind(df, 'StructureType')
df = encode_and_bind(df, 'InsulationType')
df = encode_and_bind(df, 'NoOfSidesSheltered')

# Dropping the unencoded columns for now
df = df.drop(['CountyName', 'NoOfSidesSheltered','DwellingTypeDescr', 'MainSpaceHeatingFuel', 'MainWaterHeatingFuel', 'VentilationMethod', 'StructureType','InsulationType'], axis = 1)

In [104]:
new_cols = [col for col in df.columns if col != 'EnergyRating'] + ['EnergyRating']
df = df[new_cols]

del(new_cols)

In [105]:
df.EnergyRating.value_counts()

C2    6209
C3    5892
C1    5663
D1    5623
D2    4989
B3    3925
G     3319
E1    2786
A3    2416
F     2262
E2    2236
A2    2186
B2    1635
B1     792
A1      67
Name: EnergyRating, dtype: int64

In [106]:
X = df.iloc[:, :-1] # Independent Variables
y = df.iloc[:, -1] # Dependent Variables

In [107]:
# Define SMOTE-Tomek Links
resample=SMOTETomek(tomek=TomekLinks(sampling_strategy='majority'))
X, y = resample.fit_resample(X, y)

In [108]:
X[['YearofConstruction', 'GroundFloorArea(sq m)', 'InsulationThickness', 'TotalDeliveredEnergy']] = QuantileTransformer().fit_transform(X[['YearofConstruction', 'GroundFloorArea(sq m)', 'InsulationThickness', 'TotalDeliveredEnergy']])

In [109]:
X

Unnamed: 0,YearofConstruction,GroundFloorArea(sq m),InsulationThickness,TotalDeliveredEnergy,CountyName_Carlow,CountyName_Cavan,CountyName_Clare,CountyName_Cork,CountyName_Donegal,CountyName_Dublin,...,StructureType_Masonry,StructureType_Timber or Steel Frame,InsulationType_Factory Insulated,InsulationType_Loose Jacket,InsulationType_None,NoOfSidesSheltered_0,NoOfSidesSheltered_1,NoOfSidesSheltered_2,NoOfSidesSheltered_3,NoOfSidesSheltered_4
0,0.281782,0.391881,0.000000,0.655035,0,0,0,0,0,1,...,1,0,0,0,1,0,0,1,0,0
1,0.485485,0.060221,0.000000,0.438774,0,0,0,0,0,1,...,1,0,0,0,1,0,0,0,1,0
2,0.437437,0.988806,0.966466,0.970199,0,0,0,0,0,1,...,1,0,0,1,0,0,0,1,0,0
3,0.541041,0.714447,0.258258,0.710127,0,0,0,0,1,0,...,1,0,1,0,0,0,1,0,0,0
4,0.681682,0.676066,0.589089,0.583378,0,0,0,1,0,0,...,0,1,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93106,0.048048,0.693412,0.190433,0.994818,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
93107,0.612112,0.011895,0.500709,0.278536,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,1
93108,0.121622,0.729925,0.149325,0.980113,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
93109,0.130631,0.247827,0.097343,0.889131,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [110]:
from sklearn.model_selection import train_test_split, cross_validate

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=2814)

In [118]:
dt = DecisionTreeClassifier(criterion='entropy')
dt.fit(X_train, y_train)

y_pred = dt.predict(X_test)
from sklearn.metrics import classification_report

print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          A1       0.97      0.99      0.98      1923
          A2       0.88      0.86      0.87      1911
          A3       0.83      0.82      0.82      1886
          B1       0.84      0.84      0.84      1867
          B2       0.75      0.78      0.76      1890
          B3       0.72      0.72      0.72      1856
          C1       0.70      0.70      0.70      1838
          C2       0.70      0.70      0.70      1826
          C3       0.71      0.70      0.71      1862
          D1       0.72      0.73      0.72      1833
          D2       0.76      0.74      0.75      1852
          E1       0.78      0.78      0.78      1849
          E2       0.81      0.81      0.81      1862
           F       0.80      0.84      0.82      1847
           G       0.92      0.90      0.91      1832

    accuracy                           0.79     27934
   macro avg       0.79      0.79      0.79     27934
weighted avg       0.79   

In [112]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          A1       0.89      0.99      0.94      1923
          A2       0.76      0.74      0.75      1911
          A3       0.68      0.64      0.66      1886
          B1       0.60      0.72      0.66      1867
          B2       0.49      0.55      0.52      1890
          B3       0.31      0.32      0.31      1856
          C1       0.24      0.28      0.26      1838
          C2       0.22      0.26      0.24      1826
          C3       0.21      0.20      0.20      1862
          D1       0.23      0.22      0.22      1833
          D2       0.25      0.21      0.23      1852
          E1       0.31      0.31      0.31      1849
          E2       0.35      0.35      0.35      1862
           F       0.41      0.33      0.37      1847
           G       0.70      0.50      0.58      1832

    accuracy                           0.44     27934
   macro avg       0.44      0.44      0.44     27934
weighted avg       0.45   

In [113]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          A1       0.98      0.99      0.99      1923
          A2       0.89      0.89      0.89      1911
          A3       0.87      0.84      0.85      1886
          B1       0.84      0.89      0.86      1867
          B2       0.74      0.75      0.75      1890
          B3       0.56      0.54      0.55      1856
          C1       0.45      0.46      0.45      1838
          C2       0.41      0.42      0.42      1826
          C3       0.41      0.40      0.41      1862
          D1       0.43      0.42      0.43      1833
          D2       0.45      0.45      0.45      1852
          E1       0.55      0.53      0.54      1849
          E2       0.61      0.60      0.60      1862
           F       0.64      0.66      0.65      1847
           G       0.81      0.82      0.82      1832

    accuracy                           0.65     27934
   macro avg       0.64      0.64      0.64     27934
weighted avg       0.65   

In [114]:
lr = LogisticRegression(max_iter = 500)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(classification_report(y_test,y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

          A1       0.73      0.88      0.80      1923
          A2       0.69      0.57      0.62      1911
          A3       0.67      0.64      0.65      1886
          B1       0.58      0.59      0.59      1867
          B2       0.48      0.52      0.50      1890
          B3       0.44      0.30      0.36      1856
          C1       0.42      0.47      0.44      1838
          C2       0.42      0.50      0.46      1826
          C3       0.45      0.40      0.42      1862
          D1       0.47      0.48      0.47      1833
          D2       0.44      0.52      0.47      1852
          E1       0.36      0.31      0.33      1849
          E2       0.40      0.33      0.36      1862
           F       0.45      0.47      0.46      1847
           G       0.72      0.77      0.74      1832

    accuracy                           0.52     27934
   macro avg       0.51      0.52      0.51     27934
weighted avg       0.51   

In [115]:
nb = GaussianNB()
nb.fit(X_train, y_train)
y_pred = nb.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          A1       0.23      0.99      0.37      1923
          A2       0.14      0.35      0.20      1911
          A3       0.05      0.08      0.06      1886
          B1       0.08      0.16      0.10      1867
          B2       0.05      0.01      0.02      1890
          B3       0.12      0.07      0.09      1856
          C1       0.09      0.18      0.12      1838
          C2       0.07      0.00      0.00      1826
          C3       0.06      0.00      0.00      1862
          D1       0.00      0.00      0.00      1833
          D2       0.07      0.00      0.00      1852
          E1       0.00      0.00      0.00      1849
          E2       0.22      0.17      0.20      1862
           F       0.21      0.01      0.02      1847
           G       0.49      0.07      0.12      1832

    accuracy                           0.14     27934
   macro avg       0.12      0.14      0.09     27934
weighted avg       0.12   

In [116]:
et = ExtraTreeClassifier()
et.fit(X_train, y_train)
y_pred = et.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          A1       0.95      0.98      0.96      1923
          A2       0.82      0.79      0.80      1911
          A3       0.72      0.71      0.72      1886
          B1       0.70      0.71      0.70      1867
          B2       0.54      0.54      0.54      1890
          B3       0.34      0.34      0.34      1856
          C1       0.27      0.27      0.27      1838
          C2       0.27      0.27      0.27      1826
          C3       0.25      0.24      0.25      1862
          D1       0.25      0.25      0.25      1833
          D2       0.26      0.26      0.26      1852
          E1       0.33      0.33      0.33      1849
          E2       0.37      0.39      0.38      1862
           F       0.41      0.42      0.42      1847
           G       0.63      0.59      0.61      1832

    accuracy                           0.48     27934
   macro avg       0.47      0.47      0.47     27934
weighted avg       0.48   

In [117]:
svc = LinearSVC()
svc.fit(X_train, y_train)
y_pred = svc.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

          A1       0.71      0.90      0.80      1923
          A2       0.63      0.53      0.57      1911
          A3       0.51      0.56      0.53      1886
          B1       0.41      0.51      0.45      1867
          B2       0.40      0.43      0.41      1890
          B3       0.27      0.08      0.12      1856
          C1       0.25      0.36      0.29      1838
          C2       0.24      0.41      0.30      1826
          C3       0.21      0.14      0.17      1862
          D1       0.26      0.21      0.23      1833
          D2       0.31      0.22      0.26      1852
          E1       0.28      0.10      0.14      1849
          E2       0.30      0.22      0.25      1862
           F       0.30      0.36      0.32      1847
           G       0.57      0.87      0.69      1832

    accuracy                           0.39     27934
   macro avg       0.38      0.39      0.37     27934
weighted avg       0.38   