In [139]:
import random

import numpy as np
import pandas as pd
# Seed both generators: pandas, scikit-learn and imbalanced-learn draw from
# NumPy's global RNG, which random.seed() alone does not touch.
random.seed(2814)
np.random.seed(2814)

In [165]:
# Read in our cleaned up SEAI data (the file name indicates it is the output of
# a MissForest imputation step).
df = pd.read_csv('../data/processed/3_seai_miss_forest_imputation.csv')

# Remove the two rating columns in one call.
# NOTE(review): presumably dropped because they would leak the EnergyRating
# target - confirm.
df = df.drop(['BerRating', 'CO2Rating'], axis=1)

# Stored as category so that pd.get_dummies one-hot encodes it later on.
df['NoOfSidesSheltered'] = df['NoOfSidesSheltered'].astype('category')

# Work on a 30,000-row random subsample. random_state is pinned because
# DataFrame.sample draws from NumPy's RNG, not from Python's `random` module,
# so random.seed() does not make this step repeatable.
df = df.sample(30000, random_state=2814)

In [166]:
#df = df[['YearofConstruction', 'GroundFloorArea(sq m)', 'TotalDeliveredEnergy', 'EnergyRating']]

In [167]:
# Adapted from https://stackoverflow.com/a/52935270/5923619
def encode_and_bind(original_dataframe, feature_to_encode):
    """One-hot encode a single column and append the result to the frame.

    Parameters
    ----------
    original_dataframe : pandas.DataFrame
        Frame containing the column to encode. It is not modified.
    feature_to_encode : str
        Label of the column to one-hot encode.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every original column (the unencoded feature
        included) followed by the dummy columns produced by
        ``pd.get_dummies``, which are named ``<feature>_<value>``.
    """
    # Double brackets keep a one-column DataFrame so get_dummies prefixes the
    # generated columns with the feature name.
    feature_frame = original_dataframe[[feature_to_encode]]
    indicator_columns = pd.get_dummies(feature_frame)
    return pd.concat([original_dataframe, indicator_columns], axis=1)


# One Hot Encode all of our categorical features. A single list drives both
# the encoding and the clean-up below, so the two steps cannot drift apart.
categorical_features = [
    'CountyName',
    'DwellingTypeDescr',
    'MainSpaceHeatingFuel',
    'MainWaterHeatingFuel',
    'VentilationMethod',
    'StructureType',
    'InsulationType',
    'NoOfSidesSheltered',
]
for feature in categorical_features:
    df = encode_and_bind(df, feature)

# Dropping the unencoded columns now that their dummy columns are in place
df = df.drop(categorical_features, axis=1)

In [168]:
# Reorder the columns so that the 'EnergyRating' target sits in the last
# position; every other column keeps its relative order.
df = df[df.columns.drop('EnergyRating').tolist() + ['EnergyRating']]

In [169]:
# Class distribution of the target in the sampled frame.
df['EnergyRating'].value_counts()

C2    3766
C3    3645
D1    3409
C1    3332
D2    2915
B3    2360
G     1967
E1    1664
A3    1503
E2    1344
F     1307
A2    1283
B2    1016
B1     442
A1      47
Name: EnergyRating, dtype: int64

In [170]:
# Positional split: every column but the last is an independent variable (X),
# the last column is the dependent variable (y).
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [171]:
from sklearn.model_selection import train_test_split, cross_validate

# Hold out 30% of the rows for testing; the fixed random_state keeps the split
# repeatable.
# NOTE(review): the split is not stratified although the classes are heavily
# imbalanced - consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2814)

In [172]:
# instead of using scikit learn's pipeline, we import from imblearn
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import QuantileTransformer
# SMOTE is instantiated as a step of the (imblearn) pipeline so that the
# oversampling happens inside each cross-validation fold.
# random_state is pinned on the stochastic steps: they draw from NumPy's RNG,
# which random.seed() does not control, so without it every run differs.
pipe = Pipeline(steps=[
    ('smote', SMOTE(k_neighbors=17, random_state=2814)),
    # The step keeps its original name even though it wraps RobustScaler.
    ('quantilescaler', RobustScaler()),
    ('rfc', RandomForestClassifier(criterion='entropy', random_state=2814)),
])
pipe.fit(X_train, y_train)
# cross validation using intra-fold sampling
cross_validate(pipe, X_train, y_train)

{'fit_time': array([4.750597  , 4.83933616, 4.32902718, 4.71308494, 4.41085505]),
 'score_time': array([0.16176796, 0.19692087, 0.10724807, 0.21867704, 0.15784287]),
 'test_score': array([0.45928571, 0.44428571, 0.46166667, 0.455     , 0.45761905])}

from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the pipeline. Keys follow the `<step>__<param>`
# convention; the grid spans 1,728 combinations (x 5 folds each).
parameters = {
    # oversampling step
    'smote__k_neighbors': [2, 3, 10, 12],
    # random-forest step
    'rfc__n_estimators': [4, 6, 9],
    'rfc__max_features': ['log2', 'sqrt'],
    'rfc__criterion': ['entropy', 'gini'],
    'rfc__max_depth': [2, 3, 5, 10],
    'rfc__min_samples_split': [2, 3, 5],
    'rfc__min_samples_leaf': [1, 5, 8],
}

# Exhaustive search over the grid with 5-fold cross-validation.
search = GridSearchCV(estimator=pipe, param_grid=parameters, cv=5)
search.fit(X_train, y_train)

search.best_params_

# instead of using scikit learn's pipeline, we import from imblearn
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import QuantileTransformer
# Random-forest pipeline with hand-set hyper-parameters; SMOTE is a pipeline
# step so the oversampling happens inside each cross-validation fold.
# random_state is pinned on every stochastic step (SMOTE, the quantile
# transformer's subsampling, the forest) so reruns give the same scores.
pipe = Pipeline(steps=[
    # NOTE(review): k_neighbors=6 is not among the values searched in the grid
    # ([2, 3, 10, 12]) - confirm this is the intended setting.
    ('smote', SMOTE(k_neighbors=6, random_state=2814)),
    ('quantilescaler', QuantileTransformer(random_state=2814)),
    ('rfc', RandomForestClassifier(
        criterion='entropy',
        max_depth=10,
        max_features='sqrt',
        min_samples_leaf=1,
        min_samples_split=5,
        n_estimators=9,
        random_state=2814,
    )),
])
pipe.fit(X_train, y_train)
# cross validation using intra-fold sampling
cross_validate(pipe, X_train, y_train)

In [173]:
# Predict the target for the held-out test split with the most recently
# fitted `pipe`.
y_pred = pipe.predict(X_test)

In [174]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the held-out test split. The report is
# a plain string, so it has to be printed to render as a table.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

          A1       0.00      0.00      0.00        11
          A2       0.77      0.79      0.78       397
          A3       0.75      0.71      0.73       455
          B1       0.37      0.37      0.37       131
          B2       0.38      0.35      0.37       280
          B3       0.46      0.44      0.45       702
          C1       0.49      0.46      0.47      1056
          C2       0.43      0.47      0.45      1109
          C3       0.41      0.43      0.42      1080
          D1       0.41      0.44      0.43      1031
          D2       0.41      0.46      0.44       839
          E1       0.31      0.28      0.29       509
          E2       0.33      0.25      0.29       408
           F       0.41      0.37      0.39       406
           G       0.74      0.74      0.74       586

    accuracy                           0.47      9000
   macro avg       0.44      0.44      0.44      9000
weighted avg       0.47   

In [123]:
from imblearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import QuantileTransformer
# Same pipeline layout with a k-nearest-neighbours classifier as the final
# estimator; SMOTE is a pipeline step so the oversampling happens inside each
# cross-validation fold.
# random_state is pinned on the stochastic steps so reruns give the same
# scores (random.seed() does not control NumPy's RNG).
pipe = Pipeline(steps=[
    ('smote', SMOTE(random_state=2814)),
    ('quantilescaler', QuantileTransformer(random_state=2814)),
    # The step keeps its original name 'rfc' even though it now holds a
    # KNeighborsClassifier.
    ('rfc', KNeighborsClassifier()),
])
pipe.fit(X_train, y_train)
# cross validation using intra-fold sampling
cross_validate(pipe, X_train, y_train)

{'fit_time': array([1.03332806, 1.02375102, 1.03040409, 1.02716804, 1.02296424]),
 'score_time': array([0.47879791, 0.47944117, 0.47167206, 0.47490573, 0.47498083]),
 'test_score': array([0.68869048, 0.68666667, 0.68838095, 0.68792857, 0.68852381])}

In [None]:
# Toy example of class balancing: build a synthetic two-class frame, then draw
# the same number of rows (5,000) from each class.
# NOTE(review): this rebinds `df`, replacing the SEAI frame used by the cells
# above - rerunning earlier cells after this one requires reloading the data.
rng = np.random.default_rng(2814)  # seeded so the example is repeatable
df = pd.DataFrame({
    'col': rng.standard_normal(12000),
    'target': rng.integers(low=0, high=2, size=12000),
})

# GroupBy.sample draws n rows per group; the index is reset so the result has
# a clean 0..n-1 index.
new_df = (
    df.groupby('target')
      .sample(n=5000, random_state=2814)
      .reset_index(drop=True)
)

new_df.target.value_counts()