In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import ExtraTreesRegressor
from catboost import CatBoostRegressor

In [2]:
df_enc = pd.read_csv("data/enc/data-enc-2024-04-01.csv")

In [3]:
df_enc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77455 entries, 0 to 77454
Data columns (total 85 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   bathrooms_total                   77455 non-null  float64
 1   bedrooms_extra                    77455 non-null  float64
 2   bedrooms                          77455 non-null  float64
 3   stories_total                     77455 non-null  float64
 4   size_interior                     77455 non-null  float64
 5   lng                               77455 non-null  float64
 6   lat                               77455 non-null  float64
 7   parkings                          77455 non-null  float64
 8   price                             77455 non-null  float64
 9   household_income                  77455 non-null  float64
 10  individual_income                 77455 non-null  float64
 11  commute_transit                   77455 non-null  float64
 12  comm

In [4]:
df_ON = df_enc[df_enc["province_Ontario"] == True]
df_QC = df_enc[df_enc["province_Quebec"] == True]
df_BC = df_enc[df_enc["province_British Columbia"] == True]
df_AB = df_enc[df_enc["province_Alberta"] == True]
df_SK = df_enc[df_enc["province_Saskatchewan"] == True]
df_MB = df_enc[df_enc["province_Manitoba"] == True]
# East
df_ES = df_enc[(df_enc["province_Nova Scotia"] == True) |
               (df_enc["province_New Brunswick"] == True) |
               (df_enc["province_Newfoundland & Labrador"] == True) |
               (df_enc["province_Prince Edward Island"] == True)]
# North
df_NO = df_enc[(df_enc["province_Yukon"] == True) |
               (df_enc["province_Northwest Territories"] == True)]

df_ON = df_ON.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_QC = df_QC.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_BC = df_BC.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_AB = df_AB.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_SK = df_SK.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_MB = df_MB.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_ES = df_ES.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta",
                    "province_Saskatchewan","province_Manitoba", "province_Yukon", "province_Northwest Territories"], axis=1)
df_NO = df_NO.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta",
                    "province_Saskatchewan", "province_Manitoba", "province_Nova Scotia", "province_New Brunswick",
                    "province_Newfoundland & Labrador", "province_Prince Edward Island"], axis=1)

# Ontario

In [5]:
df_ON.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32102 entries, 0 to 75724
Data columns (total 73 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   bathrooms_total                32102 non-null  float64
 1   bedrooms_extra                 32102 non-null  float64
 2   bedrooms                       32102 non-null  float64
 3   stories_total                  32102 non-null  float64
 4   size_interior                  32102 non-null  float64
 5   lng                            32102 non-null  float64
 6   lat                            32102 non-null  float64
 7   parkings                       32102 non-null  float64
 8   price                          32102 non-null  float64
 9   household_income               32102 non-null  float64
 10  individual_income              32102 non-null  float64
 11  commute_transit                32102 non-null  float64
 12  commute_foot                   32102 non-null  floa

In [6]:
kf = KFold(n_splits=5)

In [7]:
X = df_ON.drop("price", axis=1)
y = df_ON["price"]

scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.to_numpy().reshape(-1,1))

X_log = X.copy(deep=True)
X_log["size_interior"] = np.log1p(X_log["size_interior"])

X_scaled = scaler_X.fit_transform(X_log)
y_scaled = scaler_y.fit_transform(y.to_numpy().reshape(-1,1))

In [27]:
et = ExtraTreesRegressor()

scores_et = cross_val_score(et, X_scaled, y_scaled.ravel(), cv=kf, scoring="r2")

print(f"\n{scores_et}\n")
pd.Series(scores_et).describe()


[0.50567838 0.62831884 0.48078945 0.50413464 0.66026537]



count    5.000000
mean     0.555837
std      0.082128
min      0.480789
25%      0.504135
50%      0.505678
75%      0.628319
max      0.660265
dtype: float64

In [28]:
cb = CatBoostRegressor(silent=True)

scores_cb = cross_val_score(cb, X_scaled, y_scaled, cv=kf, scoring="r2")

print(f"\n{scores_cb}\n")
pd.Series(scores_cb).describe()


[0.56030611 0.69872319 0.50798953 0.5621815  0.67683699]



count    5.000000
mean     0.601207
std      0.082333
min      0.507990
25%      0.560306
50%      0.562182
75%      0.676837
max      0.698723
dtype: float64

In [8]:
from sklearn.model_selection import GridSearchCV

model = CatBoostRegressor(silent=True)
parameters = {'depth' : [6, 8, 10, 12, 15, 18, 20, 25, 30],
              'learning_rate' : [0.005, 0.01, 0.05, 0.1, 0.5],
              'iterations'    : [30, 50, 100, 150, 300]
              }

grid = GridSearchCV(estimator=model, param_grid = parameters, cv = 5, scoring="r2")


grid.fit(X_scaled, y_scaled)

KeyboardInterrupt: 

In [10]:
grid.best_estimator_

<catboost.core.CatBoostRegressor at 0x7f8d1cf2aed0>

In [14]:
grid.best_score_

0.5680146487479075

In [8]:
grid.best_params_

{'depth': 10, 'iterations': 100, 'learning_rate': 0.1}

In [18]:
dir(model)

['__class__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_base_calc_leaf_indexes',
 '_base_drop_unused_features',
 '_base_eval_metrics',
 '_base_predict',
 '_base_shrink',
 '_base_virtual_ensembles_predict',
 '_calc_fstr',
 '_calc_leaf_indexes',
 '_calc_ostr',
 '_check_is_compatible_loss',
 '_convert_to_asymmetric_representation',
 '_dataset_train_eval_split',
 '_deserialize_model',
 '_estimator_type',
 '_eval_metrics',
 '_fit',
 '_get_borders',
 '_get_cat_feature_indices',
 '_get_default_prediction_type',
 '_get_embedding_feature_indices',
 '_get_float_feature_indices',
 '_get_nan_treatments',
 '_get_params',
 '_

In [25]:
model.best_score_

{}