In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler

In [2]:
df_enc = pd.read_csv("data/enc/data-enc-2024-04-01.csv")

In [3]:
df_enc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77455 entries, 0 to 77454
Data columns (total 85 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   bathrooms_total                   77455 non-null  float64
 1   bedrooms_extra                    77455 non-null  float64
 2   bedrooms                          77455 non-null  float64
 3   stories_total                     77455 non-null  float64
 4   size_interior                     77455 non-null  float64
 5   lng                               77455 non-null  float64
 6   lat                               77455 non-null  float64
 7   parkings                          77455 non-null  float64
 8   price                             77455 non-null  float64
 9   household_income                  77455 non-null  float64
 10  individual_income                 77455 non-null  float64
 11  commute_transit                   77455 non-null  float64
 12  comm

In [4]:
df_ON = df_enc[df_enc["province_Ontario"] == True]
df_QC = df_enc[df_enc["province_Quebec"] == True]
df_BC = df_enc[df_enc["province_British Columbia"] == True]
df_AB = df_enc[df_enc["province_Alberta"] == True]
df_SK = df_enc[df_enc["province_Saskatchewan"] == True]
df_MB = df_enc[df_enc["province_Manitoba"] == True]
# East
df_ES = df_enc[(df_enc["province_Nova Scotia"] == True) |
               (df_enc["province_New Brunswick"] == True) |
               (df_enc["province_Newfoundland & Labrador"] == True) |
               (df_enc["province_Prince Edward Island"] == True)]
# North
df_NO = df_enc[(df_enc["province_Yukon"] == True) |
               (df_enc["province_Northwest Territories"] == True)]

df_ON = df_ON.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_QC = df_QC.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_BC = df_BC.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_AB = df_AB.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_SK = df_SK.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_MB = df_MB.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_ES = df_ES.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta",
                    "province_Saskatchewan","province_Manitoba", "province_Yukon", "province_Northwest Territories"], axis=1)
df_NO = df_NO.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta",
                    "province_Saskatchewan", "province_Manitoba", "province_Nova Scotia", "province_New Brunswick",
                    "province_Newfoundland & Labrador", "province_Prince Edward Island"], axis=1)

# Ontario

In [5]:
df_ON.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32102 entries, 0 to 75724
Data columns (total 73 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   bathrooms_total                32102 non-null  float64
 1   bedrooms_extra                 32102 non-null  float64
 2   bedrooms                       32102 non-null  float64
 3   stories_total                  32102 non-null  float64
 4   size_interior                  32102 non-null  float64
 5   lng                            32102 non-null  float64
 6   lat                            32102 non-null  float64
 7   parkings                       32102 non-null  float64
 8   price                          32102 non-null  float64
 9   household_income               32102 non-null  float64
 10  individual_income              32102 non-null  float64
 11  commute_transit                32102 non-null  float64
 12  commute_foot                   32102 non-null  floa

## Raw Input

In [6]:
kf = KFold(n_splits=10)

In [7]:
cb1 = CatBoostRegressor(silent=True)

X = df_ON.drop("price", axis=1)
y = df_ON["price"]

scores1 = cross_val_score(cb1, X, y, cv=kf, scoring="r2")

print(f"\n{scores1}\n")
pd.Series(scores1).describe()


[0.43895801 0.69410044 0.72141024 0.77870319 0.55823807 0.4440354
 0.49528477 0.66298953 0.63480885 0.7211476 ]



count    10.000000
mean      0.614968
std       0.122979
min       0.438958
25%       0.511023
50%       0.648899
75%       0.714386
max       0.778703
dtype: float64

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

cb1.fit(X_train, y_train)
y_pred = cb1.predict(X_test)

In [9]:
r2_score(y_test, y_pred)

0.6948275402722368

In [10]:
y_df1 = pd.DataFrame({"y_test":y_test.values, "y_pred":y_pred})

In [11]:
np.round(y_df1.head(20))

Unnamed: 0,y_test,y_pred
0,625000.0,749583.0
1,775000.0,1033734.0
2,424900.0,492895.0
3,2180000.0,1802604.0
4,1195000.0,1137299.0
5,1979999.0,1306615.0
6,799000.0,786696.0
7,1549000.0,2193363.0
8,1249900.0,1185000.0
9,849900.0,945697.0


## Scaled

In [12]:
cb2 = CatBoostRegressor(silent=True)

scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.to_numpy().reshape(-1,1))

scores2 = cross_val_score(cb2, X_scaled, y_scaled, cv=kf, scoring="r2")

print(f"\n{scores2}\n")
pd.Series(scores2).describe()


[0.4425569  0.69797835 0.71674249 0.77316951 0.56399552 0.44262816
 0.49697722 0.66311806 0.63122491 0.7143468 ]



count    10.000000
mean      0.614274
std       0.120498
min       0.442557
25%       0.513732
50%       0.647171
75%       0.710255
max       0.773170
dtype: float64

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

cb2.fit(X_train, y_train)
y_pred = cb2.predict(X_test)

In [14]:
r2_score(y_test, y_pred)

0.6945561654193038

In [15]:
y_test[:,0].reshape(-1,1).shape

(6421, 1)

In [16]:
y_df2 = pd.DataFrame({"y_test":scaler_y.inverse_transform(y_test[:,0].reshape(-1,1))[:,0].tolist(),
                      "y_pred":scaler_y.inverse_transform(y_pred.reshape(-1,1))[:,0].tolist()})

In [17]:
np.round(y_df2.head(20))

Unnamed: 0,y_test,y_pred
0,625000.0,709138.0
1,775000.0,1064719.0
2,424900.0,486888.0
3,2180000.0,1809114.0
4,1195000.0,1131109.0
5,1979999.0,1304139.0
6,799000.0,794868.0
7,1549000.0,2136925.0
8,1249900.0,1172183.0
9,849900.0,1048988.0


## log1p: Size Only

In [18]:
cb3 = CatBoostRegressor(silent=True)

X_log = X.copy(deep=True)
X_log["size_interior"] = np.log1p(X_log["size_interior"])

X_scaled = scaler_X.fit_transform(X_log)
y_scaled = scaler_y.fit_transform(y.to_numpy().reshape(-1,1))

scores3 = cross_val_score(cb3, X_scaled, y_scaled, cv=kf, scoring="r2")

print(f"\n{scores3}\n")
pd.Series(scores3).describe()


[0.4425569  0.69797835 0.71674249 0.77316951 0.56399552 0.44262816
 0.49697722 0.66311806 0.63122491 0.7143468 ]



count    10.000000
mean      0.614274
std       0.120498
min       0.442557
25%       0.513732
50%       0.647171
75%       0.710255
max       0.773170
dtype: float64

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

cb3.fit(X_train, y_train)
y_pred = cb3.predict(X_test)

In [20]:
r2_score(y_test, y_pred)

0.6945561654193038

In [21]:
y_df3 = pd.DataFrame({"y_test":scaler_y.inverse_transform(y_test[:,0].reshape(-1,1))[:,0].tolist(),
                      "y_pred":scaler_y.inverse_transform(y_pred.reshape(-1,1))[:,0].tolist()})

In [22]:
np.round(y_df3.head(20))

Unnamed: 0,y_test,y_pred
0,625000.0,709138.0
1,775000.0,1064719.0
2,424900.0,486888.0
3,2180000.0,1809114.0
4,1195000.0,1131109.0
5,1979999.0,1304139.0
6,799000.0,794868.0
7,1549000.0,2136925.0
8,1249900.0,1172183.0
9,849900.0,1048988.0


# log1p: Size and Price

In [23]:
cb4 = CatBoostRegressor(silent=True)

X_log = X.copy(deep=True)
X_log["size_interior"] = np.log1p(X_log["size_interior"])

y_log = y.copy(deep=True)
y_log = np.log1p(y)

X_scaled = scaler_X.fit_transform(X_log)
y_scaled = scaler_y.fit_transform(y_log.to_numpy().reshape(-1,1))

scores4 = cross_val_score(cb4, X_scaled, y_scaled, cv=kf, scoring="r2")

print(f"\n{scores4}\n")
pd.Series(scores4).describe()


[0.66546207 0.78369634 0.83173502 0.85540965 0.69632677 0.64602372
 0.68602204 0.74331252 0.74403536 0.79243956]



count    10.000000
mean      0.744446
std       0.071049
min       0.646024
25%       0.688598
50%       0.743674
75%       0.790254
max       0.855410
dtype: float64

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

cb3.fit(X_train, y_train)
y_pred = cb3.predict(X_test)

In [25]:
r2_score(y_test, y_pred)

0.7946994785587216

### Reverse Log

In [26]:
r2_score(np.expm1(y_test), np.expm1(y_pred))

0.36834139255533005

In [27]:
y_df4 = pd.DataFrame({"y_test":np.expm1(scaler_y.inverse_transform(y_test))[:,0].tolist(),
                      "y_pred":np.expm1(scaler_y.inverse_transform(y_pred.reshape(-1,1))[:,0].tolist())})

In [28]:
np.round(y_df4.head(20))

Unnamed: 0,y_test,y_pred
0,625000.0,725887.0
1,775000.0,1108532.0
2,424900.0,425645.0
3,2180000.0,1782703.0
4,1195000.0,1135569.0
5,1979999.0,1242966.0
6,799000.0,825950.0
7,1549000.0,2088211.0
8,1249900.0,1150133.0
9,849900.0,852242.0


In [29]:
np.round(y_df3.head(20))

Unnamed: 0,y_test,y_pred
0,625000.0,709138.0
1,775000.0,1064719.0
2,424900.0,486888.0
3,2180000.0,1809114.0
4,1195000.0,1131109.0
5,1979999.0,1304139.0
6,799000.0,794868.0
7,1549000.0,2136925.0
8,1249900.0,1172183.0
9,849900.0,1048988.0
