In [126]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler

In [127]:
df_enc = pd.read_csv("data/enc/data-enc-2024-04-01.csv")

In [128]:
df_enc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77455 entries, 0 to 77454
Data columns (total 85 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   bathrooms_total                   77455 non-null  float64
 1   bedrooms_extra                    77455 non-null  float64
 2   bedrooms                          77455 non-null  float64
 3   stories_total                     77455 non-null  float64
 4   size_interior                     77455 non-null  float64
 5   lng                               77455 non-null  float64
 6   lat                               77455 non-null  float64
 7   parkings                          77455 non-null  float64
 8   price                             77455 non-null  float64
 9   household_income                  77455 non-null  float64
 10  individual_income                 77455 non-null  float64
 11  commute_transit                   77455 non-null  float64
 12  comm

In [129]:
df_ON = df_enc[df_enc["province_Ontario"] == True]
df_QC = df_enc[df_enc["province_Quebec"] == True]
df_BC = df_enc[df_enc["province_British Columbia"] == True]
df_AB = df_enc[df_enc["province_Alberta"] == True]
df_SK = df_enc[df_enc["province_Saskatchewan"] == True]
df_MB = df_enc[df_enc["province_Manitoba"] == True]
# East
df_ES = df_enc[(df_enc["province_Nova Scotia"] == True) |
               (df_enc["province_New Brunswick"] == True) |
               (df_enc["province_Newfoundland & Labrador"] == True) |
               (df_enc["province_Prince Edward Island"] == True)]
# North
df_NO = df_enc[(df_enc["province_Yukon"] == True) |
               (df_enc["province_Northwest Territories"] == True)]

df_ON = df_ON.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_QC = df_QC.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_BC = df_BC.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_AB = df_AB.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_SK = df_SK.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_MB = df_MB.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_ES = df_ES.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta",
                    "province_Saskatchewan","province_Manitoba", "province_Yukon", "province_Northwest Territories"], axis=1)
df_NO = df_NO.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta",
                    "province_Saskatchewan", "province_Manitoba", "province_Nova Scotia", "province_New Brunswick",
                    "province_Newfoundland & Labrador", "province_Prince Edward Island"], axis=1)

# Ontario

In [130]:
df_ON.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32102 entries, 0 to 75724
Data columns (total 73 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   bathrooms_total                32102 non-null  float64
 1   bedrooms_extra                 32102 non-null  float64
 2   bedrooms                       32102 non-null  float64
 3   stories_total                  32102 non-null  float64
 4   size_interior                  32102 non-null  float64
 5   lng                            32102 non-null  float64
 6   lat                            32102 non-null  float64
 7   parkings                       32102 non-null  float64
 8   price                          32102 non-null  float64
 9   household_income               32102 non-null  float64
 10  individual_income              32102 non-null  float64
 11  commute_transit                32102 non-null  float64
 12  commute_foot                   32102 non-null  floa

## Raw Input

In [131]:
kf = KFold(n_splits=10)

In [310]:
xgb1 = XGBRegressor()

X = df_ON.drop("price", axis=1)
y = df_ON["price"]

scores1 = cross_val_score(xgb1, X, y, cv=kf, scoring="r2")

print(f"\n{scores1}\n")
pd.Series(scores1).describe()


[0.35368652 0.58157181 0.68677471 0.74397317 0.50192841 0.41186669
 0.46075854 0.62124139 0.58318798 0.67307637]



count    10.000000
mean      0.561807
std       0.126997
min       0.353687
25%       0.471051
50%       0.582380
75%       0.660118
max       0.743973
dtype: float64

In [311]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgb1.fit(X_train, y_train)
y_pred = xgb1.predict(X_test)

In [312]:
r2_score(y_test, y_pred)

0.6642689045533662

In [313]:
y_df1 = pd.DataFrame({"y_test":y_test.values, "y_pred":y_pred})

In [314]:
np.round(y_df1.head(20))

Unnamed: 0,y_test,y_pred
0,625000.0,674129.0
1,775000.0,1278794.0
2,424900.0,412678.0
3,2180000.0,1802787.0
4,1195000.0,1210302.0
5,1979999.0,1496843.0
6,799000.0,813308.0
7,1549000.0,2678817.0
8,1249900.0,1274838.0
9,849900.0,780594.0


## Scaled

In [315]:
xgb2 = XGBRegressor()

scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.to_numpy().reshape(-1,1))

scores2 = cross_val_score(xgb2, X_scaled, y_scaled, cv=kf, scoring="r2")

print(f"\n{scores2}\n")
pd.Series(scores2).describe()


[0.36607238 0.61976749 0.68388947 0.74209206 0.5032429  0.42573944
 0.44995325 0.61717676 0.59509104 0.65796865]



count    10.000000
mean      0.566099
std       0.123275
min       0.366072
25%       0.463276
50%       0.606134
75%       0.648418
max       0.742092
dtype: float64

In [316]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

xgb2.fit(X_train, y_train)
y_pred = xgb2.predict(X_test)

In [317]:
r2_score(y_test, y_pred)

0.673188744717641

In [318]:
y_test[:,0].reshape(-1,1).shape

(6421, 1)

In [319]:
y_df2 = pd.DataFrame({"y_test":scaler_y.inverse_transform(y_test[:,0].reshape(-1,1))[:,0].tolist(),
                      "y_pred":scaler_y.inverse_transform(y_pred.reshape(-1,1))[:,0].tolist()})

In [320]:
np.round(y_df2.head(20))

Unnamed: 0,y_test,y_pred
0,625000.0,631090.0
1,775000.0,1191296.0
2,424900.0,433957.0
3,2180000.0,1852860.0
4,1195000.0,1247205.0
5,1979999.0,1506818.0
6,799000.0,803795.0
7,1549000.0,1786066.0
8,1249900.0,1271470.0
9,849900.0,825274.0


## log1p: Size Only

In [321]:
xgb3 = XGBRegressor()

X_log = X.copy(deep=True)
X_log["size_interior"] = np.log1p(X_log["size_interior"])

X_scaled = scaler_X.fit_transform(X_log)
y_scaled = scaler_y.fit_transform(y.to_numpy().reshape(-1,1))

scores3 = cross_val_score(xgb3, X_scaled, y_scaled, cv=kf, scoring="r2")

print(f"\n{scores3}\n")
pd.Series(scores3).describe()


[0.36607238 0.61976749 0.68388947 0.74209206 0.5032429  0.42573944
 0.44995325 0.61717676 0.59509104 0.65796865]



count    10.000000
mean      0.566099
std       0.123275
min       0.366072
25%       0.463276
50%       0.606134
75%       0.648418
max       0.742092
dtype: float64

In [322]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

xgb3.fit(X_train, y_train)
y_pred = xgb3.predict(X_test)

In [323]:
r2_score(y_test, y_pred)

0.673188744717641

In [324]:
y_df3 = pd.DataFrame({"y_test":scaler_y.inverse_transform(y_test[:,0].reshape(-1,1))[:,0].tolist(),
                      "y_pred":scaler_y.inverse_transform(y_pred.reshape(-1,1))[:,0].tolist()})

In [325]:
np.round(y_df3.head(20))

Unnamed: 0,y_test,y_pred
0,625000.0,631090.0
1,775000.0,1191296.0
2,424900.0,433957.0
3,2180000.0,1852860.0
4,1195000.0,1247205.0
5,1979999.0,1506818.0
6,799000.0,803795.0
7,1549000.0,1786066.0
8,1249900.0,1271470.0
9,849900.0,825274.0


# log1p: Size and Price

In [326]:
xgb4 = XGBRegressor()

X_log = X.copy(deep=True)
X_log["size_interior"] = np.log1p(X_log["size_interior"])

y_log = y.copy(deep=True)
y_log = np.log1p(y)

X_scaled = scaler_X.fit_transform(X_log)
y_scaled = scaler_y.fit_transform(y_log.to_numpy().reshape(-1,1))

scores4 = cross_val_score(xgb4, X_scaled, y_scaled, cv=kf, scoring="r2")

print(f"\n{scores4}\n")
pd.Series(scores4).describe()


[0.62697633 0.75711354 0.81892001 0.83826102 0.6641295  0.59005937
 0.64721044 0.73745766 0.72734305 0.78494175]



count    10.000000
mean      0.719241
std       0.084114
min       0.590059
25%       0.651440
50%       0.732400
75%       0.777985
max       0.838261
dtype: float64

In [327]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

xgb3.fit(X_train, y_train)
y_pred = xgb3.predict(X_test)

In [328]:
r2_score(y_test, y_pred)

0.7811194612417051

### Reverse Log

In [329]:
r2_score(np.expm1(y_test), np.expm1(y_pred))

0.38641613165732136

In [330]:
y_df4 = pd.DataFrame({"y_test":np.expm1(scaler_y.inverse_transform(y_test))[:,0].tolist(),
                      "y_pred":np.expm1(scaler_y.inverse_transform(y_pred.reshape(-1,1))[:,0].tolist())})

In [331]:
np.round(y_df4.head(20))

Unnamed: 0,y_test,y_pred
0,625000.0,982461.0
1,775000.0,1071234.0
2,424900.0,422856.0
3,2180000.0,1504129.0
4,1195000.0,1167197.0
5,1979999.0,1251780.0
6,799000.0,880598.0
7,1549000.0,2410614.0
8,1249900.0,1244917.0
9,849900.0,790417.0


In [332]:
np.round(y_df3.head(20))

Unnamed: 0,y_test,y_pred
0,625000.0,631090.0
1,775000.0,1191296.0
2,424900.0,433957.0
3,2180000.0,1852860.0
4,1195000.0,1247205.0
5,1979999.0,1506818.0
6,799000.0,803795.0
7,1549000.0,1786066.0
8,1249900.0,1271470.0
9,849900.0,825274.0
