In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, root_mean_squared_error
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [4]:
df_enc = pd.read_csv("data/enc/data-enc-2024-05-21.csv")

In [5]:
df_enc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160115 entries, 0 to 160114
Data columns (total 86 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   avg_price_5                       160115 non-null  float64
 1   bathrooms_total                   160115 non-null  float64
 2   bedrooms_extra                    160115 non-null  float64
 3   bedrooms                          160115 non-null  float64
 4   stories_total                     160115 non-null  float64
 5   size_interior                     160115 non-null  float64
 6   lng                               160115 non-null  float64
 7   lat                               160115 non-null  float64
 8   parkings                          160115 non-null  float64
 9   price                             160115 non-null  float64
 10  household_income                  160115 non-null  float64
 11  individual_income                 160115 non-null  f

In [6]:
df_ON = df_enc[df_enc["province_Ontario"] == True]
df_QC = df_enc[df_enc["province_Quebec"] == True]
df_BC = df_enc[df_enc["province_British Columbia"] == True]
df_AB = df_enc[df_enc["province_Alberta"] == True]
df_SK = df_enc[df_enc["province_Saskatchewan"] == True]
df_MB = df_enc[df_enc["province_Manitoba"] == True]
# East
df_ES = df_enc[(df_enc["province_Nova Scotia"] == True) |
               (df_enc["province_New Brunswick"] == True) |
               (df_enc["province_Newfoundland & Labrador"] == True) |
               (df_enc["province_Prince Edward Island"] == True)]
# North
df_NO = df_enc[(df_enc["province_Yukon"] == True) |
               (df_enc["province_Northwest Territories"] == True)]

df_ON = df_ON.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_QC = df_QC.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_BC = df_BC.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_AB = df_AB.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_SK = df_SK.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_MB = df_MB.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_ES = df_ES.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta",
                    "province_Saskatchewan","province_Manitoba", "province_Yukon", "province_Northwest Territories"], axis=1)
df_NO = df_NO.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta",
                    "province_Saskatchewan", "province_Manitoba", "province_Nova Scotia", "province_New Brunswick",
                    "province_Newfoundland & Labrador", "province_Prince Edward Island"], axis=1)

In [7]:
df_ON.info()

<class 'pandas.core.frame.DataFrame'>
Index: 77056 entries, 0 to 154896
Data columns (total 74 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   avg_price_5                    77056 non-null  float64
 1   bathrooms_total                77056 non-null  float64
 2   bedrooms_extra                 77056 non-null  float64
 3   bedrooms                       77056 non-null  float64
 4   stories_total                  77056 non-null  float64
 5   size_interior                  77056 non-null  float64
 6   lng                            77056 non-null  float64
 7   lat                            77056 non-null  float64
 8   parkings                       77056 non-null  float64
 9   price                          77056 non-null  float64
 10  household_income               77056 non-null  float64
 11  individual_income              77056 non-null  float64
 12  commute_transit                77056 non-null  flo

In [8]:
df_BC.corr(numeric_only=True)["price"].sort_values(ascending=False).head(20)

price                         1.000000
avg_price_5                   0.728182
size_interior                 0.567894
bedrooms                      0.470157
bathrooms_total               0.469096
individual_income             0.466570
household_income              0.466570
building_type_House           0.389645
ownership_type_group_ids_1    0.376236
parkings                      0.324614
age_15_to_19                  0.246046
area_duplex                   0.242873
edu_post_graduate_degree      0.236685
single_family                 0.229029
edu_bachelor_degree           0.228890
multi_family                  0.227841
edu_university_certificate    0.190294
lang_other                    0.165017
age_10_to_14                  0.162924
loc_parks                     0.158973
Name: price, dtype: float64

## Raw Input

In [9]:
kf = KFold(n_splits=10)

In [10]:
cb1 = CatBoostRegressor(silent=True)

X = df_ON.drop("price", axis=1)
y = df_ON["price"]

scores1 = cross_val_score(cb1, X, y, cv=kf, scoring="r2")

print(f"\n{scores1}\n")
pd.Series(scores1).describe()


[0.71890356 0.70450586 0.59846249 0.72086617 0.79321051 0.74501494
 0.72236346 0.73063847 0.70066893 0.74210601]



count    10.000000
mean      0.717674
std       0.049358
min       0.598462
25%       0.708105
50%       0.721615
75%       0.739239
max       0.793211
dtype: float64

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

cb1.fit(X_train, y_train)
y_pred = cb1.predict(X_test)

In [12]:
r2_score(y_test, y_pred)

0.7404303994751333

In [13]:
root_mean_squared_error(y_test, y_pred)

276058.86683566857

In [14]:
y_df1 = pd.DataFrame({"y_test":y_test.values, "y_pred":y_pred})

In [15]:
np.round(y_df1.head(20))

Unnamed: 0,y_test,y_pred
0,1299000.0,1367841.0
1,559900.0,422141.0
2,369900.0,388110.0
3,939900.0,901682.0
4,699900.0,827991.0
5,790000.0,852431.0
6,629900.0,614814.0
7,275000.0,360406.0
8,599900.0,640364.0
9,1289900.0,1160528.0


## Scaled

In [16]:
cb2 = CatBoostRegressor(silent=True)

scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.to_numpy().reshape(-1,1))

scores2 = cross_val_score(cb2, X_scaled, y_scaled, cv=kf, scoring="r2")

print(f"\n{scores2}\n")
pd.Series(scores2).describe()


[0.71409917 0.70572296 0.60534044 0.72170963 0.795224   0.74498917
 0.72323319 0.72633864 0.69976312 0.74259725]



count    10.000000
mean      0.717902
std       0.047840
min       0.605340
25%       0.707817
50%       0.722471
75%       0.738533
max       0.795224
dtype: float64

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

cb2.fit(X_train, y_train)
y_pred = cb2.predict(X_test)

In [18]:
r2_score(y_test, y_pred)

0.7414105632911907

In [19]:
root_mean_squared_error(scaler_y.inverse_transform(y_test), scaler_y.inverse_transform(y_pred.reshape(-1,1)))

275537.15929927636

In [20]:
y_df2 = pd.DataFrame({"y_test":scaler_y.inverse_transform(y_test[:,0].reshape(-1,1))[:,0].tolist(),
                      "y_pred":scaler_y.inverse_transform(y_pred.reshape(-1,1))[:,0].tolist()})

In [21]:
np.round(y_df2.head(20))

Unnamed: 0,y_test,y_pred
0,1299000.0,1372134.0
1,559900.0,434518.0
2,369900.0,376609.0
3,939900.0,894375.0
4,699900.0,819007.0
5,790000.0,854122.0
6,629900.0,634359.0
7,275000.0,351898.0
8,599900.0,658089.0
9,1289900.0,1143188.0


## log1p: X Only

In [22]:
from scipy import stats

sub_df = df_AB.copy(deep=True)

skew_df = pd.DataFrame(sub_df.columns, columns=["Feature"])
skew_df["Corr_Price_Before"] = sub_df.corr()["price"].values
skew_df["Skew_Before"] = skew_df["Feature"].apply(lambda feature: stats.skew(sub_df[feature]))
skew_df["AbsSkew_Before"] = skew_df["Skew_Before"].apply(abs)
skew_df["Skewed_Before"] = skew_df["AbsSkew_Before"].apply(lambda x: True if x>= 0.5 else False)

# before_log = set(skew_df.query("Skewed_Before == True")["Feature"].values)

for column in skew_df.query("Skewed_Before == True")["Feature"].values:
    if column != "price":
        sub_df[column] = np.log1p(sub_df[column])


# skew_df = pd.DataFrame(sub_df.columns, columns=["Feature"])
skew_df["Corr_Price_After"] = sub_df.corr()["price"].values
skew_df["Skew_After"] = skew_df["Feature"].apply(lambda feature: stats.skew(sub_df[feature]))
skew_df["AbsSkew_After"] = skew_df["Skew_After"].apply(abs)
skew_df["Skewed_After"] = skew_df["AbsSkew_After"].apply(lambda x: True if x>= 0.5 else False)

# after_log = set(skew_df.query("Skewed_Before == True")["Feature"].values)

# log_columns = list(before_log - after_log - {"price", "lng", "lat"})
log_columns = skew_df[(skew_df["Skewed_Before"]==True) & (skew_df["Corr_Price_After"] > skew_df["Corr_Price_Before"])]["Feature"].values

log_columns

  result = getattr(ufunc, method)(*inputs, **kwargs)


array(['bedrooms_extra', 'commute_transit', 'commute_drive',
       'lang_en_only', 'lang_en_and_fr', 'edu_post_graduate_degree',
       'area_apt_5_plus_floors', 'loc_wellness',
       'building_type_Apartment', 'building_type_Row / Townhouse',
       'ownership_type_group_ids_1'], dtype=object)

In [23]:
cb3 = CatBoostRegressor(silent=True)

X_log = X.copy(deep=True)
X_log[log_columns] = np.log1p(X_log[log_columns])

X_scaled = scaler_X.fit_transform(X_log)
y_scaled = scaler_y.fit_transform(y.to_numpy().reshape(-1,1))

scores3 = cross_val_score(cb3, X_scaled, y_scaled, cv=kf, scoring="r2")

print(f"\n{scores3}\n")
pd.Series(scores3).describe()


[0.71409917 0.70572296 0.60534044 0.72170963 0.795224   0.74498917
 0.72322859 0.72633864 0.69976312 0.74259725]



count    10.000000
mean      0.717901
std       0.047840
min       0.605340
25%       0.707817
50%       0.722469
75%       0.738533
max       0.795224
dtype: float64

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

cb3.fit(X_train, y_train)
y_pred = cb3.predict(X_test)

In [25]:
r2_score(y_test, y_pred)

0.7414105632911907

In [26]:
root_mean_squared_error(y_test, y_pred)

0.5227749383645397

In [27]:
y_df3 = pd.DataFrame({"y_test":scaler_y.inverse_transform(y_test[:,0].reshape(-1,1))[:,0].tolist(),
                      "y_pred":scaler_y.inverse_transform(y_pred.reshape(-1,1))[:,0].tolist()})

In [28]:
np.round(y_df3.head(20))

Unnamed: 0,y_test,y_pred
0,1299000.0,1372134.0
1,559900.0,434518.0
2,369900.0,376609.0
3,939900.0,894375.0
4,699900.0,819007.0
5,790000.0,854122.0
6,629900.0,634359.0
7,275000.0,351898.0
8,599900.0,658089.0
9,1289900.0,1143188.0


# log1p: X and y

In [29]:
cb4 = CatBoostRegressor(silent=True)

X_log = X.copy(deep=True)
X_log[log_columns] = np.log1p(X_log[log_columns])

y_log = np.log1p(y)

X_scaled = scaler_X.fit_transform(X_log)
y_scaled = scaler_y.fit_transform(y_log.to_numpy().reshape(-1,1))

scores4 = cross_val_score(cb4, X_scaled, y_scaled, cv=kf, scoring="r2")

print(f"\n{scores4}\n")
pd.Series(scores4).describe()


[0.82182214 0.82146699 0.73593063 0.81733586 0.84662085 0.82513512
 0.82318958 0.8239074  0.82656587 0.82683272]



count    10.000000
mean      0.816881
std       0.029502
min       0.735931
25%       0.821556
50%       0.823548
75%       0.826208
max       0.846621
dtype: float64

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

cb4.fit(X_train, y_train)
y_pred = cb4.predict(X_test)

In [31]:
r2_score(y_test, y_pred)

0.8277191904545071

### Reverse Log

In [32]:
r2_score(np.expm1(y_test), np.expm1(y_pred))

0.34505682282317285

In [33]:
root_mean_squared_error(y_test, y_pred)

0.4148795485106637

In [34]:
y_df4 = pd.DataFrame({"y_test":np.expm1(scaler_y.inverse_transform(y_test))[:,0].tolist(),
                      "y_pred":np.expm1(scaler_y.inverse_transform(y_pred.reshape(-1,1))[:,0].tolist())})

In [35]:
np.round(y_df4.head(20))

Unnamed: 0,y_test,y_pred
0,1299000.0,1377517.0
1,559900.0,454482.0
2,369900.0,367924.0
3,939900.0,913473.0
4,699900.0,811925.0
5,790000.0,824204.0
6,629900.0,606206.0
7,275000.0,288117.0
8,599900.0,574330.0
9,1289900.0,1130909.0


In [36]:
np.round(y_df3.head(20))

Unnamed: 0,y_test,y_pred
0,1299000.0,1372134.0
1,559900.0,434518.0
2,369900.0,376609.0
3,939900.0,894375.0
4,699900.0,819007.0
5,790000.0,854122.0
6,629900.0,634359.0
7,275000.0,351898.0
8,599900.0,658089.0
9,1289900.0,1143188.0


In [37]:
plt.figure(figsize=(12,8))
plt.scatter(np.expm1(scaler_X.inverse_transform(X_test))[:,5], np.expm1(scaler_y.inverse_transform(y_test)))
plt.scatter(np.expm1(scaler_X.inverse_transform(X_test))[:,5], np.expm1(scaler_y.inverse_transform(y_pred.reshape(-1,1))))

  plt.scatter(np.expm1(scaler_X.inverse_transform(X_test))[:,5], np.expm1(scaler_y.inverse_transform(y_test)))
  plt.scatter(np.expm1(scaler_X.inverse_transform(X_test))[:,5], np.expm1(scaler_y.inverse_transform(y_pred.reshape(-1,1))))


<matplotlib.collections.PathCollection at 0x7f55cc3ea9c0>

  steps = self._extended_steps * scale
  high = edge.ge(_vmax - best_vmin)
  d, m = divmod(x, self.step)


Error in callback <function _draw_all_if_interactive at 0x7f55fc250ea0> (for post_execute), with arguments args (),kwargs {}:


ValueError: arange: cannot compute length

ValueError: arange: cannot compute length

<Figure size 1200x800 with 1 Axes>

In [38]:
r2_score(np.expm1(scaler_y.inverse_transform(y_test)), np.expm1(scaler_y.inverse_transform(y_pred.reshape(-1,1))))

0.7199900915708094

In [39]:
X_log

Unnamed: 0,avg_price_5,bathrooms_total,bedrooms_extra,bedrooms,stories_total,size_interior,lng,lat,parkings,household_income,...,loc_cafes,loc_quiet,loc_parks,building_type_Apartment,building_type_House,building_type_Row / Townhouse,building_type_Semi-Detached,ownership_type_group_ids_0,ownership_type_group_ids_1,ownership_type_group_ids_2
0,644900.0,2.0,0.000000,2.0,1.0,1072.000000,-83.029633,42.265783,0.0,125972.0819,...,1.5,4.5,3.5,0.693147,0.0,0.0,0.0,0.0,0.000000,1.0
1,1098759.8,3.0,0.000000,4.0,2.0,1965.881818,-83.087577,42.228649,0.0,205459.7539,...,1.0,5.0,0.0,0.000000,1.0,0.0,0.0,0.0,0.693147,0.0
2,347360.0,1.0,0.000000,4.0,1.0,1040.426146,-83.056441,42.304613,0.0,76724.5170,...,2.5,3.0,3.5,0.000000,1.0,0.0,0.0,0.0,0.693147,0.0
3,299537.6,1.0,0.000000,3.0,1.0,3718.455676,-83.055635,42.302468,0.0,78194.1950,...,2.0,4.0,3.5,0.000000,1.0,0.0,0.0,0.0,0.693147,0.0
4,895919.8,3.0,0.000000,4.0,2.0,2700.000000,-83.093660,42.134249,0.0,143383.8087,...,0.0,5.0,0.0,0.000000,1.0,0.0,0.0,0.0,0.693147,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
154362,717920.0,3.0,0.000000,3.0,2.0,3092.004209,-77.505887,46.114368,4.0,159543.4167,...,0.0,5.0,0.0,0.000000,1.0,0.0,0.0,0.0,0.693147,0.0
154363,725900.0,2.0,0.693147,3.0,1.0,1386.740020,-77.516865,46.108780,5.0,152340.5896,...,0.0,4.0,0.0,0.000000,1.0,0.0,0.0,0.0,0.693147,0.0
154520,765740.0,1.0,0.000000,3.0,1.0,480.000000,-79.803065,47.059016,0.0,113000.0000,...,0.5,5.0,0.0,0.000000,1.0,0.0,0.0,0.0,0.693147,0.0
154521,732940.0,0.0,0.000000,2.0,1.0,879.000000,-79.834607,47.047430,0.0,113000.0000,...,0.5,4.5,0.0,0.000000,1.0,0.0,0.0,0.0,0.693147,0.0


In [40]:
y

0          399900.0
1          959900.0
2          299900.0
3          399000.0
4          999900.0
            ...    
154362    1100000.0
154363     465000.0
154520     385000.0
154521     549000.0
154896     394000.0
Name: price, Length: 77056, dtype: float64

In [41]:
df_log = X_log.copy(deep=True)
df_log["price_log"] = np.log1p(y)

In [42]:
df_log.corr(numeric_only=True)["price_log"].sort_values(ascending=False).head(20)

price_log                     1.000000
avg_price_5                   0.734018
bathrooms_total               0.620813
size_interior                 0.562767
household_income              0.473246
individual_income             0.473246
bedrooms                      0.470760
parkings                      0.424573
single_family                 0.360289
building_type_House           0.341128
ownership_type_group_ids_1    0.332655
stories_total                 0.294120
edu_bachelor_degree           0.293487
owners                        0.287807
edu_university_certificate    0.268325
bedrooms_extra                0.258855
edu_post_graduate_degree      0.233717
multi_family                  0.232745
age_15_to_19                  0.221740
age_10_to_14                  0.195383
Name: price_log, dtype: float64

In [43]:
X.columns

Index(['avg_price_5', 'bathrooms_total', 'bedrooms_extra', 'bedrooms',
       'stories_total', 'size_interior', 'lng', 'lat', 'parkings',
       'household_income', 'individual_income', 'commute_transit',
       'commute_foot', 'commute_bicycle', 'commute_drive', 'single_family',
       'multi_family', 'single_person', 'multi_person', 'total_individuals',
       'age_0_to_4', 'age_5_to_9', 'age_10_to_14', 'age_15_to_19',
       'age_20_to_34', 'age_35_to_49', 'age_50_to_64', 'age_65_to_79',
       'age_80_plus', 'owners', 'renters', 'lang_en_only', 'lang_fr_only',
       'lang_en_and_fr', 'lang_other', 'edu_no_high_school', 'edu_high_school',
       'edu_trade_certificate', 'edu_college_certificate',
       'edu_university_certificate', 'edu_bachelor_degree',
       'edu_post_graduate_degree', 'household_children',
       'area_single_detached', 'area_semi_detached', 'area_duplex',
       'area_row_houses', 'area_apt_1_to_4_floors', 'area_apt_5_plus_floors',
       'loc_high_schools', 