In [175]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from catboost import CatBoostRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler

In [176]:
df_enc = pd.read_csv("data/enc/data-enc-2024-04-07.csv")

In [177]:
df_enc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85823 entries, 0 to 85822
Data columns (total 85 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   bathrooms_total                   85823 non-null  float64
 1   bedrooms_extra                    85823 non-null  float64
 2   bedrooms                          85823 non-null  float64
 3   stories_total                     85823 non-null  float64
 4   size_interior                     85823 non-null  float64
 5   lng                               85823 non-null  float64
 6   lat                               85823 non-null  float64
 7   parkings                          85823 non-null  float64
 8   price                             85823 non-null  float64
 9   household_income                  85823 non-null  float64
 10  individual_income                 85823 non-null  float64
 11  commute_transit                   85823 non-null  float64
 12  comm

In [178]:
df_ON = df_enc[df_enc["province_Ontario"] == True]
df_QC = df_enc[df_enc["province_Quebec"] == True]
df_BC = df_enc[df_enc["province_British Columbia"] == True]
df_AB = df_enc[df_enc["province_Alberta"] == True]
df_SK = df_enc[df_enc["province_Saskatchewan"] == True]
df_MB = df_enc[df_enc["province_Manitoba"] == True]
# East
df_ES = df_enc[(df_enc["province_Nova Scotia"] == True) |
               (df_enc["province_New Brunswick"] == True) |
               (df_enc["province_Newfoundland & Labrador"] == True) |
               (df_enc["province_Prince Edward Island"] == True)]
# North
df_NO = df_enc[(df_enc["province_Yukon"] == True) |
               (df_enc["province_Northwest Territories"] == True)]

df_ON = df_ON.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_QC = df_QC.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_BC = df_BC.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_AB = df_AB.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_SK = df_SK.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_MB = df_MB.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta", "province_Saskatchewan",
                    "province_Manitoba", "province_Nova Scotia", "province_New Brunswick", "province_Newfoundland & Labrador",
                    "province_Prince Edward Island", "province_Yukon", "province_Northwest Territories"], axis=1)
df_ES = df_ES.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta",
                    "province_Saskatchewan","province_Manitoba", "province_Yukon", "province_Northwest Territories"], axis=1)
df_NO = df_NO.drop(["province_Ontario", "province_Quebec", "province_British Columbia", "province_Alberta",
                    "province_Saskatchewan", "province_Manitoba", "province_Nova Scotia", "province_New Brunswick",
                    "province_Newfoundland & Labrador", "province_Prince Edward Island"], axis=1)

In [179]:
df_MB.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2628 entries, 30961 to 85761
Data columns (total 73 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   bathrooms_total                2628 non-null   float64
 1   bedrooms_extra                 2628 non-null   float64
 2   bedrooms                       2628 non-null   float64
 3   stories_total                  2628 non-null   float64
 4   size_interior                  2628 non-null   float64
 5   lng                            2628 non-null   float64
 6   lat                            2628 non-null   float64
 7   parkings                       2628 non-null   float64
 8   price                          2628 non-null   float64
 9   household_income               2628 non-null   float64
 10  individual_income              2628 non-null   float64
 11  commute_transit                2628 non-null   float64
 12  commute_foot                   2628 non-null   f

## Raw Input

In [180]:
kf = KFold(n_splits=10)

In [181]:
cb1 = CatBoostRegressor(silent=True)

X = df_MB.drop("price", axis=1)
y = df_MB["price"]

scores1 = cross_val_score(cb1, X, y, cv=kf, scoring="r2")

print(f"\n{scores1}\n")
pd.Series(scores1).describe()


[0.57267047 0.58925595 0.8858016  0.89652852 0.59858811 0.58091504
 0.77528453 0.61873117 0.7486905  0.82840755]



count    10.000000
mean      0.709487
std       0.131791
min       0.572670
25%       0.591589
50%       0.683711
75%       0.815127
max       0.896529
dtype: float64

In [182]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

cb1.fit(X_train, y_train)
y_pred = cb1.predict(X_test)

In [183]:
r2_score(y_test, y_pred)

0.8585900440221569

In [184]:
y_df1 = pd.DataFrame({"y_test":y_test.values, "y_pred":y_pred})

In [185]:
np.round(y_df1.head(20))

Unnamed: 0,y_test,y_pred
0,259900.0,256653.0
1,599900.0,602203.0
2,274900.0,237870.0
3,369900.0,407332.0
4,519900.0,593836.0
5,599900.0,655886.0
6,564900.0,545255.0
7,199900.0,261523.0
8,99900.0,161212.0
9,269900.0,236761.0


## Scaled

In [186]:
cb2 = CatBoostRegressor(silent=True)

scaler_X = StandardScaler()
scaler_y = StandardScaler()

X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.to_numpy().reshape(-1,1))

scores2 = cross_val_score(cb2, X_scaled, y_scaled, cv=kf, scoring="r2")

print(f"\n{scores2}\n")
pd.Series(scores2).describe()


[0.55860941 0.60115267 0.8884417  0.89066569 0.60346474 0.55616133
 0.78028614 0.64835957 0.73675376 0.82761922]



count    10.000000
mean      0.709151
std       0.132307
min       0.556161
25%       0.601731
50%       0.692557
75%       0.815786
max       0.890666
dtype: float64

In [187]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

cb2.fit(X_train, y_train)
y_pred = cb2.predict(X_test)

In [188]:
r2_score(y_test, y_pred)

0.8585394869276226

In [189]:
y_test[:,0].reshape(-1,1).shape

(526, 1)

In [190]:
y_df2 = pd.DataFrame({"y_test":scaler_y.inverse_transform(y_test[:,0].reshape(-1,1))[:,0].tolist(),
                      "y_pred":scaler_y.inverse_transform(y_pred.reshape(-1,1))[:,0].tolist()})

In [191]:
np.round(y_df2.head(20))

Unnamed: 0,y_test,y_pred
0,259900.0,273643.0
1,599900.0,611279.0
2,274900.0,245194.0
3,369900.0,416213.0
4,519900.0,558245.0
5,599900.0,652428.0
6,564900.0,548265.0
7,199900.0,258952.0
8,99900.0,171642.0
9,269900.0,239972.0


## log1p: Size Only

In [192]:
cb3 = CatBoostRegressor(silent=True)

X_log = X.copy(deep=True)
X_log["size_interior"] = np.log1p(X_log["size_interior"])

X_scaled = scaler_X.fit_transform(X_log)
y_scaled = scaler_y.fit_transform(y.to_numpy().reshape(-1,1))

scores3 = cross_val_score(cb3, X_scaled, y_scaled, cv=kf, scoring="r2")

print(f"\n{scores3}\n")
pd.Series(scores3).describe()


[0.55863069 0.6011714  0.8884653  0.89067972 0.60341794 0.55611278
 0.78016964 0.64833128 0.73698194 0.82759708]



count    10.000000
mean      0.709156
std       0.132317
min       0.556113
25%       0.601733
50%       0.692657
75%       0.815740
max       0.890680
dtype: float64

In [193]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

cb3.fit(X_train, y_train)
y_pred = cb3.predict(X_test)

In [194]:
r2_score(y_test, y_pred)

0.8585458977928961

In [195]:
y_df3 = pd.DataFrame({"y_test":scaler_y.inverse_transform(y_test[:,0].reshape(-1,1))[:,0].tolist(),
                      "y_pred":scaler_y.inverse_transform(y_pred.reshape(-1,1))[:,0].tolist()})

In [196]:
np.round(y_df3.head(20))

Unnamed: 0,y_test,y_pred
0,259900.0,273643.0
1,599900.0,611279.0
2,274900.0,245194.0
3,369900.0,416213.0
4,519900.0,558245.0
5,599900.0,652428.0
6,564900.0,548265.0
7,199900.0,258952.0
8,99900.0,171642.0
9,269900.0,239972.0


# log1p: Size and Price

In [197]:
cb4 = CatBoostRegressor(silent=True)

X_log = X.copy(deep=True)
X_log["size_interior"] = np.log1p(X_log["size_interior"])

y_log = y.copy(deep=True)
y_log = np.log1p(y)

X_scaled = scaler_X.fit_transform(X_log)
y_scaled = scaler_y.fit_transform(y_log.to_numpy().reshape(-1,1))

scores4 = cross_val_score(cb4, X_scaled, y_scaled, cv=kf, scoring="r2")

print(f"\n{scores4}\n")
pd.Series(scores4).describe()


[0.66641075 0.8090289  0.84974014 0.87470218 0.78136477 0.68350787
 0.7575178  0.8445545  0.81365657 0.81202016]



count    10.000000
mean      0.789250
std       0.069013
min       0.666411
25%       0.763480
50%       0.810525
75%       0.836830
max       0.874702
dtype: float64

In [198]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

cb3.fit(X_train, y_train)
y_pred = cb3.predict(X_test)

In [199]:
r2_score(y_test, y_pred)

0.8606017135370583

### Reverse Log

In [200]:
r2_score(np.expm1(y_test), np.expm1(y_pred))

0.7045746702308415

In [201]:
y_df4 = pd.DataFrame({"y_test":np.expm1(scaler_y.inverse_transform(y_test))[:,0].tolist(),
                      "y_pred":np.expm1(scaler_y.inverse_transform(y_pred.reshape(-1,1))[:,0].tolist())})

In [202]:
np.round(y_df4.head(20))

Unnamed: 0,y_test,y_pred
0,259900.0,239938.0
1,599900.0,577015.0
2,274900.0,250991.0
3,369900.0,421303.0
4,519900.0,534947.0
5,599900.0,637136.0
6,564900.0,559221.0
7,199900.0,228927.0
8,99900.0,177284.0
9,269900.0,247827.0


In [203]:
np.round(y_df3.head(20))

Unnamed: 0,y_test,y_pred
0,259900.0,273643.0
1,599900.0,611279.0
2,274900.0,245194.0
3,369900.0,416213.0
4,519900.0,558245.0
5,599900.0,652428.0
6,564900.0,548265.0
7,199900.0,258952.0
8,99900.0,171642.0
9,269900.0,239972.0
