In [51]:
import pandas as pd

df = pd.read_csv("../data/processed/cleaned_data_after_eda.csv")
df.head()


Unnamed: 0,Net_Metrekare,Oda_Sayısı,Bulunduğu_Kat,Eşya_Durumu,Binanın_Yaşı,Isıtma_Tipi,Fiyat,Şehir,Binanın_Kat_Sayısı,Kullanım_Durumu,Yatırıma_Uygunluk,Takas,Banyo_Sayısı
0,100.0,4.0,3.Kat,0,1.0,Kombi Doğalgaz,14.038655,adana,10.0,Boş,0,1,1.0
1,89.0,3.0,4.Kat,0,0.0,Kombi Doğalgaz,14.375127,adana,14.0,Boş,1,1,1.0
2,140.0,4.0,Düz Giriş (Zemin),0,2.0,Klimalı,14.346139,adana,4.0,Boş,1,0,1.0
3,90.0,3.0,2.Kat,0,0.0,Kombi Doğalgaz,14.346139,adana,14.0,Boş,1,1,1.0
4,105.0,4.0,8.Kat,0,1.0,Kombi Doğalgaz,14.506155,adana,11.0,Boş,1,0,1.0


## CatBoost Model Training 

In [52]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import numpy as np

y1 = df["Fiyat"]
x1 = df.drop("Fiyat", axis=1)
cat_columns = np.where(x1.dtypes != float)[0]

x_train1, x_test1, y_train1, y_test1 = train_test_split(x1,y1,test_size=0.2,random_state=34,stratify=None)
model_pipeline1 = Pipeline(steps =[("model",CatBoostRegressor(cat_features=cat_columns))])

model_pipeline1.fit(x_train1,y_train1)
y_predict1 = model_pipeline1.predict(x_test1)



Learning rate set to 0.061619
0:	learn: 0.4286093	total: 44.7ms	remaining: 44.7s
1:	learn: 0.4186009	total: 96.5ms	remaining: 48.2s
2:	learn: 0.4092912	total: 145ms	remaining: 48.3s
3:	learn: 0.4012838	total: 198ms	remaining: 49.2s
4:	learn: 0.3934183	total: 248ms	remaining: 49.4s
5:	learn: 0.3860966	total: 332ms	remaining: 55.1s
6:	learn: 0.3794494	total: 392ms	remaining: 55.6s
7:	learn: 0.3736533	total: 445ms	remaining: 55.2s
8:	learn: 0.3681290	total: 494ms	remaining: 54.4s
9:	learn: 0.3628114	total: 543ms	remaining: 53.8s
10:	learn: 0.3578420	total: 593ms	remaining: 53.3s
11:	learn: 0.3534802	total: 646ms	remaining: 53.2s
12:	learn: 0.3496966	total: 695ms	remaining: 52.8s
13:	learn: 0.3459350	total: 742ms	remaining: 52.2s
14:	learn: 0.3426521	total: 777ms	remaining: 51s
15:	learn: 0.3394496	total: 813ms	remaining: 50s
16:	learn: 0.3366924	total: 847ms	remaining: 49s
17:	learn: 0.3338986	total: 890ms	remaining: 48.6s
18:	learn: 0.3311501	total: 930ms	remaining: 48s
19:	learn: 0.3287

## CatBoost Model Evaluation

In [53]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

rmse1 = np.sqrt(mean_squared_error(y_test1,y_predict1))
mae1 = mean_absolute_error(y_test1,y_predict1)
r21 = r2_score(y_test1,y_predict1)

train_score1 = model_pipeline1.score(x_train1,y_train1)
test_score1 = model_pipeline1.score(x_test1,y_test1)

print(f"RMSE: {rmse1:.2f}") 
print(f"MAE:  {mae1:.2f}")
print(f"R2:   {r21:.2f}")

print(f"Training R2: {train_score1:.2f}")
print(f"Testing R2:  {test_score1:.2f}")

RMSE: 0.27
MAE:  0.20
R2:   0.64
Training R2: 0.72
Testing R2:  0.64


## Data Encoding for The Random Forest Model

In [56]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import TargetEncoder

y2 = df["Fiyat"]
x2 = df.drop("Fiyat", axis=1)


x_train2, x_test2, y_train2, y_test2 = train_test_split(x2,y2,test_size=0.2,random_state=65,stratify=None)

cat_columns= (df.select_dtypes(include=["object"])).columns

ordinal_cols = [col for col in cat_columns if col != "Şehir"]


rf_ct = ColumnTransformer(
    transformers=[
    ('target_enc', TargetEncoder(), ["Şehir"]),
    ("encoder",OrdinalEncoder(),ordinal_cols)
    ],
    remainder="passthrough"
)

model_pipeline2 = Pipeline(steps=[
    ("transformer", rf_ct),
    ("model",RandomForestRegressor(random_state=34))
])


model_pipeline2.fit(x_train2,y_train2)
y_predict2 =model_pipeline2.predict(x_test2)






## Random Forest Model Evaluation

In [57]:
rmse2 = np.sqrt(mean_squared_error(y_test2,y_predict2))
mae2 = mean_absolute_error(y_test2,y_predict2)
r22 = r2_score(y_test2,y_predict2)

train_score2 = model_pipeline2.score(x_train2,y_train2)
test_score2 = model_pipeline2.score(x_test2,y_test2)

print(f"RMSE: %{rmse2*100:.4f}") 
print(f"MAE:  %{mae2*100:.4f}")
print(f"R2:   %{r22*100:.4f}")

print(f"Training R2: %{train_score2*100:.4f}")
print(f"Testing R2:  %{test_score2*100:.4f}")

RMSE: %27.7302
MAE:  %20.6605
R2:   %59.7240
Training R2: %87.2610
Testing R2:  %59.7240
