In [2]:
import pandas as pd

df = pd.read_csv("../data/processed/cleaned_data_after_eda.csv")
df.head()


Unnamed: 0,Net_Metrekare,Oda_Sayısı,Bulunduğu_Kat,Eşya_Durumu,Binanın_Yaşı,Isıtma_Tipi,Fiyat,Şehir,Binanın_Kat_Sayısı,Kullanım_Durumu,Yatırıma_Uygunluk,Takas,Banyo_Sayısı
0,100.0,4.0,3.Kat,0,1.0,Kombi Doğalgaz,14.038655,adana,10.0,Boş,0,1,1.0
1,89.0,3.0,4.Kat,0,0.0,Kombi Doğalgaz,14.375127,adana,14.0,Boş,1,1,1.0
2,140.0,4.0,Düz Giriş (Zemin),0,2.0,Klimalı,14.346139,adana,4.0,Boş,1,0,1.0
3,90.0,3.0,2.Kat,0,0.0,Kombi Doğalgaz,14.346139,adana,14.0,Boş,1,1,1.0
4,105.0,4.0,8.Kat,0,1.0,Kombi Doğalgaz,14.506155,adana,11.0,Boş,1,0,1.0


## CatBoost Model Training 

In [3]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import numpy as np

y1 = df["Fiyat"]
x1 = df.drop("Fiyat", axis=1)
cat_columns = np.where(x1.dtypes != float)[0]

x_train1, x_test1, y_train1, y_test1 = train_test_split(x1,y1,test_size=0.2,random_state=34,stratify=None)
model_pipeline1 = Pipeline(steps =[("model",CatBoostRegressor(cat_features=cat_columns))])

model_pipeline1.fit(x_train1,y_train1)
y_predict1 = model_pipeline1.predict(x_test1)



Learning rate set to 0.061619
0:	learn: 0.4286093	total: 248ms	remaining: 4m 7s
1:	learn: 0.4186009	total: 350ms	remaining: 2m 54s
2:	learn: 0.4092912	total: 448ms	remaining: 2m 28s
3:	learn: 0.4012838	total: 554ms	remaining: 2m 17s
4:	learn: 0.3934183	total: 651ms	remaining: 2m 9s
5:	learn: 0.3860966	total: 708ms	remaining: 1m 57s
6:	learn: 0.3794494	total: 767ms	remaining: 1m 48s
7:	learn: 0.3736533	total: 813ms	remaining: 1m 40s
8:	learn: 0.3681290	total: 849ms	remaining: 1m 33s
9:	learn: 0.3628114	total: 886ms	remaining: 1m 27s
10:	learn: 0.3578420	total: 921ms	remaining: 1m 22s
11:	learn: 0.3534802	total: 958ms	remaining: 1m 18s
12:	learn: 0.3496966	total: 999ms	remaining: 1m 15s
13:	learn: 0.3459350	total: 1.04s	remaining: 1m 13s
14:	learn: 0.3426521	total: 1.07s	remaining: 1m 10s
15:	learn: 0.3394496	total: 1.11s	remaining: 1m 8s
16:	learn: 0.3366924	total: 1.14s	remaining: 1m 6s
17:	learn: 0.3338986	total: 1.18s	remaining: 1m 4s
18:	learn: 0.3311501	total: 1.22s	remaining: 1m 2

## CatBoost Model Evaluation

In [4]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

rmse1 = np.sqrt(mean_squared_error(y_test1,y_predict1))
mae1 = mean_absolute_error(y_test1,y_predict1)
r21 = r2_score(y_test1,y_predict1)

train_score1 = model_pipeline1.score(x_train1,y_train1)
test_score1 = model_pipeline1.score(x_test1,y_test1)

print(f"RMSE: {rmse1:.2f}") 
print(f"MAE:  {mae1:.2f}")
print(f"R2:   {r21:.2f}")

print(f"Training R2: {train_score1:.2f}")
print(f"Testing R2:  {test_score1:.2f}")

RMSE: 0.27
MAE:  0.20
R2:   0.64
Training R2: 0.72
Testing R2:  0.64


## Data Encoding for The Random Forest Model

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer

cat_columns= (df.select_dtypes(include=["object"])).columns
rf_df = df



rf_ct = ColumnTransformer(
    transformers=[
    ("encoder",OrdinalEncoder(),cat_columns)
    ],
    remainder="passthrough"
)

model_pipeline2 = Pipeline(steps=[
    ("transformer", rf_ct),
    ("model",RandomForestRegressor(random_state=34))
])


model_pipeline2.fit(x_train1,y_train1)
y_predict2 =model_pipeline2.predict(x_test1)






## Random Forest Model Evaluation

In [15]:
rmse2 = np.sqrt(mean_squared_error(y_test1,y_predict2))
mae2 = mean_absolute_error(y_test1,y_predict2)
r22 = r2_score(y_test1,y_predict2)

train_score2 = model_pipeline2.score(x_train1,y_train1)
test_score2 = model_pipeline2.score(x_test1,y_test1)

print(f"RMSE: {rmse2:.2f}") 
print(f"MAE:  {mae2:.2f}")
print(f"R2:   {r22:.2f}")

print(f"Training R2: {train_score2:.2f}")
print(f"Testing R2:  {test_score2:.2f}")

RMSE: 0.34
MAE:  0.26
R2:   0.41
Training R2: 0.43
Testing R2:  0.41
