# Model Training 

In [1]:
import pandas as pd 
import numpy as np
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder , RobustScaler
from category_encoders import TargetEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split , cross_validate, KFold

In [2]:
# Load the modelling dataset; the path is relative to the notebook's working directory.
# NOTE(review): the file name suggests this is the output of the EDA step — confirm.
df = pd.read_csv(filepath_or_buffer="../data/processed/cleaned_data_after_eda.csv")

In [3]:
# Preview the first five rows to sanity-check columns and value formats.
df.head(n=5)

Unnamed: 0,Net_Metrekare,Oda_Sayısı,Bulunduğu_Kat,Eşya_Durumu,Binanın_Yaşı,Isıtma_Tipi,Fiyat,Şehir,Binanın_Kat_Sayısı,Kullanım_Durumu,Yatırıma_Uygunluk,Takas,Banyo_Sayısı
0,100.0,4.0,3.Kat,0,1.0,Kombi Doğalgaz,14.038655,adana,10.0,Boş,0,1,1.0
1,89.0,3.0,4.Kat,0,0.0,Kombi Doğalgaz,14.375127,adana,14.0,Boş,1,1,1.0
2,140.0,4.0,Düz Giriş (Zemin),0,2.0,Klimalı,14.346139,adana,4.0,Boş,1,0,1.0
3,90.0,3.0,2.Kat,0,0.0,Kombi Doğalgaz,14.346139,adana,14.0,Boş,1,1,1.0
4,105.0,4.0,8.Kat,0,1.0,Kombi Doğalgaz,14.506155,adana,11.0,Boş,1,0,1.0


### Evaluation Function

In [4]:
def evaluate_model_cv(scores):
    """Print a summary of cross-validation results, averaged over folds.

    Parameters
    ----------
    scores : mapping
        Must provide the keys ``fit_time``, ``score_time``,
        ``test_neg_root_mean_squared_error``, ``test_neg_mean_absolute_error``,
        ``test_neg_mean_squared_error`` and ``test_r2``, each holding the
        per-fold values (the layout produced by the ``cross_validate`` calls
        in this notebook).

    Returns
    -------
    None
        The report is written to stdout only.

    Notes
    -----
    The error metrics arrive negated, so their mean is sign-flipped before
    printing. Error metrics and R-squared are multiplied by 100 and printed
    after a ``%`` sign; the two timings are printed as-is.
    """
    print("---------------------EVALUATION----------------------")

    # Timings: plain fold average.
    for label, key in (("Fit Time", "fit_time"), ("Score Time", "score_time")):
        print(f"{label}: {np.mean(scores[key]):.2f}")

    # Negated error metrics: flip the sign of the mean, then scale by 100.
    negated_errors = (
        ("RMSE", "test_neg_root_mean_squared_error"),
        ("MAE", "test_neg_mean_absolute_error"),
        ("MSE", "test_neg_mean_squared_error"),
    )
    for label, key in negated_errors:
        print(f"{label}:%{-np.mean(scores[key])*100:.2f}")

    r2_scaled = np.mean(scores["test_r2"]) * 100
    print(f"R-squared (R2):%{r2_scaled:.2f}")

## Linear Regression Feature Encoding And Model Training

In [5]:
# Categorical columns expanded into one indicator column per category.
onehot_columns = ['Isıtma_Tipi', 'Kullanım_Durumu']
# Numeric columns handed to the scaler.
scale_cols = ["Net_Metrekare", "Oda_Sayısı", "Binanın_Yaşı", "Banyo_Sayısı"]

# Floor label -> integer rank for the 'Bulunduğu_Kat' column.
# NOTE(review): labels absent from this table (e.g. '20.Kat') get no explicit rank;
# how they are encoded depends on the encoder's unknown-value handling — confirm
# that every label present in the data is covered.
# NOTE(review): 'Bahçe Dublex' (42) ranks above '40+.Kat' (41), and roof/detached
# types all share 100 — these look like deliberate sentinel ranks; confirm.
mapping_dict = {
    # Below-ground levels: 'Kot N (-N).Kat' -> -N; the basement shares rank -1.
    **{f'Kot {depth} (-{depth}).Kat': -depth for depth in (4, 3, 2, 1)},
    'Bodrum Kat': -1,
    # Ground-level entrances.
    **dict.fromkeys(('Düz Giriş (Zemin)', 'Yüksek Giriş', 'Bahçe Katı'), 0),
    # Numbered floors: 'N.Kat' -> N (only the floors listed here are mapped).
    **{f'{floor}.Kat': floor for floor in (*range(1, 20), 21, 22, 26, 30)},
    '40+.Kat': 41,
    'Bahçe Dublex': 42,
    # Roof floors and detached/villa types share the top rank.
    **dict.fromkeys(('Çatı Katı', 'Çatı Dubleks', 'Müstakil', 'Villa Tipi'), 100),
}

# Mapping spec in the list-of-dicts form passed to OrdinalEncoder(mapping=...).
ordinal_cols_mapping = [dict(col='Bulunduğu_Kat', mapping=mapping_dict)]

In [6]:
# Column-wise preprocessing for the linear model. Columns not named in any
# transformer are forwarded unchanged (remainder="passthrough").
preprocessor_lr = ColumnTransformer(
    remainder="passthrough",
    transformers=[
        # City is replaced by a smoothed target statistic.
        ("target_enc", TargetEncoder(smoothing=10), ["Şehir"]),
        # Floor label is replaced by the hand-written rank table.
        ("ordinal_enc", OrdinalEncoder(mapping=ordinal_cols_mapping), ["Bulunduğu_Kat"]),
        # Categories unseen at fit time are ignored rather than raising.
        ("onehot_enc", OneHotEncoder(handle_unknown="ignore"), onehot_columns),
        ("Scaler", RobustScaler(), scale_cols),
    ],
)

# Preprocessing and estimator are chained so that every fit of the pipeline
# (including each cross-validation fold) refits the encoders on that fold's data.
pipeline_lr = Pipeline(
    steps=[
        ("preprocessor", preprocessor_lr),
        ("model", LinearRegression()),
    ]
)

In [7]:
# Feature matrix: every column except the target. Target: 'Fiyat'.
# NOTE(review): 'Fiyat' values in the preview are ~14, presumably a log-transformed
# price — confirm before interpreting the error metrics.
X = df.drop(columns="Fiyat")
y = df["Fiyat"]

# Single source of truth for the fold count. KFold previously hard-coded 5,
# so changing num_folds had no effect.
num_folds = 5
# Shuffled folds with a fixed seed so the reported scores are reproducible.
cv_strategy = KFold(n_splits=num_folds, shuffle=True, random_state=42)
# Error metrics use sklearn's negated form; evaluate_model_cv flips the sign back.
scoring_metric = [
    'neg_mean_squared_error',
    'neg_root_mean_squared_error',
    'neg_mean_absolute_error',
    'r2',
]
scores = cross_validate(pipeline_lr, X, y, cv=cv_strategy, scoring=scoring_metric)

evaluate_model_cv(scores)

---------------------EVALUATION----------------------
Fit Time: 0.15
Score Time: 0.03
RMSE:%30.37
MAE:%23.09
MSE:%9.22
R-squared (R2):%52.35


### Top 10 Important Features For Linear Regression

In [8]:
# Refit on the full dataset so the learned coefficients can be inspected.
pipeline_lr.fit(X, y)

# Output column names of the fitted preprocessor, in the order the model sees them.
fitted_preprocessor = pipeline_lr.named_steps['preprocessor']
feature_names = fitted_preprocessor.get_feature_names_out()

# One coefficient per preprocessed feature.
model = pipeline_lr.named_steps['model']
coefficients = model.coef_

In [9]:
# Rank features by coefficient magnitude (largest first) with a fresh 0..n-1 index.
# NOTE(review): only the columns in scale_cols are scaled, so magnitudes of the
# other features are not on a comparable scale — read the ranking with care.
feature_importance = (
    pd.DataFrame({"Feature": feature_names, "Coefficient": coefficients})
    .assign(Abs_Coefficient=lambda frame: frame["Coefficient"].abs())
    .sort_values(by="Abs_Coefficient", ascending=False)
    .reset_index(drop=True)
)

print("----------------------------TOP 10 IMPORTANT FEATURE----------------------------")
print(feature_importance.head(10))

----------------------------TOP 10 IMPORTANT FEATURE----------------------------
                                       Feature  Coefficient  Abs_Coefficient
0                            target_enc__Şehir     1.286159         1.286159
1                        Scaler__Net_Metrekare     0.162097         0.162097
2                         Scaler__Binanın_Yaşı    -0.141667         0.141667
3                         Scaler__Banyo_Sayısı     0.140684         0.140684
4        onehot_enc__Isıtma_Tipi_Yerden Isıtma     0.090791         0.090791
5                           Scaler__Oda_Sayısı     0.085086         0.085086
6                       remainder__Eşya_Durumu     0.059388         0.059388
7  onehot_enc__Isıtma_Tipi_Merkezi (Pay Ölçer)     0.039375         0.039375
8              onehot_enc__Isıtma_Tipi_Klimalı    -0.039286         0.039286
9       onehot_enc__Isıtma_Tipi_Kombi Doğalgaz    -0.034630         0.034630


## XGBoost Feature Encoding and Model Training

I will use almost the same pipeline as for linear regression. The only change is that the features that were one-hot encoded are now label (ordinal) encoded, to reduce the multicollinearity introduced by the dummy columns.

In [14]:
# Categorical columns that are integer-coded (rather than one-hot encoded) for XGBoost.
label_cols = [
    'Isıtma_Tipi',
    'Kullanım_Durumu',
]

In [15]:
# Column-wise preprocessing for XGBoost. Same layout as the linear-regression
# preprocessor, except the categorical columns in label_cols are integer-coded.
# Columns not named in any transformer are forwarded unchanged.
preprocessor_xgb = ColumnTransformer(
    remainder="passthrough",
    transformers=[
        ("target_enc", TargetEncoder(smoothing=10), ["Şehir"]),
        ("ordinal_enc", OrdinalEncoder(mapping=ordinal_cols_mapping), ["Bulunduğu_Kat"]),
        # No explicit mapping: the encoder assigns the integer codes itself.
        ("label_enc", OrdinalEncoder(), label_cols),
        ("Scaler", RobustScaler(), scale_cols),
    ],
)

# Chain preprocessing and the default-configured regressor so both are refit together.
pipeline_xgb = Pipeline(
    steps=[
        ("preprocessor", preprocessor_xgb),
        ("model", XGBRegressor()),
    ]
)

In [16]:
# Feature matrix: every column except the target. Target: 'Fiyat'.
X = df.drop(columns="Fiyat")
y = df["Fiyat"]

# Single source of truth for the fold count. KFold previously hard-coded 5,
# so changing num_folds had no effect.
num_folds = 5
# Same shuffling seed as the linear-regression run, so both models are scored
# on identical folds.
cv_strategy = KFold(n_splits=num_folds, shuffle=True, random_state=42)
# Error metrics use sklearn's negated form; evaluate_model_cv flips the sign back.
scoring_metric = [
    'neg_mean_squared_error',
    'neg_root_mean_squared_error',
    'neg_mean_absolute_error',
    'r2',
]
scores = cross_validate(pipeline_xgb, X, y, cv=cv_strategy, scoring=scoring_metric)

evaluate_model_cv(scores)

---------------------EVALUATION----------------------
Fit Time: 0.41
Score Time: 0.04
RMSE:%26.73
MAE:%19.67
MSE:%7.15
R-squared (R2):%63.07


    Even though fitting and scoring take longer with XGBoost (mean fit time 0.41 vs. 0.15), the cross-validated metrics show that it gives a better result than linear regression: R-squared rises from 52.35% to 63.07%, and RMSE, MAE and MSE are all lower.

### Top 10 Important Features For XGBoost

In [17]:
# Refit on the full dataset so the importances reflect all rows.
pipeline_xgb.fit(X, y)

# Names of the preprocessed columns, aligned with the booster's importance vector.
feature_names = pipeline_xgb["preprocessor"].get_feature_names_out()
model = pipeline_xgb["model"]
importances = model.feature_importances_

# Original positional index is kept, so it shows each feature's column position.
feature_importance_df = pd.DataFrame(
    data={"feature": feature_names, "importance": importances}
).sort_values(by="importance", ascending=False)

print(feature_importance_df.head(10))

                          feature  importance
7            Scaler__Banyo_Sayısı    0.565285
0               target_enc__Şehir    0.091852
4           Scaler__Net_Metrekare    0.069319
5              Scaler__Oda_Sayısı    0.061237
6            Scaler__Binanın_Yaşı    0.053637
8          remainder__Eşya_Durumu    0.034164
9   remainder__Binanın_Kat_Sayısı    0.029772
1      ordinal_enc__Bulunduğu_Kat    0.025850
2          label_enc__Isıtma_Tipi    0.025150
11               remainder__Takas    0.015908
