# Model Training 

In [145]:
import pandas as pd 
import numpy as np
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder , RobustScaler
from category_encoders import TargetEncoder, OrdinalEncoder
from sklearn.model_selection import train_test_split , cross_validate, KFold
from sklearn.metrics import mean_squared_error , mean_absolute_error , r2_score
from sklearn.model_selection import RandomizedSearchCV

In [146]:
# Path to the processed CSV (the cleaned, post-EDA dataset according to its file
# name), relative to this notebook's working directory.
CLEANED_DATA_PATH = "../data/processed/cleaned_data_after_eda.csv"
df = pd.read_csv(CLEANED_DATA_PATH)

In [147]:
# Preview the first five rows to sanity-check the loaded columns.
df.head(n=5)

Unnamed: 0,Net_Metrekare,Oda_Sayısı,Bulunduğu_Kat,Eşya_Durumu,Binanın_Yaşı,Isıtma_Tipi,Fiyat,Şehir,Binanın_Kat_Sayısı,Kullanım_Durumu,Yatırıma_Uygunluk,Takas,Banyo_Sayısı
0,100.0,4.0,3.Kat,0,1.0,Kombi Doğalgaz,14.038655,adana,10.0,Boş,0,1,1.0
1,89.0,3.0,4.Kat,0,0.0,Kombi Doğalgaz,14.375127,adana,14.0,Boş,1,1,1.0
2,140.0,4.0,Düz Giriş (Zemin),0,2.0,Klimalı,14.346139,adana,4.0,Boş,1,0,1.0
3,90.0,3.0,2.Kat,0,0.0,Kombi Doğalgaz,14.346139,adana,14.0,Boş,1,1,1.0
4,105.0,4.0,8.Kat,0,1.0,Kombi Doğalgaz,14.506155,adana,11.0,Boş,1,0,1.0


### Evaluation Function

In [148]:
def evaluate_model_cv(scores):
    """Print the fold-averaged timings and metrics of a cross-validation run.

    Parameters
    ----------
    scores : mapping
        Must provide the keys ``fit_time``, ``score_time``,
        ``test_neg_root_mean_squared_error``, ``test_neg_mean_absolute_error``,
        ``test_neg_mean_squared_error`` and ``test_r2``, each holding the
        per-fold values (in this notebook: the result of ``cross_validate``).

    Returns
    -------
    None
        The report is written to stdout only.

    Raises
    ------
    KeyError
        If one of the required keys is missing from ``scores``.
    """
    # Each row: (label printed verbatim, key in ``scores``, negate the mean?).
    # The "neg_*" entries hold errors as negative numbers, so their mean is
    # negated to report a positive error value.
    report_rows = (
        ("Fit Time: ", "fit_time", False),
        ("Score Time: ", "score_time", False),
        ("RMSE:", "test_neg_root_mean_squared_error", True),
        ("MAE:", "test_neg_mean_absolute_error", True),
        ("MSE:", "test_neg_mean_squared_error", True),
        ("R-squared (R2):", "test_r2", False),
    )

    print("---------------------EVALUATION----------------------")
    for label, key, negate in report_rows:
        fold_mean = np.mean(scores[key])
        if negate:
            fold_mean = -fold_mean
        print(f"{label}{fold_mean}")

## Linear Regression Feature Encoding And Model Training

In [149]:
# Categorical columns that are one-hot encoded for the linear model.
onehot_columns = [
    "Isıtma_Tipi",
    "Kullanım_Durumu",
]
# Numeric columns handed to the scaler.
scale_cols = [
    'Net_Metrekare',
    'Oda_Sayısı',
    'Binanın_Yaşı',
    'Banyo_Sayısı',
]

# Floor label -> integer rank for the "Bulunduğu_Kat" column, assembled group by
# group in the same order as a flat literal would list them.
_below_ground = {f"Kot {depth} (-{depth}).Kat": -depth for depth in (4, 3, 2, 1)}
_below_ground["Bodrum Kat"] = -1
_ground_level = dict.fromkeys(["Düz Giriş (Zemin)", "Yüksek Giriş", "Bahçe Katı"], 0)
# Numbered floors map to their own number. Only the labels generated here are
# covered (for example there is no "20.Kat" entry).
# NOTE(review): confirm how the encoder treats labels missing from this mapping.
_numbered = {f"{floor}.Kat": floor for floor in (*range(1, 20), 21, 22, 26, 30)}
_above_numbered = {"40+.Kat": 41, "Bahçe Dublex": 42}
# The remaining labels all share the rank 100.
_rank_100 = dict.fromkeys(["Çatı Katı", "Çatı Dubleks", "Müstakil", "Villa Tipi"], 100)

mapping_dict = {
    **_below_ground,
    **_ground_level,
    **_numbered,
    **_above_numbered,
    **_rank_100,
}

# Mapping specification (column name + label->rank dict) passed to OrdinalEncoder.
ordinal_cols_mapping = [dict(col='Bulunduğu_Kat', mapping=mapping_dict)]

In [150]:
# Column-wise preprocessing for the linear model. Columns not named in any
# transformer are forwarded unchanged because of remainder='passthrough'.
preprocessor_lr = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        # City -> target-encoded value.
        ('target_enc', TargetEncoder(smoothing=10), ["Şehir"]),
        # Floor label -> integer rank taken from the explicit mapping.
        ('ordinal_enc', OrdinalEncoder(mapping=ordinal_cols_mapping), ["Bulunduğu_Kat"]),
        # Categories not seen during fit are ignored instead of raising.
        ('onehot_enc', OneHotEncoder(handle_unknown='ignore'), onehot_columns),
        ('Scaler', RobustScaler(), scale_cols),
    ],
)

# Preprocessing and the estimator live in one pipeline, so the encoders are
# re-fitted together with the model on whatever data the pipeline is fitted on.
pipeline_lr = Pipeline(
    steps=[
        ("preprocessor", preprocessor_lr),
        ("model", LinearRegression()),
    ]
)

In [151]:
# Features are every column except the target "Fiyat".
X = df.drop("Fiyat", axis=1)
y = df["Fiyat"]

# `num_folds` drives KFold, so the fold count is changed in exactly one place.
num_folds = 5
cv_strategy = KFold(n_splits=num_folds, shuffle=True, random_state=42)
# The error scorers are the negated variants ("neg_*"); evaluate_model_cv
# negates their means before printing.
scoring_metric = [
    'neg_mean_squared_error',
    'neg_root_mean_squared_error',
    'neg_mean_absolute_error',
    'r2',
]
# The whole pipeline (preprocessing + model) is the estimator being validated.
scores = cross_validate(pipeline_lr, X, y, cv=cv_strategy, scoring=scoring_metric)

evaluate_model_cv(scores)

---------------------EVALUATION----------------------
Fit Time: 0.0591425895690918
Score Time: 0.013712882995605469
RMSE:0.3036543806600008
MAE:0.23094315111905156
MSE:0.09221539478705112
R-squared (R2):0.5234515144524807


### Top 10 Important Features For Linear Regression

In [152]:
# Refit the pipeline on the full dataset, then pull out the fitted pieces
# needed to pair each coefficient with its transformed feature name.
pipeline_lr.fit(X, y)

fitted_preprocessor = pipeline_lr.named_steps['preprocessor']
feature_names = fitted_preprocessor.get_feature_names_out()

model = pipeline_lr.named_steps['model']
coefficients = model.coef_

In [153]:
# Rank features by the absolute value of their linear-regression coefficient.
# NOTE(review): coefficient magnitudes are only directly comparable between
# features on a similar scale; in this pipeline only `scale_cols` pass through
# RobustScaler, so read the ranking with that in mind.
feature_importance = (
    pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
    .assign(Abs_Coefficient=lambda frame: frame['Coefficient'].abs())
    .sort_values(by='Abs_Coefficient', ascending=False)
    .reset_index(drop=True)
)

print("----------------------------TOP 10 IMPORTANT FEATURE----------------------------")
print(feature_importance.head(10))

----------------------------TOP 10 IMPORTANT FEATURE----------------------------
                                       Feature  Coefficient  Abs_Coefficient
0                            target_enc__Şehir     1.286159         1.286159
1                        Scaler__Net_Metrekare     0.162097         0.162097
2                         Scaler__Binanın_Yaşı    -0.141667         0.141667
3                         Scaler__Banyo_Sayısı     0.140684         0.140684
4        onehot_enc__Isıtma_Tipi_Yerden Isıtma     0.090791         0.090791
5                           Scaler__Oda_Sayısı     0.085086         0.085086
6                       remainder__Eşya_Durumu     0.059388         0.059388
7  onehot_enc__Isıtma_Tipi_Merkezi (Pay Ölçer)     0.039375         0.039375
8              onehot_enc__Isıtma_Tipi_Klimalı    -0.039286         0.039286
9       onehot_enc__Isıtma_Tipi_Kombi Doğalgaz    -0.034630         0.034630


## XGBoost Feature Encoding and Model Training

I am going to use almost the same pipeline, but I will replace the one-hot encoding of these features with label (ordinal) encoding to reduce multicollinearity.

In [154]:
# Categorical columns that are integer-coded (instead of one-hot encoded) for XGBoost.
label_cols = [
    "Isıtma_Tipi",
    "Kullanım_Durumu",
]

In [155]:
# Preprocessing for XGBoost: the same layout as the linear-regression
# preprocessor, except that `label_cols` are integer-coded rather than one-hot
# encoded. Columns not listed are forwarded unchanged (remainder='passthrough').
preprocessor_xgb = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('target_enc', TargetEncoder(smoothing=10), ["Şehir"]),
        ('ordinal_enc', OrdinalEncoder(mapping=ordinal_cols_mapping), ["Bulunduğu_Kat"]),
        # No explicit mapping is given here, so the encoder derives the integer codes itself.
        ('label_enc', OrdinalEncoder(), label_cols),
        ('Scaler', RobustScaler(), scale_cols),
    ],
)

# XGBRegressor is created with its default hyperparameters in this cell.
pipeline_xgb = Pipeline(
    steps=[
        ("preprocessor", preprocessor_xgb),
        ("model", XGBRegressor()),
    ]
)

In [156]:
# Features are every column except the target "Fiyat".
X = df.drop("Fiyat", axis=1)
y = df["Fiyat"]

# --- Cross-validation on the full dataset ---------------------------------
# `num_folds` drives KFold, so the fold count is changed in exactly one place.
num_folds = 5
cv_strategy = KFold(n_splits=num_folds, shuffle=True, random_state=42)
scoring_metric = [
    'neg_mean_squared_error',
    'neg_root_mean_squared_error',
    'neg_mean_absolute_error',
    'r2',
]
scores = cross_validate(pipeline_xgb, X, y, cv=cv_strategy, scoring=scoring_metric)
print(f"R_squared (R2) Scores for each fold: {scores['test_r2']}")
print("\n")
evaluate_model_cv(scores)
print("\n")

# --- Hold-out check for overfitting ---------------------------------------
# 80/20 split with a fixed seed; the pipeline is fitted on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipeline_xgb.fit(X_train, y_train)

# Predict on both splits so the train and test scores can be compared.
y_train_pred = pipeline_xgb.predict(X_train)
y_test_pred = pipeline_xgb.predict(X_test)

# Evaluate training set performance
train_mae = mean_absolute_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)
print("---------------------TRAIN SET EVALUATION----------------------")
print(f"Train MAE: {train_mae}")
print(f"Train R-squared (R2): {train_r2}")

# Evaluate test set performance
test_mae = mean_absolute_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)
print("---------------------TEST SET EVALUATION----------------------")
print(f"Test MAE: {test_mae}")
print(f"Test R-squared (R2): {test_r2}")

R_squared (R2) Scores for each fold: [0.63310147 0.62399821 0.63144652 0.6332695  0.631586  ]


---------------------EVALUATION----------------------
Fit Time: 0.2877230644226074
Score Time: 0.03913760185241699
RMSE:0.26732364221700566
MAE:0.19671280381110065
MSE:0.0714665559167872
R-squared (R2):0.6306803399673179


---------------------TRAIN SET EVALUATION----------------------
Train MAE: 0.14040391999167515
Train R-squared (R2): 0.8211184343131152
---------------------TEST SET EVALUATION----------------------
Test MAE: 0.19885842630559808
Test R-squared (R2): 0.624965101936395


There is a gap of approximately 0.20 between the train and test R2 scores (about 0.82 vs. 0.62). Because the dataset is noisy, the XGBoost model is likely memorizing noise in the training set.
However, the cross-validation R2 scores are close to each other across folds, which shows that the model's performance is consistent. During hyperparameter tuning we will try to lower the train score to close the gap.

### XGBoost Hyperparameter tuning

In [None]:
# Search space for the XGBoost step. The "model__" prefix routes each parameter
# to the pipeline step named "model".
param_grid = dict(
    model__n_estimators=[200, 400],
    model__max_depth=[3, 4, 5],
    model__learning_rate=[0.05, 0.1],
    model__subsample=[0.6, 0.8, 1.0],
    model__colsample_bytree=[0.6, 0.8, 1.0],
    model__gamma=[0, 5, 10],
    model__reg_alpha=[0, 0.1, 1],
    model__reg_lambda=[1, 5, 10],
)

# Randomized search: 100 sampled candidates, each scored by 5-fold CV on R2.
random_search = RandomizedSearchCV(
    estimator=pipeline_xgb,
    param_distributions=param_grid,
    n_iter=100,
    cv=5,
    scoring="r2",
    random_state=42,
    n_jobs=-1,
    verbose=1,
)

# Tune on the training split only; the test split is used just for the final check.
random_search.fit(X_train, y_train)
print(f"Best parameters found: {random_search.best_params_}")

# Predict with the best pipeline found by the search on both splits.
best_model = random_search.best_estimator_
y_test_pred_best = best_model.predict(X_test)
y_train_pred_best = best_model.predict(X_train)


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found: {'model__subsample': 0.8, 'model__reg_lambda': 10, 'model__reg_alpha': 1, 'model__n_estimators': 400, 'model__max_depth': 5, 'model__learning_rate': 0.1, 'model__gamma': 0, 'model__colsample_bytree': 0.6}


In [158]:
# Score the tuned pipeline's predictions on the held-out test split ...
test_mae_best = mean_absolute_error(y_true=y_test, y_pred=y_test_pred_best)
test_r2_best = r2_score(y_true=y_test, y_pred=y_test_pred_best)
# ... and on the training split, so the remaining train/test gap can be read off.
train_mae_best = mean_absolute_error(y_true=y_train, y_pred=y_train_pred_best)
train_r2_best = r2_score(y_true=y_train, y_pred=y_train_pred_best)

print(
    "---------------------TEST SET EVALUATION AFTER HYPERPARAMETER TUNING----------------------",
    f"Test MAE after tuning: {test_mae_best}",
    f"Test R-squared (R2) after tuning: {test_r2_best}",
    sep="\n",
)
print(
    "---------------------TRAIN SET EVALUATION AFTER HYPERPARAMETER TUNING----------------------",
    f"Train MAE after tuning: {train_mae_best}",
    f"Train R-squared (R2) after tuning: {train_r2_best}",
    sep="\n",
)

---------------------TEST SET EVALUATION AFTER HYPERPARAMETER TUNING----------------------
Test MAE after tuning: 0.19580548190485123
Test R-squared (R2) after tuning: 0.6404219818970037
---------------------TRAIN SET EVALUATION AFTER HYPERPARAMETER TUNING----------------------
Train MAE after tuning: 0.1670971247040829
Train R-squared (R2) after tuning: 0.7457048947762541


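After hyperparameter tuning, overfitting decreased as intended: the train R2 dropped from about 0.82 to 0.75 while the test R2 rose slightly from about 0.62 to 0.64, narrowing the train-test gap from roughly 0.20 to roughly 0.11. The results are now more consistent.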