In [75]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
from bayes_opt import BayesianOptimization

In [6]:
df = pd.read_csv('kc_housing.csv')

In [10]:
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,2014-10-12T22:00:00Z,221900,3,1.0,1180,5650,1.0,False,0,3,7,1180,,1955,,98178,47.5112,-122.257,1340,5650
1,2014-12-08T23:00:00Z,538000,3,2.25,2570,7242,2.0,False,0,3,7,2170,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639
2,2015-02-24T23:00:00Z,180000,2,1.0,770,10000,1.0,False,0,3,6,770,,1933,,98028,47.7379,-122.233,2720,8062
3,2014-12-08T23:00:00Z,604000,4,3.0,1960,5000,1.0,False,0,5,7,1050,910.0,1965,,98136,47.5208,-122.393,1360,5000
4,2015-02-17T23:00:00Z,510000,3,2.0,1680,8080,1.0,False,0,3,8,1680,,1987,,98074,47.6168,-122.045,1800,7503


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           21613 non-null  object 
 1   price          21613 non-null  int64  
 2   bedrooms       21613 non-null  int64  
 3   bathrooms      21613 non-null  float64
 4   sqft_living    21613 non-null  int64  
 5   sqft_lot       21613 non-null  int64  
 6   floors         21613 non-null  float64
 7   waterfront     21613 non-null  object 
 8   view           21613 non-null  int64  
 9   condition      21613 non-null  int64  
 10  grade          21613 non-null  int64  
 11  sqft_above     21613 non-null  int64  
 12  sqft_basement  8487 non-null   float64
 13  yr_built       21613 non-null  int64  
 14  yr_renovated   914 non-null    float64
 15  zipcode        21613 non-null  int64  
 16  lat            21613 non-null  float64
 17  long           21613 non-null  float64
 18  sqft_l

In [14]:
df['waterfront'] = df['waterfront'].astype('object')

In [18]:
target = 'price'

In [20]:
df.isna().sum().sort_values(ascending=False)

yr_renovated     20699
sqft_basement    13126
date                 0
price                0
sqft_living15        0
long                 0
lat                  0
zipcode              0
yr_built             0
sqft_above           0
grade                0
condition            0
view                 0
waterfront           0
floors               0
sqft_lot             0
sqft_living          0
bathrooms            0
bedrooms             0
sqft_lot15           0
dtype: int64

In [22]:
# Datanın bölünməsi 

train, test = train_test_split(df, test_size=0.2, random_state=123)

## XGBoost alqoritmasının qurulması

In [28]:
feature_cols = [c for c in df.columns if c != target]
categorical_cols = ['date', 'waterfront']
numeric_cols = [c for c in feature_cols if c not in categorical_cols]

In [30]:
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numeric_cols),
    ],
    remainder="drop",
)

In [34]:
base_model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    subsample=1.0,
    colsample_bytree=1.0,
    random_state=123
)

In [36]:
pipe = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", base_model)
])

In [38]:
X_train, y_train = train[feature_cols], train[target]
X_test,  y_test  = test[feature_cols],  test[target]

In [40]:
pipe.fit(X_train, y_train)

In [44]:
# Proqnozlar

pred = pipe.predict(X_test)
pred

array([ 608958.8 ,  519630.84,  839689.5 , ...,  471982.53, 1167968.9 ,
       1180378.9 ], dtype=float32)

In [85]:
# Reqressiya Modelinin Perormansının Qiymətləndirilməsi

def adjusted_r2(y_true, y_pred, p):
    n = len(y_true)
    r2 = r2_score(y_true, y_pred)
    return 1.0 - (1.0 - r2) * (n - 1) / max(n - p - 1, 1)

p_effective = pipe.named_steps["model"].n_features_in_
p_effective

388

In [92]:
adj_r2 = adjusted_r2(y_test.values, pred, p_effective)
mae= mean_absolute_error(y_test, pred)
rmse = mean_squared_error(y_test, pred, squared=False)

print(f'Adjusted R2: {adj_r2:.1f}')
print(f"MAE: {mae:.1f}")
print(f"RMSE: {rmse:.1f}")

Adjusted R2: 0.9
MAE: 67052.2
RMSE: 117071.3




## Bayesian optimization ilə optimal hiperparametrlərin tapılması

In [94]:
def train_eval(colsample_bytree, n_estimators, learning_rate, max_depth):
    # Cast to the intended types
    n_estimators = int(round(n_estimators))
    max_depth = int(round(max_depth))

    model = XGBRegressor(
        objective="reg:squarederror",
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        subsample=1.0,
        colsample_bytree=colsample_bytree,
        random_state=123,
        n_jobs=-1
    )

    pipe_bo = Pipeline(steps=[
        ("preprocess", preprocess),
        ("model", model)
    ])

    pipe_bo.fit(X_train, y_train)
    pred_val = pipe_bo.predict(X_test)

    p_eff = pipe_bo.named_steps["model"].n_features_in_
    adj = adjusted_r2(y_test.values, pred_val, p_eff)

    # bayes_opt maximizes the returned value
    return adj


In [96]:
pbounds = {
    "colsample_bytree": (0.1, 1.0),       # ~ mtry (fraction of columns)
    "n_estimators":     (10, 150),        # trees
    "learning_rate":    (0.01, 0.5),      # learn_rate
    "max_depth":        (2, 10)           # tree_depth
}

In [98]:
optimizer = BayesianOptimization(
    f=train_eval,
    pbounds=pbounds,
    random_state=123,
    verbose=2
)

In [100]:
optimizer.maximize(init_points=5, n_iter=5)

|   iter    |  target   | colsam... | n_esti... | learni... | max_depth |
-------------------------------------------------------------------------
| [39m1        [39m | [39m0.8828567[39m | [39m0.7268222[39m | [39m50.059506[39m | [39m0.1211572[39m | [39m6.4105181[39m |
| [39m2        [39m | [39m0.8538386[39m | [39m0.7475220[39m | [39m69.234904[39m | [39m0.4905744[39m | [39m7.4786379[39m |
| [35m3        [39m | [35m0.8870071[39m | [35m0.5328387[39m | [35m64.896452[39m | [35m0.1781572[39m | [35m7.8323976[39m |
| [39m4        [39m | [39m0.8720694[39m | [39m0.4947150[39m | [39m18.354905[39m | [39m0.2050416[39m | [39m7.9039632[39m |
| [39m5        [39m | [39m0.8746641[39m | [39m0.2642425[39m | [39m34.563245[39m | [39m0.2704601[39m | [39m6.2546206[39m |
| [39m6        [39m | [39m0.8706694[39m | [39m0.3906630[39m | [39m118.71340[39m | [39m0.3668456[39m | [39m6.5117517[39m |
| [39m7        [39m | [39m0.8734054[39m | [

In [102]:
best_params = optimizer.max["params"]
best_params

{'colsample_bytree': 0.59971167112098,
 'n_estimators': 63.972420084947906,
 'learning_rate': 0.12359042692648141,
 'max_depth': 7.506479109134754}

In [104]:
best_params["n_estimators"] = int(round(best_params["n_estimators"]))
best_params["max_depth"] = int(round(best_params["max_depth"]))
best_params

{'colsample_bytree': 0.59971167112098,
 'n_estimators': 64,
 'learning_rate': 0.12359042692648141,
 'max_depth': 8}

## Hiperparametr sazlama ilə modelləşdirmə

In [107]:
best_model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=best_params["n_estimators"],
    learning_rate=best_params["learning_rate"],
    max_depth=best_params["max_depth"],
    subsample=1.0,
    colsample_bytree=best_params["colsample_bytree"],
    random_state=123
)

In [109]:
pipe_best = Pipeline(steps=[
    ("preprocess", preprocess),
    ("model", best_model)
])

In [111]:
pipe_best.fit(X_train, y_train)

In [115]:
# Proqnozlar

pred_best = pipe_best.predict(X_test)
pred_best

array([ 581078.4 ,  515538.66,  855098.4 , ...,  497838.  , 1136987.5 ,
       1202531.4 ], dtype=float32)

In [117]:
# Reqressiya Modelinin Perormansının Qiymətləndirilməsi

p_eff_best = pipe_best.named_steps["model"].n_features_in_
adj_r2_best = adjusted_r2(y_test.values, pred_best, p_eff_best)
mae_best  = mean_absolute_error(y_test, pred_best)
rmse_best = mean_squared_error(y_test, pred_best, squared=False)

print(f'Adjusted R2: {adj_r2_best:.1f}')
print(f"MAE: {mae_best:.1f}")
print(f"RMSE: {rmse_best:.1f}")

Adjusted R2: 0.9
MAE: 66645.1
RMSE: 120000.6


