In [19]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, root_mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
import xgboost as xgb
from etl_to_ml import load_housing_valid
df = load_housing_valid()
df = df.sample(frac=0.1, random_state=42)

In [20]:
df.info()
df.head()
df.sample(3)

<class 'pandas.core.frame.DataFrame'>
Index: 56669 entries, 55844 to 476244
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   listing_id           56669 non-null  int64  
 1   listing_date         56669 non-null  object 
 2   season               56669 non-null  object 
 3   listing_year         56669 non-null  int64  
 4   decade               56669 non-null  int64  
 5   listing_month        56669 non-null  int64  
 6   listing_day          56669 non-null  int64  
 7   listing_day_of_week  56669 non-null  object 
 8   city                 56669 non-null  object 
 9   district             56669 non-null  object 
 10  postal_code          56669 non-null  object 
 11  rooms                56669 non-null  int64  
 12  floor                56669 non-null  int64  
 13  total_floors         56669 non-null  int64  
 14  floor_ratio          56669 non-null  float64
 15  year_built           56669 non-null 

Unnamed: 0,listing_id,listing_date,season,listing_year,decade,listing_month,listing_day,listing_day_of_week,city,district,...,year_built,building_age,area_sqm,area_sqm_bucket,distance_center_km,distance_km_bucket,price_sqm,price_total,has_elevator_int,invalid_floor_flag
332385,469008,2023-10-18,Autumn,2023,1980,10,18,Wednesday,Poznan,Naramowice,...,1989,36,127.6,100+,1.61,1-2.9,7880.0,1005444.0,0,False
37652,53168,2023-11-03,Autumn,2023,2010,11,3,Friday,Warszawa,Ursynow,...,2013,12,104.3,100+,13.38,10+,4480.0,467238.0,1,False
56786,80077,2024-06-15,Summer,2024,2010,6,15,Saturday,Gdansk,Przymorze,...,2010,15,82.6,70-99,1.77,1-2.9,8543.0,705617.0,1,False


In [21]:
X = df.drop(columns=['price_sqm', 'price_total'])
y = df['price_sqm']

In [22]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.7, random_state=42)
X_test, X_valid, y_test, y_valid = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [23]:
print("Train size: ", X_train.shape)
print("Validation size: ", X_valid.shape)
print("Test size: ", X_test.shape)

Train size:  (17000, 23)
Validation size:  (19835, 23)
Test size:  (19834, 23)


In [24]:
num_cols = X.select_dtypes(include=[np.number]).columns.to_list()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.to_list()
print("Numbers: ", num_cols)
print("Strings: ", cat_cols)

Numbers:  ['listing_id', 'listing_year', 'decade', 'listing_month', 'listing_day', 'rooms', 'floor', 'total_floors', 'floor_ratio', 'year_built', 'building_age', 'area_sqm', 'distance_center_km', 'has_elevator_int']
Strings:  ['listing_date', 'season', 'listing_day_of_week', 'city', 'district', 'postal_code', 'area_sqm_bucket', 'distance_km_bucket', 'invalid_floor_flag']


In [25]:
num_transfomer = Pipeline(steps=[
    ("Imputer", SimpleImputer(strategy='median')),
    ("scaler", StandardScaler())
])

cat_transfomer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("encoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

pre_processor = ColumnTransformer(
    transformers=[
        ("num", num_transfomer, num_cols),
        ("cat", cat_transfomer, cat_cols)
    ]
)

In [None]:
models = {
    "RandomForest": RandomForestRegressor(
        n_estimators=50,
        max_depth=None,
        n_jobs=-1,
        random_state=42
    ),
    "GradientBoosting": GradientBoostingRegressor(
        learning_rate=0.05,
        n_estimators=50,
        max_depth=3,
        random_state=42
    ),
    "XGBRegressor": XGBRegressor(
        n_estimators=100,
        learning_rate=0.03,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric="logloss",
        n_jobs=-1
    )
}

In [None]:
results = []
best_mae = np.inf

for name, model in models.items():
    print(f"\n Trenowany model: {name}...")
    start_time = time.time()
    pipe = Pipeline([
        ("pre", pre_processor),
        ("model", model)
    ])

    # model trainging
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # metryki
    mae = mean_absolute_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append({
    "model": name,
    "mae": mae,
    "rmse": rmse,
    "r2": r2                
    })
    end_time = time.time()

    training_time = end_time - start_time

    print(f"MAE  = {mae:,.2f}")
    print(f"RMSE = {rmse:,.2f}")
    print(f"R²   = {r2:,.4f}")
    print(f"Czas trenowania: {training_time: .2f} sekund")
    
    if mae < best_mae:
        best_mae = mae
        best_model_name = name
        best_pipeline = pipe

results_df = pd.DataFrame(results).sort_values("mae")
print("\nZapisano metryki:")
print(results_df[["model", "mae", "rmse", "r2"]].to_string(index=False))



 Trenowany model: RandomForest...
MAE  = 401.72
RMSE = 585.24
R²   = 0.9455
Czas trenowania:  20.77 sekund

 Trenowany model: GradientBoosting...
MAE  = 790.61
RMSE = 1,007.48
R²   = 0.8384
Czas trenowania:  262.38 sekund

 Trenowany model: XGBRegressor...
MAE  = 822.64
RMSE = 1,051.70
R²   = 0.8239
Czas trenowania:  4.01 sekund

Zapisano metryki:               model         mae         rmse        r2
0      RandomForest  401.717268   585.235704  0.945477
1  GradientBoosting  790.614398  1007.478713  0.838420
2      XGBRegressor  822.641973  1051.701627  0.823923
