In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
import xgboost as xgb
from etl_to_ml import load_housing_valid
df = load_housing_valid()

In [31]:
df.info()
df.head()
df.sample(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 566687 entries, 0 to 566686
Data columns (total 25 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   listing_id           566687 non-null  int64  
 1   listing_date         566687 non-null  object 
 2   season               566687 non-null  object 
 3   listing_year         566687 non-null  int64  
 4   decade               566687 non-null  int64  
 5   listing_month        566687 non-null  int64  
 6   listing_day          566687 non-null  int64  
 7   listing_day_of_week  566687 non-null  object 
 8   city                 566687 non-null  object 
 9   district             566687 non-null  object 
 10  postal_code          566687 non-null  object 
 11  rooms                566687 non-null  int64  
 12  floor                566687 non-null  int64  
 13  total_floors         566687 non-null  int64  
 14  floor_ratio          566687 non-null  float64
 15  year_built       

Unnamed: 0,listing_id,listing_date,season,listing_year,decade,listing_month,listing_day,listing_day_of_week,city,district,postal_code,rooms,floor,total_floors,floor_ratio,year_built,building_age,area_sqm,area_sqm_bucket,distance_center_km,distance_km_bucket,price_sqm,price_total,has_elevator_int,invalid_floor_flag
157351,224271,2024-04-21,Spring,2024,1960,4,21,Sunday,Krakow,Stare Miasto,38-249,2,5,13,0.385,1966,59,77.4,70-99,3.33,3-4.9,10515.0,839094.0,1,False
306805,435169,2025-05-26,Spring,2025,2010,5,26,Monday,Krakow,Stare Miasto,47-672,1,10,16,0.625,2017,8,86.8,70-99,2.29,1-2.9,11307.0,981414.0,1,False
91757,131682,2025-08-10,Summer,2025,1970,8,10,Sunday,Lodz,Baluty,13-819,2,4,5,0.8,1979,46,70.8,70-99,6.54,5-9.9,3741.0,264840.0,1,False


In [32]:
X = df.drop(columns=['price_sqm', 'price_total'])
y = df['price_sqm']

In [33]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.7, random_state=42)
X_test, X_valid, y_test, y_valid = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [34]:
print("Train size: ", X_train.shape)
print("Validation size: ", X_valid.shape)
print("Test size: ", X_test.shape)

Train size:  (170006, 23)
Validation size:  (198341, 23)
Test size:  (198340, 23)


In [35]:
num_cols = X.select_dtypes(include=[np.number]).columns.to_list()
cat_cols = X.select_dtypes(exclude=[np.number]).columns.to_list()
print("Numbers: ", num_cols)
print("Strings: ", cat_cols)

Numbers:  ['listing_id', 'listing_year', 'decade', 'listing_month', 'listing_day', 'rooms', 'floor', 'total_floors', 'floor_ratio', 'year_built', 'building_age', 'area_sqm', 'distance_center_km', 'has_elevator_int']
Strings:  ['listing_date', 'season', 'listing_day_of_week', 'city', 'district', 'postal_code', 'area_sqm_bucket', 'distance_km_bucket', 'invalid_floor_flag']


In [36]:
num_transfomer = Pipeline(steps=[
    ("Imputer", SimpleImputer(strategy='median')),
    ("scaler", StandardScaler)
])

cat_transfomer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("encoder", OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

pre_processor = ColumnTransformer(
    transformers=[
        ("num", num_transfomer, num_cols),
        ("cat", cat_transfomer, cat_cols)
    ]
)

In [37]:
models = {
    "RandomForest": RandomForestRegressor(
        n_estimators=200,
        max_depth=None,
        n_jobs=1,
        random_state=42
    ),
    "GradientBoosting": GradientBoostingRegressor(
        learning_rate=0.05,
        n_estimators=300,
        max_depth=3,
        random_state=42
    ),
    "XGBRegressor": XGBRegressor(
        n_estimators=2000,
        learning_rate=0.03,
        max_depth=5,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        eval_metric="logloss",
        n_jobs=-1
    )
}

In [None]:
results = []

for name, model in models.items():
    print(f"\n Trenowany model: {name}...")

    pipe = Pipeline([
        ("pre", pre_processor),
        ("model", model)
    ])

    # model trainging
    pipe.fit(X_train, y_train)

    # model prediction
    y_pred = pipe.predict(X_train)

    # metryki
    mae = mean_absolute_error(y_test, y_pred)
    
