
# Hedonic Housing Valuation: Hierarchical Routing (ppm) + Quantile GBM

Notebook implements two variants requested: NN routing (classifier → per‑segment regressors on log price-per-m²) and production‑friendly GBM with quantile uncertainty. Configure in the next cell.


In [4]:

# ==== Imports ====
import os, re, json, math, gc, random, warnings
from datetime import datetime
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, confusion_matrix
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.isotonic import IsotonicRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import lightgbm as lgb
import xgboost as xgb

try:
    import h3
except Exception:
    h3 = None

SEED = SEED
random.seed(SEED); np.random.seed(SEED); tf.keras.utils.set_random_seed(SEED)
pd.options.display.float_format = '{:,.2f}'.format
warnings.filterwarnings('ignore')

# ==== Utility helpers ====

def clean_text(s: str) -> str:
    s = (s or '').lower()
    patterns = [
        r'oferta nie stanowi.*?kodeksu cywilnego', r'prosz[ąa] o kontakt.*',
        r'tylko u nas.*', r'nie pobieramy prowizji.*', r'bez prowizji.*', r'kupuj.*bezpieczni.*'
    ]
    for p in patterns:
        s = re.sub(p, ' ', s, flags=re.IGNORECASE)
    s = re.sub(r'[^a-zA-Ząćęłńóśźż0-9\s]', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

# Derive city slug from Predicted_Loc like "Miasto -> Dzielnica -> ..."

def city_from_loc(loc: str) -> str:
    if not isinstance(loc, str) or not loc:
        return 'unknown'
    city = loc.split('->')[0].strip()
    city = city.lower().replace(' ', '_')
    city = re.sub(r'[^a-z_ąćęłńóśźż]', '', city)
    return city if city else 'unknown'

# Safe numeric conversion

def to_num(s):
    try:
        return float(str(s).replace(' ', '').replace(',', '.'))
    except Exception:
        return np.nan

# Simple metrics

def median_ape(y_true, y_pred):
    ape = np.abs((y_true - y_pred) / np.clip(y_true, 1e-9, None))
    return float(np.median(ape))

print('Imports ready.')


Imports ready.


In [2]:

# ==== Config ====
DATA_PATH = 'Data_state_LSTM_predicted_full_v4_FINAL.csv'  # CSV UTF-8-SIG with ';' separator
SAVE_DIR = 'artifacts_ppm_routing'
MODEL_VARIANT = 'BOTH'  # 'NN', 'GBM', or 'BOTH'

# Binning / grouping
N_BINS = 10
MIN_SAMPLES_GROUP = 150
GEO_MODE = 'city'  # 'city' or 'h3' (h3 requires lat/lon columns)
H3_RESOLUTION = 8   # ~0.46 km cells (adjust 8–9 to target 0.3–0.7 km)

# Validation & randomness
SEED = 42
TIME_COL = 'PublishDate'  # optional; if missing, stratified random split is used
VAL_FRACTION_TIME = 0.2   # last 20% by time

# Uncertainty band (alpha = relative half‑width around median on ppm scale)
ALPHA_PCT = 0.10  # 10%
MC_SAMPLES = 50   # for NN MC Dropout

# Text settings
TFIDF_MAX_FEATURES = 3000  # for GBM variant
TEXT_MAX_TOKENS = 3000     # for NN TextVectorization (multi_hot)

os.makedirs(SAVE_DIR, exist_ok=True)
print('Config ready — update DATA_PATH as needed.')


Config ready — update DATA_PATH as needed.


In [3]:

# ==== (Optional) Install dependencies if missing ====
import importlib, subprocess, sys

def ensure(pkg, pip_name=None):
    try:
        importlib.import_module(pkg)
    except Exception:
        subprocess.run([sys.executable, '-m', 'pip', 'install', '-q', pip_name or pkg])

# Core
for pkg in ['numpy','pandas','scikit-learn','tensorflow']:
    ensure(pkg)
# Geo and GBM
ensure('h3', 'h3')
ensure('geohash2', 'geohash2')
ensure('lightgbm', 'lightgbm')
ensure('xgboost', 'xgboost')
ensure('tensorflow_addons','tensorflow-addons')

print('Dependencies checked.')



TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.19.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


Dependencies checked.


In [9]:
# ==== Load CSV (header-agnostic with positional fallback) v3 ====
import pandas as pd, numpy as np
from datetime import datetime

expected_cols = ['SaleId','Title','Description','Area','Price','NumberOfRooms','BuiltYear','BuildingType',
                 'OfferFrom','Floor','Floors','TypeOfMarket','Type','Predict_State','Predicted_Loc','pricepermeter']

index_map = {
    0:'SaleId', 3:'Title', 4:'Description', 5:'Area', 6:'Price',
    11:'NumberOfRooms', 12:'BuiltYear', 14:'BuildingType', 16:'OfferFrom',
    17:'Floor', 18:'Floors', 19:'TypeOfMarket', 28:'Type', 54:'Predicted_Loc', 55:'Predict_State'
}

# 1) Try regular read with header row
try:
    df = pd.read_csv(DATA_PATH, sep=';', encoding='utf-8-sig', low_memory=False)
except Exception as e:
    raise RuntimeError(f'Failed to read CSV at {DATA_PATH}: {e}')

# 2) If expected columns are missing, fallback to positional mapping without any heuristics
if not set(expected_cols[:-1]).issubset(df.columns):
    def map_by_index(df_raw):
        present = [k for k in index_map.keys() if k < df_raw.shape[1]]
        if not present:
            raise ValueError('None of the expected indices are present in the raw frame.')
        out = df_raw.iloc[:, present].copy()
        out.columns = [index_map[k] for k in present]
        return out

    fallback_success = False
    for skip in (1, 0):  # prefer skipping first row (likely header); then try without skipping
        try:
            df_raw = pd.read_csv(DATA_PATH, sep=';', encoding='utf-8-sig', header=None, low_memory=False, skiprows=skip)
            df = map_by_index(df_raw)
            fallback_success = True
            break
        except Exception as e:
            last_err = e
            continue
    if not fallback_success:
        raise RuntimeError(f'Fallback read failed: {last_err}')

# 3) Ensure all expected columns exist with safe defaults
n = len(df)
for c in expected_cols:
    if c not in df.columns:
        if c in ['Title','Description','Predicted_Loc','BuildingType','OfferFrom','TypeOfMarket','Type','Predict_State']:
            df[c] = pd.Series(['']*n, dtype='string')
        elif c in ['Area','Price','NumberOfRooms','BuiltYear','Floor','Floors']:
            df[c] = np.nan
        elif c == 'pricepermeter':
            df[c] = np.nan

# 4) Numerics and text clean
for c in ['Price','Area','NumberOfRooms','BuiltYear','Floor','Floors']:
    df[c] = pd.to_numeric(df[c], errors='coerce')

df['Title'] = df['Title'].astype('string').fillna('')
df['Description'] = df['Description'].astype('string').fillna('').apply(clean_text)

# 5) ppm
if df['pricepermeter'].isna().all():
    df['pricepermeter'] = np.where(df['Area']>0, df['Price']/df['Area'], np.nan)

# 6) Basic filters
df = df[(df['Price']>1000) & (df['Area']>0)].copy()

# 7) Outlier trimming
p1_P, p99_P = df['Price'].quantile([0.01,0.99])
p1_ppm, p99_ppm = df['pricepermeter'].quantile([0.01,0.99])
df = df[(df['Price'].between(p1_P, p99_P)) & (df['pricepermeter'].between(p1_ppm, p99_ppm))].copy()

# 8) BuildingAge
year_now = datetime.now().year
by = df['BuiltYear'].copy()
med_year = by.dropna().median() if not by.dropna().empty else 2000
by = by.fillna(med_year).clip(1800, year_now+1)
df['BuildingAge'] = np.maximum(0, year_now - by).astype(int)

# 9) Categoricals
for c in ['Predict_State','Predicted_Loc','BuildingType','TypeOfMarket','Type','OfferFrom']:
    df[c] = df[c].astype('string').fillna('unknown').replace({'nan':'unknown','None':'unknown'})

# 10) city_slug / h3
df['city_slug'] = df['Predicted_Loc'].apply(city_from_loc)
if GEO_MODE=='h3' and 'Latitude' in df.columns and 'Longitude' in df.columns:
    import h3
    df['h3_cell'] = df.apply(lambda r: h3.geo_to_h3(r['Latitude'], r['Longitude'], H3_RESOLUTION), axis=1)
else:
    df['h3_cell'] = 'na'

# 11) Geo unit and target
geo_unit = 'h3_cell' if GEO_MODE=='h3' and df['h3_cell'].nunique()>1 else 'city_slug'
print('Geo unit used for grouping:', geo_unit)

df['Price_log_m2'] = np.log1p(df['pricepermeter'])
print('Prepared rows:', len(df))


Geo unit used for grouping: city_slug
Prepared rows: 1223662


In [10]:

# ==== Binning per geography ====
np.random.seed(SEED)

# Ensure minimum samples and quantile bins per geo
bins = []
for g, gdf in df.groupby(geo_unit):
    if len(gdf) < max(MIN_SAMPLES_GROUP, N_BINS*10):
        continue
    q = np.linspace(0,1,N_BINS+1)
    edges = np.unique(np.quantile(gdf['pricepermeter'], q))
    if len(edges) < 3:
        continue
    b = np.digitize(gdf['pricepermeter'], edges[1:-1], right=True)
    label = [f"{g}__bin{bi}" for bi in b]
    bins.append(pd.DataFrame({'idx': gdf.index, 'GroupLabel': label}))

if bins:
    bin_df = pd.concat(bins, ignore_index=True)
    df = df.join(bin_df.set_index('idx'), how='left')

mask_missing = df['GroupLabel'].isna()
if mask_missing.any():
    for g, gdf in df[mask_missing].groupby('city_slug'):
        if len(gdf) < N_BINS*5:
            df.loc[gdf.index, 'GroupLabel'] = f"{g}__bin0"
            continue
        edges = np.unique(np.quantile(gdf['pricepermeter'], np.linspace(0,1,N_BINS+1)))
        b = np.digitize(gdf['pricepermeter'], edges[1:-1], right=True)
        df.loc[gdf.index, 'GroupLabel'] = [f"{g}__bin{bi}" for bi in b]

counts = df['GroupLabel'].value_counts()
rare = set(counts[counts < MIN_SAMPLES_GROUP].index)
if rare:
    df['GroupLabel'] = df['GroupLabel'].apply(lambda x: x if x not in rare else f"{str(x).split('__')[0]}__bin0")

print('Unique groups:', df['GroupLabel'].nunique())
print(df['GroupLabel'].value_counts().head())

groups = sorted(df['GroupLabel'].unique())
group_to_id = {g:i for i,g in enumerate(groups)}
id_to_group = {i:g for g,i in group_to_id.items()}
with open(os.path.join(SAVE_DIR,'group_labels.json'),'w',encoding='utf-8') as f:
    json.dump({'group_to_id':group_to_id,'id_to_group':id_to_group}, f, ensure_ascii=False, indent=2)


Unique groups: 7430
GroupLabel
warszawa__bin2    14234
warszawa__bin5    14101
warszawa__bin0    14099
warszawa__bin7    14099
warszawa__bin4    14098
Name: count, dtype: int64


In [16]:
# ==== Rebuild label mapping after any relabeling and guard against NaNs ====
# Recompute mapping on the full df so that 'other' and any fallback labels are included
groups = sorted(df['GroupLabel'].unique())
group_to_id = {g: i for i, g in enumerate(groups)}
id_to_group = {i: g for g, i in group_to_id.items()}

# (Optional) save mapping
import os, json
os.makedirs(SAVE_DIR, exist_ok=True)
with open(os.path.join(SAVE_DIR,'group_labels.json'),'w',encoding='utf-8') as f:
    json.dump({'group_to_id':group_to_id,'id_to_group':id_to_group}, f, ensure_ascii=False, indent=2)

# ==== Safe dataset creators that drop rows with unmapped labels (should be none, but safe) ====
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Ensure TEXT_MAX_TOKENS exists
try:
    TEXT_MAX_TOKENS
except NameError:
    TEXT_MAX_TOKENS = 3000

# Build an adapt dataset if needed later (text/vectorizers)
adapt_dict = tf.data.Dataset.from_tensor_slices(dict(train_df[NUMERIC + CATEG + ['Title','Description']])).batch(256)

def ds_from(df_):
    y_map = df_['GroupLabel'].map(group_to_id)
    ok = y_map.notna()
    dfx = df_.loc[ok]
    y = y_map.loc[ok].astype('int32').values
    xdict = {f: dfx[f].values for f in NUMERIC+CATEG}
    xdict['text_all'] = (dfx['Title'].fillna('') + ' ' + dfx['Description'].fillna('')).values
    ds = tf.data.Dataset.from_tensor_slices((xdict, y)).shuffle(len(dfx), seed=SEED).batch(256).prefetch(tf.data.AUTOTUNE)
    return ds

train_ds = ds_from(train_df)
val_ds = ds_from(val_df)

# When computing class weights, also guard against NaNs
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

y_train_all = train_df['GroupLabel'].map(group_to_id)
y_train = y_train_all[y_train_all.notna()].astype('int32').values
classes = np.arange(len(group_to_id))
class_weight = {int(i): float(w) for i,w in enumerate(compute_class_weight(class_weight='balanced', classes=classes, y=y_train))}


In [17]:

# ==== Feature schema ====
NUMERIC = ['Area','NumberOfRooms','Floor','Floors','BuildingAge']
CATEG = ['Predict_State','Predicted_Loc','BuildingType','TypeOfMarket','Type','OfferFrom','city_slug']
TEXT_COLS = ['Title','Description']

CLASSIFIER_INPUTS = NUMERIC + CATEG + TEXT_COLS
REG_INPUTS = CLASSIFIER_INPUTS.copy()

for c in NUMERIC:
    if c not in train_df.columns:
        train_df[c] = train_df[c].median(); val_df[c] = val_df[c].median()

for c in CATEG:
    for d in (train_df,val_df):
        if c not in d.columns:
            d[c] = 'unknown'
        d[c] = d[c].astype(str).fillna('unknown')

for c in TEXT_COLS:
    for d in (train_df,val_df):
        if c not in d.columns:
            d[c] = ''
        d[c] = d[c].astype(str)

print('Prepared feature columns.')


Prepared feature columns.


In [19]:
# ==== NN Classifier for GroupLabel (with robust Windows-safe saving) ====
if MODEL_VARIANT in ('NN','BOTH'):
    import os
    import numpy as np
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
    from sklearn.utils.class_weight import compute_class_weight
    from sklearn.metrics import accuracy_score, f1_score, balanced_accuracy_score, confusion_matrix

    # Inputs and preprocessing layers
    inputs = {}
    encoded = []

    # Dataset for adapting preprocessing layers
    adapt_dict = tf.data.Dataset.from_tensor_slices(dict(train_df[NUMERIC + CATEG + ['Title','Description']])).batch(256)

    # Numeric
    for f in NUMERIC:
        inp = keras.Input(shape=(1,), name=f, dtype=tf.float32); inputs[f] = inp
        layer_norm = layers.Normalization(axis=-1)
        layer_norm.adapt(adapt_dict.map(lambda x: tf.expand_dims(tf.cast(x[f], tf.float32), -1)))
        encoded.append(layer_norm(inp))

    # Categorical (one-hot via StringLookup)
    for f in CATEG:
        inp = keras.Input(shape=(1,), name=f, dtype=tf.string); inputs[f] = inp
        sl = layers.StringLookup(output_mode='one_hot')
        sl.adapt(adapt_dict.map(lambda x: x[f]))
        encoded.append(sl(inp))

    # Text feature: Title + Description
    text_input = keras.Input(shape=(1,), name='text_all', dtype=tf.string)
    tv = layers.TextVectorization(max_tokens=TEXT_MAX_TOKENS, output_mode='multi_hot')
    tv.adapt(tf.data.Dataset.from_tensor_slices((train_df['Title'].fillna('') + ' ' + train_df['Description'].fillna(''))).batch(256))
    encoded.append(tv(text_input))
    inputs['text_all'] = text_input

    # Classifier body
    x = layers.Concatenate()(encoded)
    x = layers.Dense(256, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(128, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    out = layers.Dense(len(group_to_id), activation='softmax', name='class')(x)

    clf = keras.Model(inputs, out)
    clf.compile(optimizer=keras.optimizers.Adam(1e-3), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Input pipeline
    def ds_from(df_):
        y_map = df_['GroupLabel'].map(group_to_id)
        ok = y_map.notna()
        dfx = df_.loc[ok]
        y = y_map.loc[ok].astype('int32').values
        xdict = {f: dfx[f].values for f in NUMERIC+CATEG}
        xdict['text_all'] = (dfx['Title'].fillna('') + ' ' + dfx['Description'].fillna('')).values
        return tf.data.Dataset.from_tensor_slices((xdict, y)).shuffle(len(dfx), seed=SEED).batch(256).prefetch(tf.data.AUTOTUNE)

    train_ds = ds_from(train_df)
    val_ds = ds_from(val_df)

    # Class weights (balanced)
    y_train_all = train_df['GroupLabel'].map(group_to_id)
    y_train = y_train_all[y_train_all.notna()].astype('int32').values
    classes = np.arange(len(group_to_id))
    class_weight = {int(i): float(w) for i,w in enumerate(compute_class_weight(class_weight='balanced', classes=classes, y=y_train))}

    # Callbacks
    es = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    rlr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6)
    csvlog = keras.callbacks.CSVLogger(os.path.join(SAVE_DIR,'clf_training_log.csv'))

    # Train
    history = clf.fit(train_ds, validation_data=val_ds, epochs=25, callbacks=[es, rlr, csvlog], class_weight=class_weight)

    # Robust save on Windows (CP1250): try .keras, fall back to SavedModel
    save_base = os.path.join(SAVE_DIR, 'routing_classifier')
    try:
        clf.save(save_base + '.keras')
        print('Saved .keras at:', save_base + '.keras')
    except UnicodeEncodeError as e:
        print('UnicodeEncodeError during .keras save; falling back to SavedModel:', e)
        try:
            # Keras 3 preferred API
            clf.export(save_base + '_sm')
            print('Saved SavedModel at:', save_base + '_sm')
        except Exception:
            # Legacy TF fallback
            tf.saved_model.save(clf, save_base + '_sm')
            print('Saved (legacy) TF SavedModel at:', save_base + '_sm')

    # Simple eval on val for sanity
    def predict_proba(df_):
        xdict = {f: df_[f].values for f in NUMERIC+CATEG}
        xdict['text_all'] = (df_['Title'].fillna('')+' '+df_['Description'].fillna('')).values
        proba = clf.predict(xdict, batch_size=512, verbose=0)
        return proba

    proba_val = predict_proba(val_df)
    y_true = val_df['GroupLabel'].map(group_to_id).values
    y_pred = proba_val.argmax(axis=1)
    acc = accuracy_score(y_true, y_pred)
    f1m = f1_score(y_true, y_pred, average='macro')
    bacc = balanced_accuracy_score(y_true, y_pred)
    print({'accuracy':acc, 'macro_f1':f1m, 'balanced_acc':bacc})
else:
    print('NN routing skipped by config.')


Epoch 1/25
[1m3824/3824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4242s[0m 1s/step - accuracy: 7.6985e-04 - loss: 8.6267 - val_accuracy: 4.0861e-06 - val_loss: 8.5075 - learning_rate: 0.0010
Epoch 2/25
[1m3824/3824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4314s[0m 1s/step - accuracy: 0.0000e+00 - loss: 8.6835 - val_accuracy: 4.0861e-06 - val_loss: 8.5521 - learning_rate: 0.0010
Epoch 3/25
[1m3824/3824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4385s[0m 1s/step - accuracy: 2.8340e-06 - loss: 8.6737 - val_accuracy: 4.0861e-06 - val_loss: 8.5641 - learning_rate: 0.0010
Epoch 4/25
[1m3824/3824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4295s[0m 1s/step - accuracy: 7.3671e-07 - loss: 8.6037 - val_accuracy: 4.0861e-06 - val_loss: 8.5719 - learning_rate: 0.0010
Epoch 5/25
[1m3824/3824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4243s[0m 1s/step - accuracy: 7.6090e-06 - loss: 8.6592 - val_accuracy: 4.0861e-06 - val_loss: 8.5729 - learning_rate: 2.0000e-04
Epoch 

INFO:tensorflow:Assets written to: artifacts_ppm_routing\routing_classifier_sm\assets


Saved artifact at 'artifacts_ppm_routing\routing_classifier_sm'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): Dict[['Area', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Area')], ['NumberOfRooms', TensorSpec(shape=(None, 1), dtype=tf.float32, name='NumberOfRooms')], ['Floor', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Floor')], ['Floors', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Floors')], ['BuildingAge', TensorSpec(shape=(None, 1), dtype=tf.float32, name='BuildingAge')], ['Predict_State', TensorSpec(shape=(None, 1), dtype=tf.string, name='Predict_State')], ['Predicted_Loc', TensorSpec(shape=(None, 1), dtype=tf.string, name='Predicted_Loc')], ['BuildingType', TensorSpec(shape=(None, 1), dtype=tf.string, name='BuildingType')], ['TypeOfMarket', TensorSpec(shape=(None, 1), dtype=tf.string, name='TypeOfMarket')], ['Type', TensorSpec(shape=(None, 1), dtype=tf.string, name='Type')], ['OfferFrom', TensorSpec(shape=(None, 1), 

In [21]:
# ==== NN Regressors per group (Price_log_m2) with MC Dropout + Windows-safe saving ====
if MODEL_VARIANT in ('NN','BOTH'):
    import os, re, numpy as np
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
    from sklearn.metrics import mean_absolute_percentage_error

    def build_regressor(train_subset):
        inputs = {}; encoded = []
        adapt_ds = tf.data.Dataset.from_tensor_slices(dict(train_subset[REG_INPUTS])).batch(256)
        for f in NUMERIC:
            inp = keras.Input(shape=(1,), name=f, dtype=tf.float32); inputs[f]=inp
            norm = layers.Normalization(axis=-1); norm.adapt(adapt_ds.map(lambda x: tf.expand_dims(tf.cast(x[f], tf.float32),-1)))
            encoded.append(norm(inp))
        for f in CATEG:
            inp = keras.Input(shape=(1,), name=f, dtype=tf.string); inputs[f]=inp
            lk = layers.StringLookup(output_mode='one_hot'); lk.adapt(adapt_ds.map(lambda x: x[f]))
            encoded.append(lk(inp))
        txt = keras.Input(shape=(1,), name='text_all', dtype=tf.string); inputs['text_all']=txt
        tv = layers.TextVectorization(max_tokens=TEXT_MAX_TOKENS, output_mode='multi_hot')
        tv.adapt(tf.data.Dataset.from_tensor_slices((train_subset['Title'].fillna('')+' '+train_subset['Description'].fillna(''))).batch(256))
        encoded.append(tv(txt))
        x = layers.Concatenate()(encoded)
        x = layers.Dense(256, activation='relu')(x)
        x = layers.Dropout(0.3)(x, training=True)
        x = layers.Dense(128, activation='relu')(x)
        x = layers.Dropout(0.3)(x, training=True)
        out = layers.Dense(1, name='price_log_m2')(x)
        m = keras.Model(inputs, out)
        m.compile(optimizer=keras.optimizers.Adam(1e-3), loss='mse', metrics=[keras.metrics.RootMeanSquaredError(name='rmse')])
        return m

    def df_to_ds(df_):
        xdict = {f: df_[f].values for f in NUMERIC+CATEG}
        xdict['text_all'] = (df_['Title'].fillna('')+' '+df_['Description'].fillna('')).values
        y = df_['Price_log_m2'].astype('float32').values
        return tf.data.Dataset.from_tensor_slices((xdict, y)).shuffle(len(df_), seed=SEED).batch(256).prefetch(tf.data.AUTOTUNE)

    es = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # ----- Global fallback regressor -----
    print('Training global fallback regressor...')
    global_reg = build_regressor(train_df)
    csvlog = keras.callbacks.CSVLogger(os.path.join(SAVE_DIR,'global_reg_log.csv'))
    global_reg.fit(df_to_ds(train_df), validation_data=df_to_ds(val_df), epochs=30, callbacks=[es, csvlog], verbose=1)

    # Robust save (.keras -> SavedModel)
    base_global = os.path.join(SAVE_DIR,'regressor_global')
    try:
        global_reg.save(base_global + '.keras')
        print('Saved .keras at:', base_global + '.keras')
    except UnicodeEncodeError as e:
        print('UnicodeEncodeError during .keras save; falling back to SavedModel:', e)
        try:
            global_reg.export(base_global + '_sm')
            print('Saved SavedModel at:', base_global + '_sm')
        except Exception:
            tf.saved_model.save(global_reg, base_global + '_sm')
            print('Saved (legacy) TF SavedModel at:', base_global + '_sm')

    # ----- Per-group regressors -----
    reg_paths = {}
    group_stats = []
    for g, gdf in train_df.groupby('GroupLabel'):
        if len(gdf) < MIN_SAMPLES_GROUP:
            continue
        print(f'Training regressor for group {g} (n={len(gdf)})')
        reg = build_regressor(gdf)
        safe = re.sub(r'[^a-zA-Z0-9_]+', '_', g)
        logfile = os.path.join(SAVE_DIR, f'reg_{safe}.csv')
        csvlog = keras.callbacks.CSVLogger(logfile)
        val_g = val_df[val_df['GroupLabel']==g]
        if len(val_g)==0:
            val_g = val_df.sample(min(5000, len(val_df)), random_state=SEED)
        reg.fit(df_to_ds(gdf), validation_data=df_to_ds(val_g), epochs=20, callbacks=[es, csvlog], verbose=0)

        # Robust save per-group
        base_group = os.path.join(SAVE_DIR, f'regressor__{abs(hash(g))}')
        try:
            reg.save(base_group + '.keras')
            print('Saved .keras at:', base_group + '.keras')
            reg_paths[g] = base_group + '.keras'
        except UnicodeEncodeError as e:
            print('UnicodeEncodeError during .keras save; falling back to SavedModel:', e)
            try:
                reg.export(base_group + '_sm')
                print('Saved SavedModel at:', base_group + '_sm')
                reg_paths[g] = base_group + '_sm'
            except Exception:
                tf.saved_model.save(reg, base_group + '_sm')
                print('Saved (legacy) TF SavedModel at:', base_group + '_sm')
                reg_paths[g] = base_group + '_sm'

        if len(val_g) >= 50:
            xdict = {f: val_g[f].values for f in NUMERIC+CATEG}
            xdict['text_all'] = (val_g['Title'].fillna('')+' '+val_g['Description'].fillna('')).values
            pred = np.expm1(reg.predict(xdict, verbose=0).ravel())
            mape = mean_absolute_percentage_error(val_g['pricepermeter'], pred)
            medape = np.median(np.abs((val_g['pricepermeter'].values - pred) / np.clip(val_g['pricepermeter'].values,1e-9,None)))
        else:
            mape = np.nan; medape = np.nan
        group_stats.append({'group':g, 'n_train':len(gdf), 'n_val':len(val_g), 'MAPE':mape, 'MedianAPE':medape})

    pd.DataFrame(group_stats).to_csv(os.path.join(SAVE_DIR,'per_group_metrics.csv'), index=False)


Training global fallback regressor...
Epoch 1/30
[1m3824/3824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1823s[0m 456ms/step - loss: 69.8288 - rmse: 8.3413 - val_loss: 32.5372 - val_rmse: 5.7041
Epoch 2/30
[1m3824/3824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1811s[0m 454ms/step - loss: 24.4612 - rmse: 4.9269 - val_loss: 6.7689 - val_rmse: 2.6017
Epoch 3/30
[1m3824/3824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1796s[0m 450ms/step - loss: 4.1205 - rmse: 2.0060 - val_loss: 0.3021 - val_rmse: 0.5497
Epoch 4/30
[1m3824/3824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1811s[0m 454ms/step - loss: 0.2277 - rmse: 0.4766 - val_loss: 0.1948 - val_rmse: 0.4414
Epoch 5/30
[1m3824/3824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1826s[0m 458ms/step - loss: 0.1944 - rmse: 0.4410 - val_loss: 0.1948 - val_rmse: 0.4414
Epoch 6/30
[1m3824/3824[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1859s[0m 467ms/step - loss: 0.1947 - rmse: 0.4412 - val_loss: 0.1948 - val_rmse: 

INFO:tensorflow:Assets written to: artifacts_ppm_routing\regressor_global_sm\assets


Saved artifact at 'artifacts_ppm_routing\regressor_global_sm'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): Dict[['Area', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Area')], ['NumberOfRooms', TensorSpec(shape=(None, 1), dtype=tf.float32, name='NumberOfRooms')], ['Floor', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Floor')], ['Floors', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Floors')], ['BuildingAge', TensorSpec(shape=(None, 1), dtype=tf.float32, name='BuildingAge')], ['Predict_State', TensorSpec(shape=(None, 1), dtype=tf.string, name='Predict_State')], ['Predicted_Loc', TensorSpec(shape=(None, 1), dtype=tf.string, name='Predicted_Loc')], ['BuildingType', TensorSpec(shape=(None, 1), dtype=tf.string, name='BuildingType')], ['TypeOfMarket', TensorSpec(shape=(None, 1), dtype=tf.string, name='TypeOfMarket')], ['Type', TensorSpec(shape=(None, 1), dtype=tf.string, name='Type')], ['OfferFrom', TensorSpec(shape=(None, 1), dt

INFO:tensorflow:Assets written to: artifacts_ppm_routing\regressor__5946482021831562226_sm\assets


Saved artifact at 'artifacts_ppm_routing\regressor__5946482021831562226_sm'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): Dict[['Area', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Area')], ['NumberOfRooms', TensorSpec(shape=(None, 1), dtype=tf.float32, name='NumberOfRooms')], ['Floor', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Floor')], ['Floors', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Floors')], ['BuildingAge', TensorSpec(shape=(None, 1), dtype=tf.float32, name='BuildingAge')], ['Predict_State', TensorSpec(shape=(None, 1), dtype=tf.string, name='Predict_State')], ['Predicted_Loc', TensorSpec(shape=(None, 1), dtype=tf.string, name='Predicted_Loc')], ['BuildingType', TensorSpec(shape=(None, 1), dtype=tf.string, name='BuildingType')], ['TypeOfMarket', TensorSpec(shape=(None, 1), dtype=tf.string, name='TypeOfMarket')], ['Type', TensorSpec(shape=(None, 1), dtype=tf.string, name='Type')], ['OfferFrom', TensorSpec(shape

INFO:tensorflow:Assets written to: artifacts_ppm_routing\regressor__2439099339002759264_sm\assets


Saved artifact at 'artifacts_ppm_routing\regressor__2439099339002759264_sm'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): Dict[['Area', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Area')], ['NumberOfRooms', TensorSpec(shape=(None, 1), dtype=tf.float32, name='NumberOfRooms')], ['Floor', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Floor')], ['Floors', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Floors')], ['BuildingAge', TensorSpec(shape=(None, 1), dtype=tf.float32, name='BuildingAge')], ['Predict_State', TensorSpec(shape=(None, 1), dtype=tf.string, name='Predict_State')], ['Predicted_Loc', TensorSpec(shape=(None, 1), dtype=tf.string, name='Predicted_Loc')], ['BuildingType', TensorSpec(shape=(None, 1), dtype=tf.string, name='BuildingType')], ['TypeOfMarket', TensorSpec(shape=(None, 1), dtype=tf.string, name='TypeOfMarket')], ['Type', TensorSpec(shape=(None, 1), dtype=tf.string, name='Type')], ['OfferFrom', TensorSpec(shape

INFO:tensorflow:Assets written to: artifacts_ppm_routing\regressor__4717277491672629630_sm\assets


Saved artifact at 'artifacts_ppm_routing\regressor__4717277491672629630_sm'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): Dict[['Area', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Area')], ['NumberOfRooms', TensorSpec(shape=(None, 1), dtype=tf.float32, name='NumberOfRooms')], ['Floor', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Floor')], ['Floors', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Floors')], ['BuildingAge', TensorSpec(shape=(None, 1), dtype=tf.float32, name='BuildingAge')], ['Predict_State', TensorSpec(shape=(None, 1), dtype=tf.string, name='Predict_State')], ['Predicted_Loc', TensorSpec(shape=(None, 1), dtype=tf.string, name='Predicted_Loc')], ['BuildingType', TensorSpec(shape=(None, 1), dtype=tf.string, name='BuildingType')], ['TypeOfMarket', TensorSpec(shape=(None, 1), dtype=tf.string, name='TypeOfMarket')], ['Type', TensorSpec(shape=(None, 1), dtype=tf.string, name='Type')], ['OfferFrom', TensorSpec(shape







Training regressor for group augustów__bin0 (n=479)
UnicodeEncodeError during .keras save; falling back to SavedModel: 'charmap' codec can't encode character '\xb2' in position 7273: character maps to <undefined>
INFO:tensorflow:Assets written to: artifacts_ppm_routing\regressor__8079749075666341245_sm\assets


INFO:tensorflow:Assets written to: artifacts_ppm_routing\regressor__8079749075666341245_sm\assets


Saved artifact at 'artifacts_ppm_routing\regressor__8079749075666341245_sm'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): Dict[['Area', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Area')], ['NumberOfRooms', TensorSpec(shape=(None, 1), dtype=tf.float32, name='NumberOfRooms')], ['Floor', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Floor')], ['Floors', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Floors')], ['BuildingAge', TensorSpec(shape=(None, 1), dtype=tf.float32, name='BuildingAge')], ['Predict_State', TensorSpec(shape=(None, 1), dtype=tf.string, name='Predict_State')], ['Predicted_Loc', TensorSpec(shape=(None, 1), dtype=tf.string, name='Predicted_Loc')], ['BuildingType', TensorSpec(shape=(None, 1), dtype=tf.string, name='BuildingType')], ['TypeOfMarket', TensorSpec(shape=(None, 1), dtype=tf.string, name='TypeOfMarket')], ['Type', TensorSpec(shape=(None, 1), dtype=tf.string, name='Type')], ['OfferFrom', TensorSpec(shape

INFO:tensorflow:Assets written to: artifacts_ppm_routing\regressor__2366006624724727952_sm\assets


Saved artifact at 'artifacts_ppm_routing\regressor__2366006624724727952_sm'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): Dict[['Area', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Area')], ['NumberOfRooms', TensorSpec(shape=(None, 1), dtype=tf.float32, name='NumberOfRooms')], ['Floor', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Floor')], ['Floors', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Floors')], ['BuildingAge', TensorSpec(shape=(None, 1), dtype=tf.float32, name='BuildingAge')], ['Predict_State', TensorSpec(shape=(None, 1), dtype=tf.string, name='Predict_State')], ['Predicted_Loc', TensorSpec(shape=(None, 1), dtype=tf.string, name='Predicted_Loc')], ['BuildingType', TensorSpec(shape=(None, 1), dtype=tf.string, name='BuildingType')], ['TypeOfMarket', TensorSpec(shape=(None, 1), dtype=tf.string, name='TypeOfMarket')], ['Type', TensorSpec(shape=(None, 1), dtype=tf.string, name='Type')], ['OfferFrom', TensorSpec(shape

INFO:tensorflow:Assets written to: artifacts_ppm_routing\regressor__5452600282512542450_sm\assets


INFO:tensorflow:Assets written to: artifacts_ppm_routing\regressor__4480766881104448127_sm\assets


Saved artifact at 'artifacts_ppm_routing\regressor__4480766881104448127_sm'. The following endpoints are available:

* Endpoint 'serve'
  args_0 (POSITIONAL_ONLY): Dict[['Area', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Area')], ['NumberOfRooms', TensorSpec(shape=(None, 1), dtype=tf.float32, name='NumberOfRooms')], ['Floor', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Floor')], ['Floors', TensorSpec(shape=(None, 1), dtype=tf.float32, name='Floors')], ['BuildingAge', TensorSpec(shape=(None, 1), dtype=tf.float32, name='BuildingAge')], ['Predict_State', TensorSpec(shape=(None, 1), dtype=tf.string, name='Predict_State')], ['Predicted_Loc', TensorSpec(shape=(None, 1), dtype=tf.string, name='Predicted_Loc')], ['BuildingType', TensorSpec(shape=(None, 1), dtype=tf.string, name='BuildingType')], ['TypeOfMarket', TensorSpec(shape=(None, 1), dtype=tf.string, name='TypeOfMarket')], ['Type', TensorSpec(shape=(None, 1), dtype=tf.string, name='Type')], ['OfferFrom', TensorSpec(shape

In [24]:
# ==== GBM variant: LightGBM quantile (bez OHE, kategorie natywne + early stopping) ====
import os, numpy as np, pandas as pd
import lightgbm as lgb
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import mean_absolute_percentage_error

# 1) Definicje kolumn
cat_cols = ['Predict_State','Predicted_Loc','BuildingType','TypeOfMarket','Type','OfferFrom','city_slug']
num_cols = ['Area','NumberOfRooms','Floor','Floors','BuildingAge']

# 2) Dtypes kategorii (wymagane przez LightGBM)
for c in cat_cols:
    train_df[c] = train_df[c].astype('category')
    val_df[c]   = val_df[c].astype('category')

# 3) Macierze wejściowe (bez TF‑IDF, by przyspieszyć i obniżyć RAM)
Xtr = train_df[num_cols + cat_cols]
ytr = train_df['pricepermeter'].values.astype('float32')
Xva = val_df[num_cols + cat_cols]
yva = val_df['pricepermeter'].values.astype('float32')

# 4) Datasets z informacją o kolumnach kategorycznych
dtrain = lgb.Dataset(Xtr, label=ytr, categorical_feature=cat_cols, free_raw_data=False)
dvalid = lgb.Dataset(Xva, label=yva, categorical_feature=cat_cols, free_raw_data=False)

# 5) Funkcja ucząca kwantyl (używa early stopping i 32 wątków)
def fit_lgb_quantile(alpha, seed=SEED):
    params = {
        'objective': 'quantile',
        'alpha': alpha,
        'learning_rate': 0.05,
        'num_leaves': 64,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'min_data_in_leaf': 50,
        'seed': int(seed),
        'num_threads': 32
    }
    model = lgb.train(
        params,
        dtrain,
        num_boost_round=200,
        valid_sets=[dtrain, dvalid],
        valid_names=['train','valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=30, first_metric_only=False, verbose=True),
            lgb.log_evaluation(period=50)
        ]
    )
    return model

# 6) Trening trzech modeli kwantylowych
lgb_p10 = fit_lgb_quantile(0.10)
lgb_p50 = fit_lgb_quantile(0.50)
lgb_p90 = fit_lgb_quantile(0.90)

# 7) Predykcje z najlepszej iteracji
p10 = lgb_p10.predict(Xva, num_iteration=lgb_p10.best_iteration)
p50 = lgb_p50.predict(Xva, num_iteration=lgb_p50.best_iteration)
p90 = lgb_p90.predict(Xva, num_iteration=lgb_p90.best_iteration)

# 8) Kalibracja prawdopodobieństwa trafienia pasma ±ALPHA_PCT wokół mediany
low  = p50 * (1 - ALPHA_PCT)
high = p50 * (1 + ALPHA_PCT)
event = ((yva >= low) & (yva <= high)).astype(int)

# Surowy „szerokość/poziom” pasma z modeli kwantylowych
raw_width_cov = ((p90 - p10) / np.maximum(1.0, p50)).clip(0, 1)

iso = IsotonicRegression(out_of_bounds='clip').fit(raw_width_cov, event)
prob = iso.predict(raw_width_cov)

# 9) Metryki walidacyjne
rmse_log = float(np.sqrt(np.mean((np.log1p(yva) - np.log1p(p50))**2)))
mape = float(mean_absolute_percentage_error(yva, p50))
medape = float(np.median(np.abs((yva - p50) / np.clip(yva, 1e-9, None))))
print({'GBM_rmse_log_ppm': rmse_log, 'GBM_MAPE_ppm': mape, 'GBM_MedianAPE_ppm': medape})

# 10) Próbka 20 i zapis modeli
rng = np.random.RandomState(SEED)
sample_idx = rng.choice(len(val_df), size=min(20, len(val_df)), replace=False)
sample20 = val_df.iloc[sample_idx][['SaleId','Title','pricepermeter','Area','Predicted_Loc']].copy()
sample20['Predicted_ppm'] = p50[sample_idx]
sample20['Probability']   = prob[sample_idx]
sample20['GroupLabel_pred'] = val_df.iloc[sample_idx]['GroupLabel'].values
sample20['ClassConfidence']  = 1.0
sample20 = sample20[['SaleId','Title','pricepermeter','Predicted_ppm','Probability','GroupLabel_pred','ClassConfidence','Area','Predicted_Loc']]
sample20['Predicted_ppm'] = sample20['Predicted_ppm'].round(0)
sample20['Probability']   = sample20['Probability'].round(3)
sample20.to_csv(os.path.join(SAVE_DIR,'predictions_sample20_GBM.csv'), index=False)

# Modele (tekstowe pliki .txt są bezpieczne pod Windows)
lgb_p10.save_model(os.path.join(SAVE_DIR,'lgb_q10.txt'))
lgb_p50.save_model(os.path.join(SAVE_DIR,'lgb_q50.txt'))
lgb_p90.save_model(os.path.join(SAVE_DIR,'lgb_q90.txt'))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009501 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 16842
[LightGBM] [Info] Number of data points in the train set: 978929, number of used features: 12
[LightGBM] [Info] Start training from score 5700.000000
Training until validation scores don't improve for 30 rounds
[50]	train's quantile: 345.752	valid's quantile: 349.05
[100]	train's quantile: 292.963	valid's quantile: 298.88
[150]	train's quantile: 273.723	valid's quantile: 281.878
[200]	train's quantile: 264.42	valid's quantile: 274.424
Did not meet early stopping. Best iteration is:
[200]	train's quantile: 264.42	valid's quantile: 274.424
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009386 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you 

<lightgbm.basic.Booster at 0x1b953c5f990>

In [25]:

# ==== Price correctness & correction on ppm scale ====
# For any predictions DataFrame with columns: 'pricepermeter', 'Predicted_ppm', 'Probability', and 'Area'

def evaluate_correctness(df_pred, alpha_pct=ALPHA_PCT, prob_threshold=0.5):
    import numpy as np
    dfp = df_pred.copy()
    dfp['ppm_bs'] = dfp['pricepermeter']
    dfp['ppm_pred'] = dfp['Predicted_ppm']
    dfp['Delta_ppm'] = dfp['ppm_pred'] - dfp['ppm_bs']
    dfp['Delta_total'] = dfp['Delta_ppm'] * dfp['Area']
    dfp['is_trafiona'] = (np.abs(dfp['Delta_ppm']) <= alpha_pct * dfp['ppm_pred']) & (dfp['Probability'] >= prob_threshold)
    return dfp

print('Call evaluate_correctness(sample_df) to compute trafność and corrections.')


Call evaluate_correctness(sample_df) to compute trafność and corrections.


In [27]:
# ==== Save config and minimal README ====
import os, json

os.makedirs(SAVE_DIR, exist_ok=True)

with open(os.path.join(SAVE_DIR, 'config.json'), 'w', encoding='utf-8') as f:
    json.dump({
        'MODEL_VARIANT': MODEL_VARIANT,
        'N_BINS': N_BINS,
        'MIN_SAMPLES_GROUP': MIN_SAMPLES_GROUP,
        'GEO_MODE': GEO_MODE,
        'H3_RESOLUTION': H3_RESOLUTION,
        'ALPHA_PCT': ALPHA_PCT,
        'SEED': SEED
    }, f, indent=2, ensure_ascii=False)

with open(os.path.join(SAVE_DIR, 'README.txt'), 'w', encoding='utf-8') as f:
    f.write('Artifacts saved for hedonic ppm routing experiment.\n')

print('Artifacts saved to', SAVE_DIR)


Artifacts saved to artifacts_ppm_routing


In [28]:
# ==== Apply trained model(s) to full base and save Predicted_base_full.csv ====
import os, numpy as np, pandas as pd
import lightgbm as lgb

# 1) Wybór pełnej bazy (po preprocessie)
# Jeśli 'df' istnieje, to jest to pełna baza po przygotowaniu; w innym razie sklej train/val.
if 'df' in globals():
    base_df = df.copy()
else:
    base_df = pd.concat([train_df, val_df], axis=0, ignore_index=True)

# Zachowaj kopię do zapisu
pred_df = base_df.copy()

# 2) Domyślna kolumna grupy (jeśli brak predykcji klasą, użyj GroupLabel z binningu)
pred_df['Group_Assigned'] = pred_df.get('_Group_pred', pred_df.get('GroupLabel', 'unknown')).astype(str)

# 3) Spróbuj użyć wariantu GBM (median - q50), jeśli plik istnieje
save_dir = SAVE_DIR if 'SAVE_DIR' in globals() else 'artifacts_ppm_routing'
model_q50_path = os.path.join(save_dir, 'lgb_q50.txt')

used_model = None
if os.path.exists(model_q50_path):
    # Przygotuj typy kategorii tak jak w treningu GBM
    cat_cols = ['Predict_State','Predicted_Loc','BuildingType','TypeOfMarket','Type','OfferFrom','city_slug']
    num_cols = ['Area','NumberOfRooms','Floor','Floors','BuildingAge']
    for c in cat_cols:
        pred_df[c] = pred_df[c].astype('category')
    Xall = pred_df[num_cols + cat_cols]
    booster = lgb.Booster(model_file=model_q50_path)
    p50_all = booster.predict(Xall, num_iteration=getattr(booster, 'best_iteration', None))
    pred_df['Predicted_ppm'] = p50_all
    pred_df['Predicted_Price'] = pred_df['Predicted_ppm'] * pred_df['Area']
    used_model = 'GBM_q50_ppm'
else:
    # Fallback: jeśli zapisano globalny regressor Keras, wykorzystaj go (ppm na log-skali)
    import tensorflow as tf
    from tensorflow import keras
    base_global = os.path.join(save_dir,'regressor_global')
    path_keras = base_global + '.keras'
    path_sm = base_global + '_sm'
    model = None
    if os.path.exists(path_keras):
        model = keras.models.load_model(path_keras)
    elif os.path.exists(path_sm):
        try:
            model = keras.models.load_model(path_sm)
        except Exception:
            model = tf.saved_model.load(path_sm)
    if model is None:
        raise RuntimeError('Nie znaleziono modelu do predykcji: brak lgb_q50.txt oraz regressor_global.* w SAVE_DIR')
    # Zbuduj wejście tak jak w treningu regresora (NUMERIC+CATEG+text_all)
    NUMERIC = ['Area','NumberOfRooms','Floor','Floors','BuildingAge']
    CATEG = ['Predict_State','Predicted_Loc','BuildingType','TypeOfMarket','Type','OfferFrom','city_slug']
    xdict = {f: pred_df[f].values for f in NUMERIC+CATEG}
    xdict['text_all'] = (pred_df['Title'].fillna('') + ' ' + pred_df['Description'].fillna('')).values
    ylog = model.predict(xdict, batch_size=512, verbose=1).ravel()
    pred_df['Predicted_ppm'] = np.expm1(ylog)
    pred_df['Predicted_Price'] = pred_df['Predicted_ppm'] * pred_df['Area']
    used_model = 'NN_global_ppm'

# 4) Rzutowania i zabezpieczenia
pred_df['Predicted_Price'] = pred_df['Predicted_Price'].astype(float)

# 5) Próbka 20 losowych rekordów do podglądu
SEED = SEED if 'SEED' in globals() else 42
sample_cols = ['SaleId','Title','Predicted_Loc','Predict_State','Price','Predicted_Price','Group_Assigned']
sample_cols = [c for c in sample_cols if c in pred_df.columns]
sample20 = pred_df.sample(n=min(20, len(pred_df)), random_state=SEED)[sample_cols].copy()
print('Model used:', used_model)
print(sample20.head(20))

# 6) Zapis całej bazy z predykcją
out_path = 'Predicted_base_full.csv'
pred_df.to_csv(out_path, sep=';', encoding='utf-8-sig', index=False)
print('Saved full predicted base to:', out_path, 'with shape:', pred_df.shape)


Model used: GBM_q50_ppm
          SaleId                                              Title  \
420899   2735273  Nowe mieszkanie trzypokojowe(NrA_56) Unii Lube...   
1298242  4987862  Mieszkanie 3-pokojowe umeblowane i gotowe do z...   
575917   3052776              Mieszkanie, Sosnowiec, Zagórze, 62 m²   
1151407  4599949     3pok mieszkanie z ogródkiem poniżej 500tyś.!!!   
476638   2847992  Piękne 3-pokojowe mieszkanie Armii Krajowej Ol...   
464882   2824189                   SPRZEDAM MIESZKANIE BEZCZYNSZOWE   
1317549  5036565                 Mieszkanie 33m2, parter z balkonem   
1024795  4255505             Mieszkanie, Gdańsk, Śródmieście, 41 m²   
1256606  4885755                        Mieszkanie, Warszawa, 57 m²   
766454   3554140                  Mieszkanie, Ustroń, Ustroń, 48 m²   
892894   3897185  Nowe Południe - mieszkanie 5.A.04 - Nowa ofert...   
897700   3911507  Kawalerka w Śródmieściu, po Remoncie, Niski Cz...   
1091555  4436620                           Mieszkanie

In [29]:
# ==== Show fixed examples by SaleId (same as v10 sample) ====
import numpy as np, pandas as pd

# Ensure previous prediction cell has been run to create pred_df
if 'pred_df' not in globals():
    raise RuntimeError('Brak ramki pred_df. Najpierw uruchom komórkę z inferencją i zapisaniem Predicted_base_full.csv.')

# SaleId list taken from the v10 sample table
sale_ids = [
    3847651, 3920583, 2797305, 5052646, 3210230,
    5109215, 3523743, 3861971, 421056, 2014107,
    3491630, 2844584, 4701335, 3291881, 2890421
]

# Normalize SaleId to numeric for matching
pred_df['_SaleId_num'] = pd.to_numeric(pred_df['SaleId'], errors='coerce')
mask = pred_df['_SaleId_num'].isin(sale_ids)
subset = pred_df.loc[mask].copy()

# Preserve the given order
order_map = {sid: i for i, sid in enumerate(sale_ids)}
subset['_order'] = subset['_SaleId_num'].map(order_map)
subset = subset.sort_values('_order')

# Select and rename columns for display
cols = ['SaleId','Title','Predicted_Loc','Predict_State','Price','Predicted_Price','Group_Assigned']
cols = [c for c in cols if c in subset.columns]
view = subset[cols].copy()
view = view.rename(columns={'Group_Assigned':'Group'})

# Report missing ids, if any
found = set(subset['_SaleId_num'].dropna().astype(int).tolist())
missing = [sid for sid in sale_ids if sid not in found]
if missing:
    print('Uwaga: Nie znaleziono następujących SaleId w pred_df:', missing)

print(view.reset_index(drop=True).head(20))


     SaleId                                              Title  \
0   3847651                            Mieszkanie, ul. Zamkowa   
1   3920583    Mieszkanie w centrum Jawor 2 pokoje po remoncie   
2   2797305  2pok 41met, okolice Sztabowej LOGGIA/PIWNICA/G...   
3   5052646         Widok na cały Kraków - Górka Narodowa 60m2   
4   3210230                               Mieszkanie Bydgoszcz   
5   5109215                             Mieszkanie, ul. Smolna   
6   3523743     Kraków – Podgórze – ul. Krasickiego – 37,71 m²   
7   3861971                               Mieszkanie Sosnowiec   
8    421056    Promocja 2 pokoje Sosnowiec Sielec Klimontowska   
9   2014107                          Mieszkanie Świętochłowice   
10  3491630                    Przytulne 3 pokoje z ogrodem L7   
11  2844584  LUNA Gdańsk Galaktyczna dwupoziomowe 58 m2 do ...   
12  4701335    Narożne: wyjątkowy rozkład 2 pokoi - dwustronne   
13  3291881                    Mieszkanie, Łódź, Bałuty, 37 m²   
14  289042