In [1]:
# CELL 1: Importy i konfiguracja
import os
import re
import numpy as np
import pandas as pd
import csv
import gc
from datetime import datetime

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_probability as tfp
from sklearn.model_selection import train_test_split

# Konfiguracja
SEED = 42
tf.keras.utils.set_random_seed(SEED)
np.random.seed(SEED)
pd.options.display.float_format = '{:,.2f}'.format






In [2]:
# CELL 2: Wczytywanie i mapowanie po indeksach

PATH = 'Data_state_LSTM_predicted_full_v4_FINAL.csv'

try:
    df = pd.read_csv(PATH, sep=';', encoding='utf-8-sig', header=None, skiprows=1, low_memory=False)
    print(f"Wczytano {len(df)} wierszy z pliku: {PATH}")
except Exception as e:
    raise RuntimeError(f"Nie udało się wczytać pliku. Upewnij się, że istnieje i ma separator ';'. Błąd: {e}")

column_index_map = {
    0: 'SaleId', 3: 'Title', 4: 'Description', 5: 'Area', 6: 'Price',
    11: 'NumberOfRooms', 12: 'BuiltYear', 14: 'BuildingType', 16: 'OfferFrom',
    17: 'Floor', 18: 'Floors', 19: 'TypeOfMarket', 28: 'Type',
    54: 'Predicted_Loc', 55: 'Predict_State'
}

valid_index_map = {idx: name for idx, name in column_index_map.items() if idx < df.shape[1]}
df_clean = df[list(valid_index_map.keys())].copy()
df_clean.columns = list(valid_index_map.values())

print(f"\nWybrano i przemianowano {len(df_clean.columns)} kluczowych kolumn.")
display(df_clean.head())

Wczytano 1467262 wierszy z pliku: Data_state_LSTM_predicted_full_v4_FINAL.csv

Wybrano i przemianowano 15 kluczowych kolumn.


Unnamed: 0,SaleId,Title,Description,Area,Price,NumberOfRooms,BuiltYear,BuildingType,OfferFrom,Floor,Floors,TypeOfMarket,Type,Predicted_Loc,Predict_State
0,99,Sprzedam mieszkanie na parterze 64.8m2 Białyst...,Sprzedam mieszkanie na parterze 64.8m2 w 3-pię...,64.8,540000,3,,Blok,Osoba prywatna,0,,Wtórny,,Białystok -> ? -> ? -> ?,FOR_RENOVATION
1,115,"Mieszkanie bezczynszowe, 3 pokoje, 2 łazienki",SPRZEDAŻ WYŁĄCZNIE BEZPOŚREDNIA. Agencjom nier...,51.0,540000,3,2013.0,,Osoba prywatna,0,,Wtórny,607044548.0,Białystok -> ? -> ? -> ?,AFTER_RENOVATION
2,140,Mieszkanie trzypokojowe na sprzedaż,***Oferta bez prowizji biura i podatku PCC!***...,67.62,544000,3,2023.0,Apartamentowiec,Agencja,0,1.0,Wtórny,797817821.0,Białystok -> ? -> ? -> ?,GOOD
3,145,3 Pokoje- 48M2-Osiedle Dziesięciny,Przedstawiamy na sprzedaż 3 pokojowe mieszkani...,48.0,459000,3,,Blok,Agencja,3,,Wtórny,606163921.0,Białystok -> ? -> ? -> ?,AFTER_RENOVATION
4,159,"Mieszkanie, 87 m², Białystok","Przestronne, jasne mieszkanie na zamkniętym os...",87.0,779000,4,2005.0,Blok,Osoba prywatna,1,,Wtórny,570386002.0,Białystok -> ? -> ? -> ?,AFTER_RENOVATION


In [3]:
# CELL 3: Inżynieria Cech i usuwanie outlierów
df_proc = df_clean.copy(); del df_clean; gc.collect()

# Czyszczenie i konwersja typów
df_proc['Price'] = pd.to_numeric(df_proc['Price'], errors='coerce')
df_proc['Area'] = pd.to_numeric(df_proc['Area'], errors='coerce')
df_proc = df_proc.dropna(subset=['Price', 'Area'])
df_proc = df_proc[df_proc['Price'] > 1000]

# Usuwanie outlierów z ceny
q_low = df_proc['Price'].quantile(0.01)
q_high = df_proc['Price'].quantile(0.99)
df_proc = df_proc[(df_proc['Price'] >= q_low) & (df_proc['Price'] <= q_high)]
print(f"\nDane po usunięciu 2% skrajnych cen (outlierów): {df_proc.shape}")

# Czyszczenie tekstu opisu
def clean_text(s: str) -> str:
    s = (s or "").lower()
    patterns = [r'oferta nie stanowi.*?oferty w rozumieniu kodeksu cywilnego', r'prosz[ąa] o kontakt.*', r'tylko u nas.*', r'nie pobieramy prowizji.*']
    for p in patterns: s = re.sub(p, ' ', s, flags=re.IGNORECASE)
    s = re.sub(r'[^a-zA-Ząćęłńóśźż\s]', ' ', s); s = re.sub(r'\s+', ' ', s).strip()
    return s
df_proc['Description'] = df_proc['Description'].fillna('').astype(str).apply(clean_text)

# Inżynieria cech numerycznych
for c in ['NumberOfRooms','Floor','Floors','BuiltYear']:
    if c in df_proc.columns: df_proc[c] = pd.to_numeric(df_proc[c], errors='coerce')
if 'BuiltYear' in df_proc.columns:
    by = df_proc['BuiltYear']
    median_year = by.dropna().median() if not by.dropna().empty else 2000
    by = by.fillna(median_year).clip(1800, datetime.now().year + 1)
    df_proc['BuildingAge'] = (datetime.now().year - by).astype(int)
else:
    df_proc['BuildingAge'] = 60
    
# Definicja list cech
numeric_features = [c for c in ['Area','NumberOfRooms','Floor','Floors','BuildingAge'] if c in df_proc.columns]
categorical_features = [c for c in ['Predict_State','Predicted_Loc','BuildingType','TypeOfMarket','Type','OfferFrom'] if c in df_proc.columns]
text_feature = 'Description'

# Wypełnianie braków
for c in numeric_features: df_proc[c].fillna(df_proc[c].median(), inplace=True)
for c in categorical_features: df_proc[c] = df_proc[c].astype(str).fillna('unknown').replace({'nan':'unknown','None':'unknown'})

print("\nUżyte cechy numeryczne:", numeric_features)
print("Użyte cechy kategoryczne:", categorical_features)
df_proc['Price_log'] = np.log1p(df_proc['Price'])


Dane po usunięciu 2% skrajnych cen (outlierów): (1235203, 15)

Użyte cechy numeryczne: ['Area', 'NumberOfRooms', 'Floor', 'Floors', 'BuildingAge']
Użyte cechy kategoryczne: ['Predict_State', 'Predicted_Loc', 'BuildingType', 'TypeOfMarket', 'Type', 'OfferFrom']


In [4]:
# CELL 4: Podział na zbiory i tworzenie tf.data.Dataset
features = numeric_features + categorical_features + [text_feature]
target = 'Price_log'
train_df, val_df = train_test_split(df_proc, test_size=0.2, random_state=SEED)
del df_proc; gc.collect()
print(f"Zbiór treningowy: {train_df.shape}, Walidacyjny: {val_df.shape}")

def df_to_dataset(dataframe, shuffle=True, batch_size=256):
    df = dataframe.copy()
    labels = df.pop(target).values.astype('float32')
    features_dict = {col: df[col].values for col in features}
    ds = tf.data.Dataset.from_tensor_slices((features_dict, labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe), seed=SEED)
    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)

print("\nTworzenie datasetów...")
train_ds = df_to_dataset(train_df)
val_ds = df_to_dataset(val_df, shuffle=False)
adapt_ds = tf.data.Dataset.from_tensor_slices(dict(train_df[features])).batch(256)
print("Datasety gotowe.")

Zbiór treningowy: (988162, 17), Walidacyjny: (247041, 17)

Tworzenie datasetów...
Datasety gotowe.


In [5]:
# CELL 5: Model probabilistyczny z uproszczoną, poprawną funkcją straty

# --- 1. Niestandardowa Funkcja Straty (oparta tylko na pewności) ---
def create_confidence_loss():
    def confidence_based_loss(y_true_log, y_pred_params):
        # Krok 1: Stwórz obiekt rozkładu z parametrów zwróconych przez model
        mu_log = y_pred_params[:, 0:1]
        sigma_log = 1e-6 + tf.math.softplus(y_pred_params[:, 1:2])
        y_pred_dist = tfp.distributions.Normal(loc=mu_log, scale=sigma_log)
        
        # Krok 2: Oblicz standardową stratę NLL
        nll = -y_pred_dist.log_prob(y_true_log)
        
        # Krok 3: Oblicz niestandardowy komponent karny
        y_true = tf.math.expm1(y_true_log)
        y_pred = tf.math.expm1(mu_log)
        pct_error = tf.abs(y_true - y_pred) / (y_true + keras.backend.epsilon())
        
        # Logika kar oparta WYŁĄCZNIE na pewności (sigma)
        penalty = tf.where(
            sigma_log > 0.2, # Mała pewność
            tf.where(pct_error > 0.05, 50.0, 0.0), # Kara za błąd > 5%
            tf.where(
                sigma_log > 0.1, # Średnia pewność
                tf.where(pct_error > 0.10, 25.0, 0.0), # Kara za błąd > 10%
                tf.where(pct_error > 0.15, 10.0, 0.0)  # Wysoka pewność, kara za błąd > 15%
            )
        )
        
        return tf.reduce_mean(nll + penalty)

    return confidence_based_loss

# --- 2. Metryki (do obserwacji) ---
def distribution_mape(y_true_log, y_pred_params):
    y_true = tf.math.expm1(y_true_log)
    y_pred = tf.math.expm1(y_pred_params[:, 0:1])
    return tf.reduce_mean(tf.abs(y_true - y_pred) / (y_true + keras.backend.epsilon())) * 100

# --- 3. Budowa modelu (zwraca tensor 2D) ---
inputs = {}; encoded_features = []
for fname in numeric_features:
    inputs[fname] = keras.Input(shape=(1,), name=fname, dtype=tf.float32)
    norm = layers.Normalization(axis=-1); norm.adapt(adapt_ds.map(lambda x: tf.expand_dims(x[fname], axis=-1)))
    encoded_features.append(norm(inputs[fname]))
for fname in categorical_features:
    inputs[fname] = keras.Input(shape=(1,), name=fname, dtype=tf.string)
    lookup = layers.StringLookup(output_mode='one_hot'); lookup.adapt(adapt_ds.map(lambda x: x[fname]))
    encoded_features.append(lookup(inputs[fname]))
inputs[text_feature] = keras.Input(shape=(1,), name=text_feature, dtype=tf.string)
text_vec = layers.TextVectorization(max_tokens=2000, output_mode='multi_hot'); text_vec.adapt(adapt_ds.map(lambda x: x[text_feature]))
encoded_features.append(text_vec(inputs[text_feature]))

all_features = layers.Concatenate()(encoded_features)
x = layers.Dense(256, activation="relu")(all_features)
x = layers.Dropout(0.3)(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.3)(x)
output = layers.Dense(2, name="distribution_params")(x)
model = keras.Model(inputs, output)

model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss=create_confidence_loss(),
    metrics=[distribution_mape]
)
model.summary()

In [6]:
# CELL 6: Trening modelu
es = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1)
rlr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6, verbose=1)
csv_logger = keras.callbacks.CSVLogger('training_log_price_v9.csv')

history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=50,
    callbacks=[es, rlr, csv_logger]
)

Epoch 1/50
[1m3861/3861[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m762s[0m 189ms/step - distribution_mape: inf - loss: 52.5233 - val_distribution_mape: inf - val_loss: 50.9792 - learning_rate: 0.0010
Epoch 2/50
[1m3861/3861[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m798s[0m 198ms/step - distribution_mape: inf - loss: 49.9571 - val_distribution_mape: 1864748.7500 - val_loss: 47.2845 - learning_rate: 0.0010
Epoch 3/50
[1m3861/3861[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m832s[0m 207ms/step - distribution_mape: inf - loss: 49.2823 - val_distribution_mape: 41.2063 - val_loss: 47.1701 - learning_rate: 0.0010
Epoch 4/50
[1m3861/3861[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m840s[0m 209ms/step - distribution_mape: 671695207029939372032.0000 - loss: 48.7256 - val_distribution_mape: 43.5816 - val_loss: 47.1193 - learning_rate: 0.0010
Epoch 5/50
[1m3861/3861[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m858s[0m 214ms/step - distribution_mape: inf - loss: 15130.9941 

In [7]:
# CELL 7: Finalna ocena, zapis i test predykcji
MODEL_SAVE_PATH = 'price_regressor_v9_probabilistic_simple.keras'
print("\n--- Ocena finalnego modelu na zbiorze walidacyjnym ---")
results = model.evaluate(val_ds, verbose=0, return_dict=True)
print(f"Final Validation Loss: {results['loss']:.4f}")
print(f"Validation MAPE: {results['distribution_mape']:.2f}%")
model.save(MODEL_SAVE_PATH)
print(f"\nModel probabilistyczny zapisany w: {MODEL_SAVE_PATH}")

reloaded_model = keras.models.load_model(MODEL_SAVE_PATH, custom_objects={'confidence_based_loss': create_confidence_loss(), 'distribution_mape': distribution_mape})
sample_df = val_df.sample(15, random_state=42)
sample_input_dict = {col: tf.convert_to_tensor(sample_df[col].values) for col in features}

pred_params = reloaded_model.predict(sample_input_dict)
pred_mean_log = pred_params[:, 0]
pred_stddev_log = 1e-6 + tf.math.softplus(pred_params[:, 1])
predicted_price = np.expm1(pred_mean_log)
uncertainty = pred_stddev_log.numpy()

comparison = pd.DataFrame({
    'Prawdziwa Cena': np.expm1(sample_df['Price_log'].values), 'Przewidziana Cena': predicted_price,
    'Niepewność (sigma)': uncertainty, 'Area': sample_df['Area'].values,
    'Predict_State': sample_df['Predict_State'].values
}, index=sample_df['SaleId'].values)
comparison.index.name = 'SaleId'
def get_margin(sigma):
    if sigma > 0.2: return "5% (Mała pewność)"
    if sigma > 0.1: return "10% (Średnia pewność)"
    return "15% (Wysoka pewność)"
comparison['Dozwolony Błąd'] = [get_margin(s) for s in comparison['Niepewność (sigma)']]
print("\n--- Test predykcji probabilistycznej na 15 losowych próbkach ---")
display(comparison)


--- Ocena finalnego modelu na zbiorze walidacyjnym ---
Final Validation Loss: 46.9933
Validation MAPE: 43.19%

Model probabilistyczny zapisany w: price_regressor_v9_probabilistic_simple.keras
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 304ms/step

--- Test predykcji probabilistycznej na 15 losowych próbkach ---


Unnamed: 0_level_0,Prawdziwa Cena,Przewidziana Cena,Niepewność (sigma),Area,Predict_State,Dozwolony Błąd
SaleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3847651,219000.0,485593.94,0.65,39.0,FOR_RENOVATION,5% (Mała pewność)
3920583,219000.0,485820.91,0.65,46.8,FOR_RENOVATION,5% (Mała pewność)
2797305,549000.0,486721.5,0.65,41.0,GOOD,5% (Mała pewność)
5052646,1000000.0,485222.69,0.65,60.0,GOOD,5% (Mała pewność)
3210230,350000.0,485719.47,0.65,51.0,GOOD,5% (Mała pewność)
5109215,509000.0,485980.31,0.65,40.0,GOOD,5% (Mała pewność)
3523743,549000.0,485623.59,0.65,37.71,FOR_RENOVATION,5% (Mała pewność)
3861971,329000.0,486004.88,0.65,54.37,GOOD,5% (Mała pewność)
421056,399000.0,487382.0,0.65,39.0,DEVELOPER_STATE,5% (Mała pewność)
2014107,135000.0,485856.12,0.65,36.25,AFTER_RENOVATION,5% (Mała pewność)
