In [1]:
# CELL 1: Importy i konfiguracja
import os
import re
import numpy as np
import pandas as pd
from datetime import datetime
import joblib

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Configuration
# One seed is shared by every RNG used in the notebook: the Keras helper is called
# first and NumPy is then seeded explicitly as well.
SEED = 42
tf.keras.utils.set_random_seed(SEED)
np.random.seed(SEED)
# Show floats with thousands separators and two decimals in pandas output.
pd.set_option('display.float_format', '{:,.2f}'.format)

In [2]:
# CELL 2: Wczytywanie danych i wstępne czyszczenie
import csv

PATH = 'Data_state_LSTM_predicted_full_v4_FINAL.csv'

# The file is read with ';' as the separator because that is how it was saved.
# NOTE(review): the column-unification step below looks for columns literally named
# '88', '766500', '2005', ... which suggests the file has no header row and its first
# record is being consumed as the header — confirm, and if so read it with header=None
# and explicit column names.
try:
    df = pd.read_csv(PATH, sep=';', encoding='utf-8-sig', low_memory=False)
except Exception as e:
    # Chain explicitly so the underlying I/O or parser error stays attached as __cause__.
    raise RuntimeError(f"Nie udało się wczytać pliku. Upewnij się, że istnieje i ma separator ';'. Błąd: {e}") from e

# Reported outside the try block so that only the read itself is guarded.
print(f"Wczytano {len(df)} wierszy z pliku: {PATH}")

# --- Column-name unification (key step) ---
def find_col_by_name(df, candidates):
    """Return the label of the first column of ``df`` matching any candidate.

    Column labels and candidates are both converted to ``str`` and compared
    case-insensitively with surrounding whitespace ignored. Candidates are
    tried in the order given and the first hit wins. If several columns
    normalise to the same key, the last such column is the one returned.

    Args:
        df: object exposing a ``columns`` iterable (e.g. a DataFrame).
        candidates: iterable of acceptable names for the wanted column.

    Returns:
        The matching column label as a ``str``, or ``None`` when nothing matches.
    """
    normalized_to_label = {}
    for column in df.columns:
        label = str(column)
        normalized_to_label[label.lower().strip()] = label

    for candidate in candidates:
        key = str(candidate).lower().strip()
        if key in normalized_to_label:
            return normalized_to_label[key]
    return None

# Canonical column name -> accepted aliases, tried in the listed order.
# NOTE(review): aliases that look like data ('88', '73', '766500', 'Blok', ...) are
# presumably values from the file's first record that ended up as column labels —
# confirm against the raw file's header.
feature_map = {
    'SaleId': ['SaleId', 'id', '88'],
    'Title': ['Title', 'tytuł', 'Mieszkanie trzypokojowe na sprzedaż'],
    'Area': ['Area', 'Powierzchnia', '73'],
    'Price': ['Price', 'Cena', '766500'],
    'NumberOfRooms': ['NumberOfRooms', 'rooms', '3'],
    'Floor': ['Floor', 'piętro'],
    'Floors': ['Floors', 'liczba pięter'],
    'BuiltYear': ['BuiltYear', 'rok budowy', '2005'],
    'BuildingType': ['BuildingType', 'typ budynku', 'Blok'],
    'TypeOfMarket': ['TypeOfMarket', 'rynek', 'Wtórny'],
    'Type': ['Type', 'typ', 'Mieszkania'],
    'OfferFrom': ['OfferFrom', 'oferta od', 'Agencja'],
    'OwnerType': ['OwnerType'],
    'Predict_State': ['Predict_State'],
    'Predicted_Loc': ['Predicted_Loc', 'Białystok -> ? -> ? -> ?'],
}

# Collect only the renames that are actually needed: a column was found and it is
# not already spelled exactly like the canonical name.
rename_map = {}
for canonical, candidates in feature_map.items():
    found = find_col_by_name(df, candidates)
    if not found or found == canonical:
        continue
    rename_map[found] = canonical

df.rename(columns=rename_map, inplace=True)
print(f"\nUjednolicono nazwy kolumn. Zmiany: {rename_map}")

# --- Basic cleaning ---
# Coerce both columns to numbers; anything unparsable becomes NaN.
for required_col in ('Price', 'Area'):
    df[required_col] = pd.to_numeric(df[required_col], errors='coerce')

# Keep rows that have both values and a price above 1000 (drops unrealistically low prices).
df = (
    df.dropna(subset=['Price', 'Area'])
      .loc[lambda frame: frame['Price'] > 1000]
)
print(f"\nDane po podstawowym czyszczeniu (usunięto braki w Price/Area): {df.shape}")

Wczytano 1467262 wierszy z pliku: Data_state_LSTM_predicted_full_v4_FINAL.csv

Ujednolicono nazwy kolumn. Zmiany: {'88': 'SaleId', 'Mieszkanie trzypokojowe na sprzedaż': 'Title', '73': 'Area', '766500': 'Price', '3': 'NumberOfRooms', '2005': 'BuiltYear', 'Blok': 'BuildingType', 'Wtórny': 'TypeOfMarket', 'Mieszkania': 'Type', 'Agencja': 'OfferFrom', 'Białystok -> ? -> ? -> ?': 'Predicted_Loc'}

Dane po podstawowym czyszczeniu (usunięto braki w Price/Area): (1260266, 56)


In [3]:
# CELL 3: Feature engineering
# Work on a copy so the cleaned frame from the previous cell stays untouched.
df_proc = df.copy()

# Numeric features: coerce to numbers where the column exists; unparsable values become NaN.
num_cols_to_convert = ['NumberOfRooms', 'Floor', 'Floors', 'BuiltYear']
present_numeric_cols = [name for name in num_cols_to_convert if name in df_proc.columns]
for name in present_numeric_cols:
    df_proc[name] = pd.to_numeric(df_proc[name], errors='coerce')

# BuiltYear -> BuildingAge
# The clock is read once so the clip bound and the age use the same reference year.
# NOTE(review): the feature depends on the year the notebook is run; the same
# reference year has to be used when preparing inputs for inference.
current_year = datetime.now().year
if 'BuiltYear' in df_proc.columns:
    by = df_proc['BuiltYear']
    # median() skips NaN; it is NaN itself only when no build year is known at all.
    median_year = by.median()
    if pd.isna(median_year):
        median_year = 2000
    # Missing years take the median; values are limited to [1800, next year].
    by = by.fillna(median_year).clip(1800, current_year + 1)
    df_proc['BuildingAge'] = (current_year - by).astype(int)
else:
    df_proc['BuildingAge'] = 60  # default age when there is no build-year column

# Feature lists, restricted to the columns that actually exist in the data
numeric_features = [c for c in ['Area','NumberOfRooms','Floor','Floors','BuildingAge'] if c in df_proc.columns]
categorical_features = [c for c in ['Predict_State','Predicted_Loc','BuildingType','TypeOfMarket','Type','OfferFrom'] if c in df_proc.columns]

# Fill missing values in the features.
# NOTE(review): medians are computed on the whole frame, i.e. before the
# train/validation split, so validation rows influence the imputed value.
for c in numeric_features:
    # Assign the result back instead of calling fillna(inplace=True) on df_proc[c]:
    # the in-place form acts on a temporary object, which pandas 2.x warns about and
    # which does not update df_proc under Copy-on-Write.
    df_proc[c] = df_proc[c].fillna(df_proc[c].median())
for c in categorical_features:
    # Fill real missing values first (after astype(str) they would already be text),
    # then also map the literal strings 'nan' / 'None' to the same placeholder.
    df_proc[c] = df_proc[c].fillna('unknown').astype(str).replace({'nan':'unknown','None':'unknown'})

print("Użyte cechy numeryczne:", numeric_features)
print("Użyte cechy kategoryczne:", categorical_features)

# Log-transform the price (the target); the model is trained on log1p(Price).
df_proc['Price_log'] = np.log1p(df_proc['Price'])

Użyte cechy numeryczne: ['Area', 'NumberOfRooms', 'BuildingAge']
Użyte cechy kategoryczne: ['Predict_State', 'Predicted_Loc', 'BuildingType', 'TypeOfMarket', 'Type', 'OfferFrom']


In [4]:
# CELL 4: Podział na zbiory i tworzenie tf.data.Dataset

from sklearn.model_selection import train_test_split

# The regression target is the log-transformed price; model inputs are the numeric
# columns followed by the categorical ones.
target = 'Price_log'
features = [*numeric_features, *categorical_features]

# Seeded random split: 20% of the rows are held out for validation.
train_df, val_df = train_test_split(df_proc, test_size=0.2, random_state=SEED)

def df_to_dataset(dataframe, shuffle=True, batch_size=256):
    """Build a batched, prefetching ``tf.data.Dataset`` of (features, label) pairs.

    Uses the module-level ``features`` (input column names), ``target`` (label
    column name) and ``SEED`` (shuffle seed).

    Args:
        dataframe: frame containing every column in ``features`` plus ``target``.
        shuffle: when true, shuffle with a buffer as large as the frame.
        batch_size: number of rows per batch.

    Returns:
        A dataset yielding ``(dict of column name -> values, labels)`` batches.
    """
    # Select only the needed columns; copying the whole frame just to pop the
    # label duplicates every unused column in memory.
    labels = dataframe[target]
    feature_columns = dict(dataframe[features])
    ds = tf.data.Dataset.from_tensor_slices((feature_columns, labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe), seed=SEED)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.AUTOTUNE)
    return ds

# Training batches are shuffled; validation batches keep the frame's row order.
train_ds = df_to_dataset(train_df, shuffle=True)
val_ds = df_to_dataset(val_df, shuffle=False)

In [5]:
# CELL 5: Model construction with preprocessing layers

# --- 1. Preprocessing layers ---
inputs = {}            # feature name -> keras.Input
encoded_features = []  # preprocessed tensors, in feature order

# Numeric features: one float input each, normalised with statistics adapted
# on the training split only.
for feature_name in numeric_features:
    numeric_input = keras.Input(shape=(1,), name=feature_name, dtype=tf.float32)
    inputs[feature_name] = numeric_input
    normalizer = layers.Normalization()
    normalizer.adapt(train_df[feature_name].to_numpy().reshape(-1, 1))
    encoded_features.append(normalizer(numeric_input))

# Categorical features: one string input each, one-hot encoded against the
# vocabulary of values seen in the training split.
for feature_name in categorical_features:
    string_input = keras.Input(shape=(1,), name=feature_name, dtype=tf.string)
    inputs[feature_name] = string_input
    vocab = train_df[feature_name].unique()
    lookup = layers.StringLookup(vocabulary=vocab, output_mode='one_hot')
    encoded_features.append(lookup(string_input))

# --- 2. Merge the preprocessed features ---
all_features = layers.Concatenate()(encoded_features)

# --- 3. Regression head (deep part) ---
# Two ReLU blocks (256 then 128 units), each followed by 30% dropout.
x = all_features
for hidden_units in (256, 128):
    x = layers.Dense(hidden_units, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
# Single linear output: the predicted log-price.
output = layers.Dense(1, name="price_log")(x)

model = keras.Model(inputs, output)

# MSE loss on the log target, with RMSE reported as a metric.
optimizer = keras.optimizers.Adam(learning_rate=0.001)
rmse_metric = keras.metrics.RootMeanSquaredError(name="rmse")
model.compile(optimizer=optimizer, loss="mean_squared_error", metrics=[rmse_metric])

model.summary()

In [6]:
# CELL 6: Model training
# Stop after 10 epochs without val_loss improvement and restore the best weights.
es = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)
# Multiply the learning rate by 0.2 after 3 stagnant epochs, never below 1e-6.
rlr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=3,
    min_lr=1e-6,
    verbose=1,
)
# Per-epoch metrics are written to a CSV file.
csv_logger = keras.callbacks.CSVLogger('training_log_price_v6.csv')

history = model.fit(train_ds, validation_data=val_ds, epochs=50, callbacks=[es, rlr, csv_logger])

Epoch 1/50
[1m3939/3939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 20ms/step - loss: 7.8152 - rmse: 2.4277 - val_loss: 0.3277 - val_rmse: 0.5724 - learning_rate: 0.0010
Epoch 2/50
[1m3939/3939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 23ms/step - loss: 1.5217 - rmse: 1.2332 - val_loss: 0.2247 - val_rmse: 0.4740 - learning_rate: 0.0010
Epoch 3/50
[1m3939/3939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 24ms/step - loss: 0.9863 - rmse: 0.9927 - val_loss: 0.2082 - val_rmse: 0.4562 - learning_rate: 0.0010
Epoch 4/50
[1m3939/3939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m126s[0m 24ms/step - loss: 0.6656 - rmse: 0.8154 - val_loss: 0.2049 - val_rmse: 0.4526 - learning_rate: 0.0010
Epoch 5/50
[1m3939/3939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 24ms/step - loss: 0.4408 - rmse: 0.6632 - val_loss: 0.1971 - val_rmse: 0.4440 - learning_rate: 0.0010
Epoch 6/50
[1m3939/3939[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 26ms

In [7]:
# CELL 7: Evaluation, saving the model, and a prediction test

print("\nOcena finalnego modelu na zbiorze walidacyjnym:")
# evaluate() returns the loss followed by the compiled metrics, so index 1 is the RMSE.
results = model.evaluate(val_ds)
print(f"Validation RMSE (on log scale): {results[1]:.4f}")

# Save the model; the preprocessing layers are part of the graph and are saved with it.
MODEL_SAVE_PATH = 'price_regressor_v6_with_preprocessing.keras'
model.save(MODEL_SAVE_PATH)
print(f"\nModel z warstwami preprocessingu zapisany w: {MODEL_SAVE_PATH}")


# --- INFERENCE TEST ---
# Load the saved model back from disk.
reloaded_model = keras.models.load_model(MODEL_SAVE_PATH)

# Take 5 random validation rows; the true prices are recovered by inverting log1p.
sample_df = val_df.sample(5, random_state=SEED)
sample_labels = np.expm1(sample_df.pop('Price_log'))

# Turn the sample into the dict-of-columns dataset that model.predict accepts.
sample_inputs = dict(sample_df[features])
sample_ds = tf.data.Dataset.from_tensor_slices(sample_inputs).batch(5)

# The model outputs log-prices; convert them back to the price scale.
predicted_price_log = reloaded_model.predict(sample_ds)
predicted_price = np.expm1(predicted_price_log.reshape(-1))

# Show true and predicted prices side by side.
comparison = pd.DataFrame({
    'Prawdziwa Cena': sample_labels,
    'Przewidziana Cena': predicted_price,
})
print("\n--- Test predykcji na 5 losowych próbkach ---")
display(comparison)


Ocena finalnego modelu na zbiorze walidacyjnym:
[1m985/985[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 8ms/step - loss: 0.1299 - rmse: 0.3604
Validation RMSE (on log scale): 0.3592

Model z warstwami preprocessingu zapisany w: price_regressor_v6_with_preprocessing.keras
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 164ms/step

--- Test predykcji na 5 losowych próbkach ---


Unnamed: 0,Prawdziwa Cena,Przewidziana Cena
1417762,300098.0,321019.19
210624,499900.0,464304.44
178267,255000.0,256246.09
1132538,2000000.0,675817.81
653728,290000.0,273343.94
