In [1]:
# CELL 1: Importy i konfiguracja
import os, re, numpy as np, pandas as pd, csv, gc
from datetime import datetime
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
# Fix the random seeds up front so that the split and training are repeatable.
SEED = 42
tf.keras.utils.set_random_seed(SEED)
np.random.seed(SEED)
# Render floats in DataFrame output with thousands separators and two decimals.
pd.options.display.float_format = '{:,.2f}'.format

In [2]:
# CELL 2: Load the CSV and map columns by positional index
PATH = 'Data_state_LSTM_predicted_full_v4_FINAL.csv'
try:
    # header=None + skiprows=1: the first line of the file is skipped and columns are
    # addressed purely by integer position (see column_index_map below).
    df = pd.read_csv(PATH, sep=';', encoding='utf-8-sig', header=None, skiprows=1, low_memory=False)
except Exception as e:
    # Chain the original exception so the underlying traceback is not lost.
    raise RuntimeError(f"Nie udało się wczytać pliku. Błąd: {e}") from e
print(f"Wczytano {len(df)} wierszy z pliku: {PATH}")

# Positional index in the raw file -> column name used by the rest of the notebook.
column_index_map = {
    0: 'SaleId', 3: 'Title', 4: 'Description', 5: 'Area', 6: 'Price',
    11: 'NumberOfRooms', 12: 'BuiltYear', 14: 'BuildingType', 16: 'OfferFrom',
    17: 'Floor', 18: 'Floors', 19: 'TypeOfMarket', 28: 'Type',
    54: 'Predicted_Loc', 55: 'Predict_State'
}
# Keep only the positions that actually exist in the loaded frame.
valid_index_map = {idx: name for idx, name in column_index_map.items() if idx < df.shape[1]}

# Price, Area and Description are accessed unconditionally by the following cell,
# so fail here with a clear message instead of a later KeyError.
missing_required = {'Price', 'Area', 'Description'} - set(valid_index_map.values())
if missing_required:
    raise RuntimeError(f"Brak wymaganych kolumn w pliku: {sorted(missing_required)}")

df_clean = df[list(valid_index_map.keys())].copy()
df_clean.columns = list(valid_index_map.values())
print(f"\nWybrano i przemianowano {len(df_clean.columns)} kluczowych kolumn.")

Wczytano 1467262 wierszy z pliku: Data_state_LSTM_predicted_full_v4_FINAL.csv

Wybrano i przemianowano 15 kluczowych kolumn.


In [3]:
# CELL 3: Feature engineering and outlier removal
df_proc = df_clean.copy()
del df_clean
gc.collect()

# Coerce price and area to numbers; values that cannot be parsed become NaN
# and the affected rows are dropped.
for numeric_col in ('Price', 'Area'):
    df_proc[numeric_col] = pd.to_numeric(df_proc[numeric_col], errors='coerce')
df_proc = df_proc.dropna(subset=['Price', 'Area'])

# Discard prices of 1000 or less, then keep only the 1st..99th percentile range
# (both ends inclusive) of what remains.
df_proc = df_proc[df_proc['Price'] > 1000]
q_low = df_proc['Price'].quantile(0.01)
q_high = df_proc['Price'].quantile(0.99)
df_proc = df_proc[df_proc['Price'].between(q_low, q_high)]
print(f"\nDane po usunięciu 2% skrajnych cen (outlierów): {df_proc.shape}")
def clean_text(s: str) -> str:
    s = (s or "").lower(); patterns = [r'oferta nie stanowi.*?oferty w rozumieniu kodeksu cywilnego', r'prosz[ąa] o kontakt.*', r'tylko u nas.*', r'nie pobieramy prowizji.*']
    for p in patterns: s = re.sub(p, ' ', s, flags=re.IGNORECASE)
    s = re.sub(r'[^a-zA-Ząćęłńóśźż\s]', ' ', s); s = re.sub(r'\s+', ' ', s).strip()
    return s
# Clean the free-text description (missing values become empty strings first).
df_proc['Description'] = df_proc['Description'].fillna('').astype(str).apply(clean_text)

# Coerce the remaining numeric-like columns; unparseable values become NaN.
for c in ['NumberOfRooms','Floor','Floors','BuiltYear']:
    if c in df_proc.columns:
        df_proc[c] = pd.to_numeric(df_proc[c], errors='coerce')

# Derive BuildingAge from BuiltYear.
# NOTE(review): the age is relative to the year in which the notebook is run,
# so the feature values shift when the notebook is re-run in a later year.
current_year = datetime.now().year
if 'BuiltYear' in df_proc.columns:
    by = df_proc['BuiltYear']
    # Fall back to 2000 when no build year is available at all.
    median_year = by.dropna().median() if not by.dropna().empty else 2000
    by = by.fillna(median_year).clip(1800, current_year + 1)
    df_proc['BuildingAge'] = (current_year - by).astype(int)
else:
    df_proc['BuildingAge'] = 60

numeric_features = [c for c in ['Area','NumberOfRooms','Floor','Floors','BuildingAge'] if c in df_proc.columns]
categorical_features = [c for c in ['Predict_State','Predicted_Loc','BuildingType','TypeOfMarket','Type','OfferFrom'] if c in df_proc.columns]
text_feature = 'Description'

# Median-impute numeric features. Assign the result back to the column instead of
# calling fillna(..., inplace=True) on the `df_proc[c]` selection: that chained
# in-place form is deprecated in pandas 2.x and leaves the frame unmodified
# under Copy-on-Write.
for c in numeric_features:
    df_proc[c] = df_proc[c].fillna(df_proc[c].median())

# Categorical features become plain strings with 'unknown' for anything missing.
# The missing-value mask is taken from the original column (before the string
# cast) so the outcome does not depend on how astype(str) renders NaN; literal
# 'nan'/'None' strings are mapped to 'unknown' as before.
for c in categorical_features:
    col = df_proc[c]
    df_proc[c] = col.astype(str).where(col.notna(), 'unknown').replace({'nan':'unknown','None':'unknown'})

print("\nUżyte cechy:", numeric_features + categorical_features + [text_feature])
# Regression target: log(1 + price).
df_proc['Price_log'] = np.log1p(df_proc['Price'])


Dane po usunięciu 2% skrajnych cen (outlierów): (1235203, 15)

Użyte cechy: ['Area', 'NumberOfRooms', 'Floor', 'Floors', 'BuildingAge', 'Predict_State', 'Predicted_Loc', 'BuildingType', 'TypeOfMarket', 'Type', 'OfferFrom', 'Description']


In [4]:
# CELL 4: Train/validation split and tf.data.Dataset construction
target = 'Price_log'
features = [*numeric_features, *categorical_features, text_feature]

# 80/20 split with a fixed random_state; the full frame is released afterwards.
train_df, val_df = train_test_split(df_proc, test_size=0.2, random_state=SEED)
del df_proc
gc.collect()
print(f"Zbiór treningowy: {train_df.shape}, Walidacyjny: {val_df.shape}")
def df_to_dataset(dataframe, shuffle=True, batch_size=256):
    """Build a batched, prefetched tf.data.Dataset of (features dict, label) pairs.

    Reads the module-level names `features` (columns to feed to the model),
    `target` (label column) and `SEED` (shuffle seed).

    Args:
        dataframe: Frame containing every column in `features` plus `target`.
        shuffle: When True, shuffle with a buffer as large as the frame.
        batch_size: Number of rows per batch.

    Returns:
        A dataset yielding ({column name: values}, float32 labels) batches.
    """
    # Read only the needed columns; the input frame is never modified, so no
    # copy of the whole frame (including columns that are not model inputs)
    # is required.
    labels = dataframe[target].to_numpy(dtype='float32')
    features_dict = {col: dataframe[col].to_numpy() for col in features}
    ds = tf.data.Dataset.from_tensor_slices((features_dict, labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe), seed=SEED)
    return ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
# Shuffled training stream and ordered validation stream.
train_ds = df_to_dataset(train_df, shuffle=True)
val_ds = df_to_dataset(val_df, shuffle=False)
# Feature-only view of the training rows, used to adapt the preprocessing layers.
adapt_ds = tf.data.Dataset.from_tensor_slices(
    {col: train_df[col] for col in features}
).batch(256)
print("Datasety gotowe.")

Zbiór treningowy: (988162, 17), Walidacyjny: (247041, 17)
Datasety gotowe.


In [5]:
# CELL 5: Build the model with in-graph preprocessing (numeric + categorical + text)
inputs = {}
encoded_features = []
print("Adaptacja warstw...")

# Numeric features: one float input each, standardized by a Normalization layer
# adapted on the training rows.
for fname in numeric_features:
    inputs[fname] = keras.Input(shape=(1,), name=fname, dtype=tf.float32)
    norm = layers.Normalization(axis=-1)
    norm.adapt(adapt_ds.map(lambda x: tf.expand_dims(x[fname], axis=-1)))
    encoded_features.append(norm(inputs[fname]))

# Categorical features: one string input each, one-hot encoded through a
# StringLookup whose vocabulary is learned from the training rows.
for fname in categorical_features:
    inputs[fname] = keras.Input(shape=(1,), name=fname, dtype=tf.string)
    lookup = layers.StringLookup(output_mode='one_hot')
    lookup.adapt(adapt_ds.map(lambda x: x[fname]))
    encoded_features.append(lookup(inputs[fname]))

# Text feature: multi-hot bag of words limited to 2000 tokens.
inputs[text_feature] = keras.Input(shape=(1,), name=text_feature, dtype=tf.string)
text_vec = layers.TextVectorization(max_tokens=2000, output_mode='multi_hot')
text_vec.adapt(adapt_ds.map(lambda x: x[text_feature]))
encoded_features.append(text_vec(inputs[text_feature]))

# Concatenate all encoded features and regress the log-price with a small MLP.
all_features = layers.Concatenate()(encoded_features)
x = layers.Dense(256, activation="relu")(all_features)
x = layers.Dropout(0.3)(x)
x = layers.Dense(128, activation="relu")(x)
x = layers.Dropout(0.3)(x)
output = layers.Dense(1, name="price_log")(x)

model = keras.Model(inputs, output)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001),
    loss="mean_squared_error",
    metrics=[keras.metrics.RootMeanSquaredError(name="rmse")],
)
model.summary()

Adaptacja warstw...


In [6]:
# CELL 6: Model training
# Stop after 10 epochs without val_loss improvement and restore the best weights.
es = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)
# Multiply the learning rate by 0.2 after 3 stagnant epochs, never below 1e-6.
rlr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=3,
    min_lr=1e-6,
    verbose=1,
)
# Persist per-epoch metrics to a CSV file.
csv_logger = keras.callbacks.CSVLogger('training_log_price_v7.csv')
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=50,
    callbacks=[es, rlr, csv_logger],
)

Epoch 1/50
[1m3861/3861[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m721s[0m 178ms/step - loss: 5.9423 - rmse: 2.2072 - val_loss: 0.2281 - val_rmse: 0.4776 - learning_rate: 0.0010
Epoch 2/50
[1m3861/3861[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m750s[0m 186ms/step - loss: 1.3532 - rmse: 1.1622 - val_loss: 0.1119 - val_rmse: 0.3346 - learning_rate: 0.0010
Epoch 3/50
[1m3861/3861[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m769s[0m 191ms/step - loss: 0.7664 - rmse: 0.8751 - val_loss: 0.0844 - val_rmse: 0.2905 - learning_rate: 0.0010
Epoch 4/50
[1m3861/3861[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m784s[0m 194ms/step - loss: 0.4679 - rmse: 0.6837 - val_loss: 0.0901 - val_rmse: 0.3002 - learning_rate: 0.0010
Epoch 5/50
[1m3861/3861[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m791s[0m 196ms/step - loss: 0.2748 - rmse: 0.5238 - val_loss: 0.0727 - val_rmse: 0.2696 - learning_rate: 0.0010
Epoch 6/50
[1m3861/3861[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m793s[0m

In [7]:
# CELL 7: Final evaluation, saving, and prediction with business rules

MODEL_SAVE_PATH = 'price_regressor_v7_final.keras'

# --- Evaluate the raw model ---
print("\n--- Ocena surowego modelu na zbiorze walidacyjnym ---")
results = model.evaluate(val_ds, verbose=0, return_dict=True)
print(f"Validation RMSE (on log scale): {results['rmse']:.4f}")
model.save(MODEL_SAVE_PATH)
print(f"\nModel z warstwami preprocessingu zapisany w: {MODEL_SAVE_PATH}")


# --- Full prediction and application of the business rules ---
print("\nObliczanie efektywności i stosowanie reguł biznesowych...")
# Predict with the model reloaded from disk, using larger batches than in training.
reloaded_model = keras.models.load_model(MODEL_SAVE_PATH)
val_ds_full = df_to_dataset(val_df, shuffle=False, batch_size=2048)

# Take the true prices from the original DataFrame so its index is preserved.
true_prices_series = val_df['Price']
# Predict log-prices and invert the log1p transform applied to the target.
predicted_price_log_full = reloaded_model.predict(val_ds_full, verbose=0)
predicted_prices_raw = np.expm1(predicted_price_log_full.reshape(-1))

# Working frame for the calculations below; rows follow the order of val_df.
results_df = pd.DataFrame(
    {
        'TruePrice': true_prices_series,
        'PredictedPriceRaw': predicted_prices_raw,
    },
    index=true_prices_series.index,
)


# --- Funkcja implementująca Twoje reguły ---
def apply_business_rules(predicted_price, offer_price):
    """Pull a model prediction towards the offer price using two bands.

    * Predictions within +/-5% of the offer price (inclusive) are replaced by
      the offer price itself.
    * All other predictions are clipped to +/-10% of the offer price.

    The result is rounded to whole units.

    Args:
        predicted_price: Scalar or array-like of model predictions.
        offer_price: Scalar or array-like of offer prices, broadcastable
            against `predicted_price`.

    Returns:
        Rounded final prices as produced by `np.where` / `np.round`.
    """
    hard_lo, hard_hi = offer_price * 0.90, offer_price * 1.10
    soft_lo, soft_hi = offer_price * 0.95, offer_price * 1.05

    within_soft_band = np.logical_and(predicted_price >= soft_lo, predicted_price <= soft_hi)
    capped = np.clip(predicted_price, hard_lo, hard_hi)
    return np.round(np.where(within_soft_band, offer_price, capped))

results_df['FinalPrice'] = apply_business_rules(results_df['PredictedPriceRaw'], results_df['TruePrice'])


# --- Effectiveness (before and after the rules) ---
# NOTE(review): FinalPrice is derived from TruePrice (clipped to +/-10% of it), so
# the "after rules" errors below cannot exceed 10% by construction; they do not
# measure the model's accuracy on unseen prices.
ape_raw = np.abs((results_df['TruePrice'] - results_df['PredictedPriceRaw']) / results_df['TruePrice'])
ape_final = np.abs((results_df['TruePrice'] - results_df['FinalPrice']) / results_df['TruePrice'])

mape_raw = np.mean(ape_raw) * 100
median_ape_raw = np.median(ape_raw) * 100
mape_final = np.mean(ape_final) * 100
median_ape_final = np.median(ape_final) * 100

print("\n" + "="*55)
print("--- Efektywność Procentowa (Przed i Po Regułach) ---")
print("="*55)
print(f"Mediana błędu (surowy model):   {median_ape_raw:.2f}%")
print(f"Mediana błędu (PO REGUŁACH):    {median_ape_final:.2f}%  <-- Twoja główna miara sukcesu")
print("-" * 55)
print(f"Średni błąd MAPE (surowy model): {mape_raw:.2f}%")
print(f"Średni błąd MAPE (PO REGUŁACH): {mape_final:.2f}%")
print("="*55)


# --- Inspect a random sample ---
print("\n--- Test predykcji z regułami biznesowymi (15 losowych próbek) ---")
sample_indices = val_df.sample(15, random_state=SEED).index
sample_results = results_df.loc[sample_indices]

# Attach descriptive columns for display and index the table by listing id.
sample_results = sample_results.join(val_df[['SaleId', 'Area', 'Predict_State', 'Predicted_Loc']])
sample_results = sample_results.set_index('SaleId')

display_columns = ['TruePrice', 'PredictedPriceRaw', 'FinalPrice', 'Area', 'Predict_State', 'Predicted_Loc']
display(sample_results[display_columns].rename(columns={
    'TruePrice': 'Cena Ofertowa',
    'PredictedPriceRaw': 'Predykcja Surowa',
    'FinalPrice': 'Cena Finalna'
}))


--- Ocena surowego modelu na zbiorze walidacyjnym ---
Validation RMSE (on log scale): 0.2115

Model z warstwami preprocessingu zapisany w: price_regressor_v7_final.keras

Obliczanie efektywności i stosowanie reguł biznesowych...

--- Efektywność Procentowa (Przed i Po Regułach) ---
Mediana błędu (surowy model):   11.67%
Mediana błędu (PO REGUŁACH):    10.00%  <-- Twoja główna miara sukcesu
-------------------------------------------------------
Średni błąd MAPE (surowy model): 16.20%
Średni błąd MAPE (PO REGUŁACH): 7.09%

--- Test predykcji z regułami biznesowymi (15 losowych próbek) ---


Unnamed: 0_level_0,Cena Ofertowa,Predykcja Surowa,Cena Finalna,Area,Predict_State,Predicted_Loc
SaleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3847651,219000.0,265189.56,240900.0,39.0,FOR_RENOVATION,Katowice -> ? -> ? -> ?
3920583,219000.0,322021.56,240900.0,46.8,FOR_RENOVATION,Jawor -> ? -> ? -> ?
2797305,549000.0,665302.81,603900.0,41.0,GOOD,Wrocław -> Wrocław-fabryczna -> ? -> ?
5052646,1000000.0,1151269.88,1100000.0,60.0,GOOD,Kraków -> Kraków-krowodrza -> Prądnik biały -> ?
3210230,350000.0,247995.81,315000.0,51.0,GOOD,Bydgoszcz -> ? -> ? -> ?
5109215,509000.0,486603.62,509000.0,40.0,GOOD,Poznań -> Poznań-nowe miasto -> Rataje -> ?
3523743,549000.0,460055.88,494100.0,37.71,FOR_RENOVATION,Kraków -> Kraków-podgórze -> ? -> ?
3861971,329000.0,320051.09,329000.0,54.37,GOOD,Sosnowiec -> ? -> ? -> ?
421056,399000.0,364448.34,364448.0,39.0,DEVELOPER_STATE,Sosnowiec -> ? -> ? -> ?
2014107,135000.0,230921.69,148500.0,36.25,AFTER_RENOVATION,Świętochłowice -> ? -> ? -> ?
