In [1]:
# CELL 1: konfiguracja, wczytanie, inżynieria cech, preprocesing (z poprawkami: regex year + imputacja + SVD safe)

import os, csv, re, json, gc, math, datetime as dt
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, f1_score
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# (opcjonalnie) deterministyczność
import random, tensorflow as tf
SEED = 42
np.random.seed(SEED); random.seed(SEED); tf.random.set_seed(SEED)

# 4 źródła
FILES = {
    'sf_after_renovation.csv': 'AFTER_RENOVATION',
    'sf_developer_state.csv' : 'DEVELOPER_STATE',
    'sf_for_renovation.csv'  : 'FOR_RENOVATION',
    'sf_good.csv'            : 'GOOD',
}

label_names  = ['AFTER_RENOVATION','DEVELOPER_STATE','FOR_RENOVATION','GOOD']
label_to_idx = {name: i for i, name in enumerate(label_names)}

# solidny odczyt CSV (jak w v4)
def robust_read_records(path, encoding_candidates=('utf-8-sig','utf-8','cp1250','latin1')):
    last_err = None
    for enc in encoding_candidates:
        try:
            rows = []
            with open(path, 'r', encoding=enc, errors='replace', newline='') as f:
                reader = csv.reader(f, delimiter=',', quotechar='"', escapechar='\\')
                for row in reader:
                    if not row or (len(row) == 1 and not str(row).strip()):
                        continue
                    rows.append(row)
            if not rows: continue
            median_len = int(np.median([len(r) for r in rows]))
            if median_len < 10: continue
            return rows
        except Exception as e:
            last_err = e
            continue
    if last_err: raise last_err
    raise RuntimeError(f'Unable to parse CSV: {path}')

# mapowanie indeksów kolumn (jak w v4)
IDX_MAP = {
    'Description': 4,
    'Area': 5,
    'Price': 6,
    'NumberOfRooms': 11,
    'BuiltYear': 12,
    'BuildingType': 14,
    'OfferFrom': 16,
    'Floor': 17,
    'Floors': 18,
    'TypeOfMarket': 19,
}
IDX_LABEL = 15

def extract_required_df(rows, force_label):
    sel = {}
    for name, idx in IDX_MAP.items():
        sel[name] = [(r[idx] if len(r) > idx else None) for r in rows]
    out = pd.DataFrame(sel)
    out['BuildingCondition'] = [force_label for _ in rows]
    return out

frames = []
for path, label in FILES.items():
    if not os.path.exists(path):
        raise FileNotFoundError(f'Brak pliku: {path}')
    rows    = robust_read_records(path)
    df_part = extract_required_df(rows, force_label=label)
    frames.append(df_part)

full = pd.concat(frames, ignore_index=True)

# ——— czyszczenie tekstu ———
def clean_text(s: str) -> str:
    s = s or ""
    s = s.lower()
    # typowe disclaimers i powtarzalne frazy
    patterns = [
        r'oferta nie stanowi.*?oferty w rozumieniu kodeksu cywilnego',
        r'prosz[ąa] o kontakt.*',
        r'tylko u nas.*',
        r'nie pobieramy prowizji.*',
    ]
    for p in patterns:
        s = re.sub(p, ' ', s, flags=re.IGNORECASE)
    # telefony, e-maile, linki
    s = re.sub(r'\b\d{3}[-\s]?\d{3}[-\s]?\d{3,4}\b', ' ', s)
    s = re.sub(r'[\w\.-]+@[\w\.-]+', ' ', s)
    s = re.sub(r'http\S+|www\.\S+', ' ', s)
    # białe znaki
    s = re.sub(r'\s+', ' ', s).strip()
    return s

full['Description'] = full['Description'].fillna('').astype(str).apply(clean_text)

# ——— liczby ———
for col in ['Area','Price','NumberOfRooms','Floor','Floors']:
    full[col] = pd.to_numeric(full[col], errors='coerce')

# ——— bezpieczny parsing roku (regex) zamiast bezpośredniego to_datetime, aby uniknąć ostrzeżeń i niespójności ———
def extract_year(val):
    s = str(val)
    m = re.search(r'(?:18|19|20)\d{2}', s)
    if m:
        y = int(m.group(0))
        if 1800 <= y <= dt.datetime.now().year + 1:
            return y
    return np.nan

years_rx = full['BuiltYear'].apply(extract_year)
full['year'] = years_rx
full['year'] = full['year'].fillna(full['year'].median())

# ——— inżynieria cech ———
# price_per_m2, log1p, winsoryzacja skrajności dla Area/Price/ppm
full['price_per_m2'] = np.where((full['Area']>0) & (full['Price']>0), full['Price'] / full['Area'], np.nan)

# winsoryzacja 1–99 percentyl
for col in ['Area','Price','price_per_m2']:
    q01, q99 = full[col].quantile(0.01), full[col].quantile(0.99)
    full[col] = full[col].clip(lower=q01, upper=q99)

full['log_area']  = np.log1p(full['Area'])
full['log_price'] = np.log1p(full['Price'])
full['log_ppm']   = np.log1p(full['price_per_m2'])

# ——— kategorie ———
for col in ['BuildingType','OfferFrom','TypeOfMarket']:
    full[col] = full[col].fillna('unknown').astype(str).str.strip().str.lower()

def norm_market(v):
    v = (v or '').lower()
    if 'pierwot' in v: return 'pierwotny'
    if 'wtór' in v or 'wtorn' in v: return 'wtórny'
    return v if v else 'unknown'
full['TypeOfMarket'] = full['TypeOfMarket'].apply(norm_market)

# redukcja rzadkich kategorii (top-K)
def topk_map(series, k=30):
    top = series.value_counts().nlargest(k).index
    return series.where(series.isin(top), other='other')

full['BuildingType'] = topk_map(full['BuildingType'], k=30)
full['OfferFrom']    = topk_map(full['OfferFrom'],    k=30)
# TypeOfMarket ma niewiele poziomów — po normalizacji zostawiamy

# ——— etykiety ———
y_idx = full['BuildingCondition'].map(label_to_idx).astype(int).values
y     = to_categorical(y_idx, num_classes=len(label_names))

# ——— tokenizacja tekstu ———
max_words, max_len = 30000, 250
tokenizer = Tokenizer(num_words=max_words, oov_token="")
tokenizer.fit_on_texts(full['Description'].astype(str))
X_text = pad_sequences(tokenizer.texts_to_sequences(full['Description'].astype(str)), maxlen=max_len)

# ——— cechy tabelaryczne ———
numeric_features = ['Area','Price','NumberOfRooms','Floor','Floors','year',
                    'price_per_m2','log_area','log_price','log_ppm']
categorical_features = ['BuildingType','OfferFrom','TypeOfMarket']

# Imputacja braków w gałęzi numerycznej + skalowanie, aby usunąć NaN-y przed SVD
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipe, numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

# Fit-transform do macierzy cech (sparse)
X_tab_sparse = preprocessor.fit_transform(full[numeric_features + categorical_features])

# Redukcja wymiaru OneHot -> gęste n_comp (SVD) z bezpiecznikiem na liczbę kolumn
n_comp = min(256, max(2, X_tab_sparse.shape[1] - 1))
svd = TruncatedSVD(n_components=n_comp, random_state=SEED)
X_tabular = svd.fit_transform(X_tab_sparse)

# Split
X_text_train, X_text_test, X_tab_train, X_tab_test, y_train, y_test, y_idx_train, y_idx_test = train_test_split(
    X_text, X_tabular, y, y_idx, test_size=0.2, random_state=SEED, stratify=y_idx
)

# Wagi klas + boost GOOD
classes = np.unique(y_idx_train)
cw = compute_class_weight(class_weight='balanced', classes=classes, y=y_idx_train)
class_weight = {int(c): float(w) for c, w in zip(classes, cw)}
class_weight[label_to_idx['GOOD']] = class_weight.get(label_to_idx['GOOD'], 1.0) * 1.15
print("\nClass weights (GOOD boosted):", class_weight)

# Raport rozkładu
counts = full['BuildingCondition'].value_counts(dropna=False)
perc   = (counts / len(full) * 100).round(2)
balance= pd.DataFrame({'count': counts, 'percent': perc}).sort_index()
print("\n=== Balans klas przed treningiem ===")
print(balance)



Class weights (GOOD boosted): {0: 0.9944602272727273, 1: 0.9944602272727273, 2: 1.0169959325973272, 3: 1.1436292613636363}

=== Balans klas przed treningiem ===
                   count  percent
BuildingCondition                
AFTER_RENOVATION   22000    25.14
DEVELOPER_STATE    22000    25.14
FOR_RENOVATION     21513    24.58
GOOD               22000    25.14


In [2]:
# CELL 2: model BiLSTM + mocniejsza gałąź tabelaryczna, AdamW, macro-F1 callback i trening

from tensorflow.keras.layers import Input, Embedding, LSTM, GRU, Bidirectional, Dropout, Dense, Concatenate, SpatialDropout1D, BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import Callback, EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, CSVLogger
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.optimizers import AdamW  # tf.keras >= 2.11
import numpy as np

# Callback macro-F1 (jak w v4)
class ValMacroF1(Callback):
    def __init__(self, val_data, batch_size=1024, verbose=1):
        super().__init__()
        self.X_text_val, self.X_tab_val, self.y_val = val_data
        self.batch_size = batch_size
        self.verbose = verbose
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        y_pred_proba = self.model.predict([self.X_text_val, self.X_tab_val],
                                          batch_size=self.batch_size, verbose=0)
        y_pred = np.argmax(y_pred_proba, axis=1)
        y_true = np.argmax(self.y_val, axis=1)
        macro_f1 = f1_score(y_true, y_pred, average='macro')
        logs['val_macro_f1'] = macro_f1
        if self.verbose:
            print(f"\\nval_macro_f1: {macro_f1:.4f}")

# Tekst: Embedding + SpatialDropout + BiLSTM
text_input = Input(shape=(max_len,), name='text_input')
emb = Embedding(input_dim=max_words, output_dim=200)(text_input)
emb = SpatialDropout1D(0.2)(emb)
text_repr = Bidirectional(LSTM(64, recurrent_dropout=0.2))(emb)
text_repr = Dropout(0.4)(text_repr)

# Tabela: Dense 128->64 z BN i Dropout
tabular_input = Input(shape=(X_tab_train.shape[1],), name='tabular_input')
tab = Dense(128, activation='relu')(tabular_input)
tab = BatchNormalization()(tab)
tab = Dropout(0.3)(tab)
tab = Dense(64, activation='relu')(tab)
tab = BatchNormalization()(tab)
tab = Dropout(0.3)(tab)

# Fuzja i klasyfikacja
x = Concatenate()([text_repr, tab])
x = Dense(128, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.4)(x)
output = Dense(len(label_names), activation='softmax')(x)

model = Model(inputs=[text_input, tabular_input], outputs=output)
model.summary()

# Kompilacja: AdamW + label smoothing 0.02
model.compile(
    optimizer=AdamW(learning_rate=1e-3, weight_decay=1e-4),
    loss=CategoricalCrossentropy(label_smoothing=0.02),
    metrics=['accuracy']
)

macro_cb = ValMacroF1(val_data=(X_text_test, X_tab_test, y_test), batch_size=1024, verbose=1)
es  = EarlyStopping(monitor='val_macro_f1', mode='max', patience=5, restore_best_weights=True, verbose=1)
ckp = ModelCheckpoint('model_best_macro_f1.keras', monitor='val_macro_f1', mode='max', save_best_only=True, verbose=1)
rlr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-5, verbose=1)
csv = CSVLogger('training_log.csv', append=False)

print("\\nTrening v5: BiLSTM+SVD, 25 epok, ES/CKPT po val_macro_f1, RLR po val_loss ...")
history = model.fit(
    [X_text_train, X_tab_train], y_train,
    epochs=25,
    batch_size=128,
    validation_data=([X_text_test, X_tab_test], y_test),
    callbacks=[macro_cb, es, ckp, rlr, csv],
    class_weight=class_weight
)


\nTrening v5: BiLSTM+SVD, 25 epok, ES/CKPT po val_macro_f1, RLR po val_loss ...
Epoch 1/25
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.4970 - loss: 1.1328\nval_macro_f1: 0.6318

Epoch 1: val_macro_f1 improved from -inf to 0.63182, saving model to model_best_macro_f1.keras
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1107s[0m 2s/step - accuracy: 0.4971 - loss: 1.1325 - val_accuracy: 0.6408 - val_loss: 0.8115 - val_macro_f1: 0.6318 - learning_rate: 0.0010
Epoch 2/25
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2s/step - accuracy: 0.6496 - loss: 0.8519\nval_macro_f1: 0.6499

Epoch 2: val_macro_f1 improved from 0.63182 to 0.64994, saving model to model_best_macro_f1.keras
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1045s[0m 2s/step - accuracy: 0.6496 - loss: 0.8519 - val_accuracy: 0.6573 - val_loss: 0.7990 - val_macro_f1: 0.6499 - learning_rate: 0.0010
Epoch 3/25
[1m547/547[0m [32m━━━━━━━━━━━

In [3]:
# CELL 3: ocena i raport

loss, accuracy = model.evaluate([X_text_test, X_tab_test], y_test)
print(f"\\nDokładność na zbiorze testowym: {accuracy:.4f}")

y_pred_proba = model.predict([X_text_test, X_tab_test])
y_pred = np.argmax(y_pred_proba, axis=1)
y_true = np.argmax(y_test, axis=1)

print("\\nRaport klasyfikacji na zbiorze testowym:")
print(classification_report(y_true, y_pred, target_names=label_names))


[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 135ms/step - accuracy: 0.8453 - loss: 0.5820
\nDokładność na zbiorze testowym: 0.8480
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 131ms/step
\nRaport klasyfikacji na zbiorze testowym:
                  precision    recall  f1-score   support

AFTER_RENOVATION       0.81      0.82      0.81      4400
 DEVELOPER_STATE       0.98      0.97      0.97      4400
  FOR_RENOVATION       0.88      0.84      0.86      4303
            GOOD       0.73      0.77      0.75      4400

        accuracy                           0.85     17503
       macro avg       0.85      0.85      0.85     17503
    weighted avg       0.85      0.85      0.85     17503



In [4]:
# CELL 4: zapis artefaktów (model, tokenizer, preprocessor, SVD, mapowania)

import joblib, json

model.save('model_lstm_stan.keras')

with open('tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer.to_json(), ensure_ascii=False))

joblib.dump(preprocessor, 'preprocessor.joblib')
joblib.dump(svd, 'svd_256.joblib')

label_mapping = {i: name for i, name in enumerate(label_names)}
with open('label_mapping.json', 'w', encoding='utf-8') as f:
    json.dump({str(k): v for k, v in label_mapping.items()}, f, ensure_ascii=False)

columns_for_prediction = numeric_features + categorical_features
joblib.dump(columns_for_prediction, 'columns_for_prediction.joblib')

print("Zapisano: model_lstm_stan.keras, tokenizer.json, preprocessor.joblib, svd_256.joblib, label_mapping.json, columns_for_prediction.joblib")


Zapisano: model_lstm_stan.keras, tokenizer.json, preprocessor.joblib, svd_256.joblib, label_mapping.json, columns_for_prediction.joblib


In [5]:
# CELL 4.5 (poprawiony i odporny na błędy CSV): Szybka regeneracja brakujących artefaktów

print("Rozpoczynam regenerację brakujących parametrów z oryginalnych plików CSV...")

import pandas as pd
import numpy as np
import joblib
import re
import datetime as dt
import csv # Potrzebne dla robust_read_records
import os

# --- Kopiujemy niezbędne funkcje i definicje z komórki 1 ---

# Twoja oryginalna, solidna funkcja do wczytywania CSV
def robust_read_records(path, encoding_candidates=('utf-8-sig','utf-8','cp1250','latin1')):
    last_err = None
    for enc in encoding_candidates:
        try:
            rows = []
            with open(path, 'r', encoding=enc, errors='replace', newline='') as f:
                reader = csv.reader(f, delimiter=',', quotechar='"', escapechar='\\')
                for row in reader:
                    if not row or (len(row) == 1 and not str(row).strip()):
                        continue
                    rows.append(row)
            if not rows: continue
            median_len = int(np.median([len(r) for r in rows]))
            if median_len < 10: continue
            return rows
        except Exception as e:
            last_err = e
            continue
    if last_err: raise last_err
    raise RuntimeError(f'Unable to parse CSV: {path}')

# Twoje oryginalne mapowanie indeksów
IDX_MAP = {
    'Description': 4, 'Area': 5, 'Price': 6, 'NumberOfRooms': 11, 'BuiltYear': 12,
    'BuildingType': 14, 'OfferFrom': 16, 'Floor': 17, 'Floors': 18, 'TypeOfMarket': 19,
}

# Twoja oryginalna funkcja do ekstrakcji DataFrame
def extract_required_df(rows, force_label):
    sel = {}
    for name, idx in IDX_MAP.items():
        sel[name] = [(r[idx] if len(r) > idx else None) for r in rows]
    out = pd.DataFrame(sel)
    out['BuildingCondition'] = [force_label for _ in rows]
    return out

# Funkcja do parsowania roku
def extract_year(val):
    s = str(val)
    m = re.search(r'(?:18|19|20)\d{2}', s)
    if m:
        y = int(m.group(0))
        if 1800 <= y <= dt.datetime.now().year + 1: return y
    return np.nan

# Wczytanie i złączenie oryginalnych danych treningowych (używając Twojego kodu)
FILES = { 'sf_after_renovation.csv': 'AFTER_RENOVATION', 'sf_developer_state.csv' : 'DEVELOPER_STATE', 'sf_for_renovation.csv'  : 'FOR_RENOVATION', 'sf_good.csv' : 'GOOD', }
frames = []
for path, label in FILES.items():
    if not os.path.exists(path): raise FileNotFoundError(f'Brak pliku: {path}')
    rows    = robust_read_records(path)
    df_part = extract_required_df(rows, force_label=label)
    frames.append(df_part)
full = pd.concat(frames, ignore_index=True)
print("Dane treningowe wczytane poprawnie.")

# --- Przetwarzanie danych w celu wyliczenia i zapisu parametrów ---
for col in ['Area','Price','NumberOfRooms','Floor','Floors']: full[col] = pd.to_numeric(full[col], errors='coerce')

# 1. Oblicz i zapisz medianę roku
full['year'] = full['BuiltYear'].apply(extract_year)
median_year_for_imputation = full['year'].median()
joblib.dump(median_year_for_imputation, 'median_year.joblib')
print(f"Zapisano median_year.joblib (wartość: {median_year_for_imputation})")

# 2. Oblicz i zapisz parametry winsoryzacji
full['price_per_m2'] = np.where((full['Area']>0) & (full['Price']>0), full['Price'] / full['Area'], np.nan)
winsor_params = {}
for col in ['Area','Price','price_per_m2']:
    q01, q99 = full[col].quantile(0.01), full[col].quantile(0.99)
    winsor_params[col] = {'lower': q01, 'upper': q99}
joblib.dump(winsor_params, 'winsor_params.joblib')
print("Zapisano winsor_params.joblib")

# 3. Oblicz i zapisz parametry top-K
for col in ['BuildingType','OfferFrom','TypeOfMarket']: full[col] = full[col].fillna('unknown').astype(str).str.strip().str.lower()
top_k_params = {}
def topk_map_and_capture(series, col_name, k=30):
    top = series.value_counts().nlargest(k).index
    top_k_params[col_name] = list(top)
    return series.where(series.isin(top), other='other')

_ = topk_map_and_capture(full['BuildingType'], 'BuildingType', k=30)
_ = topk_map_and_capture(full['OfferFrom'], 'OfferFrom', k=30)
joblib.dump(top_k_params, 'top_k_params.joblib')
print("Zapisano top_k_params.joblib")

print("\nRegeneracja artefaktów zakończona. Możesz teraz uruchomić poprawioną komórkę 5.")

Rozpoczynam regenerację brakujących parametrów z oryginalnych plików CSV...
Dane treningowe wczytane poprawnie.
Zapisano median_year.joblib (wartość: 1995.0)
Zapisano winsor_params.joblib
Zapisano top_k_params.joblib

Regeneracja artefaktów zakończona. Możesz teraz uruchomić poprawioną komórkę 5.


In [6]:
# CELL 5 (FINALNA, POPRAWIONA WERSJA): Inference z pełną spójnością i poprawnym wyświetlaniem

import os, json, re, gc, datetime as dt
import numpy as np
import pandas as pd
import joblib
import tensorflow as tf
import csv
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from tensorflow.keras.preprocessing.sequence import pad_sequences

# --- Ścieżki do plików ---
IN_PATH  = 'saleflats_2024_2025_v2_WITH_PREDICTIONS.csv'
OUT_PATH = 'Data_state_LSTM_predicted_full_v4_FINAL.csv' # Nowa nazwa pliku wyjściowego

# --- Artefakty z treningu ---
MODEL_PATH = 'model_lstm_stan.keras'
TOKENIZER_PATH = 'tokenizer.json'
PREPROC_PATH = 'preprocessor.joblib'
SVD_PATH = 'svd_256.joblib'
LABEL_MAP_PATH = 'label_mapping.json'
COLS_FOR_PRED_PATH = 'columns_for_prediction.joblib'
TOP_K_PATH = 'top_k_params.joblib'
WINSOR_PATH = 'winsor_params.joblib'
MEDIAN_YEAR_PATH = 'median_year.joblib'
MAX_LEN = 250

# --- Funkcje pomocnicze ---
def robust_read_records(path, encoding_candidates=('utf-8-sig','utf-8','cp1250','latin1')):
    last_err = None
    for enc in encoding_candidates:
        try:
            rows = []
            with open(path, 'r', encoding=enc, errors='replace', newline='') as f:
                reader = csv.reader(f, delimiter=',', quotechar='"', escapechar='\\')
                for row in reader:
                    if not row or (len(row) == 1 and not str(row).strip()):
                        continue
                    rows.append(row)
            if not rows: continue
            median_len = int(np.median([len(r) for r in rows]))
            if median_len < 10: continue
            print(f"Plik wczytany pomyślnie: encoding='{enc}', znaleziono wierszy: {len(rows)}")
            return rows
        except Exception as e:
            last_err = e
            continue
    if last_err: raise last_err
    raise RuntimeError(f'Unable to parse CSV: {path}')

IDX_MAP = {
    'Description': 4, 'Area': 5, 'Price': 6, 'NumberOfRooms': 11, 'BuiltYear': 12,
    'BuildingType': 14, 'OfferFrom': 16, 'Floor': 17, 'Floors': 18, 'TypeOfMarket': 19,
}

def extract_required_df_from_rows(rows):
    sel = {}
    for name, idx in IDX_MAP.items():
        sel[name] = [(r[idx] if len(r) > idx else None) for r in rows]
    return pd.DataFrame(sel)

def clean_text(s: str) -> str:
    s = (s or "").lower()
    patterns = [r'oferta nie stanowi.*?oferty w rozumieniu kodeksu cywilnego', r'prosz[ąa] o kontakt.*', r'tylko u nas.*', r'nie pobieramy prowizji.*']
    for p in patterns: s = re.sub(p, ' ', s, flags=re.IGNORECASE)
    s = re.sub(r'\b\d{3}[-\s]?\d{3}[-\s]?\d{3,4}\b', ' ', s)
    s = re.sub(r'[\w\.-]+@[\w\.-]+', ' ', s)
    s = re.sub(r'http\S+|www\.\S+', ' ', s)
    s = re.sub(r'\s+', ' ', s).strip()
    return s

def extract_year(val):
    s = str(val)
    m = re.search(r'(?:18|19|20)\d{2}', s)
    if m:
        y = int(m.group(0))
        if 1800 <= y <= dt.datetime.now().year + 1: return y
    return np.nan

def norm_market(v):
    v = (v or '').lower()
    if 'pierwot' in v: return 'pierwotny'
    if 'wtór' in v or 'wtorn' in v: return 'wtórny'
    return v if v else 'unknown'

# --- Główny skrypt predykcyjny ---
print("Wczytywanie wszystkich artefaktów...")
with open(TOKENIZER_PATH, 'r', encoding='utf-8') as f: tokenizer_data = json.load(f)
tokenizer = tokenizer_from_json(json.dumps(tokenizer_data) if isinstance(tokenizer_data, dict) else tokenizer_data)
preprocessor = joblib.load(PREPROC_PATH)
svd = joblib.load(SVD_PATH)
columns_for_prediction = joblib.load(COLS_FOR_PRED_PATH)
label_map = {int(k): v for k, v in json.load(open(LABEL_MAP_PATH, 'r', encoding='utf-8')).items()}
model = tf.keras.models.load_model(MODEL_PATH)
top_k_params = joblib.load(TOP_K_PATH)
winsor_params = joblib.load(WINSOR_PATH)
median_year = joblib.load(MEDIAN_YEAR_PATH)
print("Artefakty wczytane.")

print(f"Wczytywanie surowych danych z {IN_PATH}...")
all_rows = robust_read_records(IN_PATH)
header = all_rows[0]
data_rows = all_rows[1:]

df_src = pd.DataFrame(data_rows)
df_src.columns = header[:len(df_src.columns)]

print("Strukturyzowanie danych dla modelu...")
infer_df = extract_required_df_from_rows(data_rows)

print("Rozpoczynam spójny preprocessing...")
infer_df['Description'] = infer_df['Description'].fillna('').astype(str).apply(clean_text)
for col in ['Area','Price','NumberOfRooms','Floor','Floors']: infer_df[col] = pd.to_numeric(infer_df[col], errors='coerce')
infer_df['year'] = infer_df['BuiltYear'].apply(extract_year)
infer_df['year'] = infer_df['year'].fillna(median_year)
infer_df['price_per_m2'] = np.where((infer_df['Area']>0) & (infer_df['Price']>0), infer_df['Price']/infer_df['Area'], np.nan)
for col, params in winsor_params.items():
    if col in infer_df.columns: infer_df[col] = infer_df[col].clip(lower=params['lower'], upper=params['upper'])
infer_df['log_area']  = np.log1p(infer_df['Area'])
infer_df['log_price'] = np.log1p(infer_df['Price'])
infer_df['log_ppm']   = np.log1p(infer_df['price_per_m2'])
for col in ['BuildingType','OfferFrom','TypeOfMarket']: infer_df[col] = infer_df[col].fillna('unknown').astype(str).str.strip().str.lower()
infer_df['TypeOfMarket'] = infer_df['TypeOfMarket'].apply(norm_market)
for col, top_list in top_k_params.items():
    if col in infer_df.columns: infer_df[col] = infer_df[col].where(infer_df[col].isin(top_list), 'other')
print("Preprocessing zakończony.")

print("Transformacja danych i predykcja...")
X_text = pad_sequences(tokenizer.texts_to_sequences(infer_df['Description'].astype(str)), maxlen=MAX_LEN)
X_tab_sparse = preprocessor.transform(infer_df[columns_for_prediction])
X_tabular = svd.transform(X_tab_sparse)
y_pred_proba = model.predict([X_text, X_tabular], batch_size=2048, verbose=1)
y_pred_idx = np.argmax(y_pred_proba, axis=1)
predict_state = [label_map.get(i, 'UNKNOWN') for i in y_pred_idx]
print("Predykcja zakończona.")

df_out = df_src.copy()
df_out['Predict_State'] = predict_state
df_out.to_csv(OUT_PATH, index=False, encoding='utf-8-sig', sep=';')
print(f"\nZapisano wynik do: {OUT_PATH} (użyto separatora ';')")

# --- POPRAWIONA SEKCJA WYŚWIETLANIA WYNIKÓW ---
print("\nPodgląd wyników (15 pierwszych wierszy):")
pd.set_option('display.max_colwidth', 50) # Ograniczamy szerokość dla czytelności podglądu

# Bezpieczne znajdowanie kolumn po nazwach z nagłówka
def find_col_by_name(df, candidates):
    for c in candidates:
        if c in df.columns:
            return c
    return None

price_col_name = find_col_by_name(df_out, ['Price', 'price', 'Cena'])
area_col_name = find_col_by_name(df_out, ['Area', 'area', 'Powierzchnia'])
saleid_col_name = find_col_by_name(df_out, ['SaleId', 'id']) or df_out.columns[0] # Jeśli nie ma, bierz pierwszą

preview_cols = [c for c in [saleid_col_name, price_col_name, area_col_name, 'Predict_State'] if c is not None]

# Sprawdź, czy lista kolumn nie jest pusta
if preview_cols:
    display(df_out[preview_cols].head(15))
else:
    print("Nie udało się znaleźć kolumn do wyświetlenia podglądu.")

print("\nRozkład przewidzianych stanów:")
print(df_out['Predict_State'].value_counts())

Wczytywanie wszystkich artefaktów...
Artefakty wczytane.
Wczytywanie surowych danych z saleflats_2024_2025_v2_WITH_PREDICTIONS.csv...
Plik wczytany pomyślnie: encoding='utf-8-sig', znaleziono wierszy: 1467263
Strukturyzowanie danych dla modelu...
Rozpoczynam spójny preprocessing...
Preprocessing zakończony.
Transformacja danych i predykcja...
[1m717/717[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m469s[0m 653ms/step
Predykcja zakończona.

Zapisano wynik do: Data_state_LSTM_predicted_full_v4_FINAL.csv (użyto separatora ';')

Podgląd wyników (15 pierwszych wierszy):


Unnamed: 0,88,Predict_State
0,99,FOR_RENOVATION
1,115,AFTER_RENOVATION
2,140,GOOD
3,145,AFTER_RENOVATION
4,159,AFTER_RENOVATION
5,165,GOOD
6,173,GOOD
7,179,AFTER_RENOVATION
8,189,GOOD
9,208,GOOD



Rozkład przewidzianych stanów:
Predict_State
AFTER_RENOVATION    468889
DEVELOPER_STATE     465196
GOOD                361664
FOR_RENOVATION      171513
Name: count, dtype: int64


In [7]:
# CELL 6 (FINALNA ANALIZA): Wczytanie i analiza poprawnie wygenerowanego pliku

import pandas as pd
from IPython.display import display

PATH = 'Data_state_LSTM_predicted_full_v4_FINAL.csv'

# Ustawienia wyświetlania
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 120)

try:
    print(f"Wczytywanie finalnego pliku z wynikami: {PATH}")
    # Wczytujemy poprawny plik, pamiętając o separatorze ';'
    df = pd.read_csv(PATH, sep=';', low_memory=False)
    print(f"Wczytano {len(df)} wierszy.")

    # --- Krok 1: Wyświetl rozkład przewidzianych stanów ---
    print("\nRozkład przewidzianych stanów w pliku wynikowym:")
    print(df['Predict_State'].value_counts())
    
    # --- Krok 2: Wyświetl reprezentatywne próbki ---
    print("\nReprezentatywne przykłady dla każdej kategorii:")
    
    # Funkcja do bezpiecznego znajdowania nazw kolumn (na wszelki wypadek)
    def pick_col(df, candidates):
        # Sprawdzamy też nazwy z małych liter
        candidates_lower = [str(c).lower() for c in candidates]
        df_cols_lower = {c.lower(): c for c in df.columns}
        
        for cand_lower in candidates_lower:
            if cand_lower in df_cols_lower:
                return df_cols_lower[cand_lower]
        return None # Zwróć None, jeśli nic nie znaleziono

    # Identyfikujemy kolumny, których nazwy mogą się różnić
    # Używamy oryginalnych nazw z IDX_MAP oraz potencjalnych alternatyw
    saleid_col = pick_col(df, ['SaleId', 'id', df.columns[0]])
    title_col = pick_col(df, ['Title', 'tytuł', 'Mieszkanie trzypokojowe na sprzedaż'])
    price_col = pick_col(df, ['Price', 'Cena'])
    area_col = pick_col(df, ['Area', 'Powierzchnia'])
    loc_col = pick_col(df, ['Predicted_Loc']) 
    state_col = 'Predict_State'

    # Zbieramy próbki
    samples_list = []
    target_states = df[state_col].unique()
    for state in target_states:
        # Bierzemy po 4 losowe próbki dla każdego stanu
        samples_list.append(df[df[state_col] == state].sample(n=min(4, len(df[df[state_col] == state])), random_state=42))

    if samples_list:
        df_samples = pd.concat(samples_list).sort_values(by=state_col).reset_index(drop=True)
        
        # Wyświetlanie
        display_cols = [c for c in [saleid_col, title_col, price_col, area_col, loc_col, state_col] if c is not None]
        display(df_samples[display_cols])
        print(f"\nWyświetlono {len(df_samples)} przykładów.")
    else:
        print("Nie udało się znaleźć próbek do wyświetlenia.")


except FileNotFoundError:
    print(f"BŁĄD: Nie znaleziono pliku '{PATH}'. Upewnij się, że komórka 5 została pomyślnie uruchomiona i plik istnieje.")
except Exception as e:
    print(f"Wystąpił nieoczekiwany błąd: {e}")

Wczytywanie finalnego pliku z wynikami: Data_state_LSTM_predicted_full_v4_FINAL.csv
Wczytano 1467262 wierszy.

Rozkład przewidzianych stanów w pliku wynikowym:
Predict_State
AFTER_RENOVATION    468889
DEVELOPER_STATE     465196
GOOD                361664
FOR_RENOVATION      171513
Name: count, dtype: int64

Reprezentatywne przykłady dla każdej kategorii:


Unnamed: 0,88,Mieszkanie trzypokojowe na sprzedaż,Predict_State
0,4748163,"Mieszkanie, Opole, Zaodrze, 47 m²",AFTER_RENOVATION
1,3612967,Takiej panoramy Krakowa nie widzi się codziennie.,AFTER_RENOVATION
2,3873884,3-pokojowe mieszkanie 70m2 + balkon Bezpośrednio,AFTER_RENOVATION
3,2753375,Stylowe 2pokoje z widokiem na las i zachody słońca,AFTER_RENOVATION
4,5176968,"Nowe mieszkanie Działki Leśne, ul. Poznańska 1",DEVELOPER_STATE
5,3514076,Zgarnij dod GRUDNIOWY Rabat_HALA_Komórka_GOTOWE,DEVELOPER_STATE
6,686572,Mieszkanie trzypokojowe na sprzedaż,DEVELOPER_STATE
7,4463074,Mieszkanie,DEVELOPER_STATE
8,4838395,Przestronne mieszkanie w centrum Łeby,FOR_RENOVATION
9,4003061,"Mieszkanie, Kraków, Prądnik Biały, 58 m²",FOR_RENOVATION



Wyświetlono 16 przykładów.
