In [1]:
# --- KONFIGURACJA I WCIĄGANIE NOWYCH BAZ ---

import os, csv, io, re, json, gc
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Four source files; every file holds records of exactly one target label
FILES = dict([
    ('sf_after_renovation.csv', 'AFTER_RENOVATION'),
    ('sf_developer_state.csv', 'DEVELOPER_STATE'),
    ('sf_for_renovation.csv', 'FOR_RENOVATION'),
    ('sf_good.csv', 'GOOD'),
])

# Fields required by the model (as in the training notebook)
REQUIRED_TEXT = ['Description']
REQUIRED_NUM = ['Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors']
REQUIRED_CAT = ['BuildingType', 'OfferFrom', 'TypeOfMarket']
REQUIRED_DT = ['BuiltYear']  # source of the derived 'year' column
REQUIRED_ALL = [*REQUIRED_TEXT, *REQUIRED_NUM, *REQUIRED_CAT, *REQUIRED_DT]

# NOTE: the source files keep a fixed column order in their CSV records.
# Based on diagnostics and samples in the notebook, positions map to names:
#   4=Description, 5=Area, 6=Price, 11=NumberOfRooms, 12=BuiltYear,
#   14=BuildingType, 15=source label, 16=OfferFrom, 17=Floor, 18=Floors,
#   19=TypeOfMarket
IDX_MAP = dict(
    Description=4,
    Area=5,
    Price=6,
    NumberOfRooms=11,
    BuiltYear=12,
    BuildingType=14,
    OfferFrom=16,
    Floor=17,
    Floors=18,
    TypeOfMarket=19,
)
# Position of the condition name inside each record; the label is nevertheless
# overwritten with the one implied by the file name, to be safe.
IDX_LABEL = 15

# Parser wymuszający prawidłowe dzielenie po przecinku i cudzysłowie, z escape'ami
def robust_read_records(path, encoding_candidates=('utf-8-sig','utf-8','cp1250','latin1')):
    """Read a CSV file into a list of records (each record is a list of strings).

    The file is parsed with ``csv.reader`` (comma delimiter, double-quote
    quoting, backslash escape) once per candidate encoding, in order, until a
    parse looks plausible.

    Args:
        path: Path of the CSV file.
        encoding_candidates: Encodings to try, in order.

    Returns:
        The non-blank records from the first encoding whose parse has a median
        record length of at least 10 fields.

    Raises:
        OSError: If the file cannot be opened.
        Exception: The last decoding/CSV error seen, if no candidate produced
            an acceptable parse and at least one of them failed.
        RuntimeError: If no candidate produced an acceptable parse and none of
            them raised.
    """
    last_err = None
    for enc in encoding_candidates:
        try:
            # errors='replace' means undecodable bytes never raise, so moving
            # on to the next encoding is driven by the heuristic below, not by
            # decoding failures.
            with open(path, 'r', encoding=enc, errors='replace', newline='') as f:
                reader = csv.reader(f, delimiter=',', quotechar='"', escapechar='\\')
                # Skip empty records and records made of one blank field.
                # (The field has to be inspected via row[0]: a record is a
                # list and has no .strip() of its own.)
                rows = [
                    row for row in reader
                    if row and not (len(row) == 1 and not row[0].strip())
                ]
        except (UnicodeError, LookupError, csv.Error) as e:
            last_err = e
            continue
        if not rows:
            continue
        # Heuristic: the data set should have dozens of fields per record; if
        # everything collapsed into very few columns, treat the parse as bad.
        median_len = int(np.median([len(r) for r in rows]))
        if median_len < 10:
            continue
        return rows
    if last_err:
        raise last_err
    raise RuntimeError(f'Unable to parse CSV: {path}')

def extract_required_df(rows, force_label):
    """Turn positional CSV records into a DataFrame of the model's input columns.

    Args:
        rows: Records, each a list of field strings.
        force_label: Value written to the 'BuildingCondition' column of every row.

    Returns:
        DataFrame with one column per IDX_MAP entry (picked by position) plus
        'BuildingCondition'. A record too short to contain a position gets
        None in that column.
    """
    picked = {
        column: [record[pos] if len(record) > pos else None for record in rows]
        for column, pos in IDX_MAP.items()
    }
    out = pd.DataFrame(picked)
    # The label is forced by the caller (derived from the file name) so it does
    # not depend on what the records themselves contain.
    out['BuildingCondition'] = [force_label] * len(rows)
    return out

# Parse every configured source file and stack the per-file frames into `full`.
frames = []
for csv_path, condition_label in FILES.items():
    if os.path.exists(csv_path):
        parsed_rows = robust_read_records(csv_path)
        frames.append(extract_required_df(parsed_rows, force_label=condition_label))
    else:
        raise FileNotFoundError(f'Brak pliku: {csv_path}')

full = pd.concat(frames, ignore_index=True)

# --- CLEANING, TYPES, DERIVED FEATURES ---

# Text
full['Description'] = full['Description'].fillna('').astype(str)

# Numeric: values that cannot be parsed become NaN
for col in REQUIRED_NUM:
    full[col] = pd.to_numeric(full[col], errors='coerce')

# Build year -> 'year'; accepts values such as '2025' as well as full dates
years = pd.to_datetime(full['BuiltYear'], errors='coerce').dt.year
# If nothing could be parsed as a date, fall back to a plain numeric cast
if years.isna().all():
    years = pd.to_numeric(full['BuiltYear'], errors='coerce')
full['year'] = years
# Fill the gaps with the median year
full['year'] = full['year'].fillna(full['year'].median())

# Categorical: trim + lowercase. Missing values in the CSV records arrive as
# empty strings (None only for records that are too short), so fillna() alone
# would leave '' as a separate category; map those to 'unknown' as well.
# Any inference-time preprocessing has to apply the same normalization.
for col in REQUIRED_CAT:
    normalized = full[col].fillna('unknown').astype(str).str.strip().str.lower()
    full[col] = normalized.replace('', 'unknown')

# Normalizacja TypeOfMarket na 'pierwotny' / 'wtórny' / inne
def norm_market(v):
    """Normalize a market-type value to 'pierwotny', 'wtórny' or a fallback.

    Args:
        v: Raw value; anything that is not a string (None, NaN, numbers) is
            treated as missing.

    Returns:
        'pierwotny' if the text contains 'pierwot'; 'wtórny' if it contains
        'wtór' or the unaccented 'wtorn'; otherwise the trimmed, lower-cased
        text, or 'unknown' when that text is empty.
    """
    # Non-strings (e.g. float NaN) have no .lower(); treat them as missing.
    text = v.strip().lower() if isinstance(v, str) else ''
    if 'pierwot' in text:
        return 'pierwotny'
    if 'wtór' in text or 'wtorn' in text:
        return 'wtórny'
    return text or 'unknown'
full['TypeOfMarket'] = full['TypeOfMarket'].apply(norm_market)

# Removal of obvious anomalies (optional)
# full = full[full['Area'] > 0]
# full = full[full['Price'] > 0]

# --- CLASS BALANCE (BEFORE TRAINING) ---

# Per-class row count and share of the whole data set, in percent
counts = full['BuildingCondition'].value_counts(dropna=False)
perc   = (counts / len(full) * 100).round(2)
balance = pd.DataFrame({'count': counts, 'percent': perc}).sort_index()
# '\n' (not '\\n') so a blank line is printed rather than a literal backslash-n
print('\n=== Balans klas przed treningiem ===')
print(balance)

# --- SPLIT AND INPUT PREPARATION ---

# Label -> class index; the list order fixes the order of the model outputs
label_names = ['AFTER_RENOVATION','DEVELOPER_STATE','FOR_RENOVATION','GOOD']
label_to_idx = {name: i for i, name in enumerate(label_names)}
y_idx = full['BuildingCondition'].map(label_to_idx).astype(int).values
y = to_categorical(y_idx, num_classes=len(label_names))

# Stratified split of row positions FIRST, so that the tokenizer and the tabular
# preprocessor below are fitted on training rows only (no test-set leakage).
# Same test_size / random_state / stratify as before, so the partition itself
# is unchanged.
train_pos, test_pos = train_test_split(
    np.arange(len(full)), test_size=0.2, random_state=42, stratify=y_idx
)
y_train, y_test = y[train_pos], y[test_pos]
y_idx_train, y_idx_test = y_idx[train_pos], y_idx[test_pos]

# Text: vocabulary learned from training descriptions, then applied to all rows
max_words, max_len = 10000, 200
descriptions = full['Description'].astype(str)
# NOTE(review): the OOV token is an empty string; it was probably meant to be
# something like '<OOV>' — confirm before changing, saved tokenizers depend on it.
tokenizer = Tokenizer(num_words=max_words, oov_token="")
tokenizer.fit_on_texts(descriptions.iloc[train_pos])
X_text = pad_sequences(tokenizer.texts_to_sequences(descriptions), maxlen=max_len)
X_text_train, X_text_test = X_text[train_pos], X_text[test_pos]

# Tabular
numeric_features = ['Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors', 'year']
categorical_features = ['BuildingType', 'OfferFrom', 'TypeOfMarket']
tabular_columns = numeric_features + categorical_features

preprocessor = ColumnTransformer(
    transformers=[
        # Values coerced to NaN during cleaning would otherwise pass through the
        # scaler and reach the network as NaN; impute them with the train median.
        ('num', Pipeline([
            ('impute', SimpleImputer(strategy='median')),
            ('scale', StandardScaler()),
        ]), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    # Always return a dense array, whatever the one-hot width turns out to be
    sparse_threshold=0.0,
)
preprocessor.fit(full.iloc[train_pos][tabular_columns])
X_tabular = preprocessor.transform(full[tabular_columns])
X_tab_train, X_tab_test = X_tabular[train_pos], X_tabular[test_pos]

# --- CLASS WEIGHTS (optional, useful with strong imbalance) ---
classes = np.unique(y_idx_train)
cw = compute_class_weight(class_weight='balanced', classes=classes, y=y_idx_train)
class_weight = {int(c): float(w) for c, w in zip(classes, cw)}
# '\n' (not '\\n') so a blank line is printed rather than a literal backslash-n
print('\nClass weights:', class_weight)


  years = pd.to_datetime(full['BuiltYear'], errors='coerce').dt.year


\n=== Balans klas przed treningiem ===
                   count  percent
BuildingCondition                
AFTER_RENOVATION   22000    25.14
DEVELOPER_STATE    22000    25.14
FOR_RENOVATION     21513    24.58
GOOD               22000    25.14
\nClass weights: {0: 0.9944602272727273, 1: 0.9944602272727273, 2: 1.0169959325973272, 3: 0.9944602272727273}


In [2]:
# --- ARCHITECTURE AND TRAINING ---

from tensorflow.keras.layers import Input, Embedding, LSTM, Dropout, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.utils import set_random_seed

# Seed the Python, NumPy and TensorFlow RNGs so weight initialization, dropout
# and batch shuffling are repeatable between runs.
set_random_seed(42)

# max_words / max_len are the values defined in the tokenization cell; they are
# deliberately not redefined here so the two cannot drift apart.

# Text branch: token ids -> embedding -> LSTM
text_input = Input(shape=(max_len,), name='text_input')
embedding_layer = Embedding(input_dim=max_words, output_dim=128)(text_input)
lstm_layer = LSTM(64, recurrent_dropout=0.2)(embedding_layer)
dropout_lstm = Dropout(0.4)(lstm_layer)

# Tabular branch: preprocessed numeric + one-hot features -> dense layer
tabular_input = Input(shape=(X_tab_train.shape[1],), name='tabular_input')
tabular_dense = Dense(32, activation='relu')(tabular_input)

# Merge both branches and classify into one of the condition labels
concatenated = Concatenate()([dropout_lstm, tabular_dense])
dense1 = Dense(64, activation='relu')(concatenated)
dropout_final = Dropout(0.5)(dense1)
output = Dense(len(label_names), activation='softmax')(dropout_final)

model = Model(inputs=[text_input, tabular_input], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()

print("\nRozpoczynam trening na nowych danych...")
# NOTE: the held-out test split is also used as validation data here, so the
# val_* curves and the test metrics below describe the same rows.
history = model.fit(
    [X_text_train, X_tab_train], y_train,
    epochs=10,
    batch_size=128,
    validation_data=([X_text_test, X_tab_test], y_test),
    class_weight=class_weight  # class weighting is active; drop this argument to train unweighted
)

# --- EVALUATION ---
loss, accuracy = model.evaluate([X_text_test, X_tab_test], y_test)
print(f"\nDokładność na zbiorze testowym: {accuracy:.4f}")

y_pred_proba = model.predict([X_text_test, X_tab_test])
y_pred = np.argmax(y_pred_proba, axis=1)
print("\nRaport klasyfikacji na zbiorze testowym:")
print(classification_report(y_idx_test, y_pred, target_names=label_names))


\nRozpoczynam trening na nowych danych...
Epoch 1/10
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 346ms/step - accuracy: 0.4483 - loss: 1.1516 - val_accuracy: 0.5579 - val_loss: 0.9476
Epoch 2/10
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m176s[0m 321ms/step - accuracy: 0.5590 - loss: 0.9729 - val_accuracy: 0.5876 - val_loss: 0.9016
Epoch 3/10
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 343ms/step - accuracy: 0.6330 - loss: 0.8847 - val_accuracy: 0.7144 - val_loss: 0.7676
Epoch 4/10
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m195s[0m 357ms/step - accuracy: 0.6963 - loss: 0.7924 - val_accuracy: 0.6469 - val_loss: 0.8515
Epoch 5/10
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m252s[0m 462ms/step - accuracy: 0.6741 - loss: 0.8382 - val_accuracy: 0.6733 - val_loss: 0.8368
Epoch 6/10
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m280s[0m 512ms/step - accuracy: 0.7109 - loss: 0.7852 - va

In [4]:
# --- SAVE ARTIFACTS ---

import json
import joblib

model.save('model_lstm_stan.keras')

# NOTE: the string returned by tokenizer.to_json() is JSON-encoded once more on
# purpose-preserving grounds: the file therefore holds a single JSON string
# literal. Presumably loaders do json.load() first and pass the resulting
# string on — verify against the consumer before changing this format.
with open('tokenizer.json', 'w', encoding='utf-8') as tokenizer_file:
    json.dump(tokenizer.to_json(), tokenizer_file, ensure_ascii=False)

joblib.dump(preprocessor, 'preprocessor.joblib')

# Class index -> label name; JSON object keys must be strings
label_mapping = dict(enumerate(label_names))
serializable_mapping = {str(idx): name for idx, name in label_mapping.items()}
with open('label_mapping.json', 'w', encoding='utf-8') as mapping_file:
    json.dump(serializable_mapping, mapping_file, ensure_ascii=False)

# Column order expected by the tabular preprocessor
columns_for_prediction = numeric_features + categorical_features
joblib.dump(columns_for_prediction, 'columns_for_prediction.joblib')

print("Zapisano: model_lstm_stan.keras, tokenizer.json, preprocessor.joblib, label_mapping.json, columns_for_prediction.joblib")


Zapisano: model_lstm_stan.keras, tokenizer.json, preprocessor.joblib, label_mapping.json, columns_for_prediction.joblib
