In [1]:
# CELL 1: konfiguracja, wczytanie i przygotowanie danych (id. z v2, bez zmian merytorycznych)

import os, csv, io, re, json, gc
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# 4 źródła (każdy plik = jedna etykieta docelowa)
FILES = {
    'sf_after_renovation.csv': 'AFTER_RENOVATION',
    'sf_developer_state.csv' : 'DEVELOPER_STATE',
    'sf_for_renovation.csv'  : 'FOR_RENOVATION',
    'sf_good.csv'            : 'GOOD',
}

# Wymagane przez model pola (zgodnie z notebookiem trenowania)
REQUIRED_TEXT = ['Description']
REQUIRED_NUM  = ['Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors']
REQUIRED_CAT  = ['BuildingType', 'OfferFrom', 'TypeOfMarket']
REQUIRED_DT   = ['BuiltYear']  # źródło kolumny 'year'
REQUIRED_ALL  = REQUIRED_TEXT + REQUIRED_NUM + REQUIRED_CAT + REQUIRED_DT

# UWAGA: pliki źródłowe mają stałą kolejność kolumn w rekordach CSV.
# Na podstawie diagnostyki i próbek mapujemy pozycje -> nazwy:
# idx: 4=Description, 5=Area, 6=Price, 11=NumberOfRooms, 12=BuiltYear,
# 14=BuildingType, 16=OfferFrom, 17=Floor, 18=Floors, 19=TypeOfMarket, 15=etykieta źródłowa
IDX_MAP = {
    'Description':    4,
    'Area':           5,
    'Price':          6,
    'NumberOfRooms': 11,
    'BuiltYear':     12,
    'BuildingType':  14,
    'OfferFrom':     16,
    'Floor':         17,
    'Floors':        18,
    'TypeOfMarket':  19,
}
IDX_LABEL = 15  # zawiera nazwę stanu w rekordach; dla pewności nadpisujemy nazwą pliku

# Parser wymuszający prawidłowe dzielenie po przecinku i cudzysłowie, z escape'ami
def robust_read_records(path, encoding_candidates=('utf-8-sig','utf-8','cp1250','latin1')):
    last_err = None
    for enc in encoding_candidates:
        try:
            rows = []
            with open(path, 'r', encoding=enc, errors='replace', newline='') as f:
                reader = csv.reader(f, delimiter=',', quotechar='"', escapechar='\\')
                for row in reader:
                    if not row or (len(row) == 1 and not str(row).strip()):
                        continue
                    rows.append(row)
            if not rows:
                continue
            median_len = int(np.median([len(r) for r in rows]))
            if median_len < 10:
                continue
            return rows
        except Exception as e:
            last_err = e
            continue
    if last_err:
        raise last_err
    raise RuntimeError(f'Unable to parse CSV: {path}')

def extract_required_df(rows, force_label):
    sel = {}
    for name, idx in IDX_MAP.items():
        sel[name] = [(r[idx] if len(r) > idx else None) for r in rows]
    labels = [force_label for _ in rows]
    out = pd.DataFrame(sel)
    out['BuildingCondition'] = labels
    return out

frames = []
for path, label in FILES.items():
    if not os.path.exists(path):
        raise FileNotFoundError(f'Brak pliku: {path}')
    rows   = robust_read_records(path)
    df_part= extract_required_df(rows, force_label=label)
    frames.append(df_part)

full = pd.concat(frames, ignore_index=True)

# --- CZYSZCZENIE, TYPY, CECHY WTÓRNE ---

# Tekst
full['Description'] = full['Description'].fillna('').astype(str)

# Numeryczne
for col in ['Area','Price','NumberOfRooms','Floor','Floors']:
    full[col] = pd.to_numeric(full[col], errors='coerce')

# Rok budowy -> year (liczba całkowita)
years = pd.to_datetime(full['BuiltYear'], errors='coerce').dt.year
if years.isna().all():
    years = pd.to_numeric(full['BuiltYear'], errors='coerce')
full['year'] = years
full['year'] = full['year'].fillna(full['year'].median())

# Kategoryczne
for col in ['BuildingType','OfferFrom','TypeOfMarket']:
    full[col] = full[col].fillna('unknown').astype(str).str.strip().str.lower()

def norm_market(v):
    v = (v or '').lower()
    if 'pierwot' in v: return 'pierwotny'
    if 'wtór' in v or 'wtorn' in v: return 'wtórny'
    return v if v else 'unknown'
full['TypeOfMarket'] = full['TypeOfMarket'].apply(norm_market)

# --- BALANS KLAS (informacyjnie) ---
counts = full['BuildingCondition'].value_counts(dropna=False)
perc   = (counts / len(full) * 100).round(2)
balance= pd.DataFrame({'count': counts, 'percent': perc}).sort_index()
print("\n=== Balans klas przed treningiem ===")
print(balance)

# --- PODZIAŁ I PRZYGOTOWANIE WEJŚĆ ---

label_names = ['AFTER_RENOVATION','DEVELOPER_STATE','FOR_RENOVATION','GOOD']
label_to_idx = {name: i for i, name in enumerate(label_names)}
y_idx = full['BuildingCondition'].map(label_to_idx).astype(int).values
y = to_categorical(y_idx, num_classes=len(label_names))

# Tekst
max_words, max_len = 10000, 200
tokenizer = Tokenizer(num_words=max_words, oov_token="")
tokenizer.fit_on_texts(full['Description'].astype(str))
X_text = pad_sequences(tokenizer.texts_to_sequences(full['Description'].astype(str)), maxlen=max_len)

# Tabelaryczne
numeric_features     = ['Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors', 'year']
categorical_features = ['BuildingType', 'OfferFrom', 'TypeOfMarket']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)
X_tabular = preprocessor.fit_transform(full[numeric_features + categorical_features])

# Train/test split ze stratą po klasie
X_text_train, X_text_test, X_tab_train, X_tab_test, y_train, y_test, y_idx_train, y_idx_test = train_test_split(
    X_text, X_tabular, y, y_idx, test_size=0.2, random_state=42, stratify=y_idx
)

# Wagi klas (dla porządku, mimo że rozkład jest wyrównany)
classes = np.unique(y_idx_train)
cw = compute_class_weight(class_weight='balanced', classes=classes, y=y_idx_train)
class_weight = {int(c): float(w) for c, w in zip(classes, cw)}
print('\nClass weights:', class_weight)


  years = pd.to_datetime(full['BuiltYear'], errors='coerce').dt.year



=== Balans klas przed treningiem ===
                   count  percent
BuildingCondition                
AFTER_RENOVATION   22000    25.14
DEVELOPER_STATE    22000    25.14
FOR_RENOVATION     21513    24.58
GOOD               22000    25.14

Class weights: {0: 0.9944602272727273, 1: 0.9944602272727273, 2: 1.0169959325973272, 3: 0.9944602272727273}


In [2]:
# CELL 2: architektura (bez zmian względem v2), kompilacja i trenowanie z 20 epokami + callbacki

from tensorflow.keras.layers import Input, Embedding, LSTM, Dropout, Dense, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.optimizers import Adam

# Architektura
text_input = Input(shape=(max_len,), name='text_input')
embedding_layer = Embedding(input_dim=max_words, output_dim=128)(text_input)
lstm_layer = LSTM(64, recurrent_dropout=0.2)(embedding_layer)
dropout_lstm = Dropout(0.4)(lstm_layer)

tabular_input = Input(shape=(X_tab_train.shape[1],), name='tabular_input')
tabular_dense = Dense(32, activation='relu')(tabular_input)

concatenated  = Concatenate()([dropout_lstm, tabular_dense])
dense1        = Dense(64, activation='relu')(concatenated)
dropout_final = Dropout(0.5)(dense1)
output        = Dense(len(label_names), activation='softmax')(dropout_final)

model = Model(inputs=[text_input, tabular_input], outputs=output)
model.summary()

# Kompilacja: label smoothing + Adam lr=1e-3
model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss=CategoricalCrossentropy(label_smoothing=0.05),
    metrics=['accuracy']
)

# Callbacki: EarlyStopping, ReduceLROnPlateau, Checkpoint
es  = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)
rlr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-5, verbose=1)
ckp = ModelCheckpoint('model_best.keras', monitor='val_loss', save_best_only=True, verbose=1)

print("\nRozpoczynam trening na nowych danych (20 epok + callbacki)...")
history = model.fit(
    [X_text_train, X_tab_train], y_train,
    epochs=20,
    batch_size=128,
    validation_data=([X_text_test, X_tab_test], y_test),
    callbacks=[es, rlr, ckp],
    class_weight=class_weight  # pozostawione dla spójności
)



Rozpoczynam trening na nowych danych (20 epok + callbacki)...
Epoch 1/20
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 480ms/step - accuracy: 0.4345 - loss: 1.2044
Epoch 1: val_loss improved from inf to 1.04479, saving model to model_best.keras
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m280s[0m 506ms/step - accuracy: 0.4346 - loss: 1.2043 - val_accuracy: 0.5445 - val_loss: 1.0448 - learning_rate: 0.0010
Epoch 2/20
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 498ms/step - accuracy: 0.5605 - loss: 1.0295
Epoch 2: val_loss improved from 1.04479 to 0.97104, saving model to model_best.keras
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m288s[0m 526ms/step - accuracy: 0.5605 - loss: 1.0295 - val_accuracy: 0.5789 - val_loss: 0.9710 - learning_rate: 0.0010
Epoch 3/20
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 513ms/step - accuracy: 0.5164 - loss: 1.1323
Epoch 3: val_loss did not improve from 0.97104


In [3]:
# CELL 3: ocena na teście

loss, accuracy = model.evaluate([X_text_test, X_tab_test], y_test)
print(f"\nDokładność na zbiorze testowym: {accuracy:.4f}")

# Raport per klasa
y_pred_proba = model.predict([X_text_test, X_tab_test])
y_pred       = np.argmax(y_pred_proba, axis=1)
y_true       = np.argmax(y_test, axis=1)
print("\nRaport klasyfikacji na zbiorze testowym:")
print(classification_report(y_true, y_pred, target_names=label_names))


[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 82ms/step - accuracy: 0.7282 - loss: 0.8022

Dokładność na zbiorze testowym: 0.7301
[1m547/547[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 68ms/step

Raport klasyfikacji na zbiorze testowym:
                  precision    recall  f1-score   support

AFTER_RENOVATION       0.59      0.75      0.66      4400
 DEVELOPER_STATE       0.91      0.89      0.90      4400
  FOR_RENOVATION       0.80      0.73      0.76      4303
            GOOD       0.67      0.55      0.61      4400

        accuracy                           0.73     17503
       macro avg       0.74      0.73      0.73     17503
    weighted avg       0.74      0.73      0.73     17503



In [4]:
# CELL 4: zapis artefaktów (bez zmian względem v2)

import joblib, json

# Model z przywróconymi najlepszymi wagami (EarlyStopping) + checkpoint na val_loss
model.save('model_lstm_stan.keras')

with open('tokenizer.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(tokenizer.to_json(), ensure_ascii=False))

joblib.dump(preprocessor, 'preprocessor.joblib')

label_mapping = {i: name for i, name in enumerate(label_names)}
with open('label_mapping.json', 'w', encoding='utf-8') as f:
    json.dump({str(k): v for k, v in label_mapping.items()}, f, ensure_ascii=False)

columns_for_prediction = numeric_features + categorical_features
joblib.dump(columns_for_prediction, 'columns_for_prediction.joblib')

print("Zapisano: model_lstm_stan.keras, tokenizer.json, preprocessor.joblib, label_mapping.json, columns_for_prediction.joblib")


Zapisano: model_lstm_stan.keras, tokenizer.json, preprocessor.joblib, label_mapping.json, columns_for_prediction.joblib
