In [1]:
# === SEKCJA 1: IMPORT I KONFIGURACJA (POPRAWIONA) ===
import pandas as pd
import numpy as np
import re
import warnings

from tensorflow.keras.models import Model
# ZMIANA: Dodanie brakującego importu Concatenate
from tensorflow.keras.layers import Input, Embedding, LSTM, Concatenate, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from IPython.display import display
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore', category=pd.errors.DtypeWarning)

print("Wczytywanie danych...")
df_main_raw = pd.read_csv('saleflats_mazowieckie.csv', sep=',', header=None, on_bad_lines='skip', low_memory=False)
df_meta_raw = pd.read_csv('saleflatmeta_mazowieckie.csv', sep=',', header=None, on_bad_lines='skip')
print("Wszystkie pliki wczytane.")

Wczytywanie danych...
Wszystkie pliki wczytane.


In [2]:
# === SEKCJA 2: PRZYGOTOWANIE DANYCH (FINALNA WERSJA v5) ===
print("--- Krok 1: Nadawanie nazw kolumnom ---")
df_main = df_main_raw.copy()
df_meta = df_meta_raw.copy()

main_cols_map = {0: 'SaleId', 2: 'Location', 4: 'Description', 5: 'Area', 6: 'Price', 16: 'LocationPath', 17: 'NumberOfRooms', 35: 'Floor', 36: 'Floors', 21: 'BuildingType'}
df_main.rename(columns=main_cols_map, inplace=True)
meta_cols = ['LocationId', 'ParentId', 'Name', 'Type', 'Path']
df_meta.columns = meta_cols
location_id_to_name = pd.Series(df_meta.Name.values, index=df_meta.LocationId).to_dict()

print("--- Krok 2: Czyszczenie i filtrowanie ---")
# ZMIANA: Usuwamy tylko wiersze, jeśli BRAKUJE Description LUB LocationPath
df_main.dropna(subset=['Description', 'LocationPath'], inplace=True)
df_main['LocationPath'] = pd.to_numeric(df_main['LocationPath'], errors='coerce').fillna(0).astype(int)
df_model_ready = df_main[df_main['LocationPath'] != 0].copy()

def clean_text(text): return re.sub(r'[^a-ząęółśżźćń ]', '', str(text).lower())
df_model_ready['description_clean'] = df_model_ready['Description'].apply(clean_text)

# Uzupełnianie braków w danych numerycznych - przygotowanie pod pipeline
numeric_features = ['Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors']
for col in numeric_features:
    df_model_ready[col] = pd.to_numeric(df_model_ready[col], errors='coerce')


print(f"Finalny zbiór danych gotowy. Wiersze: {len(df_model_ready)}, Unikalnych LocationPath: {df_model_ready['LocationPath'].nunique()}")

--- Krok 1: Nadawanie nazw kolumnom ---
--- Krok 2: Czyszczenie i filtrowanie ---
Finalny zbiór danych gotowy. Wiersze: 58, Unikalnych LocationPath: 9


In [3]:
# === SEKCJA 3: PRZYGOTOWANIE DANYCH WE/WY (BEZ STRATYFIKACJI) ===
# 1. Wejścia (Tylko tekst i dane numeryczne)
MAX_WORDS, MAX_LEN = 20000, 200
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<unk>")
tokenizer.fit_on_texts(df_model_ready['description_clean'])
sequences = tokenizer.texts_to_sequences(df_model_ready['description_clean'])
X_text = pad_sequences(sequences, maxlen=MAX_LEN)

numeric_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
X_numeric = numeric_pipeline.fit_transform(df_model_ready[numeric_features])

# 2. Wyjście (LocationPath)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df_model_ready['LocationPath'])
num_classes = len(label_encoder.classes_)
print(f"Dane przygotowane do treningu. Liczba klas (LocationPath): {num_classes}")

# 3. Podział na zbiory (ZMIANA: usunięto stratify)
X_train_text, X_val_text, X_train_num, X_val_num, y_train, y_val = train_test_split(
    X_text, X_numeric, y_encoded, test_size=0.2, random_state=42
)

Dane przygotowane do treningu. Liczba klas (LocationPath): 9




In [4]:
# === SEKCJA 4: BUDOWA I TRENING MODELU (WERSJA UPROSZCZONA) ===
input_text = Input(shape=(MAX_LEN,), name='text_input')
input_numeric = Input(shape=(X_numeric.shape[1],), name='numeric_input')

embedding_layer = Embedding(input_dim=MAX_WORDS, output_dim=128)(input_text)
lstm_layer = Bidirectional(LSTM(128))(embedding_layer)
concatenated = Concatenate()([lstm_layer, input_numeric])

x = Dense(256, activation='relu')(concatenated)
x = Dropout(0.5)(x)
output_layer = Dense(num_classes, activation='softmax')(x)

model = Model(inputs=[input_text, input_numeric], outputs=output_layer)
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

callbacks = [EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)]

print("\nRozpoczynam trening...")
history = model.fit(
    [X_train_text, X_train_num], y_train,
    validation_data=([X_val_text, X_val_num], y_val),
    epochs=20, batch_size=128, callbacks=callbacks
)
model.save('simple_location_predictor.keras')


Rozpoczynam trening...
Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - accuracy: 0.1304 - loss: 2.1941 - val_accuracy: 0.0833 - val_loss: 2.1383
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 422ms/step - accuracy: 0.0870 - loss: 2.1601 - val_accuracy: 0.1667 - val_loss: 2.1259
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 417ms/step - accuracy: 0.1739 - loss: 2.1533 - val_accuracy: 0.3333 - val_loss: 2.1137
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 420ms/step - accuracy: 0.2826 - loss: 2.1161 - val_accuracy: 0.3333 - val_loss: 2.1016
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 455ms/step - accuracy: 0.3696 - loss: 2.0617 - val_accuracy: 0.3333 - val_loss: 2.0899
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 486ms/step - accuracy: 0.3696 - loss: 2.0351 - val_accuracy: 0.3333 - val_loss: 2.0796


In [5]:
# === SEKCJA 5: PREDYKCJA I ANALIZA WYNIKÓW ===
print("Generowanie predykcji na całym zbiorze...")
df_predict = df_main.copy()
df_predict['description_clean'] = df_predict['Description'].apply(clean_text)

# Uzupełnianie braków w danych numerycznych przed transformacją
for col in numeric_features:
    df_predict[col] = pd.to_numeric(df_predict[col], errors='coerce').fillna(0)

sequences_full = tokenizer.texts_to_sequences(df_predict['description_clean'])
X_text_full = pad_sequences(sequences_full, maxlen=MAX_LEN)
X_numeric_full = numeric_pipeline.transform(df_predict[numeric_features])

predictions_proba = model.predict([X_text_full, X_numeric_full])
predicted_labels_encoded = np.argmax(predictions_proba, axis=1)

df_predict['Predicted_LocationPath'] = label_encoder.inverse_transform(predicted_labels_encoded)
df_predict['Predicted_Prob'] = np.max(predictions_proba, axis=1)
df_predict['Predicted_Name'] = df_predict['Predicted_LocationPath'].apply(lambda x: id_to_name_map.get(int(x), 'Brak w TERYT'))

print("\nPrzykładowe 25 losowych wierszy z wynikami predykcji:")
display(df_predict[['SaleId', 'Location', 'LocationPath', 'Predicted_Name', 'Predicted_Prob']].sample(25, random_state=42))

Generowanie predykcji na całym zbiorze...




[1m10151/10151[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m487s[0m 48ms/step


NameError: name 'id_to_name_map' is not defined