In [1]:
# === SEKCJA 1: IMPORT I KONFIGURACJA ===
import pandas as pd
import numpy as np
import re
import warnings

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Concatenate, Dense, Dropout, BatchNormalization, Bidirectional, Reshape
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from IPython.display import display
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore', category=pd.errors.DtypeWarning)

print("Wczytywanie danych...")
# ZMIANA: Wczytujemy TYLKO te dwa pliki.
df_main_raw = pd.read_csv('saleflats_mazowieckie_b.csv', sep=',', header=None, on_bad_lines='skip', low_memory=False)
df_meta_raw = pd.read_csv('saleflatmeta_mazowieckie.csv', sep=',', header=None, on_bad_lines='skip')
print("Wszystkie pliki wczytane.")

Wczytywanie danych...
Wszystkie pliki wczytane.


In [2]:
# === SEKCJA 2: PRZYGOTOWANIE DANYCH ===
print("--- Nadawanie nazw kolumnom ---")
df_main = df_main_raw.copy()
df_meta = df_meta_raw.copy()

num_cols_main = df_main.shape[1]
original_cols = [i for i in range(num_cols_main - 7)]
id_cols = ['WojewodztwoID', 'PowiatID', 'GminaID', 'RodzajGminyID', 'MiastoID', 'DzielnicaID', 'UlicaID']
df_main.columns = original_cols + id_cols

main_cols_map = {
    4: 'Description', 5: 'Area', 6: 'Price', 17: 'NumberOfRooms', 
    35: 'Floor', 36: 'Floors', 21: 'BuildingType'
}
df_main.rename(columns=main_cols_map, inplace=True)

meta_cols = ['LocationId', 'ParentId', 'Name', 'Type', 'Path']
df_meta.columns = meta_cols
id_to_name_map = pd.Series(df_meta.Name.values, index=df_meta.LocationId).to_dict()

print("--- Czyszczenie danych ---")
numeric_features = ['Area', 'Price', 'NumberOfRooms', 'Floor', 'Floors']
for col in numeric_features:
    df_main[col] = pd.to_numeric(df_main[col], errors='coerce')

df_main['UlicaID'] = pd.to_numeric(df_main['UlicaID'], errors='coerce').fillna(0).astype(int)
df_main.dropna(subset=['Description'] + numeric_features, inplace=True)
df_main = df_main[df_main['UlicaID'] != 0].copy()

MIN_SAMPLES_PER_STREET = 25
street_counts = df_main['UlicaID'].value_counts()
streets_to_keep = street_counts[street_counts >= MIN_SAMPLES_PER_STREET].index
df_model_ready = df_main[df_main['UlicaID'].isin(streets_to_keep)].copy()

def clean_text(text): return re.sub(r'[^a-ząęółśżźćń ]', '', str(text).lower())
df_model_ready['description_clean'] = df_model_ready['Description'].apply(clean_text)

print(f"Finalny zbiór danych gotowy. Wiersze: {len(df_model_ready)}, Unikalnych ulic: {df_model_ready['UlicaID'].nunique()}")

--- Nadawanie nazw kolumnom ---
--- Czyszczenie danych ---
Finalny zbiór danych gotowy. Wiersze: 80970, Unikalnych ulic: 1107


In [5]:
# === SEKCJA 3: INŻYNIERIA CECH DLA MODELU (WERSJA ZOPTYMALIZOWANA) ===

# 1. Cechy tekstowe
MAX_WORDS, MAX_LEN = 20000, 200
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token="<unk>")

print("Uczenie tokenizera na danych tekstowych (może chwilę potrwać)...")
# ZMIANA: Zamiast łączyć serie, uczymy tokenizer krok po kroku - jest to znacznie szybsze
# Najpierw na głównym zbiorze treningowym
tokenizer.fit_on_texts(df_model_ready['description_clean'])
# Potem aktualizujemy słownik o słowa z reszty danych (jeśli jakieś są nowe)
# df_main['description_clean'] jest już stworzony w komórce 2
tokenizer.fit_on_texts(df_main['description_clean'])


# Teraz tworzymy sekwencje dla zbioru treningowego
sequences = tokenizer.texts_to_sequences(df_model_ready['description_clean'])
X_text = pd.DataFrame(pad_sequences(sequences, maxlen=MAX_LEN), index=df_model_ready.index)

# 2. Cechy numeryczne
numeric_pipeline = Pipeline([('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())])
X_numeric = pd.DataFrame(numeric_pipeline.fit_transform(df_model_ready[numeric_features]), index=df_model_ready.index)

# 3. Cechy kategoryczne (Label Encoding)
print("Kodowanie cech kategorycznych...")
categorical_features_to_embed = ['WojewodztwoID', 'PowiatID', 'GminaID', 'MiastoID', 'DzielnicaID', 'BuildingType']
categorical_encoders = {}
X_categorical_encoded = pd.DataFrame(index=df_model_ready.index)

for col in categorical_features_to_embed:
    df_model_ready[col] = df_model_ready[col].astype(str).fillna('missing')
    le = LabelEncoder()
    all_values = df_main[col].astype(str).fillna('missing').unique()
    le.fit(all_values)
    
    # Tworzymy mapowanie i używamy go, aby uniknąć błędu z nieznanymi etykietami
    mapping = dict(zip(le.classes_, le.transform(le.classes_)))
    unknown_label = len(le.classes_)
    X_categorical_encoded[col] = df_model_ready[col].map(mapping).fillna(unknown_label).astype(int)
    categorical_encoders[col] = le

# 4. Zmienna celu (y) - UlicaID
print("Kodowanie etykiety celu (Ulica)...")
target_label_encoder = LabelEncoder()
y_encoded = target_label_encoder.fit_transform(df_model_ready['UlicaID'])
y_encoded_series = pd.Series(y_encoded, index=df_model_ready.index)
num_classes = len(target_label_encoder.classes_)
print(f"Dane przygotowane do treningu. Liczba klas (UlicaID): {num_classes}")

# 5. Podział na zbiory
X_train_idx, X_val_idx = train_test_split(df_model_ready.index, test_size=0.2, random_state=42, stratify=y_encoded_series)
X_train_text, X_val_text = X_text.loc[X_train_idx], X_text.loc[X_val_idx]
X_train_num, X_val_num = X_numeric.loc[X_train_idx], X_numeric.loc[X_val_idx]
X_train_cat, X_val_cat = X_categorical_encoded.loc[X_train_idx], X_categorical_encoded.loc[X_val_idx]
y_train, y_val = y_encoded_series.loc[X_train_idx], y_encoded_series.loc[X_val_idx]

Uczenie tokenizera na danych tekstowych (może chwilę potrwać)...
Kodowanie cech kategorycznych...
Kodowanie etykiety celu (Ulica)...
Dane przygotowane do treningu. Liczba klas (UlicaID): 1107


In [6]:
# === SEKCJA 4: BUDOWA I TRENING MODELU Z EMBEDDINGAMI KATEGORYCZNYMI ===
from tensorflow.keras.layers import Reshape

# Wejścia modelu
input_text = Input(shape=(MAX_LEN,), name='text_input')
input_numeric = Input(shape=(X_numeric.shape[1],), name='numeric_input')
categorical_inputs = []
embedding_layers = []

for col in categorical_features_to_embed:
    # +1 dla obsługi nieznanych etykiet
    num_unique_values = len(categorical_encoders[col].classes_) + 1
    embedding_dim = min(10, (num_unique_values + 1) // 2)
    cat_input = Input(shape=(1,), name=f'input_{col}')
    categorical_inputs.append(cat_input)
    embedding = Embedding(input_dim=num_unique_values, output_dim=embedding_dim)(cat_input)
    # ZMIANA: Dodanie warstwy Reshape do spłaszczenia wymiaru
    embedding = Reshape(target_shape=(embedding_dim,))(embedding)
    embedding_layers.append(embedding)

text_embedding = Embedding(input_dim=MAX_WORDS, output_dim=128, name='text_embedding')(input_text)
lstm_layer = Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3))(text_embedding)
all_features = [lstm_layer, input_numeric] + embedding_layers
concatenated = Concatenate()(all_features)

x = Dense(512, activation='relu')(concatenated); x = BatchNormalization()(x); x = Dropout(0.6)(x)
x = Dense(256, activation='relu')(x); x = BatchNormalization()(x); x = Dropout(0.6)(x)
output_layer = Dense(num_classes, activation='softmax', name='output_ulica')(x)

model = Model(inputs=[input_text, input_numeric] + categorical_inputs, outputs=output_layer)
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

X_train_dict = {'text_input': X_train_text, 'numeric_input': X_train_num}
X_val_dict = {'text_input': X_val_text, 'numeric_input': X_val_num}
for col in categorical_features_to_embed:
    X_train_dict[f'input_{col}'] = X_train_cat[col]
    X_val_dict[f'input_{col}'] = X_val_cat[col]

callbacks = [EarlyStopping(monitor='val_accuracy', patience=5, restore_best_weights=True), ReduceLROnPlateau(monitor='val_loss', patience=2)]

print("\nRozpoczynam trening sieci neuronowej...")
history = model.fit(X_train_dict, y_train, validation_data=(X_val_dict, y_val), epochs=30, batch_size=256, callbacks=callbacks)
model.save('final_location_predictor_v4.keras')


Rozpoczynam trening sieci neuronowej...
Epoch 1/30
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1093s[0m 4s/step - accuracy: 0.0339 - loss: 6.7724 - val_accuracy: 0.0915 - val_loss: 5.8776 - learning_rate: 0.0010
Epoch 2/30
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m980s[0m 4s/step - accuracy: 0.2444 - loss: 3.4110 - val_accuracy: 0.3274 - val_loss: 3.0395 - learning_rate: 0.0010
Epoch 3/30
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m985s[0m 4s/step - accuracy: 0.3113 - loss: 2.4042 - val_accuracy: 0.3756 - val_loss: 1.9976 - learning_rate: 0.0010
Epoch 4/30
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m986s[0m 4s/step - accuracy: 0.3619 - loss: 2.1237 - val_accuracy: 0.3929 - val_loss: 1.9039 - learning_rate: 0.0010
Epoch 5/30
[1m254/254[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1026s[0m 4s/step - accuracy: 0.4535 - loss: 1.7831 - val_accuracy: 0.3946 - val_loss: 1.9305 - learning_rate: 0.0010
Epoch 6/30
[1m254/254

In [7]:
# === SEKCJA 5: PREDYKCJA I ANALIZA WYNIKÓW ===
print("Przygotowywanie całego zbioru do predykcji...")
df_predict = df_main.copy()
df_predict['description_clean'] = df_predict['Description'].apply(clean_text)

for col in numeric_features: df_predict[col] = pd.to_numeric(df_predict[col], errors='coerce').fillna(0)
df_predict['BuildingType'] = df_predict['BuildingType'].astype(str).fillna('missing')

sequences_full = tokenizer.texts_to_sequences(df_predict['description_clean'])
X_text_full = pad_sequences(sequences_full, maxlen=MAX_LEN)
X_numeric_full = numeric_pipeline.transform(df_predict[numeric_features])
X_categorical_full = cat_pipeline.transform(df_predict[['BuildingType']])

print("Generowanie predykcji...")
predictions = model.predict([X_text_full, X_numeric_full, X_categorical_full])

print("Dekodowanie i składanie wyników...")
for i, col in enumerate(output_columns):
    pred_encoded = np.argmax(predictions[i], axis=1)
    df_predict[f'Predicted_{col}'] = target_encoders[col].inverse_transform(pred_encoded)
    df_predict[f'Predicted_{col}_Prob'] = np.max(predictions[i], axis=1)

def get_name(loc_id):
    try: return id_to_name_map.get(int(loc_id), 'brak')
    except (ValueError, TypeError): return 'błędne_id'
for col in output_columns:
    df_predict[f'Predicted_{col}_Name'] = df_predict[f'Predicted_{col}'].apply(get_name)

print("\nPrzykładowe 25 losowych wierszy z wynikami predykcji:")
display_cols = ['SaleId', 'Location', 'Predicted_M_Name', 'Predicted_Dziel_Name', 'Predicted_Ul_Name']
display(df_predict[display_cols].sample(25, random_state=42))

Przygotowywanie całego zbioru do predykcji...


NameError: name 'cat_pipeline' is not defined