# Wersja v13: Model Finalny (Nowe Cechy, Ulepszona Strata, Top-K=5)

## Część 1: Przygotowanie Danych i Zaawansowana Inżynieria Cech

In [1]:
import pandas as pd
import numpy as np
import os
import joblib
import gc
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse

# Definicja stałych globalnych
ARTIFACTS_DIR = 'artifacts_v13'
PROCESSED_DATA_DIR = 'processed_data_v13'
MAX_TEXT_FEATURES = 20000
RANDOM_STATE = 42

os.makedirs(ARTIFACTS_DIR, exist_ok=True)
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
print(f"Katalogi '{ARTIFACTS_DIR}' i '{PROCESSED_DATA_DIR}' są gotowe.")

Katalogi 'artifacts_v13' i 'processed_data_v13' są gotowe.


In [2]:
def load_location_data(location_path='lokalizacja.csv'):
    return pd.read_csv(location_path, na_values=['\\N', 'NULL'], sep=',', header=None, names=['id', 'parent_id', 'name', 'type', 'full_name'])

def load_offers_data(offers_path='saleflats_2024_2025.csv'):
    print(f"Wczytywanie pliku: {offers_path}")
    column_names = ['SaleId', 'title', 'description', 'area', 'price', 'locationPath']
    try:
        first_row = pd.read_csv(offers_path, header=None, sep=',', nrows=1, on_bad_lines='skip')
        last_col_index = first_row.shape[1] - 1
        usecols = [0, 3, 4, 5, 6, last_col_index]
        df_offers = pd.read_csv(offers_path, header=None, sep=',', usecols=usecols, on_bad_lines='skip')
        df_offers.columns = column_names
    except Exception as e:
        print(f"Wystąpił błąd: {e}")
        df_offers = pd.DataFrame(columns=column_names)
    return df_offers

def load_geo_data(geo_path='wspolrzedne.csv'):
    print(f"Próba wczytania opcjonalnych danych geograficznych z: {geo_path}")
    try:
        df_geo = pd.read_csv(geo_path, header=None, names=['SaleId', 'latitude', 'longitude'])
        print("Dane geograficzne wczytane pomyślnie.")
        return df_geo
    except FileNotFoundError:
        print("Plik z danymi geograficznymi nie został znaleziony. Ten krok zostanie pominięty.")
        return None

df_loc = load_location_data()
df_offers = load_offers_data()
df_geo = load_geo_data()

# Merge z danymi geo, jeśli istnieją
if df_geo is not None:
    df_offers = pd.merge(df_offers, df_geo, on='SaleId', how='left')

id_to_name = dict(zip(df_loc['id'], df_loc['name']))
hierarchy_map = dict(zip(df_loc['id'], df_loc['parent_id']))
joblib.dump(id_to_name, os.path.join(ARTIFACTS_DIR, 'id_to_name.joblib'))
joblib.dump(hierarchy_map, os.path.join(ARTIFACTS_DIR, 'hierarchy_map.joblib'))
print("Mapy 'id_to_name' i 'hierarchy_map' zostały stworzone i zapisane.")

Wczytywanie pliku: saleflats_2024_2025.csv
Próba wczytania opcjonalnych danych geograficznych z: wspolrzedne.csv
Plik z danymi geograficznymi nie został znaleziony. Ten krok zostanie pominięty.
Mapy 'id_to_name' i 'hierarchy_map' zostały stworzone i zapisane.


## Inżynieria Cech: Dodanie `is_central_subdistrict` i obsługa Geo-danych

In [3]:
# KOMÓRKA [3] - POPRAWIONA I FINALNA WERSJA

# --- POPRAWKA: Odporne na błędy parsowanie locationPath ---
path_cols = ['woj_id', 'pow_id', 'gmi_id', 'city_id', 'district_id', 'subdistrict_id', 'street_id']

# Krok 1: Rozdziel string na kolumny
path_df = df_offers['locationPath'].str.split(',', expand=True)

# Krok 2: Nadaj nazwy tym kolumnom, które istnieją
path_df.columns = path_cols[:path_df.shape[1]]

# Krok 3: Upewnij się, że ramka ma wszystkie docelowe kolumny, brakujące wypełnij 0
path_df = path_df.reindex(columns=path_cols, fill_value=0)

# Krok 4: Konwersja na typ numeryczny
path_df = path_df.apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)
# --- KONIEC POPRAWKI ---


# Cechy Numeryczne podstawowe
df_offers['area'] = pd.to_numeric(df_offers['area'], errors='coerce')
df_offers['price'] = pd.to_numeric(df_offers['price'], errors='coerce')
df_offers['price_per_meter'] = df_offers['price'] / df_offers['area']

# Połącz dane w jedną, dużą ramkę
df_full = pd.concat([df_offers.reset_index(drop=True), path_df.reset_index(drop=True)], axis=1)

# Filtrowanie i czyszczenie
valid_rows_conditions = (
    df_full['area'].notna() &
    df_full['price'].notna() &
    df_full['city_id'].notna() &
    (df_full['area'] > 0) &
    (df_full['price'] > 0) &
    (df_full['city_id'] != 0)
)
df_offers_clean = df_full[valid_rows_conditions].copy()
df_offers_clean.reset_index(drop=True, inplace=True)

# Jawne zarządzanie nazwami kolumn przy tworzeniu nowej cechy
df_merged = pd.merge(df_offers_clean, df_loc[['id', 'name']], left_on='district_id', right_on='id', how='left')
df_merged.rename(columns={'name': 'name_dist'}, inplace=True)

df_merged = pd.merge(df_merged, df_loc[['id', 'name']], left_on='subdistrict_id', right_on='id', how='left')
df_merged.rename(columns={'name': 'name_subdist'}, inplace=True)

df_merged['is_central_subdistrict'] = np.where(df_merged['name_dist'] == df_merged['name_subdist'], 1, 0)

# Lista kolumn do zachowania
final_cols = ['SaleId', 'title', 'description', 'area', 'price', 'price_per_meter', 'city_id',
              'district_id', 'subdistrict_id', 'street_id', 'is_central_subdistrict']
if 'latitude' in df_merged.columns and 'longitude' in df_merged.columns:
    final_cols.extend(['latitude', 'longitude'])
    print("Cechy geograficzne zostaną dołączone do modelu.")

df_processed = df_merged[final_cols].copy()
df_processed.rename(columns={'district_id': 'target_district_id', 'subdistrict_id': 'target_subdistrict_id', 'street_id': 'target_street_id'}, inplace=True)
df_processed['text_features'] = df_processed['title'].fillna('') + " " + df_processed['description'].fillna('')
df_processed.drop(columns=['title', 'description'], inplace=True)
df_processed.dropna(subset=['text_features'], inplace=True)

print("\n--- Nagłówek finalnego df_processed ---\n")
print(df_processed.head())


--- Nagłówek finalnego df_processed ---

   SaleId   area     price  price_per_meter  city_id  target_district_id  \
0      88  73.00  766500.0     10500.000000      352                   0   
1      99  64.80  540000.0      8333.333333      352                   0   
2     115  51.00  540000.0     10588.235294      352                   0   
3     140  67.62  544000.0      8044.957113      352                   0   
4     145  48.00  459000.0      9562.500000      352                   0   

   target_subdistrict_id  target_street_id  is_central_subdistrict  \
0                 103786                 0                       0   
1                  99764                 0                       0   
2                  74375                 0                       0   
3                  74375            517513                       0   
4                  95559                 0                       0   

                                       text_features  
0  Mieszkanie trzypokojow

In [4]:
# Stratyfikacja
city_counts = df_processed['city_id'].value_counts()
valid_cities = city_counts[city_counts > 1].index
df_filtered = df_processed[df_processed['city_id'].isin(valid_cities)].copy()

# Podział na zbiory
target_columns = ['target_district_id', 'target_subdistrict_id', 'target_street_id']
X = df_filtered.drop(columns=target_columns)
y = df_filtered[target_columns]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=X['city_id'])
print(f"Podział danych: {len(X_train)} próbek treningowych, {len(X_val)} próbek walidacyjnych.")

# Przetwarzanie Cech Numerycznych (z nowymi cechami)
numeric_features = ['area', 'price', 'price_per_meter', 'is_central_subdistrict']
if 'latitude' in X_train.columns and 'longitude' in X_train.columns:
    numeric_features.extend(['latitude', 'longitude'])

imputer = SimpleImputer(strategy='median'); scaler = StandardScaler()
X_train_num = scaler.fit_transform(imputer.fit_transform(X_train[numeric_features]))
X_val_num = scaler.transform(imputer.transform(X_val[numeric_features]))
joblib.dump(imputer, os.path.join(ARTIFACTS_DIR, 'imputer.joblib'))
joblib.dump(scaler, os.path.join(ARTIFACTS_DIR, 'scaler.joblib'))

# Przetwarzanie Cech Tekstowych
vectorizer = TfidfVectorizer(max_features=MAX_TEXT_FEATURES, ngram_range=(1, 2))
X_train_text = vectorizer.fit_transform(X_train['text_features'])
X_val_text = vectorizer.transform(X_val['text_features'])
joblib.dump(vectorizer, os.path.join(ARTIFACTS_DIR, 'vectorizer.joblib'))

# Zapisanie ID miast i SaleId
X_train_city = X_train['city_id'].values
X_val_city = X_val['city_id'].values
X_val_sale_ids = X_val['SaleId'].values
np.save(os.path.join(PROCESSED_DATA_DIR, 'X_val_sale_ids.npy'), X_val_sale_ids)

# Zapisanie y_train i y_val
y_train.to_pickle(os.path.join(PROCESSED_DATA_DIR, 'y_train.pkl'))
y_val.to_pickle(os.path.join(PROCESSED_DATA_DIR, 'y_val.pkl'))
print("Transformatory dopasowane i zapisane.")

Podział danych: 887561 próbek treningowych, 221891 próbek walidacyjnych.
Transformatory dopasowane i zapisane.


In [5]:
scipy.sparse.save_npz(os.path.join(PROCESSED_DATA_DIR, 'X_train_text.npz'), X_train_text)
scipy.sparse.save_npz(os.path.join(PROCESSED_DATA_DIR, 'X_val_text.npz'), X_val_text)
np.save(os.path.join(PROCESSED_DATA_DIR, 'X_train_num.npy'), X_train_num)
np.save(os.path.join(PROCESSED_DATA_DIR, 'X_val_num.npy'), X_val_num)
np.save(os.path.join(PROCESSED_DATA_DIR, 'X_train_city.npy'), X_train_city)
np.save(os.path.join(PROCESSED_DATA_DIR, 'X_val_city.npy'), X_val_city)
print("Wszystkie przetworzone zbiory danych zostały zapisane.")
del df_loc, df_offers, df_processed, X, y, X_train, X_val, y_train, y_val, df_filtered, df_merged
del X_train_text, X_val_text, X_train_num, X_val_num, X_train_city, X_val_city
gc.collect()
print("\n" + "="*80)
print("CZĘŚĆ 1 ZAKOŃCZONA POWODZENIEM. ZRESTARTUJ KERNEL.")
print("="*80)

Wszystkie przetworzone zbiory danych zostały zapisane.

CZĘŚĆ 1 ZAKOŃCZONA POWODZENIEM. ZRESTARTUJ KERNEL.


# Część 2: Budowa, Trening i Inferencia Modelu

In [1]:
import pandas as pd
import numpy as np
import joblib
import os
import scipy.sparse
import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense, Dropout, Embedding, Flatten, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import register_keras_serializable

ARTIFACTS_DIR = 'artifacts_v13'
PROCESSED_DATA_DIR = 'processed_data_v13'
RANDOM_STATE = 42
MODEL_PATH = os.path.join(ARTIFACTS_DIR, 'best_location_model_v13.keras')

In [2]:
print("Wczytywanie artefaktów...")
hierarchy_map = joblib.load(os.path.join(ARTIFACTS_DIR, 'hierarchy_map.joblib'))
id_to_name = joblib.load(os.path.join(ARTIFACTS_DIR, 'id_to_name.joblib'))
print("Wczytywanie przetworzonych danych...")
X_train_text = scipy.sparse.load_npz(os.path.join(PROCESSED_DATA_DIR, 'X_train_text.npz'))
X_val_text = scipy.sparse.load_npz(os.path.join(PROCESSED_DATA_DIR, 'X_val_text.npz'))
X_train_num = np.load(os.path.join(PROCESSED_DATA_DIR, 'X_train_num.npy'))
X_val_num = np.load(os.path.join(PROCESSED_DATA_DIR, 'X_val_num.npy'))
X_train_city = np.load(os.path.join(PROCESSED_DATA_DIR, 'X_train_city.npy'))
X_val_city = np.load(os.path.join(PROCESSED_DATA_DIR, 'X_val_city.npy'))
X_val_sale_ids = np.load(os.path.join(PROCESSED_DATA_DIR, 'X_val_sale_ids.npy'))
y_train = pd.read_pickle(os.path.join(PROCESSED_DATA_DIR, 'y_train.pkl'))
y_val = pd.read_pickle(os.path.join(PROCESSED_DATA_DIR, 'y_val.pkl'))
print("Dane wczytane.")

Wczytywanie artefaktów...
Wczytywanie przetworzonych danych...
Dane wczytane.


## Udoskonalona Hierarchiczna Funkcja Straty (Rekomendacja #1)
**Zmiana:** Udoskonalona logika karania. Kara jest stosowana, gdy predykcja jest błędna ORAZ jej przewidywany rodzic również jest błędny, co skupia się na najpoważniejszych pomyłkach.

In [3]:
@register_keras_serializable()
class HierarchicalLoss(tf.keras.losses.Loss):
    def __init__(self, id_to_parent_map, penalty_config, **kwargs):
        super().__init__(**kwargs)
        self.id_to_parent_map = id_to_parent_map
        self.penalty_config = penalty_config
        self.base_loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='none')
        self.parent_fallback_penalty = tf.constant(penalty_config.get('parent_fallback', 1.2), dtype=tf.float32)
        self.wrong_branch_penalty = tf.constant(penalty_config.get('wrong_branch', 1.8), dtype=tf.float32)
        
        keys = [int(k) for k in id_to_parent_map.keys()]
        values = [int(v) if pd.notna(v) else 0 for v in id_to_parent_map.values()]
        self.parent_table = tf.lookup.StaticHashTable(
            tf.lookup.KeyValueTensorInitializer(keys, values, key_dtype=tf.int64, value_dtype=tf.int64), 0
        )

    def call(self, y_true, y_pred):
        y_true_tensor = tf.cast(y_true, dtype=tf.int64)
        y_pred_ids = tf.cast(tf.argmax(y_pred, axis=-1), dtype=tf.int64)
        base_loss = self.base_loss_fn(y_true_tensor, y_pred)
        
        mask = tf.not_equal(y_true_tensor, 0)
        
        true_parents = self.parent_table.lookup(y_true_tensor)
        pred_parents = self.parent_table.lookup(y_pred_ids)
        
        is_correct = tf.equal(y_true_tensor, y_pred_ids)
        is_parent_fallback = tf.equal(true_parents, y_pred_ids) # Błąd typu dziecko -> rodzic
        is_same_branch = tf.equal(true_parents, pred_parents)

        penalties = tf.ones_like(base_loss)
        penalties = tf.where(is_parent_fallback, tf.ones_like(base_loss) * self.parent_fallback_penalty, penalties)
        penalties = tf.where(tf.logical_not(is_same_branch), tf.ones_like(base_loss) * self.wrong_branch_penalty, penalties)
        penalties = tf.where(is_correct, 1.0, penalties) # Nadpisz karę, jeśli predykcja jest poprawna

        penalized_loss = base_loss * penalties
        masked_loss = tf.where(mask, penalized_loss, 0.0)

        num_valid_labels = tf.reduce_sum(tf.cast(mask, tf.float32))
        return tf.math.divide_no_nan(tf.reduce_sum(masked_loss), num_valid_labels)

    def get_config(self):
        base_config = super().get_config()
        config = {"id_to_parent_map": self.id_to_parent_map, "penalty_config": self.penalty_config}
        return {**base_config, **config}

print("Udoskonalona hierarchiczna funkcja straty zdefiniowana.")

Udoskonalona hierarchiczna funkcja straty zdefiniowana.


## Definicja Architektury i Trening (Dostrojenie Hyperparametrów)
**Zmiany:**
1.  **Architektura:** Dodano jedną warstwę `Dense` do ciała modelu.
2.  **Wagi:** Zwiększono wagi dla `loss_weights` i `sample_weight`, aby model mocniej skupił się na ulicach.

In [4]:
# KOMÓRKA [4] - POPRAWIONA WERSJA

def create_mapper(series_list):
    """Tworzy mapowanie z oryginalnych ID na ciągłe indeksy z wielu serii danych."""
    unique_ids = pd.unique(pd.concat(series_list))
    return {val: i for i, val in enumerate(unique_ids)}

def map_labels(series, mapper):
    """Mapuje etykiety używając stworzonego mapowania."""
    return series.map(mapper).fillna(0).astype(int).values

# Mapowanie dla każdej głowicy
district_map = create_mapper([y_train['target_district_id'], y_val['target_district_id']])
subdistrict_map = create_mapper([y_train['target_subdistrict_id'], y_val['target_subdistrict_id']])
street_map = create_mapper([y_train['target_street_id'], y_val['target_street_id']])
city_map = create_mapper([pd.Series(X_train_city), pd.Series(X_val_city)])

# Zapisanie map do późniejszego użytku
joblib.dump(district_map, os.path.join(ARTIFACTS_DIR, 'district_map.joblib'))
joblib.dump(subdistrict_map, os.path.join(ARTIFACTS_DIR, 'subdistrict_map.joblib'))
joblib.dump(street_map, os.path.join(ARTIFACTS_DIR, 'street_map.joblib'))
joblib.dump(city_map, os.path.join(ARTIFACTS_DIR, 'city_map.joblib'))

y_district_train_mapped = map_labels(y_train['target_district_id'], district_map)
y_subdistrict_train_mapped = map_labels(y_train['target_subdistrict_id'], subdistrict_map)
y_street_train_mapped = map_labels(y_train['target_street_id'], street_map)
y_district_val_mapped = map_labels(y_val['target_district_id'], district_map)
y_subdistrict_val_mapped = map_labels(y_val['target_subdistrict_id'], subdistrict_map)
y_street_val_mapped = map_labels(y_val['target_street_id'], street_map)
X_train_city_mapped = map_labels(pd.Series(X_train_city), city_map)
X_val_city_mapped = map_labels(pd.Series(X_val_city), city_map)

# --- POPRAWKA: Ponowna definicja stałej MAX_TEXT_FEATURES ---
MAX_TEXT_FEATURES = 20000
# --- KONIEC POPRAWKI ---

# Definicja modelu z dodatkową warstwą
NUM_DISTRICTS, NUM_SUBDISTRICTS, NUM_STREETS, NUM_CITIES = len(district_map), len(subdistrict_map), len(street_map), len(city_map)
NUM_FEATURES = X_train_num.shape[1]

input_text = Input(shape=(MAX_TEXT_FEATURES,), name='text_input', sparse=True)
input_num = Input(shape=(NUM_FEATURES,), name='num_input')
input_city = Input(shape=(1,), name='city_input')

text_branch = Dense(128, activation='relu')(input_text); text_branch = Dropout(0.3)(text_branch)
num_branch = Dense(64, activation='relu')(input_num); num_branch = Dense(32, activation='relu')(num_branch)
city_branch = Embedding(input_dim=NUM_CITIES, output_dim=50, name='city_embedding')(input_city); city_branch = Flatten()(city_branch)
combined = Concatenate()([text_branch, num_branch, city_branch])
z = Dense(512, activation='relu')(combined); z = Dropout(0.5)(z)
z = Dense(256, activation='relu')(z); z = Dropout(0.5)(z)
z = Dense(128, activation='relu')(z); z = Dropout(0.5)(z) # NOWA WARSTWA

output_district = Dense(NUM_DISTRICTS, activation='softmax', name='district_output')(z)
output_subdistrict = Dense(NUM_SUBDISTRICTS, activation='softmax', name='subdistrict_output')(z)
output_street = Dense(NUM_STREETS, activation='softmax', name='street_output')(z)

model = Model(inputs=[input_text, input_num, input_city], outputs=[output_district, output_subdistrict, output_street])
model.summary()

In [5]:
# KOMÓRKA [5] - POPRAWIONA WERSJA TRENINGU

# --- POPRAWKA: Powrót do prostszej, bardziej stabilnej funkcji straty ---
# Zamiast skomplikowanej HierarchicalLoss, używamy standardowej straty.
# Pozwoli to modelowi nauczyć się podstaw, zanim wprowadzimy bardziej złożone kary.
district_loss = 'sparse_categorical_crossentropy'
subdistrict_loss = 'sparse_categorical_crossentropy'
street_loss = 'sparse_categorical_crossentropy'
# --- KONIEC POPRAWKI ---

model.compile(
    optimizer='adam',
    loss={
        'district_output': district_loss,
        'subdistrict_output': subdistrict_loss,
        'street_output': street_loss
    },
    # Zmniejszamy nieco wagi, aby były mniej agresywne
    loss_weights={'district_output': 1.0, 'subdistrict_output': 1.2, 'street_output': 1.5},
    metrics={'district_output': 'accuracy', 'subdistrict_output': 'accuracy', 'street_output': 'accuracy'}
)

# Wagi dla próbek pozostają, ale można je też zmniejszyć, jeśli wyniki będą słabe
subdistrict_train_weights = np.where(y_train['target_subdistrict_id'].values > 0, 1.5, 1.0)
street_train_weights = np.where(y_train['target_street_id'].values > 0, 2.0, 1.0)
sample_weights_list = [np.ones(len(y_train)), subdistrict_train_weights, street_train_weights]

X_train_dict = {'text_input': X_train_text, 'num_input': X_train_num, 'city_input': X_train_city_mapped}
y_train_list = [y_district_train_mapped, y_subdistrict_train_mapped, y_street_train_mapped]
X_val_dict = {'text_input': X_val_text, 'num_input': X_val_num, 'city_input': X_val_city_mapped}
y_val_list = [y_district_val_mapped, y_subdistrict_val_mapped, y_street_val_mapped]

callbacks = [
    ModelCheckpoint(MODEL_PATH, monitor='val_loss', save_best_only=True, verbose=1),
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)
]

history = model.fit(
    X_train_dict, y_train_list,
    sample_weight=sample_weights_list,
    validation_data=(X_val_dict, y_val_list),
    epochs=20, batch_size=128,
    callbacks=callbacks
)

Epoch 1/20
[1m6934/6935[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 81ms/step - district_output_accuracy: 0.7079 - district_output_loss: 1.0065 - loss: 17.3343 - street_output_accuracy: 0.6346 - street_output_loss: 7.4920 - subdistrict_output_accuracy: 0.5136 - subdistrict_output_loss: 4.2416
Epoch 1: val_loss improved from inf to 7.58085, saving model to artifacts_v13\best_location_model_v13.keras
[1m6935/6935[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m619s[0m 89ms/step - district_output_accuracy: 0.7079 - district_output_loss: 1.0064 - loss: 17.3338 - street_output_accuracy: 0.6346 - street_output_loss: 7.4918 - subdistrict_output_accuracy: 0.5136 - subdistrict_output_loss: 4.2414 - val_district_output_accuracy: 0.7689 - val_district_output_loss: 0.6418 - val_loss: 7.5808 - val_street_output_accuracy: 0.6429 - val_street_output_loss: 3.0727 - val_subdistrict_output_accuracy: 0.5629 - val_subdistrict_output_loss: 1.9418
Epoch 2/20
[1m6934/6935[0m [32m━━━━━━━━━━━━━━

## Predykcja Top-K (k=5) z Post-processingiem i Analizą Błędów

In [6]:
print(f"Wczytywanie najlepszego modelu z: {MODEL_PATH}")
best_model = tf.keras.models.load_model(MODEL_PATH, custom_objects={'HierarchicalLoss': HierarchicalLoss})
print("Model wczytany pomyślnie.")

inv_district_map = {v: k for k, v in district_map.items()}
inv_subdistrict_map = {v: k for k, v in subdistrict_map.items()}
inv_street_map = {v: k for k, v in street_map.items()}

def find_best_consistent_path(top_k_districts, top_k_subdistricts, top_k_streets, hierarchy_map):
    for dist_id in top_k_districts:
        for sub_id in top_k_subdistricts:
            if sub_id != 0 and hierarchy_map.get(sub_id) != dist_id:
                continue
            for street_id in top_k_streets:
                if street_id != 0:
                    expected_parent = sub_id if sub_id != 0 else dist_id
                    if hierarchy_map.get(street_id) != expected_parent:
                        continue
                return dist_id, sub_id, street_id
    return top_k_districts[0], top_k_subdistricts[0], top_k_streets[0]

num_samples = 1000
k = 5 # Zwiększono K
indices = np.random.choice(range(len(y_val)), num_samples, replace=False)
results_data = []

print(f"Przeprowadzanie predykcji Top-{k} z post-processingiem na {num_samples} próbkach...")
for i in indices:
    dense_vector_sample = X_val_text[i].toarray()
    current_sale_id = X_val_sale_ids[i]
    input_sample = {
        'text_input': dense_vector_sample,
        'num_input': np.expand_dims(X_val_num[i], axis=0),
        'city_input': np.expand_dims(map_labels(pd.Series(X_val_city[i]), city_map), axis=0)
    }
    
    preds = best_model.predict(input_sample, verbose=0)
    pred_district_probs, pred_subdistrict_probs, pred_street_probs = preds[0], preds[1], preds[2]

    top_k_district_ids = [inv_district_map.get(idx, 0) for idx in np.argsort(pred_district_probs[0])[::-1][:k]]
    top_k_subdistrict_ids = [inv_subdistrict_map.get(idx, 0) for idx in np.argsort(pred_subdistrict_probs[0])[::-1][:k]]
    top_k_street_ids = [inv_street_map.get(idx, 0) for idx in np.argsort(pred_street_probs[0])[::-1][:k]]

    final_district, final_subdistrict, final_street = find_best_consistent_path(
        top_k_district_ids, top_k_subdistrict_ids, top_k_street_ids, hierarchy_map
    )
    
    true_vals = y_val.iloc[i]
    results_data.append({
        'SaleId': current_sale_id,
        'City_ID': X_val_city[i],
        'True_District': true_vals['target_district_id'], 'Pred_District': final_district,
        'True_SubDistrict': true_vals['target_subdistrict_id'], 'Pred_SubDistrict': final_subdistrict,
        'True_Street': true_vals['target_street_id'], 'Pred_Street': final_street
    })

df_results = pd.DataFrame(results_data)

df_results['Correct_District'] = (df_results['True_District'] == df_results['Pred_District'])
df_results['Correct_SubDistrict'] = (df_results['True_SubDistrict'] == df_results['Pred_SubDistrict'])
df_results['Correct_Street'] = (df_results['True_Street'] == df_results['Pred_Street'])
df_results['Correct_Overall'] = df_results['Correct_District'] & df_results['Correct_SubDistrict'] & df_results['Correct_Street']

acc_district = df_results['Correct_District'].mean()
acc_subdistrict = df_results['Correct_SubDistrict'].mean()
acc_street = df_results['Correct_Street'].mean()
acc_overall = df_results['Correct_Overall'].mean()

print("\n--- Wyniki Ewaluacji (po Top-K i Post-processingu) ---")
print(f"Dokładność dla Dzielnic: {acc_district:.2%}")
print(f"Dokładność dla Pod-dzielnic: {acc_subdistrict:.2%}")
print(f"Dokładność dla Ulic: {acc_street:.2%}")
print(f"\nDokładność CAŁKOWITA (wszystkie poziomy poprawne): {acc_overall:.2%}")

Wczytywanie najlepszego modelu z: artifacts_v13\best_location_model_v13.keras
Model wczytany pomyślnie.
Przeprowadzanie predykcji Top-5 z post-processingiem na 1000 próbkach...

--- Wyniki Ewaluacji (po Top-K i Post-processingu) ---
Dokładność dla Dzielnic: 92.10%
Dokładność dla Pod-dzielnic: 62.30%
Dokładność dla Ulic: 67.30%

Dokładność CAŁKOWITA (wszystkie poziomy poprawne): 45.20%


In [7]:
for col in ['City_ID', 'True_District', 'Pred_District', 'True_SubDistrict', 'Pred_SubDistrict', 'True_Street', 'Pred_Street']:
    df_results[f'{col}_Name'] = df_results[col].apply(lambda x: id_to_name.get(x, 'Brak'))

df_errors = df_results[~df_results['Correct_Overall']].copy()
print("\n--- Analiza Błędów ---")

if not df_errors.empty:
    district_errors = df_errors[~df_errors['Correct_District']]
    if not district_errors.empty:
        print("\nTop 10 najczęstszych pomyłek na poziomie DZIELNICY:")
        display(district_errors.groupby(['True_District_Name', 'Pred_District_Name']).size().nlargest(10))
    else: print("\nBrak błędów na poziomie DZIELNICY!")
        
    subdistrict_errors = df_errors[df_errors['Correct_District'] & ~df_errors['Correct_SubDistrict']]
    if not subdistrict_errors.empty:
        print("\nTop 10 najczęstszych pomyłek na poziomie POD-DZIELNICY (przy poprawnej dzielnicy):")
        display(subdistrict_errors.groupby(['True_SubDistrict_Name', 'Pred_SubDistrict_Name']).size().nlargest(10))
    else: print("\nBrak błędów na poziomie POD-DZIELNICY przy poprawnych dzielnicach.")
else: print("\nModel nie popełnił żadnych błędów!")

def build_path(row, prefix):
    city = row['City_ID_Name'] if row['City_ID_Name'] != 'Brak' else '?'
    district = row[f'{prefix}_District_Name'] if row[f'{prefix}_District_Name'] != 'Brak' else '?'
    subdistrict = row[f'{prefix}_SubDistrict_Name'] if row[f'{prefix}_SubDistrict_Name'] != 'Brak' else '?'
    street = row[f'{prefix}_Street_Name'] if row[f'{prefix}_Street_Name'] != 'Brak' else '?'
    return f"{city} -> {district} -> {subdistrict} -> {street}"

df_results['True_Loc'] = df_results.apply(lambda row: build_path(row, 'True'), axis=1)
df_results['Predict_Loc'] = df_results.apply(lambda row: build_path(row, 'Pred'), axis=1)

print("\n--- Losowe wyniki predykcji ---")
df_display = df_results[['SaleId', 'True_Loc', 'Predict_Loc', 'Correct_Overall']].set_index('SaleId')
display(df_display.sample(min(20, len(df_display)), random_state=RANDOM_STATE))


--- Analiza Błędów ---

Top 10 najczęstszych pomyłek na poziomie DZIELNICY:


True_District_Name  Pred_District_Name 
Łódź-bałuty         Łódź-górna             5
Brak                Kraków-krowodrza       3
                    Wrocław-fabryczna      3
Praga-północ        Praga-południe         3
Bemowo              Wola                   2
Brak                Kraków-nowa huta       2
                    Wrocław-psie pole      2
                    Wrocław-śródmieście    2
                    Łódź-śródmieście       2
Kraków-podgórze     Kraków-krowodrza       2
dtype: int64


Top 10 najczęstszych pomyłek na poziomie POD-DZIELNICY (przy poprawnej dzielnicy):


True_SubDistrict_Name  Pred_SubDistrict_Name
Śródmieście            Brak                     19
Centrum                Brak                      6
Sielec                 Brak                      5
Raków                  Brak                      4
Wiczlino               Brak                      4
Gaj                    Krzyki                    3
Jasień                 Brak                      3
Ołtaszyn               Krzyki                    3
Podgórze               Brak                      3
Saska kępa             Grochów                   3
dtype: int64


--- Losowe wyniki predykcji ---


Unnamed: 0_level_0,True_Loc,Predict_Loc,Correct_Overall
SaleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3756510,Kraków -> Kraków-krowodrza -> Krowodrza -> Józ...,Kraków -> Kraków-krowodrza -> Krowodrza -> ?,False
2775138,Bydgoszcz -> ? -> ? -> ?,Bydgoszcz -> ? -> ? -> ?,True
4962364,Lesznowola -> ? -> ? -> Jedności,Lesznowola -> ? -> ? -> ?,False
2209887,Bolesławiec -> ? -> ? -> ?,Bolesławiec -> ? -> ? -> ?,True
4527117,Kraków -> Kraków-podgórze -> Dębniki -> ?,Kraków -> Kraków-podgórze -> Dębniki -> ?,True
3109793,Kraków -> Kraków-podgórze -> Podgórze -> ?,Kraków -> Kraków-podgórze -> Podgórze -> ?,True
4692007,Porosły -> ? -> ? -> Wierzbowa,Porosły -> ? -> ? -> ?,False
4367371,Warszawa -> Mokotów -> ? -> ?,Warszawa -> Mokotów -> Służew -> ?,False
3078095,Jarosław -> ? -> ? -> ?,Jarosław -> ? -> ? -> ?,True
4663171,Warszawa -> Wola -> Mirów -> Chłodna,Warszawa -> Wola -> Mirów -> ?,False
