# Wersja v15_1: Model Hybrydowy (Najlepsze Cechy + Stabilny Trening) nowa baza

## Część 1: Przygotowanie Danych i Zaawansowana Inżynieria Cech

In [None]:
# --- POCZĄTEK NOWEJ I KOMPLETNEJ CZĘŚCI 1 (WERSJA FINALNA) ---

import pandas as pd
import numpy as np
import os
import joblib
import gc
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse
from IPython.display import display

# --- Step 1: Configuration ---
# Reproducibility seed and the TF-IDF vocabulary cap used by this stage.
RANDOM_STATE = 42
MAX_TEXT_FEATURES = 20000

# Output locations: fitted transformers / lookup maps and the processed matrices.
ARTIFACTS_DIR = 'artifacts_v15_final'
PROCESSED_DATA_DIR = 'processed_data_v15_final'

for _output_dir in (ARTIFACTS_DIR, PROCESSED_DATA_DIR):
    # exist_ok=True makes re-running the cell harmless.
    os.makedirs(_output_dir, exist_ok=True)
print(f"Katalogi '{ARTIFACTS_DIR}' i '{PROCESSED_DATA_DIR}' są gotowe.")

# --- Krok 2: Wczytywanie Danych ---
def load_location_data(location_path='lokalizacja.csv'):
    """Read the header-less location dictionary CSV into a DataFrame.

    The file is parsed as text with the columns ``id``, ``parent_id``,
    ``name``, ``type`` and ``full_name``. ``id`` and ``parent_id`` are then
    converted to numbers (unparseable values become NaN), rows without a
    usable ``id`` are discarded and ``id`` is cast to ``int64``.

    Args:
        location_path: Path of the comma-separated location file.

    Returns:
        The cleaned location DataFrame; the original row labels of the
        surviving rows are preserved.
    """
    column_names = ['id', 'parent_id', 'name', 'type', 'full_name']
    locations = pd.read_csv(
        location_path,
        sep=',',
        header=None,
        names=column_names,
        dtype=str,
    )
    for id_column in ('id', 'parent_id'):
        locations[id_column] = pd.to_numeric(locations[id_column], errors='coerce')

    # Rows whose id could not be parsed cannot be referenced by anything.
    locations = locations.loc[locations['id'].notna()].copy()
    locations['id'] = locations['id'].astype('int64')
    return locations

def load_offers_data_final(offers_path='saleflats_2024_2025_v2.csv'):
    """Load the six offer columns used by the pipeline from a header-less CSV.

    Only the positional columns 0, 3, 4, 5, 6 and 52 are read (all as
    strings) and they are renamed to ``SaleId``, ``title``, ``description``,
    ``area``, ``price`` and ``locationPath``. Malformed lines are reported as
    warnings by pandas instead of aborting the load.

    Args:
        offers_path: Path of the comma-separated offers file.

    Returns:
        The selected columns as a DataFrame, or an empty DataFrame when
        reading fails for any reason (the error is printed, not raised).
    """
    print(f"Wczytywanie pliku: {offers_path}...")
    name_by_position = {
        0: 'SaleId',
        3: 'title',
        4: 'description',
        5: 'area',
        6: 'price',
        52: 'locationPath',
    }
    try:
        offers = pd.read_csv(
            offers_path,
            header=None,
            sep=',',
            quotechar='"',
            dtype=str,
            on_bad_lines='warn',
            usecols=list(name_by_position),
            low_memory=False,
        )
        # With header=None the column labels are the integer positions.
        offers = offers.rename(columns=name_by_position)
    except Exception as e:
        # Best-effort loader: the caller checks for an empty frame.
        print(f"Wystąpił KRYTYCZNY błąd: {e}")
        return pd.DataFrame()

    print(f"Pomyślnie wczytano {len(offers)} wierszy.")
    return offers

# Load both raw inputs; an empty offers frame means the loader failed.
df_loc = load_location_data()
df_offers = load_offers_data_final()
assert not df_offers.empty, "DataFrame z ofertami jest pusty."

# Lookup tables keyed by location id: display name and parent id.
_location_ids = df_loc['id'].tolist()
id_to_name = dict(zip(_location_ids, df_loc['name'].tolist()))
hierarchy_map = dict(zip(_location_ids, df_loc['parent_id'].tolist()))

# Persist the maps so later stages can reuse them without the CSV.
for _lookup, _filename in (
    (id_to_name, 'id_to_name.joblib'),
    (hierarchy_map, 'hierarchy_map.joblib'),
):
    joblib.dump(_lookup, os.path.join(ARTIFACTS_DIR, _filename))
print("Mapy pomocnicze zostały stworzone.")

# --- Step 3: Feature engineering ---
print("\nRozpoczynam inżynierię cech...")

# Keep only offers with a location path that contains no letters; a letter
# in the path means the row is malformed.
df_offers.dropna(subset=['locationPath'], inplace=True)
df_offers = df_offers[~df_offers['locationPath'].str.contains('[a-zA-Z]', na=False)].copy()

# Split the comma-separated path into one integer column per hierarchy level.
# Missing or unparseable levels become 0.
path_cols = ['woj_id', 'pow_id', 'gmi_id', 'city_id', 'district_id', 'subdistrict_id', 'street_id']
path_df = df_offers['locationPath'].str.split(',', expand=True, n=len(path_cols)-1)
path_df.columns = path_cols[:path_df.shape[1]]
path_df = path_df.reindex(columns=path_cols, fill_value='0')
path_df = path_df.apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

df_offers['area'] = pd.to_numeric(df_offers['area'], errors='coerce')
df_offers['price'] = pd.to_numeric(df_offers['price'], errors='coerce')
df_full = pd.concat([df_offers.reset_index(drop=True), path_df.reset_index(drop=True)], axis=1)

# A row is usable when area and price are positive and a city is known.
# NaN compares False against 0, so no separate notna() checks are needed.
valid_rows_conditions = (
    (df_full['area'] > 0) & (df_full['price'] > 0) & (df_full['city_id'] != 0)
)
df_clean = df_full[valid_rows_conditions].copy()
df_clean['price_per_meter'] = df_clean['price'] / df_clean['area']
df_clean.reset_index(drop=True, inplace=True)
assert not df_clean.empty, "DataFrame jest pusty po filtrowaniu."

# Resolve district / sub-district names through a de-duplicated id -> name
# lookup. A merge against df_loc would silently duplicate offer rows if the
# location file ever contained a repeated id; a lookup cannot.
location_names = df_loc.drop_duplicates(subset='id').set_index('id')['name']
df_merged = df_clean.assign(
    name_dist=df_clean['district_id'].map(location_names),
    name_subdist=df_clean['subdistrict_id'].map(location_names),
)
# NOTE(review): this flag is computed from district_id / subdistrict_id, the
# same columns that become the prediction targets below, and it is later used
# as a numeric model input. The batch-prediction cell hard-codes it to 0.
# That looks like label leakage plus train/inference skew — confirm and
# consider removing the feature from training.
df_merged['is_central_subdistrict'] = np.where(df_merged['name_dist'] == df_merged['name_subdist'], 1, 0)

final_cols = ['SaleId', 'title', 'description', 'area', 'price', 'price_per_meter', 'city_id', 'district_id', 'subdistrict_id', 'street_id', 'is_central_subdistrict']
df_processed = df_merged[final_cols].copy()
df_processed.rename(columns={'district_id': 'target_district_id', 'subdistrict_id': 'target_subdistrict_id', 'street_id': 'target_street_id'}, inplace=True)
# Both parts are NaN-filled before concatenation, so text_features is never
# missing and needs no dropna afterwards.
df_processed['text_features'] = df_processed['title'].fillna('') + " " + df_processed['description'].fillna('')
df_processed.drop(columns=['title', 'description'], inplace=True)
print(f"Zakończono inżynierię cech. Liczba wierszy w finalnym df_processed: {len(df_processed)}")

# --- Built-in diagnostic cell ---
# Prints how many cleaned offers exist for three reference cities before the
# per-city frequency filter is applied.
print("\n--- DIAGNOSTYKA: Sprawdzanie liczby ofert dla kluczowych miast PRZED filtrowaniem ---")
try:
    # The first location row with a matching name is taken as the city;
    # a missing name raises IndexError and skips the whole report.
    warsaw_id, krakow_id, wroclaw_id = [
        df_loc[df_loc['name'] == city_name].iloc[0]['id']
        for city_name in ('Warszawa', 'Kraków', 'Wrocław')
    ]

    print(f"ID Warszawy: {warsaw_id}")
    print(f"ID Krakowa: {krakow_id}")
    print(f"ID Wrocławia: {wroclaw_id}")

    counts_before_filter = df_processed['city_id'].value_counts()

    print("\nLiczba ofert w `df_processed` (po czyszczeniu, przed filtrem liczności):")
    print(f"  - Warszawa: {counts_before_filter.get(warsaw_id, 0)}")
    print(f"  - Kraków:   {counts_before_filter.get(krakow_id, 0)}")
    print(f"  - Wrocław:  {counts_before_filter.get(wroclaw_id, 0)}")
except (IndexError, KeyError) as e:
    print(f"Nie udało się znaleźć ID któregoś z miast do celów diagnostycznych. Błąd: {e}")
# --- End of diagnostics ---


# --- Step 4: Data split and transformation ---
print("\nRozpoczynam podział danych i transformację...")

# Stratifying on city_id needs at least two offers per city, so cities that
# occur only once are dropped (the threshold is deliberately "> 1").
city_counts = df_processed['city_id'].value_counts()
valid_cities = city_counts[city_counts > 1].index
df_filtered = df_processed[df_processed['city_id'].isin(valid_cities)].copy()
print(f"Odrzucono {len(df_processed) - len(df_filtered)} wierszy z miast z 1 ofertą, aby umożliwić stratyfikację.")

assert not df_filtered.empty, "Po odfiltrowaniu miast z 1 ofertą ramka danych jest pusta."

target_columns = ['target_district_id', 'target_subdistrict_id', 'target_street_id']
X = df_filtered.drop(columns=target_columns)
y = df_filtered[target_columns]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=X['city_id'])
print(f"Podział danych: {len(X_train)} próbek treningowych, {len(X_val)} próbek walidacyjnych.")

# Numeric branch: median imputation followed by standardisation. Both are
# fitted on the training split only and persisted for inference.
numeric_features = ['area', 'price', 'price_per_meter', 'is_central_subdistrict']
imputer = SimpleImputer(strategy='median')
scaler = StandardScaler()
X_train_num = scaler.fit_transform(imputer.fit_transform(X_train[numeric_features]))
X_val_num = scaler.transform(imputer.transform(X_val[numeric_features]))
joblib.dump(imputer, os.path.join(ARTIFACTS_DIR, 'imputer.joblib'))
joblib.dump(scaler, os.path.join(ARTIFACTS_DIR, 'scaler.joblib'))

# Text branch: TF-IDF over unigrams and bigrams, vocabulary capped at
# MAX_TEXT_FEATURES, fitted on the training split only.
vectorizer = TfidfVectorizer(max_features=MAX_TEXT_FEATURES, ngram_range=(1, 2))
X_train_text = vectorizer.fit_transform(X_train['text_features'])
X_val_text = vectorizer.transform(X_val['text_features'])
joblib.dump(vectorizer, os.path.join(ARTIFACTS_DIR, 'vectorizer.joblib'))

X_train_city = X_train['city_id'].values
X_val_city = X_val['city_id'].values
# Convert the ids to a fixed-width unicode array: an object array would be
# pickled by np.save and could then only be read with allow_pickle=True.
X_val_sale_ids = X_val['SaleId'].to_numpy(dtype=str)

np.save(os.path.join(PROCESSED_DATA_DIR, 'X_val_sale_ids.npy'), X_val_sale_ids)
y_train.to_pickle(os.path.join(PROCESSED_DATA_DIR, 'y_train.pkl'))
y_val.to_pickle(os.path.join(PROCESSED_DATA_DIR, 'y_val.pkl'))
scipy.sparse.save_npz(os.path.join(PROCESSED_DATA_DIR, 'X_train_text.npz'), X_train_text)
scipy.sparse.save_npz(os.path.join(PROCESSED_DATA_DIR, 'X_val_text.npz'), X_val_text)
np.save(os.path.join(PROCESSED_DATA_DIR, 'X_train_num.npy'), X_train_num)
np.save(os.path.join(PROCESSED_DATA_DIR, 'X_val_num.npy'), X_val_num)
np.save(os.path.join(PROCESSED_DATA_DIR, 'X_train_city.npy'), X_train_city)
np.save(os.path.join(PROCESSED_DATA_DIR, 'X_val_city.npy'), X_val_city)

print("Transformatory dopasowane i wszystkie dane zapisane.")
# Release the large intermediates before the training stage starts.
del df_loc, df_offers, df_processed, X, y, X_train, X_val, y_train, y_val, df_filtered, df_merged, df_clean, df_full
del X_train_text, X_val_text, X_train_num, X_val_num, X_train_city, X_val_city
gc.collect()

print("\n--- CAŁA CZĘŚĆ 1 ZAKOŃCZONA ---")

Katalogi 'artifacts_v15_final' i 'processed_data_v15_final' są gotowe.
Wczytywanie pliku: saleflats_2024_2025_v2.csv...
Pomyślnie wczytano 1467263 wierszy.
Mapy pomocnicze zostały stworzone.

Rozpoczynam inżynierię cech...
Zakończono inżynierię cech. Liczba wierszy w finalnym df_processed: 1250259

--- DIAGNOSTYKA: Sprawdzanie liczby ofert dla kluczowych miast PRZED filtrowaniem ---
ID Warszawy: 368
ID Krakowa: 337
ID Wrocławia: 366

Liczba ofert w `df_processed` (po czyszczeniu, przed filtrem liczności):
  - Warszawa: 151022
  - Kraków:   102757
  - Wrocław:  104017

Rozpoczynam podział danych i transformację...
Odrzucono 2864 wierszy z miast z 1 ofertą, aby umożliwić stratyfikację.
Podział danych: 997916 próbek treningowych, 249479 próbek walidacyjnych.


# Część 2: Budowa, Trening i Inferencja Modelu

In [1]:
import pandas as pd
import numpy as np
import joblib
import os
import scipy.sparse
import tensorflow as tf
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense, Dropout, Embedding, Flatten, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import matplotlib.pyplot as plt

# Seed shared by the evaluation cells of this part.
RANDOM_STATE = 42

# Directory names unified to 'v15_final' so they match the preprocessing stage.
PROCESSED_DATA_DIR = 'processed_data_v15_final'
ARTIFACTS_DIR = 'artifacts_v15_final'

# The best checkpoint is stored next to the other artifacts.
MODEL_PATH = os.path.join(ARTIFACTS_DIR, 'best_location_model_v15.keras')

In [2]:
# Reload everything the preprocessing stage wrote to the 'v15_final' directories.
def _processed_path(filename):
    """Return the path of *filename* inside PROCESSED_DATA_DIR."""
    return os.path.join(PROCESSED_DATA_DIR, filename)


print("Wczytywanie artefaktów...")
hierarchy_map = joblib.load(os.path.join(ARTIFACTS_DIR, 'hierarchy_map.joblib'))
id_to_name = joblib.load(os.path.join(ARTIFACTS_DIR, 'id_to_name.joblib'))

print("Wczytywanie przetworzonych danych...")
# Sparse TF-IDF matrices.
X_train_text = scipy.sparse.load_npz(_processed_path('X_train_text.npz'))
X_val_text = scipy.sparse.load_npz(_processed_path('X_val_text.npz'))
# Dense numeric features and raw city ids.
X_train_num = np.load(_processed_path('X_train_num.npy'))
X_val_num = np.load(_processed_path('X_val_num.npy'))
X_train_city = np.load(_processed_path('X_train_city.npy'))
X_val_city = np.load(_processed_path('X_val_city.npy'))

# allow_pickle=True is passed so the sale-id array can be read even when it
# was stored as a pickled object array.
X_val_sale_ids = np.load(_processed_path('X_val_sale_ids.npy'), allow_pickle=True)

y_train = pd.read_pickle(_processed_path('y_train.pkl'))
y_val = pd.read_pickle(_processed_path('y_val.pkl'))
print("Dane wczytane.")

Wczytywanie artefaktów...
Wczytywanie przetworzonych danych...
Dane wczytane.


In [3]:
def create_mapper(series_list):
    """Build an id -> consecutive index mapping over several Series.

    The Series are concatenated in the given order and every distinct value
    receives the next index (starting at 0) in order of first appearance.

    Args:
        series_list: Sequence of pandas Series holding the raw ids.

    Returns:
        Dict mapping each distinct id to its integer index.
    """
    all_values = pd.concat(series_list)
    distinct_values = pd.unique(all_values)
    return dict(zip(distinct_values, range(len(distinct_values))))

def map_labels(series, mapper):
    """Translate raw ids into integer indices using *mapper*.

    Args:
        series: pandas Series of raw ids.
        mapper: Dict from raw id to integer index.

    Returns:
        NumPy integer array with one index per element of *series*. Ids that
        are missing from *mapper* are encoded as 0.
    """
    encoded = series.map(mapper)
    # NOTE(review): 0 is also a regular index handed out by create_mapper, so
    # unknown ids cannot be told apart from that class — confirm this is intended.
    encoded = encoded.fillna(0)
    return encoded.astype(int).to_numpy()

# One id -> index encoder per target level plus one for the city input. They
# are built over train and validation together so every validation id has an
# index of its own.
district_map = create_mapper([y_train['target_district_id'], y_val['target_district_id']])
subdistrict_map = create_mapper([y_train['target_subdistrict_id'], y_val['target_subdistrict_id']])
street_map = create_mapper([y_train['target_street_id'], y_val['target_street_id']])
city_map = create_mapper([pd.Series(X_train_city), pd.Series(X_val_city)])

# Persist the encoders in 'artifacts_v15_final' for the inference stage.
joblib.dump(district_map, os.path.join(ARTIFACTS_DIR, 'district_map.joblib'))
joblib.dump(subdistrict_map, os.path.join(ARTIFACTS_DIR, 'subdistrict_map.joblib'))
joblib.dump(street_map, os.path.join(ARTIFACTS_DIR, 'street_map.joblib'))
joblib.dump(city_map, os.path.join(ARTIFACTS_DIR, 'city_map.joblib'))

# Encoded targets.
y_district_train_mapped = map_labels(y_train['target_district_id'], district_map)
y_subdistrict_train_mapped = map_labels(y_train['target_subdistrict_id'], subdistrict_map)
y_street_train_mapped = map_labels(y_train['target_street_id'], street_map)
y_district_val_mapped = map_labels(y_val['target_district_id'], district_map)
y_subdistrict_val_mapped = map_labels(y_val['target_subdistrict_id'], subdistrict_map)
y_street_val_mapped = map_labels(y_val['target_street_id'], street_map)
# Encoded city input for the embedding layer.
X_train_city_mapped = map_labels(pd.Series(X_train_city), city_map)
X_val_city_mapped = map_labels(pd.Series(X_val_city), city_map)

# Take the text width from the loaded TF-IDF matrix instead of repeating the
# vectorizer's cap: the fitted vocabulary may be smaller than the cap, and the
# input layer must match the actual number of columns.
MAX_TEXT_FEATURES = X_train_text.shape[1]
NUM_DISTRICTS, NUM_SUBDISTRICTS, NUM_STREETS, NUM_CITIES = len(district_map), len(subdistrict_map), len(street_map), len(city_map)
NUM_FEATURES = X_train_num.shape[1]

# Three inputs: sparse text vector, numeric features and the encoded city index.
input_text = Input(shape=(MAX_TEXT_FEATURES,), name='text_input', sparse=True)
input_num = Input(shape=(NUM_FEATURES,), name='num_input')
input_city = Input(shape=(1,), name='city_input')

# Text branch: a single dense projection with dropout.
text_branch = Dense(128, activation='relu')(input_text)
text_branch = Dropout(0.3)(text_branch)

# Numeric branch: two small dense layers.
num_branch = Dense(64, activation='relu')(input_num)
num_branch = Dense(32, activation='relu')(num_branch)

# City branch: learned 50-dimensional embedding, flattened to a vector.
city_branch = Embedding(input_dim=NUM_CITIES, output_dim=50, name='city_embedding')(input_city)
city_branch = Flatten()(city_branch)

# Shared trunk over the concatenated branches.
combined = Concatenate()([text_branch, num_branch, city_branch])
z = Dense(512, activation='relu')(combined)
z = Dropout(0.5)(z)
z = Dense(256, activation='relu')(z)
z = Dropout(0.5)(z)
z = Dense(128, activation='relu')(z)
z = Dropout(0.5)(z)

# One softmax head per location level.
output_district = Dense(NUM_DISTRICTS, activation='softmax', name='district_output')(z)
output_subdistrict = Dense(NUM_SUBDISTRICTS, activation='softmax', name='subdistrict_output')(z)
output_street = Dense(NUM_STREETS, activation='softmax', name='street_output')(z)

model = Model(
    inputs=[input_text, input_num, input_city],
    outputs=[output_district, output_subdistrict, output_street],
)
model.summary()

## Trening z Prostą Stratą i Umiarkowanymi Wagami

In [4]:
# Every head is trained with the same loss on integer-encoded labels.
district_loss = subdistrict_loss = street_loss = 'sparse_categorical_crossentropy'

model.compile(
    optimizer='adam',
    loss={
        'district_output': district_loss,
        'subdistrict_output': subdistrict_loss,
        'street_output': street_loss,
    },
    # Finer location levels get a moderately larger share of the total loss.
    loss_weights={
        'district_output': 1.0,
        'subdistrict_output': 1.2,
        'street_output': 1.5,
    },
    metrics={
        'district_output': 'accuracy',
        'subdistrict_output': 'accuracy',
        'street_output': 'accuracy',
    },
)

# Per-sample weights: rows with a non-zero sub-district / street id count more
# for the corresponding head; the district head is unweighted.
subdistrict_train_weights = np.where(y_train['target_subdistrict_id'].values > 0, 1.5, 1.0)
street_train_weights = np.where(y_train['target_street_id'].values > 0, 2.0, 1.0)
sample_weights_list = [
    np.ones(len(y_train)),
    subdistrict_train_weights,
    street_train_weights,
]

# Inputs are keyed by input-layer name; targets follow the model's output order.
X_train_dict = {
    'text_input': X_train_text,
    'num_input': X_train_num,
    'city_input': X_train_city_mapped,
}
X_val_dict = {
    'text_input': X_val_text,
    'num_input': X_val_num,
    'city_input': X_val_city_mapped,
}
y_train_list = [y_district_train_mapped, y_subdistrict_train_mapped, y_street_train_mapped]
y_val_list = [y_district_val_mapped, y_subdistrict_val_mapped, y_street_val_mapped]

# Keep the checkpoint with the lowest validation loss at MODEL_PATH and stop
# after three epochs without improvement.
checkpoint_callback = ModelCheckpoint(MODEL_PATH, monitor='val_loss', save_best_only=True, verbose=1)
early_stopping_callback = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True, verbose=1)
callbacks = [checkpoint_callback, early_stopping_callback]

history = model.fit(
    X_train_dict,
    y_train_list,
    sample_weight=sample_weights_list,
    validation_data=(X_val_dict, y_val_list),
    epochs=20,
    batch_size=128,
    callbacks=callbacks,
)

Epoch 1/20
[1m7796/7797[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 67ms/step - district_output_accuracy: 0.7115 - district_output_loss: 0.9870 - loss: 17.1736 - street_output_accuracy: 0.6366 - street_output_loss: 7.4187 - subdistrict_output_accuracy: 0.5066 - subdistrict_output_loss: 4.2154
Epoch 1: val_loss improved from inf to 7.50017, saving model to artifacts_v15_final\best_location_model_v15.keras
[1m7797/7797[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m577s[0m 74ms/step - district_output_accuracy: 0.7115 - district_output_loss: 0.9870 - loss: 17.1731 - street_output_accuracy: 0.6366 - street_output_loss: 7.4186 - subdistrict_output_accuracy: 0.5066 - subdistrict_output_loss: 4.2152 - val_district_output_accuracy: 0.7880 - val_district_output_loss: 0.6109 - val_loss: 7.5002 - val_street_output_accuracy: 0.6442 - val_street_output_loss: 3.0437 - val_subdistrict_output_accuracy: 0.5575 - val_subdistrict_output_loss: 1.9361
Epoch 2/20
[1m7796/7797[0m [32m━━━━━━━━

## Predykcja Top-K (k=5) i Analiza Błędów

In [5]:
# Reload the best checkpoint written during training.
print(f"Wczytywanie najlepszego modelu z: {MODEL_PATH}")
best_model = tf.keras.models.load_model(MODEL_PATH)
print("Model wczytany pomyślnie.")

# Inverse encoders: class index -> original location id.
inv_district_map = dict(zip(district_map.values(), district_map.keys()))
inv_subdistrict_map = dict(zip(subdistrict_map.values(), subdistrict_map.keys()))
inv_street_map = dict(zip(street_map.values(), street_map.keys()))

def find_best_consistent_path(top_k_districts, top_k_subdistricts, top_k_streets, hierarchy_map, city_id=None):
    """Pick the first (district, sub-district, street) triple that is hierarchy-consistent.

    Candidates are tried in the given order, districts outermost. An id of 0
    means "level not present" and is always accepted. A non-zero sub-district
    must have the chosen district as its parent in ``hierarchy_map``; a
    non-zero street must have the chosen sub-district as parent, or the
    district when the sub-district is 0.

    Without ``city_id`` a level of 0 is compared literally, so a non-zero
    child below a missing level is only accepted if its recorded parent is 0.
    When ``city_id`` is given, a missing level falls back to the city as the
    expected parent, which allows sub-districts and streets attached directly
    to the city.

    Args:
        top_k_districts: Candidate district ids, best first.
        top_k_subdistricts: Candidate sub-district ids, best first.
        top_k_streets: Candidate street ids, best first.
        hierarchy_map: Dict mapping a location id to its parent id.
        city_id: Optional id of the offer's city. ``None`` (default) or 0
            keeps the literal comparison described above.

    Returns:
        Tuple ``(district_id, subdistrict_id, street_id)``; the first
        candidate of each list when no consistent combination exists.

    Raises:
        IndexError: If no consistent path exists and a candidate list is empty.
    """
    def has_parent(child_id, parent_id):
        # A missing upper level (0) means the child must hang off the city.
        if parent_id == 0 and city_id is not None:
            parent_id = city_id
        return hierarchy_map.get(child_id) == parent_id

    for dist_id in top_k_districts:
        for sub_id in top_k_subdistricts:
            if sub_id != 0 and not has_parent(sub_id, dist_id):
                continue
            street_parent = sub_id if sub_id != 0 else dist_id
            for street_id in top_k_streets:
                if street_id != 0 and not has_parent(street_id, street_parent):
                    continue
                return dist_id, sub_id, street_id
    # Nothing consistent: fall back to the individually most likely ids.
    return top_k_districts[0], top_k_subdistricts[0], top_k_streets[0]

# Evaluate on a random subset of the validation set, never asking for more
# rows than exist.
num_samples = min(1000, len(y_val))
k = 5
# Seeded generator so the evaluated subset and the reported accuracies are
# reproducible between runs.
rng = np.random.default_rng(RANDOM_STATE)
indices = rng.choice(len(y_val), size=num_samples, replace=False)
results_data = []

print(f"Przeprowadzanie predykcji Top-{k} z post-processingiem na {num_samples} próbkach...")
# One batched forward pass replaces a separate predict() call per sample.
batch_inputs = {
    'text_input': X_val_text[indices],
    'num_input': X_val_num[indices],
    'city_input': map_labels(pd.Series(X_val_city[indices]), city_map).reshape(-1, 1),
}
district_probs, subdistrict_probs, street_probs = best_model.predict(batch_inputs, batch_size=256, verbose=0)

true_districts = y_val['target_district_id'].to_numpy()[indices]
true_subdistricts = y_val['target_subdistrict_id'].to_numpy()[indices]
true_streets = y_val['target_street_id'].to_numpy()[indices]

for row, i in enumerate(indices):
    # Candidates per level, most probable first, decoded to original ids.
    top_k_district_ids = [inv_district_map.get(idx, 0) for idx in np.argsort(district_probs[row])[::-1][:k]]
    top_k_subdistrict_ids = [inv_subdistrict_map.get(idx, 0) for idx in np.argsort(subdistrict_probs[row])[::-1][:k]]
    top_k_street_ids = [inv_street_map.get(idx, 0) for idx in np.argsort(street_probs[row])[::-1][:k]]

    final_district, final_subdistrict, final_street = find_best_consistent_path(
        top_k_district_ids, top_k_subdistrict_ids, top_k_street_ids, hierarchy_map
    )

    results_data.append({
        'SaleId': X_val_sale_ids[i],
        'City_ID': X_val_city[i],
        'True_District': true_districts[row], 'Pred_District': final_district,
        'True_SubDistrict': true_subdistricts[row], 'Pred_SubDistrict': final_subdistrict,
        'True_Street': true_streets[row], 'Pred_Street': final_street
    })

df_results = pd.DataFrame(results_data)

# Per-level correctness and the strict "all three levels correct" flag.
df_results['Correct_District'] = (df_results['True_District'] == df_results['Pred_District'])
df_results['Correct_SubDistrict'] = (df_results['True_SubDistrict'] == df_results['Pred_SubDistrict'])
df_results['Correct_Street'] = (df_results['True_Street'] == df_results['Pred_Street'])
df_results['Correct_Overall'] = df_results['Correct_District'] & df_results['Correct_SubDistrict'] & df_results['Correct_Street']

acc_district = df_results['Correct_District'].mean()
acc_subdistrict = df_results['Correct_SubDistrict'].mean()
acc_street = df_results['Correct_Street'].mean()
acc_overall = df_results['Correct_Overall'].mean()

print("\n--- Wyniki Ewaluacji (po Top-K i Post-processingu) ---")
print(f"Dokładność dla Dzielnic: {acc_district:.2%}")
print(f"Dokładność dla Pod-dzielnic: {acc_subdistrict:.2%}")
print(f"Dokładność dla Ulic: {acc_street:.2%}")
print(f"\nDokładność CAŁKOWITA (wszystkie poziomy poprawne): {acc_overall:.2%}")

Wczytywanie najlepszego modelu z: artifacts_v15_final\best_location_model_v15.keras
Model wczytany pomyślnie.
Przeprowadzanie predykcji Top-5 z post-processingiem na 1000 próbkach...

--- Wyniki Ewaluacji (po Top-K i Post-processingu) ---
Dokładność dla Dzielnic: 91.10%
Dokładność dla Pod-dzielnic: 60.20%
Dokładność dla Ulic: 64.10%

Dokładność CAŁKOWITA (wszystkie poziomy poprawne): 42.10%


In [6]:
# Add a human-readable "<column>_Name" next to every id column; ids without a
# known name are shown as 'Brak'.
for col in ['City_ID', 'True_District', 'Pred_District', 'True_SubDistrict', 'Pred_SubDistrict', 'True_Street', 'Pred_Street']:
    df_results[f'{col}_Name'] = df_results[col].map(lambda x: id_to_name.get(x, 'Brak'))

df_errors = df_results[~df_results['Correct_Overall']].copy()
print("\n--- Analiza Błędów ---")

if df_errors.empty:
    print("\nModel nie popełnił żadnych błędów!")
else:
    # Most frequent (true, predicted) district confusions.
    district_errors = df_errors[~df_errors['Correct_District']]
    if district_errors.empty:
        print("\nBrak błędów na poziomie DZIELNICY!")
    else:
        print("\nTop 10 najczęstszych pomyłek na poziomie DZIELNICY:")
        display(district_errors.groupby(['True_District_Name', 'Pred_District_Name']).size().nlargest(10))

    # Sub-district confusions among rows whose district was predicted correctly.
    subdistrict_errors = df_errors[df_errors['Correct_District'] & ~df_errors['Correct_SubDistrict']]
    if subdistrict_errors.empty:
        print("\nBrak błędów na poziomie POD-DZIELNICY przy poprawnych dzielnicach.")
    else:
        print("\nTop 10 najczęstszych pomyłek na poziomie POD-DZIELNICY (przy poprawnej dzielnicy):")
        display(subdistrict_errors.groupby(['True_SubDistrict_Name', 'Pred_SubDistrict_Name']).size().nlargest(10))

def build_path(row, prefix):
    """Format one result row as ``city -> district -> subdistrict -> street``.

    Args:
        row: Mapping with ``City_ID_Name`` and the
            ``{prefix}_District_Name`` / ``{prefix}_SubDistrict_Name`` /
            ``{prefix}_Street_Name`` entries.
        prefix: Column prefix selecting which set of names to use
            (the callers pass ``'True'`` or ``'Pred'``).

    Returns:
        The four names joined by ``" -> "``; the placeholder ``'Brak'`` is
        rendered as ``'?'``.
    """
    def shown(column):
        value = row[column]
        return '?' if value == 'Brak' else value

    columns = ['City_ID_Name'] + [
        f'{prefix}_{level}_Name' for level in ('District', 'SubDistrict', 'Street')
    ]
    return " -> ".join(f"{shown(column)}" for column in columns)

# Readable true / predicted location paths for every evaluated offer.
df_results['True_Loc'] = df_results.apply(build_path, axis=1, args=('True',))
df_results['Predict_Loc'] = df_results.apply(build_path, axis=1, args=('Pred',))

print("\n--- Losowe wyniki predykcji ---")
display_columns = ['SaleId', 'True_Loc', 'Predict_Loc', 'Correct_Overall']
df_display = df_results[display_columns].set_index('SaleId')
# Show at most 20 rows; the seed keeps the selection stable between runs.
sample_size = min(20, len(df_display))
display(df_display.sample(sample_size, random_state=RANDOM_STATE))


--- Analiza Błędów ---

Top 10 najczęstszych pomyłek na poziomie DZIELNICY:


True_District_Name    Pred_District_Name 
Bielany               Praga-południe         8
Wrocław-stare miasto  Wrocław-śródmieście    5
Brak                  Kraków-podgórze        4
                      Kraków-krowodrza       3
Targówek              Białołęka              3
Wrocław-krzyki        Wrocław-fabryczna      3
Łódź-widzew           Łódź-górna             3
Brak                  Poznań-stare miasto    2
                      Praga-południe         2
                      Wrocław-krzyki         2
dtype: int64


Top 10 najczęstszych pomyłek na poziomie POD-DZIELNICY (przy poprawnej dzielnicy):


True_SubDistrict_Name  Pred_SubDistrict_Name
Śródmieście            Brak                     22
Centrum                Brak                      9
Sielec                 Brak                      5
Siedlce                Brak                      4
Chylonia               Brak                      3
Działki leśne          Brak                      3
Kapuściska             Brak                      3
Letnica                Brak                      3
Rubinkowo              Brak                      3
Stare miasto           Brak                      3
dtype: int64


--- Losowe wyniki predykcji ---


Unnamed: 0_level_0,True_Loc,Predict_Loc,Correct_Overall
SaleId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3027697,Legnica -> ? -> ? -> ?,Legnica -> ? -> ? -> ?,True
3398406,Krosno odrzańskie -> ? -> ? -> Mikołaja kopernika,Krosno odrzańskie -> ? -> ? -> ?,False
2265349,Warszawa -> Mokotów -> ? -> Balladyny,Warszawa -> Mokotów -> ? -> ?,False
3035282,Białystok -> ? -> ? -> ?,Białystok -> ? -> ? -> ?,True
3945153,Kraków -> Kraków-krowodrza -> Bronowice małe -...,Kraków -> Kraków-krowodrza -> Krowodrza -> ?,False
3773873,Piekary śląskie -> ? -> ? -> Papieża jana pawł...,Piekary śląskie -> ? -> ? -> ?,False
3998319,Kraków -> Kraków-śródmieście -> ? -> Będzińska,Kraków -> Kraków-śródmieście -> Stare miasto -> ?,False
4411716,Gdynia -> ? -> Wiczlino -> ?,Gdynia -> ? -> ? -> ?,False
643036,Warszawa -> Praga-południe -> Gocław -> ?,Warszawa -> Praga-południe -> Grochów -> ?,False
4238112,Warszawa -> Bielany -> Słodowiec -> Leopolda s...,Warszawa -> Praga-południe -> Grochów -> ?,False


In [15]:
# --- OSTATECZNA WERSJA SKRYPTU DO PREDYKCJI (v12 - RADYKALNIE UPROSZCZONY, ZERO FILTROWANIA) ---
# Koniec z filtrowaniem. Każdy wiersz jest przetwarzany, bez względu na jego jakość.

import pandas as pd
import numpy as np
import os
import joblib
import gc
import tensorflow as tf
from tqdm.notebook import tqdm

# --- Step 1: Load the trained model and every artifact needed for inference ---
print("Wczytywanie wytrenowanych artefaktów i modelu...")
ARTIFACTS_DIR = 'artifacts_v15_final'
MODEL_PATH = os.path.join(ARTIFACTS_DIR, 'best_location_model_v15.keras')


def _load_artifact(filename):
    """Load one joblib artifact from ARTIFACTS_DIR."""
    return joblib.load(os.path.join(ARTIFACTS_DIR, filename))


# Free memory left over from earlier cells before loading the model.
gc.collect()
model = tf.keras.models.load_model(MODEL_PATH)
print("Model wczytany pomyślnie.")

# Fitted transformers.
vectorizer = _load_artifact('vectorizer.joblib')
scaler = _load_artifact('scaler.joblib')
imputer = _load_artifact('imputer.joblib')
# Location lookups.
hierarchy_map = _load_artifact('hierarchy_map.joblib')
id_to_name = _load_artifact('id_to_name.joblib')
# Id -> index encoders.
city_map = _load_artifact('city_map.joblib')
district_map = _load_artifact('district_map.joblib')
subdistrict_map = _load_artifact('subdistrict_map.joblib')
street_map = _load_artifact('street_map.joblib')

# Inverse encoders: class index -> original location id.
inv_district_map = dict(zip(district_map.values(), district_map.keys()))
inv_subdistrict_map = dict(zip(subdistrict_map.values(), subdistrict_map.keys()))
inv_street_map = dict(zip(street_map.values(), street_map.keys()))
print("Wszystkie artefakty wczytane.")

# --- Krok 2: Definicje funkcji pomocniczych (bez zmian) ---
def find_best_consistent_path(top_k_districts, top_k_subdistricts, top_k_streets, hierarchy_map, city_id=None):
    """Pick the first (district, sub-district, street) triple that is hierarchy-consistent.

    Candidates are tried in the given order, districts outermost. An id of 0
    means "level not present" and is always accepted. A non-zero sub-district
    must have the chosen district as its parent in ``hierarchy_map``; a
    non-zero street must have the chosen sub-district as parent, or the
    district when the sub-district is 0.

    Without ``city_id`` a level of 0 is compared literally, so a non-zero
    child below a missing level is only accepted if its recorded parent is 0.
    When ``city_id`` is given, a missing level falls back to the city as the
    expected parent, which allows sub-districts and streets attached directly
    to the city.

    Args:
        top_k_districts: Candidate district ids, best first.
        top_k_subdistricts: Candidate sub-district ids, best first.
        top_k_streets: Candidate street ids, best first.
        hierarchy_map: Dict mapping a location id to its parent id.
        city_id: Optional id of the offer's city. ``None`` (default) or 0
            keeps the literal comparison described above.

    Returns:
        Tuple ``(district_id, subdistrict_id, street_id)``; the first
        candidate of each list when no consistent combination exists.

    Raises:
        IndexError: If no consistent path exists and a candidate list is empty.
    """
    def has_parent(child_id, parent_id):
        # A missing upper level (0) means the child must hang off the city.
        if parent_id == 0 and city_id is not None:
            parent_id = city_id
        return hierarchy_map.get(child_id) == parent_id

    for dist_id in top_k_districts:
        for sub_id in top_k_subdistricts:
            if sub_id != 0 and not has_parent(sub_id, dist_id):
                continue
            street_parent = sub_id if sub_id != 0 else dist_id
            for street_id in top_k_streets:
                if street_id != 0 and not has_parent(street_id, street_parent):
                    continue
                return dist_id, sub_id, street_id
    # Nothing consistent: fall back to the individually most likely ids.
    return top_k_districts[0], top_k_subdistricts[0], top_k_streets[0]

def build_predicted_path(row, id_to_name_map, hierarchy_map):
    """Render a prediction row as ``city -> district -> subdistrict -> street``.

    Args:
        row: Mapping with ``Pred_City_ID``, ``Pred_District_ID``,
            ``Pred_SubDistrict_ID`` and ``Pred_Street_ID``.
        id_to_name_map: Dict from location id to display name.
        hierarchy_map: Dict from location id to parent id.

    Returns:
        The four names joined by ``" -> "``, with ``'?'`` for ids that have
        no name. If the district id is missing, all four parts are ``'?'``.
        When the city id is 0 but a district was predicted, the city is taken
        from the district's nearest named ancestor in ``hierarchy_map``.
    """
    if pd.isna(row.get('Pred_District_ID')):
        return "? -> ? -> ? -> ?"

    def nearest_named_ancestor(location_id, fallback):
        # Walk up the hierarchy until a named ancestor is found or the chain ends.
        ancestor = hierarchy_map.get(location_id)
        while ancestor is not None and ancestor != 0:
            if ancestor in id_to_name_map:
                return ancestor
            ancestor = hierarchy_map.get(ancestor)
        return fallback

    city_id = row['Pred_City_ID']
    district_id = row['Pred_District_ID']
    if city_id == 0 and district_id != 0:
        city_id = nearest_named_ancestor(district_id, city_id)

    level_ids = (city_id, district_id, row['Pred_SubDistrict_ID'], row['Pred_Street_ID'])
    return " -> ".join(f"{id_to_name_map.get(level_id, '?')}" for level_id in level_ids)

# --- Step 3: Settings and prediction loop ---
CHUNK_SIZE = 50000
OFFERS_PATH = 'saleflats_2024_2025_v2.csv'
OUTPUT_FILENAME = 'Location_Polska_v15_full_prediction.csv'
is_first_chunk = True

# Chunks are appended to the output, so start from a clean file.
if os.path.exists(OUTPUT_FILENAME):
    os.remove(OUTPUT_FILENAME)
    print(f"Usunięto istniejący plik: {OUTPUT_FILENAME}")

print(f"\nRozpoczynam przetwarzanie pliku '{OFFERS_PATH}'...")
cols_map = {0: 'SaleId', 3: 'title', 4: 'description', 5: 'area', 6: 'price', 52: 'locationPath'}
chunk_iterator = pd.read_csv(
    OFFERS_PATH, header=None, sep=',', quotechar='"', dtype=str,
    on_bad_lines='warn', usecols=list(cols_map.keys()),
    chunksize=CHUNK_SIZE, low_memory=False
)

for chunk in tqdm(chunk_iterator, desc="Przetwarzanie chunków"):
    chunk.columns = list(cols_map.values())
    if chunk.empty:
        continue

    # --- A. Data preparation: no row is filtered out ---

    # 1. Text features: missing title / description become empty strings.
    chunk['text_features'] = chunk['title'].fillna('') + " " + chunk['description'].fillna('')

    # 2. Numeric features: unparseable values become NaN for the imputer.
    chunk['area'] = pd.to_numeric(chunk['area'], errors='coerce')
    chunk['price'] = pd.to_numeric(chunk['price'], errors='coerce')

    # price_per_meter is only defined for a positive area; otherwise NaN.
    chunk['price_per_meter'] = chunk['price'] / chunk['area'].where(chunk['area'] > 0)

    # NOTE(review): the training stage derives this flag from the district and
    # sub-district ids, while here it is always 0 — confirm the skew is intended.
    chunk['is_central_subdistrict'] = 0

    # 3. City id: fourth element of the comma-separated location path.
    # Assign the result back; an in-place fillna on a column selection is not
    # guaranteed to modify `chunk` (and does nothing under Copy-on-Write).
    chunk['locationPath'] = chunk['locationPath'].fillna('')
    path_df = chunk['locationPath'].str.split(',', expand=True, n=4)
    # .get(3, ...) tolerates chunks where no path has four elements.
    city_ids_raw = path_df.get(3, pd.Series(index=chunk.index, dtype=object)).fillna('0')
    city_ids = pd.to_numeric(city_ids_raw, errors='coerce').fillna(0).astype(int)

    # --- B. Transformation and prediction ---
    # The imputer fills every NaN left in the numeric columns.
    numeric_cols = ['area', 'price', 'price_per_meter', 'is_central_subdistrict']
    X_num_chunk = scaler.transform(imputer.transform(chunk[numeric_cols]))
    X_text_chunk = vectorizer.transform(chunk['text_features'])
    # NOTE(review): cities missing from city_map fall back to index 0, which
    # is also the index of a real city in the encoder — confirm this is acceptable.
    X_city_chunk = city_ids.map(city_map).fillna(0).astype(int).values

    predictions = model.predict([X_text_chunk, X_num_chunk, X_city_chunk], batch_size=4096, verbose=0)

    # --- C. Post-processing and saving ---
    k = 5
    # Plain arrays avoid building a pandas row object for every offer.
    sale_ids = chunk['SaleId'].to_numpy()
    city_id_values = city_ids.to_numpy()
    results = []
    for sale_id, city_id, pred_district_probs, pred_subdistrict_probs, pred_street_probs in zip(
        sale_ids, city_id_values, *predictions
    ):
        top_k_district_ids = [inv_district_map.get(idx, 0) for idx in np.argsort(pred_district_probs)[::-1][:k]]
        top_k_subdistrict_ids = [inv_subdistrict_map.get(idx, 0) for idx in np.argsort(pred_subdistrict_probs)[::-1][:k]]
        top_k_street_ids = [inv_street_map.get(idx, 0) for idx in np.argsort(pred_street_probs)[::-1][:k]]
        final_district, final_subdistrict, final_street = find_best_consistent_path(
            top_k_district_ids, top_k_subdistrict_ids, top_k_street_ids, hierarchy_map
        )
        results.append({
            'SaleId': sale_id, 'Pred_City_ID': city_id,
            'Pred_District_ID': final_district, 'Pred_SubDistrict_ID': final_subdistrict, 'Pred_Street_ID': final_street
        })
    df_predictions = pd.DataFrame(results)

    # The whole chunk was processed, so no merge with the input is needed.
    df_predictions['Predict_Loc'] = df_predictions.apply(lambda row: build_predicted_path(row, id_to_name, hierarchy_map), axis=1)

    final_df_to_save = df_predictions[['SaleId', 'Predict_Loc']]
    # Only the first written chunk carries the header row.
    final_df_to_save.to_csv(OUTPUT_FILENAME, mode='a', header=is_first_chunk, index=False, encoding='utf-8-sig')
    is_first_chunk = False

    gc.collect()

print(f"\n✅ Zakończono! Finalny plik z predykcjami został zapisany jako: {OUTPUT_FILENAME}")
print("Oto 10 pierwszych wierszy z wynikami:")
display(pd.read_csv(OUTPUT_FILENAME).head(10))

Wczytywanie wytrenowanych artefaktów i modelu...
Model wczytany pomyślnie.
Wszystkie artefakty wczytane.
Usunięto istniejący plik: Location_Polska_v15_full_prediction.csv

Rozpoczynam przetwarzanie pliku 'saleflats_2024_2025_v2.csv'...


Przetwarzanie chunków: 0it [00:00, ?it/s]


✅ Zakończono! Finalny plik z predykcjami został zapisany jako: Location_Polska_v15_full_prediction.csv
Oto 10 pierwszych wierszy z wynikami:


Unnamed: 0,SaleId,Predict_Loc
0,88,Białystok -> ? -> ? -> ?
1,99,Białystok -> ? -> ? -> ?
2,115,Białystok -> ? -> ? -> ?
3,140,Białystok -> ? -> ? -> ?
4,145,Białystok -> ? -> ? -> ?
5,159,Białystok -> ? -> ? -> ?
6,165,Białystok -> ? -> ? -> ?
7,173,Białystok -> ? -> ? -> ?
8,179,? -> ? -> ? -> ?
9,189,Białystok -> ? -> ? -> ?
