In [1]:
# --- Podstawowe biblioteki ---
import pandas as pd
import numpy as np
from datetime import datetime
import os
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import mlflow

# --- Biblioteki do Machine Learningu i śledzenia eksperymentów ---
# POPRAWKA: Dodajemy 'pull' do listy importów
from pycaret.regression import setup, compare_models, tune_model, finalize_model, save_model, predict_model, pull



In [2]:
# Load the complete dataset. Any failure while reading is reported and an empty
# frame is used instead, so the following cells can detect it via `df_original.empty`.
df_original = pd.DataFrame()
print("Wczytuję kompletny plik danych...")
try:
    df_original = pd.read_csv(
        'Data_state_LSTM_predicted_full.csv',
        sep=';',
        low_memory=False,
    )
except Exception as load_error:
    print(f"BŁĄD: {load_error}")
else:
    print(f"Wczytano pomyślnie. Kształt danych: {df_original.shape}")

Wczytuję kompletny plik danych...
Wczytano pomyślnie. Kształt danych: (760765, 54)


In [3]:
if not df_original.empty:
    # 1. Feature engineering (inflation adjustment)
    ANNUAL_PRICE_GROWTH = 1.05  # yearly compounding factor applied to historical prices

    def adjust_price(row, reference_date=None):
        """Compound a listing's price from its offer date up to ``reference_date``.

        Args:
            row: Row (Series or mapping) holding 'Price' and, optionally,
                'NewestDate' and/or 'DateAdded'.
            reference_date: Point in time the price is adjusted to. Defaults to
                ``datetime.now()`` when omitted.

        Returns:
            NaN when 'Price' cannot be parsed as a number. The unchanged price
            when no date is available, the date is the literal 'NULL', the date
            cannot be parsed, the offer is not older than ``reference_date``, or
            any error occurs during the computation. Otherwise
            ``price * ANNUAL_PRICE_GROWTH ** years`` rounded to 0 decimals.
        """
        price = pd.to_numeric(row['Price'], errors='coerce')
        if pd.isna(price):
            return np.nan

        # Prefer 'NewestDate'; use 'DateAdded' only when the former is absent or null.
        if 'NewestDate' in row and pd.notna(row['NewestDate']):
            date_str = row['NewestDate']
        elif 'DateAdded' in row and pd.notna(row['DateAdded']):
            date_str = row['DateAdded']
        else:
            date_str = None
        if date_str is None or date_str == 'NULL':
            return price

        if reference_date is None:
            reference_date = datetime.now()
        try:
            offer_date = pd.to_datetime(date_str, errors='coerce')
            if pd.isna(offer_date):
                return price
            years_diff = (reference_date - offer_date).days / 365.25
            if years_diff > 0:
                return round(price * (ANNUAL_PRICE_GROWTH ** years_diff), 0)
            return price
        except Exception:
            # Deliberate best effort: an unusable date leaves the price as is.
            # `Exception` (not a bare except) so a kernel interrupt is not swallowed.
            return price

    # Only the columns read by adjust_price are passed to the row-wise apply;
    # building a row object out of every column of the frame is needlessly slow.
    price_input_cols = ['Price'] + [
        c for c in ('NewestDate', 'DateAdded') if c in df_original.columns
    ]
    # One shared reference date, so every row is adjusted to the same moment.
    reference_date = datetime.now()
    df_original['AdjustedPrice'] = df_original[price_input_cols].apply(
        adjust_price, axis=1, reference_date=reference_date
    )

    # 2. Type conversion: non-numeric entries become NaN
    numeric_cols = ['Area', 'NumberOfRooms', 'Floor', 'Floors']
    for col in numeric_cols:
        df_original[col] = pd.to_numeric(df_original[col], errors='coerce')

    # 3. Select training rows: both the target and the area must be known
    data_for_training = df_original.dropna(subset=['AdjustedPrice', 'Area']).copy()
    print(f"Liczba wierszy do treningu (przed czyszczeniem): {len(data_for_training)}")

    # 4. Outlier removal: keep rows inside [Q1 - 1.5*IQR, Q3 + 1.5*IQR],
    #    first for the price and then (on the already filtered rows) for the area
    for col in ['AdjustedPrice', 'Area']:
        Q1 = data_for_training[col].quantile(0.25)
        Q3 = data_for_training[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
        data_for_training = data_for_training[(data_for_training[col] >= lower_bound) & (data_for_training[col] <= upper_bound)]
    print(f"Liczba wierszy do treningu (po czyszczeniu): {len(data_for_training)}")
else:
    data_for_training = pd.DataFrame()

Liczba wierszy do treningu (przed czyszczeniem): 726257
Liczba wierszy do treningu (po czyszczeniu): 663111


In [4]:
# NOTE(review): this cell rebuilds `data_for_training` from `df_original` and so
# overrides the frame produced by the preceding cell. Here the price quartiles are
# computed before rows with a missing 'Area' are dropped.
if df_original.empty:
    data_for_training = pd.DataFrame()
else:
    # Type conversion: non-numeric entries become NaN
    numeric_cols = ['Area', 'NumberOfRooms', 'Floor', 'Floors']
    for col in numeric_cols:
        df_original[col] = pd.to_numeric(df_original[col], errors='coerce')

    has_target = df_original['AdjustedPrice'].notna()
    data_for_training = df_original[has_target].copy()
    print(f"Liczba wierszy do treningu (przed czyszczeniem): {len(data_for_training)}")

    # Price outliers: drop rows outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
    Q1_price, Q3_price = (
        data_for_training["AdjustedPrice"].quantile(q) for q in (0.25, 0.75)
    )
    IQR_price = Q3_price - Q1_price
    price_too_low = data_for_training["AdjustedPrice"] < (Q1_price - 1.5 * IQR_price)
    price_too_high = data_for_training["AdjustedPrice"] > (Q3_price + 1.5 * IQR_price)
    data_for_training = data_for_training[~(price_too_low | price_too_high)]

    # Area outliers: same rule, applied after rows without an area are removed
    data_for_training = data_for_training.dropna(subset=['Area'])
    Q1_area, Q3_area = (
        data_for_training["Area"].quantile(q) for q in (0.25, 0.75)
    )
    IQR_area = Q3_area - Q1_area
    area_too_low = data_for_training["Area"] < (Q1_area - 1.5 * IQR_area)
    area_too_high = data_for_training["Area"] > (Q3_area + 1.5 * IQR_area)
    data_for_training = data_for_training[~(area_too_low | area_too_high)]

    print(f"Liczba wierszy do treningu (po czyszczeniu): {len(data_for_training)}")

Liczba wierszy do treningu (przed czyszczeniem): 728403
Liczba wierszy do treningu (po czyszczeniu): 663116


In [5]:
if data_for_training.empty:
    print("BŁĄD: Zbiór 'data_for_training' jest pusty. Nie można uruchomić setup().")
else:
    print("--- Konfiguracja eksperymentu PyCaret ---")

    # Candidate feature lists, grouped by the role they get in setup()
    numeric_features = ['Area', 'NumberOfRooms', 'Floor', 'Floors']
    categorical_features = ['Predict_State', 'BuildingType', 'TypeOfMarket', 'Type', 'OfferFrom', 'OwnerType']
    date_features = ['BuiltYear']

    # Columns excluded from modelling (identifiers, free text, raw prices and
    # dates, portal metadata, location codes).
    ignore_features = [
        'SaleId', 'OriginalId', 'PortalId', 'Title', 'Description', 'BuildingCondition', 'Price',
        'DateAddedToDatabase',  # marked as the key fix by the original author
        'DateAdded', 'NewestDate', 'DateLastModification', 'DateLastRaises',
        'OfferPrice', 'RealPriceAfterRenovation', 'OriginalPrice', 'PricePerSquareMeter',
        'AvailableFrom', 'Link', 'Phone', 'MainImage', 'OtherImages', 'NumberOfDuplicates',
        'NumberOfRaises', 'NumberOfModifications', 'IsDuplicatePriceLower', 'IsDuplicatePrivateOwner',
        'Score', 'ScorePrecision', 'CommunityScore', 'NumberOfCommunityComments', 'NumberOfCommunityOpinions',
        'Archive', 'Location', 'VoivodeshipNumber', 'CountyNumber', 'CommunityNumber', 'KindNumber',
        'RegionNumber', 'SubRegionNumber', 'StreetNumber', 'EncryptedId', 'PredictedRenovation',
        'LocationPath'
    ]

    # Turn 'BuiltYear' into a datetime before handing the frame to setup():
    # parse as number, fill gaps with the median year, then read the integer
    # year with the '%Y' format (values that do not parse become NaT).
    if 'BuiltYear' in data_for_training.columns:
        built_year_numeric = pd.to_numeric(data_for_training['BuiltYear'], errors='coerce')
        median_year = built_year_numeric.median()
        data_for_training['BuiltYear'] = built_year_numeric.fillna(median_year)
        data_for_training.dropna(subset=['BuiltYear'], inplace=True)
        built_year_text = data_for_training['BuiltYear'].astype(int).astype(str)
        data_for_training['BuiltYear'] = pd.to_datetime(built_year_text, format='%Y', errors='coerce')

    # Keep only the names that actually exist in the frame
    _present_columns = set(data_for_training.columns)
    numeric_features_to_use = [name for name in numeric_features if name in _present_columns]
    categorical_features_to_use = [name for name in categorical_features if name in _present_columns]
    date_features_to_use = [name for name in date_features if name in _present_columns]
    ignore_features_to_use = [name for name in ignore_features if name in _present_columns]

    print(f"Cechy numeryczne: {numeric_features_to_use}")
    print(f"Cechy kategoryczne: {categorical_features_to_use}")
    print(f"Cechy daty: {date_features_to_use}")
    print(f"Ignorowane cechy: {ignore_features_to_use}")

    # Initialise the PyCaret regression experiment (fixed session id, z-score
    # normalisation, experiment logging switched off)
    reg_exp = setup(
        data=data_for_training,
        target='AdjustedPrice',
        numeric_features=numeric_features_to_use,
        categorical_features=categorical_features_to_use,
        date_features=date_features_to_use,
        ignore_features=ignore_features_to_use,
        normalize=True,
        normalize_method='zscore',
        session_id=123,
        log_experiment=False,
    )

--- Konfiguracja eksperymentu PyCaret ---
Cechy numeryczne: ['Area', 'NumberOfRooms', 'Floor', 'Floors']
Cechy kategoryczne: ['Predict_State', 'BuildingType', 'TypeOfMarket', 'Type', 'OfferFrom', 'OwnerType']
Cechy daty: ['BuiltYear']
Ignorowane cechy: ['SaleId', 'OriginalId', 'PortalId', 'Title', 'Description', 'BuildingCondition', 'Price', 'DateAddedToDatabase', 'DateAdded', 'NewestDate', 'DateLastModification', 'DateLastRaises', 'OfferPrice', 'RealPriceAfterRenovation', 'OriginalPrice', 'PricePerSquareMeter', 'AvailableFrom', 'Link', 'Phone', 'MainImage', 'OtherImages', 'NumberOfDuplicates', 'NumberOfRaises', 'NumberOfModifications', 'IsDuplicatePriceLower', 'IsDuplicatePrivateOwner', 'Score', 'ScorePrecision', 'CommunityScore', 'NumberOfCommunityComments', 'NumberOfCommunityOpinions', 'Archive', 'Location', 'VoivodeshipNumber', 'CountyNumber', 'CommunityNumber', 'KindNumber', 'RegionNumber', 'SubRegionNumber', 'StreetNumber', 'EncryptedId', 'PredictedRenovation', 'LocationPath']


Unnamed: 0,Description,Value
0,Session id,123
1,Target,AdjustedPrice
2,Target type,Regression
3,Original data shape,"(663116, 55)"
4,Transformed data shape,"(663116, 20)"
5,Transformed train set shape,"(464181, 20)"
6,Transformed test set shape,"(198935, 20)"
7,Ignore features,43
8,Numeric features,4
9,Date features,1


In [6]:
# --- Safe model comparison ---

# Continue only when the PyCaret experiment object from the setup cell exists.
if 'reg_exp' not in locals() or reg_exp is None:
    print("BŁĄD KRYTYCZNY: Eksperyment PyCaret (setup) nie został uruchomiony.")
    print("Prawdopodobnie zbiór 'data_for_training' był pusty po czyszczeniu.")
    print("Sprawdź wyniki z komórki nr 4 (Przygotowanie Danych do Treningu).")
    best_model = None
else:
    print("Rozpoczynam porównywanie wszystkich dostępnych modeli. To może potrwać...")

    # Compare the available estimators and keep the three with the highest R2
    best_models_list = compare_models(sort='R2', n_select=3)

    if not best_models_list:
        print("\nBŁĄD: compare_models nie zwróciło żadnych modeli. Sprawdź wyniki z setup().")
        # None lets the following cells detect that there is nothing to work with
        best_model = None
    else:
        # The first entry is the top-ranked model; it is used from here on
        best_model = best_models_list[0]

        print("\n--- Najlepsze znalezione modele ---")
        print(best_models_list)

        print("\n--- Wybrany najlepszy model do dalszej pracy ---")
        print(best_model)

Rozpoczynam porównywanie wszystkich dostępnych modeli. To może potrwać...


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,128278.404,31867570836.7484,178512.8302,0.5137,0.5023,39.0447,6.599
catboost,CatBoost Regressor,143323.4579,34870093983.7444,186733.6323,0.4678,0.5103,37.9251,6.172
xgboost,Extreme Gradient Boosting,143249.8266,34879573606.4,186759.2266,0.4677,0.5101,37.5188,0.848
lightgbm,Light Gradient Boosting Machine,146276.1713,36023385334.6778,189796.4028,0.4502,0.516,38.6735,0.834
dt,Decision Tree Regressor,151908.8309,51331665885.183,226561.7665,0.2166,0.6174,39.408,0.653
dummy,Dummy Regressor,205463.4422,65526467788.8,255980.0828,-0.0,0.6294,48.003,0.447



--- Najlepsze znalezione modele ---
[RandomForestRegressor(n_jobs=-1, random_state=123), <catboost.core.CatBoostRegressor object at 0x0000018CCF26EC50>, XGBRegressor(base_score=None, booster='gbtree', callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device='cpu', early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=None, grow_policy=None,
             importance_type=None, interaction_constraints=None,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=None,
             max_leaves=None, min_child_weight=None, missing=nan,
             monotone_constraints=None, multi_strategy=None, n_estimators=None,
             n_jobs=-1, num_parallel_tree=None, ...)]

--- Wybrany najlepszy model do dalszej pracy ---
RandomForestRegressor(n_jobs=-1, random_st

In [7]:
# --- STEP: TUNE_MODEL ---
# With a reasonable baseline model selected, try to improve it by hyperparameter search.

if 'best_model' not in locals() or best_model is None:
    print("BŁĄD: Zmienna 'best_model' nie została znaleziona lub jest pusta.")
else:
    print(f"Rozpoczynam tuning najlepszego modelu: {type(best_model).__name__}")

    # Search 20 hyperparameter candidates, optimising R2.
    tuned_best_model = tune_model(best_model, optimize='R2', n_iter=20)

    print("\n--- Wyniki po tuningu ---")
    # pull() fetches the most recent score grid, i.e. the tuned model's per-fold
    # metrics; it is not a before/after comparison.
    tuned_results = pull()
    display(tuned_results)

    # IMPORTANT: from here on the model returned by tune_model is the one that
    # gets finalised. The recorded run shows tune_model may hand back the
    # original model when tuning did not improve it.
    best_model = tuned_best_model

Rozpoczynam tuning najlepszego modelu: RandomForestRegressor


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,150289.385,37666519910.5089,194078.6436,0.4282,0.5303,47.7699
1,148873.2469,37155117779.1799,192756.6284,0.4272,0.5133,30.3144
2,149268.0704,37197414530.7416,192866.3126,0.4256,0.5323,66.2516
3,150270.3104,37968094745.009,194854.0345,0.4193,0.5312,47.0611
4,150723.0223,38135705213.346,195283.6532,0.4217,0.5297,30.1965
5,150010.064,37723365758.7276,194225.039,0.4265,0.5271,53.1688
6,150756.4342,38128478286.2302,195265.1487,0.4207,0.517,16.355
7,148738.0591,37040157447.6811,192458.1966,0.4359,0.5154,34.8114
8,149411.5787,37615154536.4945,193946.2671,0.4294,0.5199,40.5769
9,149219.0842,37328612849.2911,193206.1408,0.4278,0.518,27.5274


Fitting 10 folds for each of 20 candidates, totalling 200 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).

--- Wyniki po tuningu ---


Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,150289.385,37666520000.0,194078.6436,0.4282,0.5303,47.7699
1,148873.2469,37155120000.0,192756.6284,0.4272,0.5133,30.3144
2,149268.0704,37197410000.0,192866.3126,0.4256,0.5323,66.2516
3,150270.3104,37968090000.0,194854.0345,0.4193,0.5312,47.0611
4,150723.0223,38135710000.0,195283.6532,0.4217,0.5297,30.1965
5,150010.064,37723370000.0,194225.039,0.4265,0.5271,53.1688
6,150756.4342,38128480000.0,195265.1487,0.4207,0.517,16.355
7,148738.0591,37040160000.0,192458.1966,0.4359,0.5154,34.8114
8,149411.5787,37615150000.0,193946.2671,0.4294,0.5199,40.5769
9,149219.0842,37328610000.0,193206.1408,0.4278,0.518,27.5274


In [8]:
# --- Step 1: Finalise the best model ---
if 'best_model' in locals() and best_model is not None:
    print("Finalizowanie najlepszego modelu...")
    final_model = finalize_model(best_model)
    print("Sfinalizowany model:")
    print(final_model)

    # --- Step 2: Save the model to disk ---
    # save_model() appends '.pkl' to the name it is given, so it must receive the
    # bare name; passing 'final_price_model.pkl' produced 'final_price_model.pkl.pkl'.
    model_filename = 'final_price_model.pkl'
    save_model(final_model, os.path.splitext(model_filename)[0])
    print(f"\nModel został zapisany jako '{model_filename}'")

    # --- Step 3: Predict on the whole dataset and store the results ---
    print(f"\nWykonywanie predykcji na całym oryginalnym zbiorze danych (kształt: {df_original.shape})...")

    # Prepare the prediction frame with the same type conversions that were
    # applied to the training frame.
    data_for_prediction = df_original.copy()

    # Numeric columns: non-numeric entries become NaN
    numeric_features = ['Area', 'NumberOfRooms', 'Floor', 'Floors']
    for col in numeric_features:
        if col in data_for_prediction.columns:
            data_for_prediction[col] = pd.to_numeric(data_for_prediction[col], errors='coerce')

    # 'BuiltYear': fill gaps with the median year, then convert to datetime
    if 'BuiltYear' in data_for_prediction.columns:
        built_year_numeric = pd.to_numeric(data_for_prediction['BuiltYear'], errors='coerce')
        # NOTE(review): this median is taken over the full dataset, while the
        # training frame was imputed with its own median; the two values may
        # differ. Confirm whether the training median should be reused.
        median_year = built_year_numeric.median()
        data_for_prediction['BuiltYear'] = built_year_numeric.fillna(median_year)
        data_for_prediction = data_for_prediction.dropna(subset=['BuiltYear'])
        built_year_text = data_for_prediction['BuiltYear'].astype(int).astype(str)
        data_for_prediction['BuiltYear'] = pd.to_datetime(built_year_text, format='%Y', errors='coerce')

    # Remove the target and the raw price before predicting
    data_for_prediction = data_for_prediction.drop(columns=['AdjustedPrice', 'Price'], errors='ignore')

    # NOTE(review): the finalised model is applied to the full dataset, which
    # includes the rows used for training, so these predictions are presumably
    # not an out-of-sample estimate of accuracy.
    all_data_predictions = predict_model(final_model, data=data_for_prediction)
    print("Predykcja zakończona.")

    # Build the output: original data plus the prediction, joined on the index.
    # The new column goes right after 'AdjustedPrice' when that column exists,
    # otherwise at the end.
    df_final_output = df_original.copy()
    if 'AdjustedPrice' in df_final_output.columns:
        insert_position = df_final_output.columns.get_loc('AdjustedPrice') + 1
    else:
        insert_position = len(df_final_output.columns)
    df_final_output.insert(insert_position, 'PredictedPrice', all_data_predictions['prediction_label'])

    output_filename = 'sale_2024_0_predict.csv'
    df_final_output.to_csv(output_filename, index=False, sep=';')

    print(f"\nWyniki zostały zapisane do pliku: {output_filename}")
    print("\nPrzykładowe dane z finalnego pliku:")
    # Preview only the columns that exist, so a missing column cannot raise
    # after the results were already written.
    preview_columns = [
        c for c in ['SaleId', 'Price', 'AdjustedPrice', 'PredictedPrice']
        if c in df_final_output.columns
    ]
    display(df_final_output[preview_columns].head(10))

else:
    print("BŁĄD: Zmienna 'best_model' nie została znaleziona. Uruchom najpierw komórkę z 'compare_models'.")

Finalizowanie najlepszego modelu...
Sfinalizowany model:
Pipeline(memory=Memory(location=None),
         steps=[('date_feature_extractor',
                 TransformerWrapper(include=['BuiltYear'],
                                    transformer=ExtractDateTimeFeatures())),
                ('numerical_imputer',
                 TransformerWrapper(include=['Area', 'NumberOfRooms', 'Floor',
                                             'Floors'],
                                    transformer=SimpleImputer())),
                ('categorical_imputer',
                 TransformerWrapper(include=['Predict_State', 'BuildingTy...
                                    transformer=OneHotEncoder(cols=['Predict_State',
                                                                    'BuildingType'],
                                                              handle_missing='return_nan',
                                                              use_cat_names=True))),
                ('rest

Predykcja zakończona.

Wyniki zostały zapisane do pliku: sale_2024_0_predict.csv

Przykładowe dane z finalnego pliku:


Unnamed: 0,SaleId,Price,AdjustedPrice,PredictedPrice
0,88,766500.0,825596.0,845960.665167
1,99,540000.0,579384.0,825377.88
2,115,540000.0,578920.0,556801.196333
3,140,544000.0,566167.0,598301.1975
4,145,459000.0,490573.0,614920.443889
5,159,779000.0,833920.0,806558.288
6,165,359000.0,385544.0,420247.77
7,173,380000.0,406247.0,422378.927667
8,189,354000.0,378755.0,396921.045
9,208,820000.0,879453.0,811618.99
