# PROJEKT MODELU KLASYFIKACYJNEGO PRZEWIDYWANIA STANU MIESZKAŃ NA PODSTAWIE #

## Extra Trees Classifier ##

### kwiecień 2025

In [58]:
import pandas as pd
import os
from pycaret.classification import setup, create_model, tune_model, predict_model, evaluate_model, pull
from pycaret.classification import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import resample
from sklearn.metrics import classification_report, confusion_matrix, f1_score, recall_score
from imblearn.over_sampling import SMOTE
import pymysql
from sqlalchemy import create_engine
import numpy as np
from scipy.stats import skewnorm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
import sqlite3

# Blok z metodą OvR uruchamiać z tego miejsca

## Uwaga nie zapomnieć o imporcie z pierwszej linijki

In [27]:
df=pd.read_csv("sale_2024_0.csv")

In [28]:
df_2 = df[~((df['Link'].str.contains('otodom', case=False, na=False)) & 
                              (df['BuildingCondition'] == 'AFTER_RENOVATION'))]

In [29]:
Q1 = df_2["Price"].quantile(0.25)
Q3 = df_2["Price"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_prep_p = df_2[~((df_2["Price"] < lower_bound) | (df_2["Price"] > upper_bound))]

In [30]:
Q1 = df_prep_p["PricePerSquareMeter"].quantile(0.25)
Q3 = df_prep_p["PricePerSquareMeter"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_prep_a = df_prep_p[~((df_prep_p["PricePerSquareMeter"] < lower_bound) | (df_prep_p["PricePerSquareMeter"] > upper_bound))]

In [31]:
Q1 = df_prep_a["Area"].quantile(0.25)
Q3 = df_prep_a["Area"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_prepared2 = df_prep_a[~((df_prep_a["Area"] < lower_bound) | (df_prep_a["Area"] > upper_bound))]

In [32]:
df_prepared3=df_prepared2.dropna(subset=['BuildingCondition'])

In [33]:
df_prepared3.loc[df_prepared3['TypeOfMarket'] == 'pierwotny', 'BuildingCondition'] = 'DEVELOPER_STATE'

In [34]:
df_prepared3['BuiltYear'] = pd.to_datetime(df_prepared3['BuiltYear'], format='%Y', errors='coerce')

In [35]:
df_prepared3.loc[df_prepared3['BuiltYear'].dt.year >= 2025, 'BuildingCondition'] = 'DEVELOPER_STATE'

In [36]:
df_prepared3['BuiltYear'] = pd.to_datetime(df_prepared3['BuiltYear'], format='%Y', errors='coerce')

In [37]:
unique_ctype=df_prepared3['BuildingCondition'].unique()

unique_ctype

array(['DEVELOPER_STATE', 'AFTER_RENOVATION', 'GOOD', 'FOR_RENOVATION'],
      dtype=object)

In [38]:
df_prepared3["BuildingCondition"].value_counts(normalize=True)

BuildingCondition
DEVELOPER_STATE     0.606077
AFTER_RENOVATION    0.183006
FOR_RENOVATION      0.108469
GOOD                0.102449
Name: proportion, dtype: float64

In [39]:
# DODANIE ZBALANSOWANIA
# Sprawdź ile rekordów ma każda klasa
class_counts = df_prepared3['BuildingCondition'].value_counts()
min_count = class_counts.min()

# Tworzymy zbalansowane próbki dla każdej klasy
dfs = []
for condition in class_counts.index:
    df_condition = df_prepared3[df_prepared3['BuildingCondition'] == condition]
    df_condition_downsampled = resample(df_condition, 
                                        replace=False, 
                                        n_samples=min_count, 
                                        random_state=42)
    dfs.append(df_condition_downsampled)

# Łączymy wszystkie klasy w jeden zbalansowany DataFrame
df_balanced = pd.concat(dfs).reset_index(drop=True)

In [40]:
df_balanced["BuildingCondition"].value_counts(normalize=True)

BuildingCondition
DEVELOPER_STATE     0.25
AFTER_RENOVATION    0.25
FOR_RENOVATION      0.25
GOOD                0.25
Name: proportion, dtype: float64

In [41]:
df_balanced

Unnamed: 0,SaleId,OriginalId,PortalId,Title,Description,Area,Price,OfferPrice,RealPriceAfterRenovation,OriginalPrice,...,Archive,Location,VoivodeshipNumber,CountyNumber,CommunityNumber,KindNumber,RegionNumber,SubRegionNumber,StreetNumber,EncryptedId
0,4266367,,3,Nowoczesne mieszkania w sercu Łodzi Manufaktura,Kup mieszkanie bezpośrednio od Dewelopera! Pra...,52.00,459000.0,553361.0,575637.67,,...,,"Łódzkie, Łódź, Łódź-śródmieście, Śródmieście",10.0,61.0,5.0,9.0,958447.0,,,dKKXZ1uFQJTb8QJJOLnifO8e0oyGIwz8-GAUcMCH2_k=
1,4390394,,3,3-pokojowe mieszkanie 60m2 + loggia,3-pokojowe mieszkanie numer 8-A008 na 2. piętr...,60.29,,,,,...,,"Mazowieckie, Warszawa, Ursus, Gołąbki, Ul. Kaz...",14.0,65.0,12.0,8.0,918666.0,,32907.0,oydtk6bW4a5uEscrvIRGJ2DSFhxJitH2UJuyXKHVWIY=
2,4378127,,3,1 pokojowe (możliwość wydzielenia 2) |Czyżyny,Nowe mieszkanie z balkonem - Kraków Czyżyny Na...,32.62,515409.0,490700.0,707000.00,,...,,"Małopolskie, Kraków, Kraków-nowa Huta, Łęg, Ul...",12.0,61.0,3.0,9.0,950813.0,,60055.0,xGLchc43KA01zKcK3z0v4oaCdEfvBnZExzKfPS9GXW0=
3,4346014,,3,‼️Odbierz dziś duży Rabat -> 2 pok + Garaż‼️,"Nowe mieszkania, od dewelopera, bez podatku PC...",40.24,431156.0,428409.0,431459.00,428020.0,...,,"Pomorskie, Gdynia, Wiczlino",22.0,62.0,1.0,1.0,934429.0,,,5sT4Ar2swa4Sj0jt-GLT-zY_qkdVfWyqsbDtbsFdaZc=
4,4407014,,3,Mieszkanie Staroniwska Ogród,Zapraszamy do odkrycia naszej najnowszej inwes...,54.66,577300.0,624750.0,702500.00,,...,,"Podkarpackie, Rzeszów, Staroniwa",18.0,63.0,1.0,1.0,974245.0,,,0K6mgKFRS_FZAHJ2o8bTsjY_qkdVfWyqsbDtbsFdaZc=
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23207,4365731,,17,"Apartament, ul. Saperska","LEGNICA, ULICA SAPERSKA 1. 2 pokoje, balkon, p...",41.25,395000.0,,,,...,,"Dolnośląskie, Legnica, Tarninów, Ul. Saperska",2.0,62.0,1.0,1.0,1067598.0,,19509.0,30wxJBw-tetEqvu_DkzJj0m9CVG6yYs9U2KBME0Psic=
23208,4402007,,17,"Mieszkanie, ul. Modzelewskiego","Fenomenalnie wykończone, po remoncie mieszkani...",57.00,945000.0,829000.0,916000.00,,...,,"Mazowieckie, Warszawa, Mokotów, Wierzbno, Ul. ...",14.0,65.0,5.0,8.0,918577.0,,13192.0,adVPXWEdw4iBsck2kiqyTSxEZaoqZA0WSjJKH4A5guQ=
23209,4402899,,17,"Mieszkanie, ul. Sępa-Szarzyńskiego",Oferta na wyłączność! Nie znajdziecie jej pańs...,67.00,703500.0,849999.0,1114000.00,,...,,"Dolnośląskie, Wrocław, Wrocław-śródmieście, Pl...",2.0,64.0,6.0,9.0,986998.0,,19733.0,tAmtgaq_mFTFs4A7O3FG0JgtwTGeq42piasdfG4vjEE=
23210,4408927,,32,Sprzedam mieszkanie Jasło Jasło,Sprzedam mieszkanie do remontu w centrum Jasła...,49.00,250.0,265000.0,359966.67,,...,,"Podkarpackie, Jasielski, Jasło, Ul. Adama Mick...",18.0,5.0,1.0,1.0,953059.0,,12740.0,WTInZhDcqXMSLo2DspXxioaCdEfvBnZExzKfPS9GXW0=


In [42]:
polish_stopwords = [
    'i', 'oraz', 'a', 'ale', 'czy', 'więc', 'lecz', 'że', 'to', 'z', 'na', 'do', 'po', 'przez',
    'dla', 'bez', 'od', 'pod', 'nad', 'u', 'o', 'w', 'jak', 'tak', 'nie', 'jest', 'są', 'być',
    'był', 'była', 'było', 'byli', 'się', 'też', 'ten', 'ta', 'to', 'ci', 'co', 'który', 'która',
    'które', 'którzy', 'kto', 'kogo', 'czego', 'dlaczego', 'dlatego', 'tam', 'tu', 'tutaj',
    'teraz', 'już', 'jeszcze', 'bardzo', 'może', 'muszę', 'musisz', 'można', 'trzeba', 'będzie',
    'będą', 'by', 'aby', 'gdy', 'gdyby', 'mimo', 'choć', 'chociaż', 'nawet', 'ani', 'żeby', 'czyli'
]

In [65]:
print(preds.columns)

Index(['SaleId', 'OriginalId', 'PortalId', 'Title', 'Area', 'Price',
       'OfferPrice', 'RealPriceAfterRenovation', 'OriginalPrice',
       'PricePerSquareMeter',
       ...
       'tfidf_993', 'tfidf_994', 'tfidf_995', 'tfidf_996', 'tfidf_997',
       'tfidf_998', 'tfidf_999', 'target', 'prediction_label',
       'prediction_score'],
      dtype='object', length=9316)


In [66]:
# === OvR: One-vs-Rest podejście do klasyfikacji stanu mieszkania + zapis ===

# 1. Przygotowanie danych bez nulli
df_clean = df_balanced.copy()

# --- Dodanie cech: cena i lokalizacja ---

assert 'Price' in df_clean.columns, "Brakuje kolumny 'Price'"
assert 'Location' in df_clean.columns, "Brakuje kolumny 'Location'"
# Przekształcenie kolumny 'Location' na wektory (one-hot encoding)
df_location_dummies = pd.get_dummies(df_clean['Location'], prefix='loc', drop_first=True)
# Skalowanie ceny (opcjonalne, ale zalecane)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_clean['Price_scaled'] = scaler.fit_transform(df_clean[['Price']])
# Dołączenie nowych cech
df_clean = pd.concat([df_clean.reset_index(drop=True), df_location_dummies.reset_index(drop=True)], axis=1)
# Usunięcie oryginalnych kolumn tekstowych (opcjonalnie)
df_clean = df_clean[df_clean['BuildingCondition'].notna()].reset_index(drop=True)

# 2. Przekształcenie opisu na wektory TF-IDF
vectorizer = TfidfVectorizer(
    max_features=1000,
    min_df=2,
    stop_words=polish_stopwords
)
X_text = vectorizer.fit_transform(df_clean['Description'].fillna('')).toarray()
df_tfidf = pd.DataFrame(X_text, columns=[f'tfidf_{i}' for i in range(X_text.shape[1])])
df_clean = pd.concat([df_clean.reset_index(drop=True), df_tfidf], axis=1)
df_clean.drop(columns=['Description'], inplace=True)

# 3. Lista klas do przewidywania
unique_stany = df_clean['BuildingCondition'].unique()
models = {}
results_list = []

for stan in unique_stany:
    df_temp = df_clean.copy()
    df_temp['target'] = df_temp['BuildingCondition'].apply(lambda x: 1 if x == stan else 0)
    print(f"🧪 Stan: {stan} — Target count:\n{df_temp['target'].value_counts()}")

    clf_setup = setup(
        data=df_temp.drop(columns=['BuildingCondition']),
        target='target',
        session_id=1122,
       # text_features=['Description'], 
       # encoding_method='one_hot',  
       # max_encoding_ohe=10,
        html=False,
        fix_imbalance=True,
        fix_imbalance_method='SMOTE',  # lub ADASYN/RUS/BorderlineSMOTE
        verbose=True  # ważne, pokaże rozkład po SMOTE
    )
    model = create_model('lightgbm')
    tuned_model = tune_model(model, n_iter=10, optimize='F1', search_library='scikit-learn', search_algorithm='random')
    models[stan] = tuned_model

    print(f"\n📊 Ewaluacja modelu dla klasy: {stan}")
    model_results = pull()
    model_results['target_class'] = stan

    # ⬇️ NOWE: predykcja + metryki ręczne
    preds = predict_model(tuned_model)
    y_true = preds['target']
    y_pred = preds['prediction_label']

    cm = confusion_matrix(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, pos_label=1)
    recall = recall_score(y_true, y_pred, pos_label=1)

    print(f"Confusion Matrix:\n{cm}")
    print(f"F1 Score (class=1): {f1:.3f}")
    print(f"Recall (class=1): {recall:.3f}")
    print(classification_report(y_true, y_pred, digits=3))

    model_results['F1_score_class_1'] = f1
    model_results['Recall_class_1'] = recall
    results_list.append(model_results)

# ✳️ WYDRUKUJ metryki tego modelu
    print(f"\n📊 Ewaluacja modelu dla klasy: {stan}")
    print(pull())  # wyciąga ostatnie metryki z tune_model()
    evaluate_model(tuned_model)


# 4. Predykcja na danych z brakującym 'BuildingCondition'
df_nulls = df[df['BuildingCondition'].isna()].copy()
X_null_text = vectorizer.transform(df_nulls['Description'].fillna('')).toarray()
df_null_vec = pd.DataFrame(X_null_text, columns=[f'tfidf_{i}' for i in range(X_null_text.shape[1])])
df_nulls = pd.concat([df_nulls.reset_index(drop=True), df_null_vec], axis=1)
df_clean.drop(columns=['Description', 'Location', 'Price'], inplace=True)

# 5. Przewidzenie prawdopodobieństw i przypisanie klasy o najwyższej pewności
df_nulls["BuiltYear"] = pd.to_datetime(df_nulls["BuiltYear"], errors="coerce")
probabilities = {}

for stan, model in models.items():
        preds = predict_model(model, data=df_nulls, raw_score=True)
        probabilities[stan] = preds['Score_1'] 

# 6. Wybranie klasy z najwyższym Score
probs_df = pd.DataFrame(probabilities)
df_nulls['predicted_standard'] = probs_df.idxmax(axis=1)
df.loc[df_nulls.index, 'BuildingCondition'] = df_nulls['predicted_standard']

print("✅ Uzupełniono brakujące wartości w kolumnie 'BuildingCondition' na podstawie modeli OvR.")

# 7. Zapis do pliku CSV
df.to_csv("uzupelnione_mieszkania.csv", index=False)
print("💾 Zapisano do: uzupelnione_mieszkania.csv")

# 8. Zapis do bazy danych SQLite
conn = sqlite3.connect("uzupelnione_mieszkania.db")
df.to_sql("mieszkania", conn, if_exists="replace", index=False)
conn.close()
print("💾 Zapisano do: uzupelnione_mieszkania.db (tabela: mieszkania)")

🧪 Stan: DEVELOPER_STATE — Target count:
target
0    17409
1     5803
Name: count, dtype: int64
                    Description             Value
0                    Session id              1122
1                        Target            target
2                   Target type            Binary
3           Original data shape     (23212, 9314)
4        Transformed data shape     (31336, 9312)
5   Transformed train set shape     (24372, 9312)
6    Transformed test set shape      (6964, 9312)
7              Numeric features              1032
8                 Date features                 1
9          Categorical features                17
10     Rows with missing values            100.0%
11                   Preprocess              True
12              Imputation type            simple
13           Numeric imputation              mean
14       Categorical imputation              mode
15     Maximum one-hot encoding                25
16              Encoding method              None
17   

                                                                                                                       

      Accuracy     AUC  Recall  Prec.   F1  Kappa  MCC
Fold                                                  
0       0.7495  0.5735     0.0    0.0  0.0    0.0  0.0
1       0.7495  0.5472     0.0    0.0  0.0    0.0  0.0
2       0.7502  0.5947     0.0    0.0  0.0    0.0  0.0
3       0.7502  0.5907     0.0    0.0  0.0    0.0  0.0
4       0.0000  0.0000     0.0    0.0  0.0    0.0  0.0
5       0.7502  0.5941     0.0    0.0  0.0    0.0  0.0
6       0.7502  0.6369     0.0    0.0  0.0    0.0  0.0
7       0.7502  0.6230     0.0    0.0  0.0    0.0  0.0
8       0.7500  0.6165     0.0    0.0  0.0    0.0  0.0
9       0.7500  0.5626     0.0    0.0  0.0    0.0  0.0
Mean    0.6750  0.5339     0.0    0.0  0.0    0.0  0.0
Std     0.2250  0.1799     0.0    0.0  0.0    0.0  0.0


Processing:  14%|██████████▎                                                             | 1/7 [00:10<01:03, 10.64s/it]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                                                                                       

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
      Accuracy     AUC  Recall  Prec.   F1  Kappa  MCC
Fold                                                  
0       0.7495  0.9683     0.0    0.0  0.0    0.0  0.0
1       0.7495  0.6703     0.0    0.0  0.0    0.0  0.0
2       0.7502  0.5253     0.0    0.0  0.0    0.0  0.0
3       0.7502  0.5172     0.0    0.0  0.0    0.0  0.0
4       0.0000  0.0000     0.0    0.0  0.0    0.0  0.0
5       0.7502  0.5024     0.0    0.0  0.0    0.0  0.0
6       0.7502  0.6108     0.0    0.0  0.0    0.0  0.0
7       0.7502  0.5160     0.0    0.0  0.0    0.0  0.0
8       0.7500  0.9726     0.0    0.0  0.0    0.0  0.0
9       0.7500  0.7631     0.0    0.0  0.0    0.0  0.0
Mean    0.6750  0.6046     0.0    0.0  0.0    0.0  0.0
Std     0.2250  0.2636     0.0    0.0  0.0    0.0  0.0

📊 Ewaluacja modelu dla klasy: DEVELOPER_STATE
                             Model

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

🧪 Stan: AFTER_RENOVATION — Target count:
target
0    17409
1     5803
Name: count, dtype: int64
                    Description             Value
0                    Session id              1122
1                        Target            target
2                   Target type            Binary
3           Original data shape     (23212, 9314)
4        Transformed data shape     (31336, 9312)
5   Transformed train set shape     (24372, 9312)
6    Transformed test set shape      (6964, 9312)
7              Numeric features              1032
8                 Date features                 1
9          Categorical features                17
10     Rows with missing values            100.0%
11                   Preprocess              True
12              Imputation type            simple
13           Numeric imputation              mean
14       Categorical imputation              mode
15     Maximum one-hot encoding                25
16              Encoding method              None
17  

                                                                                                                       

      Accuracy     AUC  Recall  Prec.   F1  Kappa  MCC
Fold                                                  
0       0.7495  0.5311     0.0    0.0  0.0    0.0  0.0
1       0.7495  0.5175     0.0    0.0  0.0    0.0  0.0
2       0.7502  0.5134     0.0    0.0  0.0    0.0  0.0
3       0.7502  0.5344     0.0    0.0  0.0    0.0  0.0
4       0.7502  0.5078     0.0    0.0  0.0    0.0  0.0
5       0.7502  0.5064     0.0    0.0  0.0    0.0  0.0
6       0.7502  0.5422     0.0    0.0  0.0    0.0  0.0
7       0.7502  0.5062     0.0    0.0  0.0    0.0  0.0
8       0.7500  0.5045     0.0    0.0  0.0    0.0  0.0
9       0.7500  0.5018     0.0    0.0  0.0    0.0  0.0
Mean    0.7500  0.5165     0.0    0.0  0.0    0.0  0.0
Std     0.0002  0.0136     0.0    0.0  0.0    0.0  0.0


Processing:  14%|██████████▎                                                             | 1/7 [00:10<01:01, 10.25s/it]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                                                                                       

      Accuracy     AUC  Recall  Prec.      F1   Kappa     MCC
Fold                                                         
0       0.7495  0.6990  0.0000    0.0  0.0000  0.0000  0.0000
1       0.7495  0.5162  0.0000    0.0  0.0000  0.0000  0.0000
2       0.7508  0.7973  0.0025    1.0  0.0049  0.0037  0.0430
3       0.7502  0.6759  0.0000    0.0  0.0000  0.0000  0.0000
4       0.7502  0.5168  0.0000    0.0  0.0000  0.0000  0.0000
5       0.7508  0.8018  0.0025    1.0  0.0049  0.0037  0.0430
6       0.7502  0.5115  0.0000    0.0  0.0000  0.0000  0.0000
7       0.7502  0.5034  0.0000    0.0  0.0000  0.0000  0.0000
8       0.7500  0.5343  0.0000    0.0  0.0000  0.0000  0.0000
9       0.7500  0.5449  0.0000    0.0  0.0000  0.0000  0.0000
Mean    0.7501  0.6101  0.0005    0.2  0.0010  0.0007  0.0086
Std     0.0004  0.1152  0.0010    0.4  0.0020  0.0015  0.0172

📊 Ewaluacja modelu dla klasy: AFTER_RENOVATION
                             Model  Accuracy     AUC  Recall  Prec.   F1  \
0  Light

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

🧪 Stan: FOR_RENOVATION — Target count:
target
0    17409
1     5803
Name: count, dtype: int64
                    Description             Value
0                    Session id              1122
1                        Target            target
2                   Target type            Binary
3           Original data shape     (23212, 9314)
4        Transformed data shape     (31336, 9312)
5   Transformed train set shape     (24372, 9312)
6    Transformed test set shape      (6964, 9312)
7              Numeric features              1032
8                 Date features                 1
9          Categorical features                17
10     Rows with missing values            100.0%
11                   Preprocess              True
12              Imputation type            simple
13           Numeric imputation              mean
14       Categorical imputation              mode
15     Maximum one-hot encoding                25
16              Encoding method              None
17    

                                                                                                                       

      Accuracy     AUC  Recall  Prec.   F1  Kappa  MCC
Fold                                                  
0       0.7495  0.6149     0.0    0.0  0.0    0.0  0.0
1       0.7495  0.5859     0.0    0.0  0.0    0.0  0.0
2       0.7502  0.5109     0.0    0.0  0.0    0.0  0.0
3       0.7502  0.5328     0.0    0.0  0.0    0.0  0.0
4       0.7502  0.5815     0.0    0.0  0.0    0.0  0.0
5       0.7502  0.5256     0.0    0.0  0.0    0.0  0.0
6       0.7502  0.5270     0.0    0.0  0.0    0.0  0.0
7       0.7502  0.5670     0.0    0.0  0.0    0.0  0.0
8       0.7500  0.5782     0.0    0.0  0.0    0.0  0.0
9       0.7500  0.6517     0.0    0.0  0.0    0.0  0.0
Mean    0.7500  0.5676     0.0    0.0  0.0    0.0  0.0
Std     0.0002  0.0422     0.0    0.0  0.0    0.0  0.0


Processing:  14%|██████████▎                                                             | 1/7 [00:10<01:01, 10.18s/it]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                                                                                       

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
      Accuracy     AUC  Recall  Prec.   F1  Kappa  MCC
Fold                                                  
0       0.7495  0.5384     0.0    0.0  0.0    0.0  0.0
1       0.7495  0.5640     0.0    0.0  0.0    0.0  0.0
2       0.7502  0.5447     0.0    0.0  0.0    0.0  0.0
3       0.7502  0.7741     0.0    0.0  0.0    0.0  0.0
4       0.7502  0.5003     0.0    0.0  0.0    0.0  0.0
5       0.7502  0.5247     0.0    0.0  0.0    0.0  0.0
6       0.7502  0.5173     0.0    0.0  0.0    0.0  0.0
7       0.7502  0.4958     0.0    0.0  0.0    0.0  0.0
8       0.7500  0.5029     0.0    0.0  0.0    0.0  0.0
9       0.7500  0.5288     0.0    0.0  0.0    0.0  0.0
Mean    0.7500  0.5491     0.0    0.0  0.0    0.0  0.0
Std     0.0002  0.0777     0.0    0.0  0.0    0.0  0.0

📊 Ewaluacja modelu dla klasy: FOR_RENOVATION
                             Model 

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

🧪 Stan: GOOD — Target count:
target
0    17409
1     5803
Name: count, dtype: int64
                    Description             Value
0                    Session id              1122
1                        Target            target
2                   Target type            Binary
3           Original data shape     (23212, 9314)
4        Transformed data shape     (31336, 9312)
5   Transformed train set shape     (24372, 9312)
6    Transformed test set shape      (6964, 9312)
7              Numeric features              1032
8                 Date features                 1
9          Categorical features                17
10     Rows with missing values            100.0%
11                   Preprocess              True
12              Imputation type            simple
13           Numeric imputation              mean
14       Categorical imputation              mode
15     Maximum one-hot encoding                25
16              Encoding method              None
17              

                                                                                                                       

      Accuracy     AUC  Recall  Prec.   F1  Kappa  MCC
Fold                                                  
0       0.7495  0.5645     0.0    0.0  0.0    0.0  0.0
1       0.7495  0.5345     0.0    0.0  0.0    0.0  0.0
2       0.7502  0.5186     0.0    0.0  0.0    0.0  0.0
3       0.7502  0.5193     0.0    0.0  0.0    0.0  0.0
4       0.7502  0.5774     0.0    0.0  0.0    0.0  0.0
5       0.7502  0.5313     0.0    0.0  0.0    0.0  0.0
6       0.7502  0.5440     0.0    0.0  0.0    0.0  0.0
7       0.7502  0.5008     0.0    0.0  0.0    0.0  0.0
8       0.7500  0.5423     0.0    0.0  0.0    0.0  0.0
9       0.7500  0.5590     0.0    0.0  0.0    0.0  0.0
Mean    0.7500  0.5392     0.0    0.0  0.0    0.0  0.0
Std     0.0002  0.0221     0.0    0.0  0.0    0.0  0.0


Processing:  14%|██████████▎                                                             | 1/7 [00:11<01:08, 11.47s/it]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                                                                                       

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
      Accuracy     AUC  Recall  Prec.   F1  Kappa  MCC
Fold                                                  
0       0.7495  0.5127     0.0    0.0  0.0    0.0  0.0
1       0.7495  0.5085     0.0    0.0  0.0    0.0  0.0
2       0.7502  0.6199     0.0    0.0  0.0    0.0  0.0
3       0.7502  0.5094     0.0    0.0  0.0    0.0  0.0
4       0.7502  0.5097     0.0    0.0  0.0    0.0  0.0
5       0.7502  0.6254     0.0    0.0  0.0    0.0  0.0
6       0.7502  0.5149     0.0    0.0  0.0    0.0  0.0
7       0.7502  0.5068     0.0    0.0  0.0    0.0  0.0
8       0.7500  0.5037     0.0    0.0  0.0    0.0  0.0
9       0.7500  0.6032     0.0    0.0  0.0    0.0  0.0
Mean    0.7500  0.5414     0.0    0.0  0.0    0.0  0.0
Std     0.0002  0.0493     0.0    0.0  0.0    0.0  0.0

📊 Ewaluacja modelu dla klasy: GOOD
                             Model  Accuracy 

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

KeyError: "['Description'] not found in axis"

In [None]:
results_df = pd.concat(results_list)
results_df.to_csv("ovr_results.csv", index=False)
print("💾 Zapisano metryki OvR do pliku: ovr_results.csv")

In [None]:
metryki=pd.read_csv("ovr_results.csv")

In [None]:
metryki

In [None]:
df2=df.sample(10)
df2.to_csv("uzup_mieszkania_sample.csv", index=False)

In [None]:
df["BuildingCondition"].value_counts(normalize=True)

In [None]:
# Fatalny balans, mimo iż baza wstępna została zbalansowana, tutaj wyniki są trudne do zaakceptowania
# wartoć stanu Developer_State to aż 97% wszystkich predykcji. Aż trudno w to uwierzyć. Do sprawdzenia szczegółowego.