# PROJEKT MODELU KLASYFIKACYJNEGO PRZEWIDYWANIA STANU MIESZKAŃ NA PODSTAWIE #

## Extra Trees Classifier ##

### kwiecień 2025

In [1]:
import pandas as pd
import os
from pycaret.classification import setup, create_model, tune_model, predict_model, evaluate_model, pull
from pycaret.classification import *
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import resample
from sklearn.metrics import classification_report, confusion_matrix, f1_score, recall_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import pymysql
from sqlalchemy import create_engine
import numpy as np
from scipy.stats import skewnorm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
import sqlite3

# Blok z metodą OvR uruchamiać z tego miejsca

## Uwaga nie zapomnieć o imporcie z pierwszej linijki

In [3]:
df=pd.read_csv("sale_2024_14.csv")

In [4]:
df_2 = df[~((df['Link'].str.contains('otodom', case=False, na=False)) & 
                              (df['BuildingCondition'] == 'AFTER_RENOVATION'))]

In [5]:
Q1 = df_2["Price"].quantile(0.25)
Q3 = df_2["Price"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_prep_p = df_2[~((df_2["Price"] < lower_bound) | (df_2["Price"] > upper_bound))]

In [6]:
Q1 = df_prep_p["PricePerSquareMeter"].quantile(0.25)
Q3 = df_prep_p["PricePerSquareMeter"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_prep_a = df_prep_p[~((df_prep_p["PricePerSquareMeter"] < lower_bound) | (df_prep_p["PricePerSquareMeter"] > upper_bound))]

In [7]:
Q1 = df_prep_a["Area"].quantile(0.25)
Q3 = df_prep_a["Area"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_prepared2 = df_prep_a[~((df_prep_a["Area"] < lower_bound) | (df_prep_a["Area"] > upper_bound))]

In [8]:
df_prepared3=df_prepared2.dropna(subset=['BuildingCondition'])

In [9]:
df_prepared3.loc[df_prepared3['TypeOfMarket'] == 'pierwotny', 'BuildingCondition'] = 'DEVELOPER_STATE'

In [10]:
df_prepared3['BuiltYear'] = pd.to_datetime(df_prepared3['BuiltYear'], format='%Y', errors='coerce')

In [11]:
df_prepared3.loc[df_prepared3['BuiltYear'].dt.year >= 2025, 'BuildingCondition'] = 'DEVELOPER_STATE'

In [12]:
df_prepared3['BuiltYear'] = pd.to_datetime(df_prepared3['BuiltYear'], format='%Y', errors='coerce')

In [13]:
unique_ctype=df_prepared3['BuildingCondition'].unique()

unique_ctype

array(['DEVELOPER_STATE', 'AFTER_RENOVATION', 'GOOD', 'FOR_RENOVATION'],
      dtype=object)

In [14]:
df_prepared3["BuildingCondition"].value_counts(normalize=True)

BuildingCondition
DEVELOPER_STATE     0.558076
AFTER_RENOVATION    0.209840
FOR_RENOVATION      0.129301
GOOD                0.102783
Name: proportion, dtype: float64

In [15]:
# DODANIE ZBALANSOWANIA
# Sprawdź ile rekordów ma każda klasa
class_counts = df_prepared3['BuildingCondition'].value_counts()
min_count = class_counts.min()

# Tworzymy zbalansowane próbki dla każdej klasy
dfs = []
for condition in class_counts.index:
    df_condition = df_prepared3[df_prepared3['BuildingCondition'] == condition]
    df_condition_downsampled = resample(df_condition, 
                                        replace=False, 
                                        n_samples=min_count, 
                                        random_state=42)
    dfs.append(df_condition_downsampled)

# Łączymy wszystkie klasy w jeden zbalansowany DataFrame
df_balanced = pd.concat(dfs).reset_index(drop=True)

In [16]:
df_balanced["BuildingCondition"].value_counts(normalize=True)

BuildingCondition
DEVELOPER_STATE     0.25
AFTER_RENOVATION    0.25
FOR_RENOVATION      0.25
GOOD                0.25
Name: proportion, dtype: float64

In [17]:
df_balanced

Unnamed: 0,SaleId,OriginalId,PortalId,Title,Description,Area,Price,OfferPrice,RealPriceAfterRenovation,OriginalPrice,...,Archive,Location,VoivodeshipNumber,CountyNumber,CommunityNumber,KindNumber,RegionNumber,SubRegionNumber,StreetNumber,EncryptedId
0,4274896,,3,Na sprzedaż mieszkanie 3 pokojowe,Na sprzedaż mieszkanie 3 pokojowe o powierzchn...,92.00,1122000.0,,,,...,,"Mazowieckie, Pruszkowski, Piastów",14,21,1.0,1.0,921496.0,,,40_16Iiq8kxkkzEq1TPgzno5Kb4ifgjNFRUWvwErVL0=
1,4374481,,10,Mieszkanie na sprzedaż - 39 m² - 2 pokoje Wars...,NOVA RADIOWA to elegancki i nowoczesny komplek...,39.76,,,,,...,,"Mazowieckie, Warszawa, Bemowo, Ul. Radiowa",14,65,2.0,8.0,988780.0,,18295.0,MAGyXtqn_Dv0mfMJoSrr3222_PRktl6F6g00N3Ph_e0=
2,4268951,,10,Nowa inwestycja deweloperska od 6900zł/m2,"Z przyjemnością informujemy, że właśnie ruszył...",52.04,359076.0,462000.0,554012.33,,...,,"Mazowieckie, Radom, Ul. Warsztatowa",14,63,1.0,1.0,972750.0,,23695.0,ycDhm8yLLseFlvz6yitY1NPMqpHF6X1wDc4eEYzDo2s=
3,4372186,,3,"Bez prowizji, Okazja Targówek Przy metrze 2-pok","Bez prowizji, bez podatku PCC 2% ATRAKCYJNA CE...",37.88,559000.0,597280.0,740000.00,,...,,"Mazowieckie, Warszawa, Targówek, Bródno",14,65,11.0,8.0,1065020.0,,,vP5sTp7fUzF3jaWcdgjNa7C9zgjsgeFCftN87d07Wfg=
4,4239305,,3,2 pokoje. Oddanie już w tym roku,Nowa inwestycja na Wawrze - atrakcyjne ceny i ...,49.22,703846.0,695000.0,1046326.67,,...,,"Mazowieckie, Warszawa, Wawer, Zerzeń",14,65,14.0,8.0,919281.0,,,ELeEj3EwnQQRYr9TG8awNhXVZdKE7iJxxFPu0Bq0ikc=
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3747,4259767,,18,Mieszkanie na Ursynowie do własnej aranżacji,NA SPRZEDAŻ! Mieszkanie jednopokojowe z możliw...,41.34,660000.0,860000.0,915266.67,,...,,"Mazowieckie, Warszawa, Ursynów, Natolin, Ul. B...",14,65,13.0,8.0,918442.0,,980.0,WsCU_CEzsMFt83dzwl50Cu8e0oyGIwz8-GAUcMCH2_k=
3748,4291171,,17,"Mieszkanie, ul. Skarbka z Gór",Zapraszamy na prezentację mieszkania dwupokojo...,57.10,600000.0,640000.0,725000.00,,...,,"Mazowieckie, Warszawa, Białołęka, Grodzisk, Ul...",14,65,3.0,8.0,919447.0,,28178.0,tcA4oDRPl-7BLXnrqHO97HAQGFSZzeA3-gwVhOeGLN0=
3749,4401058,,17,"Mieszkanie, ul. Gwiaździsta",2-pokojowe mieszkanie zlokalizowane na Żolibor...,45.80,830000.0,750000.0,1000000.00,,...,,"Mazowieckie, Warszawa, Żoliborz, Ul. Gwiaździsta",14,65,19.0,8.0,920048.0,,6422.0,tMQEEFXPR5xqy0a9DvnaSyFhR9uKFSRu990qiyDp3Xk=
3750,4325307,,4,2-pokojowe mieszkanie na sprzedaż,Mieszkanie na Starym Mieście - Idealne na inwe...,32.00,187000.0,,,,...,,"Mazowieckie, Płock, Stare Miasto, Ul. Ostatnia",14,62,1.0,1.0,1067684.0,,15360.0,-tdZznEgemH0fneW1Eyi1SxEZaoqZA0WSjJKH4A5guQ=


In [18]:
polish_stopwords = [
    'i', 'oraz', 'a', 'ale', 'czy', 'więc', 'lecz', 'że', 'to', 'z', 'na', 'do', 'po', 'przez',
    'dla', 'bez', 'od', 'pod', 'nad', 'u', 'o', 'w', 'jak', 'tak', 'nie', 'jest', 'są', 'być',
    'był', 'była', 'było', 'byli', 'się', 'też', 'ten', 'ta', 'to', 'ci', 'co', 'który', 'która',
    'które', 'którzy', 'kto', 'kogo', 'czego', 'dlaczego', 'dlatego', 'tam', 'tu', 'tutaj',
    'teraz', 'już', 'jeszcze', 'bardzo', 'może', 'muszę', 'musisz', 'można', 'trzeba', 'będzie',
    'będą', 'by', 'aby', 'gdy', 'gdyby', 'mimo', 'choć', 'chociaż', 'nawet', 'ani', 'żeby', 'czyli'
]

In [19]:
# === OvR: One-vs-Rest podejście do klasyfikacji stanu mieszkania + zapis ===

# 1. Przygotowanie danych bez nulli
df_clean = df_balanced.copy()
os.environ["PYCARET_N_JOBS"] = "1"

# --- Dodanie cech: cena i lokalizacja ---

assert 'Price' in df_clean.columns, "Brakuje kolumny 'Price'"
assert 'Location' in df_clean.columns, "Brakuje kolumny 'Location'"
# Przekształcenie kolumny 'Location' na wektory (one-hot encoding)
df_location_dummies = pd.get_dummies(df_clean['Location'], prefix='loc', drop_first=True)
# Skalowanie ceny (opcjonalne, ale zalecane)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
df_clean['Price_scaled'] = scaler.fit_transform(df_clean[['Price']])
# Dołączenie nowych cech
df_clean = pd.concat([df_clean.reset_index(drop=True), df_location_dummies.reset_index(drop=True)], axis=1)
# Usunięcie oryginalnych kolumn tekstowych (opcjonalnie)
df_clean = df_clean[df_clean['BuildingCondition'].notna()].reset_index(drop=True)

# 2. Przekształcenie opisu na wektory TF-IDF
vectorizer = TfidfVectorizer(
    max_features=1000,
    min_df=2,
    stop_words=polish_stopwords
)
X_text = vectorizer.fit_transform(df_clean['Description'].fillna('')).toarray()
df_tfidf = pd.DataFrame(X_text, columns=[f'tfidf_{i}' for i in range(X_text.shape[1])])
df_clean = pd.concat([df_clean.reset_index(drop=True), df_tfidf], axis=1)
df_clean.drop(columns=['Description'], inplace=True)

# 3. Lista klas do przewidywania

unique_stany = df_clean['BuildingCondition'].unique()
models = {}
results_list = []

for stan in unique_stany:
    print(f"\n🧪 Stan: {stan}")

    # 1. Przygotowanie targetu OvR
    df_temp = df_clean.copy()
    df_temp['target'] = df_temp['BuildingCondition'].apply(lambda x: 1 if x == stan else 0)
    
    # 2. Podział danych na trening/test
    X = df_temp.drop(columns=['target', 'BuildingCondition'])
    y = df_temp['target']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=1122
    )

    # 3. Setup PyCaret z training setu
    clf_setup = setup(
        data=pd.concat([X_train, y_train], axis=1),
        target='target',
        session_id=1122,
        fix_imbalance=True,
        fix_imbalance_method='SMOTE',
        html=False,
        verbose=True
    )

    # 4. Trenowanie i strojenie
    model = create_model('lightgbm')
    tuned_model = tune_model(model, n_iter=5, optimize='F1')

    models[stan] = tuned_model

    # 5. Predykcja na testowym zbiorze
    X_test_copy = X_test.copy()
    X_test_copy['target'] = y_test  # tylko po to, by zachować zgodność
    preds = predict_model(tuned_model, data=X_test_copy)

    y_true = preds['target']
    y_pred = preds['prediction_label']

    # 6. Metryki
    f1 = f1_score(y_true, y_pred, pos_label=1)
    recall = recall_score(y_true, y_pred, pos_label=1)
    cm = confusion_matrix(y_true, y_pred)

    print(f"\n📊 Wyniki dla klasy: {stan}")
    print(f"F1 Score: {f1:.3f}")
    print(f"Recall: {recall:.3f}")
    print(f"Confusion Matrix:\n{cm}")
    print(classification_report(y_true, y_pred, digits=3))

    # 7. Zapis metryk
    model_results = pull()
    model_results['target_class'] = stan
    model_results['F1_score_class_1'] = f1
    model_results['Recall_class_1'] = recall
    results_list.append(model_results)

# 4. Predykcja na danych z brakującym 'BuildingCondition'
df_nulls = df[df['BuildingCondition'].isna()].copy()
X_null_text = vectorizer.transform(df_nulls['Description'].fillna('')).toarray()
df_null_vec = pd.DataFrame(X_null_text, columns=[f'tfidf_{i}' for i in range(X_null_text.shape[1])])
df_nulls = pd.concat([df_nulls.reset_index(drop=True), df_null_vec], axis=1)
df_clean.drop(columns=['Description', 'Location', 'Price'], inplace=True)

# 5. Przewidzenie prawdopodobieństw i przypisanie klasy o najwyższej pewności
df_nulls["BuiltYear"] = pd.to_datetime(df_nulls["BuiltYear"], errors="coerce")
probabilities = {}

for stan, model in models.items():
        preds = predict_model(model, data=df_nulls, raw_score=True)
        probabilities[stan] = preds['Score_1'] 

# 6. Wybranie klasy z najwyższym Score
probs_df = pd.DataFrame(probabilities)
df_nulls['predicted_standard'] = probs_df.idxmax(axis=1)
df.loc[df_nulls.index, 'BuildingCondition'] = df_nulls['predicted_standard']

print("✅ Uzupełniono brakujące wartości w kolumnie 'BuildingCondition' na podstawie modeli OvR.")

# 7. Zapis do pliku CSV
df.to_csv("uzupelnione_mieszkania.csv", index=False)
print("💾 Zapisano do: uzupelnione_mieszkania.csv")

# 8. Zapis do bazy danych SQLite
conn = sqlite3.connect("uzupelnione_mieszkania.db")
df.to_sql("mieszkania", conn, if_exists="replace", index=False)
conn.close()
print("💾 Zapisano do: uzupelnione_mieszkania.db (tabela: mieszkania)")


🧪 Stan: DEVELOPER_STATE
                    Description             Value
0                    Session id              1122
1                        Target            target
2                   Target type            Binary
3           Original data shape      (3001, 2319)
4        Transformed data shape      (4051, 2334)
5   Transformed train set shape      (3150, 2334)
6    Transformed test set shape       (901, 2334)
7              Numeric features              1032
8                 Date features                 1
9          Categorical features                17
10     Rows with missing values            100.0%
11                   Preprocess              True
12              Imputation type            simple
13           Numeric imputation              mean
14       Categorical imputation              mode
15     Maximum one-hot encoding                25
16              Encoding method              None
17                Fix imbalance              True
18         Fix imbalance 

                                                                                                                       

      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.7524  0.4758  0.0000  0.0000  0.0000  0.0000  0.0000
1       0.7524  0.4887  0.0000  0.0000  0.0000  0.0000  0.0000
2       0.7524  0.5190  0.0000  0.0000  0.0000  0.0000  0.0000
3       0.7524  0.4710  0.0000  0.0000  0.0000  0.0000  0.0000
4       0.7524  0.5450  0.0000  0.0000  0.0000  0.0000  0.0000
5       0.7476  0.6162  0.0000  0.0000  0.0000  0.0000  0.0000
6       0.7476  0.5235  0.0000  0.0000  0.0000  0.0000  0.0000
7       0.7476  0.5963  0.0000  0.0000  0.0000  0.0000  0.0000
8       0.7524  0.4833  0.0377  0.6667  0.0714  0.0456  0.1148
9       0.7476  0.5992  0.0000  0.0000  0.0000  0.0000  0.0000
Mean    0.7505  0.5318  0.0038  0.0667  0.0071  0.0046  0.0115
Std     0.0023  0.0522  0.0113  0.2000  0.0214  0.0137  0.0344


Processing:  14%|██████████▎                                                             | 1/7 [00:00<00:03,  1.57it/s]

Fitting 10 folds for each of 5 candidates, totalling 50 fits


                                                                                                                       

      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.7524  0.4935  0.0000  0.0000  0.0000  0.0000  0.0000
1       0.7619  0.9663  0.0385  1.0000  0.0741  0.0568  0.1709
2       0.7524  0.8805  0.0000  0.0000  0.0000  0.0000  0.0000
3       0.7524  0.4999  0.0000  0.0000  0.0000  0.0000  0.0000
4       0.7524  0.5000  0.0000  0.0000  0.0000  0.0000  0.0000
5       0.7476  0.5187  0.0000  0.0000  0.0000  0.0000  0.0000
6       0.7476  0.5223  0.0000  0.0000  0.0000  0.0000  0.0000
7       0.7476  0.5156  0.0000  0.0000  0.0000  0.0000  0.0000
8       0.7524  0.5158  0.0377  0.6667  0.0714  0.0456  0.1148
9       0.7476  0.9623  0.0000  0.0000  0.0000  0.0000  0.0000
Mean    0.7514  0.6375  0.0076  0.1667  0.0146  0.0102  0.0286
Std     0.0042  0.1970  0.0152  0.3416  0.0291  0.0206  0.0585
                             Model  Accuracy     AUC  Recall  Prec.   F1  \
0  Light Gradient Boosting Machine    0.74

                                                                                                                       

      Accuracy     AUC  Recall   Prec.      F1   Kappa     MCC
Fold                                                          
0       0.7524  0.5113  0.0000  0.0000  0.0000  0.0000  0.0000
1       0.7524  0.4742  0.0000  0.0000  0.0000  0.0000  0.0000
2       0.7476  0.4763  0.0000  0.0000  0.0000 -0.0094 -0.0397
3       0.0000  0.0000  0.0000  0.0000  0.0000  0.0000  0.0000
4       0.7524  0.5329  0.0000  0.0000  0.0000  0.0000  0.0000
5       0.7476  0.5333  0.0000  0.0000  0.0000  0.0000  0.0000
6       0.7476  0.5314  0.0000  0.0000  0.0000  0.0000  0.0000
7       0.7524  0.5300  0.0377  0.6667  0.0714  0.0456  0.1148
8       0.7476  0.3882  0.0000  0.0000  0.0000  0.0000  0.0000
9       0.7476  0.5695  0.0000  0.0000  0.0000  0.0000  0.0000
Mean    0.6748  0.4547  0.0038  0.0667  0.0071  0.0036  0.0075
Std     0.2249  0.1588  0.0113  0.2000  0.0214  0.0143  0.0377


Processing:  14%|██████████▎                                                             | 1/7 [00:00<00:03,  1.62it/s]

Fitting 10 folds for each of 5 candidates, totalling 50 fits


                                                                                                                       

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
      Accuracy     AUC  Recall  Prec.      F1   Kappa     MCC
Fold                                                         
0       0.7524  0.5225  0.0000    0.0  0.0000  0.0000  0.0000
1       0.7524  0.5161  0.0000    0.0  0.0000  0.0000  0.0000
2       0.7524  0.5319  0.0000    0.0  0.0000  0.0000  0.0000
3       0.0000  0.0000  0.0000    0.0  0.0000  0.0000  0.0000
4       0.7524  0.8162  0.0000    0.0  0.0000  0.0000  0.0000
5       0.7476  0.5093  0.0000    0.0  0.0000  0.0000  0.0000
6       0.7476  0.5127  0.0000    0.0  0.0000  0.0000  0.0000
7       0.7524  0.5284  0.0189    1.0  0.0370  0.0280  0.1191
8       0.7476  0.6655  0.0000    0.0  0.0000  0.0000  0.0000
9       0.7476  0.5156  0.0000    0.0  0.0000  0.0000  0.0000
Mean    0.6752  0.5118  0.0019    0.1  0.0037  0.0028  0.0119
Std     0.2251  0.1947  0.0057    0.3  0.0111

                                                                                                                       

      Accuracy     AUC  Recall  Prec.      F1   Kappa     MCC
Fold                                                         
0       0.7524  0.5161  0.0000    0.0  0.0000  0.0000  0.0000
1       0.7524  0.5296  0.0000    0.0  0.0000  0.0000  0.0000
2       0.7524  0.4617  0.0000    0.0  0.0000  0.0000  0.0000
3       0.7524  0.5176  0.0000    0.0  0.0000  0.0000  0.0000
4       0.7524  0.4942  0.0000    0.0  0.0000  0.0000  0.0000
5       0.7571  0.4933  0.0377    1.0  0.0727  0.0554  0.1688
6       0.7476  0.4850  0.0000    0.0  0.0000  0.0000  0.0000
7       0.7476  0.5298  0.0000    0.0  0.0000  0.0000  0.0000
8       0.7381  0.4539  0.0000    0.0  0.0000 -0.0187 -0.0570
9       0.7476  0.5844  0.0000    0.0  0.0000  0.0000  0.0000
Mean    0.7500  0.5066  0.0038    0.1  0.0073  0.0037  0.0112
Std     0.0049  0.0359  0.0113    0.3  0.0218  0.0181  0.0552


Processing:  14%|██████████▎                                                             | 1/7 [00:00<00:02,  2.13it/s]

Fitting 10 folds for each of 5 candidates, totalling 50 fits


                                                                                                                       

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
      Accuracy     AUC  Recall  Prec.      F1   Kappa     MCC
Fold                                                         
0       0.7524  0.5096  0.0000    0.0  0.0000  0.0000  0.0000
1       0.7524  0.5000  0.0000    0.0  0.0000  0.0000  0.0000
2       0.7524  0.4489  0.0000    0.0  0.0000  0.0000  0.0000
3       0.7524  0.5096  0.0000    0.0  0.0000  0.0000  0.0000
4       0.7524  0.5000  0.0000    0.0  0.0000  0.0000  0.0000
5       0.7571  0.5189  0.0377    1.0  0.0727  0.0554  0.1688
6       0.7476  0.4968  0.0000    0.0  0.0000  0.0000  0.0000
7       0.7476  0.5000  0.0000    0.0  0.0000  0.0000  0.0000
8       0.7429  0.4936  0.0000    0.0  0.0000 -0.0094 -0.0402
9       0.7476  0.5000  0.0000    0.0  0.0000  0.0000  0.0000
Mean    0.7505  0.4977  0.0038    0.1  0.0073  0.0046  0.0129
Std     0.0038  0.0177  0.0113    0.3  0.0218

                                                                                                                       

      Accuracy     AUC  Recall  Prec.   F1   Kappa     MCC
Fold                                                      
0       0.7524  0.4673     0.0    0.0  0.0  0.0000  0.0000
1       0.7524  0.3625     0.0    0.0  0.0  0.0000  0.0000
2       0.7524  0.4781     0.0    0.0  0.0  0.0000  0.0000
3       0.7524  0.4690     0.0    0.0  0.0  0.0000  0.0000
4       0.7524  0.5234     0.0    0.0  0.0  0.0000  0.0000
5       0.7476  0.4866     0.0    0.0  0.0  0.0000  0.0000
6       0.7476  0.4609     0.0    0.0  0.0  0.0000  0.0000
7       0.7476  0.4587     0.0    0.0  0.0  0.0000  0.0000
8       0.7476  0.4896     0.0    0.0  0.0  0.0000  0.0000
9       0.7429  0.4532     0.0    0.0  0.0 -0.0094 -0.0402
Mean    0.7495  0.4649     0.0    0.0  0.0 -0.0009 -0.0040
Std     0.0032  0.0392     0.0    0.0  0.0  0.0028  0.0121


Processing:  14%|██████████▎                                                             | 1/7 [00:00<00:03,  1.64it/s]

Fitting 10 folds for each of 5 candidates, totalling 50 fits


                                                                                                                       

      Accuracy     AUC  Recall  Prec.      F1   Kappa     MCC
Fold                                                         
0       0.7524  0.5000  0.0000   0.00  0.0000  0.0000  0.0000
1       0.7524  0.4968  0.0000   0.00  0.0000  0.0000  0.0000
2       0.7524  0.5000  0.0000   0.00  0.0000  0.0000  0.0000
3       0.7524  0.5065  0.0000   0.00  0.0000  0.0000  0.0000
4       0.7524  0.5000  0.0000   0.00  0.0000  0.0000  0.0000
5       0.7476  0.5000  0.0000   0.00  0.0000  0.0000  0.0000
6       0.7476  0.5000  0.0000   0.00  0.0000  0.0000  0.0000
7       0.7429  0.4968  0.0000   0.00  0.0000 -0.0094 -0.0402
8       0.7476  0.5000  0.0000   0.00  0.0000  0.0000  0.0000
9       0.7476  0.5062  0.0189   0.50  0.0364  0.0183  0.0559
Mean    0.7495  0.5006  0.0019   0.05  0.0036  0.0009  0.0016
Std     0.0032  0.0031  0.0057   0.15  0.0109  0.0065  0.0217
                             Model  Accuracy     AUC  Recall  Prec.   F1  \
0  Light Gradient Boosting Machine    0.7483  0.4973    

KeyError: "['Description'] not found in axis"

In [None]:
results_df = pd.concat(results_list)
results_df.to_csv("ovr_results.csv", index=False)
print("💾 Zapisano metryki OvR do pliku: ovr_results.csv")

In [None]:
metryki=pd.read_csv("ovr_results.csv")

In [None]:
metryki

In [None]:
# Zebranie wyników z listy do DataFrame
results_df = pd.concat(results_list, ignore_index=True)

# Zachowujemy tylko wybrane kolumny, które nas interesują
summary_df = results_df[[
    'target_class', 
    'F1_score_class_1', 
    'Recall_class_1',
    'Accuracy', 
    'AUC', 
    'MCC'
]].sort_values(by='F1_score_class_1', ascending=False)

# Wyświetlenie wyników
print("\n📋 Podsumowanie metryk OvR dla każdej klasy BuildingCondition:")
display(summary_df)

In [None]:
df2=df.sample(10)
df2.to_csv("uzup_mieszkania_sample.csv", index=False)

In [None]:
df["BuildingCondition"].value_counts(normalize=True)

In [None]:
# Fatalny balans, mimo iż baza wstępna została zbalansowana, tutaj wyniki są trudne do zaakceptowania
# wartoć stanu Developer_State to aż 97% wszystkich predykcji. Aż trudno w to uwierzyć. Do sprawdzenia szczegółowego.