# PROJEKT MODELU KLASYFIKACYJNEGO PRZEWIDYWANIA STANU MIESZKAŃ NA PODSTAWIE #

## Extra Trees Classifier ##

### kwiecień 2025

In [18]:
import pandas as pd
from pycaret.classification import setup, create_model, tune_model, predict_model, evaluate_model, pull
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.utils import resample
import pymysql
from sqlalchemy import create_engine
import numpy as np
from scipy.stats import skewnorm
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# tą komórkę uruchom jeżeli bazę danych bierzesz np. z DBeaver, oracle developer itp.

#username = 'root'
#password = '1234'
#host = '127.0.0.1'
#port = 3306  
#database = 'projekt1'
#engine = create_engine(f'mysql+pymysql://{username}:{password}@{host}:{port}/{database}')

#df = pd.read_sql("SELECT * FROM saleflats", con=engine)

#engine.dispose()

In [None]:
# tę komórkę uruchom jeśli czerpiesz dane z pliku .csv
df = pd.read_csv('sale_2024_0.csv', sep=',')

In [None]:
df

In [None]:
# pobranie próbki 10 losowych wierszy
df.sample(10)

In [None]:
print(df.columns)

In [None]:
df['BuiltYear'] = pd.to_datetime(df['BuiltYear'], format='%Y', errors='coerce')

In [None]:
df_a = df.dropna(subset=['Description'])

In [None]:
df_a.isnull().sum()

In [None]:
df_b = df_a.dropna(subset=['Location'])

In [None]:
df_b.isnull().sum()

In [None]:
df_c = df_b.dropna(subset=['BuildingCondition'])

In [None]:
df_c.isnull().sum()

In [None]:
df_c

In [None]:
df_c['Description'] = df_c['Description'].str.slice(0, 3000)

In [None]:
# Przygotuj wektorizer z limitem
vectorizer = CountVectorizer(max_features=500)  # np. 500 najczęstszych słów
X_bow = vectorizer.fit_transform(df_c["Description"])

# Konwertuj do DataFrame
df_bow = pd.DataFrame(X_bow.toarray(), columns=vectorizer.get_feature_names_out())

# Dołącz do oryginalnych danych (z wyjątkiem kolumny Description)
df_c = df_c.drop(columns=['Description']).reset_index(drop=True)
df_bow = df_bow.reset_index(drop=True)
df_prepared = pd.concat([df_c, df_bow], axis=1)

In [None]:
exp = setup(
    data=df_prepared,
    target='BuildingCondition',
    session_id=1123,
    categorical_features=['BuildingType', 'Location'],
    keep_features=[ 
            'Description',
            'SaleId',
            'Score',
            'Type',
            'PricePerSquareMeter',
            'Price', 
            'Area', 
            'TypeOfMarket'
            ],
#    text_features=['Title'],
    date_features=['BuiltYear'],
    ordinal_features={
        'BuildingType': ['Pozostałe', 'Blok', 'Apartametowiec', 'Kamienica']
    },
    verbose=False
)
exp.dataset_transformed.sample(10)

In [None]:
df_prepared.info

In [None]:
df_prepared[df_prepared.duplicated()]

In [None]:
df_prepared.nunique()

In [None]:
df_prepared.isnull()

In [None]:
df_prepared.isnull().sum()

In [None]:
Q1 = df_prepared["Price"].quantile(0.25)
Q3 = df_prepared["Price"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_prep_p = df_prepared[~((df_prepared["Price"] < lower_bound) | (df_prepared["Price"] > upper_bound))]

In [None]:
Q1 = df_prep_p["PricePerSquareMeter"].quantile(0.25)
Q3 = df_prep_p["PricePerSquareMeter"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_prep_a = df_prep_p[~((df_prep_p["PricePerSquareMeter"] < lower_bound) | (df_prep_p["PricePerSquareMeter"] > upper_bound))]

In [None]:
Q1 = df_prep_a["Area"].quantile(0.25)
Q3 = df_prep_a["Area"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_prepared2 = df_prep_a[~((df_prep_a["Area"] < lower_bound) | (df_prep_a["Area"] > upper_bound))]

In [None]:
df_prepared2["BuildingCondition"].value_counts(normalize=True)

In [None]:
df_prepared2.nunique()

In [None]:
unique_btype=df_prepared2['BuildingType'].unique()

unique_btype

In [None]:
print(df_prepared2['BuildingCondition'].nunique())

In [None]:
df_prepared2.isnull().sum()

In [None]:
df_prepared3=df_prepared2.dropna(subset=['BuildingCondition'])

In [None]:
df_prepared3.loc[df_prepared3['TypeOfMarket'] == 'pierwotny', 'BuildingCondition'] = 'DEVELOPER_STATE'

In [None]:
df_prepared3['BuiltYear'] = pd.to_datetime(df_prepared3['BuiltYear'], format='%Y', errors='coerce')

In [None]:
df_prepared3.loc[df_prepared3['BuiltYear'].dt.year >= 2025, 'BuildingCondition'] = 'DEVELOPER_STATE'

In [None]:
df_prepared3 = df_prepared3[~((df_prepared3['Link'].str.contains('otodom', case=False, na=False)) & 
                              (df_prepared3['BuildingCondition'] == 'AFTER_RENOVATION'))]

In [None]:
unique_ctype=df_prepared3['BuildingCondition'].unique()

unique_ctype

In [None]:
df_prepared3["BuildingCondition"].value_counts(normalize=True)

In [None]:
# DODANIE ZBALANSOWANIA
# Sprawdź ile rekordów ma każda klasa
class_counts = df_prepared3['BuildingCondition'].value_counts()
min_count = class_counts.min()

# Tworzymy zbalansowane próbki dla każdej klasy
dfs = []
for condition in class_counts.index:
    df_condition = df_prepared3[df_prepared3['BuildingCondition'] == condition]
    df_condition_downsampled = resample(df_condition, 
                                        replace=False, 
                                        n_samples=min_count, 
                                        random_state=42)
    dfs.append(df_condition_downsampled)

# Łączymy wszystkie klasy w jeden zbalansowany DataFrame
df_balanced = pd.concat(dfs).reset_index(drop=True)

In [None]:
df_balanced["BuildingCondition"].value_counts(normalize=True)

In [None]:
df_balanced

In [None]:
unbalanced_exp = setup(
    data=df_balanced,
    target='BuildingCondition',
    session_id=1123,
    keep_features=[ 
            'Description',
            'SaleId',
             'Score',
             'Type',
             'PricePerSquareMeter',
             'Price', 
             'Area', 
             'TypeOfMarket',
             'Title'
            ],   
    categorical_features=['BuildingType', 'Location'],
    date_features=['BuiltYear'],
    ordinal_features={
#      'BuildingType': ['Pozostałe', 'Blok', 'Apartametowiec', 'Kamienica']
        'BuildingCondition': ['FOR_RENOVATION', 'GOOD', 'AFTER_RENOVATION', 'DEVELOPER_STATE']
    },
    verbose=False,

    ignore_features=[
        'RealPriceAfterRenovation',
#        'Title',
#        'Area', 
#        'Price', 
#        'PricePerSquareMeter',
#        'Type',
#        'Score',
#        'TypeOfMarket',
        'OfferPrice',
        'OriginalPrice',
        'OriginalId',
        'PortalId',
        'OfferFrom',
        'OwnerType',
        'DateAddedToDatabase',
        'DateAdded',
        'DateLastModification',
        'DateLastRaises',
        'NewestDate',
        'AvailableFrom',
        'Link',
        'Phone',
        'MainImage',
        'OtherImages',
        'NumberOfDuplicates',
        'NumberOfRaises',
        'NumberOfModifications',
        'IsDuplicatePriceLower',
        'IsDuplicatePrivateOwner',
        'ScorePrecision',
        'CommunityScore',
        'NumberOfCommunityComments',
        'NumberOfCommunityOpinions',
        'Archive',
        'VoivodeshipNumber',
        'CountyNumber',
        'CommunityNumber',
        'RegionNumber',
        'KindNumber',
        'SubRegionNumber',
        'StreetNumber',
        'EncryptedId',
        ],
         
       
)
best_unbalanced_model = unbalanced_exp.compare_models()
unbalanced_metrics_df = pull()

In [None]:
unbalanced_exp.plot_model(best_unbalanced_model, plot='error')

In [None]:
unbalanced_exp.plot_model(best_unbalanced_model, plot='feature')

In [None]:
tuned_model = unbalanced_exp.tune_model(best_unbalanced_model, n_iter=3, optimize='F1')

In [None]:
best_model = unbalanced_exp.compare_models([best_unbalanced_model, tuned_model])

In [None]:
unbalanced_exp.predict_model(best_model)

In [None]:
unbalanced_exp.plot_model(best_model, plot='error')

In [None]:
unbalanced_exp.plot_model(best_model, plot='feature')

In [None]:
best_tuned_model = unbalanced_exp.tune_model(
    best_model,
    optimize="F1",
    choose_better=True,
    fold=5,
)

In [None]:
unbalanced_exp.predict_model(best_tuned_model).head(10)

In [None]:
unbalanced_exp.plot_model(best_tuned_model, plot='error')

In [None]:
unbalanced_exp.plot_model(best_tuned_model, plot='feature')

In [None]:
best_final_model = unbalanced_exp.finalize_model(best_tuned_model)
best_final_model

In [None]:
unbalanced_exp.save_model(best_final_model, "0_best_buildingCond_model", verbose=False);

In [None]:
unbalanced_exp.predict_model(best_final_model).head(10)

In [None]:
predictions = unbalanced_exp.predict_model(best_final_model)

In [None]:
plot_model(best_final_model, plot="auc")

In [None]:
plot_model(best_final_model, plot="confusion_matrix")
# ile wartości z oryginalnego zbioru danych (True Class) algorytm zmienił na nową wartość po analizie(Predicted Class)
# np. 7 wartości oryginalnych o ocenie 1 zostało zmienionych na 0

## Confusion Matrix (Macierz pomyłek)

Confusion Matrix to tabela, która przedstawia liczbę poprawnych i niepoprawnych klasyfikacji dla każdej klasy. Ułatwia analizę, gdzie model popełnia błędy.

|               | Predicted Positive | Predicted Negative |
|---------------|--------------------|--------------------|
| Actual Positive | True Positive (TP)  | False Negative (FN) |
| Actual Negative | False Positive (FP) | True Negative (TN)  |
 
Analiza macierzy pomyłek pozwala zrozumieć, które klasy są mylone przez model oraz jaki jest balans między różnymi rodzajami błędów.

<span style="color:red">Idealny model miałby same wartości na przekątnej (TP i TN) oraz zera poza nią.</span>

In [None]:
plt.figure(figsize=(12, 6))
plt.barh(unbalanced_metrics_df['Model'], unbalanced_metrics_df['F1'], color='skyblue')
plt.xlabel('F1')
plt.title('Porównanie dokładności modeli wg. metryki F1')
plt.gca().invert_yaxis()  # najlepszy model na górze
plt.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

In [None]:
best_final_model.get_params()

In [None]:
predicted_column = predictions['prediction_label']
print(predicted_column)

In [None]:
df_last=predictions[['prediction_label','BuildingCondition', ]]

In [None]:
#df_last = df_last.reset_index()  
#print(df_last.columns) 

In [None]:
#df_last.rename(columns={'index': 'SaleId'}, inplace=True)

In [None]:
df_last.to_csv('0_sale_2024_stan.csv')

In [None]:
from pycaret.classification import load_model, predict_model

In [None]:
best_final_model = load_model("0_best_buildingCond_model")

In [None]:
df_last

In [None]:
# 1. Wczytaj dane
df_raw = pd.read_csv("sale_2024_0.csv")

# 2. Zrób tymczasową kopię do predykcji — z placeholderami
df_predict = df_raw.copy()
df_predict["BuildingCondition"] = df_predict["BuildingCondition"].fillna(" ")

In [None]:
df_predict['BuiltYear'] = pd.to_datetime(df_predict['BuiltYear'], format='%Y', errors='coerce')

In [None]:
df_predict[['Title', 'Description']] = df_predict[['Title', 'Description']].fillna('')

In [None]:
# 3. Setup z text_features (ważne!)
#exp = setup(
#    data=df_predict,
#    target='BuildingCondition',
#    session_id=1123,
#    categorical_features=['BuildingType', 'Location'],
#    text_features=['Title', 'Description'],
#    text_features_method='bow',
#    keep_features=['SaleId'],
#    date_features=['BuiltYear'],
#    ordinal_features={
#        'BuildingType': ['Pozostałe', 'Blok', 'Apartametowiec', 'Kamienica']
#    },
#    verbose=False
#)

In [None]:
# 1. Wyciągnij kolumny, które model zna
expected_cols = best_final_model.feature_names_in_

#  Napraw BuiltYear
df_raw['BuiltYear'] = pd.to_datetime(df_raw['BuiltYear'], errors='coerce')

# 2. Funkcja dopasowująca df_raw
def align_dataframe(df_raw, expected_cols):
    # Usuń 'BuildingCondition' z listy expected_cols
    expected_cols = [col for col in expected_cols if col != 'BuildingCondition']

    # Dodaj brakujące kolumny
    for col in expected_cols:
        if col not in df_raw.columns:
            df_raw[col] = 0  # albo NaN

    # Usuń BuildingCondition jeśli jest
    if 'BuildingCondition' in df_raw.columns:
        df_raw = df_raw.drop(columns=['BuildingCondition'])
    
    # Ustaw kolejność kolumn
    df_aligned = df_raw[expected_cols]

    return df_aligned

# 3. Użycie
df_aligned = align_dataframe(df_raw, expected_cols)

# 4. Predykcja
predictions = predict_model(best_final_model, data=df_aligned)

In [None]:
result = df_aligned.copy()
result['Prediction_State'] = predictions['prediction_label']

# 5. Wyświetl
print(result)

In [None]:
# 1. Skopiuj oryginalne dane
final_result = df_raw.copy()

# 2. Usuń starą kolumnę BuildingCondition
if 'BuildingCondition' in final_result.columns:
    final_result = final_result.drop(columns=['BuildingCondition'])

# 3. Dodaj nową kolumnę z predykcją
final_result['Predicted_State'] = predictions['prediction_label']

# 4. Wyświetl efekt
final_result

In [None]:
# 1. Wczytaj dane z pliku
df_raw = pd.read_csv("sale_2024_0.csv")

# 2. Dodaj kolumnę z predykcją do oryginalnego df_raw
df_raw['Predicted_State'] = predictions['prediction_label']

# 3. Wyświetl efekt
df_raw

In [None]:
df_raw = df_raw[~((df_raw['Link'].str.contains('otodom', case=False, na=False)) & 
                              (df_raw['BuildingCondition'] == 'AFTER_RENOVATION'))]

In [None]:
df_raw["Predicted_State"].value_counts(normalize=True)

In [None]:
df_raw

In [None]:
df_raw.to_csv('0_sale_2024_stan_final.csv')

In [None]:
df1=pd.read_csv("0_sale_2024_stan_final.csv")

In [None]:
df2=df1.sample(10)

In [None]:
df2

In [None]:
df2.to_csv('01_sample10.csv')

# Blok z metodą OvR uruchamiać z tego miejsca

## Uwaga nie zapomnieć o imporcie z pierwszej linijki

In [2]:
df=pd.read_csv("sale_2024_0.csv")

In [3]:
df_2 = df[~((df['Link'].str.contains('otodom', case=False, na=False)) & 
                              (df['BuildingCondition'] == 'AFTER_RENOVATION'))]

In [4]:
Q1 = df_2["Price"].quantile(0.25)
Q3 = df_2["Price"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_prep_p = df_2[~((df_2["Price"] < lower_bound) | (df_2["Price"] > upper_bound))]

In [5]:
Q1 = df_prep_p["PricePerSquareMeter"].quantile(0.25)
Q3 = df_prep_p["PricePerSquareMeter"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_prep_a = df_prep_p[~((df_prep_p["PricePerSquareMeter"] < lower_bound) | (df_prep_p["PricePerSquareMeter"] > upper_bound))]

In [6]:
Q1 = df_prep_a["Area"].quantile(0.25)
Q3 = df_prep_a["Area"].quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

df_prepared2 = df_prep_a[~((df_prep_a["Area"] < lower_bound) | (df_prep_a["Area"] > upper_bound))]

In [7]:
df_prepared3=df_prepared2.dropna(subset=['BuildingCondition'])

In [8]:
df_prepared3.loc[df_prepared3['TypeOfMarket'] == 'pierwotny', 'BuildingCondition'] = 'DEVELOPER_STATE'

In [9]:
df_prepared3['BuiltYear'] = pd.to_datetime(df_prepared3['BuiltYear'], format='%Y', errors='coerce')

In [10]:
df_prepared3.loc[df_prepared3['BuiltYear'].dt.year >= 2025, 'BuildingCondition'] = 'DEVELOPER_STATE'

In [11]:
df_prepared3['BuiltYear'] = pd.to_datetime(df_prepared3['BuiltYear'], format='%Y', errors='coerce')

In [12]:
unique_ctype=df_prepared3['BuildingCondition'].unique()

unique_ctype

array(['DEVELOPER_STATE', 'AFTER_RENOVATION', 'GOOD', 'FOR_RENOVATION'],
      dtype=object)

In [13]:
df_prepared3["BuildingCondition"].value_counts(normalize=True)

BuildingCondition
DEVELOPER_STATE     0.606077
AFTER_RENOVATION    0.183006
FOR_RENOVATION      0.108469
GOOD                0.102449
Name: proportion, dtype: float64

In [14]:
# DODANIE ZBALANSOWANIA
# Sprawdź ile rekordów ma każda klasa
class_counts = df_prepared3['BuildingCondition'].value_counts()
min_count = class_counts.min()

# Tworzymy zbalansowane próbki dla każdej klasy
dfs = []
for condition in class_counts.index:
    df_condition = df_prepared3[df_prepared3['BuildingCondition'] == condition]
    df_condition_downsampled = resample(df_condition, 
                                        replace=False, 
                                        n_samples=min_count, 
                                        random_state=42)
    dfs.append(df_condition_downsampled)

# Łączymy wszystkie klasy w jeden zbalansowany DataFrame
df_balanced = pd.concat(dfs).reset_index(drop=True)

In [15]:
df_balanced["BuildingCondition"].value_counts(normalize=True)

BuildingCondition
DEVELOPER_STATE     0.25
AFTER_RENOVATION    0.25
FOR_RENOVATION      0.25
GOOD                0.25
Name: proportion, dtype: float64

In [16]:
df_balanced

Unnamed: 0,SaleId,OriginalId,PortalId,Title,Description,Area,Price,OfferPrice,RealPriceAfterRenovation,OriginalPrice,...,Archive,Location,VoivodeshipNumber,CountyNumber,CommunityNumber,KindNumber,RegionNumber,SubRegionNumber,StreetNumber,EncryptedId
0,4266367,,3,Nowoczesne mieszkania w sercu Łodzi Manufaktura,Kup mieszkanie bezpośrednio od Dewelopera! Pra...,52.00,459000.0,553361.0,575637.67,,...,,"Łódzkie, Łódź, Łódź-śródmieście, Śródmieście",10.0,61.0,5.0,9.0,958447.0,,,dKKXZ1uFQJTb8QJJOLnifO8e0oyGIwz8-GAUcMCH2_k=
1,4390394,,3,3-pokojowe mieszkanie 60m2 + loggia,3-pokojowe mieszkanie numer 8-A008 na 2. piętr...,60.29,,,,,...,,"Mazowieckie, Warszawa, Ursus, Gołąbki, Ul. Kaz...",14.0,65.0,12.0,8.0,918666.0,,32907.0,oydtk6bW4a5uEscrvIRGJ2DSFhxJitH2UJuyXKHVWIY=
2,4378127,,3,1 pokojowe (możliwość wydzielenia 2) |Czyżyny,Nowe mieszkanie z balkonem - Kraków Czyżyny Na...,32.62,515409.0,490700.0,707000.00,,...,,"Małopolskie, Kraków, Kraków-nowa Huta, Łęg, Ul...",12.0,61.0,3.0,9.0,950813.0,,60055.0,xGLchc43KA01zKcK3z0v4oaCdEfvBnZExzKfPS9GXW0=
3,4346014,,3,‼️Odbierz dziś duży Rabat -> 2 pok + Garaż‼️,"Nowe mieszkania, od dewelopera, bez podatku PC...",40.24,431156.0,428409.0,431459.00,428020.0,...,,"Pomorskie, Gdynia, Wiczlino",22.0,62.0,1.0,1.0,934429.0,,,5sT4Ar2swa4Sj0jt-GLT-zY_qkdVfWyqsbDtbsFdaZc=
4,4407014,,3,Mieszkanie Staroniwska Ogród,Zapraszamy do odkrycia naszej najnowszej inwes...,54.66,577300.0,624750.0,702500.00,,...,,"Podkarpackie, Rzeszów, Staroniwa",18.0,63.0,1.0,1.0,974245.0,,,0K6mgKFRS_FZAHJ2o8bTsjY_qkdVfWyqsbDtbsFdaZc=
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23207,4365731,,17,"Apartament, ul. Saperska","LEGNICA, ULICA SAPERSKA 1. 2 pokoje, balkon, p...",41.25,395000.0,,,,...,,"Dolnośląskie, Legnica, Tarninów, Ul. Saperska",2.0,62.0,1.0,1.0,1067598.0,,19509.0,30wxJBw-tetEqvu_DkzJj0m9CVG6yYs9U2KBME0Psic=
23208,4402007,,17,"Mieszkanie, ul. Modzelewskiego","Fenomenalnie wykończone, po remoncie mieszkani...",57.00,945000.0,829000.0,916000.00,,...,,"Mazowieckie, Warszawa, Mokotów, Wierzbno, Ul. ...",14.0,65.0,5.0,8.0,918577.0,,13192.0,adVPXWEdw4iBsck2kiqyTSxEZaoqZA0WSjJKH4A5guQ=
23209,4402899,,17,"Mieszkanie, ul. Sępa-Szarzyńskiego",Oferta na wyłączność! Nie znajdziecie jej pańs...,67.00,703500.0,849999.0,1114000.00,,...,,"Dolnośląskie, Wrocław, Wrocław-śródmieście, Pl...",2.0,64.0,6.0,9.0,986998.0,,19733.0,tAmtgaq_mFTFs4A7O3FG0JgtwTGeq42piasdfG4vjEE=
23210,4408927,,32,Sprzedam mieszkanie Jasło Jasło,Sprzedam mieszkanie do remontu w centrum Jasła...,49.00,250.0,265000.0,359966.67,,...,,"Podkarpackie, Jasielski, Jasło, Ul. Adama Mick...",18.0,5.0,1.0,1.0,953059.0,,12740.0,WTInZhDcqXMSLo2DspXxioaCdEfvBnZExzKfPS9GXW0=


In [39]:
polish_stopwords = [
    'i', 'oraz', 'a', 'ale', 'czy', 'więc', 'lecz', 'że', 'to', 'z', 'na', 'do', 'po', 'przez',
    'dla', 'bez', 'od', 'pod', 'nad', 'u', 'o', 'w', 'jak', 'tak', 'nie', 'jest', 'są', 'być',
    'był', 'była', 'było', 'byli', 'się', 'też', 'ten', 'ta', 'to', 'ci', 'co', 'który', 'która',
    'które', 'którzy', 'kto', 'kogo', 'czego', 'dlaczego', 'dlatego', 'tam', 'tu', 'tutaj',
    'teraz', 'już', 'jeszcze', 'bardzo', 'może', 'muszę', 'musisz', 'można', 'trzeba', 'będzie',
    'będą', 'by', 'aby', 'gdy', 'gdyby', 'mimo', 'choć', 'chociaż', 'nawet', 'ani', 'żeby', 'czyli'
]

In [40]:
# === OvR: One-vs-Rest podejście do klasyfikacji stanu mieszkania + zapis ===

from pycaret.classification import setup, create_model, tune_model, predict_model
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import sqlite3

# 1. Przygotowanie danych bez nulli
df_clean = df_balanced.copy()
df_clean = df_clean[df_clean['BuildingCondition'].notna()].reset_index(drop=True)

# 2. Przekształcenie opisu na wektory TF-IDF
vectorizer = TfidfVectorizer(
    max_features=3000,
    min_df=2,
    stop_words=polish_stopwords
)
X_text = vectorizer.fit_transform(df_clean['Description'].fillna('')).toarray()
df_tfidf = pd.DataFrame(X_text, columns=[f'tfidf_{i}' for i in range(X_text.shape[1])])
df_clean = pd.concat([df_clean.reset_index(drop=True), df_tfidf], axis=1)
df_clean.drop(columns=['Description'], inplace=True)

# 3. Lista klas do przewidywania
unique_stany = df_clean['BuildingCondition'].unique()
models = {}
results_list = []

for stan in unique_stany:
    df_temp = df_clean.copy()
    df_temp['target'] = df_temp['BuildingCondition'].apply(lambda x: 1 if x == stan else 0)
    print(f"🧪 Stan: {stan} — Target count:\n{df_temp['target'].value_counts()}")
    print(df_temp.filter(like='tfidf_').describe())
    
    clf_setup = setup(
        data=df_temp.drop(columns=['BuildingCondition']),
        target='target',
        session_id=123,
        verbose=False,
        html=False,
        fix_imbalance=True,
        fix_imbalance_method='smote'  # lub 'random'
    )
    model = create_model('lightgbm')
    tuned_model = tune_model(model)
    models[stan] = tuned_model

# ✳️ WYDRUKUJ metryki tego modelu
    print(f"\n📊 Ewaluacja modelu dla klasy: {stan}")
    print(pull())  # wyciąga ostatnie metryki z tune_model()
    evaluate_model(tuned_model)

# 📝 ZAPISZ metryki do listy
    model_results = pull()
    model_results['target_class'] = stan
    results_list.append(model_results)

# 4. Predykcja na danych z brakującym 'BuildingCondition'
df_nulls = df[df['BuildingCondition'].isna()].copy()
X_null_text = vectorizer.transform(df_nulls['Description'].fillna('')).toarray()
df_null_vec = pd.DataFrame(X_null_text, columns=[f'tfidf_{i}' for i in range(X_null_text.shape[1])])
df_nulls = pd.concat([df_nulls.reset_index(drop=True), df_null_vec], axis=1)
df_nulls.drop(columns=['Description'], inplace=True)

# 5. Przewidzenie prawdopodobieństw i przypisanie klasy o najwyższej pewności
df_nulls["BuiltYear"] = pd.to_datetime(df_nulls["BuiltYear"], errors="coerce")
probabilities = {}

for stan, model in models.items():
    preds = predict_model(model, data=df_nulls, raw_score=False)
    probabilities[stan] = preds['Score']

# 6. Wybranie klasy z najwyższym Score
probs_df = pd.DataFrame(probabilities)
df_nulls['predicted_standard'] = probs_df.idxmax(axis=1)
df.loc[df_nulls.index, 'BuildingCondition'] = df_nulls['predicted_standard']

print("✅ Uzupełniono brakujące wartości w kolumnie 'BuildingCondition' na podstawie modeli OvR.")

# 7. Zapis do pliku CSV
df.to_csv("uzupelnione_mieszkania.csv", index=False)
print("💾 Zapisano do: uzupelnione_mieszkania.csv")

# 8. Zapis do bazy danych SQLite
conn = sqlite3.connect("uzupelnione_mieszkania.db")
df.to_sql("mieszkania", conn, if_exists="replace", index=False)
conn.close()
print("💾 Zapisano do: uzupelnione_mieszkania.db (tabela: mieszkania)")

🧪 Stan: DEVELOPER_STATE — Target count:
target
0    17409
1     5803
Name: count, dtype: int64



                                                                                                                       

            tfidf_0       tfidf_1       tfidf_2       tfidf_3       tfidf_4  \
count  23212.000000  23212.000000  23212.000000  23212.000000  23212.000000   
mean       0.011340      0.015096      0.003257      0.002044      0.001859   
std        0.040914      0.031982      0.024132      0.014771      0.014461   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        0.000000      0.000000      0.000000      0.000000      0.000000   
50%        0.000000      0.000000      0.000000      0.000000      0.000000   
75%        0.000000      0.000000      0.000000      0.000000      0.000000   
max        0.567200      0.401486      0.654289      0.376434      0.561324   

            tfidf_5       tfidf_6       tfidf_7       tfidf_8       tfidf_9  \
count  23212.000000  23212.000000  23212.000000  23212.000000  23212.000000   
mean       0.002279      0.002761      0.001671      0.001371      0.001397   
std        0.014818      0.016426      0.013991    

                                                                                                                       

      Accuracy     AUC  Recall  Prec.   F1  Kappa  MCC
Fold                                                  
0       0.7502  0.5616     0.0    0.0  0.0    0.0  0.0
1       0.7502  0.6244     0.0    0.0  0.0    0.0  0.0
2       0.7502  0.5149     0.0    0.0  0.0    0.0  0.0
3       0.7502  0.5784     0.0    0.0  0.0    0.0  0.0
4       0.7502  0.5663     0.0    0.0  0.0    0.0  0.0
5       0.7502  0.4886     0.0    0.0  0.0    0.0  0.0
6       0.7495  0.5665     0.0    0.0  0.0    0.0  0.0
7       0.7495  0.7776     0.0    0.0  0.0    0.0  0.0
8       0.7500  0.5239     0.0    0.0  0.0    0.0  0.0
9       0.7500  0.5613     0.0    0.0  0.0    0.0  0.0
Mean    0.7500  0.5763     0.0    0.0  0.0    0.0  0.0
Std     0.0002  0.0760     0.0    0.0  0.0    0.0  0.0


Processing:  14%|██████████▎                                                             | 1/7 [00:06<00:38,  6.49s/it]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                                                                                       

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
      Accuracy     AUC  Recall  Prec.   F1  Kappa  MCC
Fold                                                  
0       0.7502  0.5066     0.0    0.0  0.0    0.0  0.0
1       0.7502  0.5099     0.0    0.0  0.0    0.0  0.0
2       0.7502  0.5099     0.0    0.0  0.0    0.0  0.0
3       0.7502  0.5119     0.0    0.0  0.0    0.0  0.0
4       0.7502  0.5004     0.0    0.0  0.0    0.0  0.0
5       0.7502  0.5041     0.0    0.0  0.0    0.0  0.0
6       0.7495  0.5012     0.0    0.0  0.0    0.0  0.0
7       0.7495  0.5098     0.0    0.0  0.0    0.0  0.0
8       0.7500  0.5033     0.0    0.0  0.0    0.0  0.0
9       0.7500  0.5058     0.0    0.0  0.0    0.0  0.0
Mean    0.7500  0.5063     0.0    0.0  0.0    0.0  0.0
Std     0.0002  0.0038     0.0    0.0  0.0    0.0  0.0

📊 Ewaluacja modelu dla klasy: DEVELOPER_STATE
      Accuracy     AUC  Recall  Pr

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

🧪 Stan: AFTER_RENOVATION — Target count:
target
0    17409
1     5803
Name: count, dtype: int64
            tfidf_0       tfidf_1       tfidf_2       tfidf_3       tfidf_4  \
count  23212.000000  23212.000000  23212.000000  23212.000000  23212.000000   
mean       0.011340      0.015096      0.003257      0.002044      0.001859   
std        0.040914      0.031982      0.024132      0.014771      0.014461   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        0.000000      0.000000      0.000000      0.000000      0.000000   
50%        0.000000      0.000000      0.000000      0.000000      0.000000   
75%        0.000000      0.000000      0.000000      0.000000      0.000000   
max        0.567200      0.401486      0.654289      0.376434      0.561324   

            tfidf_5       tfidf_6       tfidf_7       tfidf_8       tfidf_9  \
count  23212.000000  23212.000000  23212.000000  23212.000000  23212.000000   
mean       0.002279      0.002761 

                                                                                                                       

      Accuracy     AUC  Recall  Prec.   F1  Kappa  MCC
Fold                                                  
0       0.7502  0.4847     0.0    0.0  0.0    0.0  0.0
1       0.7502  0.4962     0.0    0.0  0.0    0.0  0.0
2       0.7502  0.5347     0.0    0.0  0.0    0.0  0.0
3       0.7502  0.5338     0.0    0.0  0.0    0.0  0.0
4       0.7502  0.5182     0.0    0.0  0.0    0.0  0.0
5       0.7502  0.5105     0.0    0.0  0.0    0.0  0.0
6       0.7495  0.5143     0.0    0.0  0.0    0.0  0.0
7       0.7495  0.5070     0.0    0.0  0.0    0.0  0.0
8       0.7500  0.4984     0.0    0.0  0.0    0.0  0.0
9       0.7500  0.5523     0.0    0.0  0.0    0.0  0.0
Mean    0.7500  0.5150     0.0    0.0  0.0    0.0  0.0
Std     0.0002  0.0194     0.0    0.0  0.0    0.0  0.0


Processing:  14%|██████████▎                                                             | 1/7 [00:07<00:43,  7.30s/it]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                                                                                       

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
      Accuracy     AUC  Recall  Prec.   F1  Kappa  MCC
Fold                                                  
0       0.7502  0.5016     0.0    0.0  0.0    0.0  0.0
1       0.7502  0.5094     0.0    0.0  0.0    0.0  0.0
2       0.7502  0.5004     0.0    0.0  0.0    0.0  0.0
3       0.7502  0.5000     0.0    0.0  0.0    0.0  0.0
4       0.7502  0.5000     0.0    0.0  0.0    0.0  0.0
5       0.7502  0.5000     0.0    0.0  0.0    0.0  0.0
6       0.7495  0.5000     0.0    0.0  0.0    0.0  0.0
7       0.7495  0.5000     0.0    0.0  0.0    0.0  0.0
8       0.7500  0.5000     0.0    0.0  0.0    0.0  0.0
9       0.7500  0.5033     0.0    0.0  0.0    0.0  0.0
Mean    0.7500  0.5015     0.0    0.0  0.0    0.0  0.0
Std     0.0002  0.0028     0.0    0.0  0.0    0.0  0.0

📊 Ewaluacja modelu dla klasy: AFTER_RENOVATION
      Accuracy     AUC  Recall  P

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

🧪 Stan: FOR_RENOVATION — Target count:
target
0    17409
1     5803
Name: count, dtype: int64
            tfidf_0       tfidf_1       tfidf_2       tfidf_3       tfidf_4  \
count  23212.000000  23212.000000  23212.000000  23212.000000  23212.000000   
mean       0.011340      0.015096      0.003257      0.002044      0.001859   
std        0.040914      0.031982      0.024132      0.014771      0.014461   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        0.000000      0.000000      0.000000      0.000000      0.000000   
50%        0.000000      0.000000      0.000000      0.000000      0.000000   
75%        0.000000      0.000000      0.000000      0.000000      0.000000   
max        0.567200      0.401486      0.654289      0.376434      0.561324   

            tfidf_5       tfidf_6       tfidf_7       tfidf_8       tfidf_9  \
count  23212.000000  23212.000000  23212.000000  23212.000000  23212.000000   
mean       0.002279      0.002761   

                                                                                                                       

      Accuracy     AUC  Recall  Prec.   F1  Kappa  MCC
Fold                                                  
0       0.7502  0.5930     0.0    0.0  0.0    0.0  0.0
1       0.7502  0.5664     0.0    0.0  0.0    0.0  0.0
2       0.7502  0.5084     0.0    0.0  0.0    0.0  0.0
3       0.7502  0.5210     0.0    0.0  0.0    0.0  0.0
4       0.7502  0.5178     0.0    0.0  0.0    0.0  0.0
5       0.7502  0.5616     0.0    0.0  0.0    0.0  0.0
6       0.7495  0.6010     0.0    0.0  0.0    0.0  0.0
7       0.7495  0.5377     0.0    0.0  0.0    0.0  0.0
8       0.7500  0.5635     0.0    0.0  0.0    0.0  0.0
9       0.7500  0.6120     0.0    0.0  0.0    0.0  0.0
Mean    0.7500  0.5582     0.0    0.0  0.0    0.0  0.0
Std     0.0002  0.0346     0.0    0.0  0.0    0.0  0.0


Processing:  14%|██████████▎                                                             | 1/7 [00:08<00:51,  8.66s/it]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                                                                                       

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
      Accuracy     AUC  Recall  Prec.   F1  Kappa  MCC
Fold                                                  
0       0.7502  0.5053     0.0    0.0  0.0    0.0  0.0
1       0.7502  0.5000     0.0    0.0  0.0    0.0  0.0
2       0.7502  0.5033     0.0    0.0  0.0    0.0  0.0
3       0.7502  0.4988     0.0    0.0  0.0    0.0  0.0
4       0.7502  0.5033     0.0    0.0  0.0    0.0  0.0
5       0.7502  0.5004     0.0    0.0  0.0    0.0  0.0
6       0.7495  0.5000     0.0    0.0  0.0    0.0  0.0
7       0.7495  0.5000     0.0    0.0  0.0    0.0  0.0
8       0.7500  0.5016     0.0    0.0  0.0    0.0  0.0
9       0.7500  0.5000     0.0    0.0  0.0    0.0  0.0
Mean    0.7500  0.5013     0.0    0.0  0.0    0.0  0.0
Std     0.0002  0.0020     0.0    0.0  0.0    0.0  0.0

📊 Ewaluacja modelu dla klasy: FOR_RENOVATION
      Accuracy     AUC  Recall  Pre

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

🧪 Stan: GOOD — Target count:
target
0    17409
1     5803
Name: count, dtype: int64
            tfidf_0       tfidf_1       tfidf_2       tfidf_3       tfidf_4  \
count  23212.000000  23212.000000  23212.000000  23212.000000  23212.000000   
mean       0.011340      0.015096      0.003257      0.002044      0.001859   
std        0.040914      0.031982      0.024132      0.014771      0.014461   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        0.000000      0.000000      0.000000      0.000000      0.000000   
50%        0.000000      0.000000      0.000000      0.000000      0.000000   
75%        0.000000      0.000000      0.000000      0.000000      0.000000   
max        0.567200      0.401486      0.654289      0.376434      0.561324   

            tfidf_5       tfidf_6       tfidf_7       tfidf_8       tfidf_9  \
count  23212.000000  23212.000000  23212.000000  23212.000000  23212.000000   
mean       0.002279      0.002761      0.00167

                                                                                                                       

      Accuracy     AUC  Recall  Prec.   F1  Kappa  MCC
Fold                                                  
0       0.7502  0.5254     0.0    0.0  0.0    0.0  0.0
1       0.7502  0.5456     0.0    0.0  0.0    0.0  0.0
2       0.7502  0.5265     0.0    0.0  0.0    0.0  0.0
3       0.7502  0.5362     0.0    0.0  0.0    0.0  0.0
4       0.7502  0.5410     0.0    0.0  0.0    0.0  0.0
5       0.7502  0.5271     0.0    0.0  0.0    0.0  0.0
6       0.7495  0.6071     0.0    0.0  0.0    0.0  0.0
7       0.7495  0.5665     0.0    0.0  0.0    0.0  0.0
8       0.7500  0.5398     0.0    0.0  0.0    0.0  0.0
9       0.7500  0.5536     0.0    0.0  0.0    0.0  0.0
Mean    0.7500  0.5469     0.0    0.0  0.0    0.0  0.0
Std     0.0002  0.0235     0.0    0.0  0.0    0.0  0.0


Processing:  14%|██████████▎                                                             | 1/7 [00:08<00:52,  8.75s/it]

Fitting 10 folds for each of 10 candidates, totalling 100 fits


                                                                                                                       

Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).
      Accuracy     AUC  Recall  Prec.   F1  Kappa  MCC
Fold                                                  
0       0.7502  0.5000     0.0    0.0  0.0    0.0  0.0
1       0.7502  0.5074     0.0    0.0  0.0    0.0  0.0
2       0.7502  0.5000     0.0    0.0  0.0    0.0  0.0
3       0.7502  0.5037     0.0    0.0  0.0    0.0  0.0
4       0.7502  0.5185     0.0    0.0  0.0    0.0  0.0
5       0.7502  0.4996     0.0    0.0  0.0    0.0  0.0
6       0.7495  0.5000     0.0    0.0  0.0    0.0  0.0
7       0.7495  0.5000     0.0    0.0  0.0    0.0  0.0
8       0.7500  0.5127     0.0    0.0  0.0    0.0  0.0
9       0.7500  0.5000     0.0    0.0  0.0    0.0  0.0
Mean    0.7500  0.5042     0.0    0.0  0.0    0.0  0.0
Std     0.0002  0.0063     0.0    0.0  0.0    0.0  0.0

📊 Ewaluacja modelu dla klasy: GOOD
      Accuracy     AUC  Recall  Prec.   F1  K

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

✅ Uzupełniono brakujące wartości w kolumnie 'BuildingCondition' na podstawie modeli OvR.
💾 Zapisano do: uzupelnione_mieszkania.csv
💾 Zapisano do: uzupelnione_mieszkania.db (tabela: mieszkania)


In [20]:
results_df = pd.concat(results_list)
results_df.to_csv("ovr_results.csv", index=False)
print("💾 Zapisano metryki OvR do pliku: ovr_results.csv")

💾 Zapisano metryki OvR do pliku: ovr_results.csv


In [21]:
metryki=pd.read_csv("ovr_results.csv")

In [22]:
metryki

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,target_class
0,0.7502,0.5,0.0,0.0,0.0,0.0,0.0,DEVELOPER_STATE
1,0.7502,0.5,0.0,0.0,0.0,0.0,0.0,DEVELOPER_STATE
2,0.7502,0.5,0.0,0.0,0.0,0.0,0.0,DEVELOPER_STATE
3,0.7502,0.5,0.0,0.0,0.0,0.0,0.0,DEVELOPER_STATE
4,0.7502,0.5,0.0,0.0,0.0,0.0,0.0,DEVELOPER_STATE
5,0.7502,0.5,0.0,0.0,0.0,0.0,0.0,DEVELOPER_STATE
6,0.7495,0.5,0.0,0.0,0.0,0.0,0.0,DEVELOPER_STATE
7,0.7495,0.5,0.0,0.0,0.0,0.0,0.0,DEVELOPER_STATE
8,0.75,0.5,0.0,0.0,0.0,0.0,0.0,DEVELOPER_STATE
9,0.75,0.5,0.0,0.0,0.0,0.0,0.0,DEVELOPER_STATE


In [23]:
df2=df.sample(10)
df2.to_csv("uzup_mieszkania_sample.csv", index=False)

In [24]:
df["BuildingCondition"].value_counts(normalize=True)

BuildingCondition
DEVELOPER_STATE     0.970245
AFTER_RENOVATION    0.020038
GOOD                0.004998
FOR_RENOVATION      0.004719
Name: proportion, dtype: float64

In [None]:
# Fatalny balans, mimo iż baza wstępna została zbalansowana, tutaj wyniki są trudne do zaakceptowania
# wartoć stanu Developer_State to aż 97% wszystkich predykcji. Aż trudno w to uwierzyć. Do sprawdzenia szczegółowego.