# PROJEKT MODELU KLASYFIKACYJNEGO PRZEWIDYWANIA STANU MIESZKAŃ NA PODSTAWIE DANYCH Z OGŁOSZEŃ SPRZEDAŻY #

## Extra Trees Classifier ##

### kwiecień 2025

In [None]:
import pandas as pd
from pycaret.classification import setup, pull, compare_models, plot_model
from sklearn.feature_extraction.text import CountVectorizer
import pymysql
from sqlalchemy import create_engine
import numpy as np
from scipy.stats import skewnorm
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# tę komórkę uruchom, jeżeli dane pobierasz z bazy danych (np. przez DBeaver, Oracle SQL Developer itp.)

#username = 'root'
#password = '1234'
#host = '127.0.0.1'
#port = 3306  
#database = 'projekt1'
#engine = create_engine(f'mysql+pymysql://{username}:{password}@{host}:{port}/{database}')

#df = pd.read_sql("SELECT * FROM saleflats", con=engine)

#engine.dispose()

In [None]:
# Run this cell when the data comes from a .csv file.
df = pd.read_csv(
    filepath_or_buffer='sale_2024_0.csv',
    sep=',',
)

In [None]:
df

In [None]:
# pobranie próbki 10 losowych wierszy
df.sample(10)

In [None]:
print(df.columns)

In [None]:
# Parse the construction year as a year-only date; anything that does not
# match the '%Y' format is coerced to NaT instead of raising.
# NOTE(review): if the CSV column is read as float (e.g. 1990.0 because of
# missing values) the '%Y' match may fail for every row — confirm the dtype.
df['BuiltYear'] = pd.to_datetime(
    df['BuiltYear'],
    errors='coerce',
    format='%Y',
)

In [None]:
# Keep only the listings that have a description.
df_a = df[df['Description'].notna()]

In [None]:
df_a.isnull().sum()

In [None]:
# Keep only the listings that have a location.
df_b = df_a[df_a['Location'].notna()]

In [None]:
df_b.isnull().sum()

In [None]:
# Keep only the listings whose target (BuildingCondition) is known.
df_c = df_b[df_b['BuildingCondition'].notna()]

In [None]:
df_c.isnull().sum()

In [None]:
df_c

In [None]:
# Truncate every description to its first 300 characters.
# df_c is the result of a dropna() on another frame, so writing a column into
# it in place triggers pandas' SettingWithCopyWarning; assign() builds a new
# frame instead and leaves no ambiguity about what is being modified.
df_c = df_c.assign(Description=df_c['Description'].str.slice(0, 300))

In [None]:
# Bag-of-words over the descriptions, capped at 500 vocabulary entries.
vectorizer = CountVectorizer(max_features=500)
X_bow = vectorizer.fit_transform(df_c["Description"])

# Dense token-count table, one column per vocabulary entry, with a fresh
# 0..n-1 index so it lines up with the re-indexed listing frame below.
df_bow = pd.DataFrame(
    data=X_bow.toarray(),
    columns=vectorizer.get_feature_names_out(),
).reset_index(drop=True)

# The raw text is replaced by its token counts: drop it, re-index, and glue
# the two frames together column-wise.
df_c = df_c.drop('Description', axis=1).reset_index(drop=True)
df_prepared = pd.concat([df_c, df_bow], axis=1)

In [None]:
# Exploratory PyCaret experiment on the prepared frame, before any outlier
# removal, followed by a peek at 10 random rows of the transformed data.
# NOTE(review): 'Description' was dropped from df_prepared when the
# bag-of-words columns were built, so its keep_features entry presumably has
# no effect — confirm.
# NOTE(review): 'BuildingType' is listed both as categorical and as ordinal —
# confirm which encoding is intended.
exp = setup(
    df_prepared,
    target='BuildingCondition',
    session_id=1123,
    verbose=False,
    date_features=['BuiltYear'],
    categorical_features=['BuildingType', 'Location'],
    ordinal_features={
        'BuildingType': ['Pozostałe', 'Blok', 'Apartametowiec', 'Kamienica'],
    },
    keep_features=['Description', 'SaleId'],
)
exp.dataset_transformed.sample(10)

In [None]:
# Print the column/dtype/non-null summary. Without the call parentheses the
# cell only displayed the bound method object instead of running info().
df_prepared.info()

In [None]:
df_prepared[df_prepared.duplicated()]

In [None]:
df_prepared.nunique()

In [None]:
df_prepared.isnull()

In [None]:
df_prepared.isnull().sum()

In [None]:
# Tukey fences on Price: rows outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are dropped.
Q1, Q3 = (df_prepared["Price"].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# "not (below or above)" written as "not below and not above"; rows with a
# missing Price fail both comparisons and are therefore kept.
df_prep_p = df_prepared[
    ~(df_prepared["Price"] < lower_bound) & ~(df_prepared["Price"] > upper_bound)
]

In [None]:
# Same Tukey-fence filter, now on PricePerSquareMeter of the price-filtered frame.
Q1, Q3 = (df_prep_p["PricePerSquareMeter"].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Rows with a missing value fail both comparisons and are therefore kept.
df_prep_a = df_prep_p[
    ~(df_prep_p["PricePerSquareMeter"] < lower_bound)
    & ~(df_prep_p["PricePerSquareMeter"] > upper_bound)
]

In [None]:
# Final Tukey-fence filter, on Area; the result is the cleaned modelling frame.
Q1, Q3 = (df_prep_a["Area"].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Rows with a missing Area fail both comparisons and are therefore kept.
df_prepared2 = df_prep_a[
    ~(df_prep_a["Area"] < lower_bound) & ~(df_prep_a["Area"] > upper_bound)
]

In [None]:
df_prepared2["BuildingCondition"].value_counts(normalize=True)

In [None]:
df_prepared2.nunique()

In [None]:
# Distinct building types left after cleaning, in order of first appearance.
unique_btype = pd.unique(df_prepared2['BuildingType'])

unique_btype

In [None]:
print(df_prepared2['BuildingCondition'].nunique())

In [None]:
# Distinct target classes left after cleaning, in order of first appearance.
unique_ctype = pd.unique(df_prepared2['BuildingCondition'])

unique_ctype

In [None]:
df_prepared2.isnull().sum()

In [None]:
# Make sure no row without a target value reaches the experiment.
df_prepared3 = df_prepared2[df_prepared2['BuildingCondition'].notna()]

In [None]:
# Columns excluded from modelling: alternative price fields, portal/listing
# identifiers, free-text and contact fields, dates, duplicate/raise counters,
# scoring fields and administrative region codes.
_ignored_columns = [
    'RealPriceAfterRenovation',
    'OfferPrice',
    'OriginalPrice',
    'PricePerSquareMeter',
    'OriginalId',
    'PortalId',
    'Title',
    'Type',
    'OfferFrom',
    'TypeOfMarket',
    'OwnerType',
    'DateAddedToDatabase',
    'DateAdded',
    'DateLastModification',
    'DateLastRaises',
    'NewestDate',
    'AvailableFrom',
    'Link',
    'Phone',
    'MainImage',
    'OtherImages',
    'NumberOfDuplicates',
    'NumberOfRaises',
    'NumberOfModifications',
    'IsDuplicatePriceLower',
    'IsDuplicatePrivateOwner',
    'Score',
    'ScorePrecision',
    'CommunityScore',
    'NumberOfCommunityComments',
    'NumberOfCommunityOpinions',
    'Archive',
    'VoivodeshipNumber',
    'CountyNumber',
    'CommunityNumber',
    'RegionNumber',
    'KindNumber',
    'SubRegionNumber',
    'StreetNumber',
    'EncryptedId',
]

# Main experiment on the cleaned frame (no class re-balancing is configured).
# NOTE(review): 'Description' no longer exists in the frame at this point, so
# its keep_features entry presumably has no effect — confirm.
unbalanced_exp = setup(
    df_prepared3,
    target='BuildingCondition',
    session_id=1123,
    verbose=False,
    date_features=['BuiltYear'],
    categorical_features=['BuildingType', 'Location'],
    ordinal_features={
        'BuildingType': ['Pozostałe', 'Blok', 'Apartametowiec', 'Kamienica'],
    },
    keep_features=['Description', 'SaleId'],
    ignore_features=_ignored_columns,
)

# Train and rank the candidate models, then keep the comparison grid (it is
# plotted by 'Model' and 'F1' further down in the notebook).
best_unbalanced_model = unbalanced_exp.compare_models()
unbalanced_metrics_df = pull()

In [None]:
unbalanced_exp.plot_model(best_unbalanced_model, plot='error')

In [None]:
unbalanced_exp.plot_model(best_unbalanced_model, plot='feature')

In [None]:
# Quick hyper-parameter search (3 iterations) on the comparison winner,
# optimising F1.
tuned_model = unbalanced_exp.tune_model(
    estimator=best_unbalanced_model,
    optimize='F1',
    n_iter=3,
)

In [None]:
# Compare the untuned winner against its tuned variant and keep the better one.
_candidates = [best_unbalanced_model, tuned_model]
best_model = unbalanced_exp.compare_models(include=_candidates)

In [None]:
unbalanced_exp.predict_model(best_model)

In [None]:
unbalanced_exp.plot_model(best_model, plot='error')

In [None]:
unbalanced_exp.plot_model(best_model, plot='feature')

In [None]:
# Second tuning pass with 5-fold CV, optimising F1; choose_better=True asks
# PyCaret to hand back the input model when tuning does not improve on it.
best_tuned_model = unbalanced_exp.tune_model(
    estimator=best_model,
    fold=5,
    optimize="F1",
    choose_better=True,
)

In [None]:
unbalanced_exp.predict_model(best_tuned_model).head(10)

In [None]:
unbalanced_exp.plot_model(best_tuned_model, plot='error')

In [None]:
unbalanced_exp.plot_model(best_tuned_model, plot='feature')

In [None]:
# Finalise the tuned model and display the resulting object.
best_final_model = unbalanced_exp.finalize_model(estimator=best_tuned_model)
best_final_model

In [None]:
# Persist the finalised model to disk; the trailing semicolon suppresses the
# cell's output.
unbalanced_exp.save_model(
    model=best_final_model,
    model_name="0_best_buildingCond_model",
    verbose=False,
);

In [None]:
unbalanced_exp.predict_model(best_final_model).head(10)

In [None]:
# Predictions of the finalised model with no explicit data argument (the
# experiment's own evaluation data is used).
# NOTE(review): the model was finalised before this call, so it has presumably
# already seen these rows — treat the scores as optimistic; confirm.
predictions = unbalanced_exp.predict_model(estimator=best_final_model)

In [None]:
# ROC/AUC plot of the finalised model.
# Called on the experiment object, like the other plotting cells: the
# module-level plot_model() works on whichever setup() ran last, and a later
# cell of this notebook runs another setup(), so re-running this cell
# afterwards would plot against the wrong experiment.
unbalanced_exp.plot_model(best_final_model, plot="auc")

In [None]:
# Confusion matrix of the finalised model.
# Called on the experiment object, like the other plotting cells: the
# module-level plot_model() works on whichever setup() ran last, and a later
# cell of this notebook runs another setup().
unbalanced_exp.plot_model(best_final_model, plot="confusion_matrix")
# Reading the plot: for every true class it shows how many samples the model
# assigned to each predicted class, e.g. 7 samples whose true class is 1 being
# predicted as 0.

## Confusion Matrix (Macierz pomyłek)

Confusion Matrix to tabela, która przedstawia liczbę poprawnych i niepoprawnych klasyfikacji dla każdej klasy. Ułatwia analizę, gdzie model popełnia błędy.

|               | Predicted Positive | Predicted Negative |
|---------------|--------------------|--------------------|
| Actual Positive | True Positive (TP)  | False Negative (FN) |
| Actual Negative | False Positive (FP) | True Negative (TN)  |
 
Analiza macierzy pomyłek pozwala zrozumieć, które klasy są mylone przez model oraz jaki jest balans między różnymi rodzajami błędów.

<span style="color:red">Idealny model miałby niezerowe wartości wyłącznie na przekątnej (TP i TN), a poza nią same zera.</span>

In [None]:
# Horizontal bar chart of the F1 score of every model in the comparison grid.
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(unbalanced_metrics_df['Model'], unbalanced_metrics_df['F1'], color='skyblue')
ax.set_xlabel('F1')
ax.set_title('Porównanie dokładności modeli wg. metryki F1')
ax.invert_yaxis()  # first row of the grid ends up at the top
ax.grid(True, linestyle='--', alpha=0.5)
fig.tight_layout()
plt.show()

In [None]:
best_final_model.get_params()

In [None]:
predicted_column = predictions['prediction_label']
print(predicted_column)

In [None]:
# Keep just the predicted label next to the true label.
_label_columns = ['prediction_label', 'BuildingCondition']
df_last = predictions.loc[:, _label_columns]

In [None]:
df_last = df_last.reset_index()  
print(df_last.columns) 

In [None]:
# The 'index' column produced by reset_index() holds df_prepared3's row
# labels, not listing ids, so renaming it to 'SaleId' as-is would make the
# later merge on 'SaleId' attach predictions to unrelated listings.
# Translate each row label to the real SaleId first.
# NOTE(review): assumes PyCaret kept the input frame's index on the
# predictions (its setup default) — confirm.
# The guard keeps a re-run of this cell harmless once the column is renamed.
if 'index' in df_last.columns:
    df_last['index'] = df_last['index'].map(df_prepared3['SaleId'])
df_last.rename(columns={'index': 'SaleId'}, inplace=True)

In [None]:
# Write the label table without the DataFrame index: after reset_index() it
# is just a 0..n-1 counter, and writing it would come back as a spurious
# 'Unnamed: 0' column when the file is read with pd.read_csv.
df_last.to_csv('sale_2024_stan.csv', index=False)

In [None]:
print(df_last.columns)

In [None]:
print(df_last.head())

In [None]:
#df_last = df_last.reset_index()  
#print(df_last.columns)  

In [None]:
#df_last.rename(columns={'index': 'SaleId'}, inplace=True)

In [None]:
saleflats_df = pd.read_csv('sale_2024_0.csv')

In [None]:
new_state_df = pd.read_csv('sale_2024_stan.csv')

In [None]:
print("saleflats_df.columns:", saleflats_df.columns.tolist())

In [None]:
print("new_state_df.columns:", new_state_df.columns.tolist())

In [None]:
# Work on a copy of the loaded predictions.
# The former rename used the key 'prediction_label ' (with a trailing space),
# which matches no column, so it never renamed anything; the following cells
# select and reorder the column under its real name 'prediction_label'.
# The no-op is made explicit here rather than "repaired", because an actual
# rename to 'NewState' would break those cells.
new_state_df2 = new_state_df.copy()

In [None]:
print("new_state_df.columns:", new_state_df.columns.tolist())

In [None]:
print(new_state_df2.columns)

In [None]:
print(saleflats_df.columns)

In [None]:
# Left-join the predicted labels onto the full listing table by SaleId;
# listings without a prediction keep a missing prediction_label.
_labels_by_sale = new_state_df2[['SaleId', 'prediction_label']]
merged_df = saleflats_df.merge(_labels_by_sale, on='SaleId', how='left')

In [None]:
# Preview the first 10 merged rows. The trailing `.all` was an uncalled bound
# method, so the cell displayed the method object rather than the rows.
merged_df.head(10)

In [None]:
# Preview the first 10 merged rows. The trailing `.all` was an uncalled bound
# method, so the cell displayed the method object rather than the rows.
merged_df.head(10)

In [None]:
prediction_df = df_prepared3.copy()

In [None]:
from pycaret.classification import predict_model

In [None]:
# Score every cleaned listing with the finalised model; the target column is
# removed first (silently skipped when it is not there).
prediction_df_clean = prediction_df.drop('BuildingCondition', axis=1, errors='ignore')
predictions = predict_model(estimator=best_final_model, data=prediction_df_clean)

In [None]:
# Put the original BuildingCondition values next to the model output.
# NOTE(review): despite its name, 'PredictedState' receives the TRUE labels
# here (the model output is in 'prediction_label') — confirm the column name.
predictions = predictions.assign(PredictedState=prediction_df['BuildingCondition'])

In [None]:
# Attach the predicted label to each listing by SaleId.
# A plain `merged_df[...] = predictions[...]` aligns on the index, but
# merged_df carries a fresh 0..n-1 index over the whole CSV, while
# `predictions` carries the row labels of the filtered modelling frame, so
# that assignment pairs labels with unrelated listings.
# predict_model() returns one row per input row in input order, so the SaleId
# column of prediction_df lines up positionally with the predictions.
_label_by_sale_id = pd.Series(
    predictions['prediction_label'].to_numpy(),
    index=prediction_df['SaleId'].to_numpy(),
)
# Series.map requires unique keys; keep the first label for a repeated SaleId.
_label_by_sale_id = _label_by_sale_id[~_label_by_sale_id.index.duplicated(keep='first')]
# Listings that were filtered out before modelling get a missing value.
merged_df['PredictedState'] = merged_df['SaleId'].map(_label_by_sale_id)

In [None]:
# Move the 'prediction_label' column so it sits directly after
# 'BuildingCondition'.
cols = list(merged_df.columns)
# Take the column out first and only then look up the anchor: the position is
# computed on the shortened list, so the insert lands right after the anchor
# even if 'prediction_label' happened to precede 'BuildingCondition'.
cols.remove('prediction_label')
new_state_index = cols.index('BuildingCondition')
cols.insert(new_state_index + 1, 'prediction_label')
# Re-select the columns in the new order.
merged_df = merged_df[cols]

In [None]:
merged_df.head(10)

In [None]:
merged_df

In [None]:
# Export the full listing table together with the predicted state.
merged_df.to_csv(path_or_buf='0_new_state_full.csv')

In [None]:
# Second experiment, built on the outlier-filtered frame, followed by a peek
# at 10 random rows of the transformed data.
# Calling setup() again rebinds `exp` and the experiment used by PyCaret's
# module-level functions.
# NOTE(review): 'Description' no longer exists in the frame at this point, so
# its keep_features entry presumably has no effect — confirm.
exp = setup(
    df_prepared2,
    target='BuildingCondition',
    session_id=1123,
    verbose=False,
    date_features=['BuiltYear'],
    categorical_features=['BuildingType', 'Location'],
    ordinal_features={
        'BuildingType': ['Pozostałe', 'Blok', 'Apartametowiec', 'Kamienica'],
    },
    keep_features=['Description', 'SaleId'],
)
exp.dataset_transformed.sample(10)

In [None]:
prediction_df = df_prepared2.copy()

In [None]:
# Score the outlier-filtered listings with the finalised model; the target
# column is removed first (silently skipped when it is not there).
prediction_df_clean = prediction_df.drop('BuildingCondition', axis=1, errors='ignore')
predictions = predict_model(estimator=best_final_model, data=prediction_df_clean)

In [None]:
predictions