# PROJEKT MODELU REGRESYJNEGO PRZEWIDYWANIA CEN MIESZKAŃ NA PODSTAWIE #

# ALGORYTMU LGBM #

  ## KWIECIEŃ 2025 ##

In [None]:
import pandas as pd
import mlflow
from pycaret.datasets import get_data
from pycaret.regression import setup, pull, compare_models, plot_model, load_model
import pymysql
from sqlalchemy import create_engine
import numpy as np
from scipy.stats import skewnorm
import matplotlib.pyplot as plt
import seaborn as sns
from joblib import parallel_backend

In [None]:
# Run this cell if you load the data from a MySQL database (e.g. the one you browse with DBeaver)
# username = 'root'
# password = '1234'
# host = '127.0.0.1'
# port = 3306  
# database = 'projekt1'
# engine = create_engine(f'mysql+pymysql://{username}:{password}@{host}:{port}/{database}')

# df = pd.read_sql("SELECT * FROM saleflats", con=engine)

# engine.dispose()

In [None]:
# Name of the MLflow experiment; it is passed to PyCaret's setup() as experiment_name.
MLFLOW_EXPERIMENT_NAME = 'Investoro_Ceny'
# Descriptive tags. NOTE(review): not referenced anywhere else in the visible notebook.
MLFLOW_TAGS = {'data': 'Investoro_ceny', 'library': 'pycaret'}

# Point the MLflow client at a tracking server on localhost:5000.
mlflow.set_tracking_uri("http://localhost:5000")

In [None]:
# Run this cell if you read the data from a .csv file.
df = pd.read_csv('sale_2024_0.csv', sep=',')

In [None]:
# Show the loaded frame.
df

In [None]:
# First 10 rows.
df.head(10)

In [None]:
# 10 randomly chosen rows.
df.sample(10)

In [None]:
# Print column dtypes and non-null counts. info is a method and has to be
# called; the bare attribute only echoes the bound method's repr.
df.info()

In [None]:
# Rows that are exact duplicates of an earlier row.
df[df.duplicated()]

In [None]:
# Number of distinct values per column.
df.nunique()

In [None]:
correlation_matrix = df[['Area', 'Price', 'BuiltYear', 'Floor', 'Floors', 'CommunityScore', 'CountyNumber', 'CommunityNumber',
                               'RegionNumber','KindNumber']].corr()

In [None]:
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Price')

In [None]:
# Boolean mask of missing values, cell by cell.
df.isnull()

In [None]:
# Number of missing values per column.
df.isnull().sum()

In [None]:
# Keep only the rows that have an 'Area' value.
# NOTE(review): the variable name suggests a filter on price, but the subset is 'Area' - confirm intent.
df_beznull_price = df.dropna(subset=['Area'])

In [None]:
# Missing values per column after the filter above.
df_beznull_price.isnull().sum()

In [None]:
# df_beznull_price2 = df_beznull_price.sort_values('Area').interpolate()

In [None]:
df_beznull_price.head(20)

In [None]:
def _drop_iqr_outliers(frame, column, k=1.5):
    """Return a copy of *frame* without the IQR outliers of *column*.

    A row is an outlier when its value is below Q1 - k*IQR or above
    Q3 + k*IQR, with the quartiles computed on *frame* itself.  Rows whose
    value is NaN are kept, because both comparisons are False for NaN.
    The result is an independent copy, so assigning columns on it later does
    not raise SettingWithCopyWarning.
    """
    q1 = frame[column].quantile(0.25)
    q3 = frame[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - k * iqr
    upper_bound = q3 + k * iqr
    is_outlier = (frame[column] < lower_bound) | (frame[column] > upper_bound)
    return frame[~is_outlier].copy()


# The filters are applied one after another; each one recomputes its fences
# on the rows that survived the previous filter.
df_price_p = _drop_iqr_outliers(df_beznull_price, "Price")
df_price_a = _drop_iqr_outliers(df_price_p, "PricePerSquareMeter")
df_price1 = _drop_iqr_outliers(df_price_a, "Area")

In [None]:
# Parse BuiltYear with the '%Y' format; values that do not parse become NaT.
# assign() builds a new frame instead of writing a column into a filtered
# slice, which is what triggers pandas' SettingWithCopyWarning.
# NOTE(review): if the column is stored as float (e.g. 1990.0) the '%Y' format
# may not match and every value would be coerced to NaT - verify with
# df_price1['BuiltYear'].notna().sum() after running this cell.
df_price1 = df_price1.assign(
    BuiltYear=pd.to_datetime(df_price1['BuiltYear'], format='%Y', errors='coerce')
)

In [None]:
# First 20 rows after the outlier filters.
df_price1.head(20)

In [None]:
# Summary statistics, one row per column.
df_price1.describe().T

In [None]:
# Relative frequency of each distinct price.
df_price1["Price"].value_counts(normalize=True)

In [None]:
#df_price2 = df_price1.sort_values('Price').interpolate()

In [None]:
# Number of distinct values per column.
df_price1.nunique()

In [None]:
df_price1.head(10)

In [None]:
# Distinct building types present in the data.
unique_btype=df_price1['BuildingType'].unique()

unique_btype

In [None]:
# Number of distinct prices.
print(df_price1['Price'].nunique())

In [None]:
# Absolute frequency of each distinct price.
print(df_price1['Price'].value_counts())

In [None]:
# Missing values per column.
df_price1.isnull().sum()

In [None]:
# Drop rows without a price; 'Price' is the target passed to setup() later.
df_price2 = df_price1.dropna(subset=['Price'])

In [None]:
# Missing values per column after dropping rows without a price.
df_price2.isnull().sum()

# Sprawdzenie braków - procentowo.

In [None]:
# Percentage of missing values per column of df_price2.  The mean of the
# boolean mask divides by the number of rows of df_price2 itself, not by the
# length of the unfiltered frame.
df_price2.isna().mean() * 100

In [None]:
# The thresh parameter sets the minimum number of non-NaN values a row/column must have to be kept.
display(df_price2.head())
# Columns with fewer non-NaN values than 40 % of the row count are dropped here;
# the result is only displayed, df_price2 itself is not changed.
df_price2.dropna(thresh=(0.4*len(df_price2)),axis=1).head()

In [None]:
# Missing values per column (unchanged by the previous cell).
df_price2.isnull().sum()

In [None]:
# Price against area.
df_price2.plot.scatter(x='Area', y='Price');

In [None]:
# Price against construction year.
df_price2.plot.scatter(x='BuiltYear', y='Price');

In [None]:
# Correlation matrix of the same attributes as before, now on the cleaned frame.
# NOTE(review): BuiltYear was converted with pd.to_datetime earlier in the notebook -
# confirm that .corr() treats the datetime column as intended.
correlation_matrix = df_price2[['Area', 'Price', 'BuiltYear', 'Floor', 'Floors', 'CommunityScore', 'CountyNumber', 'CommunityNumber',
                               'RegionNumber','KindNumber']].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Price')

In [None]:
df_price2 = df_price2.dropna(subset=['Location'])

In [None]:
sale_ids = df_price2['SaleId'].reset_index(drop=True)

In [None]:
train_df = df_price2.sample(frac=0.9, random_state=42)
holdout_df = df_price2.drop(train_df.index)

In [None]:
df_price2['VoivodeshipNumber'] = df_price2['VoivodeshipNumber'].astype(str)
df_price2['CountyNumber'] = df_price2['CountyNumber'].astype(str)
df_price2['CommunityNumber'] = df_price2['CommunityNumber'].astype(str)
df_price2['KindNumber'] = df_price2['KindNumber'].astype(str)
df_price2['RegionNumber'] = df_price2['RegionNumber'].astype(str)
df_price2['BuiltYear'] = pd.to_datetime(df_price2['BuiltYear'], format='%Y', errors='coerce')
if 'StreetNumber' in df_price2.columns:
    df_price2['StreetNumber'] = df_price2['StreetNumber'].astype(str)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from pycaret.regression import setup, compare_models, pull

# --- MANUAL PROCESSING OF THE 'Location' COLUMN ---
# The whole if/else lives in ONE cell: an indented continuation in a separate
# cell cannot be executed on its own.
# Result: df_price2_processed = df_price2 without 'Location', plus one
# 'loc_tfidf_<term>' column per TF-IDF term.  df_price2 itself is not modified.
# NOTE(review): df_price2_processed is not consumed by any later cell visible
# in this notebook (setup() is given df_price2) - confirm whether it should be.
if 'Location' in df_price2.columns:
    print("Przetwarzanie ręczne kolumny 'Location'...")

    # 1. Configure the vectorizer.
    location_vectorizer = TfidfVectorizer(
        max_features=1000,   # cap on the number of generated features
        stop_words=None,     # consider Polish stop words, e.g. ['i', 'oraz', 'ul']
        ngram_range=(1, 1),  # unigrams only; (1, 2) would add bigrams
        min_df=5,            # ignore terms found in fewer than 5 rows
        max_df=0.95,         # ignore terms found in more than 95 % of rows
    )

    # 2. Fit and transform.  NaN becomes '' and everything is cast to str so
    #    the vectorizer only ever sees strings.  The cleaned text is kept in a
    #    local Series instead of a helper column on df_price2.
    location_text = df_price2['Location'].fillna('').astype(str)
    location_tfidf_features = location_vectorizer.fit_transform(location_text)

    # 3. Convert the sparse result to a DataFrame.
    try:
        # Available in newer scikit-learn versions.
        feature_names = location_vectorizer.get_feature_names_out()
    except AttributeError:
        # Fallback for older scikit-learn versions.
        feature_names = location_vectorizer.get_feature_names()

    location_tfidf_df = pd.DataFrame(
        location_tfidf_features.toarray(),  # dense matrix
        columns=['loc_tfidf_' + name for name in feature_names],
        index=df_price2.index,  # keep the original index so concat aligns rows
    )
    print(f"Utworzono {location_tfidf_df.shape[1]} cech TF-IDF z 'Location'.")

    # 4. Replace the raw text column with the TF-IDF features.
    df_price2_processed = pd.concat(
        [df_price2.drop(columns=['Location']), location_tfidf_df],
        axis=1,
    )
else:
    print("Kolumna 'Location' nie znaleziona. Pomijanie ręcznego przetwarzania.")
    df_price2_processed = df_price2.copy()

In [None]:
# Candidate feature lists, grouped by the setup() argument they are passed to.
categorical_features_initial = [
    'BuildingType', 'BuildingCondition', 'TypeOfMarket', 'OwnerType', 'Type', 'OfferFrom',
    'VoivodeshipNumber', 'CountyNumber', 'CommunityNumber', 'KindNumber',
    'RegionNumber',
]
numeric_features_initial = [
    'Area', 'NumberOfRooms', 'Floor', 'Floors', 'CommunityScore',
]
date_features_initial = ['BuiltYear']

# Fit only on the training part of the split: holdout_df is scored later as
# unseen data, so its rows must not be part of the frame given to setup().
# The rows are selected from df_price2 by index so that they carry whatever
# dtype casts have been applied to df_price2.
train_data = df_price2.loc[train_df.index]

# Keep only the features that actually exist in the training frame.
categorical_features_to_use = [col for col in categorical_features_initial if col in train_data.columns]
numeric_features_to_use = [col for col in numeric_features_initial if col in train_data.columns]
date_features_to_use = [col for col in date_features_initial if col in train_data.columns]

# Columns excluded from modelling.
ignore_features_list = [
    'SaleId', 'OriginalId', 'PortalId', 'Title', 'Description',
    'OfferPrice', 'RealPriceAfterRenovation', 'OriginalPrice',
    'PricePerSquareMeter', 'DateAddedToDatabase', 'DateAdded',
    'DateLastModification', 'DateLastRaises', 'NewestDate',
    'AvailableFrom', 'Link', 'Phone', 'MainImage', 'OtherImages',
    'NumberOfDuplicates', 'NumberOfRaises', 'NumberOfModifications',
    'IsDuplicatePriceLower', 'IsDuplicatePrivateOwner', 'Score', 'ScorePrecision',
    'NumberOfCommunityComments', 'NumberOfCommunityOpinions', 'Archive',
    'SubRegionNumber', 'EncryptedId', "Location",
    'StreetNumber',  # ignored because it may have too many unique values
]
# Keep only the ignored columns that actually exist in the training frame.
ignore_features_list = [col for col in ignore_features_list if col in train_data.columns]

transformed_exp = setup(
    data=train_data,
    target='Price',
    verbose=True,  # set to True to debug setup
    session_id=123,
    log_experiment=True,
    experiment_name=MLFLOW_EXPERIMENT_NAME,
    log_data=True,
    log_plots=True,
    categorical_features=categorical_features_to_use,
    numeric_features=numeric_features_to_use,
    date_features=date_features_to_use,
    text_features=[],
#    text_processor=custom_text_vectorizer,
#    text_features_method=custom_text_vectorizer,
    ignore_features=ignore_features_list,
    ordinal_features={'BuildingType': ['Pozostałe', 'Blok', 'Apartamentowiec', 'Kamienica'],
                      'BuildingCondition': ['For_Renovation', 'Good', 'After_Renovation', 'Developer_State']},
    # In older PyCaret versions high-cardinality handling is often the default, or one can try:
    # high_cardinality_method = 'frequency' # if 'Location' has many categories
    # or keep PyCaret's default behaviour for categorical features
)
# Train and rank the candidate models; pull() fetches the resulting comparison grid.
best_transformed_model = transformed_exp.compare_models()
transformed_metrics_df = pull()

In [None]:
# Column dtypes and non-null counts of the transformed training features.
transformed_exp.X_train_transformed.info()

In [None]:
transformed_exp.X_train_transformed.head()

In [None]:
from pycaret.regression import get_config
import matplotlib.pyplot as plt
import seaborn as sns

# Take the original frame and the training split held by the experiment
# NOTE(review): "X_train"/"y_train" look like the untransformed training split
# (the transformed one is read via X_train_transformed above) - confirm that the
# "Transformed Data" title below is intended.
df_raw = df_price2.copy()
df_transformed = get_config("X_train").copy()
df_transformed["Price"] = get_config("y_train")

# Draw the two price histograms side by side
fig, axes = plt.subplots(1, 2, figsize=(12, 6))

sns.histplot(df_raw["Price"], ax=axes[0])
axes[0].set_title("Raw Data")

sns.histplot(df_transformed["Price"], ax=axes[1])
axes[1].set_title("Transformed Data")

plt.tight_layout()
plt.show()

In [None]:
# Data set as stored by the experiment.
transformed_exp.dataset.head()

In [None]:
# The same data set after the preprocessing pipeline.
transformed_exp.dataset_transformed.head()

In [None]:
# Price against area for the frame built from get_config above.
df_transformed.plot.scatter(x='Area', y='Price');

In [None]:
# Price against construction year for the same frame.
df_transformed.plot.scatter(x='BuiltYear', y='Price');

In [None]:
# Horizontal bar chart of the R2 column of the model-comparison grid.
plt.figure(figsize=(12, 6))
r2_axes = plt.gca()
r2_axes.barh(transformed_metrics_df['Model'], transformed_metrics_df['R2'], color='skyblue')
r2_axes.set_xlabel('R2')
r2_axes.set_title('Porównanie dokładności modeli')
r2_axes.invert_yaxis()  # first row of the grid at the top
r2_axes.grid(True, linestyle='--', alpha=0.5)
plt.tight_layout()
plt.show()

In [None]:
# Prediction-error plot for the best model returned by compare_models().
transformed_exp.plot_model(best_transformed_model, plot='error')

In [None]:
# Feature-importance plot for the same model.
transformed_exp.plot_model(best_transformed_model, plot='feature')

In [None]:
# Hyper-parameter search optimising R2, limited to 3 iterations.
# NOTE(review): choose_better=True presumably keeps the untuned model when
# tuning does not improve it - confirm against the PyCaret documentation.
tuned_model = transformed_exp.tune_model(best_transformed_model, n_iter=3, optimize='R2',choose_better=True)

In [None]:
# RF has no tuned_model parameter - skip this cell when the best model is rf
#tuned_best_models = [best_transformed_model.tune_model(m, optimize='R2') for m in tuned_model]

In [None]:
# skip for rf
#best_model = transformed_exp.compare_models([tuned_best_models, tuned_model],choose_better=True)

In [None]:
#best_model = transformed_exp.compare_models(tuned_model, sort='R2')

In [None]:
# Score the tuned model; no data= argument is passed.
transformed_exp.predict_model(tuned_model)

In [None]:
# Prediction-error plot for the tuned model.
transformed_exp.plot_model(tuned_model, plot='error')

In [None]:
# Feature-importance plot for the tuned model.
transformed_exp.plot_model(tuned_model, plot='feature')

In [None]:
# First 10 rows of the tuned model's predictions.
transformed_exp.predict_model(tuned_model).head(10)

In [None]:
# Finalize the tuned model and tag the MLflow run with step=final.
best_final_model = transformed_exp.finalize_model(tuned_model, experiment_custom_tags={"step": "final"})
best_final_model

In [None]:
# Persist the finalized model under the name '0_full-basic-model'.
transformed_exp.save_model(best_final_model, '0_full-basic-model')

In [None]:
# Score the finalized model on holdout_df; the returned frame is discarded and
# only the metrics grid fetched by pull() is kept.
transformed_exp.predict_model(best_final_model, data=holdout_df)
predict_holdout_df = transformed_exp.pull()

In [None]:
# Attach the hold-out metrics to the most recently started run of the experiment.
mlflow_exp = mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
runs_df = mlflow.search_runs([mlflow_exp.experiment_id], order_by=["start_time DESC"])
run_id = runs_df['run_id'].values[0]
# First (only) row of the metrics grid as a {metric name: value} dict, without the 'Model' column.
metrics = predict_holdout_df.drop(columns=["Model"]).to_dict(orient='records')[0]

with mlflow.start_run(run_id=run_id, nested=True):
    for key, value in metrics.items():
        mlflow.log_metric(key, value)

In [None]:
#predictions = transformed_exp.predict_model(best_final_model)
# NOTE(review): no data= argument is passed here, so despite the variable name this
# is presumably scored on PyCaret's internal test split rather than holdout_df - confirm.
holdout_predictions = transformed_exp.predict_model(best_final_model) # keep the prediction frame
display(holdout_predictions.head()) # show the first few predictions

In [None]:
#predicted_column = predict_holdout_df['prediction_label']
#print(predicted_column)
holdout_metrics_df = transformed_exp.pull() # metrics of the prediction made in the previous cell
display(holdout_metrics_df)
predicted_column = holdout_predictions['prediction_label'] # the prediction column on its own
print(predicted_column)

In [None]:
print(holdout_predictions[['prediction_label']].applymap('{:,.0f}'.format))

In [None]:
df_last=holdout_predictions[['prediction_label','Price', ]].applymap('{:,.0f}'.format)

In [None]:
df_last.to_csv('0_new_prices.csv')

In [None]:
print(df_last.columns)

In [None]:
print(df_last.head())

In [None]:
# Turn the index into a regular column named 'index'.
df_last = df_last.reset_index()  
print(df_last.columns)  

In [None]:
# NOTE(review): the 'index' column holds the frame's index labels; it is only
# labelled 'SaleId' here - confirm that the labels really equal SaleId values.
df_last.rename(columns={'index': 'SaleId'}, inplace=True)

In [None]:
# Re-read the original listing file.
saleflats_df = pd.read_csv('sale_2024_0.csv')

In [None]:
# Re-read the formatted predictions written earlier.
new_prices_df = pd.read_csv('0_new_prices.csv')

In [None]:
print("saleflats_df.columns:", saleflats_df.columns.tolist())

In [None]:
print("new_prices_df.columns:", new_prices_df.columns.tolist())

In [None]:
# 'Unnamed: 0' is the index column that to_csv wrote into '0_new_prices.csv'.
# NOTE(review): it is renamed to 'SaleId' and used as the join key below - verify that
# the index labels match SaleId. Also note that 'NewPrice' is built from the formatted
# 'Price' column, not from 'prediction_label' - confirm intent.
new_prices_df = new_prices_df.rename(columns={'Unnamed: 0': 'SaleId', 'Price': 'NewPrice'})

In [None]:
print("new_prices_df.columns:", new_prices_df.columns.tolist())

In [None]:
print(new_prices_df.columns)

In [None]:
print(saleflats_df.columns)

In [None]:
# Left join: every listing is kept; listings without a match get NaN in 'NewPrice'.
merged_df = pd.merge(
    saleflats_df,
    new_prices_df[['SaleId', 'NewPrice']],
    left_on='SaleId',  # column in saleflats_df
    right_on='SaleId',  # column in new_prices_df
    how='left'
)

In [None]:
merged_df

In [None]:
# Step 2: drop the extra key column (optional, 'SaleId' is already present)
#merged_df.drop(columns=['SaleID'], inplace=True)

# Step 3: move the 'NewPrice' column next to 'Price'
cols = merged_df.columns.tolist()

# find the position of the 'Price' column
price_index = cols.index('Price')

# remove NewPrice from the list and re-insert it right after Price
cols.remove('NewPrice')
cols.insert(price_index + 1, 'NewPrice')

# apply the new column order
merged_df = merged_df[cols]

In [None]:
merged_df

In [None]:
# Rows that are exact duplicates of an earlier row.
merged_df[merged_df.duplicated()]

In [None]:
# Independent copy used as input for the prediction cells that follow.
prediction_df = merged_df.copy()

In [None]:
from pycaret.regression import predict_model

In [None]:
prediction_df['BuiltYear'] = pd.to_datetime(prediction_df['BuiltYear'], format='%Y', errors='coerce')

In [None]:
prediction_df_clean = prediction_df.drop(columns=['Price'], errors='ignore')
predictions = predict_model(best_final_model, data=prediction_df_clean)

In [None]:
predictions['RealPrice'] = prediction_df['Price']

In [None]:
merged_df['PredictedPrice'] = predictions['prediction_label']

In [None]:
predictions=predictions[['prediction_label',]].applymap('{:,.0f}'.format)

In [None]:
# Move the 'PredictedPrice' column right after 'NewPrice'
cols = list(merged_df.columns)
new_price_index = cols.index('NewPrice')
# Remove it from the column list
cols.remove('PredictedPrice')
# Re-insert it at the target position
cols.insert(new_price_index + 1, 'PredictedPrice')
# Apply the new column order
merged_df = merged_df[cols]

In [None]:
merged_df.to_csv('0_new_prices_full.csv')

In [None]:
merged_df = pd.read_csv('0_new_prices_full.csv')

In [None]:
merged_df

In [None]:
merged_df2=merged_df[['PredictedPrice',]].applymap('{:,.0f}'.format)

In [None]:
merged_df2

In [None]:
merged_df['PredictedPrice'] = merged_df2['PredictedPrice']

In [None]:
merged_df.drop(columns=['NewPrice'], inplace=True)

In [None]:
merged_df

In [None]:
merged_df.to_csv('full_uzup_mieszkania_ceny.csv')

In [None]:
merged_df.head(20)
