In [69]:
import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.preprocessing import LabelEncoder
import os, warnings
warnings.simplefilter("ignore") # Change the filter in this process
os.environ["PYTHONWARNINGS"] = "ignore"

In [70]:
path = "/home/pydev/Music/work_files/latest_broko_code"
df = pd.read_csv(path  + "/Dataset/ML_CLEAN_DATA__Bdv2.4_RES.csv")
df.info(verbose=True,show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181512 entries, 0 to 181511
Data columns (total 25 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   ML_Number       181490 non-null  object 
 1   Area            181490 non-null  object 
 2   Municipality    178872 non-null  object 
 3   Community       178872 non-null  object 
 4   Postal_Code     181359 non-null  object 
 5   Postal_Short    181359 non-null  object 
 6   Style           181490 non-null  object 
 7   Type            181490 non-null  object 
 8   Cluster         181490 non-null  object 
 9   List_Price      181485 non-null  float64
 10  Cluster_Price   181490 non-null  float64
 11  Taxes           181488 non-null  float64
 12  Cluster_Tax     181490 non-null  float64
 13  Bedrooms        181490 non-null  float64
 14  Washrooms       181490 non-null  float64
 15  Basement1       181490 non-null  object 
 16  Days_On_Market  181490 non-null  float64
 17  Exterior1 

In [71]:
# drop_cols = ["ML_Number","Postal_Code","Sold_Price","Month_Year","HPI_for_Month"]
drop_cols = ["ML_Number","Postal_Code","Sold_Price","Month_Year","HPI_for_Month","Community","Municipality","Area","Cluster","Cluster_Price","Cluster_Tax","List_Price","Days_On_Market"]
df = df.drop(drop_cols, axis=1)
df.isnull().sum()

Postal_Short        153
Style                22
Type                 22
Taxes                24
Bedrooms             22
Washrooms            22
Basement1            22
Exterior1          1042
Garage_Type       27649
lat                  37
lng                  37
HPI_Sold_Price       22
dtype: int64

In [72]:
data = df.dropna(axis = 0, how ='any') 

In [73]:
q1 = data['HPI_Sold_Price'].quantile(0.25)
q3 = data['HPI_Sold_Price'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
lower_bound_outliers = data[data['HPI_Sold_Price'] < lower_bound] 
upper_bound_outliers = data[data['HPI_Sold_Price'] > upper_bound]
# lower_bound_outliers.to_csv(path + '/Dataset/lower_bound_outliers.csv')
# upper_bound_outliers.to_csv(path +'/Dataset/upper_bound_outliers.csv')    
dataset = data[(data['HPI_Sold_Price'] >= lower_bound) & (data['HPI_Sold_Price'] <= upper_bound)]

In [74]:
dataset.isnull().sum()

Postal_Short      0
Style             0
Type              0
Taxes             0
Bedrooms          0
Washrooms         0
Basement1         0
Exterior1         0
Garage_Type       0
lat               0
lng               0
HPI_Sold_Price    0
dtype: int64

In [75]:
dataset.info(verbose=True,show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 144696 entries, 1 to 181488
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Postal_Short    144696 non-null  object 
 1   Style           144696 non-null  object 
 2   Type            144696 non-null  object 
 3   Taxes           144696 non-null  float64
 4   Bedrooms        144696 non-null  float64
 5   Washrooms       144696 non-null  float64
 6   Basement1       144696 non-null  object 
 7   Exterior1       144696 non-null  object 
 8   Garage_Type     144696 non-null  object 
 9   lat             144696 non-null  float64
 10  lng             144696 non-null  float64
 11  HPI_Sold_Price  144696 non-null  float64
dtypes: float64(6), object(6)
memory usage: 14.4+ MB


In [76]:
X = dataset.drop('HPI_Sold_Price',axis=1)
y = dataset['HPI_Sold_Price']

In [77]:
# categorical features
categorical_features = [column for column, dtype in X.dtypes.items() if dtype==object]


In [78]:
# Define the XGBoost model
def objective(trial):
    params = {
        # "max_depth": trial.suggest_int("max_depth", 5, 30),
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.5), # 0.01,0.5
        "n_estimators": trial.suggest_int("n_estimators", 5, 50),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 1),
        "booster":'gblinear'
        
    }

    model = XGBRegressor(**params)

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Encode categorical features before training
    label_encoders = {}
    for feature in categorical_features:
        le = LabelEncoder()
        X_train[feature] = le.fit_transform(X_train[feature])
        X_valid[feature] = le.fit_transform(X_valid[feature])
        label_encoders[feature] = le


    model.fit(
        X_train,y_train,
        eval_set=[(X_valid,y_valid)],
        early_stopping_rounds=10,
        verbose=False
    )

    y_pred = model.predict(X_valid)

    return MAPE(y_valid, y_pred)


In [79]:
# Create an Optuna study
study = optuna.create_study(direction="minimize")

# Optimize the objective function
study.optimize(objective, n_trials=1000,show_progress_bar=True)

# Get the best parameters and model type
best_params = study.best_params

print("Best hyperparameters: ", best_params)

[I 2023-09-22 17:23:50,681] A new study created in memory with name: no-name-2fe5c3e9-76e4-4a24-a5a9-bf3a0e232d73


  0%|          | 0/1000 [00:00<?, ?it/s]

[I 2023-09-22 17:23:51,452] Trial 0 finished with value: 0.308618119317928 and parameters: {'learning_rate': 0.09591425751338917, 'n_estimators': 34, 'reg_lambda': 0.043037684319725544, 'reg_alpha': 0.7219904801548741}. Best is trial 0 with value: 0.308618119317928.
[I 2023-09-22 17:23:52,192] Trial 1 finished with value: 0.3077892715782986 and parameters: {'learning_rate': 0.09284824824989954, 'n_estimators': 43, 'reg_lambda': 0.288450608684787, 'reg_alpha': 0.36539451736602047}. Best is trial 1 with value: 0.3077892715782986.
[I 2023-09-22 17:23:52,733] Trial 2 finished with value: 0.31289339528779475 and parameters: {'learning_rate': 0.20092746838299527, 'n_estimators': 13, 'reg_lambda': 0.6088156249742509, 'reg_alpha': 0.15549601152167403}. Best is trial 1 with value: 0.3077892715782986.
[I 2023-09-22 17:23:53,140] Trial 3 finished with value: 0.313308478741639 and parameters: {'learning_rate': 0.14470704296721365, 'n_estimators': 23, 'reg_lambda': 0.47690943647871353, 'reg_alpha':

In [None]:
best_df = pd.DataFrame([best_params])
best_df.to_csv(path + "/Dataset/xgb_param_interpret.csv",index=False)