In [39]:
import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from catboost import CatBoostRegressor, Pool
from lightgbm import LGBMRegressor, Dataset
from sklearn.metrics import mean_absolute_percentage_error as MAPE
from sklearn.preprocessing import LabelEncoder

In [48]:
path = "/Users/thrilok/Desktop/mantra_collab_job/work_files/latest_broko_code"
df = pd.read_csv(path  + "/Dataset/ML_CLEAN_DATA__Bdv2.3_RES.csv")
df.info(verbose=True,show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180356 entries, 0 to 180355
Data columns (total 22 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   ML_Number       180356 non-null  object 
 1   Postal_Code     180332 non-null  object 
 2   Postal_Short    180332 non-null  object 
 3   Style           180356 non-null  object 
 4   Type            180356 non-null  object 
 5   Cluster         180356 non-null  object 
 6   List_Price      180356 non-null  float64
 7   Cluster_Price   180356 non-null  float64
 8   Taxes           180356 non-null  float64
 9   Cluster_Tax     180356 non-null  float64
 10  Bedrooms        180356 non-null  int64  
 11  Washrooms       180356 non-null  int64  
 12  Basement1       180356 non-null  object 
 13  Days_On_Market  180356 non-null  int64  
 14  Exterior1       180356 non-null  object 
 15  Garage_Type     152749 non-null  object 
 16  lat             180341 non-null  float64
 17  lng       

In [49]:
drop_cols = ["ML_Number","Postal_Code","Sold_Price","Month_Year","HPI_for_Month"]
data = df.drop(drop_cols, axis=1)
data.isnull().sum()

Postal_Short         24
Style                 0
Type                  0
Cluster               0
List_Price            0
Cluster_Price         0
Taxes                 0
Cluster_Tax           0
Bedrooms              0
Washrooms             0
Basement1             0
Days_On_Market        0
Exterior1             0
Garage_Type       27607
lat                  15
lng                  15
HPI_Sold_Price        0
dtype: int64

In [50]:
data = data.dropna(axis = 0, how ='any') 

In [51]:
data.isnull().sum()

Postal_Short      0
Style             0
Type              0
Cluster           0
List_Price        0
Cluster_Price     0
Taxes             0
Cluster_Tax       0
Bedrooms          0
Washrooms         0
Basement1         0
Days_On_Market    0
Exterior1         0
Garage_Type       0
lat               0
lng               0
HPI_Sold_Price    0
dtype: int64

In [52]:
q1 = data['HPI_Sold_Price'].quantile(0.25)
q3 = data['HPI_Sold_Price'].quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
lower_bound_outliers = data[data['HPI_Sold_Price'] < lower_bound] 
upper_bound_outliers = data[data['HPI_Sold_Price'] > upper_bound]
# lower_bound_outliers.to_csv(path + '/Dataset/lower_bound_outliers.csv')
# upper_bound_outliers.to_csv(path +'/Dataset/upper_bound_outliers.csv')    
dataset = data[(data['HPI_Sold_Price'] >= lower_bound) & (data['HPI_Sold_Price'] <= upper_bound)]

In [53]:
dataset.info(verbose=True,show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 144691 entries, 0 to 180354
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Postal_Short    144691 non-null  object 
 1   Style           144691 non-null  object 
 2   Type            144691 non-null  object 
 3   Cluster         144691 non-null  object 
 4   List_Price      144691 non-null  float64
 5   Cluster_Price   144691 non-null  float64
 6   Taxes           144691 non-null  float64
 7   Cluster_Tax     144691 non-null  float64
 8   Bedrooms        144691 non-null  int64  
 9   Washrooms       144691 non-null  int64  
 10  Basement1       144691 non-null  object 
 11  Days_On_Market  144691 non-null  int64  
 12  Exterior1       144691 non-null  object 
 13  Garage_Type     144691 non-null  object 
 14  lat             144691 non-null  float64
 15  lng             144691 non-null  float64
 16  HPI_Sold_Price  144691 non-null  float64
dtypes: float64(7), 

In [54]:
X = dataset.drop('HPI_Sold_Price',axis=1)
y = dataset['HPI_Sold_Price']

In [55]:
# categorical features
categorical_features = [column for column, dtype in X.dtypes.items() if dtype==object]


In [56]:
# label_encoders = {}
# for feature in categorical_features:
#     le = LabelEncoder()
#     X_train[feature] = le.fit_transform(X_train[feature])
#     X_valid[feature] = le.fit_transform(X_valid[feature])
#     label_encoders[feature] = le

In [57]:
# X_valid

In [58]:
# Define the XGBoost model
def xgb_model(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 5, 30),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5),
        "n_estimators": trial.suggest_int("n_estimators", 5, 50),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 1),
    }

    model = XGBRegressor(**params)

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Encode categorical features before training
    label_encoders = {}
    for feature in categorical_features:
        le = LabelEncoder()
        X_train[feature] = le.fit_transform(X_train[feature])
        X_valid[feature] = le.fit_transform(X_valid[feature])
        label_encoders[feature] = le

    # Create DMatrix with categorical feature support
    # train_dmatrix = DMatrix(
    #     X_train,y_train
    # )
    # valid_dmatrix = DMatrix(
    #     X_valid,y_valid
    # )

    model.fit(
        X_train,y_train,
        eval_set=[(X_valid,y_valid)],
        early_stopping_rounds=10,
        verbose=False
    )

    y_pred = model.predict(X_valid)

    return MAPE(y_valid, y_pred)


In [59]:
# Define the CatBoost model

def catboost_model(trial):
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 5, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "max_ctr_complexity": trial.suggest_int("max_ctr_complexity", 2, 8),
    }

    model = CatBoostRegressor(**params)
    
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    cat_train_pool = Pool(X_train, y_train, cat_features=categorical_features)
    cat_val_pool = Pool(X_valid, y_valid, cat_features=categorical_features)

    model.fit(cat_train_pool, eval_set=cat_val_pool, early_stopping_rounds=10,verbose=False)
    
    y_pred = model.predict(cat_val_pool)
    
    return MAPE(y_valid, y_pred)

In [60]:
# Define the LightGBM model
def lightgbm_model(trial):
    params = {
        "max_depth": trial.suggest_int("max_depth", 5, 30),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5),
        "n_estimators": trial.suggest_int("n_estimators", 5, 50),
        "reg_lambda": trial.suggest_float("reg_lambda", 0, 1),
        "reg_alpha": trial.suggest_float("reg_alpha", 0, 1),
    }

    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Encode categorical features before training
    label_encoders = {}
    for feature in categorical_features:
        le = LabelEncoder()
        X_train[feature] = le.fit_transform(X_train[feature])
        X_valid[feature] = le.fit_transform(X_valid[feature])
        label_encoders[feature] = le

    train_data = Dataset(X_train, label=y_train)
    valid_data = Dataset(X_valid, label=y_valid)

    model = LGBMRegressor(**params)

    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        categorical_feature=categorical_features,
        # early_stopping=10,  # Early stopping rounds
        verbose=False) # Set verbose to False to suppress LightGBM's messages
    

    y_pred = model.predict(X_valid)

    return MAPE(y_valid, y_pred)


In [61]:
# Define the Optuna objective function
def objective(trial):
    model_type = trial.suggest_categorical("model_type", ["xgboost", "catboost", "lightgbm"])
    
    if model_type == "xgboost":
        return xgb_model(trial)
    elif model_type == "catboost":
        return catboost_model(trial)
    else:
        return lightgbm_model(trial)

In [62]:
# Create an Optuna study
study = optuna.create_study(direction="minimize")

# Optimize the objective function
study.optimize(objective, n_trials=100,show_progress_bar=True)

# Get the best parameters and model type
best_params = study.best_params
best_model_type = best_params["model_type"]

# Print the best model and its parameters
print("Best model type: ", best_model_type)
print("Best hyperparameters: ", best_params)

[I 2023-09-17 13:49:11,297] A new study created in memory with name: no-name-b55083c5-1147-4831-bdf7-f031ddd1704f


  0%|          | 0/100 [00:00<?, ?it/s]

0:	learn: 370031.8425194	test: 371483.0080823	best: 371483.0080823 (0)	total: 125ms	remaining: 2m 5s
1:	learn: 342493.5453598	test: 343783.3821063	best: 343783.3821063 (1)	total: 214ms	remaining: 1m 46s
2:	learn: 317918.7839970	test: 319053.1192865	best: 319053.1192865 (2)	total: 313ms	remaining: 1m 44s
3:	learn: 295149.4297372	test: 296104.8087057	best: 296104.8087057 (3)	total: 417ms	remaining: 1m 43s
4:	learn: 274554.1677913	test: 275355.9462550	best: 275355.9462550 (4)	total: 514ms	remaining: 1m 42s
5:	learn: 256114.4582958	test: 256801.5598369	best: 256801.5598369 (5)	total: 618ms	remaining: 1m 42s
6:	learn: 239536.5762671	test: 240065.8657451	best: 240065.8657451 (6)	total: 737ms	remaining: 1m 44s
7:	learn: 224297.4763501	test: 224770.5497165	best: 224770.5497165 (7)	total: 875ms	remaining: 1m 48s
8:	learn: 211116.8513346	test: 211526.4695313	best: 211526.4695313 (8)	total: 987ms	remaining: 1m 48s
9:	learn: 199127.6489524	test: 199480.7961579	best: 199480.7961579 (9)	total: 1.09s



[I 2023-09-17 13:52:07,030] Trial 2 finished with value: 0.12160455290174635 and parameters: {'model_type': 'xgboost', 'max_depth': 14, 'learning_rate': 0.2747699146551956, 'n_estimators': 25, 'reg_lambda': 0.07387256326209402, 'reg_alpha': 0.22710851690927591}. Best is trial 0 with value: 0.11598398869135129.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3466
[LightGBM] [Info] Number of data points in the train set: 115752, number of used features: 16
[LightGBM] [Info] Start training from score 1115491.120153
[I 2023-09-17 13:52:07,368] Trial 3 finished with value: 0.1302850307525785 and parameters: {'model_type': 'lightgbm', 'max_depth': 8, 'learning_rate': 0.35199130748289487, 'n_estimators': 15, 'reg_lambda': 0.8049033568761237, 'reg_alpha': 0.6851141726623607}. Best is trial 0 with value: 0.11598398869135129.
You can set `force_row_wise=true` to remove the overhead.
And if memo



[I 2023-09-17 13:53:13,191] Trial 7 finished with value: 0.1319568947723148 and parameters: {'model_type': 'xgboost', 'max_depth': 29, 'learning_rate': 0.07297670921650905, 'n_estimators': 39, 'reg_lambda': 0.3250725442382292, 'reg_alpha': 0.3332127659121489}. Best is trial 0 with value: 0.11598398869135129.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3466
[LightGBM] [Info] Number of data points in the train set: 115752, number of used features: 16
[LightGBM] [Info] Start training from score 1115491.120153
[I 2023-09-17 13:53:13,768] Trial 8 finished with value: 0.1284445446511557 and parameters: {'model_type': 'lightgbm', 'max_depth': 13, 'learning_rate': 0.14101814592900633, 'n_estimators': 43, 'reg_lambda': 0.006878890074684452, 'reg_alpha': 0.5430024983348984}. Best is trial 0 with value: 0.11598398869135129.




[I 2023-09-17 13:53:17,837] Trial 9 finished with value: 0.16736349065980802 and parameters: {'model_type': 'xgboost', 'max_depth': 27, 'learning_rate': 0.07298448466458546, 'n_estimators': 29, 'reg_lambda': 0.6575664960589684, 'reg_alpha': 0.7629519101708525}. Best is trial 0 with value: 0.11598398869135129.
0:	learn: 396590.3118015	test: 398265.8704560	best: 398265.8704560 (0)	total: 57.5ms	remaining: 57.5s
1:	learn: 392833.7981543	test: 394483.1621041	best: 394483.1621041 (1)	total: 132ms	remaining: 1m 5s
2:	learn: 389117.0863545	test: 390735.5912287	best: 390735.5912287 (2)	total: 208ms	remaining: 1m 9s
3:	learn: 385469.1321284	test: 387063.9296367	best: 387063.9296367 (3)	total: 300ms	remaining: 1m 14s
4:	learn: 381859.3014601	test: 383429.7449617	best: 383429.7449617 (4)	total: 367ms	remaining: 1m 13s
5:	learn: 378296.3153218	test: 379845.4178724	best: 379845.4178724 (5)	total: 424ms	remaining: 1m 10s
6:	learn: 374760.5257231	test: 376290.1675117	best: 376290.1675117 (6)	total: 4



[I 2023-09-17 14:04:11,482] Trial 19 finished with value: 0.11937258905288858 and parameters: {'model_type': 'xgboost', 'max_depth': 21, 'learning_rate': 0.1504101505059211, 'n_estimators': 48, 'reg_lambda': 0.49387147180563007, 'reg_alpha': 0.9916244948060724}. Best is trial 15 with value: 0.1124560039137774.
0:	learn: 396182.6427131	test: 397854.6934728	best: 397854.6934728 (0)	total: 48.6ms	remaining: 48.6s
1:	learn: 392055.6134627	test: 393702.7188190	best: 393702.7188190 (1)	total: 99.5ms	remaining: 49.7s
2:	learn: 388008.8885907	test: 389621.0395439	best: 389621.0395439 (2)	total: 168ms	remaining: 56s
3:	learn: 384041.0525486	test: 385617.9500011	best: 385617.9500011 (3)	total: 236ms	remaining: 58.9s
4:	learn: 380043.6261209	test: 381595.0038621	best: 381595.0038621 (4)	total: 291ms	remaining: 57.9s
5:	learn: 376060.3106001	test: 377588.0160454	best: 377588.0160454 (5)	total: 351ms	remaining: 58.1s
6:	learn: 372253.1359801	test: 373759.5563403	best: 373759.5563403 (6)	total: 426m



[I 2023-09-17 14:19:05,979] Trial 29 finished with value: 0.5975299153080258 and parameters: {'model_type': 'xgboost', 'max_depth': 18, 'learning_rate': 0.10342734095932044, 'n_estimators': 5, 'reg_lambda': 0.2547814989177608, 'reg_alpha': 0.9699198543750549}. Best is trial 22 with value: 0.11232450033384944.
0:	learn: 387503.1363852	test: 389096.1567359	best: 389096.1567359 (0)	total: 81.8ms	remaining: 1m 21s
1:	learn: 375297.8325500	test: 376769.6920481	best: 376769.6920481 (1)	total: 135ms	remaining: 1m 7s
2:	learn: 363507.1778401	test: 364900.6299928	best: 364900.6299928 (2)	total: 190ms	remaining: 1m 3s
3:	learn: 352331.5456322	test: 353657.7772990	best: 353657.7772990 (3)	total: 251ms	remaining: 1m 2s
4:	learn: 341636.2980911	test: 342899.1194482	best: 342899.1194482 (4)	total: 334ms	remaining: 1m 6s
5:	learn: 331200.2476892	test: 332396.5861844	best: 332396.5861844 (5)	total: 403ms	remaining: 1m 6s
6:	learn: 321114.3393066	test: 322256.3805232	best: 322256.3805232 (6)	total: 468



[I 2023-09-17 14:28:27,557] Trial 38 finished with value: 0.36779956869917907 and parameters: {'model_type': 'xgboost', 'max_depth': 11, 'learning_rate': 0.06568175457605502, 'n_estimators': 16, 'reg_lambda': 0.2184696118720777, 'reg_alpha': 0.4471157306521211}. Best is trial 22 with value: 0.11232450033384944.
0:	learn: 395165.7884595	test: 396827.2658318	best: 396827.2658318 (0)	total: 161ms	remaining: 2m 40s
1:	learn: 390209.5007937	test: 391833.3956462	best: 391833.3956462 (1)	total: 252ms	remaining: 2m 5s
2:	learn: 385143.8632876	test: 386740.6531649	best: 386740.6531649 (2)	total: 446ms	remaining: 2m 28s
3:	learn: 380181.6128855	test: 381742.5260049	best: 381742.5260049 (3)	total: 616ms	remaining: 2m 33s
4:	learn: 375361.0176525	test: 376887.0719492	best: 376887.0719492 (4)	total: 825ms	remaining: 2m 44s
5:	learn: 370616.9032646	test: 372109.2147283	best: 372109.2147283 (5)	total: 1s	remaining: 2m 46s
6:	learn: 365965.6695089	test: 367424.2328170	best: 367424.2328170 (6)	total: 1



[I 2023-09-17 14:39:08,219] Trial 46 finished with value: 0.1604715035176012 and parameters: {'model_type': 'xgboost', 'max_depth': 22, 'learning_rate': 0.06465568565261066, 'n_estimators': 34, 'reg_lambda': 0.39466066565954305, 'reg_alpha': 0.7742105136553351}. Best is trial 22 with value: 0.11232450033384944.
0:	learn: 393853.4385161	test: 395505.9355787	best: 395505.9355787 (0)	total: 186ms	remaining: 3m 5s
1:	learn: 387558.3829887	test: 389163.2504200	best: 389163.2504200 (1)	total: 375ms	remaining: 3m 7s
2:	learn: 381248.3293920	test: 382811.3398361	best: 382811.3398361 (2)	total: 546ms	remaining: 3m 1s
3:	learn: 375136.8049890	test: 376656.1654904	best: 376656.1654904 (3)	total: 717ms	remaining: 2m 58s
4:	learn: 369067.4456015	test: 370539.2198373	best: 370539.2198373 (4)	total: 880ms	remaining: 2m 55s
5:	learn: 363262.8693397	test: 364694.6191992	best: 364694.6191992 (5)	total: 1.07s	remaining: 2m 57s
6:	learn: 357490.2181321	test: 358886.9170260	best: 358886.9170260 (6)	total: 



[I 2023-09-17 14:57:40,931] Trial 58 finished with value: 0.3381525534513876 and parameters: {'model_type': 'xgboost', 'max_depth': 16, 'learning_rate': 0.05772271232311721, 'n_estimators': 20, 'reg_lambda': 0.3470841375939225, 'reg_alpha': 0.4206217683084356}. Best is trial 22 with value: 0.11232450033384944.
0:	learn: 390682.7417499	test: 392308.5091929	best: 392308.5091929 (0)	total: 62ms	remaining: 1m 1s
1:	learn: 381476.5981302	test: 383023.7889095	best: 383023.7889095 (1)	total: 119ms	remaining: 59.1s
2:	learn: 372286.7050347	test: 373788.8633565	best: 373788.8633565 (2)	total: 172ms	remaining: 57.2s
3:	learn: 363553.2391930	test: 365004.5125791	best: 365004.5125791 (3)	total: 227ms	remaining: 56.5s
4:	learn: 354975.3187163	test: 356350.7566680	best: 356350.7566680 (4)	total: 288ms	remaining: 57.4s
5:	learn: 346675.4435661	test: 347973.9808561	best: 347973.9808561 (5)	total: 349ms	remaining: 57.9s
6:	learn: 338653.5836233	test: 339918.1806906	best: 339918.1806906 (6)	total: 432ms



[I 2023-09-17 15:29:19,165] Trial 78 finished with value: 0.5884497301429206 and parameters: {'model_type': 'xgboost', 'max_depth': 25, 'learning_rate': 0.05480659889051294, 'n_estimators': 10, 'reg_lambda': 0.548678572049489, 'reg_alpha': 0.39045847804153927}. Best is trial 61 with value: 0.11209017081680005.
0:	learn: 391639.1068754	test: 393272.9532184	best: 393272.9532184 (0)	total: 56.6ms	remaining: 56.6s
1:	learn: 383316.3013552	test: 384879.2735383	best: 384879.2735383 (1)	total: 111ms	remaining: 55.2s
2:	learn: 374970.5423570	test: 376487.4075040	best: 376487.4075040 (2)	total: 166ms	remaining: 55.1s
3:	learn: 367026.7360373	test: 368497.2024629	best: 368497.2024629 (3)	total: 226ms	remaining: 56.3s
4:	learn: 359203.7540251	test: 360605.1818633	best: 360605.1818633 (4)	total: 288ms	remaining: 57.3s
5:	learn: 351607.2348580	test: 352938.0827425	best: 352938.0827425 (5)	total: 351ms	remaining: 58.1s
6:	learn: 344331.0431194	test: 345629.5103037	best: 345629.5103037 (6)	total: 432



[I 2023-09-17 15:45:47,050] Trial 89 finished with value: 0.2184116573335355 and parameters: {'model_type': 'xgboost', 'max_depth': 14, 'learning_rate': 0.04977407287673974, 'n_estimators': 34, 'reg_lambda': 0.42945975103802747, 'reg_alpha': 0.17170667194816647}. Best is trial 61 with value: 0.11209017081680005.
0:	learn: 388836.8768434	test: 390438.5224353	best: 390438.5224353 (0)	total: 46.1ms	remaining: 46.1s
1:	learn: 377869.2400569	test: 379409.2209484	best: 379409.2209484 (1)	total: 87ms	remaining: 43.4s
2:	learn: 367177.6425730	test: 368637.5303391	best: 368637.5303391 (2)	total: 144ms	remaining: 47.9s
3:	learn: 356850.7992636	test: 358224.1815073	best: 358224.1815073 (3)	total: 198ms	remaining: 49.3s
4:	learn: 346928.7301305	test: 348222.2760534	best: 348222.2760534 (4)	total: 253ms	remaining: 50.3s
5:	learn: 337321.9970252	test: 338577.2820708	best: 338577.2820708 (5)	total: 308ms	remaining: 51.1s
6:	learn: 328224.2813477	test: 329430.4924697	best: 329430.4924697 (6)	total: 36

In [63]:
best_df = pd.DataFrame([best_params])
best_df.to_csv(path + "/Dataset/best.csv",index=False)