In [36]:
from sklearn.model_selection import train_test_split
import pandas as pd
from catboost import CatBoostRegressor,Pool
from catboost.utils import eval_metric
import optuna
from optuna.samplers import TPESampler
from hyperopt import tpe, hp, fmin

import warnings
warnings.filterwarnings('ignore')

https://towardsdatascience.com/state-of-the-art-machine-learning-hyperparameter-optimization-with-optuna-a315d8564de1

https://towardsdatascience.com/automate-hyperparameter-tuning-with-hyperopts-for-multiple-models-22b499298a8a

https://stephenallwright.com/rmse-vs-mape/


In [37]:
# Project root. Kept as a plain string because the notebook builds file paths
# from it by string concatenation.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
path = "/home/pydev/Music/work_files/latest_broko_code"
raw_csv_file = path + "/Dataset/ML_CLEAN_DATA__Bdv2.2_RES.csv"
df = pd.read_csv(raw_csv_file)

# Per-column overview: dtype and non-null count for every field.
df.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 177234 entries, 0 to 177233
Data columns (total 22 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   ML_Number       177234 non-null  object 
 1   Postal_Code     177104 non-null  object 
 2   Postal_Short    177104 non-null  object 
 3   Style           177234 non-null  object 
 4   Type            177234 non-null  object 
 5   Cluster         177234 non-null  object 
 6   List_Price      177228 non-null  float64
 7   Cluster_Price   177234 non-null  float64
 8   Taxes           177213 non-null  float64
 9   Cluster_Tax     177234 non-null  float64
 10  Bedrooms        177234 non-null  int64  
 11  Washrooms       177234 non-null  int64  
 12  Basement1       177234 non-null  object 
 13  Days_On_Market  177234 non-null  int64  
 14  Exterior1       176279 non-null  object 
 15  Garage_Type     149607 non-null  object 
 16  lat             177220 non-null  float64
 17  lng       

In [38]:
# Earlier, wider drop list kept for reference:
# drop_cols = ['ML_Number','Sold_Date',"Address","Area","Postal_Code","Water_Included","Air_Conditioning","Exterior1","Rooms","Tax_Year","Water_Included","property_type","Sold_Price","Month Year","HPI for Month" ]
drop_cols = [
    "ML_Number",
    "Postal_Code",
    "Sold_Price",
    "Month_Year",
    "HPI_for_Month",
]
# The drop is deliberately strict: a missing or mistyped column name raises
# instead of silently leaving the column in the frame.
df = df.drop(columns=drop_cols)

# Missing values remaining per column.
df.isna().sum()

Postal_Short        130
Style                 0
Type                  0
Cluster               0
List_Price            6
Cluster_Price         0
Taxes                21
Cluster_Tax           0
Bedrooms              0
Washrooms             0
Basement1             0
Days_On_Market        0
Exterior1           955
Garage_Type       27627
lat                  14
lng                  14
HPI_Sold_Price        0
dtype: int64

In [39]:
# Keep only complete rows: any row with at least one missing value is removed.
data = df.dropna(axis="index", how="any")

In [40]:
# Outlier filter on the target using 1.5 * IQR fences around the quartiles.
target_prices = data['HPI_Sold_Price']
q1, q3 = target_prices.quantile(0.25), target_prices.quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr

# Rows falling outside the fences, kept separately for optional inspection/export.
lower_bound_outliers = data[target_prices < lower_bound]
upper_bound_outliers = data[target_prices > upper_bound]
# lower_bound_outliers.to_csv(path + '/Dataset/lower_bound_outliers.csv')
# upper_bound_outliers.to_csv(path +'/Dataset/upper_bound_outliers.csv')

# Modelling set: rows whose target lies within the fences (both ends inclusive).
dataset = data[target_prices.between(lower_bound, upper_bound)]

In [41]:
# Sanity check: count of missing values per column in the filtered data.
dataset.isna().sum()

Postal_Short      0
Style             0
Type              0
Cluster           0
List_Price        0
Cluster_Price     0
Taxes             0
Cluster_Tax       0
Bedrooms          0
Washrooms         0
Basement1         0
Days_On_Market    0
Exterior1         0
Garage_Type       0
lat               0
lng               0
HPI_Sold_Price    0
dtype: int64

In [42]:
# Schema of the filtered data: dtype and non-null count for every column.
dataset.info(show_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
Index: 140676 entries, 1 to 177232
Data columns (total 17 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Postal_Short    140676 non-null  object 
 1   Style           140676 non-null  object 
 2   Type            140676 non-null  object 
 3   Cluster         140676 non-null  object 
 4   List_Price      140676 non-null  float64
 5   Cluster_Price   140676 non-null  float64
 6   Taxes           140676 non-null  float64
 7   Cluster_Tax     140676 non-null  float64
 8   Bedrooms        140676 non-null  int64  
 9   Washrooms       140676 non-null  int64  
 10  Basement1       140676 non-null  object 
 11  Days_On_Market  140676 non-null  int64  
 12  Exterior1       140676 non-null  object 
 13  Garage_Type     140676 non-null  object 
 14  lat             140676 non-null  float64
 15  lng             140676 non-null  float64
 16  HPI_Sold_Price  140676 non-null  float64
dtypes: float64(7), 

In [43]:
# Target to predict, and the feature matrix made of every other column.
y = dataset['HPI_Sold_Price']
X = dataset.drop(columns=['HPI_Sold_Price'])

In [44]:
# Numeric features: every column with a numeric dtype.
# select_dtypes is used instead of comparing each dtype against the builtin
# float/int: those comparisons only match the platform's default integer width
# and 64-bit floats, so other numeric widths would be silently skipped.
numeric_features = X.select_dtypes(include="number").columns.tolist()

# Categorical features: object-dtype columns, handed to CatBoost as cat_features.
categorical_features = [column for column, dtype in X.dtypes.items() if dtype == object]

# for c in categorical_features:
#     X[c].fillna('nan', inplace=True)
#     X[c].fillna('nan', inplace=True)


# Two successive 80/20 splits: first hold out the test set, then carve a
# validation set out of the remaining training rows (64% / 16% / 20% overall).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)



# numeric_train_pool = Pool(X_train[numeric_features], y_train)
# numeric_val_pool = Pool(X_valid[numeric_features], y_valid)
# numeric_test_pool = Pool(X_test[numeric_features], y_test)


# CatBoost pools over all features, with the categorical columns declared.
cat_train_pool = Pool(X_train, y_train, cat_features=categorical_features)
cat_val_pool = Pool(X_valid, y_valid, cat_features=categorical_features)
cat_test_pool = Pool(X_test, y_test, cat_features=categorical_features)

In [45]:
def calc_test_quality(train_pool=cat_train_pool, val_pool=cat_val_pool, test_pool=cat_test_pool, **kwargs):
    """Train a CatBoostRegressor and report its MAPE on the test pool.

    Parameters
    ----------
    train_pool : catboost.Pool
        Data the model is fitted on.
    val_pool : catboost.Pool
        Passed to ``fit`` as ``eval_set``.
    test_pool : catboost.Pool
        Held-out data the returned metric is computed on.
    **kwargs
        Forwarded to ``CatBoostRegressor``. ``random_seed`` defaults to 42
        but may be overridden by the caller.

    Returns
    -------
    The value returned by ``eval_metric(..., 'MAPE')`` for the test labels and
    predictions (displayed as a one-element list in this notebook).
    """
    # setdefault (rather than a second explicit keyword) lets callers pass their
    # own random_seed without triggering a duplicate-keyword TypeError.
    kwargs.setdefault("random_seed", 42)
    model = CatBoostRegressor(**kwargs)
    model.fit(train_pool, verbose=0, eval_set=val_pool)
    y_pred = model.predict(test_pool)
    return eval_metric(test_pool.get_label(), y_pred, 'MAPE')

In [46]:
# def hyperopt_objective(params):
#     print(params)
#     model = catboost.CatBoostRegressor(**params, random_seed=42)
#     model.fit(numeric_train_pool, verbose=0, eval_set=numeric_val_pool)
#     y_pred = model.predict(numeric_val_pool)
#     return -eval_metric(numeric_val_pool.get_label(), y_pred, 'MAPE')[0]

# space = {
#     'learning_rate': hp.uniform('learning_rate', 0.01, 0.1),
#     'depth': hp.randint('depth', 3, 10),
#     'l2_leaf_reg': hp.uniform('l2_leaf_reg', 1, 10),
#     'boosting_type': hp.choice('boosting_type', ['Ordered', 'Plain']),
# }

# best = fmin(hyperopt_objective,
#     space=space,
#     algo=tpe.suggest,
#     max_evals=20,
#     rstate=np.random.default_rng(seed=123))


In [47]:
# best_params = best.copy()
# best_params['boosting_type'] = 'Plain' if best['boosting_type'] == 1 else 'Ordered'
# calc_test_quality(**best_params), best_params

In [48]:
# calc_test_quality(train_pool=cat_train_pool,val_pool=cat_val_pool,test_pool=cat_test_pool)

Let's apply Bayesian optimization to a dataset that contains both categorical and numerical features.

In [49]:

# def objective(trial):
#     params = {
#         'learning_rate': trial.suggest_float('learning_rate', 0.000001, 0.01),
#         'depth': trial.suggest_int('depth', 1, 10),
#         'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 14),
#         'boosting_type': trial.suggest_categorical('boosting_type', ['Ordered', 'Plain']),
#         'max_ctr_complexity': trial.suggest_int('max_ctr_complexity', 0, 14)
#     }

#     model = CatBoostRegressor(**params, random_seed=42, n_estimators=1000)
#     model.fit(cat_train_pool, verbose=0, eval_set=cat_val_pool,early_stopping_rounds=10)
#     y_pred = model.predict(cat_val_pool)
#     return eval_metric(cat_val_pool.get_label(), y_pred, 'MAPE')

# sampler = TPESampler(seed=123)
# study = optuna.create_study(direction='minimize', sampler=sampler)
# study.optimize(objective, n_trials=10,show_progress_bar = True)



In [62]:
def objective(trial):
    """Optuna objective: fit one CatBoost model and score it on the validation pool.

    Hyperparameters are drawn from ``trial``; the model is trained on
    ``cat_train_pool`` with ``cat_val_pool`` as the eval set, and the MAPE of
    its validation predictions is returned for the study to minimise.
    """
    # The suggest_* calls are kept in their original order so that seeded
    # studies remain comparable with earlier runs.
    search_space = dict(
        iterations=1000,
        learning_rate=trial.suggest_float("learning_rate", 1e-2, 0.1, log=True),
        depth=trial.suggest_int("depth", 1, 10),
        subsample=trial.suggest_float("subsample", 0.05, 1.0),
        colsample_bylevel=trial.suggest_float("colsample_bylevel", 0.05, 1.0),
        min_data_in_leaf=trial.suggest_int("min_data_in_leaf", 1, 100),
        l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 1, 14),
        boosting_type=trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        max_ctr_complexity=trial.suggest_int("max_ctr_complexity", 0, 14),
    )

    regressor = CatBoostRegressor(random_seed=42, silent=True, **search_space)
    regressor.fit(cat_train_pool, eval_set=cat_val_pool, verbose=0)

    val_predictions = regressor.predict(cat_val_pool)
    return eval_metric(cat_val_pool.get_label(), val_predictions, "MAPE")

# Seeded TPE sampler so the hyperparameter search is repeatable.
sampler = TPESampler(seed=123)

# Minimise the validation MAPE returned by `objective` over 200 trials.
study = optuna.create_study(sampler=sampler, direction="minimize")
study.optimize(
    objective,
    n_trials=200,
    show_progress_bar=True,
)

[I 2023-09-11 13:07:06,306] A new study created in memory with name: no-name-083ca339-39b1-44b9-98d8-e5f48a8f6814


  0%|          | 0/200 [00:00<?, ?it/s]

[I 2023-09-11 13:07:21,970] Trial 0 finished with value: 47.16771105516188 and parameters: {'learning_rate': 0.049712909978071915, 'depth': 3, 'subsample': 0.26550888088599295, 'colsample_bylevel': 0.5737490306287467, 'min_data_in_leaf': 72, 'l2_leaf_reg': 6.5003839816179925, 'boosting_type': 'Ordered', 'max_ctr_complexity': 7}. Best is trial 0 with value: 47.16771105516188.
[I 2023-09-11 13:07:43,752] Trial 1 finished with value: 46.53083799106431 and parameters: {'learning_rate': 0.024667067270243252, 'depth': 4, 'subsample': 0.7425972220148396, 'colsample_bylevel': 0.46664363244564316, 'min_data_in_leaf': 6, 'l2_leaf_reg': 6.1745753192956085, 'boosting_type': 'Ordered', 'max_ctr_complexity': 2}. Best is trial 1 with value: 46.53083799106431.
[I 2023-09-11 13:08:35,441] Trial 2 finished with value: 46.93025731462092 and parameters: {'learning_rate': 0.03400567296340426, 'depth': 6, 'subsample': 0.652680910623755, 'colsample_bylevel': 0.8569602043739001, 'min_data_in_leaf': 73, 'l2_le

In [63]:
# Refit with the best hyperparameters found by the study and score on the test pool.
test_mape = calc_test_quality(
    train_pool=cat_train_pool,
    val_pool=cat_val_pool,
    test_pool=cat_test_pool,
    **study.best_params,
)

# Show the test metric next to the parameters that produced it.
test_mape, study.best_params

([0.15302356206690831],
 {'learning_rate': 0.010891390017107478,
  'depth': 1,
  'subsample': 0.2995218749648715,
  'colsample_bylevel': 0.0675127793103688,
  'min_data_in_leaf': 73,
  'l2_leaf_reg': 7.573543939167165,
  'boosting_type': 'Ordered',
  'max_ctr_complexity': 1})

In [64]:
# One-row frame holding the winning hyperparameters; both names refer to it.
best_df = pd.DataFrame([study.best_params])
best_params_optuna = best_df

# Persist the parameters next to the dataset.
best_df.to_csv(path + "/Dataset/best_params_optuna.csv", index=False)

In [53]:
# def hyperopt_objective(params):
#     print(params)
#     model = CatBoostRegressor(**params, random_seed=42,n_estimators=1000)
#     model.fit(cat_train_pool, verbose=0, eval_set=cat_val_pool,early_stopping_rounds=10)
#     y_pred = model.predict(cat_val_pool)
#     return eval_metric(cat_val_pool.get_label(), y_pred, 'MAPE')[0]

# space = {
#     'learning_rate': hp.uniform('learning_rate', 0.01, 0.1),
#     'depth': hp.randint('depth',14),
#     'l2_leaf_reg': hp.uniform('l2_leaf_reg', 1, 14),
#     'boosting_type': hp.choice('boosting_type', ['Ordered', 'Plain']),
#     'max_ctr_complexity': hp.randint('max_ctr_complexity',14),
#     "subsample": hp.uniform("subsample", 0.05, 1.0),
#     "colsample_bylevel": hp.uniform("colsample_bylevel", 0.05, 1.0),
#     "min_data_in_leaf": hp.uniform("min_data_in_leaf", 1, 100),
# }

# best = fmin(hyperopt_objective,
#     space=space,
#     algo=tpe.suggest,
#     max_evals=100,
#     rstate= np.random.seed(123))

In [54]:
# best_params = best.copy()
# best_params['boosting_type'] = 'Plain' if best['boosting_type'] == 1 else 'Ordered'
# calc_test_quality(train_pool=cat_train_pool,
#                   val_pool=cat_val_pool,
#                   test_pool=cat_test_pool,
#                   **best_params), best_params


In [55]:
# best_params

In [56]:
# best_df = pd.DataFrame([best_params])
# best_df.to_csv(path + "/Dataset/best_params_hyperopt.csv",index=False)