### Classification algorithm optimization, Randomized Search

In [52]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import xgboost as xgb
import catboost as cb
import lightgbm as lgb
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import seaborn as sns
from imblearn.under_sampling import TomekLinks
from sklearn.ensemble import IsolationForest
from scipy import stats
from sklearn.preprocessing import label_binarize

In [53]:
# This is the dataset we created by optimization process and now here we are going to use it.
df = pd.read_csv("housing_classification_processed.csv")
df

Unnamed: 0,price,bedrooms,bathrooms,floors,view,sqft_basement,grade_category
0,231300.0,2,1.00,1.0,0,0,2
1,180000.0,2,1.00,1.0,0,0,1
2,604000.0,4,3.00,1.0,0,910,2
3,510000.0,3,2.00,1.0,0,0,3
4,257500.0,3,2.25,2.0,0,0,2
...,...,...,...,...,...,...,...
16851,360000.0,3,2.50,3.0,0,0,3
16852,400000.0,4,2.50,2.0,0,0,3
16853,402101.0,2,0.75,2.0,0,0,2
16854,400000.0,3,2.50,2.0,0,0,3


In [54]:
df.columns

Index(['price', 'bedrooms', 'bathrooms', 'floors', 'view', 'sqft_basement',
       'grade_category'],
      dtype='object')

In [55]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16856 entries, 0 to 16855
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   price           16856 non-null  float64
 1   bedrooms        16856 non-null  int64  
 2   bathrooms       16856 non-null  float64
 3   floors          16856 non-null  float64
 4   view            16856 non-null  int64  
 5   sqft_basement   16856 non-null  int64  
 6   grade_category  16856 non-null  int64  
dtypes: float64(3), int64(4)
memory usage: 921.9 KB


In [None]:
# X/y split etc.
target = "grade_category"

# catboost expects data either in int or str, we convert it to integer
df['floors'] = df['floors'].astype('int64')

# we loose some decimal bathrooms but it should not be a problem because of Integer or whole values.
df['bathrooms'] = df['bathrooms'].astype('int64')

# categorical features need to be separated for certain algorithms, like CatBoost
categorical_features = ['bedrooms', 'floors', 'view']

X = df.drop(columns=[target])
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# SCALING => some of the algorithms require this
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Use Randomized Search to find best parameters for CatBoost this time!

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Define parameter grid for RandomizedSearchCV
# use numpy etc. to define various options for the randomize search to try on
param_dist = {
    'iterations': np.linspace(300, 750, 25), 
    'learning_rate': np.linspace(0.03, 0.25, 15),  
    'depth': [4, 5, 6, 7], 
}

cat_model = cb.CatBoostClassifier(verbose=0)

# Set up RandomizedSearchCV for iterations
random_search = RandomizedSearchCV(
    estimator=cat_model,
    param_distributions=param_dist,
    n_iter=100, 
    scoring='accuracy',
    cv=3,
    n_jobs=-1
)

# Fit the random search
random_search.fit(X_train, y_train, cat_features=categorical_features)

# Print the best parameters
print("Best Parameters:", random_search.best_params_)

219 fits failed out of a total of 300.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
3 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\aozaydin\Documents\project_marjetas\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\aozaydin\Documents\project_marjetas\.venv\Lib\site-packages\catboost\core.py", line 5245, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, graph, sample_weight, None, None, None, None, baseline, use_best_model,
  File "c:\Users\aozaydin\Documents\project_marjetas\.venv\Lib\site-packages\catboost\core.py", line 2395, in _fit
    train_par

Best Parameters: {'learning_rate': 0.06142857142857143, 'iterations': 600.0, 'depth': 5}


In [None]:
# I tried with different parameters along with different learning rate, depth and along with different iterations.
# with different search where I found best parameters are as follows.
# Best Parameters: {'learning_rate': 0.06142857142857143, 'iterations': 600.0, 'depth': 5}
