In [None]:
!pip install category_encoders

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline
from hyperopt import fmin, tpe, hp, Trials
import category_encoders as ce
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [4]:
from sklearn.model_selection import cross_val_score

In [5]:
df = pd.read_csv("flat_houses_v6.csv")

In [6]:
df.drop(columns = ["PROP_ID", "LATITUDE", "LONGITUDE"], inplace = True)

In [7]:
X = df.drop(columns = ["PRICE_CR"])
y = df["PRICE_CR"]

In [8]:
y_transformed = np.log1p(y)

In [9]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['BEDROOM_NUM', 'BATHROOM_NUM', 'CALCULATED_AREA_SQFT']),
        ('cat', OrdinalEncoder(), ["PROPERTY_TYPE","BALCONY_NUM","AMENITIES_CLUSTER", "FLOOR_CATEGORY", 'FURNISH']),
        ('cat1', OneHotEncoder(drop = 'first', sparse_output = False), ['CITY_NAME','OWNTYPE','AGE_POSSESSION']),
        ('target_enc', ce.TargetEncoder(), ['LOCALITY_NAME'])
    ],
    remainder = 'passthrough'
)

In [None]:
!pip install scikit-optimize

In [11]:
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from sklearn import decomposition
from sklearn import preprocessing
from sklearn import pipeline

from functools import partial
from skopt import space
from skopt import gp_minimize

In [12]:
def objective(params, param_names, x, y):

  params = dict(zip(param_names, params))

  preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['BEDROOM_NUM', 'BATHROOM_NUM', 'CALCULATED_AREA_SQFT']),
        ('cat', OrdinalEncoder(), ["PROPERTY_TYPE","BALCONY_NUM","AMENITIES_CLUSTER", "FLOOR_CATEGORY", 'FURNISH']),
        ('cat1', OneHotEncoder(drop = 'first', sparse_output = False), ['CITY_NAME','OWNTYPE','AGE_POSSESSION']),
        ('target_enc', ce.TargetEncoder(), ['LOCALITY_NAME'])
    ],
    remainder = 'passthrough'
   )

  model = ensemble.RandomForestRegressor(**params)
  pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
  ])
    # Define cross-validation strategy
  kfold = KFold(n_splits=10, shuffle=True, random_state=42)
  scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2', n_jobs=-1)
  mean_r2 = np.mean(scores)
  return -mean_r2


In [15]:
param_space = [
    space.Integer(3, 35, name = "max_depth"),
    space.Integer(100, 500, name = "n_estimators"),
    space.Real(0.1, 1, prior = "uniform", name = "max_features"),
    space.Real(0.1, 1, prior = "uniform", name = "max_samples")
]

param_names = [
    "n_estimators",
    "max_features",
    "max_depth",
    "min_samples_split",
    "min_samples_leaf",
    "bootstrap"
]

In [None]:
optimization_function = partial(objective, param_names = param_names, x = X, y = y_transformed)
result = gp_minimize(
    optimization_function,
    dimensions = param_space,
    n_calls = 15,
    n_random_starts = 10,
    verbose = 10
)

In [17]:
print(dict(zip(param_names, result.x)))

{'n_estimators': 100, 'max_features': 0.8578669148983669, 'max_depth': 35, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False}




---



---



In [None]:
# Random Forest
# For 100 iteration
# {'max_depth': 26, 'n_estimators': 500, 'max_features': 0.40755472499128953, 'max_samples': 1.0}
# For 15 iteration max_depth= 24, n_estimators=500, max_features=0.46421041809399965, max_samples=1.0

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['BEDROOM_NUM', 'BATHROOM_NUM', 'CALCULATED_AREA_SQFT']),
        ('cat', OrdinalEncoder(), ["PROPERTY_TYPE","BALCONY_NUM","AMENITIES_CLUSTER", "FLOOR_CATEGORY", 'FURNISH']),
        ('cat1', OneHotEncoder(drop = 'first', sparse_output = False), ['CITY_NAME','OWNTYPE','AGE_POSSESSION']),
        ('target_enc', ce.TargetEncoder(), ['LOCALITY_NAME'])
    ],
    remainder = 'passthrough'
)

model = ensemble.ExtraTreesRegressor(max_samples = None,max_depth= 16, n_estimators=391, max_features= 0.6523343501341525)
pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)])

In [None]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [None]:
scores.mean(), scores.std()

(0.8140602030798181, 0.015307185595288098)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
pipeline.fit(X_train,y_train)

In [None]:
y_pred = pipeline.predict(X_test)
y_pred = np.expm1(y_pred)
metrics.mean_absolute_error(np.expm1(y_test),y_pred)

0.3411988607282923