In [2]:
pip install optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.4.2 (from optuna)
  Downloading sqlalchemy-2.0.41-cp312-cp312-win_amd64.whl.metadata (9.8 kB)
Collecting tqdm (from optuna)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting PyYAML (from optuna)
  Downloading PyYAML-6.0.2-cp312-cp312-win_amd64.whl.metadata (2.1 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading mako-1.3.10-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4.12 (from alembic>=1.5.0->optuna)
  Downloading typing_extensions-4.14.0-py3-none-any.whl.metadata (3.0 kB)
Collecting greenlet>=1 (from sqlalchemy>=1.4.2->optuna)
  Downloading greenlet-3.2.3-cp312-cp312-win_amd64.whl.metadata (4.2 kB)
Downloading optuna-4.4.0-py3-none


[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import pandas as pd
import numpy as np
import optuna
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold,cross_val_score,train_test_split
from sklearn.preprocessing import StandardScaler,OrdinalEncoder,OneHotEncoder,TargetEncoder
from sklearn.decomposition import PCA
import category_encoders as ce


from xgboost import XGBRegressor

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
df = pd.read_csv("../datasets/AHM_FS.csv")

df.shape

(10742, 12)

In [7]:
X = df.drop(columns=['price'])
y = df['price']

In [8]:
y_transformed = np.log1p(y)

In [9]:
numerical_columns = X.select_dtypes(include=['int','float']).columns
categorical_columns = X.select_dtypes(include=['object']).columns

In [11]:
preprocessor_te1 = ColumnTransformer([
    ('num',StandardScaler(),numerical_columns),
    ('ohe',OneHotEncoder(sparse_output=False,drop='first'), ['transaction', 'status', 'furnishing','floor_category', 'luxury_category']),
    ('target_enc',ce.TargetEncoder(),['location','facing'])
],remainder='passthrough')

pipeline = Pipeline([
    ('preprocessing',preprocessor_te1),
    ('regressor',XGBRegressor())
])

In [12]:
class Scorer:
    def __init__(self,model_name,preprocessor,model):
        self.model_name = model_name
        self.preprocessor = preprocessor
        self.model = model

    def get_score(self,X,y):
        output = []

        output.append(self.model_name)

        pipeline = Pipeline([
            ('preprocessor',self.preprocessor),
            ('regressor',self.model)
        ])

        # kfold cross-validation
        kfold = KFold(n_splits=10,shuffle=True, random_state=42)
        scores = cross_val_score(
            pipeline,
            X,
            y_transformed,
            cv=kfold,
            scoring='r2'
        )

        output.append(scores.mean())

        X_train,X_test,y_train,y_test = train_test_split(X,y_transformed,test_size=0.2, random_state=42)
        pipeline.fit(X_train,y_train)

        y_pred = pipeline.predict(X_test)

        y_pred = np.expm1(y_pred)

        output.append(mean_absolute_error(np.expm1(y_test),y_pred))

        print(f"{self.model_name} is completed!!\n")
        return output

In [13]:
def objective_xgboost(trial):

    # Suggesting hyperparameters for XGBoost Regressor
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 3, 16)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 10)
    gamma = trial.suggest_float('gamma', 0, 10)
    reg_lambda = trial.suggest_float('lambda', 0, 10)  # L2 regularization
    reg_alpha = trial.suggest_float('alpha', 0, 10)    # L1 regularization
    learning_rate = trial.suggest_float('eta', 0.01, 0.3)  # Learning rate
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    colsample_bylevel = trial.suggest_float('colsample_bylevel', 0.5, 1.0)
    colsample_bynode = trial.suggest_float('colsample_bynode', 0.5, 1.0)
    tree_method = trial.suggest_categorical('tree_method', ['auto', 'approx', 'hist'])
    booster = trial.suggest_categorical('booster', ['gbtree', 'gblinear'])

    model = XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        gamma=gamma,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        colsample_bylevel=colsample_bylevel,
        colsample_bynode=colsample_bynode,
        tree_method=tree_method,
        booster=booster,
        random_state=42
    )

    preprocessor_te1 = ColumnTransformer([
        ('num', StandardScaler(), numerical_columns),
        ('ohe', OneHotEncoder(sparse_output=False, drop='first'), ['transaction', 'status', 'furnishing', 'floor_category', 'luxury_category']),
        ('target_enc', ce.TargetEncoder(), ['location', 'facing'])
    ], remainder='passthrough')

    pipeline_xgb = Pipeline([
        ('preprocessing', preprocessor_te1),
        ('regressor', model)
    ])

    score = cross_val_score(pipeline_xgb, X, y_transformed,
                            cv=KFold(n_splits=10, shuffle=True, random_state=42),
                            scoring='r2', n_jobs=-1).mean()
    
    return score

In [14]:
study = optuna.create_study(direction='maximize',
                            study_name="xgboost_ahm_data", 
                            storage="sqlite:///xgboost_ahm1.db",
                            pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=30, interval_steps=10))
study.optimize(objective_xgboost, n_trials=200)

[I 2025-07-03 15:08:38,978] A new study created in RDB with name: xgboost_ahm_data
[I 2025-07-03 15:08:48,068] Trial 0 finished with value: 0.865030921725265 and parameters: {'n_estimators': 731, 'max_depth': 9, 'min_child_weight': 4, 'gamma': 2.080077643766071, 'lambda': 2.6161742417055622, 'alpha': 6.893145951567099, 'eta': 0.21326116372362985, 'subsample': 0.6908667248670131, 'colsample_bytree': 0.9955094016956225, 'colsample_bylevel': 0.7972422172252609, 'colsample_bynode': 0.5779060269881109, 'tree_method': 'auto', 'booster': 'gbtree'}. Best is trial 0 with value: 0.865030921725265.
[I 2025-07-03 15:08:50,032] Trial 1 finished with value: 0.8601803241424493 and parameters: {'n_estimators': 580, 'max_depth': 12, 'min_child_weight': 8, 'gamma': 3.341219347506117, 'lambda': 8.751322328907932, 'alpha': 7.041763654832631, 'eta': 0.17619014956923107, 'subsample': 0.7914518311930276, 'colsample_bytree': 0.6003701356825228, 'colsample_bylevel': 0.7947997307684073, 'colsample_bynode': 0.76

In [15]:
best_trial = study.best_trial
print(f"Best Trial Parameters : {best_trial.params}")
print(f"Best Trial R2 score : {best_trial.value}")

Best Trial Parameters : {'n_estimators': 562, 'max_depth': 16, 'min_child_weight': 1, 'gamma': 0.0013265670995929553, 'lambda': 0.4223455749917649, 'alpha': 0.6857352596443773, 'eta': 0.2830332899687122, 'subsample': 0.6991381325846714, 'colsample_bytree': 0.6908536288011848, 'colsample_bylevel': 0.8429273850926464, 'colsample_bynode': 0.9447414747136604, 'tree_method': 'auto', 'booster': 'gbtree'}
Best Trial R2 score : 0.9233467789779883


In [16]:
best_trial.params

{'n_estimators': 562,
 'max_depth': 16,
 'min_child_weight': 1,
 'gamma': 0.0013265670995929553,
 'lambda': 0.4223455749917649,
 'alpha': 0.6857352596443773,
 'eta': 0.2830332899687122,
 'subsample': 0.6991381325846714,
 'colsample_bytree': 0.6908536288011848,
 'colsample_bylevel': 0.8429273850926464,
 'colsample_bynode': 0.9447414747136604,
 'tree_method': 'auto',
 'booster': 'gbtree'}

In [19]:
best_xgb_params = best_trial.params
preprocessor_te1 = ColumnTransformer([
        ('num',StandardScaler(),numerical_columns),
        ('ohe',OneHotEncoder(sparse_output=False,drop='first'), ['transaction', 'status', 'furnishing','floor_category', 'luxury_category']),
        ('target_enc',ce.TargetEncoder(),['location','facing'])
        ],remainder='passthrough')
    

best_rf_model = XGBRegressor(**best_xgb_params, random_state=42)

pipeline_xgb = Pipeline([
    ('preprocessing',preprocessor_te1),
    ('regressor',best_rf_model)
            ])

score = cross_val_score(pipeline_xgb, X, y_transformed, 
                        cv=KFold(n_splits=20, shuffle=True, random_state=42),
                        scoring='r2',n_jobs=-1).mean()

In [20]:
score*100

92.43808595064544

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)
pipeline_xgb.fit(X_train, y_train)

In [32]:
pipeline_xgb.score(X_test, y_test)

0.9166936686558209

In [33]:
# code for test the pipeline
r2 = pipeline_xgb.score(X_test, y_test)
print(f"R2 score on test set: {r2}")

R2 score on test set: 0.9166936686558209


In [34]:
import joblib

# Save the pipeline model using joblib
joblib.dump(pipeline_xgb, '../Model/pipeline_xgb.joblib')

# Save the dataset
joblib.dump(df, 'data.joblib')


['data.joblib']

In [30]:
from sklearn.utils.validation import check_is_fitted
check_is_fitted(pipeline_xgb.named_steps)  # assuming 'prep' is your ColumnTransformer

TypeError: {'preprocessing': ColumnTransformer(remainder='passthrough',
                  transformers=[('num', StandardScaler(),
                                 Index(['bhk', 'built_up_area', 'totalfloor', 'bathroom'], dtype='object')),
                                ('ohe',
                                 OneHotEncoder(drop='first',
                                               sparse_output=False),
                                 ['transaction', 'status', 'furnishing',
                                  'floor_category', 'luxury_category']),
                                ('target_enc', TargetEncoder(),
                                 ['location', 'facing'])]), 'regressor': XGBRegressor(alpha=0.6857352596443773, base_score=None, booster='gbtree',
             callbacks=None, colsample_bylevel=0.8429273850926464,
             colsample_bynode=0.9447414747136604,
             colsample_bytree=0.6908536288011848, device=None,
             early_stopping_rounds=None, enable_categorical=False,
             eta=0.2830332899687122, eval_metric=None, feature_types=None,
             feature_weights=None, gamma=0.0013265670995929553,
             grow_policy=None, importance_type=None,
             interaction_constraints=None, lambda=0.4223455749917649,
             learning_rate=None, max_bin=None, max_cat_threshold=None,
             max_cat_to_onehot=None, max_delta_step=None, max_depth=16,
             max_leaves=None, min_child_weight=1, missing=nan,
             monotone_constraints=None, multi_strategy=None, ...)} is not an estimator instance.