## Model Selection

In [8]:
# !pip install optuna

In [9]:
# !pip install category_encoders

In [14]:
!pip install xgboost

Collecting xgboost
  Using cached xgboost-2.1.2-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.2-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.1/124.9 MB 469.7 kB/s eta 0:04:26
   ---------------------------------------- 0.1/124.9 MB 469.7 kB/s eta 0:04:26
   ---------------------------------------- 0.1/124.9 MB 469.7 kB/s eta 0:04:26
   ---------------------------------------- 0.1/124.9 MB 469.7 kB/s eta 0:04:26
   ---------------------------------------- 0.1/124.9 MB 262.6 kB/s eta 0:07:56
   ---------------------------------------- 0.1/124.9 MB 262.6 kB/s eta 0:07:56
   ---------------------------------------- 0.1/124.9 MB 355.0 kB/s eta 0:05:52
   ----

In [11]:
# !pip install lightgbm

Collecting lightgbm
  Downloading lightgbm-4.5.0-py3-none-win_amd64.whl.metadata (17 kB)
Downloading lightgbm-4.5.0-py3-none-win_amd64.whl (1.4 MB)
   ---------------------------------------- 0.0/1.4 MB ? eta -:--:--
   - -------------------------------------- 0.0/1.4 MB 960.0 kB/s eta 0:00:02
   --- ------------------------------------ 0.1/1.4 MB 1.2 MB/s eta 0:00:02
   ----- ---------------------------------- 0.2/1.4 MB 1.3 MB/s eta 0:00:01
   ------- -------------------------------- 0.3/1.4 MB 1.5 MB/s eta 0:00:01
   ----------- ---------------------------- 0.4/1.4 MB 1.9 MB/s eta 0:00:01
   --------------- ------------------------ 0.6/1.4 MB 1.9 MB/s eta 0:00:01
   ---------------- ----------------------- 0.6/1.4 MB 2.0 MB/s eta 0:00:01
   ----------------- ---------------------- 0.6/1.4 MB 1.6 MB/s eta 0:00:01
   ----------------- ---------------------- 0.6/1.4 MB 1.6 MB/s eta 0:00:01
   ------------------ --------------------- 0.7/1.4 MB 1.4 MB/s eta 0:00:01
   ------------------



In [None]:
import pandas as pd
import numpy as np
import optuna
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold,cross_val_score,train_test_split
from sklearn.preprocessing import StandardScaler,OrdinalEncoder,OneHotEncoder
from sklearn.decomposition import PCA
import category_encoders as ce

from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [None]:
df = pd.read_csv("surat_mvi.csv")

df.shape

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10742 entries, 0 to 10741
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   location         10742 non-null  object 
 1   bhk              10742 non-null  int64  
 2   built_up_area    10742 non-null  int64  
 3   transaction      10742 non-null  object 
 4   status           10742 non-null  object 
 5   totalfloor       10742 non-null  int64  
 6   furnishing       10742 non-null  object 
 7   facing           10742 non-null  object 
 8   bathroom         10742 non-null  int64  
 9   floor_category   10742 non-null  object 
 10  luxury_category  10742 non-null  object 
 11  price            10742 non-null  float64
dtypes: float64(1), int64(4), object(7)
memory usage: 1007.2+ KB


In [None]:
df.duplicated().sum()

0

In [None]:
df['furnishing'].value_counts()

furnishing
Unfurnished       8162
Furnished         1417
Semi-Furnished    1163
Name: count, dtype: int64

In [None]:
X = df.drop(columns=['price'])
y = df['price']

In [None]:
y_transformed = np.log1p(y)

In [None]:
numerical_columns = X.select_dtypes(include=['int','float']).columns
categorical_columns = X.select_dtypes(include=['object']).columns

## Ordinal Encoding
- Best for Tree Based Model

In [None]:
preprocessor_oe = ColumnTransformer(
    transformers=[
        ('num_scale',StandardScaler(),numerical_columns),
        ('cat_encoding',OrdinalEncoder(),categorical_columns)
    ],
    remainder='passthrough'
)

In [None]:
pipeline = Pipeline([
    ('preprocessor_oe',preprocessor_oe),
    ('regressor',LinearRegression())
])

In [None]:
# K-Fold cross-validation
kfold = KFold(n_splits=10,shuffle=True, random_state=42)
scores = cross_val_score(
    pipeline,
    X,
    y_transformed,
    cv=kfold,
    scoring='r2'
)

In [None]:
round(scores.mean()*100),round(scores.std(),5)

(81, 0.01227)

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y_transformed,test_size=0.2, random_state=42)

In [None]:
pipeline.fit(X_train,y_train)

In [None]:
y_pred = pipeline.predict(X_test)

y_pred = np.expm1(y_pred)

In [None]:
mean_absolute_error(y_test,y_pred)

0.6855523880086613

## Making General Class For Different Models

In [None]:
class Scorer:
    def __init__(self,model_name,preprocessor,model):
        self.model_name = model_name
        self.preprocessor = preprocessor
        self.model = model

    def get_score(self,X,y):
        output = []

        output.append(self.model_name)

        pipeline = Pipeline([
            ('preprocessor',self.preprocessor),
            ('regressor',self.model)
        ])

        # kfold cross-validation
        kfold = KFold(n_splits=10,shuffle=True, random_state=42)
        scores = cross_val_score(
            pipeline,
            X,
            y_transformed,
            cv=kfold,
            scoring='r2'
        )

        output.append(scores.mean())

        X_train,X_test,y_train,y_test = train_test_split(X,y_transformed,test_size=0.2, random_state=42)
        pipeline.fit(X_train,y_train)

        y_pred = pipeline.predict(X_test)

        y_pred = np.expm1(y_pred)

        output.append(mean_absolute_error(np.expm1(y_test),y_pred))

        print(f"{self.model_name} is completed!!\n")
        return output

In [None]:
model_dict = {
    "Linear_Regression" : LinearRegression(),
    "Ridge" : Ridge(),
    "Lasso" : Lasso(),
    "SVR" : SVR(),
    "Decision_Tree" : DecisionTreeRegressor(),
    "RandomForest" : RandomForestRegressor(),
    "ExtraTree" : ExtraTreesRegressor(),
    "Adaboost" : AdaBoostRegressor(),
    "GradientBoosting" : GradientBoostingRegressor(),
    "MLPRegressor" : MLPRegressor(),
    "xgboost" : XGBRegressor(),
    "LightGBM" : LGBMRegressor()
}

In [None]:
predictions = []
for model_name,model in model_dict.items():
    predictions.append(
        Scorer(preprocessor=preprocessor_oe,
            model_name=model_name,
               model=model).get_score(X,y_transformed))

Linear_Regression is completed!!

Ridge is completed!!

Lasso is completed!!

SVR is completed!!

Decision_Tree is completed!!

RandomForest is completed!!

ExtraTree is completed!!

Adaboost is completed!!

GradientBoosting is completed!!

MLPRegressor is completed!!

xgboost is completed!!

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000818 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 340
[LightGBM] [Info] Number of data points in the train set: 9667, number of used features: 11
[LightGBM] [Info] Start training from score 0.864633
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000293 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 341
[LightGBM] [Info] Number of data points in 

### Ordinal Encoding DataFrame of Prediction

In [None]:
model_df = pd.DataFrame(predictions,columns=['Model_name','cv_scores','mae'])

In [None]:
model_df.sort_values(by=['cv_scores','mae'],ascending=False)

Unnamed: 0,Model_name,cv_scores,mae
5,RandomForest,0.923413,0.229332
6,ExtraTree,0.921128,0.231477
10,xgboost,0.92017,0.242021
11,LightGBM,0.914809,0.261329
8,GradientBoosting,0.890375,0.299005
4,Decision_Tree,0.867859,0.295057
9,MLPRegressor,0.853307,0.35072
3,SVR,0.848382,0.372005
7,Adaboost,0.828301,0.390336
1,Ridge,0.811657,0.416303


## One Hot Encoding

In [None]:
preprocessor_ohe = ColumnTransformer(
    transformers=[
        ('num_scale',StandardScaler(),numerical_columns),
        ('cat_encoding',OneHotEncoder(drop='first',sparse_output=False),categorical_columns)
    ],
    remainder='passthrough'
)

In [None]:
preprocessor_ohe.fit_transform(X).shape

(10742, 47)

In [None]:
# also we have to apply PCA

In [None]:
class Scorer:
    def __init__(self,model_name,preprocessor,model):
        self.model_name = model_name
        self.preprocessor = preprocessor
        self.model = model

    def get_score(self,X,y):
        output = []

        output.append(self.model_name)

        pipeline = Pipeline([
            ('preprocessor',self.preprocessor),
            ('pca',PCA(n_components=0.95)),
            ('regressor',self.model)
        ])

        # kfold cross-validation
        kfold = KFold(n_splits=10,shuffle=True, random_state=42)
        scores = cross_val_score(
            pipeline,
            X,
            y_transformed,
            cv=kfold,
            scoring='r2'
        )

        output.append(scores.mean())

        X_train,X_test,y_train,y_test = train_test_split(X,y_transformed,test_size=0.2, random_state=42)
        pipeline.fit(X_train,y_train)

        y_pred = pipeline.predict(X_test)

        y_pred = np.expm1(y_pred)

        output.append(mean_absolute_error(np.expm1(y_test),y_pred))

        print(f"{self.model_name} is completed!!\n")
        return output

In [None]:
model_dict = {
    "Linear_Regression" : LinearRegression(),
    "Ridge" : Ridge(),
    "Lasso" : Lasso(),
    "SVR" : SVR(),
    "Decision_Tree" : DecisionTreeRegressor(),
    "RandomForest" : RandomForestRegressor(),
    "ExtraTree" : ExtraTreesRegressor(),
    "Adaboost" : AdaBoostRegressor(),
    "GradientBoosting" : GradientBoostingRegressor(),
    "MLPRegressor" : MLPRegressor(),
    "xgboost" : XGBRegressor(),
    "LightGBM" : LGBMRegressor()
}

In [None]:
predictions = []
for model_name,model in model_dict.items():
    predictions.append(
        Scorer(preprocessor=preprocessor_ohe,
            model_name=model_name,
               model=model).get_score(X,y_transformed))

Linear_Regression is completed!!

Ridge is completed!!

Lasso is completed!!

SVR is completed!!

Decision_Tree is completed!!

RandomForest is completed!!

ExtraTree is completed!!

Adaboost is completed!!

GradientBoosting is completed!!

MLPRegressor is completed!!

xgboost is completed!!

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000629 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4590
[LightGBM] [Info] Number of data points in the train set: 9667, number of used features: 18
[LightGBM] [Info] Start training from score 0.864633
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000368 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4590
[LightGBM] [Info] Number of data points in the train set: 9667, number of used features: 18
[LightGBM] [Info] Start training from score 0.866514
[LightGBM] [Info] Auto-c

### One Hot Encoding DataFrame of Prediction

In [None]:
model_df = pd.DataFrame(predictions,columns=['Model_name','cv_scores','mae'])

In [None]:
model_df.sort_values(by=['cv_scores','mae'],ascending=False)

Unnamed: 0,Model_name,cv_scores,mae
6,ExtraTree,0.905684,0.25892
5,RandomForest,0.902984,0.270478
10,xgboost,0.897571,0.275552
11,LightGBM,0.897566,0.28927
3,SVR,0.881175,0.318371
9,MLPRegressor,0.873696,0.331765
8,GradientBoosting,0.872595,0.339132
1,Ridge,0.825456,0.394827
0,Linear_Regression,0.825455,0.394816
4,Decision_Tree,0.821292,0.3398


## Target Encoder

In [None]:
preprocessor_te = ColumnTransformer([
    ('num',StandardScaler(),numerical_columns),
    ('ohe',OneHotEncoder(sparse_output=False,drop='first'), ['transaction', 'status', 'furnishing', 'facing','floor_category', 'luxury_category']),
    ('target_enc',ce.TargetEncoder(),['location'])
],remainder='passthrough')

In [None]:
preprocessor_te1 = ColumnTransformer([
    ('num',StandardScaler(),numerical_columns),
    ('ohe',OneHotEncoder(sparse_output=False,drop='first'), ['transaction', 'status', 'furnishing','floor_category', 'luxury_category']),
    ('target_enc',ce.TargetEncoder(),['location','facing'])
],remainder='passthrough')

In [None]:
pipeline = Pipeline([
    ('preprocessing',preprocessor_te),
    ('regressor',LinearRegression())
])

In [None]:
# K-Fold cross-validation
kfold = KFold(n_splits=10,shuffle=True, random_state=42)
scores = cross_val_score(
    pipeline,
    X,
    y_transformed,
    cv=kfold,
    scoring='r2'
)

In [None]:
round(scores.mean()*100),round(scores.std(),5)

(84, np.float64(0.01409))

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y_transformed,test_size=0.2, random_state=42)

In [None]:
pipeline.fit(X_train,y_train)

In [None]:
y_pred = pipeline.predict(X_test)

y_pred = np.expm1(y_pred)

In [None]:
mean_absolute_error(y_test,y_pred)

np.float64(0.6974665765723467)

In [None]:
class Scorer:
    def __init__(self,model_name,preprocessor,model):
        self.model_name = model_name
        self.preprocessor = preprocessor
        self.model = model

    def get_score(self,X,y):
        output = []

        output.append(self.model_name)

        pipeline = Pipeline([
            ('preprocessor',self.preprocessor),
            ('regressor',self.model)
        ])

        # kfold cross-validation
        kfold = KFold(n_splits=10,shuffle=True, random_state=42)
        scores = cross_val_score(
            pipeline,
            X,
            y_transformed,
            cv=kfold,
            scoring='r2'
        )

        output.append(scores.mean())

        X_train,X_test,y_train,y_test = train_test_split(X,y_transformed,test_size=0.2, random_state=42)
        pipeline.fit(X_train,y_train)

        y_pred = pipeline.predict(X_test)

        y_pred = np.expm1(y_pred)

        output.append(mean_absolute_error(np.expm1(y_test),y_pred))

        print(f"{self.model_name} is completed!!\n")
        return output

In [None]:
model_dict = {
    "Linear_Regression" : LinearRegression(),
    "Ridge" : Ridge(),
    "Lasso" : Lasso(),
    "SVR" : SVR(),
    "Decision_Tree" : DecisionTreeRegressor(),
    "RandomForest" : RandomForestRegressor(),
    "ExtraTree" : ExtraTreesRegressor(),
    "Adaboost" : AdaBoostRegressor(),
    "GradientBoosting" : GradientBoostingRegressor(),
    "MLPRegressor" : MLPRegressor(),
    "xgboost" : XGBRegressor(),
    "LightGBM" : LGBMRegressor()
}

In [None]:
predictions = []
for model_name,model in model_dict.items():
    predictions.append(
        Scorer(preprocessor=preprocessor_te,
            model_name=model_name,
               model=model).get_score(X,y_transformed))

Linear_Regression is completed!!

Ridge is completed!!

Lasso is completed!!

SVR is completed!!

Decision_Tree is completed!!

RandomForest is completed!!

ExtraTree is completed!!

Adaboost is completed!!

GradientBoosting is completed!!

MLPRegressor is completed!!

xgboost is completed!!

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000363 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 351
[LightGBM] [Info] Number of data points in the train set: 9667, number of used features: 21
[LightGBM] [Info] Start training from score 0.864633
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000496 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 352
[LightGBM] [Info] Number of data points in 

### Target Encoder Dataframe

In [None]:
model_df = pd.DataFrame(predictions,columns=['Model_name','cv_scores','mae'])

In [None]:
model_df.sort_values(by=['cv_scores','mae'],ascending=False)

Unnamed: 0,Model_name,cv_scores,mae
5,RandomForest,0.924405,0.224718
10,xgboost,0.921144,0.24068
6,ExtraTree,0.92023,0.232153
11,LightGBM,0.916466,0.255216
8,GradientBoosting,0.892507,0.294958
3,SVR,0.873327,0.326771
4,Decision_Tree,0.872126,0.289288
9,MLPRegressor,0.868766,0.332459
1,Ridge,0.838833,0.378784
0,Linear_Regression,0.838822,0.378915


In [None]:
predictions = []
for model_name,model in model_dict.items():
    predictions.append(
        Scorer(preprocessor=preprocessor_te1,
            model_name=model_name,
               model=model).get_score(X,y_transformed))

Linear_Regression is completed!!

Ridge is completed!!

Lasso is completed!!

SVR is completed!!

Decision_Tree is completed!!

RandomForest is completed!!

ExtraTree is completed!!

Adaboost is completed!!

GradientBoosting is completed!!

MLPRegressor is completed!!

xgboost is completed!!

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000314 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 346
[LightGBM] [Info] Number of data points in the train set: 9667, number of used features: 15
[LightGBM] [Info] Start training from score 0.864633
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000441 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 347
[LightGBM] [Info] Number of data points in 

### Target Encoder DataFrame of Prediction

In [None]:
model_df = pd.DataFrame(predictions,columns=['Model_name','cv_scores','mae'])

In [None]:
model_df.sort_values(by=['cv_scores','mae'],ascending=False)

Unnamed: 0,Model_name,cv_scores,mae
5,RandomForest,0.924552,0.223523
10,xgboost,0.921917,0.240593
6,ExtraTree,0.920616,0.227544
11,LightGBM,0.916685,0.254408
8,GradientBoosting,0.892543,0.295885
3,SVR,0.873858,0.329232
4,Decision_Tree,0.872306,0.29155
9,MLPRegressor,0.87094,0.332831
1,Ridge,0.838985,0.378681
0,Linear_Regression,0.838977,0.378805


In [None]:
preprocessor_te1.get_feature_names_out()

array(['num__bhk', 'num__built_up_area', 'num__totalfloor',
       'num__bathroom', 'ohe__transaction_Resale',
       'ohe__status_Ready to Move', 'ohe__status_Under Construction',
       'ohe__furnishing_Semi-Furnished', 'ohe__furnishing_Unfurnished',
       'ohe__floor_category_Low Floor', 'ohe__floor_category_Mid Floor',
       'ohe__luxury_category_Low', 'ohe__luxury_category_Medium',
       'target_enc__location', 'target_enc__facing'], dtype=object)

# **Hyperparameter Tuning**

## Use Algorithms based on analysis :
- RandomForest Regressor
- xgboost
- ExtraTree
- LightGBMRegressor

### Apply Preprocessing on cols :
- we apply target encoder on -> location, facing
- one hot encoding on -> transaction, status, furnishing, luxury_category, floor_category

In [None]:
# we apply target encoder on -> location, facing
# one hot encoding on -> transaction, status, furnishing, luxury_category, floor_category

In [None]:
# Importing necessary libraries
import optuna
from optuna.visualization import plot_contour,plot_slice,plot_optimization_history,plot_timeline

In [None]:
preprocessor_te1 = ColumnTransformer([
    ('num',StandardScaler(),numerical_columns),
    ('ohe',OneHotEncoder(sparse_output=False,drop='first'), ['transaction', 'status', 'furnishing','floor_category', 'luxury_category']),
    ('target_enc',ce.TargetEncoder(),['location','facing'])
],remainder='passthrough')

pipeline = Pipeline([
    ('preprocessing',preprocessor_te1),
    ('regressor',LinearRegression())
])

In [None]:
class Scorer:
    def __init__(self,model_name,preprocessor,model):
        self.model_name = model_name
        self.preprocessor = preprocessor
        self.model = model

    def get_score(self,X,y):
        output = []

        output.append(self.model_name)

        pipeline = Pipeline([
            ('preprocessor',self.preprocessor),
            ('regressor',self.model)
        ])

        # kfold cross-validation
        kfold = KFold(n_splits=10,shuffle=True, random_state=42)
        scores = cross_val_score(
            pipeline,
            X,
            y_transformed,
            cv=kfold,
            scoring='r2'
        )

        output.append(scores.mean())

        X_train,X_test,y_train,y_test = train_test_split(X,y_transformed,test_size=0.2, random_state=42)
        pipeline.fit(X_train,y_train)

        y_pred = pipeline.predict(X_test)

        y_pred = np.expm1(y_pred)

        output.append(mean_absolute_error(np.expm1(y_test),y_pred))

        print(f"{self.model_name} is completed!!\n")
        return output

### Random Forest
- best R2 score :- 0.9256
- Mean Absolute Error :- 0.076128

In [None]:
def objective_randomforest(trial):

    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
    bootstrap = trial.suggest_categorical('bootstrap',[True,False])
    # oob_score = trial.suggest_categorical('oob_score',[True,False])
    # max_features = trial.suggest_float('max_features', 0.1, 1.0)
    # ccp_alpha = trial.suggest_float('ccp_alpha', 0.0, 0.15)

    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        bootstrap=bootstrap,
        # oob_score=oob_score,
        # max_features=max_features,
        # ccp_alpha=ccp_alpha,
        random_state=42
    )

    preprocessor_te1 = ColumnTransformer([
        ('num',StandardScaler(),numerical_columns),
        ('ohe',OneHotEncoder(sparse_output=False,drop='first'), ['transaction', 'status', 'furnishing','floor_category', 'luxury_category']),
        ('target_enc',ce.TargetEncoder(),['location','facing'])
        ],remainder='passthrough')


    pipeline_rfr = Pipeline([
        ('preprocessing',preprocessor_te1),
        ('regressor',model)
                ])

    score = cross_val_score(pipeline_rfr, X, y_transformed,
                            cv=KFold(n_splits=10, shuffle=True, random_state=42),
                            scoring='r2',n_jobs=-1).mean()
    return score

In [None]:
study = optuna.create_study(direction='maximize')
study.optimize(objective_randomforest, n_trials=200)

[I 2024-10-20 17:01:24,510] A new study created in memory with name: no-name-59f6e06c-c566-4264-ae12-311c8edae39d
[I 2024-10-20 17:01:43,487] Trial 0 finished with value: 0.9211445451568665 and parameters: {'n_estimators': 364, 'max_depth': 16, 'min_samples_split': 14, 'min_samples_leaf': 2, 'bootstrap': True}. Best is trial 0 with value: 0.9211445451568665.
[I 2024-10-20 17:01:47,390] Trial 1 finished with value: 0.8325218540106271 and parameters: {'n_estimators': 301, 'max_depth': 3, 'min_samples_split': 2, 'min_samples_leaf': 9, 'bootstrap': True}. Best is trial 0 with value: 0.9211445451568665.
[I 2024-10-20 17:01:51,793] Trial 2 finished with value: 0.8650637521169117 and parameters: {'n_estimators': 186, 'max_depth': 5, 'min_samples_split': 12, 'min_samples_leaf': 6, 'bootstrap': False}. Best is trial 0 with value: 0.9211445451568665.
[I 2024-10-20 17:02:17,434] Trial 3 finished with value: 0.8958651489907139 and parameters: {'n_estimators': 430, 'max_depth': 19, 'min_samples_spl

In [None]:
best_trial = study.best_trial
print(f"Best Trial Parameters : {best_trial.params}")
print(f"Best Trial R2 score : {best_trial.value}")

Best Trial Parameters : {'n_estimators': 456, 'max_depth': 19, 'min_samples_split': 3, 'min_samples_leaf': 1, 'bootstrap': True}
Best Trial R2 score : 0.9256130028216354


In [None]:
best_trial.params

{'n_estimators': 456,
 'max_depth': 19,
 'min_samples_split': 3,
 'min_samples_leaf': 1,
 'bootstrap': True}

In [None]:
# {'n_estimators': 456,
#  'max_depth': 19,
#  'min_samples_split': 3,
#  'min_samples_leaf': 1,
#  'bootstrap': True}

best_rf_params = best_trial.params
preprocessor_te1 = ColumnTransformer([
        ('num',StandardScaler(),numerical_columns),
        ('ohe',OneHotEncoder(sparse_output=False,drop='first'), ['transaction', 'status', 'furnishing','floor_category', 'luxury_category']),
        ('target_enc',ce.TargetEncoder(),['location','facing'])
        ],remainder='passthrough')


best_rf_model = RandomForestRegressor(**best_rf_params, random_state=42)

pipeline_rfr = Pipeline([
    ('preprocessing',preprocessor_te1),
    ('regressor',best_rf_model)
            ])

score = cross_val_score(pipeline_rfr, X, y_transformed,
                        cv=KFold(n_splits=10, shuffle=True, random_state=42),
                        scoring='r2',n_jobs=-1).mean()

In [None]:
score

np.float64(0.9256130028216354)

In [None]:
best_rf_params = {'n_estimators': 456,
                'max_depth': 19,
                'min_samples_split': 3,
                'min_samples_leaf': 1,
                'bootstrap': True}
preprocessor_te1 = ColumnTransformer([
        ('num',StandardScaler(),numerical_columns),
        ('ohe',OneHotEncoder(sparse_output=False,drop='first'), ['transaction', 'status', 'furnishing','floor_category', 'luxury_category']),
        ('target_enc',ce.TargetEncoder(),['location','facing'])
        ],remainder='passthrough')


best_rf_model = RandomForestRegressor(**best_rf_params, random_state=42)

pipeline_rfr = Pipeline([
    ('preprocessing',preprocessor_te1),
    ('regressor',best_rf_model)
            ])

mae_score = cross_val_score(pipeline_rfr, X, y_transformed,
                        cv=KFold(n_splits=20, shuffle=True, random_state=42),
                        scoring='neg_mean_absolute_error',n_jobs=-1).mean()

In [None]:
negative_mae = np.float64(mae_score)
mae = -negative_mae  # Convert to positive

print(f"Mean Absolute Error: {mae}")

Mean Absolute Error: 0.07612809792010336


### XGBoost
- best R2 Score : 92.5796
- Mean Absolute Error: 0.07959
- **Latest**
    - best mae = 0.078
    - best r2 = 92.8

In [None]:
def objective_xgboost(trial):

    # Suggesting hyperparameters for XGBoost Regressor
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 3, 16)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 10)
    gamma = trial.suggest_float('gamma', 0, 10)
    reg_lambda = trial.suggest_float('lambda', 0, 10)  # L2 regularization
    reg_alpha = trial.suggest_float('alpha', 0, 10)    # L1 regularization
    learning_rate = trial.suggest_float('eta', 0.01, 0.3)  # Learning rate
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    colsample_bylevel = trial.suggest_float('colsample_bylevel', 0.5, 1.0)
    colsample_bynode = trial.suggest_float('colsample_bynode', 0.5, 1.0)
    tree_method = trial.suggest_categorical('tree_method', ['auto', 'approx', 'hist'])
    booster = trial.suggest_categorical('booster', ['gbtree', 'gblinear'])

    model = XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        gamma=gamma,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        colsample_bylevel=colsample_bylevel,
        colsample_bynode=colsample_bynode,
        tree_method=tree_method,
        booster=booster,
        random_state=42
    )

    preprocessor_te1 = ColumnTransformer([
        ('num', StandardScaler(), numerical_columns),
        ('ohe', OneHotEncoder(sparse_output=False, drop='first'), ['transaction', 'status', 'furnishing', 'floor_category', 'luxury_category']),
        ('target_enc', ce.TargetEncoder(), ['location', 'facing'])
    ], remainder='passthrough')

    pipeline_xgb = Pipeline([
        ('preprocessing', preprocessor_te1),
        ('regressor', model)
    ])

    score = cross_val_score(pipeline_xgb, X, y_transformed,
                            cv=KFold(n_splits=10, shuffle=True, random_state=42),
                            scoring='r2', n_jobs=-1).mean()

    return score

In [None]:
study = optuna.create_study(direction='maximize',
                            study_name="xgboost_ahm_data",
                            storage="sqlite:///xgboost_ahm1.db",
                            pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=30, interval_steps=10))
study.optimize(objective_xgboost, n_trials=200)

[I 2024-10-25 10:14:18,704] A new study created in RDB with name: xgboost_ahm_data
[I 2024-10-25 10:14:27,590] Trial 0 finished with value: -0.0014266266123092787 and parameters: {'n_estimators': 384, 'max_depth': 7, 'min_child_weight': 7, 'gamma': 8.19137252721567, 'lambda': 8.810123068305009, 'alpha': 9.47126043355415, 'eta': 0.1453329338986372, 'subsample': 0.7048398730198089, 'colsample_bytree': 0.6922559527071708, 'colsample_bylevel': 0.6492176506364449, 'colsample_bynode': 0.7815482990237124, 'tree_method': 'approx', 'booster': 'gblinear'}. Best is trial 0 with value: -0.0014266266123092787.
[I 2024-10-25 10:14:30,363] Trial 1 finished with value: 0.8455228185673043 and parameters: {'n_estimators': 507, 'max_depth': 14, 'min_child_weight': 8, 'gamma': 7.844029855079596, 'lambda': 6.517339728693704, 'alpha': 1.9141803280608183, 'eta': 0.25018138204670537, 'subsample': 0.759091193288022, 'colsample_bytree': 0.5556041301347907, 'colsample_bylevel': 0.6272514565411167, 'colsample_byn

In [None]:
best_trial = study.best_trial
print(f"Best Trial Parameters : {best_trial.params}")
print(f"Best Trial R2 score : {best_trial.value}")

Best Trial Parameters : {'n_estimators': 785, 'max_depth': 11, 'min_child_weight': 2, 'gamma': 0.0046586042055373885, 'lambda': 6.485762246450696, 'alpha': 0.5855286128937831, 'eta': 0.2857757568193114, 'subsample': 0.6345158078695642, 'colsample_bytree': 0.9807676183680145, 'colsample_bylevel': 0.9789475309795306, 'colsample_bynode': 0.9397540930871581, 'tree_method': 'approx', 'booster': 'gbtree'}
Best Trial R2 score : 0.9233115760219368


In [None]:
best_trial.params

{'n_estimators': 785,
 'max_depth': 11,
 'min_child_weight': 2,
 'gamma': 0.0046586042055373885,
 'lambda': 6.485762246450696,
 'alpha': 0.5855286128937831,
 'eta': 0.2857757568193114,
 'subsample': 0.6345158078695642,
 'colsample_bytree': 0.9807676183680145,
 'colsample_bylevel': 0.9789475309795306,
 'colsample_bynode': 0.9397540930871581,
 'tree_method': 'approx',
 'booster': 'gbtree'}

In [None]:
# {'n_estimators': 785,
#  'max_depth': 11,
#  'min_child_weight': 2,
#  'gamma': 0.0046586042055373885,
#  'lambda': 6.485762246450696,
#  'alpha': 0.5855286128937831,
#  'eta': 0.2857757568193114,
#  'subsample': 0.6345158078695642,
#  'colsample_bytree': 0.9807676183680145,
#  'colsample_bylevel': 0.9789475309795306,
#  'colsample_bynode': 0.9397540930871581,
#  'tree_method': 'approx',
#  'booster': 'gbtree'}

#### Test the best model

In [None]:
best_xgb_params = best_trial.params
preprocessor_te1 = ColumnTransformer([
        ('num',StandardScaler(),numerical_columns),
        ('ohe',OneHotEncoder(sparse_output=False,drop='first'), ['transaction', 'status', 'furnishing','floor_category', 'luxury_category']),
        ('target_enc',ce.TargetEncoder(),['location','facing'])
        ],remainder='passthrough')


best_rf_model = XGBRegressor(**best_xgb_params, random_state=42)

pipeline_rfr = Pipeline([
    ('preprocessing',preprocessor_te1),
    ('regressor',best_rf_model)
            ])

score = cross_val_score(pipeline_rfr, X, y_transformed,
                        cv=KFold(n_splits=20, shuffle=True, random_state=42),
                        scoring='r2',n_jobs=-1).mean()

In [None]:
score*100

92.40176553704339

#### Calculate MAE

In [None]:
best_xgb_params = {'n_estimators': 785,
 'max_depth': 11,
 'min_child_weight': 2,
 'gamma': 0.0046586042055373885,
 'lambda': 6.485762246450696,
 'alpha': 0.5855286128937831,
 'eta': 0.2857757568193114,
 'subsample': 0.6345158078695642,
 'colsample_bytree': 0.9807676183680145,
 'colsample_bylevel': 0.9789475309795306,
 'colsample_bynode': 0.9397540930871581,
 'tree_method': 'approx',
 'booster': 'gbtree'}

In [None]:
preprocessor_te1 = ColumnTransformer([
        ('num',StandardScaler(),numerical_columns),
        ('ohe',OneHotEncoder(sparse_output=False,drop='first'), ['transaction', 'status', 'furnishing','floor_category', 'luxury_category']),
        ('target_enc',ce.TargetEncoder(),['location','facing'])
        ],remainder='passthrough')


best_xgb_model = XGBRegressor(**best_xgb_params, random_state=42)

pipeline_xgb = Pipeline([
    ('preprocessing',preprocessor_te1),
    ('regressor',best_xgb_model)
            ])

negative_mae = cross_val_score(pipeline_xgb, X, y_transformed,
                        cv=KFold(n_splits=20, shuffle=True, random_state=42),
                        scoring='neg_mean_absolute_error',n_jobs=-1).mean()

In [None]:
mae = -negative_mae  # Convert to positive

print(f"Mean Absolute Error: {mae}")

Mean Absolute Error: 0.08127451044314177


In [None]:
preprocessor_te1 = ColumnTransformer([
        ('num',StandardScaler(),numerical_columns),
        ('ohe',OneHotEncoder(sparse_output=False,drop='first'), ['transaction', 'status', 'furnishing','floor_category', 'luxury_category']),
        ('target_enc',ce.TargetEncoder(),['location','facing'])
        ],remainder='passthrough')


best_xgb_model = XGBRegressor(**best_xgb_params, random_state=42)

pipeline_xgb = Pipeline([
    ('preprocessing',preprocessor_te1),
    ('regressor',best_xgb_model)
            ])

negative_mae = cross_val_score(pipeline_xgb, X, y_transformed,
                        cv=KFold(n_splits=20, shuffle=True, random_state=42),
                        scoring='r2',n_jobs=-1).mean()

In [None]:
mae = negative_mae  # Convert to positive

print(f"Mean Absolute Error: {mae}")

Mean Absolute Error: 0.9240176553704339


### Fitting using train_test_split

In [None]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test = train_test_split(X,y_transformed, test_size=0.3, random_state=42)

In [None]:
best_xgb_params

{'n_estimators': 785,
 'max_depth': 11,
 'min_child_weight': 2,
 'gamma': 0.0046586042055373885,
 'lambda': 6.485762246450696,
 'alpha': 0.5855286128937831,
 'eta': 0.2857757568193114,
 'subsample': 0.6345158078695642,
 'colsample_bytree': 0.9807676183680145,
 'colsample_bylevel': 0.9789475309795306,
 'colsample_bynode': 0.9397540930871581,
 'tree_method': 'approx',
 'booster': 'gbtree'}

In [None]:
preprocessor_te1 = ColumnTransformer([
        ('num',StandardScaler(),numerical_columns),
        ('ohe',OneHotEncoder(sparse_output=False,drop='first'), ['transaction', 'status', 'furnishing','floor_category', 'luxury_category']),
        ('target_enc',ce.TargetEncoder(),['location','facing'])
        ],remainder='passthrough')


best_xgb_model = XGBRegressor(**best_xgb_params, random_state=42)

pipeline_xgb = Pipeline([
    ('preprocessing',preprocessor_te1),
    ('regressor',best_xgb_model)
            ])

In [None]:
pipeline_xgb.fit(X_train,y_train)

In [None]:
y_pred_xgb = pipeline_xgb.predict(X_test)

In [None]:
# y_pred_xgb = np.expm1(y_pred_xgb)

In [None]:
r2_score(y_test,y_pred_xgb)

0.9189079781387007

In [None]:
mean_absolute_error(y_test,y_pred_xgb)

0.08143662791489567

In [None]:
from sklearn.metrics import r2_score

# Assuming y_test and y_pred_xgb are already defined
r2 = r2_score(y_test, y_pred_xgb)
n = len(y_test)   # Number of observations
p = X_test.shape[1]  # Number of predictors (features)

# Calculate adjusted R-squared
adjusted_r2 = 1 - ((1 - r2) * (n - 1)) / (n - p - 1)
print("R^2 Score:", r2)
print("Adjusted R^2 Score:", adjusted_r2)


R^2 Score: 0.9189079781387007
Adjusted R^2 Score: 0.9186301792472419


#### Model using mae

In [None]:
import optuna

In [None]:
def objective_xgboost(trial):

    # Suggesting hyperparameters for XGBoost Regressor
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 3, 16)
    min_child_weight = trial.suggest_int('min_child_weight', 1, 10)
    gamma = trial.suggest_float('gamma', 0, 10)
    reg_lambda = trial.suggest_float('lambda', 0, 10)  # L2 regularization
    reg_alpha = trial.suggest_float('alpha', 0, 10)    # L1 regularization
    learning_rate = trial.suggest_float('eta', 0.01, 0.3)  # Learning rate
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    colsample_bylevel = trial.suggest_float('colsample_bylevel', 0.5, 1.0)
    colsample_bynode = trial.suggest_float('colsample_bynode', 0.5, 1.0)
    tree_method = trial.suggest_categorical('tree_method', ['auto', 'approx', 'hist'])
    booster = trial.suggest_categorical('booster', ['gbtree', 'gblinear'])

    model = XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_child_weight=min_child_weight,
        gamma=gamma,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        colsample_bylevel=colsample_bylevel,
        colsample_bynode=colsample_bynode,
        tree_method=tree_method,
        booster=booster,
        random_state=42
    )

    preprocessor_te1 = ColumnTransformer([
        ('num', StandardScaler(), numerical_columns),
        ('ohe', OneHotEncoder(sparse_output=False, drop='first'), ['transaction', 'status', 'furnishing', 'floor_category', 'luxury_category']),
        ('target_enc', ce.TargetEncoder(), ['location', 'facing'])
    ], remainder='passthrough')

    pipeline_xgb = Pipeline([
        ('preprocessing', preprocessor_te1),
        ('regressor', model)
    ])

    score = cross_val_score(pipeline_xgb, X, y_transformed,
                            cv=KFold(n_splits=10, shuffle=True, random_state=42),
                            scoring='neg_mean_absolute_error', n_jobs=-1).mean()

    return -score

In [None]:
study = optuna.create_study(direction='minimize',
                            study_name="xgboost_ahm1",
                            storage="sqlite:///xgboost_ahm_mae.db",
                            pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=30, interval_steps=10))
study.optimize(objective_xgboost, n_trials=200)

[I 2024-10-21 09:26:09,636] A new study created in RDB with name: xgboost_ahm1
[I 2024-10-21 09:26:18,159] Trial 0 finished with value: 0.12053555766012265 and parameters: {'n_estimators': 266, 'max_depth': 15, 'min_child_weight': 9, 'gamma': 2.6782756335475932, 'lambda': 7.287798021972769, 'alpha': 7.257319238262777, 'eta': 0.11283855820085952, 'subsample': 0.8092030685473937, 'colsample_bytree': 0.6593604629304753, 'colsample_bylevel': 0.6524289472127753, 'colsample_bynode': 0.7655343655237274, 'tree_method': 'approx', 'booster': 'gbtree'}. Best is trial 0 with value: 0.12053555766012265.
[I 2024-10-21 09:26:21,071] Trial 1 finished with value: 0.12923420926039436 and parameters: {'n_estimators': 705, 'max_depth': 7, 'min_child_weight': 1, 'gamma': 5.327525192005895, 'lambda': 0.9773185982756039, 'alpha': 0.6765067992401508, 'eta': 0.2484270623592961, 'subsample': 0.5317633875455285, 'colsample_bytree': 0.7325173615119511, 'colsample_bylevel': 0.6696584407078027, 'colsample_bynode': 

In [None]:
best_trial = study.best_trial
print(f"Best Trial Parameters : {best_trial.params}")
print(f"Best Trial MAE score : {best_trial.value}")

Best Trial Parameters : {'n_estimators': 575, 'max_depth': 15, 'min_child_weight': 7, 'gamma': 0.0020556862993042183, 'lambda': 6.440232878451855, 'alpha': 0.02954612701677145, 'eta': 0.06709360947516087, 'subsample': 0.7998254835934505, 'colsample_bytree': 0.9155578919010959, 'colsample_bylevel': 0.9218105551128289, 'colsample_bynode': 0.6944244865247082, 'tree_method': 'hist', 'booster': 'gbtree'}
Best Trial MAE score : 0.07856465894143608


In [None]:
best_trial.params

{'n_estimators': 575,
 'max_depth': 15,
 'min_child_weight': 7,
 'gamma': 0.0020556862993042183,
 'lambda': 6.440232878451855,
 'alpha': 0.02954612701677145,
 'eta': 0.06709360947516087,
 'subsample': 0.7998254835934505,
 'colsample_bytree': 0.9155578919010959,
 'colsample_bylevel': 0.9218105551128289,
 'colsample_bynode': 0.6944244865247082,
 'tree_method': 'hist',
 'booster': 'gbtree'}

In [None]:
best_mae_xgb_params = {'n_estimators': 575,
                'max_depth': 15,
                'min_child_weight': 7,
                'gamma': 0.0020556862993042183,
                'lambda': 6.440232878451855,
                'alpha': 0.02954612701677145,
                'eta': 0.06709360947516087,
                'subsample': 0.7998254835934505,
                'colsample_bytree': 0.9155578919010959,
                'colsample_bylevel': 0.9218105551128289,
                'colsample_bynode': 0.6944244865247082,
                'tree_method': 'hist',
                'booster': 'gbtree'}

In [None]:
preprocessor_te1 = ColumnTransformer([
        ('num',StandardScaler(),numerical_columns),
        ('ohe',OneHotEncoder(sparse_output=False,drop='first'), ['transaction', 'status', 'furnishing','floor_category', 'luxury_category']),
        ('target_enc',ce.TargetEncoder(),['location','facing'])
        ],remainder='passthrough')


best_xgb_model = XGBRegressor(**best_mae_xgb_params,random_state=42)

pipeline_xgb = Pipeline([
    ('preprocessing',preprocessor_te1),
    ('regressor',best_xgb_model)
            ])

score = cross_val_score(pipeline_xgb, X, y_transformed,
                        cv=KFold(n_splits=20, shuffle=True, random_state=42),
                        scoring='r2',n_jobs=-1, verbose=2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 out of  20 | elapsed:   16.3s remaining:    4.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   17.5s finished


In [None]:
print(f"r2 score: {score}")

r2 score: 0.9281770833539035


## Exporting XGB Model

In [None]:
# import pickle
# # Save the pipeline model
# with open('pipeline_xgb.pkl', 'wb') as model_file:
#     pickle.dump(pipeline_xgb, model_file)

# # # Save the dataset
# # with open('data.pkl', 'wb') as data_file:
# #     pickle.dump(df, data_file)


In [None]:
import joblib

# Save the pipeline model using joblib
joblib.dump(pipeline_xgb, 'pipeline_xgb.joblib')

# Save the dataset
joblib.dump(df, 'data.joblib')


['data.joblib']

## LightGBM
- best R2 Score : 91.99 ~ 92
- Mean Absolute Error: 0.079599

In [None]:
import lightgbm as lgb
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold
import category_encoders as ce  # Ensure you have this installed
import optuna

def objective_lightgbm(trial):

    # Suggesting hyperparameters for LightGBM Regressor
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', -1, 16)  # Use -1 for no limit
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3)
    num_leaves = trial.suggest_int('num_leaves', 20, 150)
    min_child_samples = trial.suggest_int('min_child_samples', 1, 100)
    lambda_l1 = trial.suggest_float('lambda_l1', 0, 10)  # L1 regularization
    lambda_l2 = trial.suggest_float('lambda_l2', 0, 10)  # L2 regularization
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    colsample_bytree = trial.suggest_float('colsample_bytree', 0.5, 1.0)
    boosting_type = trial.suggest_categorical('boosting_type', ['gbdt', 'dart', 'goss'])

    model = lgb.LGBMRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        num_leaves=num_leaves,
        min_child_samples=min_child_samples,
        lambda_l1=lambda_l1,
        lambda_l2=lambda_l2,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        boosting_type=boosting_type,
        n_jobs=-1,
        random_state=42
    )

    preprocessor_te1 = ColumnTransformer([
        ('num', StandardScaler(), numerical_columns),
        ('ohe', OneHotEncoder(sparse_output=False, drop='first'), ['transaction', 'status', 'furnishing', 'floor_category', 'luxury_category']),
        ('target_enc', ce.TargetEncoder(), ['location', 'facing'])
    ], remainder='passthrough')

    pipeline_lgb = Pipeline([
        ('preprocessing', preprocessor_te1),
        ('regressor', model)
    ])

    score = cross_val_score(pipeline_lgb, X, y_transformed,
                             cv=KFold(n_splits=10, shuffle=True, random_state=42),
                             scoring='r2', n_jobs=-1).mean()

    return score


In [None]:
study = optuna.create_study(direction='maximize',
                            study_name="lightgbm_ahm_data",
                            storage="sqlite:///xgboost_ahm1.db",
                            pruner=optuna.pruners.MedianPruner(n_startup_trials=5, n_warmup_steps=30, interval_steps=10))
study.optimize(objective_lightgbm, n_trials=200)

[I 2024-10-20 19:16:46,399] A new study created in RDB with name: lightgbm_ahm_data
[I 2024-10-20 19:16:53,369] Trial 0 finished with value: 0.9047477960732593 and parameters: {'n_estimators': 853, 'max_depth': -1, 'learning_rate': 0.27375430831357084, 'num_leaves': 35, 'min_child_samples': 77, 'lambda_l1': 4.326765485538834, 'lambda_l2': 4.2747979170992965, 'subsample': 0.822987669626181, 'colsample_bytree': 0.8957686741234059, 'boosting_type': 'gbdt'}. Best is trial 0 with value: 0.9047477960732593.
[I 2024-10-20 19:16:57,910] Trial 1 finished with value: 0.907383315923223 and parameters: {'n_estimators': 896, 'max_depth': -1, 'learning_rate': 0.24293742480682706, 'num_leaves': 104, 'min_child_samples': 66, 'lambda_l1': 8.526843523658966, 'lambda_l2': 8.10771658473438, 'subsample': 0.9604613270765708, 'colsample_bytree': 0.8946917525853519, 'boosting_type': 'goss'}. Best is trial 1 with value: 0.907383315923223.
[I 2024-10-20 19:16:58,626] Trial 2 finished with value: 0.8347432000290

In [None]:
best_trial = study.best_trial
print(f"Best Trial Parameters : {best_trial.params}")
print(f"Best Trial R2 score : {best_trial.value}")

Best Trial Parameters : {'n_estimators': 944, 'max_depth': 16, 'learning_rate': 0.15969675160521957, 'num_leaves': 146, 'min_child_samples': 3, 'lambda_l1': 0.14068614220929881, 'lambda_l2': 1.9506001541091766, 'subsample': 0.9095898400124978, 'colsample_bytree': 0.874476763926304, 'boosting_type': 'dart'}
Best Trial R2 score : 0.9279166859011673


In [None]:
best_trial.params

{'n_estimators': 944,
 'max_depth': 16,
 'learning_rate': 0.15969675160521957,
 'num_leaves': 146,
 'min_child_samples': 3,
 'lambda_l1': 0.14068614220929881,
 'lambda_l2': 1.9506001541091766,
 'subsample': 0.9095898400124978,
 'colsample_bytree': 0.874476763926304,
 'boosting_type': 'dart'}

In [None]:
# {'n_estimators': 634,
#  'max_depth': 12,
#  'min_child_weight': 8,
#  'gamma': 0.0002861183040955631,
#  'lambda': 0.9428763463676353,
#  'alpha': 0.5593671105711346,
#  'eta': 0.1865835656991152,
#  'subsample': 0.986071911153697,
#  'colsample_bytree': 0.630717240360652,
#  'colsample_bylevel': 0.9648043451900892,
#  'colsample_bynode': 0.7530870818841173,
#  'tree_method': 'approx',
#  'booster': 'gbtree'}

In [None]:
best_lgb_params = best_trial.params
preprocessor_te1 = ColumnTransformer([
        ('num',StandardScaler(),numerical_columns),
        ('ohe',OneHotEncoder(sparse_output=False,drop='first'), ['transaction', 'status', 'furnishing','floor_category', 'luxury_category']),
        ('target_enc',ce.TargetEncoder(),['location','facing'])
        ],remainder='passthrough')


best_lgb_model = XGBRegressor(**best_lgb_params, random_state=42)

pipeline_lgb = Pipeline([
    ('preprocessing',preprocessor_te1),
    ('regressor',best_lgb_model)
            ])

score = cross_val_score(pipeline_lgb, X, y_transformed,
                        cv=KFold(n_splits=20, shuffle=True, random_state=42),
                        scoring='r2',n_jobs=-1).mean()

In [None]:
score*100

np.float64(91.99098056273692)

#### Calculate MAE

In [None]:
best_lgb_params = {'n_estimators': 634,
 'max_depth': 12,
 'min_child_weight': 8,
 'gamma': 0.0002861183040955631,
 'lambda': 0.9428763463676353,
 'alpha': 0.5593671105711346,
 'eta': 0.1865835656991152,
 'subsample': 0.986071911153697,
 'colsample_bytree': 0.630717240360652,
 'colsample_bylevel': 0.9648043451900892,
 'colsample_bynode': 0.7530870818841173,
 'tree_method': 'approx',
 'booster': 'gbtree'}

In [None]:
preprocessor_te1 = ColumnTransformer([
        ('num',StandardScaler(),numerical_columns),
        ('ohe',OneHotEncoder(sparse_output=False,drop='first'), ['transaction', 'status', 'furnishing','floor_category', 'luxury_category']),
        ('target_enc',ce.TargetEncoder(),['location','facing'])
        ],remainder='passthrough')


best_lgb_model = XGBRegressor(**best_lgb_params, random_state=42)

pipeline_lgb = Pipeline([
    ('preprocessing',preprocessor_te1),
    ('regressor',best_lgb_model)
            ])

score = cross_val_score(pipeline_lgb, X, y_transformed,
                        cv=KFold(n_splits=20, shuffle=True, random_state=42),
                        scoring='neg_mean_absolute_error',n_jobs=-1).mean()

In [None]:
mae = -negative_mae  # Convert to positive

print(f"Mean Absolute Error: {mae}")

Mean Absolute Error: 0.07959940695493628


## Model Exporting

#### Based on Analysis see that RandomForest and XGBoost gives best r2 score with low Mean Absolute Error

#### Model : XGBOOST
- we use the model XGBOOST

In [None]:
import pickle

# pipeline_rfr

In [None]:
# Save the model (pipeline with preprocessing and regressor)
with open('pipeline_xgb.pkl', 'wb') as model_file:
    pickle.dump(pipeline_xgb, model_file)

print("Model/Pipeline saved successfully!")

Model/Pipeline saved successfully!


#### Exporting Dataset

In [None]:
# Save the dataset
with open('data.pkl', 'wb') as dataset_file:
    pickle.dump(df, dataset_file)

print("Dataset saved successfully!")

Dataset saved successfully!


In [None]:
df.shape

(10742, 12)

In [None]:
df.head()

Unnamed: 0,location,bhk,built_up_area,transaction,status,totalfloor,furnishing,facing,bathroom,floor_category,luxury_category,price
0,gota/jagatpur/newranip/sola,1,407,New Property,Ready to Move,14,Unfurnished,North - East,1,High Floor,Low,0.29
1,shantipura circle/sanathal,1,717,New Property,Under Construction,14,Unfurnished,East,1,Mid Floor,Low,0.28
2,gota/jagatpur/newranip/sola,1,502,New Property,Under Construction,10,Unfurnished,East,1,Mid Floor,Low,0.29
3,shantipura circle/sanathal,1,441,New Property,Under Construction,14,Unfurnished,East,1,Mid Floor,Low,0.3
4,vastral/sardar patel ring rd/odhav,1,715,Resale,Ready to Move,5,Unfurnished,North - East,1,Low Floor,Low,0.24
