In [57]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,r2_score
from sklearn.decomposition import PCA

In [58]:
from sklearn.linear_model import Ridge,Lasso

In [59]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor, AdaBoostRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.neural_network import MLPRegressor

In [60]:
#!pip install xgboost later i will do it 

In [61]:
#from xgboost import XGBRegressor

In [62]:
df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv')

In [63]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 28,4.5,4,4,0,Old Property,3240.0,0,0,unfurnished,low,mid floor
1,flat,sector 66,2.8,3,3,2,New Property,1950.0,1,0,furnished,medium,high floor
2,flat,sector 104,1.9,3,3,3,Under Construction,1365.0,0,0,semifurnished,medium,high floor
3,house,sector 50,10.56,5,5,3,Moderately Old,3240.0,1,0,unfurnished,low,mid floor
4,house,sector 11,2.2,4,3,2,Old Property,1800.0,0,0,unfurnished,low,low floor


In [64]:
# df.furnishing_type.value_counts()

In [65]:
# # 0 -> unfurnished
# # 1 -> semifurnished
# # 2 -> furnished
# df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [66]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 28,4.5,4,4,0,Old Property,3240.0,0,0,unfurnished,low,mid floor
1,flat,sector 66,2.8,3,3,2,New Property,1950.0,1,0,furnished,medium,high floor
2,flat,sector 104,1.9,3,3,3,Under Construction,1365.0,0,0,semifurnished,medium,high floor
3,house,sector 50,10.56,5,5,3,Moderately Old,3240.0,1,0,unfurnished,low,mid floor
4,house,sector 11,2.2,4,3,2,Old Property,1800.0,0,0,unfurnished,low,low floor


In [67]:
X = df.drop(columns=['price'])
y = df['price']

In [68]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

In [69]:
cat_cols=['property_type','sector','balcony','agePossession','furnishing_type','luxury_category','floor_category']
num_cols = ['bedRoom','bathroom','built_up_area','servant room','store room']

ord_cat_cols=['property_type' ,'balcony','luxury_category','floor_category']
ohe_cat_cols=['sector','agePossession','furnishing_type']

tar_cat_cols=['property_type' ,'balcony','luxury_category','floor_category']
tar_ohe_cols= ['agePossession','furnishing_type']
tar_ce_cols=['sector']

# Ordinal Encoding

In [70]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [71]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols ),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), cat_cols )
    ], 
    remainder='passthrough'
)

In [72]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [73]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [74]:
scores.mean(),scores.std()

(0.7369801973943306, 0.021918264322027578)

In [75]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [76]:
pipeline.fit(X_train,y_train)

In [77]:
y_pred = pipeline.predict(X_test)

In [78]:
mean_absolute_error(np.expm1(y_test),y_pred)

1.4312626444073986

In [79]:
def scorer(model_name,model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # kfold cross validation 
    kfold =  KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)
    
    pipeline.fit(X_train, y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append( mean_absolute_error( np.expm1(y_test), y_pred ) )
    
    return output

In [80]:
model_dict = {
    'linear_reg': LinearRegression(),
    'svr': SVR(),
    'ridge': Ridge(),
    'LASSO': Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest': RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    #'xgboost': XGBRegressor()
}

In [54]:
model_output=[]
for model_name, model in model_dict.items():
    model_output.append( scorer(model_name,model) )

In [55]:
model_df = pd.DataFrame(model_output, columns = ['name','r2','mae'])

In [56]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.881169,0.524882
6,extra trees,0.869451,0.541809
7,gradient boosting,0.872552,0.576705
9,mlp,0.801176,0.722241
4,decision tree,0.776538,0.726677
1,svr,0.76412,0.847358
8,adaboost,0.754885,0.862643
2,ridge,0.736313,0.946339
0,linear_reg,0.73631,0.946382
3,LASSO,0.059434,1.528906


# OneHotEncoding

In [27]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    
    transformers=[
      ( 'num', StandardScaler(), num_cols ),
      ( 'cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ord_cat_cols ),
      ( 'cat1', OneHotEncoder(drop='first',handle_unknown='ignore'), ohe_cat_cols ) 
    ],
    
    remainder='passthrough'
)

In [28]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [29]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [30]:
scores.mean()

0.8559452894636912

In [31]:
scores.std()

0.02117803909630371

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [33]:
pipeline.fit(X_train,y_train)

In [34]:
y_pred = pipeline.predict(X_test)



In [35]:
y_pred = np.expm1(y_pred)

In [36]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.6440248224606867

In [37]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [38]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    #'xgboost':XGBRegressor()
}

In [39]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [40]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [41]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
6,extra trees,0.89638,0.461217
5,random forest,0.891971,0.507194
7,gradient boosting,0.878873,0.546247
9,mlp,0.870177,0.592656
0,linear_reg,0.855945,0.644025
2,ridge,0.856025,0.644987
4,decision tree,0.812161,0.665824
8,adaboost,0.763601,0.847664
1,svr,0.765691,0.860331
3,LASSO,0.05541,1.531937


# OneHotEncoding With PCA

In [42]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [43]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [44]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [45]:
scores.mean()

0.05821394586994194

In [46]:
scores.std()

0.01929366256897615

In [47]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output

In [48]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreesRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    #'xgboost':XGBRegressor()
}

In [49]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [50]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [51]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.762003,0.742494
6,extra trees,0.733268,0.762082
4,decision tree,0.691967,0.834192
7,gradient boosting,0.613076,0.986561
1,svr,0.227493,1.369005
9,mlp,0.213742,1.429993
8,adaboost,0.294787,1.470167
3,LASSO,0.055609,1.531845
2,ridge,0.058214,1.534456
0,linear_reg,0.058214,1.534456


# Target Encoder

In [52]:
#!pip install category_encoders

In [53]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# # Creating a column transformer for preprocessing
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
#         ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
#         ('cat1',OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'),['agePossession','furnishing_type']),
#         ('target_enc', ce.TargetEncoder(), ['sector'])
#     ], 
#     remainder='passthrough'
# )

In [54]:
# Creating a pipeline
# pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('regressor', LinearRegression())
# ])

In [55]:
# # K-fold cross-validation
# kfold = KFold(n_splits=10, shuffle=True, random_state=42)
# scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
# Warning: No categorical columns found. Calling 'transform' will only return input data.
# Warning: No categorical columns found. Calling 'transform' will only return input data.
# Warning: No categorical columns found. Calling 'transform' will only return input data.
# Warning: No categorical columns found. Calling 'transform' will only return input data.
# Warning: No categorical columns found. Calling 'transform' will only return input data.
# Warning: No categorical columns found. Calling 'transform' will only return input data.
# Warning: No categorical columns found. Calling 'transform' will only return input data.
# Warning: No categorical columns found. Calling 'transform' will only return input data.
# Warning: No categorical columns found. Calling 'transform' will only return input data.
# Warning: No categorical columns found. Calling 'transform' will only return input data.

In [56]:
# scores.mean(),scores.std() # TARGET encoder failed because X_train donot have y_train col so that we can convert sector to 
# # the mean of groupby of each sector we need to manual run the cross validation so that we can explicitly encode sector

# RETAKE FOR TARGET ENCODER - manually ,need to convert sector in target encoder sector 

In [57]:
# Apply the log1p transformation to the target variable
y_transformed = np.log1p(y)

# Define the Preprocessing Column Transformer
# We define the preprocessor for all columns EXCEPT the one we will target-encode ('sector')
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
#         ('cat_ohe', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
#         ('cat1',OneHotEncoder(drop='first',sparse_output=False,handle_unknown='ignore'),['agePossession','furnishing_type'])
#     ],
#     remainder='passthrough'
# )
# List of columns for Ordinal Encoding (excluding those for One-Hot)
ordinal_cols = ['property_type', 'balcony', 'luxury_category', 'floor_category']

# Corrected preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat_ohe', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False), 
         ['agePossession', 'furnishing_type']),
        ('cat_ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), ordinal_cols)
    ],
    remainder='passthrough'
)

In [58]:
# 3. Manual Cross-Validation Loop
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
r2_scores = []
mae_scores = []

# Instantiate the model you want to test
model = LinearRegression()

for train_index, test_index in kfold.split(X):
    X_train, X_test = X.iloc[train_index] , X.iloc[test_index]
    y_train, y_test = y_transformed.iloc[train_index] , y_transformed.iloc[test_index]
    
    # --- Start of Encoding for the Fold ---
    
    # a. Target Encoding for 'sector'
    # This encoder is fitted ONLY on the training data for this specific fold
    target_encoder = ce.TargetEncoder(cols='sector')
    target_encoder.fit(X_train, y_train)
    
    X_train_encoded = target_encoder.transform(X_train)
    X_test_encoded = target_encoder.transform(X_test)
    
    # b. Apply the main preprocessor to the rest of the columns
    # We fit the preprocessor ONLY on the training data for this fold
    preprocessor.fit(X_train)
    
    X_train_transformed = preprocessor.transform(X_train_encoded)
    X_test_transformed = preprocessor.transform(X_test_encoded)
    
    # --- End of Encoding for the Fold ---
    
    # Train the model on the fully transformed training data
    model.fit(X_train_transformed, y_train)
    
    # Make predictions on the transformed test data
    y_pred = model.predict(X_test_transformed)
    
    # Calculate scores and append to lists
    r2_scores.append(r2_score(y_test, y_pred))
    
    # For MAE, we use the inverse-transformed predictions
    mae_scores.append(mean_absolute_error(np.expm1(y_test), np.expm1(y_pred)))

# 4. Final Results
print(f"Model: LinearRegressor with Target Encoding")
print("-" * 50)
print(f"Average R2 Score: {np.mean(r2_scores):.4f} (± {np.std(r2_scores):.4f})")
print(f"Average MAE: {np.mean(mae_scores):.4f} (± {np.std(mae_scores):.4f})")
print()

Model: LinearRegressor with Target Encoding
--------------------------------------------------
Average R2 Score: 0.8279 (± 0.0201)
Average MAE: 0.7170 (± 0.0658)



In [59]:
output_model=[]
for model_name , model in model_dict.items():
    
    # 3. Manual Cross-Validation Loop
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    r2_scores = []
    mae_scores = []

    # Instantiate the model you want to test

    for train_index, test_index in kfold.split(X):
        X_train, X_test = X.iloc[train_index] , X.iloc[test_index]
        y_train, y_test = y_transformed.iloc[train_index] , y_transformed.iloc[test_index]

        # --- Start of Encoding for the Fold ---

        # a. Target Encoding for 'sector'
        # This encoder is fitted ONLY on the training data for this specific fold
        target_encoder = ce.TargetEncoder(cols='sector')
        target_encoder.fit(X_train, y_train)

        X_train_encoded = target_encoder.transform(X_train)
        X_test_encoded = target_encoder.transform(X_test)

        # b. Apply the main preprocessor to the rest of the columns
        # We fit the preprocessor ONLY on the training data for this fold
        preprocessor.fit(X_train)

        X_train_transformed = preprocessor.transform(X_train_encoded)
        X_test_transformed = preprocessor.transform(X_test_encoded)

        # --- End of Encoding for the Fold ---

        # Train the model on the fully transformed training data
        model.fit(X_train_transformed, y_train)

        # Make predictions on the transformed test data
        y_pred = model.predict(X_test_transformed)

        # Calculate scores and append to lists
        r2_scores.append(r2_score(y_test, y_pred))

        # For MAE, we use the inverse-transformed predictions
        mae_scores.append(mean_absolute_error(np.expm1(y_test), np.expm1(y_pred)))

    # 4. Final Results
    
#     print(f"Model: {model_name} with Target Encoding")
#     print("-" * 50)
#     print(f"Average R2 Score: {np.mean(r2_scores):.4f} (± {np.std(r2_scores):.4f})")
#     print(f"Average MAE: {np.mean(mae_scores):.4f} (± {np.std(mae_scores):.4f})")
#     print()
    output_model.append( [model_name,np.mean(r2_scores), np.mean(mae_scores)] )
model_df = pd.DataFrame(output_model,columns=['name','r2_score','MAE'])
model_df.sort_values(by=['MAE'],ascending=False)

Unnamed: 0,name,r2_score,MAE
3,LASSO,-0.003917,1.548376
8,adaboost,0.821311,0.722886
2,ridge,0.827889,0.717094
0,linear_reg,0.82786,0.716951
4,decision tree,0.80045,0.65841
9,mlp,0.848018,0.633282
1,svr,0.862365,0.596216
7,gradient boosting,0.885361,0.542623
6,extra trees,0.892187,0.493852
5,random forest,0.894256,0.490121


In [60]:
model_df.sort_values(by=['MAE'])

Unnamed: 0,name,r2_score,MAE
5,random forest,0.894256,0.490121
6,extra trees,0.892187,0.493852
7,gradient boosting,0.885361,0.542623
1,svr,0.862365,0.596216
9,mlp,0.848018,0.633282
4,decision tree,0.80045,0.65841
0,linear_reg,0.82786,0.716951
2,ridge,0.827889,0.717094
8,adaboost,0.821311,0.722886
3,LASSO,-0.003917,1.548376


In [61]:
# target encoder is our best setup for model 

## conclusion - model - RandomForestRegressor is best suitable for current setup observation and findings
## with r2 score = 0.89 , MAE= 0.48 crore or 48 lakhs

# Hyperparameter Tuning

In [62]:
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline

In [63]:
# !pip install --upgrade scikit-learn
# !pip install --upgrade imbalanced-learn
from sklearn.model_selection import RandomizedSearchCV

In [64]:
X = df.drop(columns=['price'])
y = df['price']

# Apply the log1p transformation to the target variable
y_transformed = np.log1p(y)

# 2. Define the Preprocessing Column Transformer
# This defines how each type of column will be transformed
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),['property_type', 'balcony', 'luxury_category', 'floor_category']),
        ('cat_ohe', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False), 
         [ 'agePossession', 'furnishing_type']),
        # TargetEncoder is used for the high-cardinality 'sector' column
#         ('cat_te', ce.TargetEncoder(), ['sector'])
        ('cat_te', ce.TargetEncoder(cols=['sector']), ['sector'])

    ],
    remainder='passthrough',
    verbose_feature_names_out=False
)

# 3. Create the Full Pipeline
# This pipeline chains the preprocessor and the regressor model together.
# Using Pipeline from imblearn ensures the TargetEncoder gets the 'y' variable during fitting.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_jobs=-1, random_state=42))
])

# 4. Define the Hyperparameter Search Space
# We define a range of values for RandomizedSearchCV to try for our RandomForestRegressor.
# The keys are prefixed with 'regressor__' to specify they belong to that step in the pipeline.
param_dist = {
    'regressor__n_estimators': [100, 200, 300, 400, 500],
    'regressor__max_depth': [10, 20, 30, 40, 50, None],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__max_features': ['sqrt', 'log2']
}

# 5. Set Up and Run Randomized Search
# n_iter controls how many different parameter combinations are tried.
# cv is the number of cross-validation folds.
# scoring='neg_mean_absolute_error' is used because the goal is to minimize MAE.
search = RandomizedSearchCV(
    pipeline, 
    param_distributions=param_dist, 
    n_iter=25, 
    cv=5, 
    scoring='neg_mean_absolute_error',
    n_jobs=-1, 
    random_state=42,
    verbose=2
)

print("Starting hyperparameter tuning...")
search.fit(X, y_transformed)
print("Tuning finished.")

# 6. Display the Results
print("\n--- Best Hyperparameters Found ---")
print(search.best_params_)

# The search was scored on negative MAE, so we convert it back to a positive MAE score
best_mae = -search.best_score_
print(f"\nBest Cross-Validated MAE: {best_mae:.4f}")


Starting hyperparameter tuning...
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Tuning finished.

--- Best Hyperparameters Found ---
{'regressor__n_estimators': 500, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 1, 'regressor__max_features': 'log2', 'regressor__max_depth': 30}

Best Cross-Validated MAE: 0.1238


In [65]:
# Get the best estimator from the randomized search
best_model = search.best_estimator_

# Make predictions on the entire dataset using the best model
# Note: For a more rigorous evaluation, you would typically evaluate on a separate test set
# that was not used during the cross-validation or hyperparameter tuning.
y_pred_transformed = best_model.predict(X)

# Calculate the R2 score
r2 = r2_score(y_transformed, y_pred_transformed)

print(f"R2 Score of the best model: {r2:.4f}")

R2 Score of the best model: 0.9862


In [66]:
# After your RandomizedSearchCV training is done:

import joblib

# Get your best trained pipeline
best_pipeline = search.best_estimator_

# Save the complete pipeline (preprocessing + model)
joblib.dump(best_pipeline, 'my_model.joblib')
print("✅ Model saved!")

# Later, to load and use:
loaded_model = joblib.load('my_model.joblib')

# Make predictions on new data
# new_predictions = loaded_model.predict(new_data)
# actual_price = np.exp(new_predictions)  # Convert back from log

print("✅ Model loaded and ready to use!")

✅ Model saved!
✅ Model loaded and ready to use!


In [105]:
# Load your saved model
model = joblib.load('my_model.joblib')
print("✅ Model loaded successfully!")

# Create test data (one sample)
data = [[1.0,33.0,4,4,0.0,2.0,3240.0,0,0,'unfurnished',1.0,2.0]]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
test_data = pd.DataFrame(data, columns=columns)

test_data
print("Test data:")
print(test_data)

# Make prediction
log_prediction = model.predict(test_data)
print(f"\nLog prediction: {log_prediction[0]}")

# Convert back to actual price
actual_price = np.exp(log_prediction[0])
print(f"Predicted price: ₹{actual_price:,.2f}")

print("\n✅ Model is working perfectly!")

✅ Model loaded successfully!
Test data:
   property_type  sector  bedRoom  bathroom  balcony  agePossession  \
0            1.0    33.0        4         4      0.0            2.0   

   built_up_area  servant room  store room furnishing_type  luxury_category  \
0         3240.0             0           0     unfurnished              1.0   

   floor_category  
0             2.0  

Log prediction: 1.7940683227995686
Predicted price: ₹6.01

✅ Model is working perfectly!


In [76]:
# checking my model on training dataset

In [77]:
predictions = model.predict(X)

# Calculate all metrics
mae = mean_absolute_error(y_transformed, predictions)
r2 = r2_score(y_transformed, predictions)
errors = y_transformed - predictions
std_dev = np.std(errors)

print(f"MAE: {mae:.4f}")
print(f"R2 Score: {r2:.4f}")
print(f"Standard Deviation: {std_dev:.4f}")

# Your cross-validation MAE (already calculated)
print(f"CV MAE: {-search.best_score_:.4f}")

MAE: 0.0439
R2 Score: 0.9862
Standard Deviation: 0.0653
CV MAE: 0.1238


In [78]:
df.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,price
0,1.0,33.0,4,4,0.0,2.0,3240.0,0,0,unfurnished,1.0,2.0,4.5
1,0.0,74.0,3,3,2.0,1.0,1950.0,1,0,furnished,2.0,0.0,2.8
2,0.0,6.0,3,3,3.0,4.0,1365.0,0,0,semifurnished,2.0,0.0,1.9
3,1.0,57.0,5,5,3.0,0.0,3240.0,1,0,unfurnished,1.0,2.0,10.56
4,1.0,13.0,4,3,2.0,2.0,1800.0,0,0,unfurnished,1.0,1.0,2.2


In [91]:
# Export (dump) the DataFrame to a file
joblib.dump(df, 'df.joblib')

['df.joblib']

# currently dumped model is very overfitted as r2 = 98 which is very high so i used another optimization code xgboost + bayesian optimization 

# NOTE - below code ran on google colab notebook if possible i may have uploaded it also on github please refer that for proof 

# random forest fine tuned model file size = 150 + MB
# xgboost tuned model file size = 3 MB
## so i deleted random forest fine tuned model and only using xgboost tuned model file 

In [81]:
# from skopt import gp_minimize
# from skopt.space import Real, Integer, Categorical
# from skopt.utils import use_named_args

In [82]:
# # Define the preprocessing steps for different column types
# # This ColumnTransformer will apply different transformations to different columns
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
#         ('cat_ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
#          ['property_type', 'balcony', 'luxury_category', 'floor_category']),
#         ('cat_ohe', OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False),
#          ['agePossession', 'furnishing_type']),
#         # TargetEncoder is used for the high-cardinality 'sector' column
#         ('cat_te', ce.TargetEncoder(cols=['sector']), ['sector'])
#     ],
#     remainder='passthrough',
#     verbose_feature_names_out=False
# )

# # Define the search space for XGBoost hyperparameters
# # These are the ranges of values that Bayesian Optimization will explore
# search_space = [
#     Integer(100, 1000, name='n_estimators'),
#     Integer(3, 10, name='max_depth'),
#     Real(0.01, 1.0, 'uniform', name='learning_rate'),
#     Real(0.5, 1.0, 'uniform', name='subsample'),
#     Real(0.5, 1.0, 'uniform', name='colsample_bytree'),
#     Real(0, 10, 'uniform', name='gamma'),
#     Integer(1, 10, name='min_child_weight'),
#     Real(0, 10, 'uniform', name='reg_alpha'),
#     Real(0, 10, 'uniform', name='reg_lambda')
# ]

# # Define the objective function for Bayesian Optimization
# # This function will be minimized by the optimizer
# @use_named_args(search_space)
# def objective(**params):
#     """
#     Objective function for Bayesian Optimization.
#     Takes hyperparameters as input, trains an XGBoost model,
#     and returns the negative mean absolute error.
#     """
#     # Create the XGBoost model with the given hyperparameters
#     model = XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1, **params)

#     # Create the full pipeline
#     pipeline = Pipeline([
#         ('preprocessor', preprocessor),
#         ('regressor', model)
#     ])

#     # Perform cross-validation
#     # We use 'neg_mean_absolute_error' because the optimizer minimizes the function,
#     # and we want to maximize the MAE (by minimizing its negative).
#     score = -np.mean(cross_val_score(
#         pipeline,
#         X,
#         y_transformed,
#         cv=5,
#         scoring='neg_mean_absolute_error',
#         n_jobs=-1
#     ))

#     return score

# # Run Bayesian Optimization
# print("Starting Bayesian Optimization for XGBoost...")
# # n_calls is the number of iterations
# result = gp_minimize(objective, search_space, n_calls=50, random_state=42, n_jobs=-1)
# print("Optimization finished.")

# # Display the results
# print("\n--- Best Hyperparameters Found ---")
# best_params = {dim.name: val for dim, val in zip(search_space, result.x)}
# print(best_params)

# best_mae = result.fun
# print(f"\nBest Cross-Validated MAE: {best_mae:.4f}")


# # --- Final Model Training and Evaluation ---

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y_transformed, test_size=0.2, random_state=42)

# # Create the final XGBoost model with the best hyperparameters
# final_model = XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1, **best_params)

# # Create the final pipeline
# final_pipeline = Pipeline([
#     ('preprocessor', preprocessor),
#     ('regressor', final_model)
# ])

# # Train the final model on the entire training set
# print("\nTraining the final model with the best hyperparameters...")
# final_pipeline.fit(X_train, y_train)
# print("Final model training finished.")

# # Make predictions on the unseen test data
# y_pred = final_pipeline.predict(X_test)

# # Calculate R2 and MAE on the test data
# r2_final = r2_score(y_test, y_pred)
# mae_final = mean_absolute_error(np.expm1(y_test), np.expm1(y_pred))

# print(f"\n--- Final Model Performance on Unseen Test Data ---")
# print(f"R2 Score: {r2_final:.4f}")
# print(f"MAE (in lakhs): {mae_final:.4f}")

In [83]:
# # Define the filename for the exported model
# model_filename = 'xgboost_pipeline_model.joblib'

# # Export the pipeline using joblib
# joblib.dump(final_pipeline, model_filename)

# print(f"Model and pipeline exported successfully to '{model_filename}'")

# my fine tuning was very overfitting in nature so i dropped fine tuning it and focused on , feature engineering and took help from ai to help me build more better features for the prediction 

In [None]:
# # Define the filename for the exported X DataFrame
# X_filename = 'X_dataframe.joblib'

# # Export the X DataFrame using joblib
# joblib.dump(X, X_filename)

# print(f"X DataFrame exported successfully to '{X_filename}'")

In [None]:
import joblib

# --- Main Function to Run the Training Process ---
def train_model():
    """
    This function loads the data, engineers new features, defines a robust preprocessing pipeline,
    trains an XGBoost model, saves the final pipeline, and generates a performance visualization.
    """
    # 1. Load the dataset
    print("Loading dataset...")
    df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv')

    # --- 2. Advanced Feature Engineering ---
    print("Engineering new features...")

    sector_price_map = df.groupby('sector')['price'].median() / df.groupby('sector')['built_up_area'].median()
    df['sector_score'] = df['sector'].map(sector_price_map)

    df['area_x_sector_score'] = df['built_up_area'] * df['sector_score']
    df['area_x_room'] = df['built_up_area'] / (df['bedRoom'] + 1)
    df['bed_bath_ratio'] = df['bedRoom'] / (df['bathroom'] + 1)

    # --- 3. Data Preparation ---
    print("Preparing data for training...")
    
    X = df.drop(columns=['price', 'sector'])
    y = df['price']

    y_log_transformed = np.log1p(y)

    # --- 4. Define the Preprocessing Pipeline ---
    balcony_order = ['0', '1', '2', '3', '3+']
    luxury_order = ['Low', 'Medium', 'High']
    floor_order = ['Low Floor', 'Mid Floor', 'High Floor']
    age_possession_order = ['Under Construction', 'Relatively New', 'New Property', 'Moderately Old', 'Old Property']
    furnishing_order = ['Unfurnished', 'Semi-Furnished', 'Furnished']

    numerical_features = ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room', 'sector_score', 'area_x_sector_score', 'area_x_room', 'bed_bath_ratio']
    ordinal_features = ['balcony', 'luxury_category', 'floor_category', 'agePossession', 'furnishing_type']
    nominal_features = ['property_type']

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('ord', OrdinalEncoder(categories=[balcony_order, luxury_order, floor_order, age_possession_order, furnishing_order]), ordinal_features),
            ('nom', OneHotEncoder(handle_unknown='ignore', drop='first'), nominal_features)
        ],
        remainder='passthrough'
    )

    # --- 5. Create the Final Model Pipeline ---
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', XGBRegressor(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=8,
            subsample=0.9,
            colsample_bytree=0.9,
            random_state=42,
            n_jobs=-1
        ))
    ])

    # --- 6. Train on a Split Dataset for Honest Evaluation ---
    # We split the data to train the model and then test it on data it has never seen.
    print("Splitting data for final evaluation...")
    X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log_transformed, test_size=0.2, random_state=42)

    print("Training the XGBoost model on the training set...")
    pipeline.fit(X_train, y_train_log)
    
    # --- 7. Performance Visualization ---
    print("Generating performance visualization...")
    y_pred_log = pipeline.predict(X_test)

    # Inverse transform both predicted and actual values to get the real prices
    y_pred_actual = np.expm1(y_pred_log)
    y_test_actual = np.expm1(y_test_log)

    # Create the scatter plot
    plt.figure(figsize=(10, 8))
    sns.scatterplot(x=y_test_actual, y=y_pred_actual, alpha=0.6)
    
    # Add a line for perfect predictions (y=x)
    max_price = max(y_test_actual.max(), y_pred_actual.max())
    plt.plot([0, max_price], [0, max_price], color='red', linestyle='--', lw=2, label='Perfect Prediction')
    
    plt.xlabel("Actual Price (in Crores)")
    plt.ylabel("Predicted Price (in Crores)")
    plt.title("Actual vs. Predicted Property Prices")
    plt.legend()
    plt.grid(True)
    
    # Save the plot to a file
    plt.savefig('prediction_vs_actual.png')
    print("Plot saved as 'prediction_vs_actual.png'")
    # To display the plot if running locally
    # plt.show()

    # --- 8. Save the Final Pipeline (Trained on the Full Dataset) ---
    print("\nRetraining model on the full dataset for deployment...")
    pipeline.fit(X, y_log_transformed) # Retrain on all data
    
    print("Saving the final model pipeline and sector map...")
    joblib.dump(pipeline, 'gurgaon_property_prediction_pipeline.joblib')
    joblib.dump(sector_price_map, 'sector_map.joblib')

    print("\n--- Process Complete ---")
    print("A new model, sector map, and performance plot have been saved.")

# --- Run the main function ---
if __name__ == "__main__":
    train_model()
'''