In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the post-feature-selection property dataset from its hard-coded location.
DATA_PATH = '/content/Gurgaon_properties_post_feature_selection_V3.csv'
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [None]:
# Remove three rows by index label (presumably outliers identified in earlier
# analysis — TODO confirm the reason and record it here).
# Rebinding `df` rather than mutating it in place, together with errors='ignore',
# makes this cell idempotent: re-running it once the rows are gone is a no-op
# instead of raising KeyError.
ROWS_TO_DROP = [128, 290, 374]
df = df.drop(index=ROWS_TO_DROP, errors='ignore')

In [None]:
# Separate the raw price target from the feature matrix.
y = df['price']
X = df.drop(columns='price')

In [None]:
# Work with log(1 + price); later cells map predictions back with np.expm1.
y_transformed = y.pipe(np.log1p)

In [None]:
# %pip installs into the environment of the running kernel; the version is pinned
# to the one this notebook was last executed with so re-runs stay reproducible.
%pip install -q category_encoders==2.8.1


Collecting category_encoders
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.1-py3-none-any.whl (85 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/85.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.1


In [None]:
import category_encoders as ce
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline

In [None]:
# Column groups, one per encoding strategy used by the ColumnTransformer below.
numeric_columns = ['bedRoom', 'bathroom', 'Built_up_area']
columns_to_encode = ['property_type', 'servant room', 'facing','store room', 'balcony']
one_hot_columns = ['agePossession','furnishing_type', 'facilities', 'floor_category']
target_encoded_columns = ['sector']

# Scale numerics, ordinal-encode the listed categoricals, one-hot the next group
# (first level dropped), and target-encode 'sector'. Any column not listed in a
# group is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first', handle_unknown='ignore'), one_hot_columns),
        ('target_enc', ce.TargetEncoder(), target_encoded_columns),
    ],
    remainder='passthrough',
)

In [None]:
from xgboost import XGBRegressor

In [None]:
# XGBoost hyperparameters for the baseline model (how they were chosen is not
# shown in this notebook).
xgb_params = {
    'n_estimators': 800,
    'max_depth': 6,
    'learning_rate': 0.1,
    'subsample': 1.0,
    'colsample_bytree': 0.8,
    'gamma': 0,
    'reg_lambda': 1.0,
    'random_state': 42,
}

# Preprocessing followed by the gradient-boosted regressor.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(**xgb_params)),
])

In [None]:
from sklearn.model_selection import KFold, cross_val_score

In [None]:
# 10-fold shuffled cross-validation. The target passed in is y_transformed, so
# R² is measured on the log1p(price) scale.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y_transformed,
    scoring='r2',
    cv=kfold,
    error_score='raise',
)

In [None]:
# Average R² across the folds.
np.mean(scores)

np.float64(0.9005631169469007)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [None]:
# Hold-out check: fit on 80 % of the rows and score on the remaining 20 %.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_transformed, test_size=0.2, random_state=42
)

pipeline.fit(X_train, y_train)

# The model predicts log1p(price); invert predictions and truth with expm1 so the
# MAE is reported on the original price scale.
y_pred = np.expm1(pipeline.predict(X_test))

mean_absolute_error(np.expm1(y_test), y_pred)

0.5094651484028976

Transformed target regressor

In [None]:
from sklearn.preprocessing import FunctionTransformer
# Forward/inverse pair (log1p / expm1) for use as a target transformer.
log_transformer = FunctionTransformer(
    func=np.log1p,
    inverse_func=np.expm1,
    validate=True,
)


In [None]:
from sklearn.compose import TransformedTargetRegressor
# Hyperparameters for the XGBoost model wrapped by TransformedTargetRegressor.
ttr_xgb_params = {
    'n_estimators': 700,
    'max_depth': 5,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0,
    'reg_lambda': 1.0,
    'random_state': 42,  # fixed seed for reproducibility
}

# The wrapper applies log_transformer to y before fitting and its inverse to the
# predictions, so callers of this pipeline work with the target they pass in.
ttr_regressor = TransformedTargetRegressor(
    regressor=XGBRegressor(**ttr_xgb_params),
    transformer=log_transformer,
)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', ttr_regressor),
])

In [None]:
# 10-fold shuffled cross-validation of the TransformedTargetRegressor pipeline.
# The raw price `y` is passed on purpose: the regressor already applies log1p to
# the target internally, so passing y_transformed would log-transform it twice.
# R² is therefore reported on the original price scale, not the log scale.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(
    pipeline,
    X,
    y,
    cv=kfold,
    scoring='r2',
    error_score='raise',
)

In [None]:
# Average R² across the folds.
np.mean(scores)

np.float64(0.9008409019054264)

In [None]:
# Hold-out check for the TransformedTargetRegressor pipeline.
# NOTE(review): the target passed here is y_transformed (already log1p), and the
# regressor applies log_transformer again, so the model is fit on
# log1p(log1p(price)). The np.expm1 calls below undo the outer log. Passing raw
# `y` would remove the double transform, but any later cell that calls
# np.expm1(pipeline.predict(...)) relies on the current behaviour — change both
# together.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_transformed, test_size=0.2, random_state=42
)

pipeline.fit(X_train, y_train)

y_pred = np.expm1(pipeline.predict(X_test))

mean_absolute_error(np.expm1(y_test), y_pred)

0.549559252902503

In [None]:
from sklearn.metrics import make_scorer, mean_absolute_error
from sklearn.model_selection import cross_val_score, KFold
# Scorer that returns negated MAE (cross_val_score treats larger as better).
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

# Raw `y` is passed, so the MAE is on whatever scale `pipeline` predicts in.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y,
    scoring=mae_scorer,
    cv=kfold,
    error_score='raise',
)


In [None]:
# The scorer returns negated MAE, so flip the sign before reporting.
cv_mae = -scores.mean()
print(f"Cross-validated MAE: {cv_mae}")

Cross-validated MAE: 0.4961674528622807


# Exporting model

In [None]:
# Preview the first five rows of the modelling frame.
df.head(n=5)

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,facing,agePossession,Built_up_area,servant room,store room,furnishing_type,facilities,floor_category
0,house,sector 70,5.25,2.0,4,4,North-East,New Property,2609.0,Yes,No,unfurnished,standard,Mid Floor
1,house,sector 43,2.1,2.0,10,4,West,New Property,549.0,Yes,No,unfurnished,basic,Mid Floor
2,house,sector 54,5.0,4.0,21,4,North,Relatively New,1161.0,Yes,No,semifurnished,standard,Mid Floor
3,house,sector 43,3.0,2.0,12,4,West,New Property,558.0,No,No,semifurnished,basic,Mid Floor
4,house,sector 43,4.5,5.0,20,4,East,Relatively New,1152.0,Yes,No,semifurnished,basic,Mid Floor


In [None]:
# Column groups, one per encoding strategy, for the export preprocessor.
numeric_columns = ['bedRoom', 'bathroom', 'Built_up_area']
columns_to_encode = ['property_type', 'servant room', 'facing','store room', 'balcony']
one_hot_columns = ['agePossession','furnishing_type', 'facilities', 'floor_category']
target_encoded_columns = ['sector']

# Same encoders as the experiment preprocessor, rebuilt as a fresh object for
# the model that gets pickled.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1', OneHotEncoder(drop='first', handle_unknown='ignore'), one_hot_columns),
        ('target_enc', ce.TargetEncoder(), target_encoded_columns),
    ],
    remainder='passthrough',
    force_int_remainder_cols=False,
)

In [None]:
# Hyperparameters for the model that will be exported.
final_xgb_params = {
    'n_estimators': 800,
    'max_depth': 6,
    'learning_rate': 0.1,
    'subsample': 1.0,
    'colsample_bytree': 0.8,
    'gamma': 0,
    'reg_lambda': 1.0,
    'random_state': 42,
}

# Export pipeline: preprocessing followed by a plain XGBRegressor (no target
# wrapper), so the target scale is whatever is passed to fit().
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(**final_xgb_params)),
])

In [None]:
# Train the export model on every row. The target is log1p(price), so consumers
# of the pickled pipeline must apply np.expm1 to its predictions.
final_pipeline.fit(X, y=y_transformed)

In [None]:
import pickle
# Serialise the fitted export pipeline to disk.
with open('pipeline.pkl', mode='wb') as file:
    pickle.dump(final_pipeline, file)

In [None]:
# Serialise the feature frame X (no price column) alongside the model.
with open('df.pkl', mode='wb') as file:
    pickle.dump(X, file)

In [None]:
# Show the feature names, in the order the pipeline was fitted with.
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony', 'facing',
       'agePossession', 'Built_up_area', 'servant room', 'store room',
       'furnishing_type', 'facilities', 'floor_category'],
      dtype='object')

In [None]:
# Inspect the first feature row as a reference for building a prediction input.
X.iloc[0].to_numpy()

array(['house', 'sector 70', np.float64(2.0), np.int64(4), np.int64(4),
       'North-East', 'New Property', np.float64(2609.0), 'Yes', 'No',
       'unfurnished', 'standard', 'Mid Floor'], dtype=object)

In [None]:
# Feature names in the order shown by X.columns.
columns = [
    'property_type',
    'sector',
    'bedRoom',
    'bathroom',
    'balcony',
    'facing',
    'agePossession',
    'Built_up_area',
    'servant room',
    'store room',
    'furnishing_type',
    'facilities',
    'floor_category',
]

In [None]:
# One hand-written property, values listed in the same order as the feature columns.
data = [
    'house',         # property_type
    'sector 81',     # sector
    3,               # bedRoom
    4,               # bathroom
    4,               # balcony
    'North-East',    # facing
    'New Property',  # agePossession
    1700,            # Built_up_area
    'No',            # servant room
    'No',            # store room
    'unfurnished',   # furnishing_type
    'basic',         # facilities
    'Low Floor',     # floor_category
]

In [None]:
# Wrap the single record in a one-row frame with the training column labels.
one_df = pd.DataFrame(data=[data], columns=X.columns)

In [None]:
# Display the one-row input frame to check it lines up with the feature columns.
one_df

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,facing,agePossession,Built_up_area,servant room,store room,furnishing_type,facilities,floor_category
0,house,sector 81,3,4,4,North-East,New Property,1700,No,No,unfurnished,basic,Low Floor


In [None]:
# Sanity-check the exported model on the hand-built row. This uses final_pipeline
# (the object that was pickled and was fit on all rows) rather than the
# experimental `pipeline`, which was only fit on the 80 % training split.
# final_pipeline was trained on log1p(price), so np.expm1 maps the prediction
# back to the price scale.
np.expm1(final_pipeline.predict(one_df))

array([2.1323948], dtype=float32)

In [None]:
# List the distinct sector labels, most frequent first.
df['sector'].value_counts().index

Index(['sohna road', 'sector 37', 'sector 85', 'sector 102', 'sector 70',
       'sector 92', 'sector 69', 'sector 90', 'sector 81', 'sector 109',
       'sector 65', 'sector 79', 'sector 83', 'sector 104', 'sector 67',
       'sector 33', 'sector 43', 'sector 86', 'sector 50', 'sector 107',
       'sector 2', 'sector 82', 'sector 108', 'sector 89', 'sector 56',
       'sector 95', 'sector 48', 'sector 84', 'sector 26', 'sector 49',
       'sector 103', 'sector 113', 'sector 99', 'sector 66', 'sector 28',
       'sector 61', 'sector 106', 'sector 3', 'sector 25', 'sector 7',
       'manesar', 'sector 68', 'laxma', 'sector 72', 'sector 12', 'sector 54',
       'sector 71', 'sector 77', 'sector 63', 'sector 88', 'sector 57',
       'sector 112', 'sector 111', 'sector 110', 'sector 36', 'sector 9',
       'sector 11', 'sector 105', 'sector 74', 'gwal pahari', 'sector 22',
       'sector 24', 'sector 47', 'sector 78', 'sector 62', 'sector 91',
       'sector 60', 'sector 76', 'sector 14', 

In [None]:
# Count rows per facing direction.
df['facing'].value_counts()

Unnamed: 0_level_0,count
facing,Unnamed: 1_level_1
East,1121
North-East,890
North,468
West,257
South,242
North-West,192
South-East,168
South-West,154
