In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

In [2]:
# import warnings
# warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('gurgaon_properties_post_feature_selection.csv')

In [3]:
df.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,price
0,0.0,18.0,3,5,4.0,3.0,3405.0,1,0,0,1.0,2.0,4.75
1,0.0,45.0,3,3,3.0,4.0,1500.0,0,0,1,2.0,1.0,0.9
2,0.0,94.0,2,2,2.0,3.0,664.0,0,1,0,1.0,2.0,0.45
3,1.0,52.0,5,5,2.0,2.0,3323.0,1,0,1,1.0,2.0,9.85
4,0.0,107.0,4,4,3.0,0.0,1893.0,0,0,1,2.0,1.0,0.8


##### one hot encode -> sector, balcony, agePossession, furnishing type, luxury category, floor category

In [4]:
df.furnishing_type.value_counts()

furnishing_type
1    2392
0     995
2     184
Name: count, dtype: int64

In [5]:
X = df.drop(columns=['price'])
y = df['price']

In [6]:
columns_to_encode = ['sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [7]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

In [8]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['property_type', 'bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OneHotEncoder(drop='first',handle_unknown='ignore'), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [9]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel='rbf'))
])

In [17]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [18]:
scores

array([0.89021493, 0.86246743, 0.88362295, 0.88870944, 0.89707905,
       0.89620919, 0.86255798, 0.88182142, 0.89216443, 0.86348181])

In [29]:
scores.mean()

np.float64(0.8818328623866429)

In [30]:
scores.std()

np.float64(0.013228452357722712)

In [31]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [32]:
pipeline.fit(X_train,y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'passthrough'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,1.0
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [33]:
y_pred = pipeline.predict(X_test)

In [34]:
y_pred = np.expm1(y_pred)

In [35]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(np.expm1(y_test),y_pred)

0.5141584915433994

In [36]:
### Mean absolute error around 51 lacs and r2 score is 88