In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [11]:
# Load the property dataset. The path is relative, so the CSV must sit in the
# kernel's working directory. NOTE(review): the file name suggests this is the
# output of an earlier feature-selection notebook — confirm provenance.
df = pd.read_csv('gurgaon_properties_post_feature_selection.csv')

In [12]:
# Preview the first rows to check which columns were loaded and how they are encoded.
df.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,price
0,0,36,3.0,2.0,2,1,850.0,0.0,0.0,0.0,1,1,0.82
1,0,95,2.0,2.0,2,1,1226.0,1.0,0.0,0.0,1,2,0.95
2,0,103,2.0,2.0,1,1,1000.0,0.0,0.0,0.0,1,0,0.32
3,0,99,3.0,4.0,4,3,1615.0,1.0,0.0,1.0,0,2,1.6
4,0,5,2.0,2.0,1,3,582.0,0.0,1.0,0.0,0,2,0.48


In [13]:
# Separate the target (`price`) from the predictor columns.
y = df['price']
X = df.drop(columns='price')

In [14]:
# Display the feature matrix (all columns of df except `price`).
X

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,0,36,3.0,2.0,2,1,850.0,0.0,0.0,0.0,1,1
1,0,95,2.0,2.0,2,1,1226.0,1.0,0.0,0.0,1,2
2,0,103,2.0,2.0,1,1,1000.0,0.0,0.0,0.0,1,0
3,0,99,3.0,4.0,4,3,1615.0,1.0,0.0,1.0,0,2
4,0,5,2.0,2.0,1,3,582.0,0.0,1.0,0.0,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...
3549,0,90,2.0,2.0,1,3,532.0,0.0,0.0,0.0,2,2
3550,1,12,5.0,5.0,4,3,6228.0,1.0,1.0,0.0,0,1
3551,0,23,1.0,1.0,1,0,665.0,0.0,0.0,1.0,2,2
3552,1,44,5.0,6.0,3,0,5490.0,1.0,1.0,0.0,2,2


In [15]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

In [16]:
# Categorical columns that the preprocessing step one-hot encodes.
columns_to_encode = [
    'sector',
    'balcony',
    'agePossession',
    'furnishing_type',
    'luxury_category',
    'floor_category',
]

In [17]:
# Apply a log1p transform to the target variable

In [18]:
# log1p(y) = log(1 + y). The model is trained on this scale; predictions are
# mapped back to the price scale with np.expm1 further down in the notebook.
y_transformed = np.log1p(y)

In [19]:
# Create a column transformer for preprocessing

In [20]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical columns listed in `columns_to_encode`.
#
# handle_unknown='ignore' lets transform() cope with a category (for example a
# rare sector) that was absent from the rows the encoder was fitted on, instead
# of raising during cross-validation or on the hold-out split. Such a value is
# encoded as all zeros, i.e. identically to the dropped first category.
# Combining it with drop='first' requires scikit-learn >= 1.0.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'num',
            StandardScaler(),
            ['property_type', 'bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room'],
        ),
        (
            'cat',
            OneHotEncoder(drop='first', handle_unknown='ignore'),
            columns_to_encode,
        ),
    ],
    # Columns not named above are forwarded unchanged. With the feature matrix
    # displayed earlier, every column is already covered by 'num' or 'cat'.
    remainder='passthrough',
)

In [21]:
#Creating a pipeline

In [22]:
# Chain the preprocessing step and an RBF-kernel support vector regressor
# into a single estimator so both are fitted together on each training split.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', SVR(kernel='rbf')),
    ]
)

In [23]:
# 10-fold cross-validation (shuffled, fixed seed) of the full pipeline.
# R^2 is measured on the log1p-transformed target, not on raw prices.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y_transformed,
    scoring='r2',
    cv=kfold,
)

In [25]:
# Mean R^2 across the cross-validation folds.
scores.mean()

0.8845360715052786

In [26]:
# Standard deviation of R^2 across the folds (fold-to-fold variability).
scores.std()

0.014784881452420021

In [27]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows (fixed seed) for a final check.
# The target passed in is the log1p-transformed one, so y_train / y_test are on that scale.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_transformed,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Fit the preprocessing step and the SVR on the training split only.
pipeline.fit(X_train,y_train)

In [29]:
# Predict on the hold-out rows. The model was fitted on the log1p-transformed
# target, so these predictions are on the log1p scale as well.
y_pred = pipeline.predict(X_test)

In [30]:
# Display the predictions; when the notebook is run top to bottom they are
# still on the log1p scale at this point.
y_pred

array([0.96550142, 0.51892652, 0.746627  , 1.78136517, 1.25257061,
       1.67103543, 2.29262185, 0.93482073, 0.9834673 , 0.96570763,
       1.39728099, 0.70176536, 1.33526263, 0.92702179, 2.5155566 ,
       0.57912663, 0.9424148 , 0.23484156, 0.71444301, 1.31228313,
       0.89328132, 0.86278227, 0.54646773, 0.6698488 , 1.02538718,
       1.40528508, 0.59396203, 0.62780382, 0.76958236, 2.51442755,
       0.31505753, 0.46460434, 1.72821751, 0.32137611, 0.75898816,
       1.71163322, 0.95015092, 0.46059255, 0.47767047, 0.31971881,
       0.55438075, 1.00911204, 0.39815885, 1.08188122, 0.85781433,
       0.3872911 , 2.1707235 , 1.05887541, 1.24141743, 0.9364242 ,
       0.91459166, 0.5772059 , 1.91292048, 1.18156013, 0.69773358,
       0.56598882, 2.17027198, 1.63128363, 1.82869103, 0.94604816,
       1.05129194, 1.39283978, 0.50134573, 1.14508688, 0.26023677,
       0.93979281, 1.02899746, 1.19584233, 0.4836444 , 0.77029644,
       1.25072737, 0.70125382, 1.09724813, 1.19501517, 1.17311

In [31]:
# Map the predictions from the log1p scale back to the original price scale.
# They are recomputed from the fitted pipeline rather than from y_pred itself,
# so re-running this cell cannot apply expm1 a second time.
y_pred = np.expm1(pipeline.predict(X_test))

In [32]:
from sklearn.metrics import mean_absolute_error

# Mean absolute error on the original price scale: y_test is still on the
# log1p scale, so it is mapped back with expm1 to match the back-transformed y_pred.
mean_absolute_error(
    y_true=np.expm1(y_test),
    y_pred=y_pred,
)

0.5324591082613233