In [1]:
import numpy as np
import pandas as pd

In [14]:
# Load the dataset from a CSV file resolved relative to the working directory
DATA_PATH = 'properties_post_feature_selection.csv'
df = pd.read_csv(DATA_PATH)

In [15]:
# Preview the first rows of the loaded frame
df.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,price
0,0.0,92.0,3,4,4.0,3.0,1900.0,1,0,2,1.0,0.0,2.2
1,0.0,107.0,4,3,3.0,3.0,2824.0,1,0,0,1.0,2.0,1.1
2,0.0,11.0,4,4,2.0,3.0,2380.0,1,0,0,1.0,0.0,1.97
3,0.0,29.0,3,4,4.0,2.0,3700.0,1,0,2,2.0,2.0,5.8
4,0.0,18.0,2,2,1.0,0.0,737.0,0,0,0,1.0,1.0,0.35


In [34]:
# Cast the feature columns to their intended dtypes in a single call.
# Columns not listed here (built_up_area, price) keep their current dtype.
df = df.astype({
    'property_type': 'int32',
    'sector': 'int32',
    'bedRoom': 'float64',
    'bathroom': 'float64',
    'balcony': 'int32',
    'agePossession': 'int32',
    'servant room': 'float64',
    'store room': 'float64',
    'furnishing_type': 'float64',
    'luxury_category': 'int32',
    'floor_category': 'int32',
})

In [35]:
# Check the first three rows after the dtype casts
df.head(3)

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,price
0,0,92,3.0,4.0,4,3,1900.0,1.0,0.0,2.0,1,0,2.2
1,0,107,4.0,3.0,3,3,2824.0,1.0,0.0,0.0,1,2,1.1
2,0,11,4.0,4.0,2,3,2380.0,1.0,0.0,0.0,1,0,1.97


In [36]:
# Summarize column dtypes, non-null counts and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3554 entries, 0 to 3553
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   property_type    3554 non-null   int32  
 1   sector           3554 non-null   int32  
 2   bedRoom          3554 non-null   float64
 3   bathroom         3554 non-null   float64
 4   balcony          3554 non-null   int32  
 5   agePossession    3554 non-null   int32  
 6   built_up_area    3554 non-null   float64
 7   servant room     3554 non-null   float64
 8   store room       3554 non-null   float64
 9   furnishing_type  3554 non-null   float64
 10  luxury_category  3554 non-null   int32  
 11  floor_category   3554 non-null   int32  
 12  price            3554 non-null   float64
dtypes: float64(7), int32(6)
memory usage: 277.8 KB


In [38]:
# Separate the target column from the predictor columns
y = df['price']
X = df.drop('price', axis=1)

In [39]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

In [53]:
# Features that are handed to the one-hot encoder in the preprocessing step
columns_to_encode = [
    'sector',
    'balcony',
    'agePossession',
    'furnishing_type',
    'luxury_category',
    'floor_category',
]

In [54]:
# Applying the log1p transformation to the target variable.
# The models below are fit on this log scale; predictions are mapped back to
# the original price scale with np.expm1 before computing the error.
y_transformed = np.log1p(y)

In [55]:
# Column-wise preprocessing: standardize the numeric features and one-hot
# encode the categorical ones listed in columns_to_encode.
numeric_columns = [
    'property_type', 'bedRoom', 'bathroom',
    'built_up_area', 'servant room', 'store room',
]
# NOTE(review): with drop='first' combined with handle_unknown='ignore', a
# category not seen during fit is encoded as all zeros, which is the same
# encoding as the dropped first category — confirm this is acceptable.
categorical_encoder = OneHotEncoder(drop='first', handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_columns),
        ('cat', categorical_encoder, columns_to_encode),
    ],
    # Any column not named above is forwarded unchanged.
    remainder='passthrough',
)

In [56]:
# Chain the preprocessing step and the RBF-kernel SVR into a single estimator
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel='rbf')),
])

In [57]:
# 10-fold cross-validated R^2 of the full pipeline on the log1p-transformed target.
# Folds are shuffled with a fixed seed so the split is reproducible.
kfold = KFold(n_splits=10, random_state=42, shuffle=True)
scores = cross_val_score(
    estimator=pipeline,
    X=X,
    y=y_transformed,
    scoring='r2',
    cv=kfold,
)



In [58]:
# Mean R^2 across the cross-validation folds
scores.mean()

0.886314354384294

In [59]:
# Spread (standard deviation) of R^2 across the cross-validation folds
scores.std()

0.013803043225815364

In [65]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the target stays on the log1p scale
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_transformed,
    test_size=0.2,
    random_state=42,
)

In [66]:
# Fit the preprocessing + SVR pipeline on the training split
pipeline.fit(X_train,y_train)

In [67]:
# Predict on the held-out split; values are on the log1p scale because the
# pipeline was fit on the log1p-transformed target
y_pred = pipeline.predict(X_test)



In [68]:
# Map the predictions from the log1p scale back to the original price scale.
# The prediction is recomputed here instead of transforming the existing
# y_pred so that this cell is idempotent: re-running it always yields the same
# values, whereas applying expm1 to y_pred itself would transform it again on
# every re-run and silently corrupt the error computed afterwards.
y_pred = np.expm1(pipeline.predict(X_test))

In [69]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error in the original price units: y_test is mapped back with
# expm1 so that it is on the same scale as the already-inverted y_pred
mean_absolute_error(y_true=np.expm1(y_test), y_pred=y_pred)

0.5532780370459063