In [10]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [35]:
df = pd.read_csv('cleaned_data_v8.csv')

In [36]:
df.head()

Unnamed: 0,PROPERTY_TYPE,CITY,BEDROOM_NUM,AREA,BALCONY_NUM,Location,luxury_category,floor_category,age_category,price
0,2.0,3.0,2.0,570.5,1.0,15.0,0.0,0.0,0.0,0.191
1,2.0,3.0,3.0,1115.5,1.0,127.0,0.0,0.0,0.0,1.175
2,2.0,3.0,3.0,1446.0,1.0,140.0,0.0,0.0,0.0,1.285
3,2.0,3.0,3.0,1295.0,1.0,182.0,0.0,0.0,0.0,0.675
4,2.0,3.0,2.0,920.0,1.0,182.0,0.0,0.0,0.0,0.47


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5670 entries, 0 to 5669
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PROPERTY_TYPE    5670 non-null   float64
 1   CITY             5670 non-null   float64
 2   BEDROOM_NUM      5670 non-null   float64
 3   AREA             5670 non-null   float64
 4   BALCONY_NUM      5670 non-null   float64
 5   Location         5670 non-null   float64
 6   luxury_category  5670 non-null   float64
 7   floor_category   5670 non-null   float64
 8   age_category     5670 non-null   float64
 9   price            5670 non-null   float64
dtypes: float64(10)
memory usage: 443.1 KB


In [38]:
X = df.drop(columns=['price'])
y = df['price']

In [39]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

In [55]:
columns_to_encode = ['CITY', 'BALCONY_NUM', 'age_category', 'luxury_category', 'floor_category']

In [56]:
y_transformed = np.log1p(y)

In [71]:
X['AREA'] = np.log1p(X['AREA'])

In [72]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['PROPERTY_TYPE', 'BEDROOM_NUM', 'AREA','Location']),
        ('cat', OneHotEncoder(drop='first'), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [73]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel='rbf'))
])

In [74]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [75]:
scores.mean()

0.8386275598041376

In [76]:
scores.std()

0.021663597781019138

In [77]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [78]:
pipeline.fit(X_train,y_train)

In [79]:
y_pred = pipeline.predict(X_test)

In [80]:
y_pred = np.expm1(y_pred)

In [81]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(np.expm1(y_test),y_pred)

0.23822613100854598

---------------------------------

In [82]:
df = pd.read_csv('cleaned_data_v8_1.csv')

In [83]:
df.head()

Unnamed: 0,PROPERTY_TYPE,CITY,BEDROOM_NUM,AREA,BALCONY_NUM,Location,price,luxury_category,floor_category,age_category,Landmark_Category
0,Residential Apartment,Kolkata South,2.0,570.5,1.0,Amtala,0.191,Low,Low Floor,Old Property,Few Landmarks
1,Residential Apartment,Kolkata South,3.0,1115.5,1.0,EM Bypass,1.175,Low,Low Floor,Old Property,Several Landmarks
2,Residential Apartment,Kolkata South,3.0,1446.0,1.0,Garia,1.285,Low,Low Floor,Old Property,Several Landmarks
3,Residential Apartment,Kolkata South,3.0,1295.0,1.0,Joka,0.675,Low,Low Floor,Old Property,Many Landmarks
4,Residential Apartment,Kolkata South,2.0,920.0,1.0,Joka,0.47,Low,Low Floor,Old Property,Many Landmarks


In [84]:
X = df.drop(columns=['price'])
y = df['price']

In [85]:
y_transformed = np.log1p(y)

In [86]:
columns_to_encode = ['CITY', 'age_category', 'luxury_category', 'floor_category','PROPERTY_TYPE','Location','Landmark_Category']

In [87]:
X['AREA'] = np.log1p(X['AREA'])

In [91]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['BEDROOM_NUM', 'AREA','BALCONY_NUM']),
        ('cat', OneHotEncoder(drop='first',handle_unknown='ignore'), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [102]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel='rbf'))
])

In [103]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')



In [104]:
scores.mean()

0.885041619915515

In [105]:
scores.std()

0.012723648837630122

In [106]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.23822613100854598