In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('gurgoan_properties_post_feature_selection.csv')

In [3]:
df.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category,price
0,0.0,36.0,3.0,2.0,2.0,1.0,850.0,0.0,0.0,0.0,1.0,1.0,0.82
1,0.0,95.0,2.0,2.0,2.0,1.0,1226.0,1.0,0.0,0.0,1.0,2.0,0.95
2,0.0,92.0,2.0,2.0,1.0,4.0,681.4,0.0,0.0,0.0,1.0,1.0,0.46
3,0.0,103.0,2.0,2.0,1.0,1.0,1000.0,0.0,0.0,0.0,1.0,0.0,0.32
4,0.0,99.0,3.0,4.0,4.0,3.0,1615.0,1.0,0.0,1.0,0.0,2.0,1.6


In [4]:
#One head encode--> sector, balcony, agePossession, furnishing_type, luxury_category, floor category

In [5]:
X = df.drop(columns='price')
y =df['price']

In [6]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

In [7]:
columns_to_encode = ['sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [8]:
#Applying the Log1p transformation to the target variable

y_transformed = np.log1p(y)

In [9]:
#creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers = [
        ('num', StandardScaler(), ['property_type', 'bedRoom','bathroom','built_up_area', 'servant room', 'store room']),
        ('cat', OneHotEncoder(drop='first'), columns_to_encode)
    ],
    remainder= 'passthrough'
)

In [10]:
#Creating a Pipe Line
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
    
])


In [11]:
#K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores= cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring= 'r2')


In [12]:
scores.mean()

0.8585977346273893

In [13]:
scores.std()

0.016842732638532615

In [14]:
# Mean absolute error(to see how much error the model is making)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed, test_size= 0.2, random_state= 42)

In [15]:
pipeline.fit(X_train, y_train)


In [16]:
y_pred = pipeline.predict(X_test)

In [17]:
y_pred = np.expm1(y_pred)

In [18]:
y_pred

array([ 2.90954021,  7.58793289,  1.03302035,  1.24977873,  3.75547161,
        2.37585324,  0.96446102,  0.27061821,  1.25665839,  2.0759656 ,
        1.12449916,  3.52620535,  1.1164123 ,  0.6868566 , 13.35051375,
        1.64614599,  3.77806307,  1.89918064,  1.5039134 ,  0.9466332 ,
        2.25367536,  0.95905055,  0.60933531,  0.51624874,  0.47328221,
        1.52454919,  2.18338218,  2.61412026,  1.17720056,  1.12300352,
        1.19583088,  0.66981073,  0.32267063,  3.69263509,  4.40559336,
        1.27558818,  0.48977076,  1.59151871,  8.21163754,  1.86344612,
        1.53969058,  5.39211419,  2.20782498,  7.05912192,  0.62065641,
        0.65197338,  1.45890341,  2.17059761,  1.40684383,  0.17779363,
        6.69123228,  2.1808157 ,  0.45687724,  2.93775428,  7.78738261,
        1.80951713,  1.04572935,  2.3113352 ,  3.29188392,  1.64688312,
        2.17848233,  1.02831209,  2.9700539 ,  1.73431399,  1.52417936,
        0.66123444,  4.84552774,  2.44362926,  8.20438849,  1.69

In [20]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(np.expm1(y_test), y_pred)

0.6783522256391938

In [21]:
# Checking with Support vector Machine

#Creating a Pipe Line
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel= 'rbf'))
    
])


In [22]:
#K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores= cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring= 'r2')


In [23]:
scores.mean()

0.8889065853056115

In [24]:
scores.std()

0.019596109327419134

In [25]:
# Mean absolute error(to see how much error the model is making)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed, test_size= 0.2, random_state= 42)

In [26]:
pipeline.fit(X_train, y_train)

In [27]:
y_pred = pipeline.predict(X_test)

In [28]:
y_pred = np.expm1(y_pred)

In [29]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(np.expm1(y_test), y_pred)

0.5322688526359813