In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [4]:
df =pd.read_csv('gurgoan_properties_post_feature_selection V2.csv').drop(columns=['store room', 'floor_category','balcony'])

In [5]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,flat,sector 36,0.82,3.0,2.0,New Property,850.0,0.0,0.0,low
1,flat,sector 89,0.95,2.0,2.0,New Property,1226.0,1.0,0.0,low
2,flat,sector 86,0.46,2.0,2.0,Under Construction,681.41,0.0,0.0,low
3,flat,sohna road,0.32,2.0,2.0,New Property,1000.0,0.0,0.0,low
4,flat,sector 92,1.6,3.0,4.0,Relatively New,1615.0,1.0,1.0,high


In [6]:
# 0--> unfurnished
# 1--> Semifurnushed
# 2 --> furnished

In [7]:
#Numerical = bedRoom, bathroom, built_up_area, servant room
#ordinal - property_type, furnishing_type, luxury category
# OHE - sector, agePossesssion.

In [8]:
df['agePossession'].value_counts()

Relatively New        1725
Moderately Old         629
New Property           589
Old Property           327
Under Construction     296
Name: agePossession, dtype: int64

In [10]:
df['agePossession'] = df['agePossession'].replace({
    'Relatively New': 'new',
    'Moderately Old': 'old',
    'New Property': 'new',
    'Old Property': 'old',
    'Under Construction':'under construction'
})

In [11]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,flat,sector 36,0.82,3.0,2.0,new,850.0,0.0,0.0,low
1,flat,sector 89,0.95,2.0,2.0,new,1226.0,1.0,0.0,low
2,flat,sector 86,0.46,2.0,2.0,under construction,681.41,0.0,0.0,low
3,flat,sohna road,0.32,2.0,2.0,new,1000.0,0.0,0.0,low
4,flat,sector 92,1.6,3.0,4.0,new,1615.0,1.0,1.0,high


In [12]:
df['property_type'] = df['property_type'].replace({'flat': 0, 'house': 1})

In [13]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,0,sector 36,0.82,3.0,2.0,new,850.0,0.0,0.0,low
1,0,sector 89,0.95,2.0,2.0,new,1226.0,1.0,0.0,low
2,0,sector 86,0.46,2.0,2.0,under construction,681.41,0.0,0.0,low
3,0,sohna road,0.32,2.0,2.0,new,1000.0,0.0,0.0,low
4,0,sector 92,1.6,3.0,4.0,new,1615.0,1.0,1.0,high


In [16]:
df['luxury_category'] =  df['luxury_category'].replace({'low': 0, 'medium': 1, 'high': 2})

In [17]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,0,sector 36,0.82,3.0,2.0,new,850.0,0.0,0.0,0
1,0,sector 89,0.95,2.0,2.0,new,1226.0,1.0,0.0,0
2,0,sector 86,0.46,2.0,2.0,under construction,681.41,0.0,0.0,0
3,0,sohna road,0.32,2.0,2.0,new,1000.0,0.0,0.0,0
4,0,sector 92,1.6,3.0,4.0,new,1615.0,1.0,1.0,2


In [19]:
new_df = pd.get_dummies(df, columns=['sector', 'agePossession'], drop_first= True)

In [20]:
X = new_df.drop(columns=['price'])
y = new_df['price']

In [22]:
X.head()

Unnamed: 0,property_type,bedRoom,bathroom,built_up_area,servant room,furnishing_type,luxury_category,sector_gwal pahari,sector_manesar,sector_sector 1,...,sector_sector 9,sector_sector 90,sector_sector 91,sector_sector 92,sector_sector 93,sector_sector 95,sector_sector 99,sector_sohna road,agePossession_old,agePossession_under construction
0,0,3.0,2.0,850.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,2.0,2.0,1226.0,1.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,2.0,2.0,681.41,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,2.0,2.0,1000.0,0.0,0.0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,3.0,4.0,1615.0,1.0,1.0,2,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [23]:
y_log = np.log1p(y)

In [24]:
y_log

0       0.598837
1       0.667829
2       0.378436
3       0.277632
4       0.955511
          ...   
3561    0.314811
3562    1.945910
3563    0.470004
3564    2.803360
3565    1.022451
Name: price, Length: 3566, dtype: float64

In [25]:
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

In [27]:
X_scaled

array([[-0.51651242, -0.072     , -0.87140804, ..., -0.21956078,
        -0.60521362, -0.30086522],
       [-0.51651242, -0.87434997, -0.87140804, ..., -0.21956078,
        -0.60521362, -0.30086522],
       [-0.51651242, -0.87434997, -0.87140804, ..., -0.21956078,
        -0.60521362,  3.32374748],
       ...,
       [-0.51651242, -1.67669993, -1.561031  , ..., -0.21956078,
         1.65230915, -0.30086522],
       [ 1.93606187,  1.53269994,  1.88708381, ..., -0.21956078,
         1.65230915, -0.30086522],
       [-0.51651242, -0.072     , -0.18178508, ..., -0.21956078,
        -0.60521362, -0.30086522]])

In [29]:
X_scaled = pd.DataFrame(X_scaled, columns=X.columns)

In [30]:
X_scaled

Unnamed: 0,property_type,bedRoom,bathroom,built_up_area,servant room,furnishing_type,luxury_category,sector_gwal pahari,sector_manesar,sector_sector 1,...,sector_sector 9,sector_sector 90,sector_sector 91,sector_sector 92,sector_sector 93,sector_sector 95,sector_sector 99,sector_sohna road,agePossession_old,agePossession_under construction
0,-0.516512,-0.07200,-0.871408,-0.833480,-0.746462,-0.667368,-0.983407,-0.071227,-0.093645,-0.041054,...,-0.078789,-0.15999,-0.06921,-0.169858,-0.050301,-0.133019,-0.109171,-0.219561,-0.605214,-0.300865
1,-0.516512,-0.87435,-0.871408,-0.521348,1.339653,-0.667368,-0.983407,-0.071227,-0.093645,-0.041054,...,-0.078789,-0.15999,-0.06921,-0.169858,-0.050301,-0.133019,-0.109171,-0.219561,-0.605214,-0.300865
2,-0.516512,-0.87435,-0.871408,-0.973432,-0.746462,-0.667368,-0.983407,-0.071227,-0.093645,-0.041054,...,-0.078789,-0.15999,-0.06921,-0.169858,-0.050301,-0.133019,-0.109171,-0.219561,-0.605214,3.323747
3,-0.516512,-0.87435,-0.871408,-0.708959,-0.746462,-0.667368,-0.983407,-0.071227,-0.093645,-0.041054,...,-0.078789,-0.15999,-0.06921,-0.169858,-0.050301,-0.133019,-0.109171,4.554547,-0.605214,-0.300865
4,-0.516512,-0.07200,0.507838,-0.198425,1.339653,1.038606,1.868833,-0.071227,-0.093645,-0.041054,...,-0.078789,-0.15999,-0.06921,5.887274,-0.050301,-0.133019,-0.109171,-0.219561,-0.605214,-0.300865
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3561,-0.516512,-0.87435,-0.871408,-1.097463,-0.746462,-0.667368,0.442713,-0.071227,-0.093645,-0.041054,...,-0.078789,-0.15999,-0.06921,-0.169858,-0.050301,-0.133019,-0.109171,-0.219561,-0.605214,-0.300865
3562,1.936062,1.53270,1.197461,3.630995,1.339653,-0.667368,1.868833,-0.071227,-0.093645,-0.041054,...,-0.078789,-0.15999,-0.06921,-0.169858,-0.050301,-0.133019,-0.109171,-0.219561,-0.605214,-0.300865
3563,-0.516512,-1.67670,-1.561031,-0.987055,-0.746462,1.038606,0.442713,-0.071227,-0.093645,-0.041054,...,-0.078789,-0.15999,-0.06921,-0.169858,-0.050301,-0.133019,-0.109171,-0.219561,1.652309,-0.300865
3564,1.936062,1.53270,1.887084,3.018354,1.339653,-0.667368,0.442713,-0.071227,-0.093645,-0.041054,...,-0.078789,-0.15999,-0.06921,-0.169858,-0.050301,-0.133019,-0.109171,-0.219561,1.652309,-0.300865


In [31]:
kfold = KFold(n_splits= 10, shuffle=True, random_state= 42)
scores = cross_val_score(LinearRegression(), X_scaled, y_log, cv=kfold, scoring='r2')

In [32]:
scores.mean(), scores.std()

(0.8548967596761885, 0.017657251192510152)

In [33]:
lr = LinearRegression()
ridge = Ridge(alpha= 0.0001)


In [34]:
lr.fit(X_scaled, y_log)

In [35]:
ridge.fit(X_scaled, y_log)0']).rename(columns={'level_1':

In [43]:
coef_df = (pd.DataFrame(lr.coef_.reshape(1,112), columns=X.columns).stack().reset_index().drop(columns=['level_0']).
rename(columns={'level_1':'feature', 0: 'coef'}))

In [44]:
coef_df

Unnamed: 0,feature,coef
0,property_type,0.119409
1,bedRoom,0.051878
2,bathroom,0.064207
3,built_up_area,0.214254
4,servant room,0.050714
...,...,...
107,sector_sector 95,-0.024257
108,sector_sector 99,-0.010231
109,sector_sohna road,-0.029670
110,agePossession_old,-0.006666


In [45]:
y_log.std()

0.5587046885958955

In [46]:
#standardized std deviation (a)
X_scaled['bedRoom'].std()

1.000140242620435

In [47]:
#Un standardized std deviation (b)
X['bedRoom'].std()

1.2465137192323026

In [48]:
y.std()

2.7818446154824525

In [50]:
y_log.std()

0.5587046885958955

In [51]:
np.expm1(y_log.std())

0.7484063022745442

In [None]:
# Here we have to find un standardized coefficients as the coefficients we have received from the linear regression 
# is standardized coefficients. as we have done standard scaler for the features.

In [49]:
# Standardized coefficient[x1 = 'bedRoom'] = 0.051878
# std y = 2.7818
# std ('bedroom') before scaling X['bedRoom'] = 1.2465
0.051878*(2.7818/1.2465) # This is unstandardized coefficient of 'bedroom' features 

0.11577554785399118

In [52]:
0.051878*(0.7484063022745442/1.2465)

0.0311478717604483

In [53]:
# Standardized coefficient[x1 = 'built_up_are'] = 0.214254
# std y = 2.7818
# std ('built_up') before scaling X['builtup'] = 1204.79
 
X['built_up_area'].std()

1204.7899739117795

In [56]:
0.214254* (0.7484/1204.79) # This is unstandardized coefficient of 'built_up' features 

0.0001330918198192216