In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [4]:
df = pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv').drop(columns = ['store room','floor_category','balcony'])

In [5]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,flat,sector 36,0.82,3.0,2.0,New Property,850.0,0.0,0.0,Low
1,flat,sector 89,0.95,2.0,2.0,New Property,1226.0,1.0,0.0,Low
2,flat,sohna road,0.32,2.0,2.0,New Property,1000.0,0.0,0.0,Low
3,flat,sector 92,1.6,3.0,4.0,Relatively New,1615.0,1.0,1.0,High
4,flat,sector 102,0.48,2.0,2.0,Relatively New,582.0,0.0,0.0,High


In [6]:
#Plan of the action
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished

In [7]:
# Numerical = bedRoom, bathroom, built_up_area, servant room
# Ordinal = property_type, furnishing_type, luxury_category 
# OHE = sector, agePossession

In [8]:
df['agePossession'].value_counts()

agePossession
Relatively New        1732
Moderately Old         619
New Property           599
Old Property           327
Under Construction     277
Name: count, dtype: int64

In [14]:
df['agePossession'] = df['agePossession'].replace(
    {
    'Relatively New' : 'new',
    'Moderately Old' : 'old',
    'New Property'  : 'new',
    'Old Property'  : 'old',
    'Under Construction'  : 'Under Construction'
})

In [15]:
df['agePossession'].value_counts()

agePossession
new                   2331
old                    946
Under Construction     277
Name: count, dtype: int64

In [17]:
df.tail()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
3549,flat,sector 84,0.37,2.0,2.0,new,532.0,0.0,0.0,Medium
3550,house,sector 109,6.0,5.0,5.0,new,6228.0,1.0,0.0,High
3551,flat,sector 2,0.6,1.0,1.0,old,665.0,0.0,1.0,Medium
3552,house,sector 43,15.5,5.0,6.0,old,5490.0,1.0,0.0,Medium
3553,flat,sector 68,1.78,3.0,3.0,new,1845.0,0.0,1.0,Medium


In [18]:
df['property_type'] = df['property_type'].replace({'flat': 0, 'house':1})

  df['property_type'] = df['property_type'].replace({'flat': 0, 'house':1})


In [20]:
df.tail()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
3549,0,sector 84,0.37,2.0,2.0,new,532.0,0.0,0.0,Medium
3550,1,sector 109,6.0,5.0,5.0,new,6228.0,1.0,0.0,High
3551,0,sector 2,0.6,1.0,1.0,old,665.0,0.0,1.0,Medium
3552,1,sector 43,15.5,5.0,6.0,old,5490.0,1.0,0.0,Medium
3553,0,sector 68,1.78,3.0,3.0,new,1845.0,0.0,1.0,Medium


In [21]:
df['luxury_category'].value_counts()

luxury_category
Low       1594
Medium    1465
High       495
Name: count, dtype: int64

In [26]:
df['luxury_category'] = df['luxury_category'].replace({'Low':0,'Medium':1, 'High':2})

  df['luxury_category'] = df['luxury_category'].replace({'Low':0,'Medium':1, 'High':2})


In [27]:
df.tail(7)

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
3547,0,sector 106,0.92,2.0,2.0,new,940.0,0.0,0.0,2
3548,1,sector 6,0.75,4.0,4.0,old,480.0,0.0,0.0,0
3549,0,sector 84,0.37,2.0,2.0,new,532.0,0.0,0.0,1
3550,1,sector 109,6.0,5.0,5.0,new,6228.0,1.0,0.0,2
3551,0,sector 2,0.6,1.0,1.0,old,665.0,0.0,1.0,1
3552,1,sector 43,15.5,5.0,6.0,old,5490.0,1.0,0.0,1
3553,0,sector 68,1.78,3.0,3.0,new,1845.0,0.0,1.0,1


In [28]:
new_df = pd.get_dummies(df , columns = ['sector', 'agePossession'], drop_first = False)

In [29]:
new_df

Unnamed: 0,property_type,price,bedRoom,bathroom,built_up_area,servant room,furnishing_type,luxury_category,sector_dwarka expressway,sector_gwal pahari,...,sector_sector 90,sector_sector 91,sector_sector 92,sector_sector 93,sector_sector 95,sector_sector 99,sector_sohna road,agePossession_Under Construction,agePossession_new,agePossession_old
0,0,0.82,3.0,2.0,850.0,0.0,0.0,0,False,False,...,False,False,False,False,False,False,False,False,True,False
1,0,0.95,2.0,2.0,1226.0,1.0,0.0,0,False,False,...,False,False,False,False,False,False,False,False,True,False
2,0,0.32,2.0,2.0,1000.0,0.0,0.0,0,False,False,...,False,False,False,False,False,False,True,False,True,False
3,0,1.60,3.0,4.0,1615.0,1.0,1.0,2,False,False,...,False,False,True,False,False,False,False,False,True,False
4,0,0.48,2.0,2.0,582.0,0.0,0.0,2,False,False,...,False,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3549,0,0.37,2.0,2.0,532.0,0.0,0.0,1,False,False,...,False,False,False,False,False,False,False,False,True,False
3550,1,6.00,5.0,5.0,6228.0,1.0,0.0,2,False,False,...,False,False,False,False,False,False,False,False,True,False
3551,0,0.60,1.0,1.0,665.0,0.0,1.0,1,False,False,...,False,False,False,False,False,False,False,False,False,True
3552,1,15.50,5.0,6.0,5490.0,1.0,0.0,1,False,False,...,False,False,False,False,False,False,False,False,False,True


In [30]:
X = new_df.drop(columns = ['price'])
y = new_df['price']

In [31]:
y_log = np.log1p(y)

In [32]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [34]:
X_scaled = pd.DataFrame(X_scaled , columns = X.columns)

In [35]:
X_scaled

Unnamed: 0,property_type,bedRoom,bathroom,built_up_area,servant room,furnishing_type,luxury_category,sector_dwarka expressway,sector_gwal pahari,sector_manesar,...,sector_sector 90,sector_sector 91,sector_sector 92,sector_sector 93,sector_sector 95,sector_sector 99,sector_sohna road,agePossession_Under Construction,agePossession_new,agePossession_old
0,-0.517180,-0.074329,-0.874300,-0.831662,-0.747968,-0.668281,-0.984642,-0.047498,-0.071348,-0.093805,...,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.219245,-0.290738,0.724339,-0.602271
1,-0.517180,-0.877269,-0.874300,-0.522517,1.336956,-0.668281,-0.984642,-0.047498,-0.071348,-0.093805,...,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.219245,-0.290738,0.724339,-0.602271
2,-0.517180,-0.877269,-0.874300,-0.708333,-0.747968,-0.668281,-0.984642,-0.047498,-0.071348,-0.093805,...,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,4.561105,-0.290738,0.724339,-0.602271
3,-0.517180,-0.074329,0.505173,-0.202684,1.336956,1.037949,1.866207,-0.047498,-0.071348,-0.093805,...,-0.160267,-0.069328,5.877074,-0.050386,-0.125375,-0.109357,-0.219245,-0.290738,0.724339,-0.602271
4,-0.517180,-0.877269,-0.874300,-1.052010,-0.747968,-0.668281,1.866207,-0.047498,-0.071348,-0.093805,...,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.219245,-0.290738,0.724339,-0.602271
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3549,-0.517180,-0.877269,-0.874300,-1.093119,-0.747968,-0.668281,0.440783,-0.047498,-0.071348,-0.093805,...,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.219245,-0.290738,0.724339,-0.602271
3550,1.933563,1.531549,1.194909,3.590095,1.336956,-0.668281,1.866207,-0.047498,-0.071348,-0.093805,...,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.219245,-0.290738,0.724339,-0.602271
3551,-0.517180,-1.680208,-1.564036,-0.983768,-0.747968,1.037949,0.440783,-0.047498,-0.071348,-0.093805,...,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.219245,-0.290738,-1.380568,1.660383
3552,1.933563,1.531549,1.884645,2.983317,1.336956,-0.668281,0.440783,-0.047498,-0.071348,-0.093805,...,-0.160267,-0.069328,-0.170153,-0.050386,-0.125375,-0.109357,-0.219245,-0.290738,-1.380568,1.660383


In [37]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(LinearRegression(),X_scaled ,y_log ,cv= kfold ,scoring = 'r2')

In [38]:
scores.mean()

0.8512613057405425

In [39]:
scores.std()

0.016992929105286176

In [40]:
lr= LinearRegression()
ridge  = Ridge(alpha= 0.0001)

In [41]:
lr.fit(X_scaled ,y_log)

In [42]:
ridge.fit(X_scaled ,y_log)

In [43]:
coef_df = pd.DataFrame(ridge.coef_.reshape(1,112),columns=X.columns).stack().reset_index().drop(columns=['level_0']).rename(columns={'level_1':'feature',0:'coef'})

ValueError: cannot reshape array of size 114 into shape (1,112)

In [46]:
# 1. Import necessary libraries
import statsmodels.api as sm

# 2. Add a constant to X
X_with_const = sm.add_constant(X_scaled)

# 3. Fit the model
model = sm.OLS(y_log, X_with_const).fit()

# 4. Obtain summary statistics
print(model.summary())


                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.865
Model:                            OLS   Adj. R-squared:                  0.860
Method:                 Least Squares   F-statistic:                     196.7
Date:                Sun, 06 Apr 2025   Prob (F-statistic):               0.00
Time:                        14:07:09   Log-Likelihood:                 588.22
No. Observations:                3554   AIC:                            -950.4
Df Residuals:                    3441   BIC:                            -252.6
Df Model:                         112                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
const   