In [7]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import KFold , cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [8]:
# Load the post-feature-selection data set and discard the three columns
# that are not used by the linear model below.
df = (
    pd.read_csv('gurgaon_properties_post_feature_selection_v2.csv')
    .drop(['store room', 'floor_category', 'balcony'], axis='columns')
)

In [9]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,flat,sector 7,0.45,2,2,Relatively New,814.0,0,1,Low
1,flat,sector 3,0.5,2,2,Old Property,588.0,0,0,Low
2,flat,sohna road,0.4,2,2,New Property,538.0,0,1,Low
3,flat,sector 61,1.47,2,2,New Property,1086.0,0,1,Medium
4,flat,sector 92,0.7,2,2,Under Construction,1217.0,0,1,Low


In [14]:
# Collapse the five raw age labels into three coarser buckets.
age_buckets = {
    'new': ['Relatively New', 'New Property'],
    'old': ['Moderately Old', 'Old Property'],
    'under construction': ['Under Construction'],
}
# Invert to {raw label -> bucket} so it can be handed to `replace`.
age_label_map = {
    raw_label: bucket
    for bucket, raw_labels in age_buckets.items()
    for raw_label in raw_labels
}
df['agePossession'] = df['agePossession'].replace(age_label_map)

In [15]:
# Row counts per age bucket after the label consolidation.
df['agePossession'].value_counts()

agePossession
new                   2486
old                    942
under construction     126
Name: count, dtype: int64

In [19]:
# Encode property type as a binary flag (flat -> 0, house -> 1).
# An explicit lookup via `map` is used instead of `replace`: swapping strings
# for ints with `replace` relies on pandas' silent object -> int downcasting,
# which is deprecated and emits a FutureWarning.
# Values missing from the mapping (including already-encoded 0/1 when the cell
# is re-run) pass through unchanged, matching what `replace` did.
property_type_codes = {'flat': 0, 'house': 1}
df['property_type'] = df['property_type'].map(
    lambda value: property_type_codes.get(value, value)
)

  df['property_type'] = df['property_type'].replace({'flat': 0 , 'house': 1 })


In [20]:
# Check that property_type now holds numeric codes.
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,0,sector 7,0.45,2,2,new,814.0,0,1,Low
1,0,sector 3,0.5,2,2,old,588.0,0,0,Low
2,0,sohna road,0.4,2,2,new,538.0,0,1,Low
3,0,sector 61,1.47,2,2,new,1086.0,0,1,Medium
4,0,sector 92,0.7,2,2,under construction,1217.0,0,1,Low


In [33]:
# Ordinal-encode the luxury tier (Low < Medium < High).
# An explicit lookup via `map` is used instead of `replace`: swapping strings
# for ints with `replace` relies on pandas' silent object -> int downcasting,
# which is deprecated and emits a FutureWarning.
# Values missing from the mapping (including already-encoded 0/1/2 when the
# cell is re-run) pass through unchanged, matching what `replace` did.
luxury_category_codes = {'Low': 0, 'Medium': 1, 'High': 2}
df['luxury_category'] = df['luxury_category'].map(
    lambda value: luxury_category_codes.get(value, value)
)

  df['luxury_category'] = df['luxury_category'].replace({'Low':0, 'Medium':1 , 'High' : 2})


In [34]:
# Check that luxury_category now holds numeric codes.
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,agePossession,built_up_area,servant room,furnishing_type,luxury_category
0,0,sector 7,0.45,2,2,new,814.0,0,1,0
1,0,sector 3,0.5,2,2,old,588.0,0,0,0
2,0,sohna road,0.4,2,2,new,538.0,0,1,0
3,0,sector 61,1.47,2,2,new,1086.0,0,1,1
4,0,sector 92,0.7,2,2,under construction,1217.0,0,1,0


In [35]:
# One-hot encode the remaining categorical columns; the first level of each
# is dropped so it acts as the reference category.
categorical_columns = ['sector', 'agePossession']
new_df = pd.get_dummies(
    data=df,
    columns=categorical_columns,
    drop_first=True,
)

In [36]:
# (rows, columns) of the encoded frame, target column included.
new_df.shape

(3554, 106)

In [37]:
# Separate the target (price) from the feature matrix.
target_column = 'price'
y = new_df[target_column]
x = new_df.drop(target_column, axis='columns')

In [38]:
# Model log(1 + price) rather than raw price; predictions on this scale must
# be mapped back with np.expm1.
y_log = np.log1p(y)

In [40]:
# Standardise every feature column. Note the scaler statistics are learned
# from all rows of `x`, not from a training subset.
scaler = StandardScaler().fit(x)
x_scaled = scaler.transform(x)

In [42]:
# Wrap the scaled array back into a DataFrame. Reusing x's index as well as
# its column labels keeps x_scaled row-aligned with y / y_log even when the
# frame does not carry a default 0..n-1 index (e.g. after filtering rows).
x_scaled = pd.DataFrame(x_scaled, columns=x.columns, index=x.index)

In [45]:
# 10-fold cross-validated R^2 on the log-price target.
# Scaling is done inside the pipeline so that, for each fold, the scaler is
# fitted on the training rows only. Feeding a matrix that was standardised on
# the full data set would leak held-out statistics into training.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
cv_model = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression()),
])
scores = cross_val_score(cv_model, x, y_log, cv=kfold, scoring='r2')

In [46]:
# Mean and spread of the per-fold R^2 scores.
scores.mean() , scores.std()

(np.float64(0.8543377380386342), np.float64(0.023090800055170484))

In [47]:
# Fit the final linear model on the full scaled data set; `fit` returns the
# estimator itself, which is echoed as the cell output.
lr = LinearRegression().fit(x_scaled, y_log)
lr

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [48]:
# Fitted coefficients, one per column of x_scaled.
lr.coef_

array([ 1.10405122e-01,  5.10700979e-02,  6.38960208e-02,  2.23109590e-01,
        4.86685653e-02, -1.69051325e-03,  4.26061366e-03,  1.58682014e-02,
        6.35282048e-02,  2.01208631e-02,  3.27754788e-02, -1.72016790e-03,
        2.29714536e-02,  1.45901585e-02,  4.27360933e-02,  5.71857466e-02,
        2.96428517e-04,  2.50524494e-02,  3.33809522e-02,  4.61652559e-02,
        5.76569986e-02,  1.65210069e-03, -1.18394014e-03,  4.48292515e-02,
        1.28506324e-02,  2.57329760e-02,  3.53548799e-02, -3.04355283e-05,
        5.29093275e-02,  1.53851574e-02,  3.48731732e-02,  8.09617821e-02,
        9.62644206e-02,  1.27502760e-02,  6.53497675e-02,  2.27683516e-03,
        1.43887181e-02,  3.46746238e-02,  5.96278717e-02,  2.04305453e-02,
        3.25800446e-02,  2.45077392e-02,  1.58741778e-02,  5.17021525e-03,
        1.64059214e-02,  2.90460542e-02,  8.91897568e-02,  3.82159916e-02,
        2.92712917e-02,  2.82533578e-02,  8.16400597e-02,  5.61455052e-02,
        6.06878310e-03,  

In [50]:
# Number of fitted coefficients (should equal the number of feature columns).
lr.coef_.shape

(105,)

In [58]:
# Pair every feature name with its fitted coefficient.
# Built directly from the two aligned sequences, so it does not depend on a
# hard-coded feature count and keeps working when the column set changes.
coef_df = pd.DataFrame({'feature': x.columns, 'coef': lr.coef_})

In [59]:
# Show the feature / coefficient table.
coef_df

Unnamed: 0,feature,coef
0,property_type,0.110405
1,bedRoom,0.051070
2,bathroom,0.063896
3,built_up_area,0.223110
4,servant room,0.048669
...,...,...
100,sector_sector 95,0.002630
101,sector_sector 99,0.012944
102,sector_sohna road,0.017850
103,agePossession_old,-0.006612


In [60]:
import statsmodels.api as sm

# statsmodels' OLS does not add an intercept on its own, so prepend a
# constant column to the scaled features.
x_with_const = sm.add_constant(x_scaled)

# Ordinary least squares on the log-transformed target; print the text report
# (coefficients, standard errors, p-values, fit statistics).
model = sm.OLS(endog=y_log, exog=x_with_const).fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.867
Model:                            OLS   Adj. R-squared:                  0.863
Method:                 Least Squares   F-statistic:                     214.0
Date:                Sat, 21 Jun 2025   Prob (F-statistic):               0.00
Time:                        01:31:37   Log-Likelihood:                 615.00
No. Observations:                3554   AIC:                            -1018.
Df Residuals:                    3448   BIC:                            -363.4
Df Model:                         105                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
const   

In [61]:
# Standard deviation of the raw (untransformed) price target.
y.std()

np.float64(2.7839195632397495)