In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression,Ridge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [18]:
df = pd.read_csv('../data/cleaned_data_v8_1.csv').drop(columns=['Landmark_Category'])
df.head()

Unnamed: 0,PROPERTY_TYPE,CITY,BEDROOM_NUM,AREA,BALCONY_NUM,Location,price,luxury_category,floor_category,age_category
0,Residential Apartment,Kolkata South,2.0,570.5,1.0,Amtala,0.191,Low,Low Floor,Old Property
1,Residential Apartment,Kolkata South,3.0,1115.5,1.0,EM Bypass,1.175,Low,Low Floor,Old Property
2,Residential Apartment,Kolkata South,3.0,1446.0,1.0,Garia,1.285,Low,Low Floor,Old Property
3,Residential Apartment,Kolkata South,3.0,1295.0,1.0,Joka,0.675,Low,Low Floor,Old Property
4,Residential Apartment,Kolkata South,2.0,920.0,1.0,Joka,0.47,Low,Low Floor,Old Property


In [19]:
df['age_category'].value_counts()

age_category
New Property    2863
Old Property    2377
Intermediate     430
Name: count, dtype: int64

In [20]:
df['luxury_category'].value_counts()

luxury_category
Low       3613
Medium    1510
High       547
Name: count, dtype: int64

In [21]:
df['floor_category'].value_counts()

floor_category
Low Floor     2921
Mid Floor     2220
High Floor     529
Name: count, dtype: int64

In [22]:
df['luxury_category'] = df['luxury_category'].replace({'Low':0,'Medium':1,'High':2})

In [23]:
df.head()

Unnamed: 0,PROPERTY_TYPE,CITY,BEDROOM_NUM,AREA,BALCONY_NUM,Location,price,luxury_category,floor_category,age_category
0,Residential Apartment,Kolkata South,2.0,570.5,1.0,Amtala,0.191,0,Low Floor,Old Property
1,Residential Apartment,Kolkata South,3.0,1115.5,1.0,EM Bypass,1.175,0,Low Floor,Old Property
2,Residential Apartment,Kolkata South,3.0,1446.0,1.0,Garia,1.285,0,Low Floor,Old Property
3,Residential Apartment,Kolkata South,3.0,1295.0,1.0,Joka,0.675,0,Low Floor,Old Property
4,Residential Apartment,Kolkata South,2.0,920.0,1.0,Joka,0.47,0,Low Floor,Old Property


In [24]:
new_df = pd.get_dummies(df,columns=['PROPERTY_TYPE','CITY','Location','floor_category','age_category'],drop_first=True,dtype=float)

In [25]:
new_df.head()

Unnamed: 0,BEDROOM_NUM,AREA,BALCONY_NUM,price,luxury_category,PROPERTY_TYPE_Independent/Builder Floor,PROPERTY_TYPE_Residential Apartment,CITY_Kolkata East,CITY_Kolkata North,CITY_Kolkata South,...,Location_new alipore block G,Location_on embypass,Location_purbachal,Location_rajathat chowmatha,Location_sunny park,Location_telipukur,floor_category_Low Floor,floor_category_Mid Floor,age_category_New Property,age_category_Old Property
0,2.0,570.5,1.0,0.191,0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,3.0,1115.5,1.0,1.175,0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,3.0,1446.0,1.0,1.285,0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,3.0,1295.0,1.0,0.675,0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,2.0,920.0,1.0,0.47,0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [26]:
X = new_df.drop(columns=['price'])
y = new_df['price']

In [27]:
y_log = np.log1p(y)

In [28]:
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

In [29]:
X_scaled = pd.DataFrame(X_scaled,columns=X.columns)

In [30]:
X_scaled.head()

Unnamed: 0,BEDROOM_NUM,AREA,BALCONY_NUM,luxury_category,PROPERTY_TYPE_Independent/Builder Floor,PROPERTY_TYPE_Residential Apartment,CITY_Kolkata East,CITY_Kolkata North,CITY_Kolkata South,CITY_Kolkata West,...,Location_new alipore block G,Location_on embypass,Location_purbachal,Location_rajathat chowmatha,Location_sunny park,Location_telipukur,floor_category_Low Floor,floor_category_Mid Floor,age_category_New Property,age_category_Old Property
0,-0.804804,-0.983839,-0.401712,-0.69135,-0.164845,0.250982,-0.664566,-0.541336,1.156782,-0.18159,...,-0.013281,-0.013281,-0.02657,-0.018785,-0.013281,-0.013281,0.970111,-0.802171,-1.009926,1.177013
1,0.381994,-0.223542,-0.401712,-0.69135,-0.164845,0.250982,-0.664566,-0.541336,1.156782,-0.18159,...,-0.013281,-0.013281,-0.02657,-0.018785,-0.013281,-0.013281,0.970111,-0.802171,-1.009926,1.177013
2,0.381994,0.237518,-0.401712,-0.69135,-0.164845,0.250982,-0.664566,-0.541336,1.156782,-0.18159,...,-0.013281,-0.013281,-0.02657,-0.018785,-0.013281,-0.013281,0.970111,-0.802171,-1.009926,1.177013
3,0.381994,0.026867,-0.401712,-0.69135,-0.164845,0.250982,-0.664566,-0.541336,1.156782,-0.18159,...,-0.013281,-0.013281,-0.02657,-0.018785,-0.013281,-0.013281,0.970111,-0.802171,-1.009926,1.177013
4,-0.804804,-0.496272,-0.401712,-0.69135,-0.164845,0.250982,-0.664566,-0.541336,1.156782,-0.18159,...,-0.013281,-0.013281,-0.02657,-0.018785,-0.013281,-0.013281,0.970111,-0.802171,-1.009926,1.177013


In [34]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(LinearRegression(), X_scaled, y_log, cv=kfold, scoring='r2')

In [35]:
scores.mean(),scores.std()

(-6.564307238600746e+26, 7.072042340467314e+26)

In [33]:
# 1. Import necessary libraries
import statsmodels.api as sm

# 2. Add a constant to X
X_with_const = sm.add_constant(X_scaled)

# 3. Fit the model
model = sm.OLS(y_log, X_with_const).fit()

# 4. Obtain summary statistics
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.887
Model:                            OLS   Adj. R-squared:                  0.877
Method:                 Least Squares   F-statistic:                     91.28
Date:                Thu, 01 Feb 2024   Prob (F-statistic):               0.00
Time:                        16:15:15   Log-Likelihood:                 3670.3
No. Observations:                5670   AIC:                            -6441.
Df Residuals:                    5220   BIC:                            -3451.
Df Model:                         449                                         
Covariance Type:            nonrobust                                         
                                                                coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------

In [37]:
y_log.std()

0.37686351638315313

In [42]:
X['AREA'].std()

716.8888922817067

In [44]:
0.212 * (0.376/716.88)

0.00011119294721571252

In [45]:
np.expm1(0.0001111)

0.00011110617183356148