In [1]:
import joblib
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the dataset CSV into `df`; the path is resolved relative to the working directory.
df = pd.read_csv(filepath_or_buffer='advanced-feature-dataset.csv')

In [3]:
# Remove three columns from the frame before any encoding or modelling happens.
df = df.drop(labels=['store room', 'floor_category', 'balcony'], axis='columns')

In [4]:
# Collapse the five listed agePossession labels into three buckets:
# 'new', 'old' and 'under construction'. Labels not listed here are left as-is.
age_buckets = {
    **dict.fromkeys(('Relatively New', 'New Property'), 'new'),
    **dict.fromkeys(('Moderately Old', 'Old Property'), 'old'),
    'Under Construction': 'under construction',
}
df['agePossession'] = df['agePossession'].replace(to_replace=age_buckets)

In [5]:
# Integer codes for the three categorical columns.
# A per-value lookup (`Series.map` with `dict.get`) is used instead of `Series.replace`:
# swapping strings for ints through `replace` relies on silent object -> int downcasting,
# which pandas 2.2+ deprecates (FutureWarning). Values that are not listed (including NaN)
# pass through unchanged, just as `replace` left them, and re-running the cell is a no-op.
category_codes = {
    'property_type': {'flat': 0, 'house': 1},
    'furnishing_type': {'unfurnished': 0, 'semifurnished': 1, 'furnished': 2},
    'luxury_category': {'low': 0, 'medium': 1, 'high': 2},
}
for column, codes in category_codes.items():
    # `codes=codes` binds the current mapping at definition time (avoids late binding).
    df[column] = df[column].map(lambda value, codes=codes: codes.get(value, value))

In [6]:
# Snapshot the cleaned, integer-encoded frame to disk (taken before one-hot encoding).
joblib.dump(value=df, filename='insights_df_final.pkl')

['insights_df_final.pkl']

In [7]:
# One-hot encode agePossession; drop_first=True omits the first level's dummy column.
df_encoded = pd.get_dummies(data=df, drop_first=True, columns=['agePossession'])

In [8]:
# Target is the raw price; the features are every other column except 'sector'.
y = df_encoded['price']
X = df_encoded.drop(labels=['price', 'sector'], axis='columns')

In [9]:
# Put the target on a log scale: log1p(price) = log(1 + price).
# Anything fitted on y_log predicts in log space; np.expm1 is the inverse transform.
y_log = np.log1p(y)

In [10]:
# Save the feature names, in the exact column order of X, as a plain Python list.
joblib.dump(value=X.columns.to_list(), filename='model_columns_final.pkl')

['model_columns_final.pkl']

In [17]:
# Standardise every feature column with StandardScaler (default settings) and wrap the
# resulting array back into a DataFrame.
# NOTE(review): the scaler is fitted on all rows; no train/test split is visible in this notebook.
scaler = StandardScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    columns=X.columns,
    # Keep X's row labels. Without this the frame gets a fresh RangeIndex and can fall out
    # of alignment with y / y_log (statsmodels rejects endog/exog with differing indices).
    index=X.index,
)

In [12]:
# Persist the fitted scaler so the same standardisation can be applied outside this notebook.
joblib.dump(value=scaler, filename='scaler_final.pkl')

['scaler_final.pkl']

In [13]:
# Ridge regression with a very small L2 penalty (alpha = 1e-4), trained on the
# standardised features against the log1p-transformed price.
ridge_model = Ridge(alpha=1e-4)
ridge_model.fit(X=X_scaled, y=y_log)

In [14]:
# Persist the fitted ridge model to disk.
joblib.dump(value=ridge_model, filename='ridge_model_final.pkl')

['ridge_model_final.pkl']

In [19]:
# Fit an ordinary-least-squares model on the same standardised features and
# log1p(price) target used for the ridge model, to inspect coefficients and p-values.
# `statsmodels.api` (sm) is imported with the other dependencies in the first cell.
X_with_const = sm.add_constant(X_scaled)  # sm.OLS fits no intercept unless a constant column is present

model = sm.OLS(y_log, X_with_const).fit()

print(model.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.817
Model:                            OLS   Adj. R-squared:                  0.817
Method:                 Least Squares   F-statistic:                     1214.
Date:                Sat, 02 Aug 2025   Prob (F-statistic):               0.00
Time:                        23:07:20   Log-Likelihood:                 63.727
No. Observations:                3543   AIC:                            -99.45
Df Residuals:                    3529   BIC:                            -13.04
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                                       coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------------
const   

# 1. Overall Model Performance

### The model explains 81.7% of the variance in log-transformed price, `log1p(price)` (R² = 0.817)

# 2. Analysis of Individual Features (The coef and P>|t| columns)

### Statistically Significant Predictors (P-value < 0.05)

### 'area_x_sector_avg_price' is not statistically significant at the 5% level, so its coefficient cannot be distinguished from zero given the other features
### p-value = 0.205 (> 0.05)

# 3. Model Health & Diagnostics