In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

In [None]:
# Read the housing listings from the project-relative CSV into `df`.
df = pd.read_csv('data/india_housing_prices.csv')

# Print the (rows, columns) tuple, then preview the first rows as cell output.
row_col_counts = df.shape
print(row_col_counts)
df.head()

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Missing-value count per column (`isna` is the same check as `isnull`).
df.isna().sum()

In [None]:
# Feature engineering: add derived columns to `df` in place.
# Every derived column is computed from the raw columns plus the property age,
# and columns are appended in a fixed order so later column-order-dependent
# steps see the same layout.
size = df['Size_in_SqFt']
bhk = df['BHK']
floor_no = df['Floor_No']
price_per_sqft = df['Price_per_SqFt']

# Property age relative to the hard-coded reference year 2026.
df['Age_of_Property'] = 2026 - df['Year_Built']
age = df['Age_of_Property']

derived_features = {
    # NOTE(review): despite the name this is size per BHK, not a price.
    # A BHK of 0 would produce inf here -- confirm the data excludes it.
    'Price_per_BHK': size / bhk,
    # +1 keeps the denominator non-zero when Total_Floors is 0.
    'Floor_Ratio': floor_no / (df['Total_Floors'] + 1),
    'Size_BHK_Interaction': size * bhk,
    'Size_Squared': size ** 2,
    'Size_Cubed': size ** 3,
    # NOTE(review): if Price_in_Lakhs is derived from Price_per_SqFt and
    # Size_in_SqFt, the Price_per_SqFt-based features leak the target -- confirm.
    'Price_per_SqFt_Squared': price_per_sqft ** 2,
    'BHK_Squared': bhk ** 2,
    'Age_Squared': age ** 2,
    'Size_Age_Interaction': size * age,
    'BHK_Age_Interaction': bhk * age,
    'Floor_Size_Interaction': floor_no * size,
    'Schools_Hospitals': df['Nearby_Schools'] + df['Nearby_Hospitals'],
    # Number of comma-separated entries in the Amenities string.
    'Amenities_Count': df['Amenities'].str.count(',') + 1,
    'Log_Size': np.log1p(size),
    'Log_Price_per_SqFt': np.log1p(price_per_sqft),
}
for feature_name, feature_values in derived_features.items():
    df[feature_name] = feature_values

print('Feature engineering completed')

In [None]:
# Raw numeric columns whose extreme values are capped below.
# NOTE(review): the list includes the target 'Price_in_Lakhs', and the values
# are capped (clamped to the fences), not removed -- confirm this is intended.
numerical_cols = ['BHK', 'Size_in_SqFt', 'Price_in_Lakhs', 'Price_per_SqFt',
                  'Year_Built', 'Floor_No', 'Total_Floors', 'Nearby_Schools', 'Nearby_Hospitals']


def _cap_to_iqr_fences(values):
    """Clamp `values` to [Q1 - 1.5*IQR, Q3 + 1.5*IQR].

    Values above the upper fence are replaced by the fence first, then values
    below the lower fence are replaced; everything else passes through.
    Returns a NumPy array of the same length as `values`.
    """
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    spread = q3 - q1
    low_fence = q1 - 1.5 * spread
    high_fence = q3 + 1.5 * spread
    capped_high = np.where(values > high_fence, high_fence, values)
    return np.where(capped_high < low_fence, low_fence, capped_high)


for col in numerical_cols:
    df[col] = _cap_to_iqr_fences(df[col])

print('Outlier removal completed')

In [None]:
# One-hot encode the frame into indicator columns; drop_first=True omits the
# first level of each encoded column.
df_encoded = pd.get_dummies(data=df, drop_first=True)

# Print the encoded (rows, columns), then preview the result as cell output.
print(df_encoded.shape)
df_encoded.head()

In [None]:
# Separate predictors from the target; 'ID' is dropped along with the target.
X = df_encoded.drop(['Price_in_Lakhs', 'ID'], axis=1)
y = df_encoded['Price_in_Lakhs']

# Fit the selector on training rows only. Scoring features against the target
# on all rows would let the held-out rows influence which features are kept
# (data leakage) and inflate the test metrics.
# The row partition is reproduced with the same test_size / random_state as
# the train_test_split call further down; for a fixed number of rows that
# yields the same train/test positions. Keep the two calls in sync.
train_rows, _ = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)

# Keep at most 500 features, ranked by the univariate f_regression score.
selector = SelectKBest(score_func=f_regression, k=min(500, X.shape[1]))
selector.fit(X.iloc[train_rows], y.iloc[train_rows])

# Apply the fitted selection to every row so the later split can slice it.
X_selected = selector.transform(X)
selected_features = X.columns[selector.get_support()]
# Carry the original index over so X stays row-aligned with y.
X = pd.DataFrame(X_selected, columns=selected_features, index=X.index)
print(f'Selected {X.shape[1]} features')

In [None]:
# NOTE: this cell overwrites X. Re-running it without first re-running the
# cell that builds X would scale and expand the already expanded matrix.

# Learn the scaling statistics (mean / std) from training rows only, so the
# held-out rows do not leak into preprocessing. The partition is reproduced
# with the same test_size / random_state as the train_test_split call further
# down; for a fixed number of rows that yields the same train/test positions.
# Keep the two calls in sync.
train_rows, _ = train_test_split(np.arange(len(X)), test_size=0.2, random_state=42)

scaler = StandardScaler()
scaler.fit(X.iloc[train_rows])
X_scaled = scaler.transform(X)
# Preserve the index so X stays row-aligned with y.
X = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

# Add pairwise interaction terms (no squares, no bias column). Fitting here
# only records the input layout, so using all rows does not leak anything.
# NOTE(review): k input columns become k + k*(k-1)/2 dense output columns
# (125,250 for k=500) -- confirm this fits in memory for the full dataset.
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=True)
X_poly = poly.fit_transform(X)
poly_feature_names = poly.get_feature_names_out(X.columns)
X = pd.DataFrame(X_poly, columns=poly_feature_names, index=X.index)
print(f'Polynomial features created: {X.shape[1]} features')

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the partition
# reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each feature matrix.
for split_label, split_frame in (("Training set", X_train), ("Test set", X_test)):
    print(f"{split_label}: {split_frame.shape}")

In [None]:
# Fit an L2-regularised linear model (alpha=10.0) on the training split.
# `fit` returns the estimator itself, so construction and fitting are chained.
model = Ridge(alpha=10.0).fit(X_train, y_train)
print("Model trained!")

In [None]:
# Score the fitted model on the held-out split.
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print each metric rounded to two decimals, in a fixed order.
for metric_label, metric_value in (("MAE", mae), ("MSE", mse), ("R2 Score", r2)):
    print(f"{metric_label}: {metric_value:.2f}")

In [None]:
# Persist the fitted model and every preprocessing object needed to rebuild
# the feature matrix at inference time.
# Each entry is (object, target path, extra joblib.dump keyword arguments).
# The model is written twice: once uncompressed and once with compress=3.
artifacts = [
    (model, 'linear_regression_model.joblib', {}),
    (model, 'model_compressed.joblib', {'compress': 3}),
    (scaler, 'scaler.joblib', {}),
    (selector, 'selector.joblib', {}),
    (poly, 'poly.joblib', {}),
    (selected_features.tolist(), 'selected_features.joblib', {}),
]
for artifact, target_path, dump_kwargs in artifacts:
    joblib.dump(artifact, target_path, **dump_kwargs)

print("Model saved!")