# <span style="color:gold;">Mumbai House Price Prediction Model</span>

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [14]:
df = pd.read_csv('Mumbai.csv',encoding='unicode_escape')

### <span style="color:lightgreen;">Data Cleaning and Preprocessing</span>

In [15]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,price,sqrt,location,bhk,New/Resale,Gymnasium,Lift Available,CarParking,Maintenance Staff,24x7 Security,Children's Play Area,Clubhouse,Intercom,Landscaped Gardens,Indoor Games,Gas Connection,Jogging Track,Swimming Pool
0,0,4850000,720,Kharghar,1,0,0,1,1,1,1,0,0,0,0,0,0,0,0
1,1,4500000,600,Kharghar,1,0,1,1,1,1,1,0,1,0,0,0,0,1,1
2,2,6700000,650,Kharghar,1,0,1,1,1,1,1,1,1,1,0,0,0,1,1
3,3,4500000,650,Kharghar,1,0,0,1,1,1,1,0,0,1,1,0,0,0,0
4,4,5000000,665,Kharghar,1,0,0,1,1,1,1,0,0,1,1,0,0,0,0


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6347 entries, 0 to 6346
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0            6347 non-null   int64 
 1   price                 6347 non-null   int64 
 2   sqrt                  6347 non-null   int64 
 3   location              6347 non-null   object
 4   bhk                   6347 non-null   int64 
 5   New/Resale            6347 non-null   int64 
 6   Gymnasium             6347 non-null   int64 
 7   Lift Available        6347 non-null   int64 
 8   CarParking            6347 non-null   int64 
 9   Maintenance Staff     6347 non-null   int64 
 10  24x7 Security         6347 non-null   int64 
 11  Children's Play Area  6347 non-null   int64 
 12  Clubhouse             6347 non-null   int64 
 13  Intercom              6347 non-null   int64 
 14  Landscaped Gardens    6347 non-null   int64 
 15  Indoor Games          6347 non-null   

In [17]:
df.isnull().sum()

Unnamed: 0              0
price                   0
sqrt                    0
location                0
bhk                     0
New/Resale              0
Gymnasium               0
Lift Available          0
CarParking              0
Maintenance Staff       0
24x7 Security           0
Children's Play Area    0
Clubhouse               0
Intercom                0
Landscaped Gardens      0
Indoor Games            0
Gas Connection          0
Jogging Track           0
Swimming Pool           0
dtype: int64

In [18]:
# Select only numeric columns for correlation calculation
numeric_df = df.select_dtypes(include=[np.number])
corr_matrix = numeric_df.corr()

In [19]:
corr_matrix['price'].sort_values(ascending=False)

price                   1.000000
sqrt                    0.722336
bhk                     0.594865
Swimming Pool           0.123902
Indoor Games            0.122352
Gas Connection          0.118245
Gymnasium               0.098097
Clubhouse               0.084775
Lift Available          0.083656
Landscaped Gardens      0.082225
Intercom                0.063060
CarParking              0.055221
Children's Play Area    0.045126
24x7 Security           0.045107
Jogging Track           0.042325
New/Resale              0.032428
Maintenance Staff       0.027604
Unnamed: 0             -0.074733
Name: price, dtype: float64

In [20]:
df.drop(columns=['Unnamed: 0'], inplace=True)

In [22]:
# After filtering top 20 locations and one-hot encoding
df['price_per_sqft'] = df['price'] / df['sqrt']

features = [
    'sqrt', 'bhk', 'New/Resale', 'Gymnasium', 'Lift Available',
    'CarParking', '24x7 Security', 'Swimming Pool', 'Intercom',
    'price_per_sqft'
] + [col for col in df.columns if col.startswith('location_')]

X = df[features]
y = np.log1p(df['price'])


In [24]:

from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBRegressor(n_estimators=200, learning_rate=0.1, max_depth=6, random_state=42)
model.fit(X_train, y_train)

y_pred_log = model.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_test_actual = np.expm1(y_test)

r2 = r2_score(y_test_actual, y_pred)
rmse = np.sqrt(mean_squared_error(y_test_actual, y_pred))
mae = mean_absolute_error(y_test_actual, y_pred)

print(f"R² Score: {r2:.3f}")
print(f"RMSE: ₹{rmse:,.0f}")
print(f"MAE: ₹{mae:,.0f}")


R² Score: 0.962
RMSE: ₹3,459,840
MAE: ₹468,116


In [25]:
import pickle

pickle.dump(model, open('mumbai_price_model.pkl', 'wb'))
pickle.dump(X.columns.tolist(), open('model_columns.pkl', 'wb'))
