Import required libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib  # For saving the model

Load the dataset

In [None]:
# Read the raw house-price CSV into a DataFrame.
file_path = "/content/House Price India.csv"  # absolute path inside the runtime
df = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Preview the first rows to sanity-check that the file loaded as expected.
df.head()

Unnamed: 0,id,Date,number of bedrooms,number of bathrooms,living area,lot area,number of floors,waterfront present,number of views,condition of the house,grade of the house,Area of the house(excluding basement),Area of the basement,Built Year,Renovation Year,Postal Code,Lattitude,Longitude,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport,Price
0,6762810145,42491,5,2.5,3650,9050,2.0,0,4,5,10,3370,280,1921,0,122003,52.8645,-114.557,2880,5400,2,58,2380000
1,6762810635,42491,4,2.5,2920,4000,1.5,0,0,5,8,1910,1010,1909,0,122004,52.8878,-114.47,2470,4000,2,51,1400000
2,6762810998,42491,5,2.75,2910,9480,1.5,0,0,3,8,2910,0,1939,0,122004,52.8852,-114.468,2940,6600,1,53,1200000
3,6762812605,42491,4,2.5,3310,42998,2.0,0,0,3,9,3310,0,2001,0,122005,52.9532,-114.321,3350,42847,3,76,838000
4,6762812919,42491,3,2.0,2710,4500,1.5,0,0,4,8,1880,830,1929,0,122006,52.9047,-114.485,2060,4500,1,51,805000


Data Preprocessing

In [None]:
# Impute missing numeric values with each column's median.
# numeric_only=True keeps this working when the frame contains text columns:
# DataFrame.median() raises on non-numeric data in pandas >= 2.0.
# Rebinding (instead of inplace=True) keeps the cell free of hidden mutation.
df = df.fillna(df.median(numeric_only=True))

In [None]:
# Inspect each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14620 entries, 0 to 14619
Data columns (total 23 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   id                                     14620 non-null  int64  
 1   Date                                   14620 non-null  int64  
 2   number of bedrooms                     14620 non-null  int64  
 3   number of bathrooms                    14620 non-null  float64
 4   living area                            14620 non-null  int64  
 5   lot area                               14620 non-null  int64  
 6   number of floors                       14620 non-null  float64
 7   waterfront present                     14620 non-null  int64  
 8   number of views                        14620 non-null  int64  
 9   condition of the house                 14620 non-null  int64  
 10  grade of the house                     14620 non-null  int64  
 11  Ar

In [None]:
# Define features and target.
# `id` is excluded from the features: it labels a record rather than describing
# the house, and in the df.head() preview rows sharing a Date have ascending ids
# and descending prices, so a tree model could exploit that ordering instead of
# learning from real attributes.
TARGET_COL = "Price"
ID_COLS = ["id"]

X = df.drop(columns=[TARGET_COL, *ID_COLS])  # Features
y = df[TARGET_COL]  # Target variable

In [None]:
# Group feature names by dtype so each group can get its own preprocessing branch.
numeric_dtypes = ["int64", "float64"]
num_cols = list(X.select_dtypes(include=numeric_dtypes).columns)
cat_cols = list(X.select_dtypes(include=["object"]).columns)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,id,Date,number of bedrooms,number of bathrooms,living area,lot area,number of floors,waterfront present,number of views,condition of the house,grade of the house,Area of the house(excluding basement),Area of the basement,Built Year,Renovation Year,Postal Code,Lattitude,Longitude,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport,Price
count,14620.0,14620.0,14620.0,14620.0,14620.0,14620.0,14620.0,14620.0,14620.0,14620.0,14620.0,14620.0,14620.0,14620.0,14620.0,14620.0,14620.0,14620.0,14620.0,14620.0,14620.0,14620.0,14620.0
mean,6762821000.0,42604.538646,3.379343,2.129583,2098.262996,15093.28,1.50236,0.007661,0.233105,3.430506,7.682421,1801.783926,296.47907,1970.926402,90.924008,122033.062244,52.792848,-114.404007,1996.702257,12753.500068,2.012244,64.950958,538932.2
std,6237.575,67.347991,0.938719,0.769934,928.275721,37919.62,0.540239,0.087193,0.766259,0.664151,1.175033,833.809963,448.551409,29.493625,416.216661,19.082418,0.137522,0.141326,691.093366,26058.414467,0.817284,8.936008,367532.4
min,6762810000.0,42491.0,1.0,0.5,370.0,520.0,1.0,0.0,0.0,1.0,4.0,370.0,0.0,1900.0,0.0,122003.0,52.3859,-114.709,460.0,651.0,1.0,50.0,78000.0
25%,6762815000.0,42546.0,3.0,1.75,1440.0,5010.75,1.0,0.0,0.0,3.0,7.0,1200.0,0.0,1951.0,0.0,122017.0,52.7076,-114.519,1490.0,5097.75,1.0,57.0,320000.0
50%,6762821000.0,42600.0,3.0,2.25,1930.0,7620.0,1.5,0.0,0.0,3.0,7.0,1580.0,0.0,1975.0,0.0,122032.0,52.8064,-114.421,1850.0,7620.0,2.0,65.0,450000.0
75%,6762826000.0,42662.0,4.0,2.5,2570.0,10800.0,2.0,0.0,0.0,4.0,8.0,2240.0,580.0,1997.0,0.0,122048.0,52.9089,-114.315,2380.0,10125.0,3.0,73.0,645000.0
max,6762832000.0,42734.0,33.0,8.0,13540.0,1074218.0,3.5,1.0,4.0,5.0,13.0,9410.0,4820.0,2015.0,2015.0,122072.0,53.0076,-113.505,6110.0,560617.0,3.0,80.0,7700000.0


Data Preprocessing and Pipeline

In [None]:
# Numeric branch: fill gaps with the column median, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

# Categorical branch: one-hot encode; categories not seen during fit are ignored
# at transform time instead of raising.
categorical_steps = [("encoder", OneHotEncoder(handle_unknown="ignore"))]
cat_pipeline = Pipeline(steps=categorical_steps)

# Route each column group through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols),
    ]
)


Split the Data into Training and Testing

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
# train_test_split is already imported in the notebook's import cell, so it is
# not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [None]:
# Learn the preprocessing parameters from the training split only, then apply
# the same fitted transformation to both splits.
# Note: X_train / X_test are rebound to the transformed output here, so re-run
# the split cell before re-running this one.
preprocessor.fit(X_train)
X_train = preprocessor.transform(X_train)
X_test = preprocessor.transform(X_test)

Train the Random Forest Model

In [None]:
# Base random forest used as the estimator for the hyper-parameter search;
# the fixed seed keeps tree construction reproducible.
rf_params = {"n_estimators": 100, "random_state": 42}
rf_model = RandomForestRegressor(**rf_params)

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Search space for the random forest.
# `param_grid` was referenced below but never defined in the notebook, so this
# cell raised NameError on a fresh kernel.
# NOTE(review): the original search space was not preserved; these are generic
# starting ranges and should be confirmed or tuned.
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [None, 10, 20, 30],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

# Sample a handful of combinations instead of exhaustively searching the grid.
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_grid,
    n_iter=5,  # Test only 5 random combinations
    cv=3,  # 3-fold cross-validation per candidate
    n_jobs=-1,  # use all available cores
    scoring='r2',
    verbose=2,
    random_state=42  # makes the sampled combinations reproducible
)

random_search.fit(X_train, y_train)

# Best candidate by cross-validated R², refit on the full training split
# (RandomizedSearchCV's default refit=True).
best_rf_model = random_search.best_estimator_


Fitting 3 folds for each of 5 candidates, totalling 15 fits


In [None]:
# Gradient-boosted trees as an alternative to the random forest.
xgb_params = dict(n_estimators=200, learning_rate=0.05, max_depth=6, random_state=42)
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)


Make Prediction

In [None]:
# Predict prices for the held-out split with the fitted XGBoost model.
y_pred_xgb = xgb_model.predict(X_test)
preview = y_pred_xgb[:5]
print(preview)  # Show first 5 predictions


[235265.1  552436.8  613826.25 552906.1  646821.25]


Evaluate the Model

In [None]:
# Score the XGBoost predictions on the held-out split.
# The metric functions and numpy are already imported in the notebook's import
# cell, so they are not re-imported here.
mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))  # square root of MSE -> same units as the target
r2_xgb = r2_score(y_test, y_pred_xgb)

print(f"XGBoost Mean Absolute Error : {mae_xgb}")
print(f"XGBoost Root Mean Squared Error : {rmse_xgb}")
print(f"XGBoost R-Squared : {r2_xgb}")


XGBoost Mean Absolute Error : 6287.50634765625
XGBoost Root Mean Squared Error : 76735.13260560641
XGBoost R-Squared : 0.9582151174545288


In [None]:
import os

# Root folder of the project layout.
project_dir = "House-Price-Prediction-ML"

# Build <project_dir>/data and create it; an already-existing folder is not an error.
data_dir = os.path.join(project_dir, "data")
os.makedirs(data_dir, exist_ok=True)

print("✅ 'data' folder created successfully inside House-Price-Prediction-ML!")


✅ 'data' folder created successfully inside House-Price-Prediction-ML!


In [None]:
import shutil

# Copy (rather than move) the dataset into the project's 'data' folder.
# Moving it would delete the file that the load cell reads from, so the
# notebook would fail at pd.read_csv on the next Restart & Run All.
# copy2 also preserves the file's metadata (e.g. modification time).
shutil.copy2("/content/House Price India.csv", "House-Price-Prediction-ML/data/House_Price_India.csv")

print("✅ Dataset copied to 'data' folder successfully!")


✅ Dataset moved to 'data' folder successfully!
