In [27]:
# Step 1: Import necessary libraries
# These are essential packages for data handling, preprocessing, and machine learning.
from sklearn.model_selection import train_test_split             # For splitting the dataset into training and testing sets
from sklearn.ensemble import RandomForestRegressor               # For training the regression model
from sklearn.metrics import mean_absolute_error, r2_score        # For evaluating model performance
import numpy as np                                               # For numerical operations
import pandas as pd                                              # For handling tabular data like spreadsheets

In [28]:
# Step 2: Load the dataset
# Parse the housing CSV into a Pandas DataFrame. The path is relative, so the
# file is looked up from the directory the notebook is run in.
df = pd.read_csv(filepath_or_buffer='House Price India.csv')

In [29]:
# Step 3: Preview the data
# Display the first five rows to check the column names and spot obvious issues.
df.head(n=5)

Unnamed: 0,id,Date,number of bedrooms,number of bathrooms,living area,lot area,number of floors,waterfront present,number of views,condition of the house,...,Built Year,Renovation Year,Postal Code,Lattitude,Longitude,living_area_renov,lot_area_renov,Number of schools nearby,Distance from the airport,Price
0,6762810145,42491,5,2.5,3650,9050,2.0,0,4,5,...,1921,0,122003,52.8645,-114.557,2880,5400,2,58,2380000
1,6762810635,42491,4,2.5,2920,4000,1.5,0,0,5,...,1909,0,122004,52.8878,-114.47,2470,4000,2,51,1400000
2,6762810998,42491,5,2.75,2910,9480,1.5,0,0,3,...,1939,0,122004,52.8852,-114.468,2940,6600,1,53,1200000
3,6762812605,42491,4,2.5,3310,42998,2.0,0,0,3,...,2001,0,122005,52.9532,-114.321,3350,42847,3,76,838000
4,6762812919,42491,3,2.0,2710,4500,1.5,0,0,4,...,1929,0,122006,52.9047,-114.485,2060,4500,1,51,805000


In [30]:
# Step 4: Drop non-informative columns
# 'id' is only a row identifier and carries no predictive signal.
# 'Date' is left out as well: it may not be in a usable format, or relevant,
# without further transformation.
# drop() returns a new frame, so `df` itself stays intact.
df_cleaned = df.drop(['id', 'Date'], axis='columns')

In [31]:
# Step 5: Separate the target variable from the input features
# y: the 'Price' column, i.e. the value the model is trained to predict
# X: every other column of the cleaned frame
y = df_cleaned['Price']
X = df_cleaned.drop('Price', axis='columns')

In [32]:
# Step 6: Check and handle missing values
# Count NaNs per column first (.isnull() flags them, .sum() totals them per
# column) so the effect of the imputation below is visible.
print("Missing values per column BEFORE filling:\n", X.isnull().sum())

# Fill missing values (if any) with the mean of each column.
# numeric_only=True restricts the means to numeric columns: on pandas >= 2.0,
# DataFrame.mean() raises a TypeError when a non-numeric (e.g. string) column
# is present, so without it this cell would crash as soon as such a feature
# were kept in X. Non-numeric columns are simply left untouched by the fill.
# NOTE(review): the means are computed over all rows, i.e. before the
# train/test split done later in this notebook, so test rows influence the
# fill values. If real gaps ever appear, consider computing the fill values
# from the training rows only.
X = X.fillna(X.mean(numeric_only=True))

# Re-check so any values that are still missing after the fill are visible.
print("Missing values per column AFTER filling:\n", X.isnull().sum())

Missing values per column BEFORE filling:
 number of bedrooms                       0
number of bathrooms                      0
living area                              0
lot area                                 0
number of floors                         0
waterfront present                       0
number of views                          0
condition of the house                   0
grade of the house                       0
Area of the house(excluding basement)    0
Area of the basement                     0
Built Year                               0
Renovation Year                          0
Postal Code                              0
Lattitude                                0
Longitude                                0
living_area_renov                        0
lot_area_renov                           0
Number of schools nearby                 0
Distance from the airport                0
dtype: int64
Missing values per column AFTER filling:
 number of bedrooms                       0

In [33]:
# Step 7: Split the dataset
# Hold out 20% of the rows for testing and train on the remaining 80%.
# The fixed random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [34]:
# Step 8: Initialize and train the Random Forest Regressor
# An ensemble of 100 decision trees; random_state pins the randomness used
# while building them so repeated runs give the same model.
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)

# Fit the forest on the training portion only.
rf.fit(X=X_train, y=y_train)

In [35]:
# Step 9: Make predictions on the test data
# Predict a price for every held-out row using the fitted forest.
y_pred = rf.predict(X=X_test)

In [36]:
# Step 10: Evaluate the model
# Compare the predictions with the actual test prices.
# MAE: average absolute difference between predicted and actual prices.
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
# R²: proportion of the variance in price explained by the model.
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# Report both metrics rounded to four decimal places.
print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"R-squared (R²): {r2:.4f}")

Mean Absolute Error (MAE): 69047.2389
R-squared (R²): 0.8700
