In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [4]:
# Read the house-price dataset from a CSV located in the working directory.
data = pd.read_csv(filepath_or_buffer="indian_house_price_data.csv")

# Preview the first five rows as a quick sanity check of columns and values.
print(data.head(n=5))

# Column dtypes and non-null counts (info() writes straight to stdout).
data.info()

# Summary statistics produced by describe() with its default settings.
print(data.describe())

    Location  Size (sq ft)  Bedrooms  Bathrooms  Age of Property (years)  \
0       Pune          3677         1          1                       18   
1  Hyderabad          2628         2          3                       12   
2    Chennai          2680         4          3                       18   
3       Pune           637         3          3                       16   
4  Bangalore          3972         4          3                        5   

   Nearby Facilities Score (out of 10)  Price (INR)  
0                                    1     21131022  
1                                    6     41191314  
2                                    5     26952620  
3                                    9     12460166  
4                                    2      7379872  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               -------

In [9]:
# Count the missing entries in every column.
missing_per_column = data.isna().sum()
print(missing_per_column)

# Nothing is imputed here (no missing values are expected in this dataset).
# Should gaps ever appear, one option is median imputation of the numeric
# columns, for example:
# data = data.fillna(data.median(numeric_only=True))

Location                               0
Size (sq ft)                           0
Bedrooms                               0
Bathrooms                              0
Age of Property (years)                0
Nearby Facilities Score (out of 10)    0
Price (INR)                            0
dtype: int64


In [13]:
# One-hot encode the categorical 'Location' column. drop_first=True omits the
# first category, so a row whose Location_* columns are all zero belongs to
# that omitted (baseline) category.
#
# The result is assigned back to `data`, so the raw 'Location' column no longer
# exists after the first run. The guard keeps the cell safe to re-run instead
# of failing with a KeyError on the second execution.
if "Location" in data.columns:
    data = pd.get_dummies(data, columns=['Location'], drop_first=True)
elif not any(str(col).startswith("Location_") for col in data.columns):
    # Neither the raw column nor its encoded form is present: this is a real
    # data problem, so fail loudly rather than skipping the step.
    raise KeyError("'Location' column not found and no encoded 'Location_*' columns are present")

In [15]:
# Separate the target column (y) from the predictor columns (X).
target_column = "Price (INR)"
y = data[target_column]
X = data.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [17]:
# Learn the scaling parameters from the training features only, then apply the
# same fitted transform to both splits. The target is left unscaled.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [19]:
# Random forest regressor with default hyperparameters and a fixed seed.
rf_model = RandomForestRegressor(random_state=42)

# Fit on the scaled training features against the unscaled prices.
rf_model.fit(X=X_train_scaled, y=y_train)

In [21]:
# Predictions for the held-out rows.
y_pred = rf_model.predict(X_test_scaled)

# Error metrics on the test split. RMSE is in the same unit as the target,
# and an R^2 at or below zero means the model is no better than always
# predicting the mean of y_test.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

for label, value in (
    ("Mean Squared Error:", mse),
    ("Root Mean Squared Error:", rmse),
    ("R^2 Score:", r2),
):
    print(label, value)

Mean Squared Error: 117550354871520.19
Root Mean Squared Error: 10842064.14256622
R^2 Score: -0.09693684070391906


In [23]:
# Side-by-side table of actual and predicted prices. The frame keeps the
# index of y_test; the predictions are attached positionally.
comparison = y_test.to_frame(name="Actual Price (INR)").assign(
    **{"Predicted Price (INR)": y_pred}
)
print(comparison.head(10))

     Actual Price (INR)  Predicted Price (INR)
521            22565760            15828139.38
737            24598192            24082381.90
740            17549904            17533869.76
660            35750454            18409826.15
411             6693285            24034419.58
678            22688868            20858687.60
626            11141064            23548530.29
513            15449421            19203268.92
859             4906230            17149072.64
136            18508976            17073176.75


In [27]:
# Example: predict the price of a single new property.
#
# 'Location' was one-hot encoded with drop_first=True, so the baseline city has
# no Location_* column of its own and is represented by all Location_* columns
# being 0 (presumably Bangalore, the first city alphabetically -- confirm
# against X_train.columns). To describe another city, set exactly one
# Location_* entry to 1.
new_data = pd.DataFrame({
    "Size (sq ft)": [1500],
    "Bedrooms": [3],
    "Bathrooms": [2],
    "Age of Property (years)": [5],
    "Nearby Facilities Score (out of 10)": [8],
    "Location_Chennai": [0],
    "Location_Delhi": [0],
    "Location_Hyderabad": [0],
    "Location_Kolkata": [0],
    "Location_Mumbai": [0],
    "Location_Pune": [0]
})

# Reject columns the model never saw (e.g. a typo or the dropped baseline
# dummy) instead of letting them be discarded silently by the reindex below.
unknown_columns = new_data.columns.difference(X_train.columns)
if len(unknown_columns) > 0:
    raise ValueError(
        f"new_data has columns that were not used for training: {list(unknown_columns)}"
    )

# Align to the exact training column set and order. Any Location_* dummy that
# was not listed above is filled with 0 (i.e. "not this city").
new_data = new_data.reindex(columns=X_train.columns, fill_value=0)

# Scale the new data with the scaler fitted on the training features
new_data_scaled = scaler.transform(new_data)

# Predict the price
predicted_price = rf_model.predict(new_data_scaled)
print("Predicted Price (INR):", predicted_price[0])

Predicted Price (INR): 22275619.42


In [29]:
# Save the actual-vs-predicted table; index=False leaves the row labels out of the file.
comparison.to_csv(path_or_buf="actual_vs_predicted_prices.csv", index=False)