In [10]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [3]:
# Load dataset
file_path = 'house_prices_custom.csv'  # Adjust the path if needed
data = pd.read_csv(file_path)

# Display initial rows
print("Dataset Snapshot:")
print(data.head())


Dataset Snapshot:
   Size(SqFt)  Rooms     Price
0       956.0    5.0  471112.0
1       602.0    5.0  430879.0
2      1626.0    3.0  130866.0
3      1503.0    5.0  220031.0
4      1414.0    4.0  135337.0


In [4]:
# Handle missing values
data['Size(SqFt)'] = data['Size(SqFt)'].fillna(data['Size(SqFt)'].mean())
data['Rooms'] = data['Rooms'].fillna(data['Rooms'].median())
data = data.dropna(subset=['Price'])  # Drop rows with missing target values

print("\nDataset After Handling Missing Values:")
print(data.head())



Dataset After Handling Missing Values:
   Size(SqFt)  Rooms     Price
0       956.0    5.0  471112.0
1       602.0    5.0  430879.0
2      1626.0    3.0  130866.0
3      1503.0    5.0  220031.0
4      1414.0    4.0  135337.0


In [11]:
print(data.to_string())

     Size(SqFt)  Rooms     Price
0    956.000000    5.0  471112.0
1    602.000000    5.0  430879.0
2   1626.000000    3.0  130866.0
3   1503.000000    5.0  220031.0
4   1414.000000    4.0  135337.0
5   1071.000000    5.0  116469.0
6    919.000000    4.0  273239.0
7   1637.452632    3.0  137149.0
9   2918.000000    2.0  224782.0
10  2228.000000    3.0  246002.0
11   630.000000    4.0  450737.0
12   622.000000    1.0  354496.0
13   883.000000    1.0  212321.0
14  1395.000000    1.0  382713.0
16  2569.000000    2.0  479245.0
17  2965.000000    4.0  399391.0
18   608.000000    5.0  402100.0
19  2798.000000    1.0  347815.0
20  1637.452632    4.0  227400.0
21  2732.000000    4.0  347974.0
22  2218.000000    5.0  313416.0
23  1402.000000    3.0  199829.0
24  2339.000000    5.0  149453.0
25  2913.000000    3.0  150819.0
26  1639.000000    5.0  445496.0
27   526.000000    1.0  325994.0
28  1153.000000    1.0  285753.0
29  2231.000000    5.0  322077.0
31  1638.000000    3.0  344855.0
32  1136.0

In [12]:
# Features (X) and Target (y)
X = data[['Size(SqFt)', 'Rooms']]  # Features
y = data['Price']  # Target

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("\nTraining Features Snapshot:")
print(X_train.head())



Training Features Snapshot:
    Size(SqFt)  Rooms
70       792.0    2.0
17      2965.0    4.0
82       729.0    4.0
83      1438.0    3.0
33      1381.0    3.0


In [13]:
# Initialize Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Fit the model to the training data
model.fit(X_train, y_train)

print("\nRandom Forest Model Trained Successfully!")



Random Forest Model Trained Successfully!


In [14]:
# Predict on test data
predictions = model.predict(X_test)

print("\nPredicted Prices:")
print(predictions)

print("\nActual Prices:")
print(y_test.values)



Predicted Prices:
[277593.77       404434.39       296892.54       287623.18
 248625.9        402130.26       282018.76       327312.21
 448903.91       141733.78       421873.13       328232.16
 416899.41       358766.         349440.77       299801.91
 244603.952      144193.375      198417.80134921]

Actual Prices:
[186317. 149453. 239042. 338555. 471112. 285753. 230367. 199724. 450737.
 173495. 214137. 441911. 355155. 196200. 212321. 135337. 227400. 344855.
 157288.]


In [15]:
# Calculate Mean Squared Error
mse = mean_squared_error(y_test, predictions)
print(f"\nMean Squared Error (MSE): {mse:.2f}")



Mean Squared Error (MSE): 17721617508.45


In [17]:
# Test with a new house example
new_data = [[10000, 3]]  # Example: 1500 SqFt, 3 Rooms
new_prediction = model.predict(new_data)
print(f"\nPredicted Price for a 1500 SqFt house with 3 rooms: ₹{new_prediction[0]:,.2f}")



Predicted Price for a 1500 SqFt house with 3 rooms: ₹172,756.62


