In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score, mean_absolute_error, root_mean_squared_error
import joblib

In [3]:
# Read the real-estate dataset from a CSV located relative to the notebook's
# working directory, then print the first five rows as a quick sanity check.
df = pd.read_csv(filepath_or_buffer='ahmedabad_real_estate_dataset_updated.csv')
print(df.head(5))

      Location  BHK  Total_Area_Sqft           Condition       Furnishing  \
0   SG Highway    1             1459                 New  Fully-Furnished   
1    Naranpura    3             1906              Resale      Unfurnished   
2    Satellite    1              751              Resale  Fully-Furnished   
3  South Bopal    1             1034  Under Construction   Semi-Furnished   
4    Naranpura    3             2225                 New      Unfurnished   

  Floor_Level  Nearby_Amenities_Score  Price_Per_Sqft  Estimated_Price  \
0      Ground                       4            7260         10592340   
1         2nd                       8            4608          8783705   
2         2nd                       5            7298          5481008   
3         2nd                      10            4097          4235858   
4         3rd                       9            5859         13036275   

   Property_Age  Total_Floors  Has_Parking Balcony_View  Year_Of_Sale  
0            12     

In [4]:
# Columns that are integer-encoded before modelling.
categorical_cols = [
    'Location',
    'Condition',
    'Furnishing',
    'Floor_Level',
    'Balcony_View',
]

# One fitted LabelEncoder per column, keyed by column name, so the same
# label -> integer mapping can be reused later (the dict is persisted further down).
# NOTE: each column of `df` is overwritten in place with its integer codes, so
# re-running this cell without reloading `df` refits every encoder on the
# integer codes instead of the original labels.
encoders = {}
for col in categorical_cols:
    le = encoders[col] = LabelEncoder()
    df[col] = le.fit_transform(df[col])

In [5]:
# Target: the estimated price of each listing.
y = df['Estimated_Price']

# Features: every remaining column except the target and Price_Per_Sqft.
# NOTE(review): Price_Per_Sqft is presumably excluded because, combined with
# Total_Area_Sqft, it almost reproduces the target (see the preview rows) - confirm.
X = df.drop(['Estimated_Price', 'Price_Per_Sqft'], axis=1)

In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
# Random forest with 100 trees; the seed fixes the randomness used while
# building the forest so repeated runs give the same model.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
# Fit on the training split only; the fitted estimator is the cell's output.
model.fit(X=X_train, y=y_train)

In [8]:
# Score the fitted model on the held-out split.
y_pred = model.predict(X_test)

test_r2 = r2_score(y_test, y_pred)
test_mae = mean_absolute_error(y_test, y_pred)      # same units as Estimated_Price
test_rmse = root_mean_squared_error(y_test, y_pred)  # same units as Estimated_Price

print("R² Score:", test_r2)
print("MAE:", test_mae)
print("RMSE:", test_rmse)

R² Score: 0.8396283558874298
MAE: 1087753.0976666666
RMSE: 1374399.738023789


In [9]:
# Persist the fitted model and the per-column label encoders side by side:
# new data has to be encoded with these same encoders before it is passed
# to the saved model.
joblib.dump(value=model, filename='house_price_model_fixed.pkl')
joblib.dump(value=encoders, filename='label_encoders_fixed.pkl')

['label_encoders_fixed.pkl']

In [10]:
# What-if check: take the first held-out listing and vary only Property_Age
# to see how the predicted price responds, all other features held fixed.
#
# The baseline row is selected once, outside the loop, with `iloc[[0]]`
# (list indexer) so it stays a one-row DataFrame with the training column
# names and each column's own dtype. Pulling it out as a Series and
# re-wrapping it would first collapse all columns to one common dtype.
base_row = X_test.iloc[[0]]

for age in [0, 5, 10, 20, 30]:
    # `assign` returns a fresh frame with Property_Age overridden in place
    # (column order unchanged), leaving `base_row` and `X_test` untouched.
    sample_df = base_row.assign(Property_Age=age)
    # int() truncates the prediction to whole rupees before formatting.
    print(f"🏗️ Property Age {age} → ₹{int(model.predict(sample_df)[0]):,}")

🏗️ Property Age 0 → ₹8,375,966
🏗️ Property Age 5 → ₹7,765,907
🏗️ Property Age 10 → ₹7,663,446
🏗️ Property Age 20 → ₹6,270,408
🏗️ Property Age 30 → ₹6,086,572
