In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
import pickle

# Load the dataset
df = pd.read_csv('house_data.csv')

# Print summary statistics for the target column. reset_index() replaces the
# statistic labels with a 0-7 integer index, so the printed rows are, in order:
# count, mean, std, min, 25%, 50%, 75%, max.
print(df["Price"].describe().reset_index()["Price"])

# Input columns used to predict the price
features = [
    'number of bedrooms',
    'number of bathrooms',
    'living area',
    'condition of the house',
    'Number of schools nearby',
]

X = df[features]
y = df['Price']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: statistics are learned from the training rows only
# and then applied to both partitions, so nothing from the test set leaks in.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a 100-tree random forest on the scaled training data (fit() returns the estimator)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train_scaled, y_train)

# Directory that receives the serialized artifacts. It defaults to the original
# backend location; set the HOUSE_MODEL_DIR environment variable to write
# somewhere else without editing this cell.
model_dir = os.environ.get('HOUSE_MODEL_DIR', r'D:\House\backend\model')

# Ensure the 'model' directory exists
os.makedirs(model_dir, exist_ok=True)

# Save the model and the scaler side by side: the model was fitted on scaled
# features, so whoever loads it needs the matching scaler as well.
for filename, artifact in (
    ('house_price_model.pkl', rf_model),
    ('scaler.pkl', scaler),
):
    with open(os.path.join(model_dir, filename), 'wb') as file:
        pickle.dump(artifact, file)

# Report R² on the rows the model was fitted on and on the held-out rows
train_score = rf_model.score(X_train_scaled, y_train)
test_score = rf_model.score(X_test_scaled, y_test)
print(
    f"Training R² Score: {train_score:.4f}",
    f"Testing R² Score: {test_score:.4f}",
    sep="\n",
)


0    1.461900e+04
1    5.388063e+05
2    3.672294e+05
3    7.800000e+04
4    3.200000e+05
5    4.500000e+05
6    6.450000e+05
7    7.700000e+06
Name: Price, dtype: float64
Training R² Score: 0.8944
Testing R² Score: 0.5146
