In [9]:
import pandas as pd

# Read the raw listings, then in a single chain:
#   1. discard rows lacking any of 'location', 'size', 'bath' or 'balcony'
#   2. remove the 'society' column (it has many missing values)
file_path = 'Bengaluru_House_Data.csv'
required_columns = ['location', 'size', 'bath', 'balcony']
df = (
    pd.read_csv(file_path)
    .dropna(subset=required_columns)
    .drop(columns=['society'])
)

# Convert 'total_sqft' to numeric
def convert_sqft_to_num(x):
    """Convert a raw 'total_sqft' cell into a float.

    Accepted inputs:
      * a plain number, either as a string ("1200", "-5", "1e3") or an
        already-numeric value -> returned as ``float``
      * a range written as "low - high" -> the midpoint of the two bounds

    Anything else (text with unit suffixes such as "34.46Sq. Meter",
    malformed ranges, ``None``) yields ``None`` so the caller can drop the
    row afterwards.

    Parameters
    ----------
    x : str or number or None
        The raw cell value.

    Returns
    -------
    float or None
        The parsed value, or ``None`` when the cell cannot be interpreted.
    """
    # Non-string cells have no .split(); convert them directly instead of
    # crashing with AttributeError.
    if not isinstance(x, str):
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    text = x.strip()

    # Try the plain-number form first so that values containing '-' which
    # are valid floats (negative numbers, exponents) are not mistaken for
    # a range.
    try:
        return float(text)
    except ValueError:
        pass

    tokens = text.split('-')
    if len(tokens) == 2:
        try:
            return (float(tokens[0]) + float(tokens[1])) / 2
        except ValueError:
            # Two tokens but at least one is not numeric.
            return None
    return None

# Parse 'total_sqft' into numbers and discard the rows that failed to parse
parsed_sqft = df['total_sqft'].apply(convert_sqft_to_num)
df = df.assign(total_sqft=parsed_sqft).dropna(subset=['total_sqft'])

# The leading token of 'size' is the bedroom count; store it as 'bhk'
# and remove the original 'size' column, which is no longer needed
bedroom_counts = df['size'].apply(lambda label: int(label.partition(' ')[0]))
df = df.assign(bhk=bedroom_counts).drop(columns=['size'])

# Expand 'location' into one-hot indicator columns (first level dropped)
df = pd.get_dummies(df, columns=['location'], drop_first=True)

# Persist the cleaned dataset without the index
df.to_csv('Cleaned_Bengaluru_House_Data.csv', index=False)


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
# No trailing comma here: an unparenthesized import list ending in ',' is a
# SyntaxError.
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import numpy as np

# Load the cleaned dataset
df = pd.read_csv('Cleaned_Bengaluru_House_Data.csv')

# Select features and target variable.
# 'availability' and 'area_type' are excluded from the feature matrix along
# with the target column 'price'.
X = df.drop(columns=['price', 'availability', 'area_type'])
y = df['price']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate the metrics.
# RMSE is derived from MSE directly: the `squared=False` argument of
# mean_squared_error is deprecated and absent from recent scikit-learn
# releases, so relying on it breaks the cell on upgrade.
mse = mean_squared_error(y_test, y_pred)
rmse = float(np.sqrt(mse))
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)


# Print the metrics
print(f"Mean Squared Error: {mse}")
print(f"Root Mean Squared Error: {rmse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared: {r2}")


# Save the model and the feature names (column order of X) used to fit it
joblib.dump(model, 'house_price_model.pkl')
X.columns.to_series().to_csv('feature_names.csv', index=False)


Mean Squared Error: 7993.526510975105
Root Mean Squared Error: 89.4065238725626
Mean Absolute Error: 29.628698268212567
R-squared: 0.627906705995066


