# 🏡 Bangalore House Price Prediction
End-to-End Machine Learning Project with Multiple Models & Graphical Representation

In [None]:

# 📦 Importing Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor, plot_importance
from catboost import CatBoostRegressor

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings("ignore")


In [None]:

# 📂 Load Dataset
# Read the housing CSV (path is relative to the working directory) into `df`
# and preview the first five rows as the cell output.
df = pd.read_csv(filepath_or_buffer="Bengaluru_House_Data.csv")
df.head(n=5)


In [None]:

# 🔎 Dataset Overview
# Prints the frame's shape, per-column dtype/non-null summary and missing-value
# counts, then displays summary statistics as the cell output.
print("Shape of dataset:", df.shape)
print("\nColumn Info:")
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called on its own; wrapping it in print() would append a stray "None" line.
df.info()
print("\nMissing Values:")
print(df.isnull().sum())
df.describe()


In [None]:

# 📊 Exploratory Data Analysis (EDA)

# Price distribution: histogram with a KDE overlay of the target column.
plt.figure(figsize=(8,5))
sns.histplot(df['price'], kde=True, bins=50)
plt.title("Distribution of House Prices")
plt.show()

# Correlation heatmap (numeric features only)
# The numeric columns are selected explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns by default and raises
# when the frame still contains text columns.
plt.figure(figsize=(10,6))
sns.heatmap(df.select_dtypes(include="number").corr(), annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()


In [None]:

# 🧹 Data Preprocessing: drop incomplete rows, encode `location`, build the
# feature matrix / target and create the train/test split.
df = df.dropna()
if df.empty:
    # dropna() removes a row when ANY column is missing; fail with a clear
    # message here rather than with an opaque error from train_test_split.
    raise ValueError("No rows left after dropna(); handle missing values per column instead.")

# One-hot encode `location` when the column is present; drop_first=True omits
# the column for the first category.
if 'location' in df.columns:
    df = pd.get_dummies(df, columns=['location'], drop_first=True)

X = df.drop('price', axis=1)
y = df['price']

# Only `location` is encoded above. Any other non-numeric column left in X
# would make the estimators fitted later raise on string values, so keep the
# numeric/bool features and report what is left out so the omission is visible.
# NOTE(review): these columns probably deserve proper feature engineering
# rather than being dropped - confirm against the dataset schema.
non_numeric_cols = X.select_dtypes(exclude=['number', 'bool']).columns.tolist()
if non_numeric_cols:
    print("Dropping non-numeric feature columns:", non_numeric_cols)
    X = X.drop(columns=non_numeric_cols)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:

# 🤖 Train Multiple Models
# Each model is fitted on the training split and scored on the test split.
# The fit / predict / score sequence lives in one helper instead of being
# repeated for every model.

def _fit_and_score(model, X_fit, y_fit, X_eval, y_eval):
    """Fit `model`, predict on the evaluation split and score the predictions.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict(X)``
    X_fit, y_fit : features and target used for fitting
    X_eval, y_eval : features and target used for scoring

    Returns
    -------
    tuple
        ``(predictions, [r2, mae, rmse])`` where the metric list is ordered as
        R2 score, mean absolute error, root mean squared error.
    """
    model.fit(X_fit, y_fit)
    predictions = model.predict(X_eval)
    metrics = [
        r2_score(y_eval, predictions),
        mean_absolute_error(y_eval, predictions),
        # RMSE is the square root of the MSE, i.e. in the same units as the target.
        np.sqrt(mean_squared_error(y_eval, predictions)),
    ]
    return predictions, metrics


# Maps model name -> [R2, MAE, RMSE]
results = {}

# Linear Regression
lr = LinearRegression()
y_pred_lr, results['LinearRegression'] = _fit_and_score(lr, X_train, y_train, X_test, y_test)

# Random Forest
rf = RandomForestRegressor(random_state=42)
y_pred_rf, results['RandomForest'] = _fit_and_score(rf, X_train, y_train, X_test, y_test)

# XGBoost with basic tuning
xgb = XGBRegressor(random_state=42, n_estimators=300, max_depth=6, learning_rate=0.1)
y_pred_xgb, results['XGBoost'] = _fit_and_score(xgb, X_train, y_train, X_test, y_test)

# CatBoost
cat = CatBoostRegressor(verbose=0, random_state=42, iterations=500, learning_rate=0.1, depth=6)
y_pred_cat, results['CatBoost'] = _fit_and_score(cat, X_train, y_train, X_test, y_test)

# Convert results to DataFrame: one row per model, one column per metric.
results_df = pd.DataFrame(results, index=['R2 Score','MAE','RMSE']).T
results_df


In [None]:

# 📈 Model Performance Comparison
# R2 is at most 1 while MAE and RMSE are in the units of `price`, so every
# metric is drawn in its own panel with its own y-axis; on one shared axis the
# R2 bars may be dwarfed by the error bars and become unreadable.
axes = results_df.plot(
    kind='bar',
    subplots=True,                        # one panel per metric column
    layout=(1, len(results_df.columns)),
    sharey=False,                         # independent y-scale per metric
    legend=False,                         # panel titles already name the metric
    figsize=(15, 5),
    rot=45,
)
for ax, metric in zip(axes.ravel(), results_df.columns):
    ax.set_ylabel(metric)
axes.ravel()[0].figure.suptitle("Model Performance Comparison")
plt.tight_layout()
plt.show()


In [None]:

# 🌟 Feature Importance (XGBoost)
# plot_importance() opens a figure of its own unless an Axes is passed, which
# would leave a separately created figure blank and ignore its figsize. Drawing
# on an explicit Axes gives a single figure with the requested size.
fig, ax = plt.subplots(figsize=(10, 6))
plot_importance(xgb, max_num_features=15, ax=ax)
plt.show()


In [None]:

# 📉 Error Analysis (Actual vs Predicted)
# Scatter of the XGBoost predictions against the true test-set prices.
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 6))
scatter_ax.scatter(y_test, y_pred_xgb, alpha=0.5)
scatter_ax.set_xlabel("Actual Prices")
scatter_ax.set_ylabel("Predicted Prices")
scatter_ax.set_title("Actual vs Predicted (XGBoost)")
plt.show()

# Error distribution: residuals are actual minus predicted.
errors = y_test - y_pred_xgb
hist_fig, hist_ax = plt.subplots(figsize=(8, 6))
sns.histplot(errors, bins=50, kde=True, ax=hist_ax)
hist_ax.set_title("Error Distribution (XGBoost)")
plt.show()
