House Price Prediction

In [1]:
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error

# Load data
# The CSV location can be overridden with the HOUSE_DATA_CSV environment
# variable so the notebook runs on machines other than the author's; when the
# variable is unset the original absolute path is used unchanged.
DATA_PATH = os.environ.get(
    "HOUSE_DATA_CSV",
    "C:\\Users\\HP\\OneDrive\\Desktop\\House_price_prediction\\dataset\\Bengaluru_House_Data.csv",
)
df = pd.read_csv(DATA_PATH)

df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


Data Preprocessing

In [3]:
def extract_bhk(x):
    """Extract the leading integer room count from a ``size`` value.

    Parameters
    ----------
    x : str, int, float or missing
        A label such as ``"2 BHK"`` or ``"4 Bedroom"``, a plain number, or a
        missing value (anything ``pd.isna`` reports as missing).

    Returns
    -------
    int or None
        For a string, the integer parsed from its first whitespace-separated
        token; for an ``int``/``float``, ``int(x)``. ``None`` when the value
        is missing, is of any other type, or cannot be parsed.
    """
    if pd.isna(x):  # Handle NaN values
        return None
    try:
        if isinstance(x, str):  # For string data type ("2 BHK")
            # split() without an argument ignores leading and repeated
            # whitespace, so " 2 BHK" parses the same as "2 BHK".
            return int(x.split()[0])
        if isinstance(x, (int, float)):  # For numerical data type
            return int(x)
        return None
    except (ValueError, IndexError, OverflowError):
        # ValueError: first token is not an integer ("Studio").
        # IndexError: empty or blank string has no tokens.
        # OverflowError: int() of an infinite float.
        # Anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None
    
def convert_sqft(x):
    """Convert a ``total_sqft`` entry to a single float.

    Parameters
    ----------
    x : any
        The raw value; it is converted with ``str(x)`` and stripped of
        surrounding whitespace before parsing.

    Returns
    -------
    float or None
        * ``"1000-1800"`` style ranges return the average of the two bounds.
        * Anything else is passed to ``float``.
        * ``None`` when parsing fails (e.g. text with units, or a range with
          more or fewer than two parts).

    Notes
    -----
    Any text containing ``'-'`` is treated as a range, so a negative number
    such as ``"-5"`` yields ``None`` rather than ``-5.0``.
    """
    try:
        text = str(x).strip()
        if '-' in text:  # For intervals ("1000-1800")
            lower, upper = map(float, text.split('-'))
            return (lower + upper) / 2  # average
        return float(text)
    except (ValueError, TypeError):
        # Only parse failures are treated as "no value"; unrelated exceptions
        # such as KeyboardInterrupt are no longer swallowed.
        return None


In [4]:
# Derive the integer room count from the free-text `size` column and drop the
# original column. The guard makes the cell safe to re-run: once `size` has
# been removed, a second execution no longer raises KeyError.
if 'size' in df.columns:
    df = df.assign(bhk=df['size'].apply(extract_bhk)).drop(columns='size')

# Convert total_sqft entries to floats (ranges become their average;
# values that cannot be parsed become missing). Re-applying this is harmless.
df['total_sqft'] = df['total_sqft'].apply(convert_sqft)

Feature Engineering

In [None]:
# Thresholds used in this cell, named so they can be tuned in one place.
MIN_SQFT, MAX_SQFT = 300, 3000   # keep listings strictly inside this area range
PRICE_QUANTILE_CAP = 0.95        # keep listings strictly below this price quantile
MIN_LOCATION_COUNT = 10          # locations with <= this many rows become 'other'

# Remove outliers
# Rows whose total_sqft is missing fail both comparisons and are dropped here too.
df = df[(df['total_sqft'] > MIN_SQFT) & (df['total_sqft'] < MAX_SQFT)]
# The quantile is computed on the frame that remains after the area filter.
df = df[df['price'] < df['price'].quantile(PRICE_QUANTILE_CAP)]

# Handle location
# .copy() gives an independent frame, so the column assignments below do not
# raise SettingWithCopyWarning on a filtered slice.
df = df[df['location'].notnull()].copy()
df['location'] = df['location'].str.strip()
loc_counts = df['location'].value_counts()
# Vectorised equivalent of looking up each row's location count one by one.
is_rare = df['location'].map(loc_counts) <= MIN_LOCATION_COUNT
df['location'] = df['location'].where(~is_rare, 'other')

# Feature engineering
# NOTE(review): price_per_sqft is computed from `price`, which is also the
# prediction target, so using it as a feature leaks the target into the model
# (price == price_per_sqft * total_sqft / 1e5) and inflates the reported
# scores. It is kept here because the later imputation cell refers to this
# column; consider removing it from `features` and from that cell together.
# The 1e5 factor presumably converts a price quoted in lakhs - TODO confirm.
df['price_per_sqft'] = (df['price'] * 1e5) / df['total_sqft']

# Select features and target
features = ['total_sqft', 'bath', 'bhk', 'price_per_sqft', 'location']
# Any row with a missing value in a selected column is discarded.
df = df[features + ['price']].dropna()

# One-hot encoding
# drop_first=True omits the first location category from the dummy columns.
df = pd.get_dummies(df, columns=['location'], drop_first=True)

Multivariate Linear Regression

In [None]:
# Split data
X = df.drop(columns='price')
y = df['price']
# 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle missing values
# Medians are taken from the training split only and reused for the test
# split, so no test-set statistics influence the imputation.
# fillna with a Series fills each column named in its index and leaves other
# columns untouched. Reassigning the result replaces the chained
# `X_train[col].fillna(..., inplace=True)` form, which is deprecated and has
# no effect under pandas Copy-on-Write.
numeric_cols = ['total_sqft', 'bath', 'bhk', 'price_per_sqft']
train_medians = X_train[numeric_cols].median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

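Independent variables (X): "total_sqft", "bath", "bhk", "price_per_sqft", plus the one-hot encoded "location" columns. Note that "price_per_sqft" is computed from "price" (the target), so it leaks the target into the features and the scores below are optimistic.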

Dependent variable (y): "price"

In [116]:
# Fit a linear regression on the training split and predict the held-out rows.
model = LinearRegression()
model.fit(X_train, y_train)
preds = model.predict(X_test)

# Score the predictions against the held-out targets.
r2 = r2_score(y_test, preds)
mae = mean_absolute_error(y_test, preds)
print("R2 Score:", r2)
print("MAE:", mae)

R2 Score: 0.9367269265798911
MAE: 6.428069917511212
