In [4]:
import pandas as pd

# Load the dataset
df = pd.read_csv("Bengaluru_House_Data.csv")
df.head()


Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [5]:
# Drop unnecessary columns
df = df.drop(['area_type', 'availability', 'society'], axis=1)

# Handle missing values
df = df.dropna()

In [6]:
# Convert 'size' column to 'BHK' (number of bedrooms)
df['BHK'] = df['size'].apply(lambda x: int(x.split(' ')[0]))
df = df.drop(['size'], axis=1)

# Handle 'total_sqft' (convert to numeric, handling ranges)
def convert_sqft_to_num(x):
    tokens = x.split('-')
    if len(tokens) == 2:
        return (float(tokens[0]) + float(tokens[1])) / 2
    try:
        return float(x)
    except:
        return None

df['total_sqft'] = df['total_sqft'].apply(convert_sqft_to_num)
df = df.dropna()


In [7]:
# One-hot encode the 'location' column
dummies = pd.get_dummies(df['location'])
df = pd.concat([df, dummies], axis=1)
df = df.drop(['location'], axis=1)

# Display cleaned data
print(df.head())

   total_sqft  bath  balcony   price  BHK   Anekal   Banaswadi   Basavangudi  \
0      1056.0   2.0      1.0   39.07    2    False       False         False   
1      2600.0   5.0      3.0  120.00    4    False       False         False   
2      1440.0   2.0      3.0   62.00    3    False       False         False   
3      1521.0   3.0      1.0   95.00    3    False       False         False   
4      1200.0   2.0      1.0   51.00    2    False       False         False   

    Bhoganhalli   Devarabeesana Halli  ...  \
0         False                 False  ...   
1         False                 False  ...   
2         False                 False  ...   
3         False                 False  ...   
4         False                 False  ...   

   ravindra nagar, T.dasarahalli peenya  rr nagar  sankeswari  \
0                                 False     False       False   
1                                 False     False       False   
2                                 False     Fal

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Split the data into features and target variable
X = df.drop(['price'], axis=1)
y = df['price']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

In [9]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')
print(f'R-squared: {r2}')


Mean Squared Error: 5.325739108370471e+20
Root Mean Squared Error: 23077562931.060272
R-squared: -2.479095809745092e+16
