In [1]:
#import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Read the raw housing CSV from the working directory and preview the first rows.
csv_path = "Bengaluru_House_Data.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


Exploratory Data Analysis (EDA)

In [3]:
# Print each column's dtype and non-null count to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [4]:
# Discard the columns that are not used further on, then preview the result.
unused_columns = ["area_type", "availability", "society", "balcony"]
df = df.drop(columns=unused_columns)
df.head()

Unnamed: 0,location,size,total_sqft,bath,price
0,Electronic City Phase II,2 BHK,1056,2.0,39.07
1,Chikka Tirupathi,4 Bedroom,2600,5.0,120.0
2,Uttarahalli,3 BHK,1440,2.0,62.0
3,Lingadheeranahalli,3 BHK,1521,3.0,95.0
4,Kothanur,2 BHK,1200,2.0,51.0


Handling Missing Values

In [5]:
# Handling missing values in location.
# As noted by the author from the value counts, the most frequent locations are
# Whitefield and Sarjapur  Road; missing entries are filled with the latter.
# NOTE(review): confirm whether the single most frequent value was intended.
location_counts = df["location"].value_counts()
fill_location = "Sarjapur  Road"
df["location"] = df["location"].fillna(fill_location)

In [6]:
# Missing `size` entries are filled with "2 BHK", which the author notes is the
# most frequently occurring value in the column.
default_size = "2 BHK"
df["size"] = df["size"].fillna(value=default_size)

In [7]:
# Impute missing bathroom counts with the column median, then store the column
# as integers (the cast is only possible once no NaN remains).
med = df["bath"].median()
df["bath"] = df["bath"].fillna(med).astype(int)

In [8]:
# Count the remaining missing values per column after the imputation steps.
df.isna().sum()

location      0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [9]:
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import joblib

# Function to convert total_sqft to float
def convert_sqft_to_num(x):
    """Convert a raw ``total_sqft`` entry to a float.

    Parameters
    ----------
    x : str or number
        Either a plain number ("1056"), a range written as "low - high"
        ("2100 - 2850"), or any other text. Already-numeric values are
        accepted as well.

    Returns
    -------
    float or None
        The numeric value, the midpoint of the range, or ``None`` when the
        entry cannot be interpreted as a number (including NaN / ``None``).
    """
    # Non-string cells (numbers, NaN, None) have no .split(); handle them first.
    if not isinstance(x, str):
        try:
            value = float(x)
        except (TypeError, ValueError):
            return None
        # NaN is the only float that is not equal to itself.
        return None if value != value else value

    tokens = x.split('-')
    try:
        if len(tokens) == 2:
            # "low - high": float() tolerates the surrounding whitespace.
            return (float(tokens[0]) + float(tokens[1])) / 2
        return float(x)
    except ValueError:
        # Malformed range or text with units (e.g. "34.46Sq. Meter").
        return None

# Apply conversion: every total_sqft entry becomes a float, or None if unparseable
df['total_sqft'] = df['total_sqft'].apply(convert_sqft_to_num)

# Drop rows whose area could not be parsed. The explicit copy makes the result
# an independent frame, so the column assignments below cannot trigger
# pandas' SettingWithCopyWarning.
df = df.dropna(subset=['total_sqft']).copy()

# Extract BHK: the leading integer of size strings such as "2 BHK" or "4 Bedroom"
df['bhk'] = df['size'].apply(lambda x: int(x.split(' ')[0]))

# Encode location as integer labels
# NOTE(review): the fitted encoder itself is not persisted; the mapping is only
# recoverable from the location / encoded_loc columns of the saved CSV.
le = LabelEncoder()
df['encoded_loc'] = le.fit_transform(df['location'])

# Select relevant columns
df = df[['location', 'bhk', 'bath', 'total_sqft', 'encoded_loc', 'price']]

# Save cleaned df (row index omitted)
df.to_csv('cleaned_df.csv', index=False)

# Prepare data for training: feature matrix and target
X = df[['total_sqft', 'bath', 'bhk', 'encoded_loc']]
y = df['price']

# Split data: 80% train / 20% test, with a fixed seed for reproducibility
# NOTE(review): X_test / y_test are not used in this cell; the model is never
# scored on the held-out data here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Save model
joblib.dump(model, 'model.pkl')

print("Cleaned data and model saved successfully!")

Cleaned data and model saved successfully!
