In [1]:
import pandas as pd
import numpy as np

---

*-  Import the "RealEstate_Prices.csv" dataset. Clean column names by removing spaces,
special characters, or renaming them for clarity.*

In [4]:
data = pd.read_csv("Bengaluru_House_Data - Bengaluru_House_Data.csv")

In [5]:
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [6]:
data.columns

Index(['area_type', 'availability', 'location', 'size', 'society',
       'total_sqft', 'bath', 'balcony', 'price'],
      dtype='object')

In [7]:
data.dtypes

area_type        object
availability     object
location         object
size             object
society          object
total_sqft       object
bath            float64
balcony         float64
price           float64
dtype: object

*- Handle missing values in the dataset, deciding on an appropriate strategy (e.g.,
imputation or removal).*

In [9]:
data.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [10]:
data['bath'].fillna(data['bath'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['bath'].fillna(data['bath'].median(), inplace=True)


In [11]:
data['balcony'].fillna(data['balcony'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['balcony'].fillna(data['balcony'].mean(), inplace=True)


In [12]:
data.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath               0
balcony            0
price              0
dtype: int64

---

In [14]:
data.dropna(subset=['price'], inplace=True)

---

*- Filter and subset the data based on specific criteria, such as a particular time period,
property type, or location.*

In [17]:
data = data[(data['availability'] == 'Ready To Move') & (data['size'].str.contains('2 BHK', na=False))]

---

*- Handle categorical variables by encoding them appropriately (e.g., one-hot encoding or
label encoding) for further analysis.*

In [20]:
from sklearn.preprocessing import LabelEncoder

In [21]:
label_encoder = LabelEncoder()

In [22]:
data['area_type_encoded'] = label_encoder.fit_transform(data['area_type'])

In [23]:
data.area_type_encoded.value_counts()

area_type_encoded
2    3208
0     853
1      25
Name: count, dtype: int64

---

*- Aggregate the data to calculate summary statistics or derived metrics such as average
sale prices by neighborhood or property type.*

In [26]:
avg_price_by_location = data.groupby('location')['price'].mean()

In [27]:
print("Location \t\t\t Average Price\n", avg_price_by_location)

Location 			 Average Price
 location
1st Block HRBR Layout              67.000000
1st Block Jayanagar                60.000000
1st Block Koramangala              73.500000
1st Phase JP Nagar                 86.583333
1st Stage Indira Nagar             61.500000
                                     ...    
Yeshwanthpur Industrial Suburb    120.000000
kadubisnahalli                     57.000000
kanakapura main road               36.440000
rr nagar                           60.000000
sapthagiri Layout                 115.000000
Name: price, Length: 653, dtype: float64


---

*- Identify and handle outliers or extreme values in the data that may affect the analysis
or modeling process*

In [30]:
Q1 = data['price'].quantile(0.25)
Q3 = data['price'].quantile(0.75)

In [31]:
IQR = Q3 - Q1

In [32]:
data = data[(data['price'] >= (Q1 - 1.5 * IQR)) & (data['price'] <= (Q3 + 1.5 * IQR))]