# Import Necessary Libraries

In [679]:
import pandas as pd
import numpy as np 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Loading Data

In [680]:
df = pd.read_csv(r'../data/v1_EDA.csv')
df.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [681]:
df.shape

(13320, 9)

### Let's drop some unnecessary columns

In [682]:
# Let's drop some unnecessary columns which are not very important for my model prediction
# df = df.drop(['area_type', 'availability', 'society', 'balcony'], axis='columns')
# df.head()

### Let's Check The duplicate values

In [683]:
df.duplicated().sum()

np.int64(529)

In [684]:
# Let's drop the duplicate rows
df = df.drop_duplicates()
df.duplicated().sum()

np.int64(0)

### Let's Check The Missing values

In [685]:
df.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5328
total_sqft         0
bath              73
balcony          605
price              0
dtype: int64

In [686]:
# Drop rows where 'size' or 'location' is missing
df = df.dropna(subset=['size', 'location'])
print(df.shape)
df.head()

(12774, 9)


Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [687]:
# Fill 'bath' with median
df['bath'] = df['bath'].fillna(df['bath'].median())
print(df.shape)
df.head()

(12774, 9)


Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [688]:
# Let's Create a new columns based on size columns
df['bhk'] = df['size'].apply(lambda x: int(x.split(' ')[0]))
print(df.shape)
df.head()

(12774, 10)


Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0,2


In [689]:
#Drop the original 'size' column
df = df.drop(['size'], axis=1)
print(df.shape)
df.head()

(12774, 9)


Unnamed: 0,area_type,availability,location,society,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,19-Dec,Electronic City Phase II,Coomee,1056,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,Theanmp,2600,5.0,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,,1440,2.0,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,Soiewre,1521,3.0,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,,1200,2.0,1.0,51.0,2


In [690]:
df.shape

(12774, 9)

In [691]:
df.head()

Unnamed: 0,area_type,availability,location,society,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,19-Dec,Electronic City Phase II,Coomee,1056,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,Theanmp,2600,5.0,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,,1440,2.0,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,Soiewre,1521,3.0,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,,1200,2.0,1.0,51.0,2


In [692]:
df['bhk'].unique()

array([ 2,  4,  3,  6,  1,  8,  7,  5, 11,  9, 27, 10, 19, 16, 43, 14, 12,
       13, 18])

### Let's see the outlier

In [693]:
df[df.bhk>20]

Unnamed: 0,area_type,availability,location,society,total_sqft,bath,balcony,price,bhk
1718,Super built-up Area,Ready To Move,2Electronic City Phase II,,8000,27.0,0.0,230.0,27
4684,Plot Area,Ready To Move,Munnekollal,,2400,40.0,0.0,660.0,43


27,43-BHK house is extremely rare. </br>
these are data entry errors (like: someone entered the total square footage in the BHK column) </br>
If we have a 43-BHK house that is only 2,400 sqft, it's an impossible outlier that will confuse our model.

43 bedrooms but only 2400 sqft. Mathematically, that means each room is about 55 sqft (including bathrooms and hallways), which is impossible for a residential home.

Usually, a room needs at least 300 sqft. Anything significantly below that is likely a data entry error.

In [694]:
# Let's check total_sqft
df.total_sqft.unique()

<StringArray>
[       '1056',        '2600',        '1440',        '1521',        '1200',
        '1170',        '2732',        '3300',        '1310',        '1020',
 ...
        '3124',        '9200',         '613',         '250',        '2395',
 '1020 - 1130',        '2758', '1133 - 1384',         '774',        '4689']
Length: 2110, dtype: str

Our unique() check reveals that total_sqft is a StringArray containing values like '1020 - 1130'. Since a machine learning model only understands single numbers, we need to convert these ranges to their average.

In [695]:
# Let's check total_sqft is float or not
def is_float(x):
    try:
        float(x)
    except:
        return False
    return True

In [696]:
df[~df['total_sqft'].apply(is_float)].head(10)

Unnamed: 0,area_type,availability,location,society,total_sqft,bath,balcony,price,bhk
30,Super built-up Area,19-Dec,Yelahanka,LedorSa,2100 - 2850,4.0,0.0,186.0,4
56,Built-up Area,20-Feb,Devanahalli,BrereAt,3010 - 3410,2.0,,192.0,4
81,Built-up Area,18-Oct,Hennur Road,Gollela,2957 - 3450,2.0,,224.5,4
122,Super built-up Area,18-Mar,Hebbal,SNontle,3067 - 8156,4.0,0.0,477.0,4
137,Super built-up Area,19-Mar,8th Phase JP Nagar,Vaarech,1042 - 1105,2.0,0.0,54.005,2
165,Super built-up Area,18-Dec,Sarjapur,Kinuerg,1145 - 1340,2.0,0.0,43.49,2
188,Super built-up Area,Ready To Move,KR Puram,MCvarar,1015 - 1540,2.0,0.0,56.8,2
224,Super built-up Area,19-Dec,Devanahalli,Jurdsig,1520 - 1740,2.0,,74.82,3
410,Super built-up Area,Ready To Move,Kengeri,,34.46Sq. Meter,1.0,0.0,18.5,1
549,Super built-up Area,18-Sep,Hennur Road,Shxorm,1195 - 1440,2.0,0.0,63.77,2


In [697]:
# as we can see the there is massy data let's convert into number
def convert_sqft_to_num(x):
    tokens = x.split('-')
    if len(tokens) == 2:
        return (float(tokens[0]) + float(tokens[1]))/2
    try:
        return float(x)
    except:
        return None


In [698]:
convert_sqft_to_num('11225')

11225.0

In [699]:
# For getting average
convert_sqft_to_num('1145 - 1340')

1242.5

In [700]:
# For getting None
convert_sqft_to_num('34.46Sq. Meter')

In [701]:
df['total_sqft'] = df['total_sqft'].apply(convert_sqft_to_num)
df.head()

Unnamed: 0,area_type,availability,location,society,total_sqft,bath,balcony,price,bhk
0,Super built-up Area,19-Dec,Electronic City Phase II,Coomee,1056.0,2.0,1.0,39.07,2
1,Plot Area,Ready To Move,Chikka Tirupathi,Theanmp,2600.0,5.0,3.0,120.0,4
2,Built-up Area,Ready To Move,Uttarahalli,,1440.0,2.0,3.0,62.0,3
3,Super built-up Area,Ready To Move,Lingadheeranahalli,Soiewre,1521.0,3.0,1.0,95.0,3
4,Super built-up Area,Ready To Move,Kothanur,,1200.0,2.0,1.0,51.0,2


In [702]:
# Let's Check our 30 number index where total_sqft is 2100 - 2850 
df.iloc[30]

area_type       Super built-up  Area
availability                  19-Dec
location                   Yelahanka
society                      LedorSa
total_sqft                    2475.0
bath                             4.0
balcony                          0.0
price                          186.0
bhk                                4
Name: 30, dtype: object

In [703]:
# have a look it's cheange successfully 
(2100 + 2850)/2

2475.0

# Save Clean data

In [704]:
df.to_csv(r"../data/v2_Clean.csv", index=False)

"We noticed some houses had 43 bedrooms in just 2400 square feet. By creating a threshold of 300 sqft per room, we remove these 'dirty' data points. This ensures our model doesn't get confused by impossible scenarios, leading to much higher accuracy on real-world homes."

### This is for src/components/b_cleaning.py

In [705]:
import os
import sys
import pandas as pd
import numpy as np
from src.exception import CustomException
from src.logger import configure_logger

logger = configure_logger("DataCleaning")

class DataCleaning:
    def __init__(self):
        pass

    def convert_sqft_to_num(self, x):
        """Converts ranges to average and handles non-numeric strings."""
        try:
            tokens = str(x).split('-')
            if len(tokens) == 2:
                return (float(tokens[0]) + float(tokens[1])) / 2
            return float(x)
        except Exception:
            return None

    def start_data_cleaning(self, df: pd.DataFrame) -> pd.DataFrame:
        try:
            logger.info("ðŸ§¼ Starting the Data Cleaning process...")

            # 1. Create a copy to avoid SettingWithCopyWarning
            df = df.copy()

            # 2. Handle Duplicates
            df = df.drop_duplicates()
            logger.info(f"Duplicates removed. Shape: {df.shape}")

            # 3. Clean total_sqft using helper function
            df['total_sqft'] = df['total_sqft'].apply(self.convert_sqft_to_num)
            
            # 4. Drop rows with missing values
            df = df.dropna(subset=['total_sqft', 'bath', 'location', 'size'])
            
            # 5. Extract BHK from 'size' (Needed for outlier removal)
            df['bhk'] = df['size'].apply(lambda x: int(x.split(' ')[0]))

            # 6. Outlier Removal: Business Logic (300 sqft per BHK rule)
            df = df[~(df.total_sqft / df.bhk < 300)]
            
            logger.info(f"Data Cleaning complete. Final shape: {df.shape}")
            return df

        except Exception as e:
            logger.error("Error occurred in Data Cleaning component")
            raise CustomException(e, sys)

if __name__ == "__main__":
    cleaner = DataCleaning()
    clean_df = cleaner.start_data_cleaning(df)
    pass

22:49:06 | INFO     | DataCleaning | ðŸ§¼ Starting the Data Cleaning process...


22:49:06 | INFO     | DataCleaning | Duplicates removed. Shape: (12773, 9)
22:49:06 | ERROR    | DataCleaning | Error occurred in Data Cleaning component


CustomException: Error occurred in python script: [/tmp/ipykernel_93765/3567300258.py] at line number: [39] with error message: [['size']]