In [1]:
import pandas as pd
import sklearn

In [2]:
# Show the installed scikit-learn version.
sklearn.__version__

'1.5.0'

In [3]:
from pathlib import Path
# Load the raw flat listings; the path is relative to the notebook's working directory.
raw_csv_path = Path("../data/raw/flats.csv")
df = pd.read_csv(raw_csv_path)

In [4]:
# Preview the first rows of the raw listings.
df.head()

Unnamed: 0,property_name,link,society,price,area,areaWithType,bedRoom,bathroom,balcony,additionalRoom,address,floorNum,facing,agePossession,nearbyLocations,description,furnishDetails,features,rating,property_id
0,2 BHK Flat in Krishna Colony,https://www.99acres.com/2-bhk-bedroom-apartmen...,maa bhagwati residency,45 Lac,"₹ 5,000/sq.ft.",Carpet area: 900 (83.61 sq.m.),2 Bedrooms,2 Bathrooms,1 Balcony,,"Krishna Colony, Gurgaon, Haryana",4th of 4 Floors,West,1 to 5 Year Old,"['Chintapurni Mandir', 'State bank ATM', 'Pear...",So with lift.Maa bhagwati residency is one of ...,"['3 Fan', '4 Light', '1 Wardrobe', 'No AC', 'N...","['Feng Shui / Vaastu Compliant', 'Security / F...","['Environment4 out of 5', 'Safety4 out of 5', ...",C68850746
1,2 BHK Flat in Ashok Vihar,https://www.99acres.com/2-bhk-bedroom-apartmen...,Apna Enclave,50 Lac,"₹ 7,692/sq.ft.",Carpet area: 650 (60.39 sq.m.),2 Bedrooms,2 Bathrooms,1 Balcony,,"46b, Ashok Vihar, Gurgaon, Haryana",1st of 3 Floors,West,10+ Year Old,"['Chintapurni Mandir', 'Sheetla Mata Mandir', ...","Property situated on main road, railway statio...","['3 Wardrobe', '4 Fan', '1 Exhaust Fan', '1 Ge...","['Security / Fire Alarm', 'Maintenance Staff',...","['Environment4 out of 5', 'Safety4 out of 5', ...",H68850564
2,2 BHK Flat in Sohna,https://www.99acres.com/2-bhk-bedroom-apartmen...,Tulsiani Easy in Homes,40 Lac,"₹ 6,722/sq.ft.",Carpet area: 595 (55.28 sq.m.),2 Bedrooms,2 Bathrooms,3 Balconies,,"Sohna, Gurgaon, Haryana",12nd of 14 Floors,,0 to 1 Year Old,"['Huda City Metro', 'Golf Course extn road', '...","This property is 15 km away from badshapur, gu...",,"['Power Back-up', 'Feng Shui / Vaastu Complian...","['Environment4 out of 5', 'Safety4 out of 5', ...",J68850120
3,2 BHK Flat in Sector 61 Gurgaon,https://www.99acres.com/2-bhk-bedroom-apartmen...,Smart World Orchard,1.47 Crore,"₹ 12,250/sq.ft.",Carpet area: 1200 (111.48 sq.m.),2 Bedrooms,2 Bathrooms,2 Balconies,Study Room,"Sector 61 Gurgaon, Gurgaon, Haryana",2nd of 4 Floors,,Dec 2023,"['Sector 55-56 Metro station', 'Bestech Centra...",Near to metro station of sector 56 and opposit...,,"['Security / Fire Alarm', 'Private Garden / Te...",,S68849476
4,2 BHK Flat in Sector 92 Gurgaon,https://www.99acres.com/2-bhk-bedroom-apartmen...,Parkwood Westend,70 Lac,"₹ 5,204/sq.ft.",Super Built up area 1345(124.95 sq.m.),2 Bedrooms,2 Bathrooms,3 Balconies,Study Room,"Sector 92 Gurgaon, Gurgaon, Haryana",5th of 8 Floors,,Under Construction,"['Yadav Clinic', 'Bangali Clinic', 'Dr. J. S. ...",We are the proud owners of this 2 bhk alongwit...,[],,"['Environment5 out of 5', 'Safety3 out of 5', ...",L47956793


In [5]:
# (rows, columns) of the raw frame.
df.shape

(3028, 20)

In [6]:
# Drop the link and property_id columns, which are not needed further on.
# errors='ignore' plus rebinding (instead of inplace=True) keeps this cell
# re-runnable: a second run finds the columns already gone and is a no-op
# instead of raising KeyError.
df = df.drop(columns=['link', 'property_id'], errors='ignore')

In [7]:
# Column-by-column cleaning starts here.
# The scraped 'area' column actually holds a price-per-square-foot string
# (e.g. "₹ 5,000/sq.ft."), so give it a name that says so.
column_renames = {'area': 'price_per_sqft'}
df.rename(columns=column_renames, inplace=True)

In [8]:
# Listing count per society; note in the output that some names carry a
# rating suffix such as "3.8 ★" fused onto the society name.
df['society'].value_counts()

society
SS The Leaf3.8 ★                             73
Tulip Violet4.3 ★                            40
Shapoorji Pallonji Joyville Gurugram4.0 ★    39
Signature Global Park4.0 ★                   36
Shree Vardhman Victoria3.8 ★                 35
                                             ..
shree kirpalu gi sector 52                    1
Meditech Apartment                            1
Mariners Home                                 1
IMT View Society                              1
Spire Woods Now Ananda by Alpha corp          1
Name: count, Length: 639, dtype: int64

In [9]:
import re

class DataCleaning:
    """Cleaning steps for the flat-listings DataFrame.

    Every method operates on ``self.df`` (never on a notebook-global frame),
    leaves the result on ``self.df`` and also returns it, so calls can be
    chained as ``df = cleaner.step(...)``.
    """

    # Lower-cased unit tokens marking a price quoted in lakh (100 lakh = 1 crore).
    _LAKH_UNITS = frozenset({'lac', 'lacs', 'lakh', 'lakhs'})

    # Matches a whole price-per-area value such as "₹ 5,000/sq.ft." and captures
    # the number. The currency sign and the "/sq.ft." suffix are optional, so a
    # bare or already-numeric value ("5000", "5000.0") is matched as well.
    _PER_SQFT_PATTERN = re.compile(
        r'^\s*₹?\s*(\d[\d,]*(?:\.\d+)?)\s*(?:/?\s*sq\.?\s*ft\.?)?\s*$',
        re.IGNORECASE,
    )

    def __init__(self,df) -> None:
        """Store the DataFrame that the cleaning methods will work on."""
        self.df = df

    def split_bhk_area(self,prop_name):
        """Split a "<count> <rest>" text column at its first space.

        The text before the first space goes to a new ``BHK`` column (kept as
        a string) and everything after it to ``society1``; a value without a
        space yields an empty ``society1``. The source column is then dropped.

        Args:
            prop_name: name of the column to split, e.g. ``"property_name"``.

        Returns:
            ``self.df`` with ``BHK`` and ``society1`` appended and
            ``prop_name`` removed.
        """
        # partition() always yields three columns (head, separator, tail), so
        # the assignment works even when no value contains a space.
        parts = self.df[prop_name].str.partition(' ')
        self.df['BHK'] = parts[0]
        self.df['society1'] = parts[2]
        # Dropped in place so the frame handed to __init__ reflects the change.
        self.df.drop(columns=[prop_name],inplace=True)

        return self.df

    def conv_price_unit_to_cr(self,prop_name):
        """Convert "<amount> <unit>" price strings to a numeric crore column.

        Amounts whose unit is a lakh spelling (matched case-insensitively) are
        divided by 100; any other two-token value is taken to be in crore
        already. The column is renamed to ``price_in_cr``, rows whose price is
        the literal ``'Price on Request'`` are removed, and any remaining value
        that is not a number becomes NaN.

        Args:
            prop_name: name of the raw price column, e.g. ``"price"``.

        Returns:
            The filtered frame (also stored on ``self.df``) with a float
            ``price_in_cr`` column.
        """
        lakh_units = self._LAKH_UNITS

        def convert_price(price):
            # Only the "<amount> <unit>" shape is converted; everything else
            # (missing values, free text) is passed through untouched.
            parts = str(price).split(' ')
            if len(parts) != 2:
                return price
            amount_text, unit = parts
            try:
                amount = float(amount_text)
            except ValueError:
                return price
            # Always return a float so the column does not end up mixing
            # strings ("1.47") with floats (0.45).
            return amount / 100 if unit.lower() in lakh_units else amount

        self.df[prop_name] = self.df[prop_name].apply(convert_price)
        self.df.rename(columns={prop_name:'price_in_cr'},inplace=True)
        # 'Price on Request' listings carry no usable price; remove them.
        # .copy() detaches the result so later column assignments do not
        # trigger chained-assignment warnings.
        priced = self.df[self.df['price_in_cr'] != 'Price on Request'].copy()
        # Make the column genuinely numeric; leftover text turns into NaN.
        priced['price_in_cr'] = pd.to_numeric(priced['price_in_cr'], errors='coerce')
        self.df = priced
        return self.df

    def area_clean(self, area_feature):
        """Parse price-per-square-foot strings into numbers.

        Values such as ``"₹ 5,000/sq.ft."`` become ``5000``. Values that do
        not look like a price (including missing ones) become NaN. Values that
        are already numeric are left unchanged, so the method can safely be
        run more than once.

        Args:
            area_feature: name of the column to parse; the parsed values are
                written back to this same column.

        Returns:
            ``self.df`` with ``area_feature`` converted to a numeric dtype.
        """
        pattern = self._PER_SQFT_PATTERN

        def extract_number(value):
            # Match the whole value rather than deleting individual characters:
            # stripping "." would turn "5000.0" into "50000".
            match = pattern.match(str(value))
            if match is None:
                return None
            return match.group(1).replace(',', '')

        number_text = self.df[area_feature].map(extract_number)
        self.df[area_feature] = pd.to_numeric(number_text, errors='coerce')

        return self.df

      
  

In [10]:
# Wrap the frame in the cleaner, then split property_name into the BHK and
# society1 columns (property_name itself is dropped by the method).
DC = DataCleaning(df)
df = DC.split_bhk_area("property_name")

In [11]:
# Check that BHK and society1 now appear in place of property_name.
df.head()

Unnamed: 0,society,price,price_per_sqft,areaWithType,bedRoom,bathroom,balcony,additionalRoom,address,floorNum,facing,agePossession,nearbyLocations,description,furnishDetails,features,rating,BHK,society1
0,maa bhagwati residency,45 Lac,"₹ 5,000/sq.ft.",Carpet area: 900 (83.61 sq.m.),2 Bedrooms,2 Bathrooms,1 Balcony,,"Krishna Colony, Gurgaon, Haryana",4th of 4 Floors,West,1 to 5 Year Old,"['Chintapurni Mandir', 'State bank ATM', 'Pear...",So with lift.Maa bhagwati residency is one of ...,"['3 Fan', '4 Light', '1 Wardrobe', 'No AC', 'N...","['Feng Shui / Vaastu Compliant', 'Security / F...","['Environment4 out of 5', 'Safety4 out of 5', ...",2,BHK Flat in Krishna Colony
1,Apna Enclave,50 Lac,"₹ 7,692/sq.ft.",Carpet area: 650 (60.39 sq.m.),2 Bedrooms,2 Bathrooms,1 Balcony,,"46b, Ashok Vihar, Gurgaon, Haryana",1st of 3 Floors,West,10+ Year Old,"['Chintapurni Mandir', 'Sheetla Mata Mandir', ...","Property situated on main road, railway statio...","['3 Wardrobe', '4 Fan', '1 Exhaust Fan', '1 Ge...","['Security / Fire Alarm', 'Maintenance Staff',...","['Environment4 out of 5', 'Safety4 out of 5', ...",2,BHK Flat in Ashok Vihar
2,Tulsiani Easy in Homes,40 Lac,"₹ 6,722/sq.ft.",Carpet area: 595 (55.28 sq.m.),2 Bedrooms,2 Bathrooms,3 Balconies,,"Sohna, Gurgaon, Haryana",12nd of 14 Floors,,0 to 1 Year Old,"['Huda City Metro', 'Golf Course extn road', '...","This property is 15 km away from badshapur, gu...",,"['Power Back-up', 'Feng Shui / Vaastu Complian...","['Environment4 out of 5', 'Safety4 out of 5', ...",2,BHK Flat in Sohna
3,Smart World Orchard,1.47 Crore,"₹ 12,250/sq.ft.",Carpet area: 1200 (111.48 sq.m.),2 Bedrooms,2 Bathrooms,2 Balconies,Study Room,"Sector 61 Gurgaon, Gurgaon, Haryana",2nd of 4 Floors,,Dec 2023,"['Sector 55-56 Metro station', 'Bestech Centra...",Near to metro station of sector 56 and opposit...,,"['Security / Fire Alarm', 'Private Garden / Te...",,2,BHK Flat in Sector 61 Gurgaon
4,Parkwood Westend,70 Lac,"₹ 5,204/sq.ft.",Super Built up area 1345(124.95 sq.m.),2 Bedrooms,2 Bathrooms,3 Balconies,Study Room,"Sector 92 Gurgaon, Gurgaon, Haryana",5th of 8 Floors,,Under Construction,"['Yadav Clinic', 'Bangali Clinic', 'Dr. J. S. ...",We are the proud owners of this 2 bhk alongwit...,[],,"['Environment5 out of 5', 'Safety3 out of 5', ...",2,BHK Flat in Sector 92 Gurgaon


In [12]:
# Convert prices to crore; the column comes back renamed to price_in_cr and
# rows priced as 'Price on Request' are removed.
df = DC.conv_price_unit_to_cr('price')


In [13]:
# Spot-check two random rows after the price conversion.
df.sample(2)

Unnamed: 0,society,price_in_cr,price_per_sqft,areaWithType,bedRoom,bathroom,balcony,additionalRoom,address,floorNum,facing,agePossession,nearbyLocations,description,furnishDetails,features,rating,BHK,society1
345,Signature Global City,1.0,"₹ 8,160/sq.ft.",Built Up area: 1250 (116.13 sq.m.)Carpet area:...,3 Bedrooms,3 Bathrooms,3 Balconies,,"Sector 37d, Sector 37D Gurgaon, Gurgaon, Haryana",4th of 4 Floors,South-East,Mar 2024,"['Sapphire 83 Mall Sector 83', 'Dwarka Express...",3bhk type b 1250sq ft. 4th floor with roof.\nS...,"['1 Stove', '4 AC', '1 Chimney', '1 Modular Ki...","['Feng Shui / Vaastu Compliant', 'Security / F...","['Environment4 out of 5', 'Lifestyle4 out of 5...",3,BHK Flat in Sector 37D Gurgaon
1733,Antriksh Heights3.7 ★,0.6,"₹ 5,000/sq.ft.",Super Built up area 1200(111.48 sq.m.),2 Bedrooms,2 Bathrooms,2 Balconies,,"Antriksh Heights, Sector 84 Gurgaon, Gurgaon, ...",14th of 19 Floors,North,1 to 5 Year Old,"['Old Bengali Market', 'MatriKiran High School...",Corner unit sun facing for sale call for more ...,,"['Power Back-up', 'Lift(s)', 'High Ceiling Hei...","['Green Area4.5 out of 5', 'Construction3.5 ou...",2,BHK Flat in Sector 84 Gurgaon


In [14]:
# Inspect the raw price_per_sqft strings (currency sign, thousands commas,
# "/sq.ft." suffix) before parsing them.
df['price_per_sqft'].value_counts()

price_per_sqft
₹ 10,000/sq.ft.    19
₹ 12,500/sq.ft.    16
₹ 8,000/sq.ft.     16
₹ 6,666/sq.ft.     13
₹ 5,000/sq.ft.     13
                   ..
₹ 12,401/sq.ft.     1
₹ 6,224/sq.ft.      1
₹ 8,179/sq.ft.      1
₹ 12,802/sq.ft.     1
₹ 35,222/sq.ft.     1
Name: count, Length: 2131, dtype: int64

In [15]:
# Parse the price_per_sqft strings into numbers; values that cannot be parsed
# end up missing.
df = DC.area_clean("price_per_sqft")


In [16]:
# Spot-check two random rows after parsing price_per_sqft.
df.sample(2)

Unnamed: 0,society,price_in_cr,price_per_sqft,areaWithType,bedRoom,bathroom,balcony,additionalRoom,address,floorNum,facing,agePossession,nearbyLocations,description,furnishDetails,features,rating,BHK,society1
968,La Vida by Tata Housing,1.65,10449.0,Super Built up area 1579(146.69 sq.m.)Built Up...,3 Bedrooms,2 Bathrooms,2 Balconies,,"Sector 113 Gurgaon , Gurgaon, Haryana",3rd of 13 Floors,North-East,1 to 5 Year Old,"['Dwarka Sector 21', 'Pacific D21 Mall', 'Bajg...",This 3 bhk apartment is available for sale in ...,"['2 Geyser', 'No AC', 'No Bed', 'No Chimney', ...","['Feng Shui / Vaastu Compliant', 'Security / F...",,3,BHK Flat in Sector 113 Gurgaon
474,Ambience Creacions,4.1,28101.0,Super Built up area 2781(258.36 sq.m.)Carpet a...,3 Bedrooms,3 Bathrooms,3+ Balconies,,"101, Sector 22 Gurgaon, Gurgaon, Haryana",2nd of 20 Floors,North-East,0 to 1 Year Old,"['Moulsari Avenue', 'Ambience Mall', 'Old Delh...",Closer to highway nd accessible to near shops,"['1 Fan', '1 Fridge', '1 Geyser', '1 Stove', '...","['Feng Shui / Vaastu Compliant', 'Intercom Fac...","['Environment4 out of 5', 'Lifestyle4 out of 5...",3,BHK Flat in Sector 22 Gurgaon


In [17]:
# Per-column null counts (not displayed, because this is not the cell's last line).
df.isnull().sum()
# 20 flats have no price provided; keep only the rows that do have one.
has_price = df['price_in_cr'].notnull()
df = df[has_price]

In [18]:
# Missing values per column after removing the rows without a price.
df.isnull().sum()

society               1
price_in_cr           0
price_per_sqft        1
areaWithType          0
bedRoom               0
bathroom              0
balcony               0
additionalRoom     1304
address               6
floorNum              2
facing              874
agePossession         1
nearbyLocations      91
description           0
furnishDetails      796
features            407
rating              328
BHK                   0
society1              0
dtype: int64

In [19]:
# Keep only rows with a parsed price_per_sqft. The explicit .copy() makes df an
# independent frame rather than a slice of the previous one, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
df = df[df['price_per_sqft'].notna()].copy()

In [20]:
# Every label starts with the bedroom count ("3 Bedrooms", "1 Bedroom"), so the
# column is easy to clean: the leading token can be parsed as an integer.
df['bedRoom'].value_counts()

bedRoom
3 Bedrooms    1437
2 Bedrooms     943
4 Bedrooms     478
1 Bedroom      104
5 Bedrooms      31
6 Bedrooms       3
Name: count, dtype: int64

In [21]:
def _leading_count(label):
    """Return the integer that starts a label such as "3 Bedrooms".

    Missing or empty labels count as 0. A value that is already an integer is
    returned unchanged, so this cell can be re-run without failing.
    """
    if pd.isna(label) or label == '':
        return 0
    # str() lets already-converted integers pass through the same path.
    return int(str(label).split(' ')[0])


# Both columns use the same "<count> <noun>" format, so parse them the same way.
for count_col in ('bedRoom', 'bathroom'):
    df[count_col] = df[count_col].apply(_leading_count)


In [22]:
# A plain int() on the first token would fail for this column: the labels
# include "3+ Balconies" and "No Balcony" (see the counts below), so balcony
# needs its own parser.
df['balcony'].value_counts()

balcony
3 Balconies     973
3+ Balconies    862
2 Balconies     749
1 Balcony       315
No Balcony       97
Name: count, dtype: int64

In [23]:
def bal(count):
    """Convert a balcony label into an integer balcony count.

    Examples: "No Balcony" -> 0, "2 Balconies" -> 2, "3+ Balconies" -> 3.
    Any open-ended label "N+ ..." maps to its lower bound N, "No" is matched
    case-insensitively, and a value that is already a bare integer (e.g. 3)
    is returned as that integer.

    Args:
        count: the balcony label (or an already-parsed integer).

    Returns:
        The number of balconies as an int.

    Raises:
        ValueError: if the first token is neither "No" nor an integer
            optionally followed by "+".
    """
    # Only the first space-separated token carries the count.
    token = str(count).split(' ')[0]
    if token.lower() == 'no':
        return 0
    # "3+" means "three or more"; keep the lower bound.
    return int(token.rstrip('+'))
        

In [24]:
# Replace each balcony label with its integer count using bal().
df['balcony']= df['balcony'].apply(bal)

In [25]:
# Preview the frame with bedRoom, bathroom and balcony now numeric.
df.head()

Unnamed: 0,society,price_in_cr,price_per_sqft,areaWithType,bedRoom,bathroom,balcony,additionalRoom,address,floorNum,facing,agePossession,nearbyLocations,description,furnishDetails,features,rating,BHK,society1
0,maa bhagwati residency,0.45,5000.0,Carpet area: 900 (83.61 sq.m.),2,2,1,,"Krishna Colony, Gurgaon, Haryana",4th of 4 Floors,West,1 to 5 Year Old,"['Chintapurni Mandir', 'State bank ATM', 'Pear...",So with lift.Maa bhagwati residency is one of ...,"['3 Fan', '4 Light', '1 Wardrobe', 'No AC', 'N...","['Feng Shui / Vaastu Compliant', 'Security / F...","['Environment4 out of 5', 'Safety4 out of 5', ...",2,BHK Flat in Krishna Colony
1,Apna Enclave,0.5,7692.0,Carpet area: 650 (60.39 sq.m.),2,2,1,,"46b, Ashok Vihar, Gurgaon, Haryana",1st of 3 Floors,West,10+ Year Old,"['Chintapurni Mandir', 'Sheetla Mata Mandir', ...","Property situated on main road, railway statio...","['3 Wardrobe', '4 Fan', '1 Exhaust Fan', '1 Ge...","['Security / Fire Alarm', 'Maintenance Staff',...","['Environment4 out of 5', 'Safety4 out of 5', ...",2,BHK Flat in Ashok Vihar
2,Tulsiani Easy in Homes,0.4,6722.0,Carpet area: 595 (55.28 sq.m.),2,2,3,,"Sohna, Gurgaon, Haryana",12nd of 14 Floors,,0 to 1 Year Old,"['Huda City Metro', 'Golf Course extn road', '...","This property is 15 km away from badshapur, gu...",,"['Power Back-up', 'Feng Shui / Vaastu Complian...","['Environment4 out of 5', 'Safety4 out of 5', ...",2,BHK Flat in Sohna
3,Smart World Orchard,1.47,12250.0,Carpet area: 1200 (111.48 sq.m.),2,2,2,Study Room,"Sector 61 Gurgaon, Gurgaon, Haryana",2nd of 4 Floors,,Dec 2023,"['Sector 55-56 Metro station', 'Bestech Centra...",Near to metro station of sector 56 and opposit...,,"['Security / Fire Alarm', 'Private Garden / Te...",,2,BHK Flat in Sector 61 Gurgaon
4,Parkwood Westend,0.7,5204.0,Super Built up area 1345(124.95 sq.m.),2,2,3,Study Room,"Sector 92 Gurgaon, Gurgaon, Haryana",5th of 8 Floors,,Under Construction,"['Yadav Clinic', 'Bangali Clinic', 'Dr. J. S. ...",We are the proud owners of this 2 bhk alongwit...,[],,"['Environment5 out of 5', 'Safety3 out of 5', ...",2,BHK Flat in Sector 92 Gurgaon
