In [46]:
import pandas as pd
import numpy as np

In [47]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [48]:
df2 = pd.read_csv('houses.csv')

In [49]:
df2.drop(columns={'link'},inplace=True)

In [50]:
df2.duplicated().sum()
df2.drop_duplicates(inplace=True)

In [51]:
df2 = df2[~df2.price.isna()]

In [52]:
import re

def extract_location(text):
    if not isinstance(text, str):
        return None
    
    # 1️⃣ Try extracting sector
    sector_match = re.search(
        r'in\s+(Sector[-\s]?\d+[A-Za-z]?)',
        text,
        re.I
    )
    
    if sector_match:
        return sector_match.group(1).title()
    
    # 2️⃣ Fallback: extract project / society name
    project_match = re.search(
        r'in\s+([A-Za-z ]+)',
        text,
        re.I
    )
    
    if project_match:
        return project_match.group(1).strip().title()
    
    return None


In [53]:
df2.sample()

Unnamed: 0,property_name,society,price,rate,area,areaWithType,bedRoom,bathroom,balcony,additionalRoom,address,noOfFloor,facing,agePossession,nearbyLocations,description,furnishDetails,features,rating,property_id
865,4 Bedroom House for sale in Sector 56 Gurgaon,,2.99 Crore,"₹ 20,635/sq.ft.",(135 sq.m.) Plot Area,Plot area 161(134.62 sq.m.),4 Bedrooms,4 Bathrooms,2 Balconies,"Pooja Room,Study Room,Servant Room","242, Sector 56 Gurgaon, Gurgaon, Haryana",2 Floors,South,5 to 10 Year Old,"['Sector metro station', 'Sector metro station...",This 3 bhk house for sale in sector 56 gurgaon...,"['3 Wardrobe', '7 Fan', '1 Exhaust Fan', '4 Ge...","['Feng Shui / Vaastu Compliant', 'Private Gard...","['Environment4 out of 5', 'Safety4 out of 5', ...",N68716724


In [54]:
df2['sector'] = df2.property_name.apply(extract_location)

In [55]:
df2[df2.sector == 'sector']

Unnamed: 0,property_name,society,price,rate,area,areaWithType,bedRoom,bathroom,balcony,additionalRoom,address,noOfFloor,facing,agePossession,nearbyLocations,description,furnishDetails,features,rating,property_id,sector


In [56]:
df2['society_name'] = df2.society.str.split(r'(\d+)').str.get(0)\
                                .str.strip().str.lower()


In [57]:
def convert_price(text):
    if 'crore' in text.lower():
        return float(text.split(" ")[0])
    elif 'lac' in text.lower():
        return float(text.split(" ")[0])/100
    else:
        return 0

In [58]:
df2['Price'] = df2.price.apply(convert_price)

In [59]:
df2=df2[~(df2.Price == 0)]

In [60]:
df2.shape

(945, 23)

In [61]:
df2.drop(columns={'property_name','society','price'},inplace=True)

In [62]:
df2.rate=df2.rate.str.split("/").str.get(0).str.split(" ").str.get(1).str.replace(",","").astype(int)

In [63]:
df2['area'] = round((df2.Price*10000000)/df2['rate'],2)

In [64]:
df2.sample(5)

Unnamed: 0,rate,area,areaWithType,bedRoom,bathroom,balcony,additionalRoom,address,noOfFloor,facing,agePossession,nearbyLocations,description,furnishDetails,features,rating,property_id,sector,society_name,Price
23,26235,3239.95,Plot area 360(301.01 sq.m.),4 Bedrooms,4 Bathrooms,2 Balconies,"Pooja Room,Servant Room,Store Room","Sector 48 Gurgaon, Gurgaon, Haryana",,East,5 to 10 Year Old,"['Athena', 'Star Mall', 'The Sixth Element Sch...",Three side open villa in tatvam villa sector-4...,"['1 Chimney', '1 Modular Kitchen', 'No AC', 'N...","['Feng Shui / Vaastu Compliant', 'Private Gard...","['Environment4 out of 5', 'Lifestyle4 out of 5...",Q69361158,Sector 48,vipul tatvam villa,8.5
813,31926,1350.0,Plot area 150(125.42 sq.m.),5 Bedrooms,4 Bathrooms,3 Balconies,,"DLF Phase 2, Gurgaon, Haryana",3 Floors,North,10+ Year Old,"['Vodafone belvedere towers metro station', 'D...",Deal of the month \n150 sqyards kothi for sale...,[],"['Feng Shui / Vaastu Compliant', 'Maintenance ...","['Environment5 out of 5', 'Lifestyle5 out of 5...",R70186684,Dlf Phase,,4.31
915,42222,2250.01,Plot area 302(252.51 sq.m.)Built Up area: 300 ...,5 Bedrooms,5 Bathrooms,3+ Balconies,"Servant Room,Pooja Room","Sushant Lok 1, Sushant Lok Phase 1, Gurgaon, H...",2 Floors,North-East,1 to 5 Year Old,"['Huda city centre metro station', 'New Life C...","Sushant lok 1 very prime location , this kothi...","['1 Water Purifier', '6 Fan', '1 Exhaust Fan',...","['Feng Shui / Vaastu Compliant', 'Private Gard...","['Environment5 out of 5', 'Safety4.5 out of 5'...",B70289660,Sushant Lok Phase,,9.5
848,36883,3239.98,Plot area 360(301.01 sq.m.),3 Bedrooms,3 Bathrooms,2 Balconies,"Study Room,Servant Room","Nirvana Country, Gurgaon, Haryana",1 Floors,West,5 to 10 Year Old,"['Radhakrishna Shani Mandir', 'Standard charte...",Park fecing villa for sale in nirvana country ...,"['1 Wardrobe', '1 Fan', '1 Geyser', '3 Light',...","['Feng Shui / Vaastu Compliant', 'Private Gard...","['Environment5 out of 5', 'Safety5 out of 5', ...",T69900704,Nirvana Country,unitech nirvana birch court,11.95
905,38986,3078.03,Plot area 342(285.96 sq.m.),16 Bedrooms,16 Bathrooms,3+ Balconies,"Pooja Room,Study Room,Servant Room,Store Room","Sector 40 Gurgaon, Gurgaon, Haryana",4 Floors,,0 to 1 Year Old,"['Huda city centre metro station', 'Axis bank ...",Ready to move in simplex kothi available for s...,"['20 Wardrobe', '24 Fan', '18 Geyser', '99 Lig...",,"['Environment4 out of 5', 'Safety4 out of 5', ...",J70268910,Sector 40,,12.0


In [65]:
df2.bedRoom = df2.bedRoom.str.split(" ").str.get(0).astype(int)
df2.bathroom = df2.bathroom.str.split(" ").str.get(0).astype(int)
df2.balcony = df2.balcony.str.split(" ").str.get(0)

In [66]:
# df2.balcony.value_counts()

In [67]:
df2.additionalRoom.fillna("Not",inplace=True)
def study(text):
    if "Study Room" in text:
        return 1
    else:
        return 0
def other(text):
    if "Other" in text:
        return 1
    else:
        return 0
def servant(text):
    if "Servant Room" in text:
        return 1
    else:
        return 0
def pooja(text):
    if "Pooja Room" in text:
        return 1
    else:
        return 0

In [68]:
df2['Study_room'] = df2.additionalRoom.apply(study)
df2['Servant_room'] = df2.additionalRoom.apply(servant)
df2['Pooja_room'] = df2.additionalRoom.apply(pooja)
df2['Other'] = df2.additionalRoom.apply(other)

In [69]:
df2.drop(columns={'additionalRoom'},inplace=True)

In [70]:
df2.noOfFloor = df2.noOfFloor.str.split(" ").str.get(0)

In [71]:
df2.drop(columns={'address','nearbyLocations','description','property_id','rating'},inplace=True)

In [72]:
df2.agePossession.fillna('undefined',inplace=True)
# df.agePossession.value_counts()

In [73]:
def age(text):
    if text == "0 to 1 Year Old":
        return "Newly House"
    elif text == "1 to 5 Year Old":
        return "Relatively New"
    elif text == "10+ Year Old":
        return "Old"
    elif text == "5 to 10 Year Old":
        return "Mid"
    elif text == "undefined":
        return "undefined"
    else:
        return "Under Construction"
    

In [74]:
df2['age'] = df2.agePossession.apply(age)

In [75]:
df2.drop(columns={'agePossession'},inplace=True)

In [76]:
df2.features.fillna("[]",inplace=True)

In [77]:
import ast
li = set()
df2.reset_index(drop=True, inplace=True)
import ast
for i in range(df2.shape[0]):
    for j in ast.literal_eval(df2['features'][i]):
        li.add(j)

In [78]:
for i in li:
    print(i)

Low Density Society
Security Personnel
Club house / Community Center
Water Storage
Recently Renovated
Separate entry for servant room
Internet/wi-fi connectivity
Water purifier
False Ceiling Lighting
Private Garden / Terrace
Swimming Pool
Bank Attached Property
Waste Disposal
Fitness Centre / GYM
Rain Water Harvesting
Security / Fire Alarm
Spacious Interiors
Park
Natural Light
No open drainage around
Visitor Parking
Feng Shui / Vaastu Compliant
Maintenance Staff
Piped-gas
Centrally Air Conditioned
Airy Rooms
High Ceiling Height


In [79]:
feature_weights = {
    "Club house / Community Center": 3,
    "Swimming Pool": 3,
    "Fitness Centre / GYM": 2,
    "Lift(s)": 2,
    "Power Back-up": 2,
    "Security Personnel": 2,
    "Park": 2,

    "Internet/wi-fi connectivity": 1,
    "Intercom Facility": 1,
    "Visitor Parking": 1,
    "Water Storage": 1,
    "Waste Disposal": 1,
    "Maintenance Staff": 1,
    "Piped-gas": 1,
    "Rain Water Harvesting": 1,
    "Feng Shui / Vaastu Compliant": 1,

    "Private Garden / Terrace": 2,
    "High Ceiling Height": 1,
    "False Ceiling Lighting": 1,
    "Spacious Interiors": 1,
    "Airy Rooms": 1,
    "Natural Light": 1
}
def feature_score(text):
    score = 0
    for feature,weight in feature_weights.items():
        if feature in ast.literal_eval(text):
            score += weight
    return score

In [80]:
df2['feature_score'] = df2['features'].apply(feature_score)

In [81]:
furnish_weights = {
    'AC': 3,
    'Bed': 3,
    'Wardrobe': 2,
    'Sofa': 2,
    'TV': 2,
    'Fridge': 2,
    'Washing Machine': 2,
    'Modular Kitchen': 3,

    'Fan': 1,
    'Light': 0.5,
    'Geyser': 1,
    'Curtains': 0.5,
    'Chimney': 1,
    'Microwave': 1,
    'Dining Table': 1,
    'Exhaust Fan': 0.5,
    'Stove': 1,
    'Water Purifier': 1
}
import re
import  ast
def furnished_score(furnish_list):
    score = 0

    for item in ast.literal_eval(furnish_list):
        # skip items starting with "No"
        if item.lower().startswith('no'):
            continue

        # extract quantity and item name
        match = re.match(r'(\d+)\s+(.*)', item)
        if match:
            qty = int(match.group(1))
            name = match.group(2)

            # add weighted score
            for key, weight in furnish_weights.items():
                if key.lower() in name.lower():
                    score += qty * weight

    return score

In [82]:
df2.furnishDetails.fillna("[]",inplace=True)
df2['furnish_score'] = df2.furnishDetails.apply(furnished_score)

In [83]:
df2.drop(columns={'furnishDetails','features'},inplace=True)

In [84]:
df2.areaWithType.value_counts()

areaWithType
Plot area 360(301.01 sq.m.)                                                                                            37
Plot area 300(250.84 sq.m.)                                                                                            27
Plot area 200(167.23 sq.m.)                                                                                            20
Plot area 270(225.75 sq.m.)                                                                                            18
Plot area 502(419.74 sq.m.)                                                                                            18
Plot area 900(83.61 sq.m.)                                                                                             17
Plot area 500(418.06 sq.m.)                                                                                            14
Plot area 150(125.42 sq.m.)                                                                                            14
Plot area 1

In [85]:
df2.feature_score.max()

np.int64(27)

In [86]:
df2.sample(4)

Unnamed: 0,rate,area,areaWithType,bedRoom,bathroom,balcony,noOfFloor,facing,sector,society_name,Price,Study_room,Servant_room,Pooja_room,Other,age,feature_score,furnish_score
586,11102,539.99,Plot area 60(50.17 sq.m.),8,4,3,3,West,Sector 105,,0.5995,0,0,0,0,Newly House,4,17.5
198,8697,2150.17,Built Up area: 2150 (199.74 sq.m.),3,4,No,12,,Sector 109,ats tourmaline,1.87,0,0,0,0,undefined,0,0.0
750,1087,50597.98,Plot area 5620(4699.04 sq.m.)Built Up area: 82...,8,8,2,3,North-East,Mayfield Garden,,5.5,0,0,1,0,Relatively New,10,79.0
47,20690,2609.96,Plot area 290(242.48 sq.m.),4,5,3+,3,North,Sector 70A,bptp visionnaire,5.4,1,1,1,0,Relatively New,24,52.0


In [87]:
df2.drop(columns={'areaWithType'},inplace=True)

In [88]:
df2.insert(loc=1,column='property_type',value='house')

In [89]:
df2.rename({'noOfFloor':'floor'},inplace=True)

In [90]:
df2.to_csv('houses_cleaned.csv',index=False)