In [1]:
import time
import numpy as np
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains
from webdriver_manager.chrome import ChromeDriverManager


class PropertyScraper99Acres:
    """Scrape property listings for one city from 99acres.com and tidy them.

    Typical use: construct the scraper (this starts a Chrome session), call
    ``scrape_properties()`` to fill ``self.data`` with raw text records, then
    call ``clean_data()`` to obtain a normalised ``pandas.DataFrame``.
    """

    # Keys of every raw record. Also used as the column set in ``clean_data``
    # so that an empty scrape still produces a frame with the expected columns.
    _FIELDS = ("name", "location", "price", "area", "bhk")

    def __init__(self, city="Chennai"):
        """Remember the target city and start a Chrome WebDriver session.

        Args:
            city: Text typed into the site's search box. ``clean_data`` also
                removes this word (case-insensitively) from location strings.
        """
        self.city = city
        self.data = []  # raw listing dicts appended by extract_page_data

        # Set Chrome options
        chrome_options = Options()
        for flag in (
            "--disable-http2",
            "--incognito",
            "--disable-blink-features=AutomationControlled",
            "--ignore-certificate-errors",
            "--enable-features=NetworkServiceInProcess",
            "--disable-features=NetworkService",
            "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36",
        ):
            chrome_options.add_argument(flag)

        # Launch driver
        self.driver = webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options
        )
        self.wait = WebDriverWait(self.driver, 10)  # shared 10-second explicit wait
        self.actions = ActionChains(self.driver)

    def wait_to_load(self):
        """Block until ``document.readyState`` is ``"complete"`` and report the outcome.

        A failed wait is printed rather than raised, so callers continue
        regardless of whether the page finished loading.
        """
        try:
            self.wait.until(lambda d: d.execute_script("return document.readyState") == "complete")
        except Exception as e:
            print("Page load timeout:", e)
        else:
            print("Page loaded successfully:", self.driver.title)

    def scrape_properties(self):
        """Run the full scrape: search, apply filters, walk every result page.

        Returns:
            ``self.data``: the list of raw record dicts collected by
            ``extract_page_data``.

        The browser is always closed before returning or propagating an
        exception, so the scraper instance cannot be reused afterwards.
        """
        try:
            self.driver.get("https://www.99acres.com/")
            self.wait_to_load()
            self._search_city()
            self._apply_filters()
            self._collect_all_pages()
        finally:
            # Release the browser even if navigation raised or the user
            # interrupted the run; otherwise the Chrome process is leaked.
            self.driver.quit()
        return self.data

    def _search_city(self):
        """Type the city into the search box, click the suggestion and submit."""
        try:
            search_bar = self.wait.until(EC.presence_of_element_located((By.XPATH, '//*[@id="keyword2"]')))
            search_bar.send_keys(self.city)
            time.sleep(2)
            # Element with id "0" is presumably the first autocomplete suggestion.
            valid_option = self.wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="0"]')))
            valid_option.click()
            time.sleep(2)
            search_btn = self.wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="searchform_search_btn"]')))
            search_btn.click()
            self.wait_to_load()
        except Exception as e:
            print("Error during search setup:", e)

    def _apply_filters(self):
        """Best-effort: drag the budget slider and click each listing-quality tag."""
        try:
            slider = self.wait.until(EC.element_to_be_clickable((By.ID, "budgetLeftFilter_max_node")))
            # Drag the max-budget handle 73 px to the left (offset presumably
            # tuned by hand against the live page; confirm if the layout changes).
            self.actions.click_and_hold(slider).move_by_offset(-73, 0).release().perform()
            time.sleep(2)
        except Exception:
            print("Couldn't adjust budget slider.")

        for tag in ["Verified", "Ready To Move", "With Photos", "With Videos"]:
            try:
                tag_elem = self.wait.until(EC.element_to_be_clickable((By.XPATH, f"//span[normalize-space()='{tag}']")))
                tag_elem.click()
                time.sleep(1)
            except Exception:
                print(f"Tag '{tag}' could not be clicked.")

    def _collect_all_pages(self):
        """Extract the current page, then follow 'Next Page >' until it cannot be clicked."""
        while True:
            try:
                self.extract_page_data()
                next_btn = self.wait.until(EC.element_to_be_clickable((By.XPATH, "//a[normalize-space()='Next Page >']")))
                self.driver.execute_script("arguments[0].scrollIntoView(true);", next_btn)
                time.sleep(2)
                next_btn.click()
                time.sleep(5)
            except Exception:
                # A missing/unclickable button and any extraction error both end
                # the crawl; the two cases are not distinguished.
                print("No more pages or error while navigating.")
                break

    @staticmethod
    def _text_or_nan(row, by, value):
        """Return the text of the first matching child of ``row``, or NaN on any lookup error."""
        try:
            return row.find_element(by, value).text
        except Exception:
            return np.nan

    def extract_page_data(self):
        """Append one raw record per listing card on the current page to ``self.data``.

        Each record has the keys ``name``, ``location``, ``price``, ``area``
        and ``bhk``. A field that cannot be read is stored as ``np.nan``.
        """
        rows = self.driver.find_elements(By.CLASS_NAME, "tupleNew__TupleContent")
        for row in rows:
            name = self._text_or_nan(row, By.CLASS_NAME, "tupleNew__headingNrera")
            location = self._text_or_nan(row, By.CLASS_NAME, "tupleNew__propType")
            price = self._text_or_nan(row, By.CLASS_NAME, "tupleNew__priceValWrap")

            # Area and BHK share one CSS class; only trust them when exactly
            # two such elements are present, in that order.
            try:
                elements = row.find_elements(By.CLASS_NAME, "tupleNew__area1Type")
                area, bhk = [ele.text for ele in elements] if len(elements) == 2 else [np.nan, np.nan]
            except Exception:
                area, bhk = [np.nan, np.nan]

            self.data.append({
                "name": name,
                "location": location,
                "price": price,
                "area": area,
                "bhk": bhk
            })

    @staticmethod
    def _price_in_lakhs(val):
        """Convert a price string such as '₹45 lac' or '₹1.5 cr' to a float in lakhs.

        Returns NaN for missing values, for text without a 'lac'/'cr' unit and
        for text whose numeric part cannot be parsed (e.g. a price range).
        """
        if not isinstance(val, str):
            return np.nan
        text = val.replace("₹", "").strip().lower()
        try:
            if "lac" in text:
                return float(text.replace("lac", "").strip())
            if "cr" in text:
                # 1 crore = 100 lakh
                return float(text.replace("cr", "").strip()) * 100
        except ValueError:
            return np.nan
        return np.nan

    def clean_data(self):
        """Turn the raw records in ``self.data`` into a tidy DataFrame.

        Steps: drop duplicate records, strip and lower-case every text field,
        then derive the output columns.

        Returns:
            DataFrame with a fresh integer index and the columns
            ``name`` (rating suffix removed), ``location`` (text after the last
            standalone word "in", with the city name removed), ``price_lakhs``
            (float), ``bhk`` (numeric), ``is_starred`` (1 when the raw name
            contained a newline, else 0) and ``area_sqft`` (numeric).
            Values that are missing or cannot be parsed become NaN.
        """
        # Fixing the columns keeps an empty scrape from producing a column-less
        # frame; casting to object lets ``.str`` work on all-NaN columns too.
        df = (
            pd.DataFrame(self.data, columns=list(self._FIELDS))
            .drop_duplicates()
            .astype(object)
        )
        df = df.assign(**{col: df[col].str.strip().str.lower() for col in self._FIELDS})

        city_word = str(self.city).strip().lower()

        cleaned = df.assign(
            # na=False: a missing name counts as "not starred" instead of
            # breaking the integer cast.
            is_starred=df["name"].str.contains("\n", na=False).astype(int),
            name=(
                df["name"].str.replace("\n[0-9.]+", "", regex=True)
                .str.strip()
                # Whole-value replacement (Series.replace), not a substring edit.
                .replace("adroit district s", "adroit district's")
            ),
            location=(
                df["location"].str.replace(city_word, "", regex=False)
                .str.strip()
                .str.replace(",$", "", regex=True)
                # Keep what follows the last standalone word "in"; the word
                # boundaries stop localities such as "injambakkam" or
                # "mahindra city" from being cut in half.
                .str.replace(r"^[\s\S]*\bin\b", "", regex=True)
                .str.strip()
            ),
            price=df["price"].apply(self._price_in_lakhs).astype(float),
            area_sqft=pd.to_numeric(
                df["area"].str.replace("sqft", "", regex=False)
                .str.replace(",", "", regex=False)
                .str.strip(),
                errors="coerce",
            ),
            bhk=pd.to_numeric(
                df["bhk"].str.replace("bhk", "", regex=False).str.strip(),
                errors="coerce",
            ),
        )

        return (
            cleaned
            # Only the raw area text is redundant (replaced by area_sqft); the
            # numeric bhk column computed above is kept in the result.
            .drop(columns=["area"])
            .rename(columns={"price": "price_lakhs"})
            .reset_index(drop=True)
        )



In [2]:
if __name__ == "__main__":
    scraper = PropertyScraper99Acres(city="Chennai")

    # Step 1: Scrape (the same records remain available as scraper.data)
    raw_data = scraper.scrape_properties()

    # Step 2: Clean
    cleaned_df = scraper.clean_data()

    # Step 3: Save to CSV.
    # The earlier to_excel(...".xlxx") call used a suffix that is not a
    # recognised Excel extension, so pandas could not choose a writer engine.
    output_path = "chennai-properties-99acres.csv"
    cleaned_df.to_csv(output_path, index=False)
    print(f"Data saved to {output_path}")


Page loaded successfully: India Real Estate Property Site - Buy Sell Rent Properties Portal - 99acres.com
Page loaded successfully: Property in Chennai - Real Estate in Chennai
Tag 'With Videos' could not be clicked.
No more pages or error while navigating.
Data saved to chennai-properties-99acres.csv
