# Mudah.my House Scraper

In [2]:
%pip install selenium
%pip install pandas
%pip install bs4

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install rich
from rich.pretty import pprint

Note: you may need to restart the kernel to use updated packages.


In [4]:
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By

# Build the Chrome configuration for the interactive session used by the
# exploratory cells below: no visible window, sandbox and /dev/shm usage
# disabled, and a desktop Chrome user-agent string.
options = Options()
for chrome_flag in (
    "--headless",
    "--no-sandbox",
    "--disable-dev-shm-usage",
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/130.0.0.0 Safari/537.36",
):
    options.add_argument(chrome_flag)

# Start the browser with the flags collected above.
driver = webdriver.Chrome(options=options)

In [5]:
def extract_text_from_div(div):
    """Collect the non-blank text of every leaf element below ``div``.

    Args:
        div: An element exposing ``find_elements(by, query)`` (a Selenium
            WebElement in this notebook).

    Returns:
        list[str]: The stripped ``.text`` of each matched element, in the
        order returned by ``find_elements``; elements whose text is empty
        after stripping are omitted.
    """
    # The XPath keeps only elements without child elements that carry some
    # non-whitespace text of their own.
    leaves = div.find_elements(By.XPATH, ".//*[not(*) and normalize-space(text())]")

    # Read `.text` exactly once per element: the value that is tested is then
    # guaranteed to be the value that is kept, and the element is not queried
    # a second time just to repeat the same strip().
    stripped = (leaf.text.strip() for leaf in leaves)
    return [text for text in stripped if text]


def _section_index(section_texts, heading):
    """Return the index of the first text that starts with ``heading``, else None."""
    return next(
        (idx for idx, text in enumerate(section_texts) if text.startswith(heading)),
        None,
    )


def _value_after(items, label):
    """Return the entry that directly follows ``label`` in ``items``, or "" if absent."""
    try:
        return items[items.index(label) + 1]
    except (ValueError, IndexError):
        return ""


def _strip_heading(items, heading):
    """Return ``items`` without a leading ``heading`` entry (safe on empty lists)."""
    return items[1:] if items and items[0] == heading else items


def extract_info(driver):
    """Parse the listing page currently loaded in ``driver`` into a nested dict.

    The page is split into the ``div`` elements whose class contains
    ``style__ParentWrapper``. The first one is treated as the main summary;
    the others are located by the text they start with.

    Args:
        driver: Object exposing ``find_elements(by, query)`` (the Selenium
            driver in this notebook) with a listing page already loaded.

    Returns:
        dict: Always contains ``"main"`` (``name``, ``price``, ``size``,
        ``bed``, ``bathroom``). Contains ``"details"`` (``type``, ``furnish``),
        ``"location"`` (``full_text``, ``district``, ``state``),
        ``"facilities"`` (list) and ``"amenities"`` (list) only when the
        matching section is found on the page.

    Raises:
        IndexError: No wrapper divs were found, or the main summary has fewer
            entries than expected.
        AssertionError: The third entry of the main summary is not a price
            starting with ``"RM"``.
        ValueError: The main summary lacks a ``"sq.ft"``, ``"Bed"`` or
            ``"Bath"`` label.
    """
    divs = driver.find_elements(
        By.XPATH, "//div[contains(@class, 'style__ParentWrapper')]"
    )
    if not divs:
        # Same exception type as the former bare divs[0] lookup, but with a
        # message that says what is actually missing.
        raise IndexError("no 'style__ParentWrapper' sections found on the page")

    full_texts = [div.text for div in divs]

    data = {}

    # --- Main summary: positions are fixed, labels follow their values. ---
    main_info = extract_text_from_div(divs[0])
    assert main_info[2].startswith("RM"), (
        f"expected a price starting with 'RM' at position 2, got {main_info[2]!r}"
    )
    data["main"] = {
        "name": main_info[4],
        # Drop the first space-separated token of the price text (the "RM"
        # the assertion above checked for).
        "price": " ".join(main_info[2].split(" ")[1:]),
        "size": main_info[main_info.index("sq.ft") - 1],
        "bed": main_info[main_info.index("Bed") - 1],
        "bathroom": main_info[main_info.index("Bath") - 1],
    }

    # --- Optional sections. ---
    # Compare against None explicitly: a match at index 0 is a valid hit and
    # must not be mistaken for "section not found".
    details_idx = _section_index(full_texts, "Property Details")
    if details_idx is not None:
        details_info = extract_text_from_div(divs[details_idx])
        # A missing label yields "" instead of aborting the whole listing.
        data["details"] = {
            "type": _value_after(details_info, "Property Type"),
            "furnish": _value_after(details_info, "Furnishing"),
        }

    about_idx = _section_index(full_texts, "About")
    if about_idx is not None:
        location_info = extract_text_from_div(divs[about_idx])
        # The address is the second entry unless that entry is a
        # "DEVELOPED BY" line, in which case it is the third.
        has_developer_line = len(location_info) > 1 and location_info[1].startswith(
            "DEVELOPED BY"
        )
        address_pos = 2 if has_developer_line else 1
        if address_pos < len(location_info):
            location = location_info[address_pos]
            parts = [part.strip() for part in location.split(",")]
            # District is the second-to-last comma part minus its first token
            # (presumably the postcode - TODO confirm); blank when the
            # address has no comma at all.
            district = " ".join(parts[-2].split(" ")[1:]) if len(parts) >= 2 else ""
            data["location"] = {
                "full_text": location,
                "district": district,
                "state": parts[-1],
            }

    facilities_idx = _section_index(full_texts, "Facilities")
    if facilities_idx is not None:
        data["facilities"] = _strip_heading(
            extract_text_from_div(divs[facilities_idx]), "Facilities"
        )

    amenities_idx = _section_index(full_texts, "Amenities")
    if amenities_idx is not None:
        data["amenities"] = _strip_heading(
            extract_text_from_div(divs[amenities_idx]), "Amenities"
        )

    return data

In [6]:
# A handful of listing pages used to eyeball extract_info's output by hand.
test_urls = [
    "https://www.mudah.my/pearl-132-seremban-2-double-storey-semi-d-for-rent-furnished-clean-113248576.htm",
    "https://www.mudah.my/taman-sri-sinar-3-storey-terraced-segambut-near-united-point-113108423.htm",
    "https://www.mudah.my/nice-unit-2sty-house-garland-1-kota-emerald-anggun-rawang-112714003.htm",
    "https://www.mudah.my/suria-kipark-damansara-kepong-suria-kip-wangsa-permai-113218549.htm",
    "https://www.mudah.my/triuni-residence-fully-furnished-gelugor-112895318.htm",
]

# Load each page in the shared driver, parse it, and pretty-print the result.
for url in test_urls:
    driver.get(url)
    listing_info = extract_info(driver)
    pprint(listing_info)

In [7]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import pandas as pd
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import os

# Base URL of the rental search results; the crawl loop appends "?o=<page>"
# to address an individual result page.
main_url = "https://www.mudah.my/malaysia/all-residential-for-rent"
# Number of result pages to walk (pages 1 through max_pages inclusive).
max_pages = 15
# CSV file the scraped rows are written to, or merged into if it already exists.
output_file = "House_Rental.csv"

# One dict per successfully scraped listing; scrape_listing appends to it.
all_data = []

def _new_headless_chrome():
    """Start a headless Chrome session with the flags used throughout this notebook."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/130.0.0.0 Safari/537.36"
    )
    return webdriver.Chrome(options=chrome_options)


def scrape_listing(link, i):
    """Scrape one listing page and append its row to the module-level ``all_data``.

    A dedicated Chrome session is opened for the listing and always closed
    again. The page is attempted up to three times; if every attempt fails the
    listing is skipped and nothing is appended.

    Note: progress messages read the module-level ``links`` list to show the
    total, so that name must be defined before this function is called.

    Args:
        link: URL of the listing page.
        i: Zero-based position of ``link`` within ``links``; used only for
            the progress messages.

    Returns:
        None. The result is the side effect on ``all_data``.
    """
    print(f"  → Scraping {i + 1}/{len(links)}: {link}")
    driver = _new_headless_chrome()

    MAX_RETRIES = 3
    try:
        for attempt in range(1, MAX_RETRIES + 1):
            try:
                driver.get(link)
                #Wait for any h2 or span with text to appear
                WebDriverWait(driver, 10).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "h2 span, h2"))
                )

                info = extract_info(driver)
                print(f"    → Success for {i + 1}/{len(links)}: {link}!")

                # Optional sections may be missing from `info`; fall back to
                # empty containers so the row always has every column.
                details = info.get("details", {})
                location = info.get("location", {})
                amenities = info.get("amenities", [])
                row = {
                    "Name": info["main"].get("name", ""),
                    "Price": info["main"].get("price", ""),
                    "Size": info["main"].get("size", ""),
                    "Number of beds": info["main"].get("bed", ""),
                    "Number of bathrooms": info["main"].get("bathroom", ""),
                    "Type": details.get("type", ""),
                    "Furnished Status": details.get("furnish", ""),
                    "Location": location.get("full_text", ""),
                    "District": location.get("district", ""),
                    "State": location.get("state", ""),
                    "Facilities": ", ".join(info.get("facilities", [])),
                    "Public transport": "true" if any("transport" in a.lower() for a in amenities) else "false",
                }
                all_data.append(row)
                break

            except Exception as exc:
                # Deliberately best-effort: one bad listing must not stop the
                # crawl. Report why it failed (first line only, since browser
                # errors can carry long stack traces).
                reason = f"{type(exc).__name__}: {exc}".splitlines()[0].rstrip()
                print(f"    Attempt {attempt} failed: Error scraping this listing: {link} ({reason})")
                if attempt == MAX_RETRIES:
                    print(f"    Max retries reached, skipping this listing: {link}")
    finally:
        # Runs even on KeyboardInterrupt (e.g. interrupting the kernel), so no
        # Chrome process is left behind.
        driver.quit()

#Chrome setup
# NOTE: this rebinds `driver`; a session created by an earlier cell under the
# same name is not closed here.
options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument(
    "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/130.0.0.0 Safari/537.36"
)
driver = webdriver.Chrome(options=options)

try:
    for page in range(1, max_pages + 1):
        print(f"\nScraping page {page}...")
        url = f"{main_url}?o={page}"
        driver.get(url)
        time.sleep(3)  #Allow JS listings to render

        #Collect listing URLs
        listings = driver.find_elements(
            By.XPATH, "//a[contains(@href, '.htm') and @data-listid]"
        )
        hrefs = (item.get_attribute("href") for item in listings)
        # De-duplicate while keeping page order (dict keys preserve insertion
        # order), so repeated runs visit listings in a reproducible sequence;
        # skip anchors whose href came back empty.
        links = list(dict.fromkeys(href for href in hrefs if href))
        print(f"Found {len(links)} listings on page {page}")

        # scrape_listing reads the module-level `links` for its progress text
        # and appends successful rows to `all_data`.
        for i, link in enumerate(links):
            scrape_listing(link, i)
finally:
    # Close the results-page browser even if the crawl is interrupted or fails.
    driver.quit()

if all_data:
    # Columns that together identify a listing for de-duplication purposes.
    dedup_keys = ["Name", "Location", "Price", "Size", "Type"]

    # The same listing can be collected twice within one run; collapse those
    # rows up front so a first-time save is de-duplicated like a merge is.
    df_new = pd.DataFrame(all_data).drop_duplicates(subset=dedup_keys)

    # If file exists, read and combine without duplicates
    if os.path.exists(output_file):
        # Read everything back as plain strings and keep "" as "". With the
        # default type inference, numeric-looking strings become ints, empty
        # cells become NaN and "true"/"false" become booleans, so rows loaded
        # from disk would never compare equal to freshly scraped rows and the
        # duplicates would survive.
        df_existing = pd.read_csv(
            output_file, dtype=str, keep_default_na=False, encoding="utf-8-sig"
        )
        combined = pd.concat([df_existing, df_new], ignore_index=True)
        #Drop duplicates
        combined = combined.drop_duplicates(subset=dedup_keys)
        combined.to_csv(output_file, index=False, encoding="utf-8-sig")
        print(f"\n Updated {output_file}: now {len(combined)} unique listings (removed duplicates)")
    else:
        # If file doesn't exist yet, just save normally
        df_new.to_csv(output_file, index=False, encoding="utf-8-sig")
        print(f"\n Created new file {output_file} ({len(df_new)} listings)")

else:
    print("\n No data scraped.")


Scraping page 1...
Found 46 listings on page 1
  → Scraping 1/46: https://www.mudah.my/royale-heights-3-storey-bungalow-taman-lembah-indah-tambun-113360568.htm
    → Success for 1/46: https://www.mudah.my/royale-heights-3-storey-bungalow-taman-lembah-indah-tambun-113360568.htm!
  → Scraping 2/46: https://www.mudah.my/bm-alma-taman-aman-jaya-semi-detached-house-4-bedrooms-3-bathrooms-113349864.htm
    → Success for 2/46: https://www.mudah.my/bm-alma-taman-aman-jaya-semi-detached-house-4-bedrooms-3-bathrooms-113349864.htm!
  → Scraping 3/46: https://www.mudah.my/rumah-luas-sewa-murah-durian-tunggal-near-mitc-melaka-111617995.htm
    → Success for 3/46: https://www.mudah.my/rumah-luas-sewa-murah-durian-tunggal-near-mitc-melaka-111617995.htm!
  → Scraping 4/46: https://www.mudah.my/fully-furnished-taman-ria-jaya-single-story-terrace-house-for-rent-113360550.htm
    → Success for 4/46: https://www.mudah.my/fully-furnished-taman-ria-jaya-single-story-terrace-house-for-rent-113360550.htm!
  