In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

In [2]:
import os
import json
import requests
from datetime import datetime, date
from dotenv import load_dotenv
import psycopg2
from psycopg2.extras import execute_values
import re


In [3]:
# Function to set up the Chrome WebDriver
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

def setup_driver(driver_path=None):
    """Create a headless, incognito Chrome WebDriver with a fixed window size.

    Args:
        driver_path: Optional path to a chromedriver executable. When it is
            omitted or empty, the ``Service`` is created without an explicit
            path so Selenium uses its default driver discovery (PATH lookup
            or Selenium Manager, depending on the installed Selenium version).

    Returns:
        A ``selenium.webdriver.Chrome`` instance. The caller owns it and must
        call ``quit()`` when finished.
    """
    chrome_options = Options()
    for flag in (
        '--incognito',               # open in incognito mode
        '--no-sandbox',
        '--disable-dev-shm-usage',
        '--headless',                # run Chrome without a visible window
        '--disable-gpu',             # disable GPU acceleration for headless mode
        '--window-size=1920,1080',   # fixed viewport so page layout is consistent
    ):
        chrome_options.add_argument(flag)

    # An empty path can never point at a real chromedriver binary, so only
    # forward the path when the caller actually supplied one.
    service = Service(driver_path) if driver_path else Service()
    return webdriver.Chrome(service=service, options=chrome_options)


In [4]:
# Build the domain.com.au sold-listings search URL for one results page
def create_url(page_num, suburbs=("miranda-nsw-2228", "hammondville-nsw-2170", "scone-nsw-2337")):
    """Return the sold-listings search URL for the given results page.

    Args:
        page_num: Page number to place in the ``page`` query parameter.
        suburbs: Suburb slugs to search, in the ``name-state-postcode`` form
            used in the URL (e.g. ``"miranda-nsw-2228"``). May be an iterable
            of slugs or a single slug string. Defaults to the three suburbs
            this notebook was written for.

    Returns:
        The URL string, with the suburb slugs comma-joined in the ``suburb``
        query parameter and ``excludepricewithheld=1`` set.
    """
    # A bare string is itself iterable; wrap it so it is treated as one slug
    # instead of being joined character by character.
    if isinstance(suburbs, str):
        suburbs = (suburbs,)

    base_url = "https://www.domain.com.au/sold-listings/?suburb={suburbs}&excludepricewithheld=1&page={page_num}"
    return base_url.format(suburbs=",".join(suburbs), page_num=page_num)

In [5]:
# Persist the scraped listing records to a CSV file
def save_to_csv(job_data, filename='Real_estate.csv'):
    """Write ``job_data`` to ``filename`` as UTF-8 CSV and report the path.

    Args:
        job_data: Records to store; passed directly to ``pd.DataFrame``
            (the notebook passes a list of dicts, one per listing).
        filename: Destination path of the CSV file.

    Returns:
        None. The file is written without the DataFrame index column and a
        confirmation line is printed.
    """
    pd.DataFrame(job_data).to_csv(filename, index=False, encoding='utf-8')
    print(f"Data saved to {filename}")

In [6]:
# The suburbs searched are fixed inside create_url(), so no user input is
# needed here; just build the URL of the first results page.
url = create_url(page_num=1)

In [7]:
    # Set up the driver
# Launch the headless Chrome session; the later cells reuse this one `driver`
# object until it is closed with driver.quit().
driver = setup_driver()

In [8]:
# Sanity check: show the search URL currently held in `url` before scraping.
print(url)

https://www.domain.com.au/sold-listings/?suburb=miranda-nsw-2228,hammondville-nsw-2170,scone-nsw-2337&excludepricewithheld=1&page=1


In [13]:
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException

# --- Scrape configuration ---------------------------------------------------
pages_to_scrape = 100    # upper bound on result pages; scraping stops earlier at the first empty page
PAGE_LOAD_DELAY_S = 2    # fixed pause after each navigation so the listing cards can render

# Listing cards are located by the prefix of their data-testid attribute.
# NOTE(review): only "elite" and "premium" cards are matched; if the site renders
# other card tiers they are skipped — confirm this is intended.
CARD_SELECTOR = '[data-testid^="listing-card-wrapper-elite"], [data-testid^="listing-card-wrapper-premium"]'

# Selenium errors that only mean "this field is not present on this card".
# Anything else (e.g. a dead browser session or a coding error) is allowed to
# propagate instead of being silently recorded as 'NA'.
_FIELD_MISSING = (NoSuchElementException, StaleElementReferenceException)


def _text_or_na(parent, by, selector):
    """Return the text of the first element under ``parent`` matching ``selector``, or 'NA'."""
    try:
        text = parent.find_element(by, selector).text
    except _FIELD_MISSING:
        return 'NA'
    return text if text else 'NA'


def _extract_features(card):
    """Return the 'beds', 'baths' and 'parking' values of a card as digit strings or 'NA'."""
    counts = {'beds': 'NA', 'baths': 'NA', 'parking': 'NA'}
    try:
        wrapper = card.find_element(By.CSS_SELECTOR, '[data-testid="property-features-wrapper"]')
        features = wrapper.find_elements(By.CSS_SELECTOR, '[data-testid="property-features-feature"]')
    except _FIELD_MISSING:
        return counts

    for feature in features:
        try:
            feature_text = feature.find_element(By.CSS_SELECTOR, '[data-testid="property-features-text-container"]').text
            feature_label = feature.find_element(By.CSS_SELECTOR, '[data-testid="property-features-text"]').text.lower()
        except _FIELD_MISSING:
            continue  # skip a feature whose text or label element is absent

        # Keep only the digits of the feature text (e.g. "2 Beds" -> "2").
        feature_number = ''.join(filter(str.isdigit, feature_text))
        value = feature_number if feature_number else 'NA'

        if 'bed' in feature_label:
            counts['beds'] = value
        elif 'bath' in feature_label:
            counts['baths'] = value
        elif 'park' in feature_label:
            counts['parking'] = value

    return counts


def _extract_sold_date(card):
    """Return the first 'D Mon YYYY'-style date found in the card's sold-details text, or 'NA'."""
    try:
        # NOTE(review): 'css-1nj9ymt' looks like a generated class name and may
        # change when the site is redeployed — confirm it is still valid.
        sold_details = card.find_element(By.CSS_SELECTOR, 'span.css-1nj9ymt').text
    except _FIELD_MISSING:
        return 'NA'
    date_match = re.search(r'(\d{1,2}\s+[A-Za-z]+\s+\d{4})', sold_details)
    return date_match.group(0) if date_match else 'NA'


# --- Scrape loop --------------------------------------------------------------
all_job_data = []       # one dict per listing card, consumed by the cells below
total_jobs_found = 0    # running count of listing cards seen across all pages

# Visit pages 1..pages_to_scrape so the progress message and the loop agree.
for page_num in range(1, pages_to_scrape + 1):
    print(f"Scraping page {page_num} of {pages_to_scrape}...")

    # Each page is loaded exactly once, from a URL built for that page number.
    url = create_url(page_num)
    driver.get(url)
    time.sleep(PAGE_LOAD_DELAY_S)

    job_cards = driver.find_elements(By.CSS_SELECTOR, CARD_SELECTOR)
    if not job_cards:
        print(f"No jobs found on page {page_num}, stopping scraping.")
        break

    for job in job_cards:
        # Key order is kept stable because it determines the CSV column order.
        job_info = {
            'price': _text_or_na(job, By.CSS_SELECTOR, '[data-testid="listing-card-price"]'),
            'address': _text_or_na(job, By.CSS_SELECTOR, '[data-testid="address-wrapper"]'),
        }
        job_info.update(_extract_features(job))
        # NOTE(review): 'css-693528' looks like a generated class name — confirm it is still valid.
        job_info['type'] = _text_or_na(job, By.XPATH, './/span[@class="css-693528"]')
        job_info['sold_date'] = _extract_sold_date(job)

        all_job_data.append(job_info)

    total_jobs_found += len(job_cards)
    print(f"Found {len(job_cards)} jobs on page {page_num}")

print(f"Total jobs found: {total_jobs_found}")


Scraping page 1 of 100...
Found 20 jobs on page 1
Scraping page 2 of 100...
Found 20 jobs on page 2
Scraping page 3 of 100...
Found 20 jobs on page 3
Scraping page 4 of 100...
Found 20 jobs on page 4
Scraping page 5 of 100...
Found 18 jobs on page 5
Scraping page 6 of 100...
Found 20 jobs on page 6
Scraping page 7 of 100...
Found 20 jobs on page 7
Scraping page 8 of 100...
Found 19 jobs on page 8
Scraping page 9 of 100...
Found 19 jobs on page 9
Scraping page 10 of 100...
Found 15 jobs on page 10
Scraping page 11 of 100...
Found 18 jobs on page 11
Scraping page 12 of 100...
Found 18 jobs on page 12
Scraping page 13 of 100...
Found 15 jobs on page 13
Scraping page 14 of 100...
Found 17 jobs on page 14
Scraping page 15 of 100...
Found 16 jobs on page 15
Scraping page 16 of 100...
Found 19 jobs on page 16
Scraping page 17 of 100...
Found 16 jobs on page 17
Scraping page 18 of 100...
Found 18 jobs on page 18
Scraping page 19 of 100...
Found 16 jobs on page 19
Scraping page 20 of 100...
Fou

In [14]:
# Dump every scraped record (a list of dicts) as plain text.
# NOTE(review): this prints the whole list; a bounded preview such as
# all_job_data[:5] would keep the notebook output readable.
print(all_job_data)

[{'price': '$770,000', 'address': '134 Tullong Road, \nSCONE NSW 2337', 'beds': 'NA', 'baths': 'NA', 'parking': 'NA', 'type': 'Rural', 'sold_date': '21 DEC 2024'}, {'price': '$762,000', 'address': '8/2-8 Kiora Road, \nMIRANDA NSW 2228', 'beds': '2', 'baths': '1', 'parking': '1', 'type': 'Apartment / Unit / Flat', 'sold_date': '19 DEC 2024'}, {'price': '$825,000', 'address': '66/118-128 Karimbla Road, \nMIRANDA NSW 2228', 'beds': '2', 'baths': '1', 'parking': '1', 'type': 'Apartment / Unit / Flat', 'sold_date': '17 DEC 2024'}, {'price': '$656,000', 'address': 'BA305/18 University Road, \nMIRANDA NSW 2228', 'beds': '1', 'baths': '1', 'parking': '1', 'type': 'Apartment / Unit / Flat', 'sold_date': '16 DEC 2024'}, {'price': '$930,000', 'address': '28 Oxford Rd, \nSCONE NSW 2337', 'beds': '4', 'baths': '2', 'parking': '2', 'type': 'House', 'sold_date': '16 DEC 2024'}, {'price': '$740,000', 'address': '20 Gray St, \nSCONE NSW 2337', 'beds': '4', 'baths': '2', 'parking': '2', 'type': 'House',

In [15]:
# Write the scraped records to the default output file, Real_estate.csv.
save_to_csv(all_job_data)

Data saved to Real_estate.csv


In [16]:
# Close the browser session started by setup_driver(); `driver` is unusable afterwards.
driver.quit()