In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

In [3]:
web = 'https://www.realtor.ca/on/toronto/real-estate'
path = r"C:\Users\syuan\Downloads\chromedriver-win64\chromedriver-win64\chromedriver.exe"
cService = webdriver.ChromeService(executable_path=path)
driver = webdriver.Chrome(service=cService)
driver.get(web)
driver.maximize_window()

# Using an explicit wait to ensure the page is loaded
WebDriverWait(driver, 20).until(
    EC.presence_of_element_located((By.XPATH, '//div[contains(@class,"listingCard card")]'))
)

price = []
bedrooms = []
address = []

# Scrape additional pages
current_page = 1
max_pages = 5

while current_page <= max_pages:
    
    WebDriverWait(driver, 20).until(
        EC.visibility_of_all_elements_located((By.XPATH, '//div[contains(@class,"listingCard card")]'))
    )
    products = driver.find_elements(By.XPATH, '//div[contains(@class,"listingCard card")]')

    for product in products:
        price.append(product.find_element(By.CLASS_NAME,'listingCardPrice').text)
        bedrooms.append(product.find_element(By.CLASS_NAME,'listingCardIconStrip').text)
        address.append(product.find_element(By.CLASS_NAME,'listingCardAddress').text)
        
    current_page += 1

    # Ensure the next page button is clickable
    next_page_button = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, "#ListViewPagination_Bottom .lnkNextResultsPage div"))
    )
    driver.execute_script("arguments[0].scrollIntoView();", next_page_button)

    next_page_button.click()

    # Using explicit wait to ensure elements load
    WebDriverWait(driver, 20).until(
        EC.presence_of_element_located((By.XPATH, '//div[contains(@class,"listingCard card")]'))
    )

# Create DataFrame
df_listings = pd.DataFrame({'Price': price, 'Number of rooms': bedrooms, 'Address': address})

# Save DataFrame to CSV
df_listings.to_csv('real_estate_listings.csv', index=False)

# Close the browser
driver.quit()

# Print DataFrame for verification
print(df_listings)


         Price                Number of rooms  \
0   $1,799,800  3 + 2\nBedrooms\n3\nBathrooms   
1   $2,999,800  4 + 2\nBedrooms\n4\nBathrooms   
2     $798,800  1 + 1\nBedrooms\n2\nBathrooms   
3   $1,380,000      3\nBedrooms\n3\nBathrooms   
4   $1,199,900      2\nBedrooms\n2\nBathrooms   
5   $1,880,000  4 + 3\nBedrooms\n5\nBathrooms   
6     $799,000  2 + 1\nBedrooms\n2\nBathrooms   
7   $1,139,000  3 + 1\nBedrooms\n2\nBathrooms   
8   $1,188,000  4 + 3\nBedrooms\n4\nBathrooms   
9     $688,888      2\nBedrooms\n2\nBathrooms   
10  $5,690,000  2 + 1\nBedrooms\n3\nBathrooms   
11    $628,000      1\nBedrooms\n1\nBathrooms   
12  $1,799,800  3 + 2\nBedrooms\n3\nBathrooms   
13  $2,999,800  4 + 2\nBedrooms\n4\nBathrooms   
14    $798,800  1 + 1\nBedrooms\n2\nBathrooms   
15  $1,380,000      3\nBedrooms\n3\nBathrooms   
16  $1,199,900      2\nBedrooms\n2\nBathrooms   
17  $1,880,000  4 + 3\nBedrooms\n5\nBathrooms   
18    $799,000  2 + 1\nBedrooms\n2\nBathrooms   
19  $1,139,000  3 + 

In [27]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Initialize the WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Target URL
url = "https://www.realtor.ca/on/toronto/real-estate"
driver.get(url)

# Wait for the cards to be visible
wait = WebDriverWait(driver, 30)
cards = wait.until(EC.visibility_of_all_elements_located((By.XPATH, "//li[contains(@class,'cardCon')]/div/a")))

# List to store page URLs
page_urls = [card.get_attribute('href') for card in cards]

# Data list to collect all the information
data_list = []

# Loop through each page URL
for page_url in page_urls:
    if page_url:
        # Navigate to the page
        driver.get(page_url)
        # Wait for necessary elements to load
        wait.until(EC.presence_of_element_located((By.XPATH, "//div[contains(@id,'listingPriceValue')]")))

        # Scrape data
        try:
            price = driver.find_element(By.XPATH, "//div[contains(@id,'listingPriceValue')]").text
            address = driver.find_element(By.XPATH, "//h1[contains(@id,'listingAddress')]").text
            bedrooms = driver.find_element(By.XPATH, '//*[@id="BedroomIcon"]/div[2]').text
            bathrooms = driver.find_element(By.XPATH, '//*[@id="BathroomIcon"]/div[2]').text
            
            # Store data in a dictionary
            data = {
                "price": price,
                "address": address,
                "rooms": bedrooms,
                "bathrooms": bathrooms
            }
            data_list.append(data)
        except Exception as e:
            print(f"Error scraping data for {page_url}: {e}")



# Print collected data
for data in data_list:
    print(data)



{'price': '$595,000', 'address': '#1805 -2212 LAKE SHORE BLVD W\nToronto, Ontario M8V0C2', 'rooms': '1', 'bathrooms': '1'}
{'price': '$888,888', 'address': '270 ELMHURST DR\nToronto, Ontario M9W2M2', 'rooms': '3 + 1', 'bathrooms': '2'}
{'price': '$968,800', 'address': '#6107 -30 SHORE BREEZE DR\nToronto, Ontario M8V0J1', 'rooms': '2', 'bathrooms': '2'}
{'price': '$889,000', 'address': '154 FIFTH ST\nToronto, Ontario M8V2Z7', 'rooms': '2 + 1', 'bathrooms': '2'}
{'price': '$659,000', 'address': '#1615 -85 MCMAHON DR\nToronto, Ontario M2K0H1', 'rooms': '1', 'bathrooms': '1'}
{'price': '$1,160,000', 'address': '#5908 -1 BLOOR ST E\nToronto, Ontario M4W1A9', 'rooms': '2', 'bathrooms': '2'}
{'price': '$718,000', 'address': '#4111 -181 DUNDAS ST E\nToronto, Ontario M5A0N5', 'rooms': '2', 'bathrooms': '1'}
{'price': '$489,900', 'address': '#1005 -21 OVERLEA BLVD\nToronto, Ontario M4H1P2', 'rooms': '1', 'bathrooms': '1'}
{'price': '$669,000', 'address': '#1206 -15 ICEBOAT TERR\nToronto, Ontario

In [34]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
import pandas as pd

# Initialize the WebDriver
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# Target URL
url = "https://www.realtor.ca/on/toronto/real-estate"
driver.get(url)

wait = WebDriverWait(driver, 30)
data_list = []

current_page = 1
last_page = 25  # Set the last page you want to scrape

xpaths = [
    "//li[contains(@class,'cardCon')]/div/a",  # Original XPath for initial pages
    "//div[contains(@class,'cardCon')]/div/a"  # Updated XPath for page 3 onwards
]

while True:  # Loop to handle pagination
    cards_found = False
    for xpath in xpaths:
        try:
            # Wait for the cards to be visible using current XPath
            cards = wait.until(EC.visibility_of_all_elements_located((By.XPATH, xpath)))
            if cards:
                cards_found = True
                break  # Exit the loop once cards are found
        except TimeoutException:
            continue  # Try the next XPath if current one fails

    if not cards_found:
        print("Failed to find cards with any known XPath.")
        break

    # List to store page URLs
    page_urls = [card.get_attribute('href') for card in cards]

    # Process each page URL
    for page_url in page_urls:
        if page_url:
            # Navigate to the page
            driver.get(page_url)
            wait.until(EC.presence_of_element_located((By.XPATH, "//div[contains(@id,'listingPriceValue')]")))
            # Scrape data
            try:
                price = driver.find_element(By.XPATH, "//div[contains(@id,'listingPriceValue')]").text
                address = driver.find_element(By.XPATH, "//h1[contains(@id,'listingAddress')]").text
                bedrooms = driver.find_element(By.XPATH, '//*[@id="BedroomIcon"]/div[2]').text
                bathrooms = driver.find_element(By.XPATH, '//*[@id="BathroomIcon"]/div[2]').text

                data = {
                    "price": price,
                    "address": address,
                    "bedrooms": bedrooms,
                    "bathrooms": bathrooms
                }
                data_list.append(data)
            except Exception as e:
                print(f"Error scraping data for {page_url}: {e}")
            finally:
                driver.back()  # Navigate back to the listing page

    if current_page >= last_page:
        print(f"Reached the last page: {last_page}. Ending scrape.")
        break

    try:
        next_page_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "#ListViewPagination_Bottom .lnkNextResultsPage div")))
        driver.execute_script("arguments[0].scrollIntoView();", next_page_button)
        next_page_button.click()
        current_page += 1
    except TimeoutException:
        print("No more pages or failed to click next.")
        break

driver.quit()

df2 = pd.DataFrame(data_list)
df2.to_csv('Toronto_listings_2.csv', index=False)
print("Data collection complete and saved to CSV.")


Error scraping data for https://www.realtor.ca/real-estate/26831097/283-297-sheppard-ave-w-toronto-lansing-westgate: Message: no such element: Unable to locate element: {"method":"xpath","selector":"//*[@id="BedroomIcon"]/div[2]"}
  (Session info: chrome=124.0.6367.118); For documentation on this error, please visit: https://www.selenium.dev/documentation/webdriver/troubleshooting/errors#no-such-element-exception
Stacktrace:
	GetHandleVerifier [0x00CCC113+48259]
	(No symbol) [0x00C5CA41]
	(No symbol) [0x00B50A17]
	(No symbol) [0x00B90BED]
	(No symbol) [0x00B90C9B]
	(No symbol) [0x00BCBC12]
	(No symbol) [0x00BB0DE4]
	(No symbol) [0x00BC9B9C]
	(No symbol) [0x00BB0B36]
	(No symbol) [0x00B8570D]
	(No symbol) [0x00B862CD]
	GetHandleVerifier [0x00F86533+2908323]
	GetHandleVerifier [0x00FC3B4B+3159739]
	GetHandleVerifier [0x00D6505B+674763]
	GetHandleVerifier [0x00D6B21C+699788]
	(No symbol) [0x00C66244]
	(No symbol) [0x00C62298]
	(No symbol) [0x00C6242C]
	(No symbol) [0x00C54BB0]
	BaseThread