In [1]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import time
import random
import re
import math
import pandas as pd
from datetime import datetime



In [2]:
def get_page_elements(browser, url):
    """Gets all href elements from a given url page using the browser webdriver"""
    # Load page
    browser.get(url)
    time.sleep(random.uniform(1.0, 1.5))  # Wait a bit for the page to load
    # Get page HTML
    innerHTML = browser.execute_script("return document.body.innerHTML")
    soup = BeautifulSoup(innerHTML, 'lxml')
    # Find all elements with the desired href attribute and add them to a list
    elements = [link.get('href') for link in soup.findAll('a', attrs={'href': re.compile("^/detail/")})]
    return elements[::2]  # Return every other element

def get_max_pages(soup):
    """Gets the maximum number of pages from a BeautifulSoup object"""
    # Find the text showing the total number of records
    records = soup.find_all(class_='numero ng-binding')[1].text
    # Extract the number from the text
    records = int("".join(re.split(r'\D', records)))
    # Calculate the number of pages (20 records per page)
    return math.ceil(records / 20)

def print_scraping_info(typ_obchodu, typ_stavby, records, max_page, pages):
    """Prints some information about what is being scraped"""
    print("----------------")
    print("Scraping: " + typ_obchodu + " " + typ_stavby)
    print("Total listings: " + str(records))
    print("Total pages: " + str(max_page))
    print("Scraping " + str(pages) + " pages.")
    print("----------------")

def scrape_pages(typ_obchodu, typ_stavby, max_page, url, pages):
    # Start timer
    start_time = time.time()
    # Set up Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--start-minimized") # Starts Chrome minimized
    """Scrape a given number of pages for href elements"""
    browser = webdriver.Chrome(options=chrome_options)
    # Get elements from the first page
    elements = get_page_elements(browser, url)
    # Print information about what we're scraping
    print_scraping_info(typ_obchodu, typ_stavby, len(elements), max_page, pages)
    # Scrape the remaining pages
    for i in range(1, pages):
        # Calculate elapsed time and remaining time
        elapsed_time = time.time() - start_time
        time_per_page = elapsed_time / i
        remaining_pages = pages - i
        remaining_time_estimate = time_per_page * remaining_pages

        print('\r'+"Page " + str(i) + " = " + str(round(100*i/pages, 2)) + "% progress. Estimated remaining time: " + str(round(remaining_time_estimate, 2)) + " seconds.", end="")

        new_url = url + "?strana=" + str(i+1)
        new_elements = get_page_elements(browser, new_url)
        elements.extend(new_elements)
    browser.quit()
    return elements

def get_id(url):
    """Extract the ID from a URL"""
    return url.split("/")[-1]

def elements_and_ids(x):
    """Create a DataFrame of URLs and IDs, remove duplicates, and save to a CSV file"""
    elements = pd.DataFrame({"url":x})
    elements["url_id"] = elements["url"].apply(get_id)
    
    len_before = len(elements)
    # Remove duplicates
    elements.drop_duplicates(subset=["url", "url_id"], keep="first", inplace=True)
    len_after = len(elements)
    
    print(f"-- Removed {len_before - len_after} records due to duplication.")
    today_date = datetime.now().strftime('%Y-%m-%d')
    filename = f'{today_date}_urls.csv'
    elements.to_csv(filename, index=False)
    return elements

def get_soup_elements(typ_obchodu="prodej", typ_stavby="byty", pages=1):
    """Main function to get href elements for a given type of trade and construction"""
    url = f"https://www.sreality.cz/hledani/{typ_obchodu}/{typ_stavby}"
    browser = webdriver.Chrome()
    # Get initial soup to extract maximum pages
    browser.get(url)
    time.sleep(random.uniform(1.0, 1.5))  
    innerHTML = browser.execute_script("return document.body.innerHTML")
    soup = BeautifulSoup(innerHTML,'lxml')
    max_page = get_max_pages(soup)
    # If 2000 pages were requested, set pages to max_page
    if pages == 2000:
        pages = max_page
    # Start scraping pages
    elements = scrape_pages(typ_obchodu, typ_stavby, max_page, url, pages)
    # Pass the elements to the elements_and_ids function
    elements_df = elements_and_ids(elements)
    return elements_df

In [3]:
get_soup_elements(typ_obchodu = "prodej", typ_stavby = "byty", pages = 2000)

----------------
Scraping: prodej byty
Total listings: 70
Total pages: 974
Scraping 974 pages.
----------------
Page 973 = 99.9% progress. Estimated remaining time: 3.43 seconds.....-- Removed 38554 records due to duplication.


Unnamed: 0,url,url_id
0,/detail/prodej/byt/3+1/brno-lisen-strnadova/84...,847426892
4,/detail/prodej/byt/2+kk/libcice-nad-vltavou-li...,461526092
7,/detail/prodej/byt/3+kk/libcice-nad-vltavou-li...,1687311436
11,/detail/prodej/byt/3+kk/liberec-liberec-xxx-vr...,63538508
14,/detail/prodej/byt/4+kk/trhove-sviny-trhove-sv...,741217356
...,...,...
53976,/detail/prodej/byt/2+1/loket-loket-kostelni/23...,2397433436
53980,/detail/prodej/byt/2+1/karlovy-vary-karlovy-va...,400612956
53983,/detail/prodej/byt/3+kk/pisek--/4244561500,4244561500
53987,/detail/prodej/byt/2+kk/marianske-lazne-marian...,4023578204
