In [1]:
pip install selenium beautifulsoup4 pandas webdriver-manager

Note: you may need to restart the kernel to use updated packages.


In [9]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import re

def setup_driver(chromedriver_path=None):
    """Build a headless Chrome webdriver.

    Args:
        chromedriver_path: Location of the chromedriver executable. When it is
            falsy, Chrome is created without an explicit Service.

    Returns:
        The newly created webdriver.Chrome instance.
    """
    opts = Options()
    # Headless (no GUI), sandbox disabled, /dev/shm usage disabled.
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        opts.add_argument(flag)

    if not chromedriver_path:
        # No explicit driver binary given.
        return webdriver.Chrome(options=opts)

    return webdriver.Chrome(
        service=Service(executable_path=chromedriver_path),
        options=opts,
    )

def _first_text(parent, css_selector, default="N/A"):
    """Return the text of the first element under parent matching css_selector, or default if none match."""
    # find_elements returns an empty list when nothing matches, whereas
    # find_element raises; that makes a real per-field fallback possible.
    matches = parent.find_elements(By.CSS_SELECTOR, css_selector)
    return matches[0].text if matches else default


def scrape_bezrealitky(url, chromedriver_path=None):
    """Scrape apartment listings from bezrealitky.sk.

    Args:
        url: Search-results URL to load.
        chromedriver_path: Optional chromedriver location, forwarded to setup_driver().

    Returns:
        A list of dicts with the keys 'title', 'price' (digits only),
        'location', 'area_sqm', 'rooms', 'details' and 'url'. Fields that
        cannot be found are set to "N/A". An empty list is returned when the
        page could not be scraped at all (the error is printed).
    """
    driver = setup_driver(chromedriver_path)

    # Navigation happens inside the try so that the browser is always closed
    # by the finally clause, even when loading the page fails.
    try:
        driver.get(url)

        # Wait for the listings container to be present.
        # NOTE(review): the recorded run of this cell ended in "Error scraping
        # website", and a later cell in this notebook waits for
        # article[class*='PropertyCard_propertyCard'] instead - these generated
        # class names may be stale; confirm against the live page.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".styles__StyledResults-sc-1sj10vl-0"))
        )

        # Additional wait to ensure all content is loaded
        time.sleep(3)

        listings = driver.find_elements(By.CSS_SELECTOR, ".styles__StyledPropertyCard-sc-snnxt9-0")

        results = []
        for listing in listings:
            try:
                title = _first_text(listing, "h2")
                price = _first_text(listing, ".price")
                location = _first_text(listing, ".locality")
                details = _first_text(listing, ".parameters")

                anchors = listing.find_elements(By.CSS_SELECTOR, "a")
                link = (anchors[0].get_attribute("href") or "N/A") if anchors else "N/A"

                # Parse additional details out of the free-text parameters.
                area_match = re.search(r'(\d+)\s*m²', details)
                area = area_match.group(1) if area_match else "N/A"

                rooms_match = re.search(r'(\d+)\+(\d+)', details)
                rooms = f"{rooms_match.group(1)}+{rooms_match.group(2)}" if rooms_match else "N/A"

                # Keep only the digits of the price; no digits at all means no price.
                price_clean = re.sub(r'\D', '', price) or "N/A"

                results.append({
                    'title': title,
                    'price': price_clean,
                    'location': location,
                    'area_sqm': area,
                    'rooms': rooms,
                    'details': details,
                    'url': link
                })

            except Exception as e:
                # Best effort: one broken card must not abort the whole page.
                print(f"Error extracting listing details: {e}")
                continue

        return results

    except Exception as e:
        print(f"Error scraping website: {e}")
        return []

    finally:
        driver.quit()

def save_to_csv(data, filename="bezrealitky_apartments.csv"):
    """Write the scraped records to a CSV file.

    Args:
        data: Records accepted by pandas.DataFrame (e.g. a list of dicts).
            When empty, nothing is written and a notice is printed instead.
        filename: Destination CSV path.
    """
    if data:
        # utf-8-sig keeps accented characters readable when opened in Excel.
        pd.DataFrame(data).to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"Data saved to {filename}")
    else:
        print("No data to save.")

def main():
    """Scrape Bratislava apartment rentals from bezrealitky.sk, save them and print a sample.

    The chromedriver location is resolved in this order: the CHROMEDRIVER_PATH
    environment variable, then the original author's local path if that file
    exists, otherwise None so that setup_driver() creates Chrome without an
    explicit driver path. This keeps the notebook runnable on other machines.
    """
    import os  # function-scope import: only used to resolve the chromedriver location

    url = "https://www.bezrealitky.sk/vyhladat?offerType=PRONAJEM&estateType=BYT&osm_value=Bratislava%2C+Bratislavsk%C3%BD+kraj%2C+Slovensko&regionOsmIds=R1702499&currency=EUR&location=exact"
    print("Starting to scrape Bezrealitky.sk...")

    # Previously hard-coded path; kept only as a fallback when it actually exists.
    legacy_path = 'C:/Users/Filip/Desktop/najom/RentBot/Scrapping/chromedriver.exe'
    chromedriver_path = os.environ.get("CHROMEDRIVER_PATH") or (
        legacy_path if os.path.isfile(legacy_path) else None
    )

    listings = scrape_bezrealitky(url, chromedriver_path)

    if listings:
        print(f"Successfully scraped {len(listings)} listings.")
        save_to_csv(listings)

        # Print sample data
        print("\nSample data:")
        sample_df = pd.DataFrame(listings[:3])
        print(sample_df)
    else:
        print("No listings were found.")

if __name__ == "__main__":
    main()

Starting to scrape Bezrealitky.sk...
Error scraping website: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF6917D4C25+3179557]
	(No symbol) [0x00007FF6914388A0]
	(No symbol) [0x00007FF6912C91CA]
	(No symbol) [0x00007FF69131FA67]
	(No symbol) [0x00007FF69131FC9C]
	(No symbol) [0x00007FF691373627]
	(No symbol) [0x00007FF691347C6F]
	(No symbol) [0x00007FF6913702F3]
	(No symbol) [0x00007FF691347A03]
	(No symbol) [0x00007FF6913106D0]
	(No symbol) [0x00007FF691311983]
	GetHandleVerifier [0x00007FF6918367CD+3579853]
	GetHandleVerifier [0x00007FF69184D1D2+3672530]
	GetHandleVerifier [0x00007FF691842153+3627347]
	GetHandleVerifier [0x00007FF6915A092A+868650]
	(No symbol) [0x00007FF691442FFF]
	(No symbol) [0x00007FF69143F4A4]
	(No symbol) [0x00007FF69143F646]
	(No symbol) [0x00007FF69142EAA9]
	BaseThreadInitThunk [0x00007FF9B3FBE8D7+23]
	RtlUserThreadStart [0x00007FF9B525BF2C+44]

No listings were found.


In [10]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

def setup_driver(chromedriver_path=None):
    """Build a headless Chrome webdriver.

    Args:
        chromedriver_path: Location of the chromedriver executable. When it is
            falsy, Chrome is created without an explicit Service.

    Returns:
        The newly created webdriver.Chrome instance.
    """
    opts = Options()
    # Headless (no GUI), sandbox disabled, /dev/shm usage disabled.
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        opts.add_argument(flag)

    if not chromedriver_path:
        # No explicit driver binary given.
        return webdriver.Chrome(options=opts)

    return webdriver.Chrome(
        service=Service(executable_path=chromedriver_path),
        options=opts,
    )

def scrape_urls(url, chromedriver_path=None):
    """Scrape property listing URLs only from bezrealitky.sk.

    Args:
        url: Search-results URL to load.
        chromedriver_path: Optional chromedriver location, forwarded to setup_driver().

    Returns:
        The unique listing URLs in page order, or an empty list when scraping
        fails (the error is printed).
    """
    driver = setup_driver(chromedriver_path)

    # Navigation is inside the try so the finally clause always closes the
    # browser, even when loading the page itself fails.
    try:
        driver.get(url)

        # Wait for at least one property card to be present
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "article[class*='PropertyCard_propertyCard']"))
        )

        # Additional wait to ensure all content is loaded
        time.sleep(2)

        links = driver.find_elements(By.CSS_SELECTOR, "a[href*='/nehnutelnosti-byty-domy/']")
        hrefs = (link.get_attribute("href") for link in links)

        # dict.fromkeys de-duplicates in linear time while keeping first-seen order.
        return list(dict.fromkeys(
            href for href in hrefs
            if href and '/nehnutelnosti-byty-domy/' in href
        ))

    except Exception as e:
        print(f"Error scraping website: {e}")
        return []

    finally:
        driver.quit()

def save_urls(urls, filename="bezrealitky_urls.csv"):
    """Write the scraped URLs to a single-column CSV and echo them to stdout.

    Args:
        urls: Sequence of URL strings. When empty, nothing is written and a
            notice is printed instead.
        filename: Destination CSV path.
    """
    if urls:
        pd.DataFrame({'url': urls}).to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"URLs saved to {filename}")

        # Echo a numbered list to the console as well.
        print("\nScraped URLs:")
        for position, address in enumerate(urls, start=1):
            print(f"{position}. {address}")
    else:
        print("No URLs to save.")

def main():
    """Scrape listing URLs from the first results page of bezrealitky.sk and save them.

    The chromedriver location is resolved in this order: the CHROMEDRIVER_PATH
    environment variable, then the original author's local path if that file
    exists, otherwise None so that setup_driver() creates Chrome without an
    explicit driver path. This keeps the notebook runnable on other machines.
    """
    import os  # function-scope import: only used to resolve the chromedriver location

    url = "https://www.bezrealitky.sk/vyhladat?offerType=PRONAJEM&estateType=BYT&osm_value=Bratislava%2C+Bratislavsk%C3%BD+kraj%2C+Slovensko&regionOsmIds=R1702499&currency=EUR&location=exact"
    print("Starting to scrape property URLs from Bezrealitky.sk...")

    # Previously hard-coded path; kept only as a fallback when it actually exists.
    legacy_path = 'C:/Users/Filip/Desktop/najom/RentBot/Scrapping/chromedriver.exe'
    chromedriver_path = os.environ.get("CHROMEDRIVER_PATH") or (
        legacy_path if os.path.isfile(legacy_path) else None
    )

    urls = scrape_urls(url, chromedriver_path)

    if urls:
        print(f"Successfully scraped {len(urls)} property URLs.")
        save_urls(urls)
    else:
        print("No property URLs were found.")

if __name__ == "__main__":
    main()

Starting to scrape property URLs from Bezrealitky.sk...
Successfully scraped 15 property URLs.
URLs saved to bezrealitky_urls.csv

Scraped URLs:
1. https://www.bezrealitky.sk/nehnutelnosti-byty-domy/885983-nabidka-pronajem-bytu-mlynarovicova-bratislava
2. https://www.bezrealitky.sk/nehnutelnosti-byty-domy/885056-nabidka-pronajem-bytu
3. https://www.bezrealitky.sk/nehnutelnosti-byty-domy/883136-nabidka-pronajem-bytu-robotnicka-bratislava
4. https://www.bezrealitky.sk/nehnutelnosti-byty-domy/885339-nabidka-pronajem-bytu-bazova-ruzinov
5. https://www.bezrealitky.sk/nehnutelnosti-byty-domy/885655-nabidka-pronajem-bytu-gemerska-bratislava
6. https://www.bezrealitky.sk/nehnutelnosti-byty-domy/885937-nabidka-pronajem-bytu-bujnakova-dubravka
7. https://www.bezrealitky.sk/nehnutelnosti-byty-domy/885864-nabidka-pronajem-bytu-cecinova-bratislava
8. https://www.bezrealitky.sk/nehnutelnosti-byty-domy/885829-nabidka-pronajem-bytu-racianska-bratislava
9. https://www.bezrealitky.sk/nehnutelnosti-byty-

In [None]:
####

In [12]:
import time
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import re

def setup_driver(chromedriver_path=None):
    """Build a headless Chrome webdriver.

    Args:
        chromedriver_path: Location of the chromedriver executable. When it is
            falsy, Chrome is created without an explicit Service.

    Returns:
        The newly created webdriver.Chrome instance.
    """
    opts = Options()
    # Headless (no GUI), sandbox disabled, /dev/shm usage disabled.
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        opts.add_argument(flag)

    if not chromedriver_path:
        # No explicit driver binary given.
        return webdriver.Chrome(options=opts)

    return webdriver.Chrome(
        service=Service(executable_path=chromedriver_path),
        options=opts,
    )

def _page_number_from_link(link):
    """Return the page number a pagination link refers to, or None if it carries none.

    The visible link text is preferred; otherwise a ``page=<n>`` parameter in
    the link's href is used.
    """
    link_text = link.text.strip()
    if link_text.isdigit():
        return int(link_text)

    href = link.get_attribute("href")
    if href:
        page_match = re.search(r'page=(\d+)', href)
        if page_match:
            return int(page_match.group(1))
    return None


def find_total_pages(driver):
    """Find the total number of pages by examining pagination elements.

    Args:
        driver: A webdriver with the search-results page already loaded.

    Returns:
        The largest page number found among the pagination links, or 1 when
        pagination is missing, unreadable, or an error occurs (the error is
        printed).
    """
    try:
        # Wait for pagination to be present
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "li[class*='page-item']"))
        )

        page_links = driver.find_elements(By.CSS_SELECTOR, "a[class*='page-link']")

        page_numbers = []
        for link in page_links:
            try:
                number = _page_number_from_link(link)
            except Exception:
                # Best effort: skip a link that cannot be read. Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate so the cell can still be interrupted.
                continue
            if number is not None:
                page_numbers.append(number)

        # Default to a single page when no number could be determined.
        return max(page_numbers, default=1)

    except Exception as e:
        print(f"Error finding total pages: {e}")
        return 1  # Default to 1 page in case of error

def scrape_urls_from_page(driver):
    """Collect the listing URLs visible on the page the driver currently shows.

    Args:
        driver: A webdriver that has already navigated to a results page.

    Returns:
        Unique hrefs containing '/nehnutelnosti-byty-domy/' in first-seen
        order, or an empty list when an error occurs (the error is printed).
    """
    try:
        # Block until at least one property card exists in the DOM.
        card_present = EC.presence_of_element_located(
            (By.CSS_SELECTOR, "article[class*='PropertyCard_propertyCard']")
        )
        WebDriverWait(driver, 15).until(card_present)

        # Give the remaining cards a moment to render.
        time.sleep(2)

        anchors = driver.find_elements(By.CSS_SELECTOR, "a[href*='/nehnutelnosti-byty-domy/']")

        collected = []
        for anchor in anchors:
            target = anchor.get_attribute("href")
            if not target or '/nehnutelnosti-byty-domy/' not in target:
                continue
            if target in collected:
                continue
            collected.append(target)

        return collected

    except Exception as e:
        print(f"Error scraping URLs from page: {e}")
        return []

def scrape_all_pages(base_url, chromedriver_path=None):
    """Walk every results page and gather the listing URLs found on each.

    Args:
        base_url: Search-results URL for page 1.
        chromedriver_path: Optional chromedriver location, forwarded to setup_driver().

    Returns:
        All listing URLs found, de-duplicated in first-seen order. Whatever
        was gathered before an error is still returned (the error is printed).
    """
    driver = setup_driver(chromedriver_path)
    collected = []

    def page_address(number):
        # Swap an existing page parameter, or append one with the right separator.
        if "page=" in base_url:
            return re.sub(r'page=\d+', f'page={number}', base_url)
        joiner = "&" if "?" in base_url else "?"
        return f"{base_url}{joiner}page={number}"

    try:
        # Page 1 is the base URL itself (no page parameter needed).
        driver.get(base_url)

        total_pages = find_total_pages(driver)
        print(f"Found {total_pages} total pages to scrape")

        print("Scraping page 1...")
        first_batch = scrape_urls_from_page(driver)
        collected += first_batch
        print(f"Found {len(first_batch)} URLs on page 1")

        for page_num in range(2, total_pages + 1):
            print(f"Scraping page {page_num}...")
            driver.get(page_address(page_num))

            batch = scrape_urls_from_page(driver)
            collected += batch
            print(f"Found {len(batch)} URLs on page {page_num}")

            # Short pause between pages to avoid being blocked.
            time.sleep(1)

    except Exception as e:
        print(f"Error during multi-page scraping: {e}")

    finally:
        driver.quit()

    # Drop duplicates across pages while keeping the original order.
    return list(dict.fromkeys(collected))

def save_urls(urls, filename="bezrealitky_all_urls.csv"):
    """Write the scraped URLs to a single-column CSV and print a summary.

    Args:
        urls: Sequence of URL strings. When empty, nothing is written and a
            notice is printed instead.
        filename: Destination CSV path.
    """
    if urls:
        pd.DataFrame({'url': urls}).to_csv(filename, index=False, encoding='utf-8-sig')
        print(f"URLs saved to {filename}")

        # Summary line
        print(f"\nSuccessfully scraped {len(urls)} unique property URLs")
    else:
        print("No URLs to save.")

def main():
    """Scrape listing URLs from every results page of bezrealitky.sk and save them.

    The chromedriver location is resolved in this order: the CHROMEDRIVER_PATH
    environment variable, then the original author's local path if that file
    exists, otherwise None so that setup_driver() creates Chrome without an
    explicit driver path. This keeps the notebook runnable on other machines.
    """
    import os  # function-scope import: only used to resolve the chromedriver location

    base_url = "https://www.bezrealitky.sk/vyhladat?offerType=PRONAJEM&estateType=BYT&osm_value=Bratislava%2C+Bratislavsk%C3%BD+kraj%2C+Slovensko&regionOsmIds=R1702499&currency=EUR&location=exact"
    print("Starting to scrape property URLs from all pages on Bezrealitky.sk...")

    # Previously hard-coded path; kept only as a fallback when it actually exists.
    legacy_path = 'C:/Users/Filip/Desktop/najom/RentBot/Scrapping/chromedriver.exe'
    chromedriver_path = os.environ.get("CHROMEDRIVER_PATH") or (
        legacy_path if os.path.isfile(legacy_path) else None
    )

    urls = scrape_all_pages(base_url, chromedriver_path)

    if urls:
        save_urls(urls)
    else:
        print("No property URLs were found.")

if __name__ == "__main__":
    main()

Starting to scrape property URLs from all pages on Bezrealitky.sk...
Found 6 total pages to scrape
Scraping page 1...
Found 15 URLs on page 1
Scraping page 2...
Found 15 URLs on page 2
Scraping page 3...
Found 15 URLs on page 3
Scraping page 4...
Found 15 URLs on page 4
Scraping page 5...
Found 15 URLs on page 5
Scraping page 6...
Found 1 URLs on page 6
URLs saved to bezrealitky_all_urls.csv

Successfully scraped 76 unique property URLs


In [None]:
#####

In [9]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re

# Configure Chrome for this cell: headless (no window), with the sandbox and
# /dev/shm usage disabled through the flags below.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")

# Module-level driver that get_property_details() below reads as a global.
# No explicit chromedriver path is passed here (assumes chromedriver is in PATH).
# It is closed by the driver.quit() call at the end of this cell.
driver = webdriver.Chrome(options=chrome_options)

# Property detail extraction.
# The JavaScript snippets are raw strings so that regex backslashes reach the
# browser unchanged and Python never sees an invalid escape sequence.

# First non-empty text among the links pointing at the "#mapa" anchor.
_ADDRESS_JS = r"""
    var mapLinks = document.querySelectorAll('a[href="#mapa"]');
    for (var i = 0; i < mapLinks.length; i++) {
        var text = mapLinks[i].textContent.trim();
        if (text) return text;
    }
    return "N/A";
"""

# Value cell of the table row whose header is exactly the usable-area label.
_USABLE_AREA_JS = r"""
    var rows = document.querySelectorAll('table tbody tr');
    for (var i = 0; i < rows.length; i++) {
        var th = rows[i].querySelector('th');
        if (th && (th.textContent.trim() === 'Užitná plocha' || th.textContent.trim() === 'Uzitna plocha')) {
            var td = rows[i].querySelector('td');
            if (td) {
                return td.textContent.trim();
            }
        }
    }
    return "N/A";
"""

# Generic lookup: arguments[0] is a list of label strings. Takes the first
# <th> whose text contains any label and returns the trimmed text of the
# first <td> in its row, or null when there is no such header/row/cell.
_HEADER_CELL_JS = r"""
    var labels = arguments[0];
    var headers = Array.from(document.querySelectorAll('th')).filter(function (th) {
        return labels.some(function (label) { return th.textContent.includes(label); });
    });

    if (headers.length > 0) {
        var row = headers[0].closest('tr');
        if (row) {
            var cell = row.querySelector('td');
            if (cell) return cell.textContent.trim();
        }
    }

    return null;
"""

# Fallback for the disposition: a category link whose text looks like "2+1",
# a single digit, or "garsónka".
_DISPOSITION_LINK_JS = r"""
    var dispLinks = document.querySelectorAll('a[href*="/vypis/ponuka-prenajom/byt/"]');
    for (var i = 0; i < dispLinks.length; i++) {
        var text = dispLinks[i].textContent.trim();
        if (text.match(/^\d\+\d$/) || text.match(/^\d$/) || text.match(/^garsónka$/i)) {
            return text;
        }
    }

    return "N/A";
"""

# Pricing: first by label, then by position for anything still missing.
# NOTE(review): the recorded sample output shows the same value for
# security_deposit and admin_fee; the label lookup takes the FIRST
# 'strong span' inside the nearest ancestor that has one, which may belong to
# a neighbouring label - verify against the live DOM.
_PRICING_JS = r"""
    var result = {
        monthly_rent: "N/A",
        energy_fees: "N/A",
        security_deposit: "N/A",
        admin_fee: "N/A"
    };

    // Method 1: find each label's text node, then climb to the nearest
    // ancestor that contains a <strong><span> price element.
    var labels = [
        {key: 'monthly_rent', search: 'Mesačné nájomné'},
        {key: 'energy_fees', search: 'Poplatky za energie'},
        {key: 'security_deposit', search: 'Vratná kaucia'},
        {key: 'admin_fee', search: 'Správny poplatok'}
    ];

    var textNodes = document.createTreeWalker(
        document.body,
        NodeFilter.SHOW_TEXT,
        { acceptNode: function(node) { return NodeFilter.FILTER_ACCEPT; } },
        false
    );

    while (textNodes.nextNode()) {
        var node = textNodes.currentNode;
        var text = node.textContent.trim();

        for (var i = 0; i < labels.length; i++) {
            if (text.includes(labels[i].search)) {
                var parent = node.parentNode;
                while (parent && !parent.querySelector('strong span')) {
                    parent = parent.parentNode;
                }

                if (parent) {
                    var priceElement = parent.querySelector('strong span');
                    if (priceElement) {
                        var priceText = priceElement.textContent.trim();
                        // First run that STARTS with a digit, so a leading
                        // symbol plus space cannot yield an empty value.
                        var numMatch = priceText.match(/\d[\d\s]*/);
                        if (numMatch) {
                            result[labels[i].key] = numMatch[0].replace(/\s+/g, '');
                        }
                    }
                }
            }
        }
    }

    // Method 2: positional fallback over every euro-denominated
    // <strong><span>, used only for fields Method 1 left at "N/A".
    var strongElements = document.querySelectorAll('strong span');
    var priceValues = [];

    for (var i = 0; i < strongElements.length; i++) {
        var text = strongElements[i].textContent.trim();
        if (text.includes('€') || text.match(/\d+\s*€/)) {
            var numOnly = text.replace(/[^0-9]/g, '');
            if (numOnly) {
                priceValues.push(numOnly);
            }
        }
    }

    // Assumed order: rent, energy fees, security deposit, admin fee.
    if (priceValues.length > 0 && result.monthly_rent === "N/A") {
        result.monthly_rent = priceValues[0];
    }
    if (priceValues.length > 1 && result.energy_fees === "N/A") {
        result.energy_fees = priceValues[1];
    }
    if (priceValues.length > 2 && result.security_deposit === "N/A") {
        result.security_deposit = priceValues[2];
    }
    if (priceValues.length > 3 && result.admin_fee === "N/A") {
        result.admin_fee = priceValues[3];
    }

    return result;
"""

# Columns that are filled with "Error" when a listing cannot be processed.
_ERROR_FIELDS = (
    'address', 'disposition', 'usable_area', 'location', 'available_from',
    'furnishing', 'monthly_rent', 'energy_fees', 'security_deposit', 'admin_fee',
)


def _header_cell_text(browser, labels):
    """Return the cell text next to the first table header containing any of labels, or None."""
    return browser.execute_script(_HEADER_CELL_JS, list(labels))


def get_property_details(url, driver=None):
    """Load one listing page and extract its details.

    Args:
        url: Listing URL; the numeric id is taken from its '/<digits>-' part.
        driver: Webdriver to use. When None (the default), the module-level
            ``driver`` variable is used, as in the original notebook cell.

    Returns:
        A dict with the keys 'property_id', 'url', 'address', 'disposition',
        'usable_area', 'location', 'available_from', 'furnishing',
        'monthly_rent', 'energy_fees', 'security_deposit' and 'admin_fee'.
        Values that cannot be found are "N/A". If processing fails, the error
        is printed and every field except 'property_id' and 'url' is "Error";
        the function does not raise.
    """
    browser = driver if driver is not None else globals().get("driver")
    property_id = "N/A"

    try:
        # Extract the id first so that error rows still carry it.
        id_match = re.search(r'/(\d+)-', url)
        if id_match:
            property_id = id_match.group(1)

        if browser is None:
            raise RuntimeError("no webdriver available: pass driver=... or define a module-level 'driver'")

        browser.get(url)
        time.sleep(1.5)  # Fixed pause to let the page render

        address = browser.execute_script(_ADDRESS_JS)
        usable_area = browser.execute_script(_USABLE_AREA_JS)

        # Disposition: table row first, category link as fallback.
        disposition = _header_cell_text(browser, ('Dispozícia', 'Dispozicia'))
        if disposition is None:
            disposition = browser.execute_script(_DISPOSITION_LINK_JS)

        def table_value(*labels):
            found = _header_cell_text(browser, labels)
            return "N/A" if found is None else found

        location = table_value('Umiestnenie')
        available_from = table_value('Dostupné od', 'Dostupne od')
        furnishing = table_value('Vybavené', 'Vybavene')

        # A script that returns nothing is treated as "no prices found".
        pricing = browser.execute_script(_PRICING_JS) or {}

        return {
            'property_id': property_id,
            'url': url,
            'address': address,
            'disposition': disposition,
            'usable_area': usable_area,
            'location': location,
            'available_from': available_from,
            'furnishing': furnishing,
            'monthly_rent': pricing.get('monthly_rent', 'N/A'),
            'energy_fees': pricing.get('energy_fees', 'N/A'),
            'security_deposit': pricing.get('security_deposit', 'N/A'),
            'admin_fee': pricing.get('admin_fee', 'N/A')
        }
    except Exception as e:
        print(f"Error processing URL {url}: {e}")
        error_row = {'property_id': property_id, 'url': url}
        error_row.update((field, "Error") for field in _ERROR_FIELDS)
        return error_row

# Single listing used as a smoke test before processing the URL list
test_url = "https://www.bezrealitky.sk/nehnutelnosti-byty-domy/878310-nabidka-pronajem-bytu-ulica-zavodu-matador-bratislava"

# Everything that uses the shared driver runs inside try/finally so the
# browser is closed even if building or saving the results raises.
try:
    # Process test URL first to verify
    print(f"Testing extraction with URL: {test_url}")
    test_result = get_property_details(test_url)
    print("\nTest result:")
    for key, value in test_result.items():
        print(f"{key}: {value}")

    # Load your URLs from CSV or use a list
    # urls = pd.read_csv("bezrealitky_all_urls.csv")['url'].tolist()

    # For testing with just a few URLs:
    urls = [
        "https://www.bezrealitky.sk/nehnutelnosti-byty-domy/878310-nabidka-pronajem-bytu-ulica-zavodu-matador-bratislava",
        'https://www.bezrealitky.sk/nehnutelnosti-byty-domy/845382-nabidka-pronajem-bytu-zamocka-bratislava',
        'https://www.bezrealitky.sk/nehnutelnosti-byty-domy/882332-nabidka-pronajem-bytu-zavadska-bratislava'
        # Add more URLs here
    ]

    # Process all URLs; get_property_details returns an "Error" row instead
    # of raising, so one bad listing does not stop the loop.
    results = []
    for i, url in enumerate(urls):
        print(f"\nProcessing {i+1}/{len(urls)}: {url}")
        results.append(get_property_details(url))

    # Convert to DataFrame
    results_df = pd.DataFrame(results)
    print("\nResults DataFrame:")
    print(results_df)

    # Save to CSV (utf-8-sig so accented characters survive Excel)
    output_file = "bezrealitky_property_details.csv"
    results_df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"\nResults saved to {output_file}")
finally:
    # Close the driver
    driver.quit()

Testing extraction with URL: https://www.bezrealitky.sk/nehnutelnosti-byty-domy/878310-nabidka-pronajem-bytu-ulica-zavodu-matador-bratislava

Test result:
property_id: 878310
url: https://www.bezrealitky.sk/nehnutelnosti-byty-domy/878310-nabidka-pronajem-bytu-ulica-zavodu-matador-bratislava
address: Ulica Závodu Matador, Petržalka - Kapitulský Dvor, Bratislavský kraj
disposition: 2+1
usable_area: 58 m²
location: Tichá časť
available_from: 27. 5. 2025
furnishing: Vybavené
monthly_rent: 1117
energy_fees: 197
security_deposit: 690
admin_fee: 690

Processing 1/3: https://www.bezrealitky.sk/nehnutelnosti-byty-domy/878310-nabidka-pronajem-bytu-ulica-zavodu-matador-bratislava

Processing 2/3: https://www.bezrealitky.sk/nehnutelnosti-byty-domy/845382-nabidka-pronajem-bytu-zamocka-bratislava

Processing 3/3: https://www.bezrealitky.sk/nehnutelnosti-byty-domy/882332-nabidka-pronajem-bytu-zavadska-bratislava

Results DataFrame:
  property_id                                                url  \
0

In [10]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re

def setup_driver(chromedriver_path=None):
    """Build a headless Chrome webdriver.

    Args:
        chromedriver_path: Location of the chromedriver executable. When it is
            falsy, Chrome is created without an explicit Service.

    Returns:
        The newly created webdriver.Chrome instance.
    """
    opts = Options()
    # Headless (no GUI), sandbox disabled, /dev/shm usage disabled.
    for flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
        opts.add_argument(flag)

    if not chromedriver_path:
        # No explicit driver binary given.
        return webdriver.Chrome(options=opts)

    return webdriver.Chrome(
        service=Service(executable_path=chromedriver_path),
        options=opts,
    )

def _page_number_from_link(link):
    """Return the page number a pagination link refers to, or None if it carries none.

    The visible link text is preferred; otherwise a ``page=<n>`` parameter in
    the link's href is used.
    """
    link_text = link.text.strip()
    if link_text.isdigit():
        return int(link_text)

    href = link.get_attribute("href")
    if href:
        page_match = re.search(r'page=(\d+)', href)
        if page_match:
            return int(page_match.group(1))
    return None


def find_total_pages(driver):
    """Find the total number of pages by examining pagination elements.

    Args:
        driver: A webdriver with the search-results page already loaded.

    Returns:
        The largest page number found among the pagination links, or 1 when
        pagination is missing, unreadable, or an error occurs (the error is
        printed).
    """
    try:
        # Wait for pagination to be present
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "li[class*='page-item']"))
        )

        page_links = driver.find_elements(By.CSS_SELECTOR, "a[class*='page-link']")

        page_numbers = []
        for link in page_links:
            try:
                number = _page_number_from_link(link)
            except Exception:
                # Best effort: skip a link that cannot be read. Catching
                # Exception (not a bare except) lets KeyboardInterrupt and
                # SystemExit propagate so the cell can still be interrupted.
                continue
            if number is not None:
                page_numbers.append(number)

        # Default to a single page when no number could be determined.
        return max(page_numbers, default=1)

    except Exception as e:
        print(f"Error finding total pages: {e}")
        return 1  # Default to 1 page in case of error

def scrape_urls_from_page(driver):
    """Collect the distinct listing links on the page the driver is showing.

    Blocks for up to 15 seconds until at least one property card is present,
    pauses two more seconds so the remaining listings can load, then gathers
    the ``href`` of every anchor pointing into ``/nehnutelnosti-byty-domy/``.

    Args:
        driver: Selenium WebDriver already navigated to a results page.

    Returns:
        list[str]: Listing URLs in first-seen order without repeats; an empty
        list if anything goes wrong (the error is printed).
    """
    listing_marker = '/nehnutelnosti-byty-domy/'
    try:
        card_locator = (By.CSS_SELECTOR, "article[class*='PropertyCard_propertyCard']")
        WebDriverWait(driver, 15).until(EC.presence_of_element_located(card_locator))

        # Give the rest of the cards a moment to render.
        time.sleep(2)

        collected = []
        for anchor in driver.find_elements(By.CSS_SELECTOR, "a[href*='/nehnutelnosti-byty-domy/']"):
            target = anchor.get_attribute("href")
            if not target:
                continue
            if listing_marker not in target:
                continue
            if target in collected:
                continue
            collected.append(target)

        return collected

    except Exception as e:
        print(f"Error scraping URLs from page: {e}")
        return []

def scrape_all_pages(base_url, driver):
    """Walk every results page and return the distinct listing URLs found.

    Page 1 is ``base_url`` itself. Later pages are addressed through a
    ``page=<n>`` query parameter, which either replaces an existing ``page=``
    value in ``base_url`` or is appended with the appropriate ``?`` / ``&``.
    The page count comes from ``find_total_pages`` after loading page 1.

    Args:
        base_url: URL of the first results page.
        driver: Selenium WebDriver used for all navigation.

    Returns:
        list[str]: De-duplicated listing URLs in first-seen order. If an error
        interrupts the walk it is printed and the URLs gathered so far are
        returned.
    """
    gathered = []

    def url_for(page_num):
        # Build the address of results page `page_num` from base_url.
        if "page=" in base_url:
            return re.sub(r'page=\d+', f'page={page_num}', base_url)
        joiner = "&" if "?" in base_url else "?"
        return f"{base_url}{joiner}page={page_num}"

    try:
        # Page 1 needs no page parameter.
        driver.get(base_url)

        total_pages = find_total_pages(driver)
        print(f"Found {total_pages} total pages to scrape")

        print("Scraping page 1...")
        first_batch = scrape_urls_from_page(driver)
        gathered.extend(first_batch)
        print(f"Found {len(first_batch)} URLs on page 1")

        for page_num in range(2, total_pages + 1):
            page_address = url_for(page_num)

            print(f"Scraping page {page_num}...")
            driver.get(page_address)

            batch = scrape_urls_from_page(driver)
            gathered.extend(batch)
            print(f"Found {len(batch)} URLs on page {page_num}")

            # Brief pause between pages to avoid being blocked.
            time.sleep(1)

    except Exception as e:
        print(f"Error during multi-page scraping: {e}")

    # dict keys keep insertion order, so this drops repeats but preserves order.
    return list(dict.fromkeys(gathered))

# JavaScript snippets executed inside the listing page by get_property_details.
# Backslashes meant for JavaScript regexes are doubled so that Python passes
# them through without relying on invalid escape sequences.

# Text of the first non-empty link that points at the map anchor.
_ADDRESS_JS = """
    var mapLinks = document.querySelectorAll('a[href="#mapa"]');
    for (var i = 0; i < mapLinks.length; i++) {
        var text = mapLinks[i].textContent.trim();
        if (text) return text;
    }
    return "N/A";
"""

# <td> text of the first table row whose <th> is exactly the usable-area label.
_USABLE_AREA_JS = """
    var rows = document.querySelectorAll('table tbody tr');
    for (var i = 0; i < rows.length; i++) {
        var th = rows[i].querySelector('th');
        if (th && (th.textContent.trim() === 'Užitná plocha' || th.textContent.trim() === 'Uzitna plocha')) {
            var td = rows[i].querySelector('td');
            if (td) {
                return td.textContent.trim();
            }
        }
    }
    return "N/A";
"""

# Generic lookup: arguments[0] is a list of label strings. Returns the <td> text
# from the row of the first <th> containing any label, or null when not found.
_TABLE_CELL_JS = """
    var wanted = arguments[0];
    var headers = Array.from(document.querySelectorAll('th')).filter(function (th) {
        return wanted.some(function (label) { return th.textContent.includes(label); });
    });

    if (headers.length > 0) {
        var row = headers[0].closest('tr');
        if (row) {
            var cell = row.querySelector('td');
            if (cell) return cell.textContent.trim();
        }
    }

    return null;
"""

# Fallback for the disposition: a category link whose text looks like "2+1",
# a single digit, or "garsónka".
_DISPOSITION_LINK_JS = """
    var dispLinks = document.querySelectorAll('a[href*="/vypis/ponuka-prenajom/byt/"]');
    for (var i = 0; i < dispLinks.length; i++) {
        var text = dispLinks[i].textContent.trim();
        if (text.match(/^\\d\\+\\d$/) || text.match(/^\\d$/) || text.match(/^garsónka$/i)) {
            return text;
        }
    }

    return "N/A";
"""

# Pricing: first try to pair each label with the nearest 'strong span' price,
# then fill anything still missing from the page's euro amounts by position.
_PRICING_JS = """
    var result = {
        monthly_rent: "N/A",
        energy_fees: "N/A",
        security_deposit: "N/A",
        admin_fee: "N/A"
    };

    // Method 1: Try to find by text labels
    var labels = [
        {key: 'monthly_rent', search: 'Mesačné nájomné'},
        {key: 'energy_fees', search: 'Poplatky za energie'},
        {key: 'security_deposit', search: 'Vratná kaucia'},
        {key: 'admin_fee', search: 'Správny poplatok'}
    ];

    var textNodes = document.createTreeWalker(
        document.body,
        NodeFilter.SHOW_TEXT,
        { acceptNode: function(node) { return NodeFilter.FILTER_ACCEPT; } },
        false
    );

    while(textNodes.nextNode()) {
        var node = textNodes.currentNode;
        var text = node.textContent.trim();

        for (var i = 0; i < labels.length; i++) {
            if (text.includes(labels[i].search)) {
                // Climb until an ancestor contains a price element
                var parent = node.parentNode;
                while (parent && !parent.querySelector('strong span')) {
                    parent = parent.parentNode;
                }

                if (parent) {
                    var priceElement = parent.querySelector('strong span');
                    if (priceElement) {
                        var priceText = priceElement.textContent.trim();
                        // Extract just the number. The match must start with a
                        // digit so a whitespace-only run cannot produce "".
                        var numMatch = priceText.match(/\\d[\\d\\s]*/);
                        if (numMatch) {
                            result[labels[i].key] = numMatch[0].trim().replace(/\\s+/g, '');
                        }
                    }
                }
            }
        }
    }

    // Method 2: Try to find by position and structure
    // Used only for fields that Method 1 left as "N/A".
    var strongElements = document.querySelectorAll('strong span');
    var priceValues = [];

    for (var i = 0; i < strongElements.length; i++) {
        var text = strongElements[i].textContent.trim();
        if (text.includes('€') || text.match(/\\d+\\s*€/)) {
            // Clean up the text to extract just the number
            var numOnly = text.replace(/[^0-9]/g, '');
            if (numOnly) {
                priceValues.push(numOnly);
            }
        }
    }

    // Positional assumption: rent, energy fees, deposit, admin fee.
    if (priceValues.length > 0 && result.monthly_rent === "N/A") {
        result.monthly_rent = priceValues[0];
    }
    if (priceValues.length > 1 && result.energy_fees === "N/A") {
        result.energy_fees = priceValues[1];
    }
    if (priceValues.length > 2 && result.security_deposit === "N/A") {
        result.security_deposit = priceValues[2];
    }
    if (priceValues.length > 3 && result.admin_fee === "N/A") {
        result.admin_fee = priceValues[3];
    }

    return result;
"""


def _table_cell_text(driver, header_labels):
    """Return the <td> text next to the first <th> containing any label, or None."""
    return driver.execute_script(_TABLE_CELL_JS, list(header_labels))


def get_property_details(url, driver):
    """Extract detailed information from a property listing page.

    Loads ``url`` in ``driver``, waits 1.5 seconds for the page to settle, and
    reads the fields by running small JavaScript snippets in the page.

    Args:
        url: Address of the listing detail page. The property ID is taken from
            the first ``/<digits>-`` segment of the URL.
        driver: Selenium WebDriver used to load and query the page.

    Returns:
        dict: Keys ``property_id``, ``url``, ``address``, ``disposition``,
        ``usable_area``, ``location``, ``available_from``, ``furnishing``,
        ``monthly_rent``, ``energy_fees``, ``security_deposit`` and
        ``admin_fee``. A field that cannot be found is ``"N/A"``. If loading or
        scraping raises, the error is printed and every scraped field is
        ``"Error"``; ``url`` and the ``property_id`` parsed from it are kept.
    """
    # Defined before the try block so the error path can always report it.
    property_id = "N/A"
    try:
        id_match = re.search(r'/(\d+)-', url)
        if id_match:
            property_id = id_match.group(1)

        driver.get(url)
        time.sleep(1.5)  # Wait a bit longer for page to load

        address = driver.execute_script(_ADDRESS_JS)
        usable_area = driver.execute_script(_USABLE_AREA_JS)

        # Disposition: table row first, category links as fallback.
        disposition = _table_cell_text(driver, ['Dispozícia', 'Dispozicia'])
        if disposition is None:
            disposition = driver.execute_script(_DISPOSITION_LINK_JS)

        def cell_or_na(header_labels):
            value = _table_cell_text(driver, header_labels)
            return "N/A" if value is None else value

        location = cell_or_na(['Umiestnenie'])
        available_from = cell_or_na(['Dostupné od', 'Dostupne od'])
        furnishing = cell_or_na(['Vybavené', 'Vybavene'])

        pricing = driver.execute_script(_PRICING_JS)

        return {
            'property_id': property_id,
            'url': url,
            'address': address,
            'disposition': disposition,
            'usable_area': usable_area,
            'location': location,
            'available_from': available_from,
            'furnishing': furnishing,
            'monthly_rent': pricing.get('monthly_rent', 'N/A'),
            'energy_fees': pricing.get('energy_fees', 'N/A'),
            'security_deposit': pricing.get('security_deposit', 'N/A'),
            'admin_fee': pricing.get('admin_fee', 'N/A')
        }
    except Exception as e:
        print(f"Error processing URL {url}: {e}")
        return {
            'property_id': property_id,
            'url': url,
            'address': "Error",
            'disposition': "Error",
            'usable_area': "Error",
            'location': "Error",
            'available_from': "Error",
            'furnishing': "Error",
            'monthly_rent': "Error",
            'energy_fees': "Error",
            'security_deposit': "Error",
            'admin_fee': "Error"
        }

def save_results(results, filename="bezrealitky_property_details.csv", url_filename="bezrealitky_all_urls.csv"):
    """Write the scraped records, and a URL-only list, to two CSV files.

    Args:
        results: List of per-property dicts; each must have a ``'url'`` key.
        filename: Destination for the full table of property details.
        url_filename: Destination for the single-column list of URLs.

    Both files are written without the index and encoded as ``utf-8-sig``.
    When ``results`` is empty nothing is written and a notice is printed.
    """
    if not results:
        print("No results to save.")
        return

    csv_options = {"index": False, "encoding": 'utf-8-sig'}

    # Full detail table.
    pd.DataFrame(results).to_csv(filename, **csv_options)
    print(f"Detailed property information saved to {filename}")

    # Companion file holding only the URLs.
    url_column = []
    for record in results:
        url_column.append(record['url'])
    pd.DataFrame({'url': url_column}).to_csv(url_filename, **csv_options)
    print(f"URLs saved to {url_filename}")

def main():
    """Run the two-stage scrape: collect listing URLs, then fetch each listing.

    Stage 1 gathers every listing URL reachable from the search results.
    Stage 2 visits each URL for its details, pausing 0.5 s between requests and
    writing a progress CSV after every 10th listing and after the last one.
    A final save writes the complete results under the default file names.
    Any error is printed, and the browser is closed in all cases.
    """
    # Search results used as the starting point.
    base_url = "https://www.bezrealitky.sk/vyhladat?offerType=PRONAJEM&estateType=BYT&osm_value=Bratislava%2C+Bratislavsk%C3%BD+kraj%2C+Slovensko&regionOsmIds=R1702499&currency=EUR&location=exact"

    # Set to the chromedriver executable's path if it is not on PATH.
    chromedriver_path = None

    # A single browser session serves both stages.
    driver = setup_driver(chromedriver_path)

    try:
        print("===== STAGE 1: COLLECTING ALL PROPERTY LISTING URLS =====")
        listing_urls = scrape_all_pages(base_url, driver)
        total = len(listing_urls)
        print(f"Successfully scraped {total} unique property URLs")

        print("\n===== STAGE 2: EXTRACTING DETAILED PROPERTY INFORMATION =====")
        results = []

        for position, url in enumerate(listing_urls, start=1):
            print(f"Processing property {position}/{total}: {url}")
            results.append(get_property_details(url, driver))

            # Small delay between requests to reduce server load.
            time.sleep(0.5)

            # Checkpoint every 10 listings and once more at the very end.
            checkpoint_due = position % 10 == 0 or position == total
            if checkpoint_due:
                save_results(results, f"bezrealitky_details_progress_{position}.csv")

        # Final save of the complete results.
        save_results(results)

        print("\n===== SCRAPING COMPLETED SUCCESSFULLY =====")

    except Exception as e:
        print(f"An error occurred during scraping: {e}")

    finally:
        # Always close the driver to release resources.
        driver.quit()
        print("Browser closed.")

# Entry point: run the full scrape when this code is executed directly
# (i.e. when __name__ is "__main__") rather than imported as a module.
if __name__ == "__main__":
    main()

===== STAGE 1: COLLECTING ALL PROPERTY LISTING URLS =====
Found 6 total pages to scrape
Scraping page 1...
Found 15 URLs on page 1
Scraping page 2...
Found 15 URLs on page 2
Scraping page 3...
Found 15 URLs on page 3
Scraping page 4...
Found 15 URLs on page 4
Scraping page 5...
Found 15 URLs on page 5
Scraping page 6...
Found 1 URLs on page 6
Successfully scraped 76 unique property URLs

===== STAGE 2: EXTRACTING DETAILED PROPERTY INFORMATION =====
Processing property 1/76: https://www.bezrealitky.sk/nehnutelnosti-byty-domy/883136-nabidka-pronajem-bytu-robotnicka-bratislava
Processing property 2/76: https://www.bezrealitky.sk/nehnutelnosti-byty-domy/885339-nabidka-pronajem-bytu-bazova-ruzinov
Processing property 3/76: https://www.bezrealitky.sk/nehnutelnosti-byty-domy/882332-nabidka-pronajem-bytu-zavadska-bratislava
Processing property 4/76: https://www.bezrealitky.sk/nehnutelnosti-byty-domy/886366-nabidka-pronajem-bytu-lermontovova-bratislava-mestska-cast-stare-mesto
Processing proper