In [2]:
# Import 
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup

In [15]:
# BSoup sample
URL = "https://realpython.github.io/fake-jobs/"
# A timeout keeps the cell from hanging forever on an unresponsive server
page = requests.get(URL, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page
page.raise_for_status()

soup = BeautifulSoup(page.content, "html.parser")   # Return bs4.BeautifulSoup

In [None]:
# Cheatsheet
# ----------
URL = "https://realpython.github.io/fake-jobs/"
# A timeout keeps the cell from hanging forever on an unresponsive server
page = requests.get(URL, timeout=30)
# Fail loudly on 4xx/5xx instead of silently parsing an error page
page.raise_for_status()
# ----------

# Return bs4.BeautifulSoup
soup = BeautifulSoup(page.content, "html.parser")

# Find HTML Element -> Return bs4.element.Tag (or None when nothing matches).
# The element with id="ResultsContainer" on this page is a <div>; filtering on
# name="input" matched nothing, so every later call on `results` failed.
results = soup.find(
    name="div",
    id="ResultsContainer",
)

# print(soup.prettify())

# Find sub elements in element
job_cards = results.find_all("div", class_="card-content")

# Pass a function to a bs4 method.
# bs4 passes the tag's .string to the function, which is None when the tag
# has no single string child, so guard before calling .lower().
python_jobs = results.find_all(
    "h2", string=lambda text: text is not None and "python" in text.lower()
)

# Fetch links in the job boards
# If you use .text you only get the visible part, so only the text is left
# and the link stored in href is stripped as well. Read the attribute instead.
link_urls = []  # one href per job card (previously only the last one survived)
for job_card in job_cards:
    link_url = job_card.find_all("a")[1]["href"]
    link_urls.append(link_url)

#### 2. Sample with Glassdoor

- `bs4` (BeautifulSoup) is a library for parsing and navigating HTML/XML documents.
- `requests` is a library for making HTTP requests and working with HTTP sessions.
- `MechanicalSoup` is a Python library for automating interaction with websites.

In [2]:
# Import 
import re
from pprint import pprint
from typing import Optional, Tuple

import mechanicalsoup
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Parse the listings
def _search_str_property(listing):
    size_match = re.search(r'(\d+\s*m2)', listing)
    size = size_match.group(1) if size_match else None

    # 2. Find the property type (e.g., apartment, house, room)
    type_match = re.search(r'\b(apartment|house|room)\b', listing, re.IGNORECASE)
    property_type = type_match.group(1) if type_match else None

    # 3. Find the location (everything after "in")
    location_match = re.search(r'(?<=\bin\s)(.*)', listing)
    location = location_match.group(1).strip() if location_match else None

    return (size, property_type, location)

def parse_listing_html(li_element) -> Tuple[str, str, Optional[str], Optional[str], Optional[str]]:
    """Extract the link, rent and title-derived fields from one listing element.

    Args:
        li_element: Element object exposing ``get_attribute("outerHTML")``
            (the callers in this notebook pass Selenium ``<li>`` elements).
            It is not a plain string.

    Returns:
        ``(link, rent, size, property_type, location)`` where ``link`` is the
        ``href`` of the first ``<a>``, ``rent`` is the text of the first
        ``<span>`` under the parent of the first ``<label>``, and the last
        three values come from ``_search_str_property`` applied to the anchor
        text (each may be ``None``).

    Raises:
        TypeError / AttributeError: when the markup has no ``<a>`` or no
            ``<label>``/``<span>``, because the lookups then yield ``None``.
    """
    bs4_html = BeautifulSoup(li_element.get_attribute("outerHTML"), 'html.parser')

    # Look the anchor up once; it provides both the link and the title text
    anchor = bs4_html.find("a")
    link = anchor["href"]
    rent = bs4_html.find("label").parent.find("span").text
    size, property_type, location = _search_str_property(anchor.text)

    return link, rent, size, property_type, location

In [None]:
# Get browser object
url = "https://www.housingtarget.com/netherlands"
browser = mechanicalsoup.Browser()
page = browser.get(url)

# Get html (the parsed document attached to the response)
html = page.soup

# Take the first <form> on the page and pre-fill its location field.
# The value was previously misspelled as "Utretch".
form = html.select("form")[0]
form.find("input", {"name": "LeftMenu_ZipCodes"})["value"] = "Utrecht"

In [None]:
import mechanicalsoup

# Create a browser object that keeps track of the current page and form
browser = mechanicalsoup.StatefulBrowser()

# Open the webpage
browser.open("https://www.housingtarget.com/netherlands")

# Select a form on the opened page (no selector given, so the library default applies)
browser.select_form()

# Fill in the selected form: a list of estate-type codes and a zip code.
# NOTE(review): the meaning of codes "2", "3", "9", "20" is not shown here; confirm against the site.
form = browser.get_current_form()
form["LeftMenu_LPEstateTypes"] = ["2", "3", "9", "20"]
form["LeftMenu_ZipCodes"] = "1319"

# Submit the filled-in form and keep the response
response = browser.submit_selected()

In [13]:
# Launch Chrome without a visible window
chrome_option = Options()
chrome_option.add_argument("--headless")
driver = webdriver.Chrome(options=chrome_option)

# Load the Amsterdam rentals page directly.
# Filtered alternative kept for reference:
#   'https://www.housingtarget.com/netherlands/housing-rentals?' +
#   'zip_codes=1039;1319&estate_types=2;3;9;20&area_to=69&max_rent=2000'
driver.get('https://www.housingtarget.com/netherlands/housing-rentals/amsterdam')

# Collect every <li> under the element with class "table-ads"
ul_element = driver.find_element(By.CLASS_NAME, 'table-ads')
li_elements = ul_element.find_elements(By.TAG_NAME, 'li')

# Keep only items whose class attribute is empty and that contain at least one <div>
listings = [
    li
    for li in li_elements
    if li.get_attribute("class") == "" and li.find_elements(By.TAG_NAME, "div")
]


In [None]:
# Grab the markup of the element with class "pager" and parse it
pager_markup = driver.find_element(By.CLASS_NAME, "pager").get_attribute("outerHTML")
html = BeautifulSoup(pager_markup, "html.parser")

# href of every link found in the pager
urls = [anchor["href"] for anchor in html.find_all("a")]

# Turn each listing element into a row, then build the table
parsed_listings = list(map(parse_listing_html, listings))
r = pd.DataFrame(
    parsed_listings,
    columns=["link", "rent", "size", "property_type", "location"],
)

    

In [9]:
def listing_spider(root_url, base_url='https://www.housingtarget.com'):
    """Scrape a listings page and every page reachable through its pager.

    Starting at ``root_url``, each page's listings are parsed with
    ``parse_listing_html`` and the links inside the element with class
    ``pager`` are followed depth-first until no unvisited page remains.

    Args:
        root_url: Absolute URL of the first results page.
        base_url: Prefix prepended to the hrefs found in the pager, both for
            the visited check and for building the next URL to open.

    Returns:
        pandas.DataFrame with columns ``link``, ``rent``, ``size``,
        ``property_type`` and ``location``: the per-page frames concatenated
        in visit order.

    Raises:
        Whatever the scrape of ``root_url`` itself raises. Failures on later
        pages are reported with ``print`` and skipped.

    Side effects:
        Prints progress, and writes the result as a pickle under ``outputs/``
        using a file name derived from ``root_url``.
    """
    import os  # local import: ``os`` is not among the notebook-level imports

    visited = set()
    final = []

    # One headless browser for the whole crawl instead of one per page
    chrome_option = Options()
    chrome_option.add_argument("--headless")
    driver = webdriver.Chrome(options=chrome_option)

    def get_listings(url):
        """Scrape one page, store its frame, and return its unvisited pager hrefs."""
        print(f"Visiting {url}")
        visited.add(url)
        driver.get(url)

        ul_element = driver.find_element(By.CLASS_NAME, "table-ads")
        li_elements = ul_element.find_elements(By.TAG_NAME, "li")
        listings = [
            li
            for li in li_elements
            if li.get_attribute("class") == "" and li.find_elements(By.TAG_NAME, "div")
        ]

        parsed_listings = [parse_listing_html(li) for li in listings]
        r = pd.DataFrame(
            parsed_listings, columns=["link", "rent", "size", "property_type", "location"]
        )
        final.append(r)

        pager = BeautifulSoup(
            driver.find_element(By.CLASS_NAME, "pager").get_attribute("outerHTML"),
            "html.parser",
        )
        # href=True skips anchors without a target instead of raising KeyError
        return [
            a["href"]
            for a in pager.find_all("a", href=True)
            if base_url + a["href"] not in visited
        ]

    try:
        # Explicit stack instead of recursion: same depth-first visit order,
        # but the depth no longer grows with the number of pages.
        pending = [root_url]
        while pending:
            url = pending.pop()
            if url in visited:
                continue
            try:
                links = get_listings(url)
            except Exception as e:
                if url == root_url:
                    # Nothing was scraped at all; let the caller see the error
                    raise
                print(f"Can't visit {url}: {e}")
                continue
            # Reversed so the first pager link is popped (visited) first
            pending.extend(base_url + link for link in reversed(links))
    finally:
        # Always release the browser, even when a lookup fails midway
        driver.quit()

    result = pd.concat(final)

    # The raw URL contains "://" and "/" and would point into directories that
    # do not exist, so derive a flat, filesystem-safe file name from it.
    os.makedirs("outputs", exist_ok=True)
    safe_name = re.sub(r"[^A-Za-z0-9._-]+", "_", root_url).strip("_")
    result.to_pickle(os.path.join("outputs", f"{safe_name}.pkl"))
    return result

In [None]:
listing_spider("https://www.housingtarget.com/netherlands/housing-rentals/amsterdam")

Visiting https://www.housingtarget.com/netherlands/housing-rentals/amsterdam
Visiting https://www.housingtarget.com/netherlands/housing-rentals/amsterdam/pageindex2
Visiting https://www.housingtarget.com/netherlands/housing-rentals/amsterdam/pageindex3
Visiting https://www.housingtarget.com/netherlands/housing-rentals/amsterdam/pageindex4
