In [5]:
import os
import time
import requests
import re
import math
import pandas as pd
from dotenv import load_dotenv
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


def open_and_authenticate_salary(keyword, city):
    """Open the StepStone result list in Chrome, wait for a manual login and
    hand the browser cookies over to a ``requests`` session.

    The function opens a visible Chrome window, accepts the cookie banner,
    clicks the "Gehalt anzeigen" button of the first job card (if present)
    and then blocks on ``input()`` until the user confirms the login.

    Args:
        keyword: Search term interpolated into the result-list URL.
        city: Location slug interpolated into the result-list URL.

    Returns:
        A ``requests.Session`` carrying the cookies of the browser session,
        or ``None`` when no job card was found on the result page.
    """
    load_dotenv()  # Load environment variables from .env file

    chrome_options = Options()
    # Do NOT use headless mode if you want to input credentials manually:
    # chrome_options.add_argument("--headless")

    salary_button_xpath = './/button[.//span[contains(text(),"Gehalt anzeigen")]]'

    driver = webdriver.Chrome(options=chrome_options)
    # Everything that touches the browser lives inside try/finally so the
    # Chrome process is closed even if navigation fails or the user aborts
    # the input() prompt.
    try:
        wait = WebDriverWait(driver, 20)

        # Open the job listings page
        url = f"https://www.stepstone.de/jobs/{keyword}/in-{city}?radius=30&searchOrigin=Resultlist_top-search"
        driver.get(url)
        print("Job listings page opened:", url)

        # Accept cookies by finding the cookie button using its ID
        try:
            cookie_button = wait.until(EC.element_to_be_clickable((By.ID, "ccmgt_explicit_accept")))
            # JavaScript click avoids failures when an overlay covers the button.
            driver.execute_script("arguments[0].click();", cookie_button)
            print("Cookies accepted.")
        except Exception as e:
            # Best effort: a missing banner must not stop the login flow.
            print("Cookie button not found or already accepted:", e)

        time.sleep(5)  # Allow time for job listings to load

        # Find the first job card
        job_ads = driver.find_elements(By.XPATH, '//article[contains(@class, "res-sfoyn7")]')
        if not job_ads:
            print("No job listings found on the page.")
            return None
        first_job = job_ads[0]
        print("First job listing found.")

        # Attempt to find and click the "Gehalt anzeigen" (Show salary) button within the first job card
        try:
            salary_buttons = first_job.find_elements(By.XPATH, salary_button_xpath)
            if salary_buttons:
                show_salary_btn = salary_buttons[0]
                wait.until(EC.element_to_be_clickable((By.XPATH, salary_button_xpath)))
                driver.execute_script("arguments[0].click();", show_salary_btn)
                print("Salary display button clicked.")
            else:
                print("Salary display button not found. Skipping this step.")
        except Exception as e:
            # Do not abort; continue to manual authentication.
            print("Error while clicking the 'Gehalt anzeigen' button:", e)

        # Pause to allow the login popup to appear.
        time.sleep(3)

        # At this point a login popup should be visible.
        # Block so that the credentials can be entered manually in the browser.
        input("Please complete the manual login (enter credentials and click 'Jetzt einloggen' in the popup), then press Enter to continue...")
        print("Continuing after manual login input.")

        # Wait a few seconds for data to update (adjust if necessary)
        time.sleep(5)

        # Read the cookies before the browser is closed in the finally clause.
        cookies = driver.get_cookies()
    finally:
        driver.quit()

    # Copy the browser cookies into a requests session for the HTTP crawl.
    session = requests.Session()
    for cookie in cookies:
        session.cookies.set(cookie['name'], cookie['value'], domain=cookie.get('domain'))

    return session


# Salary range such as "45.000 – 60.000" (dot-grouped thousands, hyphen or en dash).
_SALARY_RANGE_RE = re.compile(r'(\d{1,3}(?:\.\d{3})?)\s*[–-]\s*(\d{1,3}(?:\.\d{3})?)')
# Posting age as rendered in the <time> element, e.g. "vor 1 Tag", "vor 3 Tagen".
_POSTED_DAYS_RE = re.compile(r'vor\s+(\d+)\s+Tag(?:en)?', flags=re.IGNORECASE)
# Posting age in weeks, e.g. "vor 2 Wochen".
_POSTED_WEEKS_RE = re.compile(r'vor\s+(\d+)\s+Woche(?:n)?', flags=re.IGNORECASE)

# Browser-like User-Agent sent with every result-page request.
_SEARCH_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/115.0.0.0 Safari/537.36"
    )
}


def _search_url(keyword, city, page):
    """Build the result-list URL; the first page carries no ``page`` parameter."""
    if page == 1:
        return f"https://www.stepstone.de/jobs/{keyword}/in-{city}?radius=30&searchOrigin=Resultlist_top-search"
    return f"https://www.stepstone.de/jobs/{keyword}/in-{city}?radius=30&page={page}&searchOrigin=Resultlist_top-search"


def _nested_text(card, parent_tag, parent_class, child_tag, child_class):
    """Return the stripped text of ``child`` inside ``parent``, or None if either is missing."""
    parent = card.find(parent_tag, class_=parent_class)
    child = parent.find(child_tag, class_=child_class) if parent else None
    return child.get_text(strip=True) if child else None


def _extract_salary_range(card):
    """Return ``(salary_min, salary_max)`` as the raw matched strings, or ``(None, None)``."""
    salary_text = None
    for element in card.find_all("div", class_="res-lgmafx"):
        # Skip the block that carries the home-office label.
        label = element.find("span", class_="res-btchsq")
        if label and "Teilweise Home-Office" in label.get_text():
            continue
        salary_span = element.find("span", class_="res-1fad2gj")
        if salary_span:
            salary_text = salary_span.get_text(strip=True)
            break

    if not salary_text:
        print("Salary element not found.")
        return None, None

    match = _SALARY_RANGE_RE.search(salary_text)
    if match is None:
        print("Salary values not found in text:", salary_text)
        return None, None
    return match.group(1), match.group(2)


def _extract_days_posted(card):
    """Return the posting age in days parsed from the card's <time> text, or None."""
    time_element = card.find("time")
    if not time_element:
        return None
    posted_text = time_element.get_text(strip=True)  # e.g., "vor 1 Tag", "vor 3 Tagen", "vor 2 Wochen"

    match_days = _POSTED_DAYS_RE.search(posted_text)
    if match_days:
        return int(match_days.group(1))
    match_weeks = _POSTED_WEEKS_RE.search(posted_text)
    if match_weeks:
        return int(match_weeks.group(1)) * 7
    return None


def _estimate_max_pages(soup):
    """Derive the page count from the total-results header, or None if it cannot be read."""
    total_count_elem = soup.find("span", class_="res-vurnku at-facet-header-total-results")
    if not total_count_elem:
        return None

    total_text = total_count_elem.get_text(strip=True)
    digits = re.sub(r'[^\d]', '', total_text)
    if not digits:
        # Header present but without a number: keep paging until an empty
        # page instead of crashing on int('').
        print("Could not parse total job count from:", total_text)
        return None

    total_vacancies = int(digits)
    print(f"Total job listings found: {total_vacancies}")
    job_cards_first = soup.find_all("div", class_="res-urswt")
    if not job_cards_first:
        return 1
    max_pages = math.ceil(total_vacancies / len(job_cards_first))
    print(f"Approximately {max_pages} pages will be processed")
    return max_pages


def get_job_listings(session, keyword, city, timeout=30):
    """Crawl the StepStone result pages and collect one dict per job card.

    Pages are requested one after another (one second apart) until a request
    fails, a page has no job cards, or the page count estimated from the first
    page is reached. Rows collected before a failure are still returned.

    Args:
        session: Object with a ``requests``-style ``get(url, headers=..., timeout=...)``.
        keyword: Search term interpolated into the result-list URL.
        city: Location slug for the URL; also the fallback for a card without a location.
        timeout: Seconds passed to ``session.get`` so a stalled connection cannot hang the crawl.

    Returns:
        A list of dicts with the keys ``id``, ``title``, ``company``, ``city``,
        ``salary_min``, ``salary_max`` (raw strings such as ``"45.000"`` or
        ``None``) and ``days_posted`` (int or ``None``).
    """
    jobs = []
    page = 1
    max_pages = None

    while True:
        url = _search_url(keyword, city, page)
        print(f"Processing page {page} at:\n {url}")
        try:
            response = session.get(url, headers=_SEARCH_HEADERS, timeout=timeout)
        except requests.RequestException as exc:
            # Treat network failures like a bad status: stop and keep what we have.
            print(f"Request failed: {exc}")
            break
        if response.status_code != 200:
            print(f"Request error, status code: {response.status_code}")
            break

        soup = BeautifulSoup(response.content, "html.parser")

        if page == 1:
            max_pages = _estimate_max_pages(soup)

        job_cards = soup.find_all("article", class_="res-sfoyn7")
        if not job_cards:
            print(f"No job listings found on page {page}. Ending collection.")
            break

        for card in job_cards:
            job_id_attr = card.get("id")
            job_id = job_id_attr.replace("job-item-", "") if job_id_attr else None

            location_text = _nested_text(card, "div", "res-qchjmw", "span", "res-btchsq")
            salary_min, salary_max = _extract_salary_range(card)

            jobs.append({
                "id": job_id,
                "title": _nested_text(card, "h2", "res-1tassqi", "div", "res-nehv70"),
                "company": _nested_text(card, "div", "res-1r68twq", "span", "res-btchsq"),
                # Fall back to the searched city only when the card has no location element.
                "city": location_text if location_text is not None else city,
                "salary_min": salary_min,
                "salary_max": salary_max,
                "days_posted": _extract_days_posted(card),
            })

        print(f"Found {len(job_cards)} job listings on page {page}.")
        if max_pages is not None and page >= max_pages:
            print("Reached the maximum number of pages, ending collection.")
            break

        page += 1
        time.sleep(1)  # Pause between page requests.

    return jobs


if __name__ == "__main__":
    # Script entry point: log in through the browser, crawl the result
    # pages and store the collected rows as a CSV file.
    job_title = "SQL"
    location = "deutschland"

    session = open_and_authenticate_salary(job_title, location)
    if session is None:
        print("Authentication failed. Please ensure you completed manual login.")
        # SystemExit works even where the site-provided exit() helper is absent.
        raise SystemExit(1)

    results = get_job_listings(session, job_title, location)

    filename = f"data/vacancies_{job_title}_{location}.csv"
    # Create the target directory up front; to_csv raises OSError for a
    # missing directory, which would discard the whole scrape.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    df = pd.DataFrame(results)
    # utf-8-sig writes a BOM at the start of the file.
    df.to_csv(filename, index=False, encoding="utf-8-sig")
    print("Data successfully saved to", filename)


Job listings page opened: https://www.stepstone.de/jobs/SQL/in-deutschland?radius=30&searchOrigin=Resultlist_top-search
Cookies accepted.
First job listing found.
Salary display button clicked.
Continuing after manual login input.
Processing page 1 at:
 https://www.stepstone.de/jobs/SQL/in-deutschland?radius=30&searchOrigin=Resultlist_top-search
Total job listings found: 2005
Approximately 81 pages will be processed
Salary element not found.
Salary element not found.
Found 25 job listings on page 1.
Processing page 2 at:
 https://www.stepstone.de/jobs/SQL/in-deutschland?radius=30&page=2&searchOrigin=Resultlist_top-search
Salary element not found.
Salary element not found.
Salary element not found.
Salary element not found.
Salary element not found.
Found 25 job listings on page 2.
Processing page 3 at:
 https://www.stepstone.de/jobs/SQL/in-deutschland?radius=30&page=3&searchOrigin=Resultlist_top-search
Salary element not found.
Salary element not found.
Salary element not found.
Salary