In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
from selenium import webdriver
from tqdm import tqdm 
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# Headless Chrome session; ChromeDriver is installed automatically by
# webdriver_manager before the browser is started.
options = webdriver.ChromeOptions()
for _flag in (
    "--headless",  # Run without opening the browser window
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    options.add_argument(_flag)

_driver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=Service(_driver_path), options=options)

In [None]:
# 🔧 Config
# Pause in seconds handed to time.sleep() between scroll steps and page loads
# (a fixed delay, not a timeout).
WAIT_TIME: float = 0.5
# When True, save_html() writes the HTML of every search page to disk.
SAVE_HTML: bool = True

In [None]:
# 📥 Save HTML for analysis (if flag is enabled)
def save_html(page_source, page):
    """Dump a page's HTML into the local ``hh_pages`` folder.

    Does nothing unless the module-level ``SAVE_HTML`` flag is truthy.

    Args:
        page_source: HTML text to write.
        page: Page number as passed by the caller; used unchanged in the
            file name (``page_{page}.html``) and as ``page + 1`` in the
            printed message.
    """
    if not SAVE_HTML:
        return

    dump_dir = os.path.expanduser("~/pet-projects/jupyter-notebooks/data/hh_pages")
    os.makedirs(dump_dir, exist_ok=True)

    target = os.path.join(dump_dir, f"page_{page}.html")
    with open(target, "w", encoding="utf-8") as out:
        out.write(page_source)

    print(f"✅ HTML of page {page + 1} saved to {target}")

# 🔄 Scroll through the page to load all vacancies
def scroll_to_load_all_vacancies(driver, max_idle_scrolls=10):
    """Scroll to the bottom of the page until its height stops growing.

    After every scroll the function sleeps for ``WAIT_TIME`` seconds, waits
    up to 5 seconds for a vacancy block to be present, and compares the
    document height with the previous one. The loop ends once the height
    has stayed the same for ``max_idle_scrolls`` consecutive scrolls.

    Args:
        driver: Selenium WebDriver with the search page already loaded.
        max_idle_scrolls: Number of consecutive scrolls without a height
            change after which scrolling stops. Defaults to 10.
    """
    # Local import: the notebook's import cell does not bring this name in.
    from selenium.common.exceptions import TimeoutException

    last_height = driver.execute_script("return document.body.scrollHeight")
    idle_scrolls = 0

    while idle_scrolls < max_idle_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(WAIT_TIME)

        # Give the DOM a chance to contain vacancy blocks before measuring.
        try:
            WebDriverWait(driver, 5).until(
                EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-qa^="vacancy-serp__vacancy"]'))
            )
        except TimeoutException:
            # Best effort: no vacancy block showed up in time. Only the
            # timeout is ignored so that Ctrl+C / kernel interrupts and
            # unrelated driver failures are no longer silently swallowed.
            pass

        new_height = driver.execute_script("return document.body.scrollHeight")

        if new_height == last_height:
            idle_scrolls += 1
        else:
            idle_scrolls = 0  # Page grew, so start counting idle scrolls again

        last_height = new_height

    print(f"📜 Scrolling completed. Page height: {last_height}")

# 📊 Get vacancies from one page
def get_vacancy_links_and_companies(driver, keyword, page):
    """Load one hh.ru search result page and extract its vacancies.

    Args:
        driver: Selenium WebDriver used to load the page.
        keyword: Search text; it is URL-encoded before being placed in the
            query string.
        page: Zero-based page index sent as the ``page`` query parameter
            (messages show it as ``page + 1``).

    Returns:
        A list of dicts with the keys ``"Link"``, ``"Company"``,
        ``"Job Title"`` and ``"Address"``. Blocks whose link contains
        ``adsrv.hh.ru`` are left out.
    """
    # Local imports: the notebook's import cell does not bring these names in.
    from urllib.parse import quote_plus
    from selenium.common.exceptions import TimeoutException

    # Encode the keyword so spaces, '&', '#', '+' or non-ASCII text cannot
    # break or truncate the query string.
    url = (
        f"https://hh.ru/search/vacancy?text={quote_plus(str(keyword))}"
        "&search_field=name&excluded_text=&salary=&currency_code=RUR"
        "&experience=doesNotMatter&order_by=relevance&search_period=0"
        f"&items_on_page=50&L_save_area=true&page={page}"
    )
    print(f"\n🔗 Loading page {page + 1}: {url}")

    driver.get(url)
    # Wait until at least one vacancy appears on the page
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-qa^="vacancy-serp__vacancy"]'))
        )
    except TimeoutException:
        # Only the timeout is reported and tolerated; interrupts and other
        # driver errors propagate instead of being hidden.
        print(f"❗ Vacancies not loaded on page {page + 1}")
    scroll_to_load_all_vacancies(driver)

    # Read the page source once so the saved file and the parsed document
    # are guaranteed to be the same snapshot.
    html = driver.page_source
    save_html(html, page)

    soup = BeautifulSoup(html, 'html.parser')
    results = []

    vacancy_blocks = soup.select('div[data-qa^="vacancy-serp__vacancy"]')
    print(f"📦 Found {len(vacancy_blocks)} vacancy blocks on page {page + 1}")

    for vacancy in vacancy_blocks:
        link_tag = vacancy.select_one('a[data-qa="serp-item__title"]')
        company_tag = vacancy.find('a', {'data-qa': 'vacancy-serp__vacancy-employer'})
        title_tag = vacancy.find('span', {'data-qa': 'serp-item__title-text'})  # Job title
        address_tag = vacancy.find('span', {'data-qa': 'vacancy-serp__vacancy-address'})  # Address

        link = link_tag['href'] if link_tag else ""
        # Relative hrefs are turned into absolute hh.ru URLs.
        if link and not link.startswith('http'):
            link = f"https://hh.ru{link}"

        company = company_tag.get_text(strip=True) if company_tag else "Not specified"
        title = title_tag.get_text(strip=True) if title_tag else "Title not specified"
        address = address_tag.get_text(strip=True) if address_tag else "Address not specified"

        # NOTE(review): blocks without a title link are still returned with
        # an empty "Link" — confirm whether they should be skipped.
        if "adsrv.hh.ru" not in link:
            results.append({
                "Link": link,
                "Company": company,
                "Job Title": title,
                "Address": address
            })

    return results

# 📄 Get job description
def get_vacancy_description(driver, link, index, total):
    """Open a vacancy page and return the text of its description.

    Each candidate name is looked up first as a ``div`` CSS class and then
    as a ``div`` ``data-qa`` attribute; the first hit wins. Any exception
    raised while loading or parsing is printed and not re-raised.

    Args:
        driver: Selenium WebDriver used to load the page.
        link: URL of the vacancy page.
        index: Position of this vacancy, used only in the progress message.
        total: Total number of vacancies, used only in the progress message.

    Returns:
        The description text with fragments joined by single spaces, or
        ``"Description not available"`` when nothing matched or an error
        occurred.
    """
    print(f"➡️  [{index}/{total}] Loading description: {link}")

    candidate_names = (
        'g-user-content',
        'vacancy-branded-user-content',
        'vacancy-description',
    )

    try:
        driver.get(link)
        time.sleep(WAIT_TIME)
        page = BeautifulSoup(driver.page_source, 'html.parser')

        for name in candidate_names:
            node = page.find('div', class_=name)
            if not node:
                # Fall back to the same name used as a data-qa attribute.
                node = page.find('div', {'data-qa': name})
            if node:
                return node.get_text(separator=' ', strip=True)
    except Exception as err:
        print(f"❌ Error loading description for {link}: {err}")

    return "Description not available"

# 🔍 Main function to scrape data
def _parse_total_vacancies(header_text):
    """Return the vacancy count found in the search header text, or 0."""
    import re

    # Take only the FIRST run of digits (thousands may be separated by
    # ordinary or non-breaking spaces). Joining every digit in the header
    # would also pick up digits from the quoted keyword, e.g. "1C".
    # NOTE(review): assumes the count precedes the keyword in the header —
    # confirm against the live page.
    match = re.search(r"\d[\d\s]*", header_text)
    if match is None:
        # Header without any number: report zero instead of int('') failing.
        return 0
    return int("".join(match.group().split()))


def scrape_hh_vacancy_descriptions(keyword):
    """Scrape all hh.ru vacancies whose title matches ``keyword``.

    Uses the module-level ``driver``. The function reads the total number
    of vacancies from the search header, walks every result page (50 items
    per page) and then opens each vacancy to fetch its description.

    Args:
        keyword: Search text; it is URL-encoded before being placed in the
            query string.

    Returns:
        A pandas DataFrame with the columns ``Link``, ``Company``,
        ``Job Title``, ``Address`` and ``Job Description``. The columns are
        present even when no vacancy was found.
    """
    # Local import: the notebook's import cell does not bring this name in.
    from urllib.parse import quote_plus

    vacancies_per_page = 50
    driver.get(
        f"https://hh.ru/search/vacancy?text={quote_plus(str(keyword))}"
        "&search_field=name&excluded_text=&salary=&currency_code=RUR"
        "&experience=doesNotMatter&order_by=relevance&search_period=0"
        f"&items_on_page={vacancies_per_page}&L_save_area=true"
    )
    scroll_to_load_all_vacancies(driver)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    total_vacancies_tag = soup.select_one('div[data-qa="vacancies-search-header"] h1[data-qa="title"]')
    if total_vacancies_tag:
        total_vacancies = _parse_total_vacancies(total_vacancies_tag.get_text(strip=True))
    else:
        total_vacancies = 0

    print(f"\n🔍 Total vacancies found: {total_vacancies}")

    # Ceiling division: a partially filled last page still counts as a page.
    total_pages = (total_vacancies + vacancies_per_page - 1) // vacancies_per_page
    print(f"📄 Total pages: {total_pages}")

    all_vacancies = []
    for page in range(total_pages):
        all_vacancies.extend(get_vacancy_links_and_companies(driver, keyword, page))
        time.sleep(WAIT_TIME)

    total_to_process = len(all_vacancies)
    print(f"\n📦 Total vacancies to process: {total_to_process}")

    # Progress bar for loading descriptions
    progress = tqdm(all_vacancies, desc="📥 Loading descriptions", unit="vacancy")
    for position, vacancy in enumerate(progress, start=1):
        vacancy['Job Description'] = get_vacancy_description(
            driver, vacancy['Link'], position, total_to_process
        )
        time.sleep(WAIT_TIME)

    # Explicit columns keep the schema stable when nothing was scraped.
    return pd.DataFrame(
        all_vacancies,
        columns=["Link", "Company", "Job Title", "Address", "Job Description"],
    )

# 🚀 Run
try:
    df_vacancies = scrape_hh_vacancy_descriptions("CPO")
    df_vacancies.info()
finally:
    # Close the driver even when scraping fails, so the headless Chrome
    # process is not left running in the background.
    driver.quit()

In [None]:
# Preview the scraped data: at most the first 100 rows of the DataFrame.
df_vacancies.head(100)

In [None]:
# Export the scraped vacancies to CSV (the DataFrame index is not written).
file_path = os.path.expanduser("~/pet-projects/jupyter-notebooks/vacancies_cpo.csv")
# to_csv does not create missing folders; make sure the target directory
# exists so a finished scrape is not lost to a missing-directory error.
os.makedirs(os.path.dirname(file_path), exist_ok=True)
df_vacancies.to_csv(file_path, index=False)