In [None]:
import html
import os
import re
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import quote_plus

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# Shared headless Chrome session; ChromeDriverManager().install() provides the ChromeDriver binary
options = webdriver.ChromeOptions()
for _chrome_flag in (
    "--headless",  # Run without opening the browser window
    "--no-sandbox",
    "--disable-dev-shm-usage",
):
    options.add_argument(_chrome_flag)

_chromedriver_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=_chromedriver_service, options=options)

In [None]:
# 🔧 Config
SAVE_HTML = False  # When True, save_html() writes the HTML of each search result page to disk
WAIT_TIME = 0.5   # Pause in seconds passed to time.sleep() between scrolls, page loads and retries

In [None]:
# 📥 Save HTML for analysis (if flag is enabled)
def save_html(page_source, page):
    """Dump a page's HTML to disk for offline inspection.

    Does nothing unless the module-level ``SAVE_HTML`` flag is truthy.

    Args:
        page_source: HTML text to write.
        page: Page index; used as-is in the file name and shown as
            ``page + 1`` in the confirmation message.
    """
    if not SAVE_HTML:
        return

    dump_dir = os.path.expanduser("~/pet-projects/jupyter-notebooks/data/hh_pages")
    os.makedirs(dump_dir, exist_ok=True)
    target = os.path.join(dump_dir, f"page_{page}.html")

    with open(target, "w", encoding="utf-8") as fh:
        fh.write(page_source)

    print(f"✅ HTML of page {page + 1} saved to {target}")

# 🔄 Function to scroll through the page to load all vacancies
def scroll_to_load_all_vacancies(driver, max_idle_scrolls=10):
    """Scroll to the bottom of the page repeatedly until its height stops growing.

    Args:
        driver: Selenium WebDriver with the search result page already loaded.
        max_idle_scrolls: Number of consecutive scrolls that leave
            ``document.body.scrollHeight`` unchanged before scrolling stops.
            Defaults to 10, the value that used to be hard-coded.

    Returns:
        None. The final page height is printed.
    """
    vacancy_locator = (By.CSS_SELECTOR, 'div[data-qa^="vacancy-serp__vacancy"]')
    last_height = driver.execute_script("return document.body.scrollHeight")
    idle_scrolls = 0

    while idle_scrolls < max_idle_scrolls:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(WAIT_TIME)

        # Best-effort wait for vacancy blocks to be present in the DOM.
        # NOTE(review): the condition is satisfied by any matching element, including
        # ones loaded before this scroll — confirm whether a stricter wait is wanted.
        try:
            WebDriverWait(driver, 5).until(EC.presence_of_element_located(vacancy_locator))
        except Exception:
            # Still best-effort, but no longer a bare ``except:`` so that
            # KeyboardInterrupt / SystemExit can stop a long-running scroll.
            pass

        new_height = driver.execute_script("return document.body.scrollHeight")

        # Count consecutive scrolls without growth; any growth resets the counter.
        idle_scrolls = idle_scrolls + 1 if new_height == last_height else 0
        last_height = new_height

    print(f"📜 Scrolling completed. Page height: {last_height}")

# 📊 Get vacancies from one search result page
def get_vacancy_links_and_companies(driver, keyword, page, vacancies_per_page):
    """Load one hh.ru search result page and collect basic data for each vacancy block.

    Args:
        driver: Selenium WebDriver used to load and scroll the page.
        keyword: Search text. It is URL-encoded before being placed in the query
            string, so values such as "C++" or "R&D" no longer corrupt the URL.
        page: Search result page index (shown as ``page + 1`` in log messages).
        vacancies_per_page: Value for the ``items_on_page`` query parameter.

    Returns:
        list[dict]: One dict per vacancy block with the keys "Link", "Company",
        "Job Title" and "Address". Blocks whose link contains "adsrv.hh.ru"
        are skipped.
    """
    url = f"https://hh.ru/search/vacancy?text={quote_plus(keyword)}&search_field=name&excluded_text=&salary=&currency_code=RUR&experience=doesNotMatter&order_by=relevance&search_period=0&items_on_page={vacancies_per_page}&L_save_area=true&page={page}"
    print(f"\n🔗 Loading page {page + 1}: {url}")

    driver.get(url)
    try:
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, 'div[data-qa^="vacancy-serp__vacancy"]'))
        )
    except Exception:
        # Keep going on a timeout or driver error (the page is parsed anyway),
        # but let KeyboardInterrupt / SystemExit propagate.
        print(f"❗ Vacancies not loaded on page {page + 1}")
    scroll_to_load_all_vacancies(driver)
    save_html(driver.page_source, page)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    results = []

    vacancy_blocks = soup.select('div[data-qa^="vacancy-serp__vacancy"]')
    print(f"📦 Found {len(vacancy_blocks)} vacancy blocks on page {page + 1}")

    for vacancy in vacancy_blocks:
        link_tag = vacancy.select_one('a[data-qa="serp-item__title"]')
        company_tag = vacancy.find('a', {'data-qa': 'vacancy-serp__vacancy-employer'})
        title_tag = vacancy.find('span', {'data-qa': 'serp-item__title-text'})
        address_tag = vacancy.find('span', {'data-qa': 'vacancy-serp__vacancy-address'})

        # Relative links are made absolute; a block without a title link keeps "".
        link = link_tag['href'] if link_tag else ""
        if link and not link.startswith('http'):
            link = f"https://hh.ru{link}"

        company = company_tag.get_text(strip=True) if company_tag else "Not specified"
        title = title_tag.get_text(strip=True) if title_tag else "Title not specified"
        address = address_tag.get_text(strip=True) if address_tag else "Address not specified"

        # Skip links containing "adsrv.hh.ru"
        if "adsrv.hh.ru" not in link:
            results.append({
                "Link": link,
                "Company": company,
                "Job Title": title,
                "Address": address
            })

    return results
def extract_salary_info(salary_span):
    """
    Extract salary information from a salary element.

    Args:
        salary_span: Either an object exposing ``get_text(separator=..., strip=...)``
            (e.g. a BeautifulSoup tag) or a string with plain text / an HTML
            fragment. Falsy values (``None``, ``""``) mean "no salary".

    Returns:
        str: "start-end currency" when exactly two numbers are found,
        "amount currency" when exactly one is found, otherwise "неизвестно".
        Digit groups are joined without separators (e.g. "400000 ₽").
    """
    if not salary_span:
        return "неизвестно"

    try:
        if isinstance(salary_span, str):
            # Plain strings have no get_text(): drop HTML comments and tags,
            # then decode entities such as &nbsp;.
            without_markup = re.sub(r'<!--.*?-->|<[^>]+>', ' ', salary_span, flags=re.DOTALL)
            raw_salary = html.unescape(without_markup).strip()
        else:
            # Take the whole text of the element
            raw_salary = salary_span.get_text(separator=' ', strip=True)

        # Find every number; digit groups may be separated by whitespace.
        # \s also matches non-breaking / narrow spaces, so strip *all* whitespace
        # from each match (removing only ' ' left those separators inside the number).
        numbers = [
            re.sub(r'\s+', '', number)
            for number in re.findall(r'\d+(?:\s*\d+)*', raw_salary)
        ]

        # Find the currency sign (₽, $, €); default to roubles when none is present
        currency = re.search(r'[$₽€]', raw_salary)
        currency = currency.group(0) if currency else '₽'

        if len(numbers) == 2:
            # Two numbers are treated as a range
            return f"{numbers[0]}-{numbers[1]} {currency}"
        elif len(numbers) == 1:
            # A single number is a single amount
            return f"{numbers[0]} {currency}"
        else:
            return "неизвестно"

    except Exception as e:
        print(f"❌ Ошибка при обработке зарплаты: {e}")
        return "неизвестно"
        
# ⚡ Function to obtain vacancy details (job description and salary info) using a separate driver instance
# Default (description, salary) pair returned when details cannot be obtained.
_DETAILS_UNAVAILABLE = ("Description not available", "неизвестно")


def _fetch_vacancy_details(link):
    """Load one vacancy page in its own Chrome instance and parse it.

    Unlike the public wrappers, this helper lets exceptions propagate so that
    callers can decide whether to retry or fall back to defaults.

    Args:
        link: URL of the vacancy page.

    Returns:
        tuple[str, str]: (job description text, salary info string).
    """
    driver_detail = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    try:
        driver_detail.get(link)
        time.sleep(WAIT_TIME)
        page_source = driver_detail.page_source
    finally:
        # Always close the browser, even when loading fails; otherwise every
        # failed vacancy would leave a Chrome process behind.
        driver_detail.quit()

    soup = BeautifulSoup(page_source, 'html.parser')

    # Extract job description from the first matching block (by class, then by data-qa)
    description = "Description not available"
    for cls in ('vacancy-branded-user-content', 'vacancy-description'):
        description_tag = soup.find('div', class_=cls) or soup.find('div', {'data-qa': cls})
        if description_tag:
            description = description_tag.get_text(separator=' ', strip=True)
            break

    # Extract salary information; only the span with
    # data-qa="vacancy-salary-compensation-type-net" is read.
    salary_info = "неизвестно"
    salary_div = soup.find('div', {'data-qa': 'vacancy-salary'})
    if salary_div:
        salary_span = salary_div.find('span', {'data-qa': 'vacancy-salary-compensation-type-net'})
        salary_info = extract_salary_info(salary_span)

    return description, salary_info


def get_vacancy_details(link, index, total):
    """Fetch description and salary for one vacancy; never raises.

    Args:
        link: URL of the vacancy page.
        index: Position of this vacancy, used only in the progress message.
        total: Total number of vacancies, used only in the progress message.

    Returns:
        tuple[str, str]: (description, salary info). On any error, or when
        ``link`` is empty, ("Description not available", "неизвестно").
    """
    if not link:
        # Nothing to load; do not start a browser for an empty URL.
        print(f"⚠️  [{index}/{total}] Empty link, details skipped")
        return _DETAILS_UNAVAILABLE

    print(f"➡️  [{index}/{total}] Loading details: {link}")
    try:
        return _fetch_vacancy_details(link)
    except Exception as e:
        print(f"❌ Error on {link}: {e}")
        return _DETAILS_UNAVAILABLE


def get_vacancy_details_with_retry(link, index, total, max_attempts=5):
    """
    Fetch vacancy details, retrying when loading or parsing raises.

    The fetch helper is called directly (rather than get_vacancy_details, which
    swallows every exception), so that a failure actually triggers a new attempt.

    Args:
        link: URL of the vacancy page.
        index: Position of this vacancy, used only in progress messages.
        total: Total number of vacancies, used only in progress messages.
        max_attempts: Maximum number of attempts before giving up.

    Returns:
        tuple[str, str]: (description, salary info), or
        ("Description not available", "неизвестно") when ``link`` is empty or
        all attempts fail.
    """
    if not link:
        # An empty URL can never succeed; skip the retries entirely.
        print(f"⚠️  [{index}/{total}] Empty link, details skipped")
        return _DETAILS_UNAVAILABLE

    for attempt in range(1, max_attempts + 1):
        print(f"➡️  [{index}/{total}] Loading details: {link}")
        try:
            return _fetch_vacancy_details(link)
        except Exception as e:
            print(f"❌ Ошибка при получении данных для {link} (попытка {attempt}/{max_attempts}): {e}")
            if attempt < max_attempts:
                # Short pause before the next attempt
                time.sleep(WAIT_TIME)
    print(f"⚠️ Для {link} превышено количество попыток ({max_attempts}). Возвращаем значения по умолчанию.")
    return _DETAILS_UNAVAILABLE


# 🔍 Main scraping function that gathers vacancy data and then obtains detailed descriptions and salary info in parallel
def scrape_hh_vacancy_descriptions(keyword):
    """Collect vacancies for ``keyword`` from hh.ru, then load details in parallel.

    Uses the module-level ``driver`` for the search result pages; vacancy pages
    are loaded by ``get_vacancy_details_with_retry`` in a thread pool.

    Args:
        keyword: Search text. It is URL-encoded for the initial request and
            passed unchanged to ``get_vacancy_links_and_companies``.

    Returns:
        pandas.DataFrame: One row per vacancy with the columns produced by
        ``get_vacancy_links_and_companies`` plus "Job Description" and
        "Salary Info".
    """
    vacancies_per_page = 100
    driver.get(f"https://hh.ru/search/vacancy?text={quote_plus(keyword)}&search_field=name&excluded_text=&salary=&currency_code=RUR&experience=doesNotMatter&order_by=relevance&search_period=0&items_on_page={vacancies_per_page}&L_save_area=true")
    scroll_to_load_all_vacancies(driver)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    total_vacancies_tag = soup.select_one('div[data-qa="vacancies-search-header"] h1[data-qa="title"]')
    header_text = total_vacancies_tag.get_text(strip=True) if total_vacancies_tag else ""

    # Read only the first number of the header (digit groups may be separated by
    # whitespace). Joining *all* digits would also pick up digits from the quoted
    # keyword (e.g. "1C"), and int('') raised when the header had no digits at all.
    # NOTE(review): assumes the vacancy count is the first number in the header — confirm.
    count_match = re.search(r'\d[\d\s]*', header_text)
    total_vacancies = int(re.sub(r'\s+', '', count_match.group(0))) if count_match else 0

    print(f"\n🔍 Total vacancies found: {total_vacancies}")
    # Ceiling division: a partially filled last page still counts as a page
    total_pages = (total_vacancies + vacancies_per_page - 1) // vacancies_per_page
    print(f"📄 Total pages: {total_pages}")

    all_vacancies = []
    for page in range(total_pages):
        vacancies = get_vacancy_links_and_companies(driver, keyword, page, vacancies_per_page)
        all_vacancies.extend(vacancies)
        time.sleep(WAIT_TIME)

    print(f"\n📦 Total vacancies to process: {len(all_vacancies)}")

    # Parallel extraction of vacancy details
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_vacancy = {
            executor.submit(get_vacancy_details_with_retry, vacancy['Link'], i + 1, len(all_vacancies)): vacancy
            for i, vacancy in enumerate(all_vacancies)
        }
        # No sleep while collecting results: every task is already submitted, so
        # pausing here would only delay bookkeeping without throttling requests.
        for future in tqdm(as_completed(future_to_vacancy), total=len(future_to_vacancy), desc="📥 Loading vacancy details", unit="vacancy"):
            description, salary_info = future.result()
            vacancy = future_to_vacancy[future]
            vacancy['Job Description'] = description
            vacancy['Salary Info'] = salary_info

    return pd.DataFrame(all_vacancies)

In [None]:
vacancy_name = "TeamLead"
try:
    df_vacancies = scrape_hh_vacancy_descriptions(vacancy_name)
finally:
    # Close the driver even when scraping fails, so the headless Chrome
    # session started in the setup cell is not left running.
    driver.quit()

df_vacancies.info()

In [None]:
# Preview the first rows whose address is exactly 'Новосибирск'.
# A further condition (e.g. on 'Job Title' via .str.contains("VK", na=False))
# can be combined with this mask using `&`.
df_vacancies.loc[df_vacancies['Address'].eq('Новосибирск')].head()

In [None]:
# Build the CSV file name from the search keyword (lower-cased, spaces -> underscores)
formatted_vacancy_name = vacancy_name.lower().replace(" ", "_")
file_path = os.path.expanduser(f"~/pet-projects/jupyter-notebooks/vacancies_{formatted_vacancy_name}.csv")
# Create the target directory first; to_csv() does not create missing folders
os.makedirs(os.path.dirname(file_path), exist_ok=True)
df_vacancies.to_csv(file_path, index=False)

In [None]:
# Smoke check of extract_salary_info on a captured salary snippet.
# The function reads its argument through .get_text(), so the raw HTML is parsed
# into a BeautifulSoup object first instead of being passed as a plain string.
salary_snippet = BeautifulSoup('от <!-- -->400&nbsp;000<!-- --> <!-- -->₽<!-- --> за&nbsp;месяц<!-- -->, <span class="vacancy-salary-compensation-type"> <!-- -->на руки', 'html.parser')
print(extract_salary_info(salary_snippet))