In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Unified links collector for hh.ru

- STEP 1: first_pass_collect_links()  – initial collection, from page 0
- STEP 1b: second_pass_extend_links() – extend existing queries, with control of start page
"""

import os
import csv
import time
from datetime import datetime
from urllib.parse import quote_plus, urljoin

import requests
from bs4 import BeautifulSoup

# =====================================================================
# CONFIG: MAIN QUERY LIST (used for FIRST PASS)
# =====================================================================

# Search phrases used by first_pass_collect_links() when no explicit
# `queries` argument is given. Each string is sent as the `text` parameter
# of the search URL.
QUERIES = [
    # Python development
    "Python разработчик",
    "Python developer",
    "Backend Python",
    "Fullstack Python",
    "Django разработчик",
    "Flask разработчик",
    "FastAPI разработчик",

    # Frontend development
    "Frontend разработчик",
    "React разработчик",
    "Vue разработчик",
    "Angular разработчик",
    "JavaScript разработчик",
    "TypeScript разработчик",
    "HTML/CSS верстальщик",

    # Mobile development
    "iOS разработчик",
    "Android разработчик",
    "React Native разработчик",
    "Flutter разработчик",
    "Mobile developer",

    # Backend development (other languages)
    "Java разработчик",
    "C# разработчик",
    ".NET разработчик",
    "PHP разработчик",
    "Go разработчик",
    "Golang разработчик",
    "Node.js разработчик",
    "Ruby разработчик",
    "Scala разработчик",

    # Data Science & ML
    "Data Scientist",
    "Machine Learning Engineer",
    "ML Engineer",
    "Deep Learning Engineer",
    "AI Engineer",
    "Computer Vision Engineer",
    "NLP Engineer",
    "MLOps Engineer",

    # Data Engineering & Analytics
    "Data Engineer",
    "Data Analyst",
    "Аналитик данных",
    "BI Analyst",
    "Business Intelligence",
    "Big Data Engineer",
    "ETL разработчик",

    # DevOps & Infrastructure
    "DevOps Engineer",
    "SRE",
    "Site Reliability Engineer",
    "Cloud Engineer",
    "Kubernetes Engineer",
    "System Administrator",
    "Infrastructure Engineer",

    # QA & Testing
    "QA Engineer",
    "Тестировщик",
    "Automation QA",
    "Manual QA",
    "Performance Engineer",
    "Test Engineer",

    # Security
    "Information Security",
    "Security Engineer",
    "Penetration Tester",
    "Security Analyst",
    "Cybersecurity",

    # Product & Project Management
    "Product Manager",
    "Project Manager IT",
    "Technical Project Manager",
    "Product Owner",
    "Scrum Master",
    "Agile Coach",

    # Design
    "UI/UX Designer",
    "Product Designer",
    "Graphic Designer IT",
    "Motion Designer",

    # Architecture & Leadership
    "Solution Architect",
    "Software Architect",
    "Technical Lead",
    "Team Lead",
    "CTO",
    "Head of Engineering",

    # Game Development
    "Game Developer",
    "Unity Developer",
    "Unreal Engine Developer",
    "Game Designer",

    # Specialized roles
    "Blockchain Developer",
    "Smart Contract Developer",
    "Embedded Developer",
    "IoT Developer",
    "Firmware Engineer",

    # Junior positions
    "Junior Python",
    "Junior Java",
    "Junior Frontend",
    "Junior Backend",
    "Junior Developer",
    "Стажер программист",
]

# =====================================================================
# CONFIG: QUERIES FOR EXTENSION (used for SECOND PASS)
# (this is your extended list from the old second cell)
# =====================================================================

# Search phrases used by second_pass_extend_links() when no explicit
# `queries` argument is given.
# NOTE(review): the first 92 entries repeat QUERIES verbatim and in the same
# order; consider deriving this list from QUERIES to avoid the two drifting
# apart — confirm that the coupling is wanted before changing it.
EXISTING_QUERIES_TO_EXTEND = [
    "Python разработчик",
    "Python developer",
    "Backend Python",
    "Fullstack Python",
    "Django разработчик",
    "Flask разработчик",
    "FastAPI разработчик",
    "Frontend разработчик",
    "React разработчик",
    "Vue разработчик",
    "Angular разработчик",
    "JavaScript разработчик",
    "TypeScript разработчик",
    "HTML/CSS верстальщик",
    "iOS разработчик",
    "Android разработчик",
    "React Native разработчик",
    "Flutter разработчик",
    "Mobile developer",
    "Java разработчик",
    "C# разработчик",
    ".NET разработчик",
    "PHP разработчик",
    "Go разработчик",
    "Golang разработчик",
    "Node.js разработчик",
    "Ruby разработчик",
    "Scala разработчик",
    "Data Scientist",
    "Machine Learning Engineer",
    "ML Engineer",
    "Deep Learning Engineer",
    "AI Engineer",
    "Computer Vision Engineer",
    "NLP Engineer",
    "MLOps Engineer",
    "Data Engineer",
    "Data Analyst",
    "Аналитик данных",
    "BI Analyst",
    "Business Intelligence",
    "Big Data Engineer",
    "ETL разработчик",
    "DevOps Engineer",
    "SRE",
    "Site Reliability Engineer",
    "Cloud Engineer",
    "Kubernetes Engineer",
    "System Administrator",
    "Infrastructure Engineer",
    "QA Engineer",
    "Тестировщик",
    "Automation QA",
    "Manual QA",
    "Performance Engineer",
    "Test Engineer",
    "Information Security",
    "Security Engineer",
    "Penetration Tester",
    "Security Analyst",
    "Cybersecurity",
    "Product Manager",
    "Project Manager IT",
    "Technical Project Manager",
    "Product Owner",
    "Scrum Master",
    "Agile Coach",
    "UI/UX Designer",
    "Product Designer",
    "Graphic Designer IT",
    "Motion Designer",
    "Solution Architect",
    "Software Architect",
    "Technical Lead",
    "Team Lead",
    "CTO",
    "Head of Engineering",
    "Game Developer",
    "Unity Developer",
    "Unreal Engine Developer",
    "Game Designer",
    "Blockchain Developer",
    "Smart Contract Developer",
    "Embedded Developer",
    "IoT Developer",
    "Firmware Engineer",
    "Junior Python",
    "Junior Java",
    "Junior Frontend",
    "Junior Backend",
    "Junior Developer",
    "Стажер программист",

    # Additional variations carried over from the earlier code
    # -- Python
    "Python backend",
    "Python backend developer",
    "Senior Python",
    "Middle Python",
    "Python engineer",

    # -- Frontend / JavaScript
    "JS разработчик",
    "Frontend developer",
    "Senior Frontend",
    "Middle Frontend",
    "Junior JavaScript",
    "Верстальщик",

    # -- Java
    "Senior Java",
    "Middle Java",
    "Java backend",
    "Java engineer",

    # -- Mobile
    "Мобильный разработчик",
    "Senior iOS",
    "Senior Android",
    "Middle Android",
    "Middle iOS",

    # -- DevOps / administration
    "DevOps инженер",
    "Senior DevOps",
    "Middle DevOps",
    "Linux администратор",

    # -- QA
    "Тестировщик ПО",
    "Senior QA",
    "Middle QA",
    "Junior QA",
    "Автоматизатор",

    # -- Analysts and data science
    "Аналитик",
    "Системный аналитик",
    "Бизнес аналитик",
    "Senior Data Scientist",
    "Middle Data Scientist",

    # -- C++ and PHP
    "C++ разработчик",
    "Senior C++",
    "PHP backend",
    "Senior PHP",

    # -- Fullstack
    "Fullstack разработчик",
    "Fullstack developer",
    "Full stack",

    # -- 1C, Bitrix, WordPress
    "1С программист",
    "1С разработчик",
    "Битрикс разработчик",
    "Wordpress разработчик",

    # -- Design
    "UX дизайнер",
    "UI дизайнер",
    "Веб дизайнер",
]

# =====================================================================
# MAIN TUNABLE SETTINGS
# =====================================================================

# Default page budget per query for the first pass; callers may pass another
# value. At 100 items per page this is up to ~2000 resumes per query.
MAX_PAGES_PER_QUERY = 20

# Default number of additional pages scanned per query by the extension pass.
PAGES_FOR_EXTEND = 50

# Smart-stop thresholds: a page counts as "empty" when it yields fewer than
# MIN_NEW_TO_CONTINUE new resumes, and scanning stops after
# STOP_AFTER_EMPTY_PAGES such pages in a row.
STOP_AFTER_EMPTY_PAGES = 5
MIN_NEW_TO_CONTINUE = 1

# Seconds passed to time.sleep() between requests.
PAUSE_BETWEEN_REQUESTS = 1.5

# CSV file that accumulates the collected links.
LINKS_FILE = "resumes_links.csv"

# Search URL with two placeholders: {query} (URL-encoded text) and {page}.
SEARCH_URL_TEMPLATE = "https://hh.ru/search/resume?" + "&".join(
    (
        "text={query}",
        "from=suggest_post",
        "pos=full_text",
        "logic=normal",
        "exp_period=all_time",
        "ored_clusters=true",
        "order_by=relevance",
        "search_period=365",
        "items_on_page=100",
        "page={page}",
    )
)

# HTTP headers sent with every search request.
HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/117.0.0.0 Safari/537.36",
        )
    ),
    "Accept-Language": "ru-RU,ru;q=0.9",
}

# Base used to turn relative resume hrefs into absolute URLs.
BASE_DOMAIN = "https://hh.ru"

# =====================================================================
# HELPERS
# =====================================================================

def ensure_dir_for_file(path: str) -> None:
    """Create the parent directory of *path* when it does not exist yet.

    Args:
        path: file path (absolute or relative) whose directory is needed.
    """
    parent = os.path.dirname(os.path.abspath(path))
    # Nothing to do when there is no parent component or it is already there.
    if not parent or os.path.exists(parent):
        return
    os.makedirs(parent, exist_ok=True)


def append_row_to_csv(file_path: str, row: dict, header: list) -> None:
    """Append one row to a CSV file, writing the header first when needed.

    The header is written when the file is missing *or* exists but is empty
    (zero bytes); otherwise an empty pre-created file would end up without a
    header and its first data row would later be read as column names.

    Args:
        file_path: destination CSV path; its directory is created if missing.
        row: mapping of column name to value for the new row.
        header: ordered column names passed to csv.DictWriter.

    Raises:
        ValueError: propagated from csv.DictWriter when *row* contains a key
            that is not listed in *header*.
    """
    needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
    ensure_dir_for_file(file_path)
    with open(file_path, "a", encoding="utf-8-sig", newline="") as fh:
        writer = csv.DictWriter(fh, fieldnames=header)
        if needs_header:
            writer.writeheader()
        writer.writerow(row)


def resume_id_from_url(url: str) -> str | None:
    if not url:
        return None
    if "/resume/" in url:
        return url.split("/resume/")[1].split("?")[0].strip("/")
    return None


def normalize_tokens(query: str) -> list[str]:
    """Break *query* into lowercase tokens.

    Hyphens and slashes are treated as separators in addition to whitespace.

    Args:
        query: raw search phrase.

    Returns:
        Lowercased tokens in their original order; empty for a blank query.
    """
    spaced = query.translate(str.maketrans("-/", "  "))
    return [token.lower() for token in spaced.split()]


def load_existing_resume_ids(file_path: str) -> set[str]:
    """Read the ``resume_id`` column of a links CSV into a set.

    Args:
        file_path: path of the CSV file to read.

    Returns:
        The non-empty ``resume_id`` values found. The set is empty when the
        file does not exist; when reading fails part-way, a message is printed
        and the IDs read up to that point are returned.
    """
    known_ids: set[str] = set()
    if not os.path.exists(file_path):
        return known_ids

    try:
        with open(file_path, "r", encoding="utf-8-sig") as fh:
            for record in csv.DictReader(fh):
                if rid := record.get("resume_id"):
                    known_ids.add(rid)
    except Exception as e:
        print(f"    [!] Error loading existing IDs: {e}")

    return known_ids


def get_query_page_count(file_path: str, query: str) -> int:
    """Estimate how many result pages were already collected for *query*.

    The estimate is derived from the number of distinct ``resume_id`` values
    stored with ``search_query == query``, assuming about 80 unique resumes per
    page and rounding up.

    Args:
        file_path: path of the links CSV.
        query: search phrase exactly as stored in the ``search_query`` column.

    Returns:
        Estimated page count; 0 when the file is missing, cannot be read, or
        holds no rows for the query.
    """
    if not os.path.exists(file_path):
        return 0

    try:
        with open(file_path, "r", encoding="utf-8-sig") as fh:
            ids_for_query = {
                record.get("resume_id")
                for record in csv.DictReader(fh)
                if record.get("search_query") == query and record.get("resume_id")
            }
    except Exception as e:
        print(f"    [!] Error counting query pages: {e}")
        return 0

    # Ceiling division by 80; zero IDs naturally gives zero pages.
    return -(-len(ids_for_query) // 80)

# =====================================================================
# CORE COLLECTOR
# =====================================================================

def _find_resume_anchors(soup) -> list:
    """Locate resume title anchors on a search page, most specific selector first."""
    # 1) Preferred: anchors explicitly tagged as result titles.
    anchors = soup.select('a[data-qa="serp-item__title"]')
    if anchors:
        return list(anchors)

    def has_resume_href(value):
        return bool(value) and "/resume/" in value

    # 2) Fallback: the first resume link inside each result container.
    anchors = []
    for container in soup.select("div.resume-search-item, div.serp-item"):
        link = container.find("a", href=has_resume_href)
        if link is not None:
            anchors.append(link)
    if anchors:
        return anchors

    # 3) Last resort: every link on the page that points at a resume.
    return list(soup.find_all("a", href=has_resume_href))


def collect_links_for_query(
    query: str,
    max_pages: int,
    existing_ids: set[str],
    start_page: int = 0,
    stop_after_empty: int = STOP_AFTER_EMPTY_PAGES,
    min_new_to_continue: int = MIN_NEW_TO_CONTINUE,
) -> list[dict]:
    """
    Collect resume links for one search query.

    Pages are fetched one by one. Every new link is appended to LINKS_FILE as
    soon as its page has been processed, and its ID is added to *existing_ids*
    (the set is mutated in place so later queries skip it too).

    Scanning stops early when a request fails, when a page contains no resume
    anchors at all (its HTML is then saved to a ``debug_search_*.html`` file),
    when no usable resume link is found on a page, or when the empty-page
    streak reaches *stop_after_empty*.

    Args:
        query: search query string
        max_pages: maximum number of pages to scan
        existing_ids: set of already known resume_id (for duplicate skipping)
        start_page: page to start from (0-based)
        stop_after_empty: stop after N pages in a row with < min_new_to_continue new items
        min_new_to_continue: minimum number of new resumes to consider a page "non-empty"

    Returns:
        The newly collected rows (dicts with resume_id, title, url,
        search_query, collected_at), in the order they were found.
    """
    tokens = normalize_tokens(query)
    links_header = ["resume_id", "title", "url", "search_query", "collected_at"]
    collected: list[dict] = []
    empty_pages_streak = 0

    print(f"\n>> Collecting links for query: {query!r}")
    print(f"   Pages: {start_page} to {start_page + max_pages - 1}")
    print(
        f"   Stop rule: {stop_after_empty} empty pages in a row OR "
        f"less than {min_new_to_continue} new"
    )

    for page in range(start_page, start_page + max_pages):
        url = SEARCH_URL_TEMPLATE.format(query=quote_plus(query), page=page)
        print(f"  [search] page {page}: {url}")

        try:
            resp = requests.get(url, headers=HEADERS, timeout=15)
            resp.raise_for_status()
        except requests.RequestException as e:
            print(f"    [!] request failed: {e}")
            break
        html = resp.text

        soup = BeautifulSoup(html, "html.parser")
        anchors = _find_resume_anchors(soup)

        if not anchors:
            debug_fname = (
                f"debug_search_{query.replace(' ', '_').replace('/', '_')}_page{page}.html"
            )
            with open(debug_fname, "w", encoding="utf-8") as fh:
                fh.write(html)
            print(f"    [!] No anchors found. Saved debug: {debug_fname}")
            # no anchors at all – likely end of results
            break

        page_links: list[dict] = []
        page_duplicates = 0  # already present in existing_ids
        page_filtered = 0    # title shares no token with the query
        total_on_page = 0

        for a in anchors:
            href = a.get("href")
            if not href or "/resume/" not in href:
                continue

            href = urljoin(BASE_DOMAIN, href)
            rid = resume_id_from_url(href)
            if not rid:
                continue

            total_on_page += 1

            # duplicate check (also catches repeats within the same page)
            if rid in existing_ids:
                page_duplicates += 1
                continue

            # relevance check: keep the card only if at least one query token
            # occurs in the anchor text
            card_text = a.get_text(" ", strip=True).lower()
            if tokens and not any(tok in card_text for tok in tokens):
                page_filtered += 1
                continue

            title = a.get_text(strip=True) or card_text[:100]
            page_links.append(
                {
                    "resume_id": rid,
                    "title": title,
                    "url": href,
                    "search_query": query,
                    "collected_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                }
            )
            existing_ids.add(rid)

        page_new_count = len(page_links)

        if page_new_count < min_new_to_continue:
            empty_pages_streak += 1
            print(
                f"    [!] Only {page_new_count} new (< {min_new_to_continue}), "
                f"empty streak: {empty_pages_streak}/{stop_after_empty}"
            )
        else:
            empty_pages_streak = 0

        print(
            f"    \u2192 Page {page}: {page_new_count} NEW, "
            f"{page_duplicates} duplicates, {page_filtered} filtered out "
            f"(total: {total_on_page}) "
            f"| Total collected: {len(collected) + page_new_count}"
        )

        # Persist page by page so an interrupted run keeps what it has found.
        for row in page_links:
            append_row_to_csv(LINKS_FILE, row, header=links_header)
        collected.extend(page_links)

        # stop rules
        if empty_pages_streak >= stop_after_empty:
            print(
                f"    [STOP] {empty_pages_streak} pages with insufficient results, stopping"
            )
            break

        if total_on_page == 0:
            print("    [!] No results at all on page, likely end of results")
            break

        time.sleep(PAUSE_BETWEEN_REQUESTS)

    print(f"  >> Total NEW links collected for '{query}': {len(collected)}")
    return collected

# =====================================================================
# HIGH-LEVEL PASSES
# =====================================================================

def first_pass_collect_links(
    queries=None,
    max_pages_per_query: int = MAX_PAGES_PER_QUERY,
    stop_after_empty: int = STOP_AFTER_EMPTY_PAGES,
    min_new_to_continue: int = MIN_NEW_TO_CONTINUE,
) -> None:
    """
    Run the FIRST PASS over a list of queries.

    Every query is scanned from page 0. IDs already stored in LINKS_FILE are
    loaded first so they are skipped; a missing LINKS_FILE simply means no IDs
    are known yet.

    Args:
        queries: search phrases to process; QUERIES is used when None
        max_pages_per_query: maximum number of pages scanned per query
        stop_after_empty: forwarded to collect_links_for_query
        min_new_to_continue: forwarded to collect_links_for_query
    """
    query_list = QUERIES if queries is None else queries
    rule = "=" * 60

    print(rule)
    print("STEP 1: COLLECTING RESUME LINKS (FIRST PASS)")
    print(rule)

    known_ids = load_existing_resume_ids(LINKS_FILE)
    print(f">> Found {len(known_ids)} existing resume IDs in {LINKS_FILE}")

    grand_total = 0
    for position, query in enumerate(query_list, start=1):
        print(f"\n[{position}/{len(query_list)}] Processing query: {query}")
        fresh_links = collect_links_for_query(
            query=query,
            max_pages=max_pages_per_query,
            existing_ids=known_ids,
            start_page=0,
            stop_after_empty=stop_after_empty,
            min_new_to_continue=min_new_to_continue,
        )
        found = len(fresh_links)
        grand_total += found
        print(f"  >> Collected {found} new links for this query")
        # pause between consecutive queries
        time.sleep(PAUSE_BETWEEN_REQUESTS)

    print(f"\n{rule}")
    print(f">> ALL DONE! Total NEW links collected: {grand_total}")
    print(f">> Total unique resume IDs: {len(known_ids)}")
    print(f">> Links saved to: {LINKS_FILE}")
    print(rule)


def second_pass_extend_links(
    queries=None,
    pages_for_extend: int = PAGES_FOR_EXTEND,
    start_page_override: int | None = None,
    stop_after_empty: int = STOP_AFTER_EMPTY_PAGES,
    min_new_to_continue: int = MIN_NEW_TO_CONTINUE,
) -> None:
    """
    Run the SECOND PASS: scan further pages for a list of queries.

    The start page of each query is either *start_page_override* (same value
    for all queries) or, when that is None, the page count estimated by
    get_query_page_count() from the rows already stored in LINKS_FILE.

    Args:
        queries: search phrases to extend; EXISTING_QUERIES_TO_EXTEND when None
        pages_for_extend: maximum number of pages scanned per query
        start_page_override: fixed start page for all queries, or None to
            continue from the estimated last collected page
        stop_after_empty: forwarded to collect_links_for_query
        min_new_to_continue: forwarded to collect_links_for_query
    """
    query_list = EXISTING_QUERIES_TO_EXTEND if queries is None else queries
    rule = "=" * 70

    print(rule)
    print("ADDING MORE QUERIES AND EXTENDING EXISTING ONES (SECOND PASS)")
    print(rule)
    print("\nSettings:")
    print(f"  - Extend existing: +{pages_for_extend} pages each")
    print(f"  - Stop after: {stop_after_empty} empty pages in a row")
    print(f"  - Continue if: at least {min_new_to_continue} new resume(s) found")
    print(rule)

    known_ids = load_existing_resume_ids(LINKS_FILE)
    if not os.path.exists(LINKS_FILE):
        print(f"[!] Links file not found: {LINKS_FILE}")
        print("[!] No existing data – behaviour will be same as FIRST PASS from page 0.")
    print(f"\n>> Found {len(known_ids)} existing resume IDs in {LINKS_FILE}")
    print(">> This set will be used to skip duplicates during collection\n")

    run_total = 0
    processed_count = 0
    skipped_count = 0  # reported in the summary; nothing in this function increments it

    print(f"\n{rule}")
    print(f"PART 2: EXTENDING {len(query_list)} EXISTING QUERIES")
    print(rule)

    for position, query in enumerate(query_list, start=1):
        print(f"\n[EXTEND {position}/{len(query_list)}] Query: {query}")

        pages_done = get_query_page_count(LINKS_FILE, query)

        if start_page_override is not None:
            start_page = start_page_override
            print(
                f"  Using user-specified start_page={start_page_override} "
                f"(previous estimate pages_done={pages_done})"
            )
        elif pages_done == 0:
            print("  [!] No previous data found for this query. Starting from page 0.")
            start_page = 0
        else:
            start_page = pages_done
            print(
                f"  Already collected ~{pages_done} pages; "
                f"will continue from page {start_page}"
            )

        fresh_links = collect_links_for_query(
            query=query,
            max_pages=pages_for_extend,
            existing_ids=known_ids,
            start_page=start_page,
            stop_after_empty=stop_after_empty,
            min_new_to_continue=min_new_to_continue,
        )
        run_total += len(fresh_links)
        processed_count += 1

        if fresh_links:
            print(f"  ✓ Collected {len(fresh_links)} new links")
        else:
            print("  ⚠ Collected 0 new links (all were duplicates or no results)")

        # longer pause between queries than between pages
        time.sleep(PAUSE_BETWEEN_REQUESTS * 2)

    print(f"\n  Summary: Processed {processed_count}, Skipped {skipped_count}")
    print(f"\n{rule}")
    print("ALL DONE!")
    print(rule)
    print(f">> Total NEW links collected this run: {run_total}")
    print(f">> Total unique resume IDs in database: {len(known_ids)}")
    print(f">> All links saved to: {LINKS_FILE}")
    print(f"\n{rule}")
    print("NEXT STEPS:")
    print(rule)
    print("1. Run your STEP 2 cell that parses resume details")
    print("   (it will automatically skip already parsed resumes)")
    print("2. Check resumes_details.csv for results")
    print(rule)

# =====================================================================
# HOW TO RUN 
# =====================================================================
# Just change MODE and re-run the whole cell.

# Selects which pass this cell runs:
#   "first"  -> first_pass_collect_links()
#   "second" -> second_pass_extend_links()
# Any other value runs nothing.
MODE = "first"

if MODE == "second":
    # start_page_override=None  -> continue from the estimated last page
    # start_page_override=0     -> force start from page 0 for all queries
    second_pass_extend_links(start_page_override=None)
elif MODE == "first":
    first_pass_collect_links()

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
2_parse_resume_details.py

Скрипт для парсинга детальной информации из резюме hh.ru
"""

import os
import json
import csv
import time
from datetime import datetime

import requests
from bs4 import BeautifulSoup
import pandas as pd

# ---------------- CONFIG ----------------
# File names used by this cell.
# NOTE(review): only PROCESSED_FILE is referenced in the visible helpers;
# the roles of the other three are inferred from their names — confirm.
LINKS_FILE = "resumes_links.csv"
DETAILS_FILE = "resumes_details.csv"
PROCESSED_FILE = "processed_ids.json"  # JSON list of already processed IDs
NEW_BLOCKS_FILE = "new_blocks_detected.csv"

# Presumably seconds to wait between requests — confirm against the caller.
PAUSE_BETWEEN_REQUESTS = 1.1

# HTTP headers sent when fetching a resume page.
HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/117.0.0.0 Safari/537.36",
        )
    ),
    "Accept-Language": "ru-RU,ru;q=0.9",
}

# ---------------- Helpers ----------------
def ensure_dir_for_file(path):
    """Create the parent directory of *path* when it does not exist yet."""
    parent = os.path.dirname(os.path.abspath(path))
    # Nothing to do when there is no parent component or it is already there.
    if not parent or os.path.exists(parent):
        return
    os.makedirs(parent, exist_ok=True)

def load_processed_ids(path=None):
    """Load the set of already processed IDs from a JSON file.

    The file is expected to hold a JSON list (the format written by
    save_processed_ids). A missing file yields an empty set. A file that
    cannot be decoded, or that holds something other than a list, also yields
    an empty set, but a warning is printed so the reset is not silent.

    Args:
        path: JSON file to read; PROCESSED_FILE is used when None.

    Returns:
        A set with the IDs stored in the file.
    """
    if path is None:
        path = PROCESSED_FILE
    if not os.path.exists(path):
        return set()

    with open(path, "r", encoding="utf-8") as f:
        try:
            data = json.load(f)
            if not isinstance(data, list):
                # set() of a dict or a string would silently build a wrong set
                raise TypeError(f"expected a JSON list, got {type(data).__name__}")
            return set(data)
        except (OSError, ValueError, TypeError) as e:
            print(f"    [!] Could not load processed IDs from {path}: {e}")
            return set()

def save_processed_ids(processed_set, path=None):
    """Write the processed IDs to a JSON file as a sorted list.

    The data is written to ``<path>.tmp`` first and then moved over the target
    with os.replace, so the target is never left half-written.

    Args:
        processed_set: iterable of mutually sortable IDs.
        path: JSON file to write; PROCESSED_FILE is used when None.

    Raises:
        TypeError: when the IDs cannot be sorted against each other.
    """
    if path is None:
        path = PROCESSED_FILE

    # Sort before touching the disk so a sorting failure leaves no temp file.
    ordered_ids = sorted(processed_set)

    tmp = path + ".tmp"
    with open(tmp, "w", encoding="utf-8") as f:
        json.dump(ordered_ids, f, ensure_ascii=False, indent=2)
    os.replace(tmp, path)

def append_row_to_csv(file_path, row, header):
    """Append one row to a CSV file, writing the header first when needed.

    The header is written when the file is missing *or* exists but is empty
    (zero bytes); otherwise an empty pre-created file would end up without a
    header and its first data row would later be read as column names.

    Args:
        file_path: destination CSV path; its directory is created if missing.
        row: mapping of column name to value for the new row.
        header: ordered column names passed to csv.DictWriter.

    Raises:
        ValueError: propagated from csv.DictWriter when *row* contains a key
            that is not listed in *header*.
    """
    needs_header = not os.path.exists(file_path) or os.path.getsize(file_path) == 0
    ensure_dir_for_file(file_path)
    with open(file_path, "a", encoding="utf-8-sig", newline="") as fh:
        writer = csv.DictWriter(fh, fieldnames=header)
        if needs_header:
            writer.writeheader()
        writer.writerow(row)

def resume_id_from_url(url):
    """Extract the resume ID from a URL containing ``/resume/<id>``.

    The query string (``?...``) and the fragment (``#...``) are removed, as are
    surrounding slashes.

    Args:
        url: absolute or relative resume URL; may be empty or None.

    Returns:
        The ID text that follows the first ``/resume/`` segment, or None when
        the URL is empty, has no ``/resume/`` segment, or the ID is empty.
    """
    if not url or "/resume/" not in url:
        return None

    tail = url.split("/resume/")[1]
    # Neither the query string nor the fragment is part of the ID.
    for delimiter in ("?", "#"):
        tail = tail.split(delimiter)[0]

    # An empty ID (e.g. ".../resume/") is reported as "no ID".
    return tail.strip("/") or None

# ---------------- Parse resume detail ----------------
def parse_resume_detail(url, known_blocks=None):
    if known_blocks is None:
        known_blocks = set()

    try:
        resp = requests.get(url, headers=HEADERS, timeout=15)
        resp.raise_for_status()
        html = resp.text
    except Exception as e:
        print(f"    [detail] request failed: {e}")
        return None

    soup = BeautifulSoup(html, "html.parser")

    def safe_text(el):
        return el.get_text(" ", strip=True) if el else None

    parsed_data = {
        "url": url,
        "parsed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }

    # Общие поля
    parsed_data["title"] = safe_text(
        soup.select_one('span[data-qa="resume-block-title-position"]') or 
        soup.select_one('h1') or 
        soup.select_one('h2')
    )
    parsed_data["location"] = safe_text(soup.select_one('span[data-qa="resume-personal-address"]'))
    parsed_data["age"] = safe_text(soup.select_one('span[data-qa="resume-personal-age"]'))
    parsed_data["gender"] = safe_text(soup.select_one('span[data-qa="resume-personal-gender"]'))
    parsed_data["salary"] = safe_text(soup.select_one('span[data-qa="resume-block-salary"]'))

    # Навыки
    skills = [safe_text(t) for t in soup.select('span[data-qa="bloko-tag__text"]')]
    parsed_data["skills"] = ", ".join(skills) if skills else None
    parsed_data["skills_count"] = len(skills)
    
    # О себе
    about_el = (
        soup.select_one('div[data-qa="resume-block-skills-content"]') or 
        soup.select_one('div[data-qa="resume-block-summary"]')
    )
    parsed_data["about"] = safe_text(about_el)

    # Опыт работы
    experiences = []
    exp_section = soup.select_one('div[data-qa="resume-block-experience"]')
    
    if exp_section:
        exp_items = exp_section.select('div.resume-block-item-gap')
        
        if exp_items:
            print(f"    [experience] Found {len(exp_items)} experience items")
            for exp in exp_items:
                comp_el = (
                    exp.select_one('div[data-qa="resume-block-experience-company"]') or
                    exp.select_one('div.bloko-text_strong')
                )
                company = safe_text(comp_el)
                
                pos_el = exp.select_one('div[data-qa="resume-block-experience-position"]')
                position = safe_text(pos_el)
                
                period_el = (
                    exp.select_one('div.resume-block-item-gap-date') or
                    exp.select_one('div.bloko-column_s-2')
                )
                period = safe_text(period_el)
                
                desc_el = exp.select_one('div[data-qa="resume-block-experience-description"]')
                description = safe_text(desc_el)
                
                experiences.append({
                    "company": company,
                    "position": position,
                    "period": period,
                    "description": description
                })
        else:
            exp_items = exp_section.select('div.bloko-gap.bloko-gap_bottom')
            
            if not exp_items:
                exp_items = exp_section.find_all('div', attrs={'data-qa': lambda x: x and 'experience' in x}, recursive=False)
            
            if exp_items:
                print(f"    [experience] Found {len(exp_items)} experience items (alternative)")
                for exp in exp_items:
                    comp_el = (
                        exp.select_one('[data-qa="resume-block-experience-company"]') or
                        exp.select_one('[data-qa="resume-block-experience-organization"]') or
                        exp.find('div', class_='bloko-text_strong')
                    )
                    company = safe_text(comp_el)
                    
                    pos_el = exp.select_one('[data-qa="resume-block-experience-position"]')
                    position = safe_text(pos_el)
                    
                    period_el = (
                        exp.select_one('div.resume-block-item-gap-date') or
                        exp.select_one('div.bloko-column_s-2') or
                        exp.select_one('[data-qa*="period"]')
                    )
                    period = safe_text(period_el)
                    
                    desc_el = exp.select_one('[data-qa="resume-block-experience-description"]')
                    description = safe_text(desc_el)
                    
                    if company or position:
                        experiences.append({
                            "company": company,
                            "position": position,
                            "period": period,
                            "description": description
                        })
    
    if not experiences:
        print(f"    [!] No experience found")
    
    parsed_data["experience_json"] = json.dumps(experiences, ensure_ascii=False)
    parsed_data["experience_count"] = len(experiences)

    #образование
    education = []
    education_level_general = None  # Общий уровень образования
    
    edu_section = soup.select_one('div[data-qa="resume-block-education"]')
    
    if edu_section:
        
        # Вариант 1: ищем полный текст из wrapper (включает основной уровень + подуровень)
        level_wrapper = edu_section.select_one('div.resume-block__title-text-wrapper')
        if level_wrapper:
            education_level_general = safe_text(level_wrapper)
            print(f"    [education] General level (full): {education_level_general}")
        
        # Вариант 2: если wrapper не найден, ищем только основной текст
        if not education_level_general:
            level_header = edu_section.select_one('div.resume-block__title-text')
            if level_header:
                education_level_general = safe_text(level_header)
                print(f"    [education] General level (basic): {education_level_general}")
        
        # Ищем конкретные учебные заведения
        edu_items = edu_section.select('div.resume-block-item-gap')
        
        if edu_items:
            print(f"    [education] Found {len(edu_items)} education items")
            for edu in edu_items:
                name_el = (
                    edu.select_one('div[data-qa="resume-block-education-name"]') or
                    edu.select_one('div.bloko-text_strong')
                )
                name = safe_text(name_el)
                
                org_el = edu.select_one('div[data-qa="resume-block-education-organization"]')
                organization = safe_text(org_el)
                
                period_el = (
                    edu.select_one('div.resume-block-item-gap-date') or
                    edu.select_one('div.bloko-column_s-2') or
                    edu.select_one('div[data-qa="resume-block-education-year"]')
                )
                period = safe_text(period_el)
                
                level_el = (
                    edu.select_one('span.resume-block__title-text_sub') or
                    edu.select_one('div.bloko-text_secondary')
                )
                level = safe_text(level_el)
                
                result_el = edu.select_one('div[data-qa="resume-block-education-result"]')
                result = safe_text(result_el)
                
                education.append({
                    "name": name,
                    "organization": organization,
                    "period": period,
                    "level": level,
                    "result": result
                })
        else:
            edu_items = edu_section.select('div.bloko-gap')
            
            if not edu_items:
                edu_items = edu_section.find_all('div', recursive=False)
            
            if edu_items:
                print(f"    [education] Found {len(edu_items)} education items (alternative)")
                for edu in edu_items:
                    name_el = (
                        edu.select_one('[data-qa="resume-block-education-name"]') or
                        edu.find('div', class_='bloko-text_strong')
                    )
                    name = safe_text(name_el)
                    
                    org_el = edu.select_one('[data-qa="resume-block-education-organization"]')
                    organization = safe_text(org_el)
                    
                    period_el = (
                        edu.select_one('div.resume-block-item-gap-date') or
                        edu.select_one('div.bloko-column_s-2') or
                        edu.select_one('[data-qa="resume-block-education-year"]')
                    )
                    period = safe_text(period_el)
                    
                    level_el = (
                        edu.select_one('span.resume-block__title-text_sub') or
                        edu.select_one('div.bloko-text_secondary')
                    )
                    level = safe_text(level_el)
                    
                    result_el = edu.select_one('[data-qa="resume-block-education-result"]')
                    result = safe_text(result_el)
                    
                    if name:
                        education.append({
                            "name": name,
                            "organization": organization,
                            "period": period,
                            "level": level,
                            "result": result
                        })
    
    if not education:
        print(f"    [!] No education found")
    
    parsed_data["education_json"] = json.dumps(education, ensure_ascii=False)
    parsed_data["education_count"] = len(education)
    parsed_data["education_level"] = education_level_general  # Добавляем отдельное поле

    # ---------------- ADDITIONAL EDUCATION / COURSES ----------------
    additional_education = []
    
    # Ищем блок с дополнительным образованием/курсами
    additional_sections = soup.select('div[data-qa="resume-block-additional-education"]')
    
    for section in additional_sections:
        # Находим все курсы/сертификаты внутри секции
        course_items = section.select('div.resume-block-item-gap')
        
        if not course_items:
            course_items = section.select('div.bloko-gap')
        
        for course in course_items:
            # Название курса/организации
            name_el = (
                course.select_one('div[data-qa="resume-block-education-name"]') or
                course.select_one('div.bloko-text_strong') or
                course.select_one('a')
            )
            name = safe_text(name_el)
            
            # Организация, проводившая курс
            org_el = course.select_one('div[data-qa="resume-block-education-organization"]')
            organization = safe_text(org_el)
            
            # Год/период прохождения
            period_el = (
                course.select_one('div.resume-block-item-gap-date') or
                course.select_one('div.bloko-column_s-2') or
                course.select_one('div[data-qa="resume-block-education-year"]')
            )
            period = safe_text(period_el)
            
            # Описание/результат
            result_el = course.select_one('div[data-qa="resume-block-education-result"]')
            result = safe_text(result_el)
            
            if name or organization:
                additional_education.append({
                    "name": name,
                    "organization": organization,
                    "period": period,
                    "result": result
                })
    
    if additional_education:
        print(f"    [courses] Found {len(additional_education)} additional education/courses")
    
    parsed_data["additional_education_json"] = json.dumps(additional_education, ensure_ascii=False)
    parsed_data["additional_education_count"] = len(additional_education)

    # другие блоки-
    new_blocks = []
    blocks = soup.select('div[data-qa^="resume-block-"]')
    for block in blocks:
        block_type = block.get("data-qa", "unknown_block")
        if block_type not in known_blocks:
            new_blocks.append(block_type)
            append_row_to_csv(
                NEW_BLOCKS_FILE, 
                {"url": url, "new_block": block_type}, 
                header=["url", "new_block"]
            )
            known_blocks.add(block_type)

    return parsed_data

def main():
    """Parse every not-yet-processed resume listed in LINKS_FILE.

    Reads the links CSV, skips resumes whose id is already in the processed
    set, fetches and parses each remaining resume with
    ``parse_resume_detail`` and appends the result to DETAILS_FILE.  The
    processed-id set is saved after every successful resume, and a summary
    is printed at the end.

    Returns:
        None.  Returns early (after printing a warning) when LINKS_FILE
        does not exist.
    """
    processed = load_processed_ids()

    if not os.path.exists(LINKS_FILE):
        print(f"[!] Links file not found: {LINKS_FILE}")
        return

    # keep_default_na=False: with dtype=str alone, empty cells are still read
    # as float NaN.  NaN is truthy, so the "missing url / missing id" checks
    # below would never fire and NaN would be used as a URL or resume id.
    df = pd.read_csv(LINKS_FILE, dtype=str, keep_default_na=False)
    rows = df.to_dict(orient="records")

    # Drop rows whose id is already known to be processed.
    rows_to_process = [r for r in rows if r.get("resume_id") not in processed]
    print(f">> Need to process: {len(rows_to_process)} resumes")

    details_header = [
        "resume_id", "url", "title", "location", "age", "gender", "salary",
        "skills", "skills_count", "about",
        "experience_json", "experience_count",
        "education_json", "education_count", "education_level",
        "additional_education_json", "additional_education_count",
        "parsed_at"
    ]

    # Shared across all resumes so each unknown block type is reported once.
    known_blocks = set()
    success_count = 0
    error_count = 0

    for idx, r in enumerate(rows_to_process, start=1):
        url = r.get("url")
        if not url:
            continue

        rid = r.get("resume_id") or resume_id_from_url(url)
        if not rid:
            continue

        # The pre-filter above only sees the resume_id column; this also
        # catches ids derived from the URL and duplicate links handled
        # earlier in this same run, so no resume is fetched or saved twice.
        if rid in processed:
            continue

        print(f"\n[{idx}/{len(rows_to_process)}] Parsing {rid}")
        print(f"  URL: {url}")

        detail = parse_resume_detail(url, known_blocks=known_blocks)

        if not detail:
            print("  ✗ Could not parse detail")
            error_count += 1
            time.sleep(PAUSE_BETWEEN_REQUESTS)
            continue

        # Same key precedence as before: values from `detail` win.
        row = {"resume_id": rid, **detail}
        append_row_to_csv(DETAILS_FILE, row, header=details_header)

        processed.add(rid)
        # Saved after every resume, presumably so an interrupted run can
        # resume where it stopped.
        save_processed_ids(processed)
        success_count += 1
        time.sleep(PAUSE_BETWEEN_REQUESTS)

    print("\n" + "=" * 60)
    print(f">> Successfully parsed: {success_count}")
    print(f">> Errors: {error_count}")
    print(f">> Total processed: {len(processed)}")
    print("=" * 60)

# Run the resume-detail parsing pass only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()