In [4]:
import argparse
import json
import ast
from pathlib import Path

from bs4 import BeautifulSoup


def parse_literal_list(s: str):
    """
    Преобразует строку вида "['Games', 'Expected Value']"
    в обычный python-список. Если не получилось — аккуратно
    возвращаем список из одной строки или пустой список.
    """
    if s is None:
        return []
    s = s.strip()
    if not s:
        return []
    try:
        return ast.literal_eval(s)
    except Exception:
        # на всякий случай — грубый разбор
        if s.startswith("[") and s.endswith("]"):
            inner = s[1:-1].strip()
            if not inner:
                return []
            return [x.strip(" '\"") for x in inner.split(",")]
        return [s]


def parse_question_div(div, source_file: Path):
    # ---------- Title ----------
    h2 = div.find("h2")
    title = h2.get_text(strip=True) if h2 else None

    # ---------- URL ----------
    url = None
    first_p = div.find("p")
    if first_p:
        a = first_p.find("a")
        if a and a.get("href"):
            url = a["href"]

    # ---------- Metadata table ----------
    meta = {}
    table = div.find("table")
    if table:
        for tr in table.find_all("tr"):
            tds = tr.find_all("td")
            if len(tds) >= 2:
                key = tds[0].get_text(strip=True).rstrip(":")
                val = tds[1].get_text(strip=True)
                meta[key] = val

    tags = parse_literal_list(meta.get("Tags"))
    companies = parse_literal_list(meta.get("Companies"))

    # ---------- Task ----------
    task_h3 = None
    for h3 in div.find_all("h3"):
        if "task" in h3.get_text(strip=True).lower():
            task_h3 = h3
            break

    task_text = None
    task_html = None
    if task_h3:
        task_p = task_h3.find_next_sibling("p")
        if task_p:
            task_text = task_p.get_text(" ", strip=True)
            task_html = str(task_p)

    # ---------- Hint / Solution / Answer ----------
    hint = None
    solution = None
    answer = None

    for details in div.find_all("details"):
        summary = details.find("summary")
        label = summary.get_text(strip=True).lower() if summary else ""

        # Клон, чтобы выкинуть summary и взять только содержимое
        clone = BeautifulSoup(str(details), "html.parser")
        for s in clone.find_all("summary"):
            s.decompose()
        content = clone.get_text("\n", strip=True)

        if "hint" in label:
            hint = content
        elif "solution" in label:
            solution = content
        elif "answer" in label:
            li = details.find("li")
            answer = li.get_text(strip=True) if li else content

    return {
        "source_file": source_file.name,
        "title": title,
        "url": url,
        "topic": meta.get("Topic"),
        "tags": tags,
        "difficulty": meta.get("Difficulty"),
        "companies": companies,
        "last_edited_at": meta.get("Last Edited at"),
        "last_edited_by": meta.get("Last Edited by"),
        "internal_difficulty": meta.get("Internal Difficulty"),
        "task_text": task_text,
        "task_html": task_html,
        "hint": hint,
        "solution": solution,
        "answer": answer,
    }


def parse_file(path: Path):
    html = path.read_text(encoding="utf-8")
    soup = BeautifulSoup(html, "html.parser")
    questions = []

    for div in soup.find_all("div"):
        q = parse_question_div(div, path)
        # отсеиваем случайные div без задачи
        if q["title"] and q["url"]:
            questions.append(q)

    return questions




usage: ipykernel_launcher.py [-h] [-o OUTPUT] input_path
ipykernel_launcher.py: error: the following arguments are required: input_path


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [8]:
def main():
    parser = argparse.ArgumentParser(
        description="Парсер задач из QuantGuide HTML файлов."
    )
    parser.add_argument(
        "input_path",
        help="Папка с *.html файлами или один HTML-файл",
    )
    parser.add_argument(
        "-o",
        "--output",
        default="questions.json",
        help="Имя выходного JSON файла (по умолчанию questions.json)",
    )
    args = parser.parse_args()


    input_path = Path(args.input_path)
    all_files = []

    if input_path.is_dir():
        all_files = sorted(input_path.glob("*.html"))
    elif input_path.is_file():
        all_files = [input_path]
    else:
        raise SystemExit(f"Не найден файл/папка: {input_path}")

    all_questions = []
    for f in all_files:
        qs = parse_file(f)
        all_questions.extend(qs)

    out_path = Path(args.output)
    out_path.write_text(
        json.dumps(all_questions, ensure_ascii=False, indent=2),
        encoding="utf-8",
    )

    print(
        f"Готово! Спарсили {len(all_questions)} задач из {len(all_files)} файлов "
        f"в {out_path}"
    )

In [9]:
main()

usage: ipykernel_launcher.py [-h] [-o OUTPUT] input_path
ipykernel_launcher.py: error: the following arguments are required: input_path


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
