# Сбор текстов с веб-страниц (парсер) — интерактивный режим

Этот блокнот предназначен для студентов без знания Python: в интерфейсе ниже есть кнопки, которые **запускают сбор текстов** с веб-страниц и формируют таблицу `docs`.

## Что делает
- скачивает HTML по URL (requests)
- извлекает **основной текст** (эвристика: `article/main` → иначе “крупнейший текстовый блок”)
- достаёт `title`, `date` (если есть)
- собирает единый датасет `docs`

## Выход `docs`
Колонки: `doc_id`, `source`, `text_raw`, `text_clean`, `date`, `url`, `meta`

> Примечание: некоторые сайты могут блокировать запросы. Тогда в `meta.fetch.error` будет причина.


In [None]:
from __future__ import annotations

import re
import hashlib
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup, Comment

import ipywidgets as widgets
from IPython.display import display, Markdown


In [None]:
import importlib.util

# Detect Google Colab without importing it. `find_spec` on a dotted name imports
# the parent package first and raises ModuleNotFoundError when `google` itself is
# not installed, so that case has to be treated as "not running in Colab".
try:
    _is_colab = importlib.util.find_spec("google.colab") is not None
except (ImportError, ValueError):
    _is_colab = False

if _is_colab:
    # Enable Colab's custom widget manager before any widget is displayed.
    from google.colab import output  # type: ignore
    output.enable_custom_widget_manager()


In [None]:
# ----------------------------
# Minimal text cleanup
# ----------------------------
_HTML_TAG_RE = re.compile(r"<[^>]+>")  # anything that looks like an HTML tag
_SPACE_RE = re.compile(r"\s+")  # any run of whitespace, newlines included
_PUNCT_RUN_RE = re.compile(r"([!?.,])\1{2,}")  # the same mark repeated 3+ times


def clean_text_minimal(text: str) -> str:
    """Return *text* with tag-like fragments removed and whitespace normalised.

    Processing order: tag-like ``<...>`` fragments become a space, runs of
    three or more identical ``! ? . ,`` marks are shortened to two, then every
    whitespace run collapses to a single space and the ends are trimmed.
    ``None`` yields an empty string; any other value is converted with ``str``.
    """
    if text is None:
        return ""
    without_tags = _HTML_TAG_RE.sub(" ", str(text))
    calmer_punctuation = _PUNCT_RUN_RE.sub(r"\1\1", without_tags)
    return _SPACE_RE.sub(" ", calmer_punctuation).strip()

def _stable_id(url: str, text: str) -> str:
    """Build a deterministic document id from the URL and the start of the text.

    Only the first 4000 characters of *text* take part in the fingerprint
    (a falsy *text* counts as empty). The result is ``web_`` followed by the
    first 16 hex digits of the SHA-1 digest.
    """
    text_head = (text or "")[:4000]
    payload = "\n".join((url, text_head)).encode("utf-8", errors="ignore")
    digest = hashlib.sha1(payload).hexdigest()
    return "web_" + digest[:16]

def _safe_to_datetime(value: Optional[str]) -> Optional[pd.Timestamp]:
    """Parse *value* into a UTC timestamp, returning ``None`` when that fails.

    Falsy input (``None``, empty string) and values pandas cannot parse both
    produce ``None`` instead of raising.
    """
    if not value:
        return None
    parsed = pd.to_datetime(value, errors="coerce", utc=True)
    return None if pd.isna(parsed) else parsed

# ----------------------------
# Extracting content from HTML
# ----------------------------
def _strip_noise(soup: BeautifulSoup) -> None:
    """Remove non-content elements from *soup* in place.

    Two passes delete whole elements: first embedded/technical tags, then
    page-chrome containers. Finally all HTML comments are detached.
    """
    noise_groups = (
        ["script", "style", "noscript", "svg", "canvas", "iframe"],
        ["nav", "footer", "header", "aside"],
    )
    # The second search runs only after the first pass has finished deleting.
    for tag_names in noise_groups:
        for element in soup.find_all(tag_names):
            element.decompose()

    def is_comment(node) -> bool:
        return isinstance(node, Comment)

    for comment in soup.find_all(string=is_comment):
        comment.extract()

def _get_title(soup: BeautifulSoup) -> Optional[str]:
    """Return the page title, or ``None`` when no source provides one.

    Sources are tried in order: ``og:title`` meta, ``twitter:title`` meta,
    the ``<title>`` element, then the first ``<h1>``. The winning value is
    passed through ``clean_text_minimal``.
    """
    social_meta = (("property", "og:title"), ("name", "twitter:title"))
    for attr_name, attr_value in social_meta:
        meta_tag = soup.find("meta", attrs={attr_name: attr_value})
        content = meta_tag.get("content") if meta_tag else None
        if content:
            return clean_text_minimal(content)

    for heading in (soup.title, soup.find("h1")):
        if heading and heading.get_text(strip=True):
            return clean_text_minimal(heading.get_text(" ", strip=True))

    return None

def _get_date(soup: BeautifulSoup) -> Optional[pd.Timestamp]:
    """Return the first parseable publication/modification date found in *soup*.

    ``<meta>`` tags are checked first, in the priority order listed below.
    If none of them yields a parseable value, every ``<time>`` element is
    examined in document order and the first one whose ``datetime`` attribute
    parses is used. Returns ``None`` when nothing parseable is found.
    """
    meta_candidates = (
        ("property", "article:published_time"),
        ("property", "article:modified_time"),
        ("property", "og:updated_time"),
        ("name", "pubdate"),
        ("name", "publishdate"),
        ("name", "timestamp"),
        ("name", "date"),
        ("name", "DC.date.issued"),
        ("name", "DC.Date"),
        ("itemprop", "datePublished"),
        ("itemprop", "dateModified"),
    )
    for attr, val in meta_candidates:
        node = soup.find("meta", attrs={attr: val})
        if node and node.get("content"):
            dt = _safe_to_datetime(node["content"])
            if dt is not None:
                return dt

    # Look at every <time>, not just the first: a leading <time> without a
    # usable `datetime` attribute must not hide a later valid one.
    for time_tag in soup.find_all("time"):
        dt = _safe_to_datetime(time_tag.get("datetime"))
        if dt is not None:
            return dt
    return None

def _node_text_len(node) -> int:
    """Return the length of the node's text after minimal cleaning (0 for a falsy node)."""
    if not node:
        return 0
    return len(clean_text_minimal(node.get_text(" ", strip=True)))

def _extract_main_text(
    soup: BeautifulSoup,
    selector: Optional[str] = None,
    min_chars: int = 400,
) -> Tuple[str, Dict[str, Any]]:
    """Extract the main text of a page and describe how it was found.

    Strategies, in order:

    1. The user-supplied CSS *selector*; accepted if it yields at least
       *min_chars* characters.
    2. The first ``<article>``, then the first ``<main>``, under the same
       length requirement.
    3. The candidate block with the most text: elements whose class or id
       contains a content-like keyword, plus every ``<div>``/``<section>``.
       This step has no minimum length.
    4. The whole ``<body>`` (or the document), for pages with text but no
       candidate blocks.

    Returns:
        ``(text, meta)``. ``meta["extractor"]`` names the winning strategy
        (``css:<selector>``, ``article``, ``main``, ``largest_block``,
        ``body`` or ``none``). ``meta["extractor_fallback"]`` is set to
        ``too_short`` or ``no_match`` when the selector was not used, and
        ``meta["selector_error"]`` holds the error for a malformed selector.
    """
    meta: Dict[str, Any] = {"extractor": None}

    if selector:
        try:
            nodes = soup.select(selector)
        except Exception as exc:
            # The selector is typed by the user; a malformed one must not abort
            # the whole run, so record it and continue with the heuristics.
            nodes = []
            meta["selector_error"] = repr(exc)
        if nodes:
            parts = [n.get_text("\n", strip=True) for n in nodes]
            text = clean_text_minimal("\n".join(parts))
            meta["extractor"] = f"css:{selector}"
            if len(text) >= min_chars:
                return text, meta
            meta["extractor_fallback"] = "too_short"
        elif "selector_error" not in meta:
            meta["extractor_fallback"] = "no_match"

    for tag_name in ["article", "main"]:
        node = soup.find(tag_name)
        if node:
            text = clean_text_minimal(node.get_text("\n", strip=True))
            if len(text) >= min_chars:
                meta["extractor"] = tag_name
                return text, meta

    candidates = []
    for key in ["content", "article", "post", "entry", "text", "body", "main"]:
        pattern = re.compile(key, re.I)
        candidates.extend(soup.find_all(attrs={"class": pattern}))
        candidates.extend(soup.find_all(attrs={"id": pattern}))
    candidates.extend(soup.find_all(["div", "section"]))

    best = None
    best_len = 0
    # The same element usually appears several times in `candidates`; measure it
    # once. Identity is used because bs4 tags compare equal by content.
    seen_ids = set()
    for node in candidates:
        if id(node) in seen_ids:
            continue
        seen_ids.add(id(node))
        length = _node_text_len(node)
        if length > best_len:
            best = node
            best_len = length

    if best is not None and best_len > 0:
        meta["extractor"] = "largest_block"
        text = clean_text_minimal(best.get_text("\n", strip=True))
        return text, meta

    # Last resort for pages that carry text but have no candidate blocks.
    root = soup.body or soup
    text = clean_text_minimal(root.get_text("\n", strip=True))
    if text:
        meta["extractor"] = "body"
        return text, meta

    meta["extractor"] = "none"
    return "", meta

# ----------------------------
# HTTP
# ----------------------------
@dataclass
class FetchConfig:
    """Settings used when downloading a single page."""

    timeout: int = 20  # passed as `timeout=` to the HTTP request
    user_agent: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome Safari"  # sent as the `User-Agent` request header
    max_bytes: int = 5_000_000  # 5 MB; responses with a larger body are rejected

# Charset declared inside the document itself: <meta charset="..."> or
# <meta http-equiv="Content-Type" content="text/html; charset=...">.
_META_CHARSET_RE = re.compile(rb"<meta[^>]+charset\s*=\s*[\"']?\s*([\w.:-]+)", re.I)


def _decode_html(body: bytes, content_type: str, header_encoding: Optional[str]) -> str:
    """Decode an HTML body using the most trustworthy charset available.

    Priority: a charset explicitly present in the Content-Type header, then a
    charset declared in a ``<meta>`` tag near the start of the document, then
    UTF-8. Undecodable bytes are replaced rather than raising.
    """
    candidates: List[str] = []
    # Without an explicit charset, requests reports ISO-8859-1 for text/*,
    # which garbles UTF-8 and cp1251 pages, so that value is ignored.
    if header_encoding and "charset" in content_type.lower():
        candidates.append(header_encoding)
    declared = _META_CHARSET_RE.search(body[:4096])
    if declared:
        candidates.append(declared.group(1).decode("ascii", errors="ignore"))

    for encoding in candidates:
        try:
            return body.decode(encoding, errors="replace")
        except (LookupError, ValueError):
            # Unknown or unusable encoding name: try the next candidate.
            continue
    return body.decode("utf-8", errors="replace")


def _fetch_html(url: str, cfg: FetchConfig, session: requests.Session) -> Tuple[Optional[str], Dict[str, Any]]:
    """Download *url* and return ``(html, meta)``.

    The body is streamed so that ``cfg.max_bytes`` is enforced while
    downloading, not after the whole response is in memory. The response is
    always closed.

    Returns:
        ``html`` is the decoded page, or ``None`` on any failure. ``meta``
        always has ``status_code`` and ``final_url`` (``None`` until known)
        and gains ``error`` on failure: ``response_too_large:<n>`` when the
        size limit is exceeded, otherwise the ``repr`` of the exception.
    """
    meta: Dict[str, Any] = {"status_code": None, "final_url": None}
    try:
        with session.get(
            url,
            headers={"User-Agent": cfg.user_agent, "Accept": "text/html,application/xhtml+xml"},
            timeout=cfg.timeout,
            allow_redirects=True,
            stream=True,
        ) as r:
            meta["status_code"] = r.status_code
            meta["final_url"] = r.url
            r.raise_for_status()

            # Reject early when the server already announces an oversized body.
            declared_len = r.headers.get("Content-Length", "")
            if declared_len.isdecimal() and int(declared_len) > cfg.max_bytes:
                meta["error"] = f"response_too_large:{declared_len}"
                return None, meta

            chunks: List[bytes] = []
            received = 0
            for chunk in r.iter_content(chunk_size=65536):
                received += len(chunk)
                if received > cfg.max_bytes:
                    meta["error"] = f"response_too_large:>{cfg.max_bytes}"
                    return None, meta
                chunks.append(chunk)

            html = _decode_html(b"".join(chunks), r.headers.get("Content-Type", ""), r.encoding)
            return html, meta
    except Exception as e:
        # Deliberate best-effort boundary: any failure becomes a row with
        # `meta.fetch.error` instead of stopping the whole batch.
        meta["error"] = repr(e)
        return None, meta

def parse_websites(
    urls: List[str],
    *,
    selector: Optional[str] = None,
    min_chars: int = 400,
    cfg: Optional[FetchConfig] = None,
) -> pd.DataFrame:
    """Download every URL and collect the extracted texts into one table.

    Args:
        urls: Page addresses; blank or ``None`` entries are skipped.
        selector: Optional CSS selector tried before the built-in heuristics.
        min_chars: Minimum text length for the selector and for the
            ``article``/``main`` strategies.
        cfg: Download settings; defaults to ``FetchConfig()``.

    Returns:
        A DataFrame with columns ``doc_id``, ``source``, ``text_raw``,
        ``text_clean``, ``date``, ``url``, ``meta``, one row per non-blank
        URL. A failed download still produces a row, with empty text and the
        reason under ``meta["fetch"]["error"]``.
    """
    columns = ["doc_id", "source", "text_raw", "text_clean", "date", "url", "meta"]

    targets = [u.strip() for u in (urls or []) if u and u.strip()]
    if not targets:
        # Nothing to fetch: return the empty schema without opening a session.
        return pd.DataFrame(columns=columns)

    cfg = cfg or FetchConfig()
    rows: List[Dict[str, Any]] = []

    with requests.Session() as session:
        for url in targets:
            html, fetch_meta = _fetch_html(url, cfg, session=session)
            if not html:
                rows.append({
                    "doc_id": _stable_id(url, ""),
                    "source": "website",
                    "text_raw": "",
                    "date": None,
                    "url": url,
                    "meta": {"fetch": fetch_meta},
                })
                continue

            soup = BeautifulSoup(html, "lxml")

            # Read title/date BEFORE stripping noise: <h1> and <time> often live
            # inside a <header>, which _strip_noise removes.
            title = _get_title(soup)
            date = _get_date(soup)

            _strip_noise(soup)
            text_raw, extract_meta = _extract_main_text(soup, selector=selector, min_chars=min_chars)

            final_url = fetch_meta.get("final_url") or url
            meta = {
                "fetch": fetch_meta,
                "title": title,
                "date_extracted": date.isoformat() if isinstance(date, pd.Timestamp) else None,
                "extraction": extract_meta,
                "domain": urlparse(final_url).netloc,
            }

            rows.append({
                "doc_id": _stable_id(final_url, text_raw),
                "source": "website",
                "text_raw": text_raw,
                "date": date,
                "url": final_url,
                "meta": meta,
            })

    df = pd.DataFrame(rows)
    df["text_clean"] = df["text_raw"].map(clean_text_minimal)
    # Present the columns in the documented order.
    return df[columns]


## Интерфейс (кнопки)

1) Вставь URL-ы по одному на строку  
2) (Опционально) укажи CSS-селектор (например `article`, `.post-content`, `main`)  
3) Нажми **Собрать тексты**  
4) По желанию: **Скачать CSV**


In [None]:
#@title Форма управления { display-mode: "form" }
# --- UI controls ---
# Shared by every labelled input so long descriptions are not truncated.
_DESC_STYLE = {"description_width": "initial"}
_FIELD_WIDTH = "560px"

# Multi-line input: one URL per line.
urls_ta = widgets.Textarea(
    description="URL-ы:",
    placeholder="https://example.com/article1\nhttps://example.com/article2",
    value="",
    layout=widgets.Layout(width="900px", height="160px"),
    style=_DESC_STYLE,
)

# Optional CSS selector; an empty value means "use the heuristics".
selector_txt = widgets.Text(
    description="CSS селектор:",
    placeholder="(опционально) например: article или .post-content",
    value="",
    layout=widgets.Layout(width=_FIELD_WIDTH),
    style=_DESC_STYLE,
)

# Minimum accepted text length for the selector / article / main strategies.
min_chars_int = widgets.IntSlider(
    description="Мин. длина текста:",
    value=400,
    min=100,
    max=4000,
    step=50,
    layout=widgets.Layout(width=_FIELD_WIDTH),
    style=_DESC_STYLE,
)

# Per-request timeout handed to FetchConfig.
timeout_int = widgets.IntSlider(
    description="Таймаут (сек):",
    value=20,
    min=5,
    max=120,
    step=5,
    layout=widgets.Layout(width=_FIELD_WIDTH),
    style=_DESC_STYLE,
)

# Action buttons.
parse_btn = widgets.Button(
    description="Собрать тексты",
    button_style="primary",
    icon="play",
)
demo_btn = widgets.Button(
    description="Подставить DEMO URL",
    button_style="info",
)
save_btn = widgets.Button(
    description="Скачать CSV",
    button_style="success",
    icon="download",
)
preview_btn = widgets.Button(
    description="Показать 1-й текст",
    button_style="",
)

# Separate output areas for the run summary, the result table and the preview.
status_out, table_out, preview_out = (widgets.Output() for _ in range(3))

# Filled by the parse handler with the resulting DataFrame.
docs = None

def _get_urls() -> List[str]:
    """Return the non-blank lines of the URL text area, trimmed of whitespace."""
    trimmed_lines = (line.strip() for line in urls_ta.value.splitlines())
    return [line for line in trimmed_lines if line]

def _set_demo(_=None):
    """Fill the URL text area with two demo addresses (button callback)."""
    demo_urls = (
        "https://example.com",
        "https://www.iana.org/domains/reserved",
    )
    urls_ta.value = "\n".join(demo_urls)

def _parse(_=None):
    """Run the collection with the current form values (button callback).

    Shows a summary of the parameters in the status area, then fetches the
    pages, stores the result in the global ``docs`` and shows the first 50
    rows. With an empty URL list only a warning is shown and ``docs`` is left
    unchanged.
    """
    global docs
    urls = _get_urls()
    cfg = FetchConfig(timeout=int(timeout_int.value))
    selector = selector_txt.value.strip() or None
    min_chars = int(min_chars_int.value)

    # Two trailing spaces plus a real newline is a Markdown hard line break.
    # An escaped backslash here would print a literal "\n" on one line.
    summary = "  \n".join([
        f"**URL-ов:** {len(urls)}",
        f"**selector:** `{selector}`",
        f"**min_chars:** {min_chars}",
        f"**timeout:** {cfg.timeout}s",
    ])

    with status_out:
        status_out.clear_output()
        display(Markdown(summary))

    with table_out:
        table_out.clear_output()
        if not urls:
            display(Markdown("⚠️ Список URL пуст."))
            return
        docs = parse_websites(urls, selector=selector, min_chars=min_chars, cfg=cfg)
        display(Markdown(f"✅ Готово. Документов: **{len(docs)}**"))
        display(docs[["doc_id", "source", "date", "url"]].head(50))

def _preview(_=None):
    """Show the title, URL and the first 2500 characters of the first document."""
    with preview_out:
        preview_out.clear_output()
        if docs is None or docs.empty:
            display(Markdown("⚠️ Сначала нажми **Собрать тексты**."))
            return

        first = docs.iloc[0].to_dict()
        doc_meta = first.get('meta') or {}
        heading = doc_meta.get('title') or 'Без заголовка'
        display(Markdown(f"### {heading}"))
        display(Markdown(f"**URL:** {first.get('url')}"))

        body = (first.get("text_raw") or "").strip()
        if body:
            # Mark truncation with an ellipsis only when text was actually cut.
            ellipsis = "…" if len(body) > 2500 else ""
            display(Markdown(body[:2500] + ellipsis))
        else:
            display(Markdown("(текст не извлечён — см. `meta.fetch.error`)"))

def _save(_=None):
    """Write the collected table to ``docs_web.csv`` and report it in the status area."""
    with status_out:
        nothing_collected = docs is None or docs.empty
        if nothing_collected:
            display(Markdown("⚠️ Нечего сохранять. Сначала нажми **Собрать тексты**."))
            return

        path = "docs_web.csv"
        docs.to_csv(path, encoding="utf-8", index=False)
        display(Markdown(f"✅ Сохранено в файл: `{path}` (скачай через файловый браузер Jupyter/Colab)."))

# Connect each button to its handler.
for _button, _handler in (
    (demo_btn, _set_demo),
    (parse_btn, _parse),
    (preview_btn, _preview),
    (save_btn, _save),
):
    _button.on_click(_handler)

# Form layout, top to bottom: URLs, buttons, options, then the output areas.
_button_row = widgets.HBox([demo_btn, parse_btn, preview_btn, save_btn])
_options_row = widgets.HBox([selector_txt, min_chars_int])
display(widgets.VBox([
    urls_ta,
    _button_row,
    _options_row,
    timeout_int,
    status_out,
    table_out,
    preview_out,
]))


---

### Типичные проблемы
- Если `text_raw` пустой: сайт мог отдать капчу/редирект/заблокировать запрос — смотри `meta.fetch.error`.
- Если вытаскивает мусор: попробуй заполнить **CSS селектор** (`article`, `.post-content`, `main` и т.п.).
