# Инструменты сбора данных

## Парсинг текстов с сайтов и соцсетей

Этот раздел помогает собрать тексты из сайтов, Яндекс‑отзывов и групп ВК и сохранить их в CSV.

**Как пользоваться:**
1. Запустите ячейки **«Установка зависимостей»** и **«Импорт парсеров»**.
2. В выпадающем списке выберите источник (сайты / Яндекс отзывы / ВК).
3. Укажите URL или домен. Для ВК дополнительно вставьте токен доступа.
4. Задайте лимит записей и нажмите **«Запустить»**.
5. Результаты сохраняются в `parsed_texts.csv` и выводятся в таблице.

Формат результата: CSV‑файл с колонкой `text`, который далее используется во всех инструментах анализа.


In [None]:
#@title Установка зависимостей
!pip -q install requests beautifulsoup4 lxml ipywidgets pandas shapely pymorphy3 flair folium tqdm osmnx bertopic sentence-transformers plotly hdbscan wordcloud keybert kaleido

import os
import sys
from pathlib import Path

# Optionally set the repository as "owner/repo" via the GITHUB_REPO
# environment variable; otherwise the default slug below is used.
REPO_SLUG = os.environ.get("GITHUB_REPO", "Sandrro/digital_identity")
REPO_URL = f"https://github.com/{REPO_SLUG}"
REPO_DIR = Path("/content/digital_identity")

# Clone only when the target directory is missing, so re-running the cell
# does not try to clone into an existing checkout.
if not REPO_DIR.exists():
    !git clone --depth 1 {REPO_URL} {REPO_DIR}

# Put the checkout first on the import path.
sys.path.insert(0, str(REPO_DIR))


In [None]:
#@title Импорт парсеров
import sys
sys.path.insert(0, "/content/digital_identity/parsers")
from vk_group_parser import VKGroupParser
from website_parser import parse_websites


In [None]:
#@title Интерактивный запуск
import csv
from datetime import date, datetime, timezone
from pathlib import Path

import ipywidgets as widgets
import pandas as pd
from IPython.display import display

# Where parsed rows are written and where an uploaded link list is stored.
OUTPUT_CSV = Path("/content/digital_identity/parsed_texts.csv")
LINKS_UPLOAD_TARGET = Path("/content/digital_identity/uploaded_links.csv")

# Source selector: the value ("website" / "vk") drives both the visible
# controls and the branch taken when the run button is clicked.
parser_dropdown = widgets.Dropdown(
    options=[
        ("Сайты", "website"),
        ("ВК группа", "vk"),
    ],
    description="Парсер:"
)
# Single URL/domain field (used by both parsers).
url_input = widgets.Text(
    value="",
    description="URL/домен:",
    placeholder="https://...",
    layout=widgets.Layout(width="80%"),
)
# Free-form list of links for the website parser.
links_input = widgets.Textarea(
    value="",
    description="Ссылки:",
    placeholder="https://site1\nhttps://site2",
    layout=widgets.Layout(width="80%", height="120px"),
)
# CSV upload with links; the saved file path is kept in a mutable dict so
# that click handlers can update it without `global`.
links_upload = widgets.FileUpload(accept=".csv", multiple=False)
links_upload_button = widgets.Button(description="Загрузить список")
links_upload_status = widgets.Label()
links_upload_path = {"path": None}

# VK access token (masked input).
token_input = widgets.Password(
    value="",
    description="VK токен:",
    placeholder="Требуется только для ВК",
)
# Record limit; in this cell it is only read by the VK branch.
max_items = widgets.IntSlider(
    value=20,
    min=1,
    max=200,
    step=1,
    description="Лимит:"
)
run_button = widgets.Button(description="Запустить")
progress = widgets.IntProgress(value=0, min=0, max=1, description="Прогресс:")
progress_label = widgets.Label()
# Captures everything displayed or raised while the parser runs.
output = widgets.Output()


def _normalize_date(value):
    if value in (None, ""):
        return None
    if isinstance(value, datetime):
        return value.date().isoformat()
    if isinstance(value, date):
        return value.isoformat()
    parsed = pd.to_datetime(value, errors="coerce", utc=True)
    if pd.isna(parsed):
        return str(value)[:10]
    return parsed.date().isoformat()


def _format_vk_timestamp(ts):
    if ts is None:
        return None
    return datetime.fromtimestamp(ts, tz=timezone.utc).date().isoformat()


def _write_rows(rows):
    """Write the collected rows to ``OUTPUT_CSV``; do nothing when there are none.

    The file is overwritten and always starts with a header line.
    """
    if rows:
        columns = ["source", "url", "title", "timestamp", "text"]
        OUTPUT_CSV.parent.mkdir(parents=True, exist_ok=True)
        with OUTPUT_CSV.open("w", newline="", encoding="utf-8") as handle:
            writer = csv.DictWriter(handle, fieldnames=columns)
            writer.writeheader()
            for row in rows:
                writer.writerow(row)

def _extract_upload_payload(upload_widget):
    upload_value = upload_widget.value
    if isinstance(upload_value, dict):
        if not upload_value:
            return None, None
        return next(iter(upload_value.items()))
    if isinstance(upload_value, (list, tuple)):
        if not upload_value:
            return None, None
        payload = upload_value[0]
        name = payload.get("name") or payload.get("metadata", {}).get("name")
        return name, payload
    return None, None


def _handle_links_upload(_):
    """Button handler: save the uploaded link list and remember its path."""
    links_upload_status.value = ""
    filename, payload = _extract_upload_payload(links_upload)
    if filename and payload:
        # Keep the uploaded file's extension, defaulting to ".csv".
        extension = Path(filename).suffix or ".csv"
        destination = LINKS_UPLOAD_TARGET.with_suffix(extension)
        destination.write_bytes(payload["content"])
        links_upload_path["path"] = destination
        links_upload_status.value = f"Файл сохранён: {destination}"
    else:
        links_upload_status.value = "Выберите CSV со ссылками"


def _parse_links_text(raw_value: str) -> list[str]:
    if not raw_value:
        return []
    raw_value = raw_value.replace(",", "\n")
")
    return [item.strip() for item in raw_value.splitlines() if item.strip()]


def _collect_links() -> list[str]:
    """Gather links from the uploaded CSV, the text area and the URL field.

    Returns:
        Unique links in first-seen order.

    Raises:
        ValueError: If the uploaded CSV has no ``link`` column.
    """
    collected: list[str] = []
    uploaded = links_upload_path.get("path")
    if uploaded and Path(uploaded).exists():
        table = pd.read_csv(uploaded)
        if "link" not in table.columns:
            raise ValueError("CSV должен содержать колонку 'link'")
        collected += table["link"].dropna().astype(str).tolist()
    collected += _parse_links_text(links_input.value.strip())
    single_url = url_input.value.strip()
    if single_url:
        collected.append(single_url)
    # dict keys keep insertion order, which drops duplicates stably.
    return list(dict.fromkeys(collected))


def _toggle_parser(change=None):
    """Show the token field for VK and the link-list controls for websites."""
    vk_selected = parser_dropdown.value == "vk"
    token_input.layout.display = "" if vk_selected else "none"
    website_display = "none" if vk_selected else ""
    for control in (links_input, links_upload, links_upload_button, links_upload_status):
        control.layout.display = website_display


def _first_present(*candidates):
    """Return the first candidate that is not None, "" or float NaN (else None)."""
    for candidate in candidates:
        if candidate is None:
            continue
        # pandas marks missing cells with NaN, which is truthy, so a plain
        # ``a or b`` would never fall back to ``b``.
        if isinstance(candidate, float) and candidate != candidate:
            continue
        if isinstance(candidate, str) and candidate == "":
            continue
        return candidate
    return None


def run_parser(_):
    """Button handler: run the selected parser, save the rows and show them.

    For the website parser the links come from ``_collect_links`` and are
    passed to ``parse_websites``; for VK the posts are read with
    ``VKGroupParser.iter_posts`` up to the limit set by the slider. Rows are
    written with ``_write_rows`` and displayed as a table when non-empty.

    Raises:
        ValueError: If no links were provided (website) or the VK token is
            empty (VK). Raised inside the output widget context.
    """
    output.clear_output()
    progress.value = 0
    progress_label.value = ""
    with output:
        rows = []
        parser = parser_dropdown.value
        if parser == "website":
            links = _collect_links()
            if not links:
                raise ValueError("Добавьте хотя бы одну ссылку.")
            progress.max = len(links)
            df = parse_websites(links, show_progress=True)
            for _index, row in df.iterrows():
                meta = row.get("meta")
                # A missing cell arrives as None/NaN; anything without
                # ``.get`` is treated as "no metadata".
                if not hasattr(meta, "get"):
                    meta = {}
                rows.append({
                    "source": row.get("source"),
                    "url": row.get("url"),
                    "title": meta.get("title"),
                    "timestamp": _normalize_date(row.get("date")),
                    "text": _first_present(row.get("text_clean"), row.get("text_raw")),
                })
            progress.value = len(rows)
        else:
            if not token_input.value:
                raise ValueError("Нужен VK токен.")
            progress.max = max_items.value
            vk = VKGroupParser(token_input.value)
            for idx, post in enumerate(vk.iter_posts(url_input.value, total=max_items.value), start=1):
                rows.append({
                    "source": "vk",
                    "url": url_input.value,
                    "title": None,
                    "timestamp": _format_vk_timestamp(post.date),
                    "text": post.text,
                })
                progress.value = idx
        _write_rows(rows)
        progress_label.value = f"Загружено текстов: {len(rows)}"
        if rows:
            display(pd.DataFrame(rows))


# Keep control visibility in sync with the selected parser and apply it
# once for the initial dropdown value.
parser_dropdown.observe(_toggle_parser, names="value")
_toggle_parser()
links_upload_button.on_click(_handle_links_upload)
run_button.on_click(run_parser)

# Render the form in top-to-bottom order.
display(
    parser_dropdown,
    url_input,
    links_input,
    widgets.HBox([links_upload, links_upload_button]),
    links_upload_status,
    token_input,
    max_items,
    run_button,
    progress,
    progress_label,
    output,
)


## Инструменты анализа

### Геокодирование и визуализация адресов

Инструмент извлекает топонимы из текстов, геокодирует их через Photon и строит интерактивную карту.

**Как пользоваться:**
1. Укажите путь к CSV/GeoJSON (обычно `parsed_texts.csv`) или загрузите файл.
2. При необходимости задайте `BBox` вручную или укажите территорию и нажмите **«Определить BBox»**.
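3. Укажите путь для выходного файла: для CSV‑входа расширение будет заменено на `.csv`, для остальных форматов — на `.geojson`.
4. Нажмите **«Геокодировать»** — на выходе получите файл с координатами (CSV или GeoJSON) и интерактивную карту.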


In [ ]:
#@title Геокодер: импорт и запуск с UI
from pathlib import Path

import geopandas as gpd
import ipywidgets as widgets
import pandas as pd
from IPython.display import display

import sys
sys.path.insert(0, "/content/digital_identity/parsers")
from geocoder import geocode_texts, build_geojson, save_geojson, save_map, bbox_from_area_name

# Default input (the parser's output) and the base name for uploaded files;
# the upload handler replaces the suffix with the uploaded file's one.
DEFAULT_INPUT = Path("/content/digital_identity/parsed_texts.csv")
UPLOAD_TARGET = Path("/content/digital_identity/uploaded_texts")

file_picker = widgets.Text(
    value=str(DEFAULT_INPUT),
    description="CSV/текст:",
    layout=widgets.Layout(width="80%"),
)
# The loader in this cell also reads .geojson/.json, so the upload filter
# must allow those extensions as well as .csv/.txt.
upload_widget = widgets.FileUpload(accept=".csv,.txt,.geojson,.json", multiple=False)
upload_button = widgets.Button(description="Загрузить файл")
upload_status = widgets.Label()

# Optional bounding box, typed manually or filled in from a territory name.
bbox_input = widgets.Text(
    value="",
    description="BBox:",
    placeholder="minx,miny,maxx,maxy (необязательно)",
    layout=widgets.Layout(width="80%"),
)
bbox_name_input = widgets.Text(
    value="",
    description="Территория:",
    placeholder="Например, Москва",
    layout=widgets.Layout(width="80%"),
)
bbox_resolve = widgets.Button(description="Определить BBox")
bbox_status = widgets.Label()

# Output locations for the geocoded data and the HTML map.
output_path = widgets.Text(
    value="/content/digital_identity/geocoded_points.geojson",
    description="Выход:",
    layout=widgets.Layout(width="80%"),
)
map_output = widgets.Text(
    value="/content/digital_identity/geocoded_map.html",
    description="Карта:",
    layout=widgets.Layout(width="80%"),
)
run_geocoder = widgets.Button(description="Геокодировать")
# Captures everything displayed or raised while geocoding runs.
geo_output = widgets.Output()


def _default_output_path(input_path: Path) -> Path:
    suffix = input_path.suffix.lower()
    if suffix == ".csv":
        return input_path.with_name(f"{input_path.stem}_geocoded.csv")
    return input_path.with_name(f"{input_path.stem}_geocoded.geojson")


def _ensure_output_path(raw_value: str, input_path: Path) -> Path:
    target = Path(raw_value) if raw_value else _default_output_path(input_path)
    suffix = ".csv" if input_path.suffix.lower() == ".csv" else ".geojson"
    if target.suffix.lower() != suffix:
        target = target.with_suffix(suffix)
    return target


def _load_texts(path: Path):
    suffix = path.suffix.lower()
    if suffix == ".txt":
        return [line.rstrip() for line in path.read_text(encoding="utf-8").splitlines()], None, "text"
    if suffix == ".csv":
        df = pd.read_csv(path)
        if "text" not in df.columns:
            raise ValueError("CSV должен содержать колонку text")
        return df["text"].fillna("").astype(str).tolist(), df, "csv"
    if suffix in {".geojson", ".json"}:
        gdf = gpd.read_file(path)
        if "text" not in gdf.columns:
            raise ValueError("GeoJSON должен содержать колонку text")
        return gdf["text"].fillna("").astype(str).tolist(), gdf, "geojson"
    raise ValueError("Поддерживаются только TXT/CSV/GeoJSON")


def _extract_upload_payload():
    """Return ``(name, payload)`` for the first file in ``upload_widget``, or ``(None, None)``.

    Handles both a dict value (file name -> payload) and a list/tuple of
    payload mappings with the name under ``"name"`` or ``"metadata"``.
    """
    files = upload_widget.value
    if not isinstance(files, (dict, list, tuple)) or not files:
        return None, None
    if isinstance(files, dict):
        return next(iter(files.items()))
    first = files[0]
    filename = first.get("name") or first.get("metadata", {}).get("name")
    return filename, first


def _handle_upload(_):
    """Button handler: save the uploaded file and point the input field at it."""
    upload_status.value = ""
    filename, payload = _extract_upload_payload()
    if filename and payload:
        # Keep the uploaded file's extension, defaulting to ".txt".
        extension = Path(filename).suffix or ".txt"
        destination = UPLOAD_TARGET.with_suffix(extension)
        destination.write_bytes(payload["content"])
        file_picker.value = str(destination)
        upload_status.value = f"Файл сохранён: {destination}"
    else:
        upload_status.value = "Выберите файл для загрузки"


def _resolve_bbox(_):
    """Button handler: look up a bounding box by territory name and fill the BBox field.

    Any lookup error is shown in the status label instead of being raised.
    """
    bbox_status.value = ""
    area = bbox_name_input.value.strip()
    if area:
        try:
            resolved = bbox_from_area_name(area)
        except Exception as exc:
            bbox_status.value = f"Ошибка: {exc}"
        else:
            bbox_input.value = resolved
            bbox_status.value = "BBox обновлён"
    else:
        bbox_status.value = "Введите название территории"


def _results_to_gdf(results: list) -> gpd.GeoDataFrame:
    """Build an EPSG:4326 GeoDataFrame from the results that have a geometry.

    Args:
        results: Geocoding results exposing ``geometry``, ``location``,
            ``osm_id`` and ``source_text`` attributes.

    Returns:
        A GeoDataFrame with ``geometry``, ``location``, ``osm_id`` and
        ``text`` columns. When no result has a geometry the frame is empty
        but still has these columns.
    """
    columns = ["geometry", "location", "osm_id", "text"]
    rows = [
        {
            "geometry": res.geometry,
            "location": res.location,
            "osm_id": res.osm_id,
            "text": res.source_text,
        }
        for res in results
        if res.geometry is not None
    ]
    # Explicit columns keep the "geometry" column present for an empty list;
    # without them GeoDataFrame raises "Unknown column geometry" and the
    # caller never reaches its "no points" message.
    return gpd.GeoDataFrame(rows, columns=columns, geometry="geometry", crs="EPSG:4326")


def _attach_results(data, results: list):
    result = data.copy()
    result["location"] = [res.location for res in results]
    result["osm_id"] = [res.osm_id for res in results]
    result["lat"] = [res.geometry.y if res.geometry is not None else None for res in results]
    result["lon"] = [res.geometry.x if res.geometry is not None else None for res in results]
    return result


def _run_geocoding(_):
    """Button handler: geocode the input texts, save the result and show a map.

    The output format follows the input: CSV and GeoJSON inputs get the
    geocoding columns attached, any other input is saved through
    ``build_geojson``/``save_geojson``. The HTML map is written with
    ``save_map`` and an interactive map is displayed when there are points.

    Raises:
        FileNotFoundError: If the input file does not exist (raised inside
            the output widget context).
    """
    geo_output.clear_output()
    with geo_output:
        source = Path(file_picker.value).expanduser()
        if not source.exists():
            raise FileNotFoundError(f"Файл не найден: {source}")
        texts, data, data_type = _load_texts(source)
        results = geocode_texts(
            texts,
            bbox=bbox_input.value.strip() or None,
            bbox_name=bbox_name_input.value.strip() or None,
        )
        target = _ensure_output_path(output_path.value, source)
        # Reflect the normalized path back into the form.
        output_path.value = str(target)

        points = _results_to_gdf(results)
        if data_type == "csv":
            _attach_results(data, results).to_csv(target, index=False)
        elif data_type == "geojson":
            _attach_results(data, results).to_file(target, driver="GeoJSON")
        else:
            save_geojson(target, build_geojson(results))
        display({"output": str(target)})

        save_map(map_output.value, results)
        display({"map": map_output.value})

        if not points.empty:
            display(points.explore())
        else:
            display("Нет точек для отображения на карте.")


# Connect the buttons to their handlers.
upload_button.on_click(_handle_upload)
bbox_resolve.on_click(_resolve_bbox)
run_geocoder.on_click(_run_geocoding)

# Render the form in top-to-bottom order.
display(
    file_picker,
    widgets.HBox([upload_widget, upload_button]),
    upload_status,
    bbox_input,
    widgets.HBox([bbox_name_input, bbox_resolve]),
    bbox_status,
    output_path,
    map_output,
    run_geocoder,
    geo_output,
)


### Классификация эмоций

Определяет эмоции в текстах и сохраняет результат с новой колонкой `emotion`.

**Как пользоваться:**
1. Укажите путь к CSV/GeoJSON (обычно `parsed_texts.csv`) или загрузите файл.
2. Проверьте путь к выходному файлу.
3. Нажмите **«Классифицировать»** — получите файл с эмоциями и круговую диаграмму распределения.


In [None]:
#@title Классификатор эмоций: импорт и запуск с UI
from pathlib import Path

import geopandas as gpd
import ipywidgets as widgets
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display

import sys
sys.path.insert(0, "/content/digital_identity/parsers")
from emotion_classifier import models_initialization, classify_emotions

# Default input (the parser's output) and the base name for uploaded files;
# the upload handler replaces the suffix with the uploaded file's one.
DEFAULT_INPUT = Path("/content/digital_identity/parsed_texts.csv")
UPLOAD_TARGET = Path("/content/digital_identity/uploaded_texts")

file_picker = widgets.Text(
    value=str(DEFAULT_INPUT),
    description="CSV/GeoJSON:",
    layout=widgets.Layout(width="80%"),
)
upload_widget = widgets.FileUpload(accept=".csv,.geojson,.json", multiple=False)
upload_button = widgets.Button(description="Загрузить файл")
upload_status = widgets.Label()

# Output locations for the labelled data and the pie chart image.
output_path = widgets.Text(
    value="/content/digital_identity/emotions_output.csv",
    description="Выход:",
    layout=widgets.Layout(width="80%"),
)
chart_output = widgets.Text(
    value="/content/digital_identity/emotions_chart.png",
    description="Диаграмма:",
    layout=widgets.Layout(width="80%"),
)
# Batch size passed to the classifier.
batch_size = widgets.IntSlider(
    value=32,
    min=4,
    max=128,
    step=4,
    description="Batch:",
    layout=widgets.Layout(width="70%"),
)
progress = widgets.IntProgress(value=0, min=0, max=1, description="Прогресс:")
progress_label = widgets.Label()
run_classifier = widgets.Button(description="Классифицировать")
# Captures everything displayed or raised while classification runs.
classifier_output = widgets.Output()


def _default_output_path(input_path: Path) -> Path:
    suffix = input_path.suffix.lower()
    if suffix in {".geojson", ".json"}:
        return input_path.with_name(f"{input_path.stem}_emotions.geojson")
    return input_path.with_name(f"{input_path.stem}_emotions.csv")


def _ensure_output_path(raw_value: str, input_path: Path) -> Path:
    target = Path(raw_value) if raw_value else _default_output_path(input_path)
    suffix = ".geojson" if input_path.suffix.lower() in {".geojson", ".json"} else ".csv"
    if target.suffix.lower() != suffix:
        target = target.with_suffix(suffix)
    return target


def _load_dataset(path: Path):
    suffix = path.suffix.lower()
    if suffix in {".geojson", ".json"}:
        gdf = gpd.read_file(path)
        if "text" not in gdf.columns:
            raise ValueError("GeoJSON должен содержать колонку text")
        return gdf, "geojson"
    if suffix == ".csv":
        df = pd.read_csv(path)
        if "text" not in df.columns:
            raise ValueError("CSV должен содержать колонку text")
        return df, "csv"
    raise ValueError("Поддерживаются только CSV или GeoJSON файлы")


def _extract_upload_payload():
    """Return ``(name, payload)`` for the first file in ``upload_widget``, or ``(None, None)``.

    Handles both a dict value (file name -> payload) and a list/tuple of
    payload mappings with the name under ``"name"`` or ``"metadata"``.
    """
    files = upload_widget.value
    if isinstance(files, dict):
        return next(iter(files.items())) if files else (None, None)
    if isinstance(files, (list, tuple)) and files:
        first = files[0]
        filename = first.get("name") or first.get("metadata", {}).get("name")
        return filename, first
    return None, None


def _handle_upload(_):
    """Button handler: save the uploaded file and point the input field at it."""
    upload_status.value = ""
    filename, payload = _extract_upload_payload()
    if filename and payload:
        # Keep the uploaded file's extension, defaulting to ".csv".
        extension = Path(filename).suffix or ".csv"
        destination = UPLOAD_TARGET.with_suffix(extension)
        destination.write_bytes(payload["content"])
        file_picker.value = str(destination)
        upload_status.value = f"Файл сохранён: {destination}"
    else:
        upload_status.value = "Выберите файл для загрузки"


def _plot_emotions(df: pd.DataFrame) -> None:
    """Draw a pie chart of the ``emotion`` column, save it and display it.

    The image is written to the path held by the ``chart_output`` widget.
    """
    distribution = df["emotion"].value_counts()
    figure, axis = plt.subplots(figsize=(6, 6))
    axis.pie(
        distribution.values,
        labels=distribution.index,
        autopct="%1.1f%%",
    )
    axis.set_title("Распределение эмоций")
    figure.savefig(chart_output.value, bbox_inches="tight")
    display(figure)
    # Close the figure so the notebook does not render it a second time.
    plt.close(figure)


def _run_classification(_):
    """Button handler: classify emotions, save the labelled table and plot the distribution.

    Models are initialised on first use. The result is written as GeoJSON for
    GeoJSON input and as CSV otherwise.

    Raises:
        FileNotFoundError: If the input file does not exist (raised inside
            the output widget context).
    """
    classifier_output.clear_output()
    with classifier_output:
        source = Path(file_picker.value).expanduser()
        if not source.exists():
            raise FileNotFoundError(f"Файл не найден: {source}")
        data, data_type = _load_dataset(source)
        # Load the models lazily, only when they are not initialised yet.
        if models_initialization._classification_model is None:
            models_initialization.init_models()
        texts = data["text"].fillna("").astype(str).tolist()
        # Keep max >= 1 so an empty input still gives a valid progress bar.
        progress.max = max(1, len(texts))
        progress.value = 0
        progress_label.value = ""

        def _report(done: int, total: int) -> None:
            progress.value = min(done, progress.max)
            progress_label.value = f"{done}/{total}"

        labels = classify_emotions(
            texts,
            batch_size=batch_size.value,
            progress_callback=_report,
        )
        data["emotion"] = labels
        target = _ensure_output_path(output_path.value, source)
        # Reflect the normalized path back into the form.
        output_path.value = str(target)
        if data_type != "geojson":
            data.to_csv(target, index=False)
        else:
            data.to_file(target, driver="GeoJSON")
        display({"output": str(target)})
        display(data.head())
        _plot_emotions(data)


# Connect the buttons to their handlers.
upload_button.on_click(_handle_upload)
run_classifier.on_click(_run_classification)

# Render the form in top-to-bottom order.
display(
    file_picker,
    widgets.HBox([upload_widget, upload_button]),
    upload_status,
    output_path,
    chart_output,
    batch_size,
    progress,
    progress_label,
    run_classifier,
    classifier_output,
)


### Тематическое моделирование (BERTopic)

Кластеризует тексты по темам, добавляет номер темы и ключевые слова, строит визуализации.

**Как пользоваться:**
1. Укажите путь к CSV/GeoJSON или загрузите файл.
2. Проверьте имя текстовой колонки (`text`) и, при необходимости, колонку времени.
3. Выберите язык/модель и нажмите **«Построить темы»**.
4. Результаты сохраняются в файл и показываются в таблице и графиках.


In [None]:
#@title BERTopic: кластеризация тем и визуализация
from pathlib import Path

import geopandas as gpd
import ipywidgets as widgets
import pandas as pd
import plotly.io as pio
from IPython.display import display

import sys
sys.path.insert(0, "/content/digital_identity/parsers")
from topic_modeler import attach_topics, train_topic_model
from wordcloud_generator import parse_stop_words

pio.renderers.default = "colab"

# Default input (the parser's output) and the base name for uploaded files;
# the upload handler replaces the suffix with the uploaded file's one.
DEFAULT_INPUT = Path("/content/digital_identity/parsed_texts.csv")
UPLOAD_TARGET = Path("/content/digital_identity/uploaded_topics")

file_picker = widgets.Text(
    value=str(DEFAULT_INPUT),
    description="CSV/GeoJSON:",
    layout=widgets.Layout(width="80%"),
)
upload_widget = widgets.FileUpload(accept=".csv,.geojson,.json", multiple=False)
upload_button = widgets.Button(description="Загрузить файл")
upload_status = widgets.Label()

# Column names: the text column is required, the timestamp column is
# optional (empty means "no timestamps").
text_column = widgets.Text(
    value="text",
    description="Колонка текста:",
    layout=widgets.Layout(width="80%"),
)
timestamp_column = widgets.Text(
    value="",
    description="Колонка времени:",
    layout=widgets.Layout(width="80%"),
)

# Language / embedding / stop-word settings passed to the topic model.
language_dropdown = widgets.Dropdown(
    options=[
        ("Многоязычный", "multilingual"),
        ("Русский", "russian"),
        ("English", "english"),
    ],
    value="multilingual",
    description="Язык:",
)
embedding_model = widgets.Text(
    value="",
    description="Embedding модель:",
    placeholder="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    layout=widgets.Layout(width="80%"),
)
stop_words = widgets.Text(
    value="",
    description="Стоп-слова:",
    placeholder="russian, english, custom",
    layout=widgets.Layout(width="80%"),
)

# Clustering settings; min_samples is only sent for HDBSCAN and n_clusters
# only for k-means (see the run handler).
cluster_method = widgets.ToggleButtons(
    options=[("HDBSCAN", "hdbscan"), ("k-means", "kmeans")],
    value="hdbscan",
    description="Кластеризация:",
)
min_topic_size = widgets.IntSlider(
    value=10,
    min=2,
    max=50,
    step=1,
    description="Мин. размер темы:",
    layout=widgets.Layout(width="70%"),
)
min_samples = widgets.IntSlider(
    value=5,
    min=1,
    max=50,
    step=1,
    description="min_samples:",
    layout=widgets.Layout(width="70%"),
)
n_clusters = widgets.IntSlider(
    value=10,
    min=2,
    max=100,
    step=1,
    description="k-means k:",
    layout=widgets.Layout(width="70%"),
)

# 0 is converted to None ("auto") by the run handler.
nr_topics = widgets.IntText(
    value=0,
    description="Число тем (0=auto):",
)
top_n_keywords = widgets.IntSlider(
    value=5,
    min=3,
    max=15,
    step=1,
    description="Ключевые слова:",
    layout=widgets.Layout(width="70%"),
)
reduce_frequent_words = widgets.Checkbox(
    value=True,
    description="Фильтровать частотные слова (TF-IDF)",
)

# Output locations for the table and for saved figures.
output_path = widgets.Text(
    value="/content/digital_identity/topics_output.csv",
    description="Выход:",
    layout=widgets.Layout(width="80%"),
)
visual_output_dir = widgets.Text(
    value="/content/digital_identity/topics_visuals",
    description="Визуализации:",
    layout=widgets.Layout(width="80%"),
)

# Which figures to build after training.
visualizations = widgets.SelectMultiple(
    options=[
        ("Карта тем", "topics"),
        ("Барчарт", "barchart"),
        ("Иерархия", "hierarchy"),
        ("Тепловая карта", "heatmap"),
        ("Темы во времени", "over_time"),
    ],
    value=("topics", "barchart"),
    description="Визуализации:",
    layout=widgets.Layout(width="70%"),
)

progress = widgets.IntProgress(value=0, min=0, max=1, description="Прогресс:")
run_model = widgets.Button(description="Построить темы")
# Captures everything displayed or raised while the model runs.
model_output = widgets.Output()


def _default_output_path(input_path: Path) -> Path:
    suffix = input_path.suffix.lower()
    if suffix in {".geojson", ".json"}:
        return input_path.with_name(f"{input_path.stem}_topics.geojson")
    return input_path.with_name(f"{input_path.stem}_topics.csv")


def _ensure_output_path(raw_value: str, input_path: Path) -> Path:
    target = Path(raw_value) if raw_value else _default_output_path(input_path)
    suffix = ".geojson" if input_path.suffix.lower() in {".geojson", ".json"} else ".csv"
    if target.suffix.lower() != suffix:
        target = target.with_suffix(suffix)
    return target


def _load_dataset(path: Path):
    suffix = path.suffix.lower()
    if suffix in {".geojson", ".json"}:
        gdf = gpd.read_file(path)
        return gdf, "geojson"
    if suffix == ".csv":
        df = pd.read_csv(path)
        return df, "csv"
    raise ValueError("Поддерживаются только CSV или GeoJSON файлы")


def _parse_stop_words(raw_value: str):
    """Return the parsed stop words as a sorted list, or None when there are none."""
    parsed = parse_stop_words(raw_value)
    if not parsed:
        return None
    return sorted(parsed)


def _extract_upload_payload():
    """Return ``(name, payload)`` for the first file in ``upload_widget``, or ``(None, None)``.

    Handles both a dict value (file name -> payload) and a list/tuple of
    payload mappings with the name under ``"name"`` or ``"metadata"``.
    """
    nothing = (None, None)
    files = upload_widget.value
    if isinstance(files, dict):
        if files:
            return next(iter(files.items()))
        return nothing
    if not isinstance(files, (list, tuple)) or not files:
        return nothing
    first = files[0]
    filename = first.get("name") or first.get("metadata", {}).get("name")
    return filename, first


def _handle_upload(_):
    """Button handler: save the uploaded file and point the input field at it."""
    upload_status.value = ""
    filename, payload = _extract_upload_payload()
    if filename and payload:
        # Keep the uploaded file's extension, defaulting to ".csv".
        extension = Path(filename).suffix or ".csv"
        destination = UPLOAD_TARGET.with_suffix(extension)
        destination.write_bytes(payload["content"])
        file_picker.value = str(destination)
        upload_status.value = f"Файл сохранён: {destination}"
    else:
        upload_status.value = "Выберите файл для загрузки"


def _parse_timestamps(data: pd.DataFrame, column_name: str):
    if not column_name:
        return None
    if column_name not in data.columns:
        raise ValueError(f"Колонка времени '{column_name}' не найдена")
    timestamps = pd.to_datetime(data[column_name], errors="coerce")
    if timestamps.isna().all():
        raise ValueError("Не удалось распознать временные метки")
    return timestamps


def _save_plotly_figure(fig, name: str):
    """Save a figure as HTML and, when possible, as PNG in the visuals folder.

    A failed PNG export is reported but does not stop the run; the displayed
    summary then carries ``None`` for the image path.
    """
    folder = Path(visual_output_dir.value).expanduser()
    folder.mkdir(parents=True, exist_ok=True)
    html_path = folder / f"{name}.html"
    fig.write_html(str(html_path))
    png_path = folder / f"{name}.png"
    try:
        fig.write_image(str(png_path))
    except Exception as exc:
        display(f"Не удалось сохранить PNG: {exc}")
        saved_image = None
    else:
        saved_image = str(png_path)
    display({"html": str(html_path), "image": saved_image})


def _run_topic_model(_):
    """Button handler: train the topic model, save the labelled table and build figures.

    Steps: load the input table, read the text (and optional timestamp)
    column, train via ``train_topic_model``, attach topics with
    ``attach_topics``, write the result (GeoJSON for GeoJSON input, CSV
    otherwise) and render the selected visualizations.

    Raises:
        FileNotFoundError: If the input file does not exist.
        ValueError: If the text column is missing (or the timestamp column
            is invalid). Both are raised inside the output widget context.
    """
    model_output.clear_output()
    with model_output:
        input_path = Path(file_picker.value).expanduser()
        if not input_path.exists():
            raise FileNotFoundError(f"Файл не найден: {input_path}")
        data, data_type = _load_dataset(input_path)
        text_col = text_column.value.strip() or "text"
        if text_col not in data.columns:
            raise ValueError(f"Колонка текста '{text_col}' не найдена")
        texts = data[text_col].fillna("").astype(str).tolist()
        timestamps = _parse_timestamps(data, timestamp_column.value.strip())

        progress.value = 0
        # Empty/zero widget values are turned into None; only the parameter
        # matching the chosen clustering method is passed.
        result = train_topic_model(
            texts,
            language=language_dropdown.value,
            min_topic_size=min_topic_size.value,
            nr_topics=nr_topics.value or None,
            embedding_model=embedding_model.value.strip() or None,
            stop_words=_parse_stop_words(stop_words.value),
            cluster_method=cluster_method.value,
            n_clusters=n_clusters.value if cluster_method.value == "kmeans" else None,
            min_samples=min_samples.value if cluster_method.value == "hdbscan" else None,
            reduce_frequent_words=reduce_frequent_words.value,
        )
        data = attach_topics(
            data,
            result.topics,
            result.model,
            top_n=top_n_keywords.value,
        )

        target = _ensure_output_path(output_path.value, input_path)
        # Reflect the normalized path back into the form.
        output_path.value = str(target)
        if data_type == "geojson":
            data.to_file(target, driver="GeoJSON")
        else:
            data.to_csv(target, index=False)

        progress.value = 1
        display({"output": str(target)})
        display(data.head())
        topic_info = result.model.get_topic_info()
        display(topic_info.head(10))

        # Topic -1 is excluded from the count used to gate the hierarchy and
        # heatmap figures (presumably the outlier topic - TODO confirm).
        available_topics = topic_info[topic_info["Topic"] != -1]
        topic_count = len(available_topics)

        selected = set(visualizations.value)
        if "topics" in selected:
            fig = result.model.visualize_topics()
            _save_plotly_figure(fig, f"{input_path.stem}_topics")
            display(fig)
        if "barchart" in selected:
            fig = result.model.visualize_barchart(top_n_topics=20)
            _save_plotly_figure(fig, f"{input_path.stem}_barchart")
            display(fig)
        if "hierarchy" in selected:
            if topic_count < 2:
                display("Недостаточно тем для иерархии (нужно минимум 2).")
            else:
                fig = result.model.visualize_hierarchy()
                _save_plotly_figure(fig, f"{input_path.stem}_hierarchy")
                display(fig)
        if "heatmap" in selected:
            if topic_count < 2:
                display("Недостаточно тем для тепловой карты (нужно минимум 2).")
            else:
                fig = result.model.visualize_heatmap()
                _save_plotly_figure(fig, f"{input_path.stem}_heatmap")
                display(fig)
        if "over_time" in selected:
            if timestamps is None:
                display("Для динамики тем укажите колонку времени.")
            else:
                # NOTE(review): timestamps may still contain NaT for rows that
                # failed to parse; verify that topics_over_time tolerates
                # missing values or filter such rows first.
                topics_over_time = result.model.topics_over_time(
                    texts, timestamps, nr_bins=20
                )
                fig = result.model.visualize_topics_over_time(topics_over_time)
                _save_plotly_figure(fig, f"{input_path.stem}_over_time")
                display(fig)


def _toggle_cluster_controls(change=None):
    """Show the k-means slider for k-means and the min_samples slider otherwise."""
    if cluster_method.value == "kmeans":
        kmeans_display, hdbscan_display = "", "none"
    else:
        kmeans_display, hdbscan_display = "none", ""
    n_clusters.layout.display = kmeans_display
    min_samples.layout.display = hdbscan_display


# Keep the clustering controls in sync with the selected method and apply
# the initial state once.
cluster_method.observe(_toggle_cluster_controls, names="value")
_toggle_cluster_controls()

# Connect the buttons to their handlers.
upload_button.on_click(_handle_upload)
run_model.on_click(_run_topic_model)

# Render the form in top-to-bottom order.
display(
    file_picker,
    widgets.HBox([upload_widget, upload_button]),
    upload_status,
    text_column,
    timestamp_column,
    language_dropdown,
    embedding_model,
    stop_words,
    cluster_method,
    min_topic_size,
    min_samples,
    n_clusters,
    nr_topics,
    top_n_keywords,
    reduce_frequent_words,
    output_path,
    visual_output_dir,
    visualizations,
    progress,
    run_model,
    model_output,
)


### Извлечение ключевых слов (KeyBERT)

Извлекает ключевые слова из текстов и сохраняет их в итоговый файл.

**Как пользоваться:**
1. Укажите путь к CSV/GeoJSON или загрузите файл.
2. Проверьте колонку текста, при необходимости укажите модель эмбеддингов.
3. Настройте стоп‑слова, n‑граммы и параметры MMR/MaxSum.
4. Нажмите **«Извлечь ключевые слова»** — в выходном файле появится колонка с ключевыми словами, также строится топ‑лист.


In [None]:
#@title KeyBERT: извлечение ключевых слов и визуализация
from pathlib import Path

import geopandas as gpd
import ipywidgets as widgets
import pandas as pd
import plotly.express as px
import plotly.io as pio
from IPython.display import display

import sys
sys.path.insert(0, "/content/digital_identity/parsers")
from keybert_modeler import attach_keywords, extract_keywords
from wordcloud_generator import parse_stop_words

# Render Plotly figures with the Colab renderer.
pio.renderers.default = "colab"

# Default input file, and the base path for uploaded files (the upload handler
# replaces the suffix with the uploaded file's extension).
DEFAULT_INPUT = Path("/content/digital_identity/parsed_texts.csv")
UPLOAD_TARGET = Path("/content/digital_identity/uploaded_keywords")

# --- Input selection -------------------------------------------------------
file_picker = widgets.Text(
    value=str(DEFAULT_INPUT),
    description="CSV/GeoJSON:",
    layout=widgets.Layout(width="80%"),
)
upload_widget = widgets.FileUpload(accept=".csv,.geojson,.json", multiple=False)
upload_button = widgets.Button(description="Загрузить файл")
upload_status = widgets.Label()

# Name of the column holding the texts; a blank value falls back to "text" at run time.
text_column = widgets.Text(
    value="text",
    description="Колонка текста:",
    layout=widgets.Layout(width="80%"),
)

# Blank means None is passed as the embedding model to extract_keywords.
embedding_model = widgets.Text(
    value="",
    description="Embedding модель:",
    placeholder="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    layout=widgets.Layout(width="80%"),
)

# Raw stop-word specification; parsed by _parse_stop_words before use.
stop_words = widgets.Text(
    value="",
    description="Стоп-слова:",
    placeholder="russian, english, custom",
    layout=widgets.Layout(width="80%"),
)

# --- Extraction parameters ---------------------------------------------------
# Lower and upper bounds of the keyphrase n-gram range. The sliders are
# independent, so nothing here prevents the minimum from exceeding the maximum.
ngram_min = widgets.IntSlider(
    value=1,
    min=1,
    max=3,
    step=1,
    description="n-gram мин:",
    layout=widgets.Layout(width="70%"),
)
ngram_max = widgets.IntSlider(
    value=2,
    min=1,
    max=4,
    step=1,
    description="n-gram макс:",
    layout=widgets.Layout(width="70%"),
)

# Forwarded to extract_keywords as top_n.
keywords_top_n = widgets.IntSlider(
    value=5,
    min=3,
    max=20,
    step=1,
    description="Ключевые слова:",
    layout=widgets.Layout(width="70%"),
)

# Forwarded to extract_keywords as use_mmr / diversity.
use_mmr = widgets.Checkbox(
    value=False,
    description="MMR",
)

diversity = widgets.FloatSlider(
    value=0.5,
    min=0.1,
    max=0.9,
    step=0.1,
    description="Diversity:",
    layout=widgets.Layout(width="70%"),
)

# Forwarded to extract_keywords as use_maxsum / nr_candidates.
use_maxsum = widgets.Checkbox(
    value=False,
    description="MaxSum",
)

nr_candidates = widgets.IntSlider(
    value=20,
    min=5,
    max=50,
    step=1,
    description="Кандидаты:",
    layout=widgets.Layout(width="70%"),
)

# --- Outputs -----------------------------------------------------------------
# Table output path; its extension is adjusted to the input type at run time.
output_path = widgets.Text(
    value="/content/digital_identity/keywords_output.csv",
    description="Выход:",
    layout=widgets.Layout(width="80%"),
)
# Directory that receives the HTML/PNG exports of the figures.
visual_output_dir = widgets.Text(
    value="/content/digital_identity/keywords_visuals",
    description="Визуализации:",
    layout=widgets.Layout(width="80%"),
)

# Which figures to build; only the top-keywords bar chart is offered.
visualizations = widgets.SelectMultiple(
    options=[
        ("Топ ключевых слов", "top_keywords"),
    ],
    value=("top_keywords",),
    description="Визуализации:",
    layout=widgets.Layout(width="70%"),
)

# progress.max is reset to the number of texts when a run starts.
progress = widgets.IntProgress(value=0, min=0, max=1, description="Прогресс:")
run_model = widgets.Button(description="Извлечь ключевые слова")
model_output = widgets.Output()


def _default_output_path(input_path: Path) -> Path:
    """Build the default output path ``<stem>_keywords.<ext>`` next to the input.

    ``.geojson``/``.json`` inputs (case-insensitive) map to ``.geojson``;
    every other input maps to ``.csv``.
    """
    is_geo = input_path.suffix.lower() in (".geojson", ".json")
    extension = "geojson" if is_geo else "csv"
    return input_path.with_name(f"{input_path.stem}_keywords.{extension}")


def _ensure_output_path(raw_value: str, input_path: Path) -> Path:
    """Resolve the output path and force its extension to match the input type.

    Args:
        raw_value: Path typed by the user. Surrounding whitespace is ignored
            and a leading ``~`` is expanded; a blank value selects the
            default path derived from ``input_path``.
        input_path: Dataset being processed. ``.geojson``/``.json`` inputs
            require a ``.geojson`` output, anything else requires ``.csv``.

    Returns:
        The target path carrying the required suffix.
    """
    cleaned = (raw_value or "").strip()
    # Without stripping, a whitespace-only value would bypass the default and
    # produce a file whose name consists of spaces.
    target = Path(cleaned).expanduser() if cleaned else _default_output_path(input_path)
    suffix = ".geojson" if input_path.suffix.lower() in {".geojson", ".json"} else ".csv"
    if target.suffix.lower() != suffix:
        target = target.with_suffix(suffix)
    return target


def _load_dataset(path: Path):
    """Read the input file and report which format it had.

    Returns:
        ``(GeoDataFrame, "geojson")`` for ``.geojson``/``.json`` files, or
        ``(DataFrame, "csv")`` for ``.csv`` files (extension is
        case-insensitive).

    Raises:
        ValueError: The extension is not one of the supported ones.
    """
    extension = path.suffix.lower()
    if extension == ".csv":
        return pd.read_csv(path), "csv"
    if extension in (".geojson", ".json"):
        return gpd.read_file(path), "geojson"
    raise ValueError("Поддерживаются только CSV или GeoJSON файлы")


def _parse_stop_words(raw_value: str):
    """Convert the stop-words field into a sorted list, or ``None`` if empty.

    The raw string is handed to ``parse_stop_words``; a falsy result becomes
    ``None``, otherwise its items are returned in sorted order.
    """
    parsed = parse_stop_words(raw_value)
    if not parsed:
        return None
    return sorted(parsed)


def _extract_upload_payload():
    """Return ``(name, payload)`` for the first uploaded file, else ``(None, None)``.

    Two shapes of ``upload_widget.value`` are understood: a dict keyed by
    file name, and a list/tuple of per-file dicts whose name is stored under
    ``"name"`` or under ``"metadata" -> "name"``.
    """
    uploaded = upload_widget.value
    if not isinstance(uploaded, (dict, list, tuple)) or not uploaded:
        return None, None
    if isinstance(uploaded, dict):
        # First (file name, payload) pair of the mapping.
        return next(iter(uploaded.items()))
    first = uploaded[0]
    file_name = first.get("name") or first.get("metadata", {}).get("name")
    return file_name, first


def _handle_upload(_):
    """Save the uploaded file and point the input field at the saved copy.

    The file is written to ``UPLOAD_TARGET`` with the upload's own extension
    (``.csv`` when the name has none). The outcome, including a failure to
    write, is reported through ``upload_status``.
    """
    upload_status.value = ""
    name, payload = _extract_upload_payload()
    if not name or not payload:
        upload_status.value = "Выберите файл для загрузки"
        return
    suffix = Path(name).suffix or ".csv"
    target = UPLOAD_TARGET.with_suffix(suffix)
    try:
        # The destination directory is not guaranteed to exist yet.
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(payload["content"])
    except OSError as exc:
        # This handler does not run inside an Output widget, so report the
        # failure where the user is already looking.
        upload_status.value = f"Не удалось сохранить файл: {exc}"
        return
    file_picker.value = str(target)
    upload_status.value = f"Файл сохранён: {target}"


def _save_plotly_figure(fig, name: str):
    """Export ``fig`` as ``<name>.html`` and ``<name>.png`` into the visuals directory.

    The directory comes from ``visual_output_dir`` and is created if missing.
    PNG export is best-effort: on any failure the error is displayed and the
    reported image path is ``None``. The resulting paths are displayed.
    """
    target_dir = Path(visual_output_dir.value).expanduser()
    target_dir.mkdir(parents=True, exist_ok=True)

    html_file = target_dir / f"{name}.html"
    fig.write_html(str(html_file))

    png_file = target_dir / f"{name}.png"
    saved_png = None
    try:
        fig.write_image(str(png_file))
    except Exception as exc:
        display(f"Не удалось сохранить PNG: {exc}")
    else:
        saved_png = str(png_file)
    display({"html": str(html_file), "image": saved_png})


def _run_keybert(_):
    """Extract keywords for the selected file, save the result and plot the top list.

    All settings are read from the widgets of this cell; everything displayed
    goes into ``model_output``.

    Raises:
        FileNotFoundError: The input file does not exist.
        ValueError: The text column is missing, the file type is unsupported,
            or the n-gram minimum exceeds the maximum.
    """
    from collections import Counter

    model_output.clear_output()
    with model_output:
        input_path = Path(file_picker.value).expanduser()
        if not input_path.exists():
            raise FileNotFoundError(f"Файл не найден: {input_path}")
        # The two sliders are independent, so the minimum can be set above the
        # maximum; reject that before any heavy work starts.
        ngram_range = (ngram_min.value, ngram_max.value)
        if ngram_range[0] > ngram_range[1]:
            raise ValueError("n-gram мин не может быть больше n-gram макс")
        data, data_type = _load_dataset(input_path)
        text_col = text_column.value.strip() or "text"
        if text_col not in data.columns:
            raise ValueError(f"Колонка текста '{text_col}' не найдена")
        texts = data[text_col].fillna("").astype(str).tolist()

        progress.max = max(1, len(texts))
        progress.value = 0

        def _update_progress(done: int, total: int) -> None:
            progress.value = min(done, progress.max)

        result = extract_keywords(
            texts,
            embedding_model=embedding_model.value.strip() or None,
            top_n=keywords_top_n.value,
            keyphrase_ngram_range=ngram_range,
            stop_words=_parse_stop_words(stop_words.value),
            use_mmr=use_mmr.value,
            diversity=diversity.value,
            use_maxsum=use_maxsum.value,
            nr_candidates=nr_candidates.value,
            progress_callback=_update_progress,
        )
        # Show the bar as complete regardless of how often the callback fired.
        progress.value = progress.max
        data = attach_keywords(data, result.keywords)

        target = _ensure_output_path(output_path.value, input_path)
        # Allow output paths inside folders that do not exist yet.
        target.parent.mkdir(parents=True, exist_ok=True)
        output_path.value = str(target)
        if data_type == "geojson":
            data.to_file(target, driver="GeoJSON")
        else:
            data.to_csv(target, index=False)

        display({"output": str(target)})
        display(data.head())

        if "top_keywords" in set(visualizations.value):
            # Each entry of result.keywords is a list of (keyword, score) pairs;
            # count in how many pairs every keyword occurs.
            freq = Counter(
                keyword
                for keyword_list in result.keywords
                for keyword, _score in keyword_list
            )
            if not freq:
                display("Нет ключевых слов для визуализации.")
            else:
                # The 30 most frequent keywords; ties keep first-seen order.
                freq_df = pd.DataFrame(
                    freq.most_common(30), columns=["keyword", "count"]
                )
                fig = px.bar(freq_df, x="keyword", y="count", title="Топ ключевых слов")
                fig.update_layout(xaxis_tickangle=-45)
                _save_plotly_figure(fig, f"{input_path.stem}_keywords")
                display(fig)


# Connect the buttons to their handlers.
upload_button.on_click(_handle_upload)
run_model.on_click(_run_keybert)

# Render the form; each widget is displayed in the order listed.
display(
    file_picker,
    widgets.HBox([upload_widget, upload_button]),
    upload_status,
    text_column,
    embedding_model,
    stop_words,
    widgets.HBox([ngram_min, ngram_max]),
    keywords_top_n,
    use_mmr,
    diversity,
    use_maxsum,
    nr_candidates,
    output_path,
    visual_output_dir,
    visualizations,
    progress,
    run_model,
    model_output,
)


### Семантическая лепестковая диаграмма

Оценивает тексты по заданным осям (до 10) и строит лепестковую диаграмму по средним значениям.

**Как пользоваться:**
1. Укажите путь к CSV/GeoJSON и колонку текста.
2. Опишите оси в формате `Название | левые слова | правые слова` (каждая ось в отдельной строке).
3. Нажмите **«Построить»** — получите таблицу оценок и диаграмму.


In [None]:
#@title Семантическая лепестковая диаграмма: оценка по осям и визуализация
from pathlib import Path

import geopandas as gpd
import ipywidgets as widgets
import pandas as pd
import plotly.io as pio
from IPython.display import display

import sys
sys.path.insert(0, "/content/digital_identity/parsers")
from semantic_axes import build_radar_chart, parse_axis_lines, score_texts_on_axes

# Render Plotly figures with the Colab renderer.
pio.renderers.default = "colab"

# Default input file, and the base path for uploaded files (the upload handler
# replaces the suffix with the uploaded file's extension).
DEFAULT_INPUT = Path("/content/digital_identity/parsed_texts.csv")
UPLOAD_TARGET = Path("/content/digital_identity/uploaded_semantic_axes")


def load_table(path: Path) -> pd.DataFrame:
    """Load the input table from ``path``.

    ``.geojson``/``.json`` files (case-insensitive) are read with GeoPandas;
    every other file is read as CSV.
    """
    is_geo = path.suffix.lower() in {".geojson", ".json"}
    reader = gpd.read_file if is_geo else pd.read_csv
    return reader(path)


# --- Input selection -------------------------------------------------------
upload = widgets.FileUpload(accept=".csv,.geojson,.json", multiple=False)
upload_button = widgets.Button(description="Загрузить файл")
upload_status = widgets.Label()
input_path_widget = widgets.Text(value=str(DEFAULT_INPUT), description="Путь:")
# A blank column name falls back to "text" at run time.
text_column_widget = widgets.Text(value="text", description="Колонка:")
# One axis per line; the text is parsed by parse_axis_lines when the run starts.
axes_widget = widgets.Textarea(
    value="Ось 1 | негатив, плохой | позитив, хороший\nОсь 2 | медленно | быстро",
    description="Оси:",
    layout=widgets.Layout(width="100%", height="120px"),
)
# Passed to score_texts_on_axes as model_name.
model_widget = widgets.Text(
    value="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    description="Модель:",
)
# --- Outputs -----------------------------------------------------------------
# Table output path; its extension is adjusted to the input type at run time.
output_path = widgets.Text(
    value="/content/digital_identity/semantic_axes_output.csv",
    description="Выход:",
    layout=widgets.Layout(width="80%"),
)
# Directory that receives the HTML/PNG exports of the chart.
visual_output_dir = widgets.Text(
    value="/content/digital_identity/semantic_axes_visuals",
    description="Визуализации:",
    layout=widgets.Layout(width="80%"),
)
# Two-state indicator: 0 before scoring, 1 after it finishes.
progress = widgets.IntProgress(value=0, min=0, max=1, description="Прогресс:")
run_button = widgets.Button(description="Построить")
output = widgets.Output()


def _default_output_path(input_path: Path) -> Path:
    """Build the default output path ``<stem>_semantic_axes.<ext>`` next to the input.

    ``.geojson``/``.json`` inputs (case-insensitive) map to ``.geojson``;
    every other input maps to ``.csv``.
    """
    geo_input = input_path.suffix.lower() in (".geojson", ".json")
    file_name = f"{input_path.stem}_semantic_axes" + (".geojson" if geo_input else ".csv")
    return input_path.with_name(file_name)


def _ensure_output_path(raw_value: str, input_path: Path) -> Path:
    """Resolve the output path and force its extension to match the input type.

    Args:
        raw_value: Path typed by the user. Surrounding whitespace is ignored
            and a leading ``~`` is expanded; a blank value selects the
            default path derived from ``input_path``.
        input_path: Dataset being processed. ``.geojson``/``.json`` inputs
            require a ``.geojson`` output, anything else requires ``.csv``.

    Returns:
        The target path carrying the required suffix.
    """
    cleaned = (raw_value or "").strip()
    # Without stripping, a whitespace-only value would bypass the default and
    # produce a file whose name consists of spaces.
    target = Path(cleaned).expanduser() if cleaned else _default_output_path(input_path)
    suffix = ".geojson" if input_path.suffix.lower() in {".geojson", ".json"} else ".csv"
    if target.suffix.lower() != suffix:
        target = target.with_suffix(suffix)
    return target


def _extract_upload_payload():
    """Return ``(name, payload)`` for the first uploaded file, else ``(None, None)``.

    Two shapes of ``upload.value`` are understood: a dict keyed by file name,
    and a list/tuple of per-file dicts whose name is stored under ``"name"``
    or under ``"metadata" -> "name"``.
    """
    uploaded = upload.value
    if not isinstance(uploaded, (dict, list, tuple)) or not uploaded:
        return None, None
    if isinstance(uploaded, dict):
        # First (file name, payload) pair of the mapping.
        return next(iter(uploaded.items()))
    first = uploaded[0]
    file_name = first.get("name") or first.get("metadata", {}).get("name")
    return file_name, first


def _handle_upload(_):
    """Save the uploaded file and point the input field at the saved copy.

    The file is written to ``UPLOAD_TARGET`` with the upload's own extension
    (``.csv`` when the name has none). The outcome, including a failure to
    write, is reported through ``upload_status``.
    """
    upload_status.value = ""
    name, payload = _extract_upload_payload()
    if not name or not payload:
        upload_status.value = "Выберите файл для загрузки"
        return
    suffix = Path(name).suffix or ".csv"
    target = UPLOAD_TARGET.with_suffix(suffix)
    try:
        # The destination directory is not guaranteed to exist yet.
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(payload["content"])
    except OSError as exc:
        # This handler does not run inside an Output widget, so report the
        # failure where the user is already looking.
        upload_status.value = f"Не удалось сохранить файл: {exc}"
        return
    input_path_widget.value = str(target)
    upload_status.value = f"Файл сохранён: {target}"


def _save_plotly_figure(fig, name: str):
    """Export ``fig`` as ``<name>.html`` and ``<name>.png`` into the visuals directory.

    The directory comes from ``visual_output_dir`` and is created if missing.
    PNG export is best-effort: on any failure the error is displayed and the
    reported image path is ``None``. The resulting paths are displayed.
    """
    target_dir = Path(visual_output_dir.value).expanduser()
    target_dir.mkdir(parents=True, exist_ok=True)

    html_file = target_dir / f"{name}.html"
    fig.write_html(str(html_file))

    png_file = target_dir / f"{name}.png"
    saved_png = None
    try:
        fig.write_image(str(png_file))
    except Exception as exc:
        display(f"Не удалось сохранить PNG: {exc}")
    else:
        saved_png = str(png_file)
    display({"html": str(html_file), "image": saved_png})


def on_run(_):
    """Score the texts on the configured axes, save the scores and draw the radar chart.

    All settings are read from the widgets of this cell; everything displayed
    goes into ``output``. The per-text scores are appended to a copy of the
    input table as one column per axis and written to the output file.

    Raises:
        FileNotFoundError: The input file does not exist.
        ValueError: The text column is missing, or the number of axes is not
            between 1 and 10.
    """
    output.clear_output()
    with output:
        input_path = Path(input_path_widget.value).expanduser()
        # Same up-front check as the other tools, so a wrong path yields a
        # readable message rather than a reader-specific error.
        if not input_path.exists():
            raise FileNotFoundError(f"Файл не найден: {input_path}")
        data = load_table(input_path)
        text_col = text_column_widget.value.strip() or "text"
        if text_col not in data.columns:
            raise ValueError(f"Колонка текста '{text_col}' не найдена")
        texts = data[text_col].fillna("").astype(str).tolist()
        axes = parse_axis_lines(axes_widget.value)
        if len(axes) == 0:
            raise ValueError("Укажите хотя бы одну ось.")
        if len(axes) > 10:
            raise ValueError("Максимум 10 осей.")
        progress.value = 0
        scores = score_texts_on_axes(
            texts, axes, model_name=model_widget.value.strip(), show_progress=True
        )
        progress.value = 1
        average = scores.mean()
        display(scores.head())
        display(pd.DataFrame({"axis": average.index, "mean_score": average.values}))

        # Assign by position (.values) so the input table's index is irrelevant.
        result = data.copy()
        for column in scores.columns:
            result[column] = scores[column].values

        target = _ensure_output_path(output_path.value, input_path)
        # Allow output paths inside folders that do not exist yet.
        target.parent.mkdir(parents=True, exist_ok=True)
        output_path.value = str(target)
        if input_path.suffix.lower() in {".geojson", ".json"}:
            result.to_file(target, driver="GeoJSON")
        else:
            result.to_csv(target, index=False)
        display({"output": str(target)})

        fig = build_radar_chart(average)
        _save_plotly_figure(fig, f"{input_path.stem}_semantic_axes")
        fig.show()


# Connect the buttons to their handlers.
upload_button.on_click(_handle_upload)
run_button.on_click(on_run)

# Render the whole form as a single vertical stack.
display(
    widgets.VBox([
        widgets.HBox([upload, upload_button]),
        upload_status,
        input_path_widget,
        text_column_widget,
        axes_widget,
        model_widget,
        output_path,
        visual_output_dir,
        progress,
        run_button,
        output,
    ])
)


### Семантическая карта

Строит карту по двум осям (оси трактуются как диаметры, центр находится в нуле).

**Как пользоваться:**
1. Укажите путь к CSV/GeoJSON и колонку текста.
2. Задайте ровно две оси в формате `Название | левые слова | правые слова`.
3. Нажмите **«Построить»** — получите точки на карте, где оси пересекаются в центре.


In [None]:
#@title Семантическая карта: оценка по двум осям и визуализация
from pathlib import Path

import geopandas as gpd
import ipywidgets as widgets
import pandas as pd
import plotly.io as pio
from IPython.display import display

import sys
sys.path.insert(0, "/content/digital_identity/parsers")
from semantic_axes import build_semantic_map, parse_axis_lines, score_texts_on_axes

# Render Plotly figures with the Colab renderer.
pio.renderers.default = "colab"

# Default input file, and the base path for uploaded files (the upload handler
# replaces the suffix with the uploaded file's extension).
DEFAULT_INPUT = Path("/content/digital_identity/parsed_texts.csv")
UPLOAD_TARGET = Path("/content/digital_identity/uploaded_semantic_map")


def load_table(path: Path) -> pd.DataFrame:
    """Load the input table from ``path``.

    ``.geojson``/``.json`` files (case-insensitive) are read with GeoPandas;
    every other file is read as CSV.
    """
    extension = path.suffix.lower()
    if extension not in (".geojson", ".json"):
        return pd.read_csv(path)
    return gpd.read_file(path)


# --- Input selection -------------------------------------------------------
upload = widgets.FileUpload(accept=".csv,.geojson,.json", multiple=False)
upload_button = widgets.Button(description="Загрузить файл")
upload_status = widgets.Label()
input_path_widget = widgets.Text(value=str(DEFAULT_INPUT), description="Путь:")
# A blank column name falls back to "text" at run time.
text_column_widget = widgets.Text(value="text", description="Колонка:")
# One axis per line; the run handler requires exactly two axes.
axes_widget = widgets.Textarea(
    value="Ось X | низкий | высокий\nОсь Y | холодный | тёплый",
    description="Оси:",
    layout=widgets.Layout(width="100%", height="120px"),
)
# Passed to score_texts_on_axes as model_name.
model_widget = widgets.Text(
    value="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    description="Модель:",
)
# --- Outputs -----------------------------------------------------------------
# Table output path; its extension is adjusted to the input type at run time.
output_path = widgets.Text(
    value="/content/digital_identity/semantic_map_output.csv",
    description="Выход:",
    layout=widgets.Layout(width="80%"),
)
# Directory that receives the HTML/PNG exports of the map.
visual_output_dir = widgets.Text(
    value="/content/digital_identity/semantic_map_visuals",
    description="Визуализации:",
    layout=widgets.Layout(width="80%"),
)
# Two-state indicator: 0 before scoring, 1 after it finishes.
progress = widgets.IntProgress(value=0, min=0, max=1, description="Прогресс:")
run_button = widgets.Button(description="Построить")
output = widgets.Output()


def _default_output_path(input_path: Path) -> Path:
    """Build the default output path ``<stem>_semantic_map.<ext>`` next to the input.

    ``.geojson``/``.json`` inputs (case-insensitive) map to ``.geojson``;
    every other input maps to ``.csv``.
    """
    extension_by_kind = {True: ".geojson", False: ".csv"}
    geo_input = input_path.suffix.lower() in {".geojson", ".json"}
    return input_path.with_name(
        f"{input_path.stem}_semantic_map{extension_by_kind[geo_input]}"
    )


def _ensure_output_path(raw_value: str, input_path: Path) -> Path:
    """Resolve the output path and force its extension to match the input type.

    Args:
        raw_value: Path typed by the user. Surrounding whitespace is ignored
            and a leading ``~`` is expanded; a blank value selects the
            default path derived from ``input_path``.
        input_path: Dataset being processed. ``.geojson``/``.json`` inputs
            require a ``.geojson`` output, anything else requires ``.csv``.

    Returns:
        The target path carrying the required suffix.
    """
    cleaned = (raw_value or "").strip()
    # Without stripping, a whitespace-only value would bypass the default and
    # produce a file whose name consists of spaces.
    target = Path(cleaned).expanduser() if cleaned else _default_output_path(input_path)
    suffix = ".geojson" if input_path.suffix.lower() in {".geojson", ".json"} else ".csv"
    if target.suffix.lower() != suffix:
        target = target.with_suffix(suffix)
    return target


def _extract_upload_payload():
    """Return ``(name, payload)`` for the first uploaded file, else ``(None, None)``.

    Two shapes of ``upload.value`` are understood: a dict keyed by file name,
    and a list/tuple of per-file dicts whose name is stored under ``"name"``
    or under ``"metadata" -> "name"``.
    """
    uploaded = upload.value
    if not isinstance(uploaded, (dict, list, tuple)) or not uploaded:
        return None, None
    if isinstance(uploaded, dict):
        # First (file name, payload) pair of the mapping.
        return next(iter(uploaded.items()))
    first = uploaded[0]
    file_name = first.get("name") or first.get("metadata", {}).get("name")
    return file_name, first


def _handle_upload(_):
    """Save the uploaded file and point the input field at the saved copy.

    The file is written to ``UPLOAD_TARGET`` with the upload's own extension
    (``.csv`` when the name has none). The outcome, including a failure to
    write, is reported through ``upload_status``.
    """
    upload_status.value = ""
    name, payload = _extract_upload_payload()
    if not name or not payload:
        upload_status.value = "Выберите файл для загрузки"
        return
    suffix = Path(name).suffix or ".csv"
    target = UPLOAD_TARGET.with_suffix(suffix)
    try:
        # The destination directory is not guaranteed to exist yet.
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(payload["content"])
    except OSError as exc:
        # This handler does not run inside an Output widget, so report the
        # failure where the user is already looking.
        upload_status.value = f"Не удалось сохранить файл: {exc}"
        return
    input_path_widget.value = str(target)
    upload_status.value = f"Файл сохранён: {target}"


def _save_plotly_figure(fig, name: str):
    """Export ``fig`` as ``<name>.html`` and ``<name>.png`` into the visuals directory.

    The directory comes from ``visual_output_dir`` and is created if missing.
    PNG export is best-effort: on any failure the error is displayed and the
    reported image path is ``None``. The resulting paths are displayed.
    """
    target_dir = Path(visual_output_dir.value).expanduser()
    target_dir.mkdir(parents=True, exist_ok=True)

    html_file = target_dir / f"{name}.html"
    fig.write_html(str(html_file))

    png_file = target_dir / f"{name}.png"
    saved_png = None
    try:
        fig.write_image(str(png_file))
    except Exception as exc:
        display(f"Не удалось сохранить PNG: {exc}")
    else:
        saved_png = str(png_file)
    display({"html": str(html_file), "image": saved_png})


def on_run(_):
    """Score the texts on two axes, save the scores and draw the semantic map.

    All settings are read from the widgets of this cell; everything displayed
    goes into ``output``. The per-text scores are appended to a copy of the
    input table as one column per axis and written to the output file.

    Raises:
        FileNotFoundError: The input file does not exist.
        ValueError: The text column is missing, or the number of axes is not
            exactly two.
    """
    output.clear_output()
    with output:
        input_path = Path(input_path_widget.value).expanduser()
        # Same up-front check as the other tools, so a wrong path yields a
        # readable message rather than a reader-specific error.
        if not input_path.exists():
            raise FileNotFoundError(f"Файл не найден: {input_path}")
        data = load_table(input_path)
        text_col = text_column_widget.value.strip() or "text"
        if text_col not in data.columns:
            raise ValueError(f"Колонка текста '{text_col}' не найдена")
        texts = data[text_col].fillna("").astype(str).tolist()
        axes = parse_axis_lines(axes_widget.value)
        if len(axes) != 2:
            raise ValueError("Для карты нужно ровно две оси.")
        progress.value = 0
        scores = score_texts_on_axes(
            texts, axes, model_name=model_widget.value.strip(), show_progress=True
        )
        progress.value = 1
        # The two score columns become the horizontal and vertical axes.
        axis_x, axis_y = scores.columns.tolist()
        display(scores.head())

        # Assign by position (.values) so the input table's index is irrelevant.
        result = data.copy()
        for column in scores.columns:
            result[column] = scores[column].values
        target = _ensure_output_path(output_path.value, input_path)
        # Allow output paths inside folders that do not exist yet.
        target.parent.mkdir(parents=True, exist_ok=True)
        output_path.value = str(target)
        if input_path.suffix.lower() in {".geojson", ".json"}:
            result.to_file(target, driver="GeoJSON")
        else:
            result.to_csv(target, index=False)
        display({"output": str(target)})

        fig = build_semantic_map(scores, texts, axis_x=axis_x, axis_y=axis_y)
        _save_plotly_figure(fig, f"{input_path.stem}_semantic_map")
        fig.show()


# Connect the buttons to their handlers.
upload_button.on_click(_handle_upload)
run_button.on_click(on_run)

# Render the whole form as a single vertical stack.
display(
    widgets.VBox([
        widgets.HBox([upload, upload_button]),
        upload_status,
        input_path_widget,
        text_column_widget,
        axes_widget,
        model_widget,
        output_path,
        visual_output_dir,
        progress,
        run_button,
        output,
    ])
)


### Облако слов

Строит облако слов по колонке `text` в CSV/GeoJSON.

**Как пользоваться:**
1. Укажите путь к CSV/GeoJSON или загрузите файл.
2. Проверьте колонку текста, задайте стоп‑слова (например, `russian` или список через запятую).
3. Нажмите **«Построить облако»** — отобразится облако слов, а изображение и таблица частот слов сохранятся в указанные файлы.


In [None]:
#@title Облако слов: генерация и визуализация
from pathlib import Path

import geopandas as gpd
import ipywidgets as widgets
import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display

import sys
sys.path.insert(0, "/content/digital_identity/parsers")
from wordcloud_generator import build_wordcloud

# Default input file, and the base path for uploaded files (the upload handler
# replaces the suffix with the uploaded file's extension).
DEFAULT_INPUT = Path("/content/digital_identity/parsed_texts.csv")
UPLOAD_TARGET = Path("/content/digital_identity/uploaded_wordcloud")

# --- Input selection -------------------------------------------------------
file_picker = widgets.Text(
    value=str(DEFAULT_INPUT),
    description="CSV/GeoJSON:",
    layout=widgets.Layout(width="80%"),
)
upload_widget = widgets.FileUpload(accept=".csv,.geojson,.json", multiple=False)
upload_button = widgets.Button(description="Загрузить файл")
upload_status = widgets.Label()

# Name of the column holding the texts; a blank value falls back to "text" at run time.
text_column = widgets.Text(
    value="text",
    description="Колонка текста:",
    layout=widgets.Layout(width="80%"),
)

# Passed to build_wordcloud as-is (raw string).
stop_words = widgets.Text(
    value="russian",
    description="Стоп-слова:",
    placeholder="russian, english, custom",
    layout=widgets.Layout(width="80%"),
)

# --- Rendering parameters ----------------------------------------------------
max_words = widgets.IntSlider(
    value=200,
    min=50,
    max=500,
    step=10,
    description="Макс. слов:",
    layout=widgets.Layout(width="70%"),
)

# Passed to build_wordcloud as width/height; also used (divided by 100) as the
# matplotlib figure size for the preview.
width = widgets.IntSlider(
    value=800,
    min=400,
    max=1200,
    step=50,
    description="Ширина:",
    layout=widgets.Layout(width="70%"),
)
height = widgets.IntSlider(
    value=400,
    min=200,
    max=800,
    step=50,
    description="Высота:",
    layout=widgets.Layout(width="70%"),
)

# A blank value falls back to "white" at run time.
background_color = widgets.Text(
    value="white",
    description="Фон:",
    layout=widgets.Layout(width="60%"),
)

# --- Outputs -----------------------------------------------------------------
# Word-frequency table path; its extension is adjusted to the input type at run time.
output_path = widgets.Text(
    value="/content/digital_identity/wordcloud_output.csv",
    description="Выход:",
    layout=widgets.Layout(width="80%"),
)
# Destination of the rendered word-cloud image.
image_output = widgets.Text(
    value="/content/digital_identity/wordcloud.png",
    description="Картинка:",
    layout=widgets.Layout(width="80%"),
)
# Two-state indicator: 0 before the cloud is built, 1 after.
progress = widgets.IntProgress(value=0, min=0, max=1, description="Прогресс:")
run_wordcloud = widgets.Button(description="Построить облако")
wordcloud_output = widgets.Output()


def _default_output_path(input_path: Path) -> Path:
    """Build the default output path ``<stem>_wordcloud.<ext>`` next to the input.

    ``.geojson``/``.json`` inputs (case-insensitive) map to ``.geojson``;
    every other input maps to ``.csv``.
    """
    if input_path.suffix.lower() in (".geojson", ".json"):
        extension = ".geojson"
    else:
        extension = ".csv"
    return input_path.with_name(input_path.stem + "_wordcloud" + extension)


def _ensure_output_path(raw_value: str, input_path: Path) -> Path:
    """Resolve the output path and force its extension to match the input type.

    Args:
        raw_value: Path typed by the user. Surrounding whitespace is ignored
            and a leading ``~`` is expanded; a blank value selects the
            default path derived from ``input_path``.
        input_path: Dataset being processed. ``.geojson``/``.json`` inputs
            require a ``.geojson`` output, anything else requires ``.csv``.

    Returns:
        The target path carrying the required suffix.
    """
    cleaned = (raw_value or "").strip()
    # Without stripping, a whitespace-only value would bypass the default and
    # produce a file whose name consists of spaces.
    target = Path(cleaned).expanduser() if cleaned else _default_output_path(input_path)
    suffix = ".geojson" if input_path.suffix.lower() in {".geojson", ".json"} else ".csv"
    if target.suffix.lower() != suffix:
        target = target.with_suffix(suffix)
    return target


def _load_dataset(path: Path):
    """Read the input file and report which format it had.

    Returns:
        ``(GeoDataFrame, "geojson")`` for ``.geojson``/``.json`` files, or
        ``(DataFrame, "csv")`` for ``.csv`` files (extension is
        case-insensitive).

    Raises:
        ValueError: The extension is not one of the supported ones.
    """
    extension = path.suffix.lower()
    if extension == ".csv":
        return pd.read_csv(path), "csv"
    if extension in (".geojson", ".json"):
        return gpd.read_file(path), "geojson"
    raise ValueError("Поддерживаются только CSV или GeoJSON файлы")


def _extract_upload_payload():
    """Return ``(name, payload)`` for the first uploaded file, else ``(None, None)``.

    Two shapes of ``upload_widget.value`` are understood: a dict keyed by
    file name, and a list/tuple of per-file dicts whose name is stored under
    ``"name"`` or under ``"metadata" -> "name"``.
    """
    uploaded = upload_widget.value
    if not isinstance(uploaded, (dict, list, tuple)) or not uploaded:
        return None, None
    if isinstance(uploaded, dict):
        # First (file name, payload) pair of the mapping.
        return next(iter(uploaded.items()))
    first = uploaded[0]
    file_name = first.get("name") or first.get("metadata", {}).get("name")
    return file_name, first


def _handle_upload(_):
    """Save the uploaded file and point the input field at the saved copy.

    The file is written to ``UPLOAD_TARGET`` with the upload's own extension
    (``.csv`` when the name has none). The outcome, including a failure to
    write, is reported through ``upload_status``.
    """
    upload_status.value = ""
    name, payload = _extract_upload_payload()
    if not name or not payload:
        upload_status.value = "Выберите файл для загрузки"
        return
    suffix = Path(name).suffix or ".csv"
    target = UPLOAD_TARGET.with_suffix(suffix)
    try:
        # The destination directory is not guaranteed to exist yet.
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(payload["content"])
    except OSError as exc:
        # This handler does not run inside an Output widget, so report the
        # failure where the user is already looking.
        upload_status.value = f"Не удалось сохранить файл: {exc}"
        return
    file_picker.value = str(target)
    upload_status.value = f"Файл сохранён: {target}"


def _run_wordcloud(_):
    """Build the word cloud, save the word-weight table and image, and show a preview.

    All settings are read from the widgets of this cell; everything displayed
    goes into ``wordcloud_output``.

    Raises:
        FileNotFoundError: The input file does not exist.
        ValueError: The text column is missing, the file type is unsupported,
            or the image path is blank.
    """
    wordcloud_output.clear_output()
    with wordcloud_output:
        input_path = Path(file_picker.value).expanduser()
        if not input_path.exists():
            raise FileNotFoundError(f"Файл не найден: {input_path}")
        # Validate the image destination before the cloud is built, so a
        # blank field does not fail only after all the work is done.
        raw_image_path = image_output.value.strip()
        if not raw_image_path:
            raise ValueError("Укажите путь для сохранения изображения")
        image_path = Path(raw_image_path).expanduser()

        data, data_type = _load_dataset(input_path)
        text_col = text_column.value.strip() or "text"
        if text_col not in data.columns:
            raise ValueError(f"Колонка текста '{text_col}' не найдена")
        texts = data[text_col].fillna("").astype(str).tolist()
        progress.value = 0
        result = build_wordcloud(
            texts,
            stop_words=stop_words.value,
            max_words=max_words.value,
            width=width.value,
            height=height.value,
            background_color=background_color.value.strip() or "white",
        )
        progress.value = 1

        # Explicit column names keep the frame well-formed (and sortable)
        # even when the word map is empty.
        freq_df = pd.DataFrame(
            list(result.wordcloud.words_.items()), columns=["word", "weight"]
        ).sort_values("weight", ascending=False)
        target = _ensure_output_path(output_path.value, input_path)
        # Allow output paths inside folders that do not exist yet.
        target.parent.mkdir(parents=True, exist_ok=True)
        output_path.value = str(target)
        if data_type == "geojson":
            # The table has no spatial component; rows get empty geometries
            # so it can still be written with the GeoJSON driver.
            gdf = gpd.GeoDataFrame(freq_df, geometry=[None] * len(freq_df), crs="EPSG:4326")
            gdf.to_file(target, driver="GeoJSON")
        else:
            freq_df.to_csv(target, index=False)
        display({"output": str(target)})

        image_path.parent.mkdir(parents=True, exist_ok=True)
        result.wordcloud.to_file(str(image_path))
        display({"image": str(image_path)})

        fig, ax = plt.subplots(figsize=(width.value / 100, height.value / 100))
        ax.imshow(result.wordcloud, interpolation="bilinear")
        ax.axis("off")
        display(fig)
        # Close the figure once it has been displayed.
        plt.close(fig)


# Connect the buttons to their handlers.
upload_button.on_click(_handle_upload)
run_wordcloud.on_click(_run_wordcloud)

# Render the form; each widget is displayed in the order listed.
display(
    file_picker,
    widgets.HBox([upload_widget, upload_button]),
    upload_status,
    text_column,
    stop_words,
    max_words,
    widgets.HBox([width, height]),
    background_color,
    output_path,
    image_output,
    progress,
    run_wordcloud,
    wordcloud_output,
)
