In [1]:
import pandas as pd

In [2]:
# Load the think-tank corpus from the local parquet file.
# NOTE(review): the file name suggests stop words were already removed upstream — confirm.
df = pd.read_parquet("01_bbdd_think_tanks_no_stopwords.parquet")

In [4]:
# -*- coding: utf-8 -*-
"""
Diagnóstico de longitud textual:
- Cuenta de palabras por fila en df["Texto"] (robusto a NaN y espacios).
- Resumen global (percentiles) y por 'Think Tank'.
"""

from __future__ import annotations

import re
from typing import Optional

import numpy as np
import pandas as pd


# --------------------------------------------------------------------------- #
# Utilitarios
# --------------------------------------------------------------------------- #

_WORD_RGX = re.compile(r"[A-Za-zÁÉÍÓÚáéíóúÑñÜü0-9]+", flags=re.UNICODE)


def _count_words(text: Optional[str]) -> int:
    """Cuenta 'palabras' con regex Unicode (letras con tildes y dígitos)."""
    if not isinstance(text, str):
        return 0
    text = text.strip()
    if not text:
        return 0
    return len(_WORD_RGX.findall(text))


# --------------------------------------------------------------------------- #
# Diagnóstico principal
# --------------------------------------------------------------------------- #

def diagnose_text_lengths(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute per-document text lengths and print summary diagnostics.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns "ID", "Think Tank" and "Texto".

    Returns
    -------
    pd.DataFrame
        Copy of the three required columns plus:
        - ``word_count``: tokens counted by ``_count_words``;
        - ``char_count``: raw character length of "Texto".
        Missing or non-string "Texto" values get 0 in both columns.

    Raises
    ------
    KeyError
        If any required column is absent.

    Side effects
    ------------
    Prints a global percentile summary, a per-"Think Tank" summary sorted by
    mean word count, and the five longest / shortest documents.
    """
    cols_required = {"ID", "Think Tank", "Texto"}
    missing = cols_required - set(df.columns)
    if missing:
        raise KeyError(f"Faltan columnas requeridas: {missing}")

    out = df[["ID", "Think Tank", "Texto"]].copy()
    out["word_count"] = out["Texto"].apply(_count_words).astype(int)
    # Measure only real strings. A blanket astype(str) would stringify NaN/None
    # to "nan"/"None" and report 3-4 characters for documents that have no text
    # at all, contradicting their word_count of 0.
    out["char_count"] = (
        out["Texto"]
        .map(lambda t: len(t) if isinstance(t, str) else 0)
        .astype(int)
    )

    # Global summary
    q = out["word_count"].quantile([0.5, 0.75, 0.9, 0.95, 0.99])
    print("\n[Resumen global word_count]")
    print(f"n={len(out)} | mean={out['word_count'].mean():.1f} | "
          f"std={out['word_count'].std(ddof=1):.1f} | "
          f"min={out['word_count'].min()} | "
          f"p50={q.loc[0.5]:.0f} | p75={q.loc[0.75]:.0f} | "
          f"p90={q.loc[0.9]:.0f} | p95={q.loc[0.95]:.0f} | "
          f"p99={q.loc[0.99]:.0f} | max={out['word_count'].max()}")

    # Per-Think-Tank summary (count, mean, median, p95 and max)
    grp = (
        out.groupby("Think Tank")["word_count"]
        .agg(n="count",
             mean="mean",
             p50=lambda s: s.quantile(0.5),
             p95=lambda s: s.quantile(0.95),
             max="max")
        .sort_values(by="mean", ascending=False)
    )
    # Round the float columns for readability
    grp_fmt = grp.copy()
    for c in ["mean", "p50", "p95"]:
        grp_fmt[c] = grp_fmt[c].round(1)

    print("\n[Resumen por Think Tank] (ordenado por media de palabras)")
    print(grp_fmt)

    # Extreme samples, handy for manual inspection
    print("\n[Ejemplos con textos más largos]")
    print(out.nlargest(5, "word_count")[["ID", "Think Tank", "word_count"]])

    print("\n[Ejemplos con textos más cortos]")
    print(out.nsmallest(5, "word_count")[["ID", "Think Tank", "word_count"]])

    return out


# --------------------------------------------------------------------------- #
# Uso
# --------------------------------------------------------------------------- #
# Ejecuta:
# lengths_df = diagnose_text_lengths(df)
# lengths_df.head()


In [5]:
# Run the length diagnostics on the loaded corpus (prints the summaries) and
# preview the first rows of the per-document table.
lengths_df = diagnose_text_lengths(df)
lengths_df.head()


[Resumen global word_count]
n=17432 | mean=533.9 | std=672.4 | min=0 | p50=384 | p75=641 | p90=1042 | p95=1440 | p99=4518 | max=6691

[Resumen por Think Tank] (ordenado por media de palabras)
                           n    mean     p50     p95   max
Think Tank                                                
CDC                      171  3248.4  3480.0  5300.0  6527
IES                       20  3206.2  4996.0  5223.8  5259
Idea País                  3  3053.7  4014.0  5031.9  5145
ICAL                      38  1086.8   911.5  3043.4  3190
Nodo XXI                 328   871.4   709.0  2105.0  5089
Instituto Res Pública    575   782.2   614.0  1697.2  4913
Casa Común               227   728.8   501.0  2392.6  5554
FPP                     2586   680.3   432.5  2037.0  5562
Fundación Sol            550   637.8   569.5  1688.3  5128
LyD                     2623   569.3   496.0  1317.8  6691
CEP                     2008   553.4   492.0  1305.5  5700
Horizonte Ciudadano      130   489.4   4

Unnamed: 0,ID,Think Tank,Texto,word_count,char_count
205,206,LyD,MERCADO LABORAL PERMANECE DÉBIL A PESAR DE LA ...,446,2689
206,207,LyD,ENCUESTA DE PERCEPCIÓN DEL EMPLEO LYD DICIEMBR...,324,1899
207,208,LyD,ESTUDIO LYD: HOSPITALES ESTATALES PODRÍAN AUME...,695,4332
208,209,LyD,VALIOSOS INSUMOS TÉCNICOS PARA RESOLVER LA CRI...,471,2967
209,210,LyD,INFLACIÓN ANUAL CONTINÚA A LA BAJA. El Institu...,333,1959


In [3]:
# Show column dtypes and non-null counts of the loaded corpus.
# NOTE(review): this cell's execution count (3) is out of order with its position — re-check with Restart & Run All.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17432 entries, 205 to 31640
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   ID                    17432 non-null  int64         
 1   Think Tank            17432 non-null  object        
 2   Tipo de Think Tank    17432 non-null  object        
 3   Orientación Política  17432 non-null  object        
 4   Autor                 8390 non-null   object        
 5   Título                17425 non-null  object        
 6   Medio                 2498 non-null   object        
 7   Corpus                16690 non-null  object        
 8   Producto              10353 non-null  object        
 9   Enlace                17432 non-null  object        
 10  CorpusPDF             237 non-null    object        
 11  FechaPublicacion      17432 non-null  datetime64[ns]
 12  Año                   17432 non-null  float64       
 13  Mes                

In [3]:
# -*- coding: utf-8 -*-
"""
Pipeline CPU (completo) con cobertura 100% y App Dash.

Requiere columnas en df:
['ID', 'Think Tank', 'Texto', 'FechaPublicacion', 'Enlace']

Hace:
- SBERT (paraphrase-multilingual-mpnet-base-v2)
- BERTopic (macro-tópicos)
- Post-proceso de ruido: micro-tópicos (kNN + umbral coseno) y singletons
- Keywords por cluster (macro: BERTopic; micro/singleton: TF-IDF local)
- UMAP 2D/3D
- App Dash (toggle color ThinkTank/Tópico; histograma mensual por ThinkTank; tabla con enlace clickeable)
"""

from __future__ import annotations
import os, re, json, sys, math, hashlib, colorsys, platform
from dataclasses import dataclass
from typing import Dict, List, Tuple, Optional, Sequence
from datetime import datetime
from pathlib import Path
from collections import defaultdict, Counter

import numpy as np
import pandas as pd

import umap
import hdbscan
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize as sk_normalize
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from bertopic import BERTopic


# =========================
# Config
# =========================
@dataclass
class Config:
    """Settings for the SBERT -> BERTopic -> micro-topics -> Dash-app pipeline.

    ``run_id``, ``run_dir``, ``out_dir_topics`` and ``out_dir_map`` start empty
    and are filled in by ``start_run_dirs``. A module-level instance ``CFG`` is
    created right after the class; ``_log`` reads ``CFG.verbose`` and
    ``run_pipeline_on_df`` uses ``CFG`` as its default configuration.
    """
    # column names expected in the input DataFrame
    id_col: str = "ID"
    cat_col: str = "Think Tank"
    text_col: str = "Texto"
    date_col: str = "FechaPublicacion"
    url_col: str = "Enlace"

    # optional date filter (set before calling run_pipeline_on_df)
    date_start: Optional[pd.Timestamp] = None  # inclusive lower bound
    date_end: Optional[pd.Timestamp] = None  # exclusive upper bound

    # output paths
    out_dir_root: str = "./NLP"
    run_id: Optional[str] = None
    run_dir: Optional[str] = None
    out_dir_topics: str = ""
    out_dir_map: str = ""

    # SBERT
    sbert_model_name: str = "sentence-transformers/paraphrase-multilingual-mpnet-base-v2"
    sbert_batch_size: int = 32
    # no cache: embeddings are always rebuilt
    # NOTE(review): no function visible in this file reads force_reembed — confirm it is still needed
    force_reembed: bool = True

    # light text cleaning: tokens shorter than this are dropped by scrub_text
    min_token_len: int = 3

    # UMAP grid used for topic modelling (every neighbors x min_dist pair is tried)
    umap_neighbors_grid: Tuple[int, ...] = (10, 15, 20)
    umap_min_dist_grid: Tuple[float, ...] = (0.3, 0.5, 0.7)
    umap_n_components: int = 10
    umap_metric: str = "cosine"
    random_state: int = 42

    # HDBSCAN (macro topics)
    hdbscan_min_cluster_size: int = 8     # lower -> more clusters
    hdbscan_min_samples: int = 5          # author's note: separates islands better
    hdbscan_metric: str = "euclidean"

    # micro-topics built from documents labelled as noise
    micro_knn_k: int = 10                 # neighbours requested per outlier
    micro_cos_threshold: float = 0.65     # minimum cosine similarity to join outliers
    accept_singletons: bool = True        # make a one-document cluster when no neighbour qualifies

    # keywords
    topic_kw_fixed: int = 4               # keywords shown per topic in the app
    tfidf_max_features_micro: int = 1000  # TF-IDF vocabulary cap for micro/singleton clusters

    # app marker sizes
    app_marker_3d_size: int = 3
    app_marker_2d_size: int = 5

    # CountVectorizer limits for BERTopic (controls extremes)
    vec_min_df_abs: int = 2               # term must appear in at least 2 documents
    vec_min_df_frac: float = 0.002        # ... or in 0.2% of the corpus, whichever is larger
    vec_max_df: float = 0.75              # ignore terms present in more than 75% of documents

    verbose: bool = True


# Shared default configuration; start_run_dirs mutates it in place when it is passed in.
CFG = Config()


# =========================
# Utils
# =========================
def _log(msg: str):
    """Print ``msg`` prefixed with the current HH:MM:SS time, unless ``CFG.verbose`` is false."""
    if not CFG.verbose:
        return
    stamp = datetime.now().strftime("%H:%M:%S")
    print(f"[{stamp}] {msg}")

def start_run_dirs(cfg: Config) -> None:
    """Create a fresh timestamped run directory tree and record its paths on ``cfg``.

    Layout: ``<out_dir_root>/runs/<YYYYmmdd_HHMMSS>/`` containing the
    ``bertopic_outputs`` and ``bertopic_mapping_outputs`` sub-folders.
    Sets ``cfg.run_id``, ``cfg.run_dir``, ``cfg.out_dir_topics`` and
    ``cfg.out_dir_map`` in place.
    """
    root = Path(cfg.out_dir_root)
    root.mkdir(parents=True, exist_ok=True)
    runs_root = root / "runs"
    runs_root.mkdir(exist_ok=True, parents=True)

    cfg.run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
    run_path = runs_root / cfg.run_id
    cfg.run_dir = str(run_path)
    cfg.out_dir_topics = str(run_path / "bertopic_outputs")
    cfg.out_dir_map = str(run_path / "bertopic_mapping_outputs")

    for sub_dir in (cfg.out_dir_topics, cfg.out_dir_map):
        Path(sub_dir).mkdir(parents=True, exist_ok=True)
    _log(f"[RUN] id={cfg.run_id} -> {cfg.run_dir}")

def scrub_text(s: str, min_len: int = 3) -> str:
    """Lightly normalise a document before embedding / vectorising.

    Steps: collapse whitespace runs, blank out standalone numbers, blank out
    every character that is not a word character, whitespace, a listed
    accented letter or a hyphen, then keep only tokens with at least
    ``min_len`` characters. Non-string or blank input returns "".
    """
    if not isinstance(s, str):
        return ""
    if not s.strip():
        return ""
    cleaned = re.sub(r"\s+", " ", s)
    cleaned = re.sub(r"\b\d+\b", " ", cleaned)
    cleaned = re.sub(r"[^\w\sÁÉÍÓÚáéíóúÑñÜü-]", " ", cleaned, flags=re.UNICODE)
    kept = (token for token in cleaned.split() if len(token) >= min_len)
    return " ".join(kept).strip()

def cosine_matrix(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    """Return the pairwise cosine similarities between the rows of ``a`` and the rows of ``b``.

    Both inputs are normalised with ``sk_normalize`` before the dot product and
    the result is clipped to [-1, 1] to absorb floating-point overshoot.
    """
    unit_a, unit_b = sk_normalize(a), sk_normalize(b)
    similarities = unit_a @ unit_b.T
    return np.clip(similarities, -1.0, 1.0)

def month_str(dt: pd.Timestamp) -> str:
    """Format ``dt`` as "YYYY-MM"; missing values (NaT/None/NaN) become "NA"."""
    return "NA" if pd.isna(dt) else "{:04d}-{:02d}".format(dt.year, dt.month)


# =========================
# Embeddings (reconstruir siempre)
# =========================
def compute_embeddings(df_local: pd.DataFrame, cfg: Config) -> np.ndarray:
    """Encode every document of ``df_local[cfg.text_col]`` with the configured SBERT model.

    Texts are cleaned with ``scrub_text`` first. The model is loaded on every
    call and nothing is cached.

    Parameters
    ----------
    df_local : pd.DataFrame
        Frame holding the text column named by ``cfg.text_col``.
    cfg : Config
        Supplies the model name, batch size and minimum token length.

    Returns
    -------
    np.ndarray
        float32 matrix with one row per document, re-normalised with
        ``sk_normalize`` after the float32 cast.
    """
    _log("Calculando embeddings SBERT…")
    model = SentenceTransformer(cfg.sbert_model_name)

    # Missing values must become "" rather than being stringified: str(NaN) is
    # the 3-letter token "nan", which survives scrub_text's length filter and
    # would be embedded as if it were real content.
    column = df_local[cfg.text_col]
    texts = [
        scrub_text(str(raw), cfg.min_token_len) if present else ""
        for raw, present in zip(column.tolist(), column.notna().tolist())
    ]

    emb = model.encode(
        texts, batch_size=cfg.sbert_batch_size,
        show_progress_bar=True, convert_to_numpy=True, normalize_embeddings=True
    )
    return sk_normalize(emb.astype(np.float32))


# =========================
# BERTopic (macro)
# =========================
def make_topic_model(n_docs: int, cfg: Config, n_neighbors: int, min_dist: float) -> BERTopic:
    """Assemble an unfitted BERTopic model for one (n_neighbors, min_dist) grid point.

    The model is built with ``embedding_model=None`` (callers pass precomputed
    embeddings to ``fit_transform``) and wires together a UMAP reducer, an
    HDBSCAN clusterer and a 1-2-gram CountVectorizer whose ``min_df`` scales
    with ``n_docs``.
    """
    umap_model = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=cfg.umap_n_components,
        metric=cfg.umap_metric,
        random_state=cfg.random_state,
        verbose=True,
    )

    # Document-frequency floor: the larger of the absolute minimum and the
    # corpus fraction (the fraction term is never allowed below 1).
    frac_floor = int(max(1, n_docs * cfg.vec_min_df_frac))
    doc_freq_floor = max(cfg.vec_min_df_abs, frac_floor)
    vectorizer = CountVectorizer(
        lowercase=True,
        ngram_range=(1,2),
        min_df=doc_freq_floor,
        max_df=cfg.vec_max_df,
        token_pattern=r"(?u)[A-Za-zÁÉÍÓÚáéíóúÑñÜü]{3,}",
    )

    hdbscan_model = hdbscan.HDBSCAN(
        min_cluster_size=cfg.hdbscan_min_cluster_size,
        min_samples=cfg.hdbscan_min_samples,
        metric=cfg.hdbscan_metric,
        prediction_data=True,
        cluster_selection_method="eom",
    )

    return BERTopic(
        embedding_model=None,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        language="multilingual",
        calculate_probabilities=True,
        vectorizer_model=vectorizer,
        top_n_words=max(cfg.topic_kw_fixed, 10),
        verbose=True,
    )

def fixed_keywords_macro(topic_model: BERTopic, k: int) -> Dict[int, List[str]]:
    """Map every non-noise topic id of ``topic_model`` to its first ``k`` terms.

    Topic ids come from the "Topic" column of ``get_topic_info()``; id -1 is
    skipped. A topic for which ``get_topic`` returns a falsy value maps to [].
    """
    topic_ids = [int(raw) for raw in topic_model.get_topic_info()["Topic"].tolist()]
    return {
        tid: [term for term, _score in (topic_model.get_topic(tid) or [])][:k]
        for tid in topic_ids
        if tid != -1
    }


# =========================
# Micro-tópicos + singletons (sobre ruido)
# =========================
def build_micro_clusters(outlier_idx: np.ndarray, emb: np.ndarray, cfg: Config) -> Dict[int, List[int]]:
    """
    Group noise documents into small, disjoint clusters by cosine proximity.

    Each not-yet-assigned outlier is visited in order and joined with those of
    its k nearest neighbours (among the outliers only) that are still
    unassigned and whose cosine similarity is at least
    ``cfg.micro_cos_threshold``. An outlier with no qualifying neighbour
    becomes a one-document cluster when ``cfg.accept_singletons`` is true;
    otherwise it is left out of the result.

    Parameters
    ----------
    outlier_idx : np.ndarray
        Row positions (into ``emb``) of the documents to cluster.
    emb : np.ndarray
        Embedding matrix, one row per document.
    cfg : Config
        Supplies ``micro_knn_k``, ``micro_cos_threshold`` and ``accept_singletons``.

    Returns
    -------
    Dict[int, List[int]]
        ``{internal_cluster_id: [row positions in emb]}``. Every row appears in
        at most one cluster.
    """
    outlier_idx = np.asarray(outlier_idx, dtype=int)
    n_out = int(outlier_idx.size)
    if n_out == 0:
        return {}
    X = emb[outlier_idx]

    # Cosine-distance kNN. Never request more neighbours than there are
    # samples: with a single outlier the previous `max(2, n)` asked for 2
    # neighbours out of 1 sample, which makes kneighbors raise.
    k = max(1, min(cfg.micro_knn_k, n_out))
    nn = NearestNeighbors(n_neighbors=k, metric="cosine")
    nn.fit(X)
    dists, neigh = nn.kneighbors(X, return_distance=True)
    sims = 1.0 - dists  # cosine distance -> cosine similarity

    used = np.zeros(n_out, dtype=bool)
    clusters: Dict[int, List[int]] = {}
    cid = 0
    for i in range(n_out):
        if used[i]:
            continue
        mates = [i]
        for j, s in zip(neigh[i], sims[i]):
            j = int(j)
            # Skip self explicitly (it is not guaranteed to be the first
            # neighbour when duplicate embeddings exist) and skip rows already
            # claimed by an earlier cluster so clusters stay disjoint.
            if j == i or used[j]:
                continue
            if s >= cfg.micro_cos_threshold:
                mates.append(j)
        if len(mates) >= 2 or cfg.accept_singletons:
            used[mates] = True
            clusters[cid] = [int(outlier_idx[m]) for m in mates]
            cid += 1
        # else: the row stays unclustered (caller keeps it labelled as -1)
    return clusters

def tfidf_keywords_for_docs(texts: List[str], k: int, max_features: int=1000) -> List[str]:
    """Return up to ``k`` terms with the highest summed TF-IDF weight over ``texts``.

    A fresh 1-2-gram ``TfidfVectorizer`` is fitted on ``texts`` alone (local
    vocabulary, capped at ``max_features``).

    Returns [] when ``texts`` is empty or when no document yields a token
    (for example blank documents), instead of letting the vectorizer's
    "empty vocabulary" ``ValueError`` abort the caller.
    """
    if len(texts) == 0:
        return []
    vec = TfidfVectorizer(
        lowercase=True, ngram_range=(1,2), max_features=max_features,
        token_pattern=r"(?u)[A-Za-zÁÉÍÓÚáéíóúÑñÜü]{3,}"
    )
    try:
        X = vec.fit_transform(texts)
    except ValueError:
        # Raised by scikit-learn when the fitted vocabulary is empty.
        return []
    # Column sums rank the terms; for a single document this is that
    # document's own TF-IDF row, so no special case is needed.
    scores = np.asarray(X.sum(axis=0)).ravel()
    order = scores.argsort()[::-1]
    vocab = np.array(vec.get_feature_names_out())
    return vocab[order][:k].tolist()


# =========================
# App writer
# =========================
def write_dash_app(df_vis: pd.DataFrame, kw_by_topic: Dict[int, List[str]], run_dir: str, kw_fixed: int,
                   size2d: int, size3d: int) -> None:
    """Write the data files and a standalone Dash script for exploring the topics.

    Files written:
    - ``<run_dir>/bertopic_outputs/df_vis.csv`` (dates floored to the day and
      formatted as "YYYY-MM-DD HH:MM:SS");
    - ``<run_dir>/bertopic_outputs/keywords_by_topic.json``;
    - ``<run_dir>/bertopic_outputs/topic_colors.json`` (one HLS colour per topic id);
    - ``<run_dir>/dash_app_density.py`` (the generated app).

    Parameters
    ----------
    df_vis : pd.DataFrame
        Must provide the columns the generated app reads: x, y, x3, y3, z3,
        topic_id, topic_kw, ThinkTank, FechaPublicacion, ID and Enlace.
    kw_by_topic : Dict[int, List[str]]
        Keywords per topic id.
    run_dir : str
        Run directory that receives the outputs.
    kw_fixed : int
        Number of keywords shown per topic in the app.
    size2d, size3d : int
        Marker sizes for the 2D and 3D scatter plots.

    Notes
    -----
    The app source is produced from an f-string: ``{expr}`` is evaluated here,
    while ``{{`` / ``}}`` emit literal braces into the generated script.
    """
    out_dir = Path(run_dir) / "bertopic_outputs"
    out_dir.mkdir(parents=True, exist_ok=True)

    # Data: dates normalised to full "YYYY-MM-DD 00:00:00" strings
    data_csv = out_dir / "df_vis.csv"
    df_tmp = df_vis.copy()
    df_tmp["FechaPublicacion"] = pd.to_datetime(df_tmp["FechaPublicacion"], errors="coerce").dt.floor("D")
    df_tmp["FechaPublicacion"] = df_tmp["FechaPublicacion"].dt.strftime("%Y-%m-%d %H:%M:%S")
    df_tmp.to_csv(data_csv, index=False)

    # Keywords per topic
    kw_json = {int(k): list(v) for k,v in kw_by_topic.items()}
    kw_path = out_dir / "keywords_by_topic.json"
    with open(kw_path, "w", encoding="utf-8") as f:
        json.dump(kw_json, f, ensure_ascii=False, indent=2)

    # Colours per topic (simple HLS palette, hues evenly spaced)
    topic_ids_all = sorted({int(t) for t in df_vis["topic_id"].unique().tolist()})
    if len(topic_ids_all) == 0:
        topic_ids_all = [0]
    cols = []
    n = max(1, len(topic_ids_all))
    for i in range(n):
        h = (i / float(n)) % 1.0
        r,g,b = colorsys.hls_to_rgb(h, 0.52, 0.65)
        cols.append("#{0:02x}{1:02x}{2:02x}".format(int(r*255), int(g*255), int(b*255)))
    cmap = {str(t): c for t,c in zip(topic_ids_all, cols)}
    cmap_path = out_dir / "topic_colors.json"
    with open(cmap_path, "w", encoding="utf-8") as f:
        json.dump(cmap, f, ensure_ascii=False, indent=2)

    # All keywords, embedded in the app for the search dropdown
    all_kw = sorted({w for ks in kw_by_topic.values() for w in ks})
    all_kw_json = json.dumps(all_kw, ensure_ascii=False)

    # App source. Fixes relative to the previous template:
    # - the three paths and KW_FIXED/SIZE2D/SIZE3D are now really interpolated
    #   (they used doubled braces, so the generated script contained the raw
    #   expressions and failed at start-up); paths are resolved to absolute;
    # - the banner label is emitted as an f-string so "{t}" is substituted;
    # - hover data travels through px `custom_data`, so each colour trace gets
    #   only its own rows instead of the full array;
    # - the bare `except:` is narrowed.
    app_path = Path(run_dir) / "dash_app_density.py"
    app_code = f'''# -*- coding: utf-8 -*-
"""
App Dash — Entradas:
- df_vis.csv
- keywords_by_topic.json
- topic_colors.json
"""
import json
from pathlib import Path
import numpy as np
import pandas as pd
import dash
from dash import Dash, dcc, html, dash_table, Input, Output, State
import plotly.express as px
import plotly.graph_objects as go

# Rutas (absolutas)
DATA_CSV = Path(r"{data_csv.resolve().as_posix()}")
KW_JSON  = Path(r"{kw_path.resolve().as_posix()}")
CMAP_JSON= Path(r"{cmap_path.resolve().as_posix()}")

df = pd.read_csv(DATA_CSV)
with open(KW_JSON, "r", encoding="utf-8") as f:
    kw_by_topic = json.load(f)
with open(CMAP_JSON, "r", encoding="utf-8") as f:
    topic_cmap = json.load(f)

# Parseo robusto de fecha
df["FechaPublicacion"] = pd.to_datetime(df["FechaPublicacion"], errors="coerce")
df["topic_str"] = df["topic_id"].astype(str)

# Opciones UI
tt_options = sorted(df["ThinkTank"].dropna().unique().tolist())
topic_ids = sorted([int(t) for t in df["topic_id"].dropna().unique().tolist()])
topic_options = [{{"label": f"Topic {{t}}", "value": t}} for t in topic_ids]
ALL_KW = json.loads({json.dumps(all_kw_json)})
kw_options = [{{"label": w, "value": w}} for w in ALL_KW]

# Día índice para slider
tmin = df["FechaPublicacion"].min()
tmax = df["FechaPublicacion"].max()
df["day_idx"] = (df["FechaPublicacion"] - tmin).dt.days.astype("Int64")
min_idx, max_idx = int(df["day_idx"].min()), int(df["day_idx"].max())

def day_to_date(i):
    return (tmin + pd.Timedelta(days=int(i))).date()

def label_marks():
    span = max_idx - min_idx
    step = max(1, span // 8) if span>0 else 1
    marks = {{}}
    for i in range(min_idx, max_idx+1, step):
        marks[i] = dict(label=str(day_to_date(i)))
    marks[min_idx] = dict(label=str(day_to_date(min_idx)))
    marks[max_idx] = dict(label=str(day_to_date(max_idx)))
    return marks

def apply_filters(dff, tt_sel, topic_sel, t_range):
    a,b = t_range
    dff = dff[(dff["day_idx"] >= a) & (dff["day_idx"] <= b)]
    if tt_sel: dff = dff[dff["ThinkTank"].isin(tt_sel)]
    if topic_sel: dff = dff[dff["topic_id"].isin(topic_sel)]
    return dff

KW_FIXED = {int(kw_fixed)}   # fijo UI
SIZE2D   = {int(size2d)}
SIZE3D   = {int(size3d)}

# Columns exposed to the hover templates, in customdata order
HOVER_COLS = ["topic_str", "topic_kw", "ThinkTank", "FechaPublicacion", "ID"]

def banner_keywords(topic_sel):
    if not topic_sel:
        return "Selecciona uno o más tópicos para ver sus palabras clave."
    parts=[]
    for t in sorted(set(topic_sel)):
        ks = kw_by_topic.get(str(int(t))) or kw_by_topic.get(int(t)) or []
        show = ks[:KW_FIXED]
        parts.append(f"Topic {{t}}: " + ", ".join(show))
    return " | ".join(parts)

def topics_matching_keyword(word):
    if not word:
        return []
    q = str(word).strip().lower()
    hits = []
    for k,v in kw_by_topic.items():
        words = [w.lower() for w in v]
        if any(q == w for w in words):
            try:
                hits.append(int(k))
            except (TypeError, ValueError):
                pass
    return sorted(set(hits))

app = Dash(__name__)
app.title = "UMAP — Explorador de tópicos"

app.layout = html.Div([
    html.Div([
        html.Div([
            html.Label("Color:"),
            dcc.RadioItems(
                id="color-mode",
                options=[{{"label":"Think Tank","value":"tt"}}, {{"label":"Tópico","value":"topic"}}],
                value="tt", inline=True
            )
        ], style={{"display":"inline-block","marginRight":"16px"}}),
        html.Div([
            dcc.Dropdown(options=[{{"label": t, "value": t}} for t in tt_options],
                         id="tt-dd", value=[], multi=True, placeholder="Think Tanks (multi)")
        ], style={{"display":"inline-block","width":"30%","marginRight":"8px"}}),
        html.Div([
            dcc.Dropdown(options=topic_options, id="topic-dd", value=[], multi=True, placeholder="Tópicos (multi)")
        ], style={{"display":"inline-block","width":"30%","marginRight":"8px"}}),
        html.Div([
            dcc.Dropdown(id="kw-search-dd", options=kw_options, value=None, multi=False, searchable=True, placeholder="Buscar palabra de tópico")
        ], style={{"display":"inline-block","width":"35%"}})
    ], style={{"marginBottom":"8px"}}),

    html.Div(id="kw-banner",
             style={{"margin":"6px 0 10px 0","padding":"6px 10px",
                    "background":"#f6f8fa","border":"1px solid #e5e7eb",
                    "borderRadius":"8px","fontFamily":"Inter,system-ui,sans-serif"}}),

    html.Div([
        dcc.Graph(id="umap3d-fig", style={{"height":"48vh"}}),
        dcc.Graph(id="scatter-fig", style={{"height":"48vh"}})
    ], style={{"display":"grid","gridTemplateColumns":"1fr 1fr","gap":"8px"}}),

    html.Div([
        dcc.Graph(id="series-tt-fig", style={{"height":"34vh"}})
    ], style={{"marginTop":"4px"}}),

    html.Div([
        html.Div([
            html.Button("Todo el periodo", id="btn-all", n_clicks=0, style={{"marginRight":"8px"}}),
            html.Button("Estallido → Fin Convención", id="btn-period1", n_clicks=0, style={{"marginRight":"8px"}}),
            html.Button("Proc. Const. → Plebiscito Salida", id="btn-period2", n_clicks=0),
        ], style={{"textAlign":"center","marginBottom":"6px"}}),
        html.Label("Rango temporal", style={{"display":"block","textAlign":"center","marginBottom":"6px"}}),
        dcc.RangeSlider(id="time-rs", min=min_idx, max=max_idx,
                        step=1, value=[min_idx, max_idx],
                        marks=label_marks(), allowCross=False, pushable=1, updatemode="mouseup")
    ], style={{"maxWidth":"1100px","margin":"12px auto"}}),

    html.H4("Documentos filtrados"),
    dash_table.DataTable(
        id="result-table",
        columns=[
            {{"name":"topic_id","id":"topic_id"}},
            {{"name":"keywords","id":"keywords"}},
            {{"name":"ThinkTank","id":"ThinkTank"}},
            {{"name":"FechaPublicacion","id":"FechaPublicacion"}},
            {{"name":"Enlace","id":"Enlace_markdown","presentation":"markdown"}}
        ],
        page_size=15, sort_action="native", filter_action="native",
        style_table={{"overflowX":"auto"}},
        style_cell={{"fontFamily":"Inter, system-ui, sans-serif","fontSize":"13px","padding":"6px"}},
        style_header={{"fontWeight":"700"}}
    )
], style={{"padding":"10px"}})

# Botones de rango rápido
@app.callback(
    Output("time-rs","value"),
    Input("btn-all","n_clicks"),
    Input("btn-period1","n_clicks"),
    Input("btn-period2","n_clicks"),
    State("time-rs","value"),
    prevent_initial_call=True
)
def quick_ranges(n_all, n_p1, n_p2, cur):
    ctx = dash.callback_context
    if not ctx.triggered: return cur
    trg = ctx.triggered[0]["prop_id"].split(".")[0]
    if trg == "btn-all":
        return [min_idx, max_idx]
    elif trg == "btn-period1":
        a = int((pd.Timestamp("2019-10-18") - tmin).days)
        b = int((pd.Timestamp("2022-09-04") - tmin).days)
        return [max(min_idx,a), min(max_idx,b)]
    elif trg == "btn-period2":
        a = int((pd.Timestamp("2022-09-05") - tmin).days)
        b = int((pd.Timestamp("2023-12-17") - tmin).days)
        return [max(min_idx,a), min(max_idx,b)]
    return cur

# Autocompletar: setea tópicos que contienen la palabra
@app.callback(
    Output("topic-dd","value"),
    Input("kw-search-dd","value"),
    State("topic-dd","value"),
    prevent_initial_call=True
)
def on_kw_select(word, cur_sel):
    hits = topics_matching_keyword(word)
    if hits:
        return hits
    return cur_sel or []

@app.callback(
    Output("umap3d-fig","figure"),
    Output("scatter-fig","figure"),
    Output("series-tt-fig","figure"),
    Output("result-table","data"),
    Output("kw-banner","children"),
    Input("color-mode","value"),
    Input("tt-dd","value"), Input("topic-dd","value"), Input("time-rs","value")
)
def update_all(color_mode, tt_sel, topic_sel, t_range):
    tt_sel = tt_sel or []
    topic_sel = topic_sel or []
    dff = apply_filters(df.copy(), tt_sel, topic_sel, t_range)
    total_docs = len(dff)

    # Color por ThinkTank o por Tópico
    if color_mode == "tt":
        color_col = "ThinkTank"
        cmap = None
    else:
        color_col = "topic_str"
        cmap = topic_cmap

    dff["topic_str"] = dff["topic_id"].astype(str)

    # Hover: custom_data is split per trace by plotly express, so every point
    # keeps its own row even when the colour column creates several traces.
    hover3d = ("<b>%{{customdata[4]}}</b><br>"
               "<b>Topic</b>: %{{customdata[0]}} — %{{customdata[1]}}<br>"
               "<b>Think Tank</b>: %{{customdata[2]}}<br>"
               "<b>Fecha</b>: %{{customdata[3]|%Y-%m-%d}}")

    fig3d = px.scatter_3d(
        dff, x="x3", y="y3", z="z3", color=color_col,
        color_discrete_map=cmap if cmap else None,
        custom_data=HOVER_COLS,
        title=f"UMAP 3D — documentos filtrados (N={{total_docs}})"
    )
    fig3d.update_traces(marker=dict(size=SIZE3D, opacity=0.85), hovertemplate=hover3d)

    hover2d = ("<b>%{{customdata[4]}}</b><br>"
               "<b>Topic</b>: %{{customdata[0]}} — %{{customdata[1]}}<br>"
               "<b>Think Tank</b>: %{{customdata[2]}}<br>"
               "<b>Fecha</b>: %{{customdata[3]|%Y-%m-%d}}")
    fig2d = px.scatter(
        dff, x="x", y="y", color=color_col,
        color_discrete_map=cmap if cmap else None,
        custom_data=HOVER_COLS,
        title="UMAP 2D"
    )
    fig2d.update_traces(marker=dict(size=SIZE2D, opacity=0.85), hovertemplate=hover2d)
    fig2d.update_layout(legend=dict(itemsizing="constant"))

    # Serie mensual por ThinkTank
    if dff.empty:
        fig_series = go.Figure(); fig_series.update_layout(title="Documentos por mes (sin datos)")
    else:
        dff["period"] = dff["FechaPublicacion"].dt.to_period("M").astype(str)
        grp = dff.groupby(["period","ThinkTank"]).size().reset_index(name="n")
        fig_series = px.bar(
            grp, x="period", y="n", color="ThinkTank",
            title=f"Documentos por mes — total N={{total_docs}}",
            barmode="stack"
        )
        fig_series.update_layout(xaxis={{"categoryorder":"category ascending"}})

    # Tabla
    def kw_for(tid):
        ks = kw_by_topic.get(str(int(tid))) or kw_by_topic.get(int(tid)) or []
        return ", ".join(ks[:KW_FIXED])
    table_df = dff[["topic_id","ThinkTank","FechaPublicacion","Enlace","ID"]].copy()
    table_df["keywords"] = dff["topic_id"].apply(kw_for)
    table_df["FechaPublicacion"] = table_df["FechaPublicacion"].dt.strftime("%Y-%m-%d")
    # enlace clickeable en markdown
    def mk(url, idx):
        u = str(url) if pd.notna(url) else ""
        if u.startswith("http"):
            return f"[abrir]({{u}})"
        return ""
    table_df["Enlace_markdown"] = [mk(u, i) for i,u in enumerate(table_df["Enlace"].tolist())]
    table_df = table_df[["topic_id","keywords","ThinkTank","FechaPublicacion","Enlace_markdown"]]
    table_data = table_df.to_dict("records")

    banner = banner_keywords(topic_sel)
    return fig3d, fig2d, fig_series, table_data, banner

if __name__ == "__main__":
    app.run(debug=False)
'''
    with open(app_path, "w", encoding="utf-8") as f:
        f.write(app_code)
    _log(f"App escrita en: {app_path}")


# =========================
# Orquestación principal
# =========================
def run_pipeline_on_df(df: pd.DataFrame, cfg: Config = CFG) -> Dict[str, object]:
    """Run the full pipeline on ``df`` and write all artefacts of a new run.

    Stages: validate columns -> create run directories -> optional date
    filter -> SBERT embeddings -> BERTopic over a UMAP grid (best = most
    clusters, ties broken by least noise) -> micro-topics / singletons for
    noise documents -> keyword tables -> 2D/3D UMAP -> CSV exports and Dash app.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain the columns named by ``cfg.id_col``, ``cfg.cat_col``,
        ``cfg.text_col``, ``cfg.date_col`` and ``cfg.url_col``.
    cfg : Config
        Pipeline settings. It is mutated in place by ``start_run_dirs``
        (run id and output paths).

    Returns
    -------
    Dict[str, object]
        Keys: "run_dir", "embeddings", "topic_model_macro", "topics_final",
        "keywords_all", "df_vis".

    Raises
    ------
    KeyError
        If a required column is missing (checked before any directory is created).
    ValueError
        If no documents remain after the date filter, or if the UMAP grid is empty.
    """
    # Validate required columns first so a bad input does not leave an empty run folder
    need = [cfg.id_col, cfg.cat_col, cfg.text_col, cfg.date_col, cfg.url_col]
    if not set(need).issubset(df.columns):
        missing = set(need) - set(df.columns)
        raise KeyError(f"Faltan columnas: {missing}")

    start_run_dirs(cfg)

    # Working copy + dtypes
    df_local = df[need].copy()
    # Parse dates unless the column is already datetime (tz-aware included;
    # np.issubdtype raises on pandas' tz-aware dtype)
    if not pd.api.types.is_datetime64_any_dtype(df_local[cfg.date_col]):
        df_local[cfg.date_col] = pd.to_datetime(df_local[cfg.date_col], errors="coerce")

    # Date filter (only when configured); start inclusive, end exclusive
    if cfg.date_start is not None or cfg.date_end is not None:
        m = pd.Series(True, index=df_local.index)
        if cfg.date_start is not None:
            m &= (df_local[cfg.date_col] >= cfg.date_start)
        if cfg.date_end is not None:
            m &= (df_local[cfg.date_col] < cfg.date_end)
        df_local = df_local.loc[m].copy()
        _log(f"Filtro fechas aplicado. Rango: {cfg.date_start} → {cfg.date_end} (exclusivo)")

    if df_local.empty:
        # Fail early with a clear message instead of crashing inside the encoder / UMAP
        raise ValueError("No hay documentos para procesar (revisa el filtro de fechas).")

    df_local[cfg.id_col] = df_local[cfg.id_col].astype(str)
    df_local[cfg.cat_col] = df_local[cfg.cat_col].astype(str)
    # Missing texts become "" rather than the literal token "nan", which would
    # otherwise pass the cleaning filter and be embedded as real content
    text_raw_col = df_local[cfg.text_col]
    df_local[cfg.text_col] = (
        text_raw_col.astype(object).where(text_raw_col.notna(), "").astype(str)
    )
    df_local[cfg.url_col] = df_local[cfg.url_col].astype(str)
    # After this reset, index labels equal row positions (relied upon below)
    df_local = df_local.reset_index(drop=True)

    texts_raw = df_local[cfg.text_col].tolist()
    _log(f"N documentos: {len(texts_raw)}")

    # Embeddings
    emb = compute_embeddings(df_local, cfg)

    # UMAP grid + BERTopic: keep the combination with the most clusters,
    # breaking ties by the lowest noise fraction
    best = None
    texts_clean = [scrub_text(t, cfg.min_token_len) for t in texts_raw]
    for nn in cfg.umap_neighbors_grid:
        for md in cfg.umap_min_dist_grid:
            _log(f"UMAP grid → n_neighbors={nn}, min_dist={md}")
            tm = make_topic_model(len(texts_clean), cfg, nn, md)
            topics, _ = tm.fit_transform(texts_clean, embeddings=emb)
            topics = np.asarray(topics, dtype=int)

            # metrics
            noise_frac = float((topics==-1).mean()) if len(topics) else 1.0
            n_clusters = len([t for t in np.unique(topics) if t!=-1])

            # tuples compare lexicographically: clusters first, then (1 - noise)
            score = (n_clusters, 1.0 - noise_frac)
            if (best is None) or (score > best["score"]):
                best = {"tm": tm, "topics": topics, "score": score,
                        "n_neighbors": nn, "min_dist": md}

    if best is None:
        raise ValueError("La rejilla UMAP está vacía (umap_neighbors_grid / umap_min_dist_grid).")

    tm = best["tm"]; topics = best["topics"]
    _log(f"Mejor combinación → n_neighbors={best['n_neighbors']}, min_dist={best['min_dist']}")

    # Macro keywords
    kw_macro = fixed_keywords_macro(tm, cfg.topic_kw_fixed)

    # --- Micro-topics / singletons over noise (-1)
    idx_noise = np.where(topics == -1)[0]
    micro_clusters = build_micro_clusters(idx_noise, emb, cfg)

    # New topic ids continue after the largest macro id
    cur_max = max([-1]+[int(t) for t in np.unique(topics) if t!=-1])
    new_id = cur_max + 1
    micro_kw: Dict[int, List[str]] = {}
    for cid, rows in micro_clusters.items():
        rows = list(rows)
        topics[rows] = new_id
        # local TF-IDF keywords
        cluster_texts = [texts_clean[i] for i in rows]
        micro_kw[new_id] = tfidf_keywords_for_docs(cluster_texts, cfg.topic_kw_fixed, cfg.tfidf_max_features_micro)
        new_id += 1

    # Merge keywords (macro + micro)
    kw_all = dict(kw_macro)
    kw_all.update(micro_kw)

    # Topic -> most frequent Think Tank
    topic_to_idx: Dict[int, List[int]] = defaultdict(list)
    for i, t in enumerate(topics.tolist()):
        topic_to_idx[int(t)].append(i)
    rows = []
    for t, idxs in topic_to_idx.items():
        tt_counts = Counter(df_local.loc[idxs, cfg.cat_col].tolist())
        top_tt = tt_counts.most_common(1)[0][0] if len(tt_counts)>0 else "(sin docs)"
        rows.append({"topic_id": t, "category_top1": top_tt})
    df_map = pd.DataFrame(rows)
    df_map["topic_desc_fixed"] = df_map["topic_id"].map(lambda tid: ", ".join(kw_all.get(int(tid), [])))
    Path(cfg.out_dir_map).mkdir(parents=True, exist_ok=True)
    df_map.to_csv(os.path.join(cfg.out_dir_map, "topic_to_category_top1.csv"), index=False)

    # Export document -> topic table
    df_docs = pd.DataFrame({
        "doc_row": np.arange(len(texts_raw)),
        "ID": df_local[cfg.id_col].values,
        "Think_Tank_true": df_local[cfg.cat_col].values,
        "FechaPublicacion": df_local[cfg.date_col].values,
        "topic_id": topics
    })
    df_docs.to_csv(os.path.join(cfg.out_dir_topics, "docs_topics.csv"), index=False)

    # 2D/3D UMAP projections for the app
    _log("UMAP 2D/3D para visual…")
    reducer2d = umap.UMAP(n_neighbors=best["n_neighbors"], min_dist=0.0, n_components=2,
                          metric=cfg.umap_metric, random_state=cfg.random_state, verbose=True)
    z2d = reducer2d.fit_transform(emb)
    reducer3d = umap.UMAP(n_neighbors=best["n_neighbors"], min_dist=0.0, n_components=3,
                          metric=cfg.umap_metric, random_state=cfg.random_state, verbose=True)
    z3d = reducer3d.fit_transform(emb)

    # df_vis for the app
    date_series = pd.to_datetime(df_local[cfg.date_col], errors="coerce").dt.floor("D")
    topic_str = np.array([str(int(t)) for t in topics])
    df_vis = pd.DataFrame({
        "x": z2d[:,0], "y": z2d[:,1],
        "x3": z3d[:,0], "y3": z3d[:,1], "z3": z3d[:,2],
        "topic_id": topics,
        "topic_str": topic_str,
        "ThinkTank": df_local[cfg.cat_col].values,
        "FechaPublicacion": date_series,
        "ID": df_local[cfg.id_col].values,
        "Enlace": df_local[cfg.url_col].values
    })
    df_vis["topic_kw"] = [", ".join(kw_all.get(int(t), [])) for t in df_vis["topic_id"]]

    # Also save df_vis and the macro topic_info table
    tm.get_topic_info().to_csv(os.path.join(cfg.out_dir_topics, "topic_info_macro.csv"), index=False)
    df_vis.to_csv(os.path.join(cfg.out_dir_topics, "df_vis.csv"), index=False)

    # App
    write_dash_app(df_vis, kw_all, cfg.run_dir, kw_fixed=cfg.topic_kw_fixed,
                   size2d=cfg.app_marker_2d_size, size3d=cfg.app_marker_3d_size)

    _log("Listo ✓")
    return {
        "run_dir": cfg.run_dir,
        "embeddings": emb,
        "topic_model_macro": tm,
        "topics_final": topics,
        "keywords_all": kw_all,
        "df_vis": df_vis
    }


# Fin plebiscito 1 - Fin plebiscito 2

In [None]:
# 1) Optionally adjust the date window
CFG.date_start = pd.Timestamp("2019-01-01")
CFG.date_end   = pd.Timestamp("2023-12-31")  # exclusive: the pipeline keeps dates strictly before this value

# 2) Ask for embeddings to be rebuilt
# NOTE(review): no function visible in this notebook reads force_reembed; compute_embeddings always re-encodes
CFG.force_reembed = True

# 3) Run the pipeline
results = run_pipeline_on_df(df, CFG)


[15:39:50] [RUN] id=20251021_153950 -> NLP\runs\20251021_153950
[15:39:50] Filtro fechas aplicado. Rango: 2019-01-01 00:00:00 → 2023-12-31 00:00:00 (exclusivo)
[15:39:50] N documentos: 17418
[15:39:50] Calculando embeddings SBERT…


Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x0000025B8362FA70>>
Traceback (most recent call last):
  File "c:\Users\rodri\AppData\Local\Programs\Python\Python312\Lib\site-packages\ipykernel\ipkernel.py", line 796, in _clean_thread_parent_frames
    active_threads = {thread.ident for thread in threading.enumerate()}
                                                 ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\rodri\AppData\Local\Programs\Python\Python312\Lib\threading.py", line 1535, in enumerate
    def enumerate():
    
KeyboardInterrupt: 


Batches:   0%|          | 0/545 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Estallido social - Fin plebiscito