In [3]:

import re, numpy as np, pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer

# Plotly renderer used for notebook output
import plotly.io as pio
pio.renderers.default = "vscode"  # switch to another renderer name (e.g. "colab") if preferred

# ---------- parameters ----------
DOCS, PAGES = Path("documents.parquet"), Path("pages.parquet")
assert all(p.exists() for p in (DOCS, PAGES)), "Se requieren documents.parquet y pages.parquet"

MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
PARA_MAX_TOK = 120      # approx. whitespace-separated words per paragraph chunk
MIN_PARA_LEN = 8        # chunks with fewer words than this are discarded
BATCH_SIZE   = 64       # batch size passed to the sentence encoder
RANDOM_STATE = 42

def norm_space(s):
    """Normalize whitespace in ``s``.

    Every run of whitespace becomes a single space, a space is inserted after a
    period that is immediately followed by an uppercase letter (including
    Á, É, Í, Ó, Ú, Ñ), and the result is stripped at both ends.
    """
    single_spaced = re.sub(r"\s+", " ", s)
    # Re-separate sentences that were glued together ("fin.Inicio" -> "fin. Inicio").
    return re.sub(r"(\.)([A-ZÁÉÍÓÚÑ])", r"\1 \2", single_spaced).strip()

def split_paragraphs(text):
    """Split ``text`` into normalized paragraph chunks.

    The text is first cut on blank lines. A block with at most PARA_MAX_TOK
    whitespace-separated words is kept whole; a longer block is cut after
    sentence punctuation (. ? ! ; :) and its sentences are packed greedily into
    chunks so that a chunk is closed before it would exceed PARA_MAX_TOK words
    (a single sentence longer than the limit still forms its own chunk).
    Chunks with fewer than MIN_PARA_LEN words are dropped from the result.
    """
    blocks = [b.strip() for b in re.split(r"\n\s*\n+", text) if b.strip()]
    if not blocks:
        # No non-blank block found: fall back to treating the whole text as one block.
        blocks = [text]

    chunks = []
    for block in blocks:
        if len(block.split()) <= PARA_MAX_TOK:
            chunks.append(norm_space(block))
            continue

        pending, n_words = [], 0
        for sentence in re.split(r"(?<=[\.\?\!;:])\s+", block):
            size = len(sentence.split())
            # Flush the buffer before adding a sentence that would overflow it.
            if pending and n_words + size > PARA_MAX_TOK:
                chunks.append(norm_space(" ".join(pending)))
                pending, n_words = [], 0
            pending.append(sentence)
            n_words += size
        if pending:
            chunks.append(norm_space(" ".join(pending)))

    return [c for c in chunks if len(c.split()) >= MIN_PARA_LEN]

# 11P.1 — read pages and segment each document into paragraphs
pages = pd.read_parquet(PAGES).sort_values(["candidate","filename","page"])
para_rows = []
doc_groups = pages.groupby(["candidate","filename"], sort=False)
for (cand, fname), g in tqdm(doc_groups, desc="[11P] párrafos por doc"):
    # Concatenate the document's pages in page order, skipping non-string cells.
    txt = "\n".join(x for x in g["text"] if isinstance(x, str))
    para_rows.extend(
        {"candidate": cand, "filename": fname, "para_id": i, "text": p}
        for i, p in enumerate(split_paragraphs(txt), start=1)
    )
paras_df = pd.DataFrame(para_rows)
print("[11P] párrafos construidos:", paras_df.shape)

# 11P.2 — sentence embeddings
model = SentenceTransformer(MODEL_NAME)

def encode_texts(texts):
    """Encode ``texts`` with the module-level ``model`` and return the result cast to float32.

    The encoder is called with BATCH_SIZE, a progress bar, and
    ``normalize_embeddings=True``.
    """
    vectors = model.encode(
        texts,
        batch_size=BATCH_SIZE,
        show_progress_bar=True,
        normalize_embeddings=True,
    )
    return vectors.astype("float32")

Z_para = encode_texts(paras_df["text"].tolist())  # (n_paragraphs, d)

# 11P.3 — mean embedding per document and per candidate (kept in memory only)
# Each paragraph is tagged with a "candidate||filename" key identifying its document.
key_doc = (paras_df["candidate"].astype(str) + "||" + paras_df["filename"].astype(str)).to_numpy()
doc_keys = np.unique(key_doc)

doc_means = []
for k in tqdm(doc_keys, desc="[11P] avg doc"):
    doc_means.append(Z_para[key_doc == k].mean(axis=0))
E_docs = np.vstack(doc_means)
# Recover the two key parts; maxsplit=1 keeps any later "||" inside the filename.
meta_docs = pd.DataFrame([k.split("||", 1) for k in doc_keys], columns=["candidate","filename"])

cands = sorted(paras_df["candidate"].unique().tolist())
cand_means = []
for c in tqdm(cands, desc="[11P] avg cand"):
    # Candidate vector = mean of that candidate's document means.
    cand_means.append(E_docs[(meta_docs["candidate"] == c).to_numpy()].mean(axis=0))
E_cand = np.vstack(cand_means)

print("[11P] shapes — Z_para:", Z_para.shape, " E_docs:", E_docs.shape, " E_cand:", E_cand.shape)


[11P] párrafos por doc:   0%|          | 0/8 [00:00<?, ?it/s]

[11P] párrafos construidos: (2021, 4)


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[11P] avg doc:   0%|          | 0/8 [00:00<?, ?it/s]

[11P] avg cand:   0%|          | 0/8 [00:00<?, ?it/s]

[11P] shapes — Z_para: (2021, 384)  E_docs: (8, 384)  E_cand: (8, 384)


In [8]:
# %% MÓDULO 12-P — Radar + UMAP 2D/3D interactivos
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd, numpy as np, umap
import plotly.graph_objects as go
import plotly.express as px

# 12-P.1 — candidate-by-candidate cosine similarity matrix
S = cosine_similarity(E_cand, E_cand)
labels = cands

# 12-P.2 — interactive radar chart
# Min-max rescale the whole matrix to [0, 100]; the epsilon avoids division by zero.
S01 = (S - S.min()) / (S.max() - S.min() + 1e-9)
S100 = 100.0 * S01
theta = labels + [labels[0]]  # repeat the first axis so each polygon closes

hover_tail = "<br>%{theta}: %{r:.1f}<extra></extra>"
fig_radar = go.Figure()
for i, name in enumerate(labels):
    vals = S100[i, :].tolist()
    vals.append(vals[0])  # close the polygon
    trace = go.Scatterpolar(
        r=vals,
        theta=theta,
        mode="lines+markers",
        fill="toself",
        name=name,
        hovertemplate="<b>Referencia:</b> " + name + hover_tail,
    )
    fig_radar.add_trace(trace)

radial_axis = dict(range=[0,100], tick0=0, dtick=20)
fig_radar.update_layout(
    title="Afinidad BERT entre candidatos (radar interactivo)",
    polar=dict(radialaxis=radial_axis),
    legend_title_text="Color = candidato de referencia"
)
fig_radar.show()

# 12-P.3 — global 2D UMAP over all paragraphs
cand_codes = paras_df["candidate"].astype("category")
cand_idx = cand_codes.cat.codes
cand_names = cand_codes.cat.categories.tolist()

# Seed comes from the notebook-wide RANDOM_STATE instead of a second hard-coded 42,
# so changing the constant changes every projection consistently.
um2 = umap.UMAP(
    n_components=2, random_state=RANDOM_STATE,
    metric="cosine", n_neighbors=15, min_dist=0.1
).fit_transform(Z_para)

df_um2 = pd.DataFrame({
    "UMAP1": um2[:, 0],
    "UMAP2": um2[:, 1],
    "candidate": paras_df["candidate"],
    "filename": paras_df["filename"],
    "para_id": paras_df["para_id"],
    # Hover text is truncated to 280 characters to keep tooltips readable.
    "text": [t[:280]+"…" if isinstance(t,str) and len(t)>280 else t
             for t in paras_df["text"]]
})

# Qualitative palette: Dark24 when this Plotly version provides it, otherwise the default one.
palette = getattr(px.colors.qualitative, "Dark24", px.colors.qualitative.Plotly)

fig_u2 = px.scatter(
    df_um2, x="UMAP1", y="UMAP2",
    color="candidate",
    hover_data=["filename", "para_id", "text"],
    title="UMAP 2D — párrafos coloreados por candidato",
    color_discrete_sequence=palette
)
fig_u2.update_traces(marker=dict(size=6, opacity=0.8))
fig_u2.show()

# 12-P.4 — global 3D UMAP over all paragraphs
# Seed comes from the notebook-wide RANDOM_STATE instead of a second hard-coded 42.
um3 = umap.UMAP(
    n_components=3, random_state=RANDOM_STATE,
    metric="cosine", n_neighbors=15, min_dist=0.1
).fit_transform(Z_para)

df_um3 = pd.DataFrame({
    "UMAP1": um3[:, 0],
    "UMAP2": um3[:, 1],
    "UMAP3": um3[:, 2],
    "candidate": paras_df["candidate"],
    "filename": paras_df["filename"],
    "para_id": paras_df["para_id"],
    # Hover text is truncated to 200 characters to keep tooltips readable.
    "text": [t[:200]+"…" if isinstance(t,str) and len(t)>200 else t
             for t in paras_df["text"]]
})

fig_u3 = px.scatter_3d(
    df_um3, x="UMAP1", y="UMAP2", z="UMAP3",
    color="candidate",
    hover_data=["filename", "para_id", "text"],
    title="UMAP 3D — párrafos coloreados por candidato",
    color_discrete_sequence=palette
)
fig_u3.update_traces(marker=dict(size=3, opacity=0.8))
fig_u3.show()



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.




n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [9]:
# %% MÓDULO 13-P — HDBSCAN por candidato + UMAP 2D/3D con selector
import numpy as np, pandas as pd
import hdbscan, umap
import plotly.express as px
import plotly.graph_objects as go

MIN_CLUSTER_SIZE = 8
MIN_SAMPLES      = 5
# The paragraph embeddings are produced with normalize_embeddings=True, and for unit-length
# vectors Euclidean distance orders pairs the same way cosine distance does.
# NOTE(review): the original note said metric="cosine" also works here — confirm that the
# installed hdbscan accepts it before switching.
METRIC           = "euclidean"

# UMAP settings shared by the 2D and 3D projections; seeded from the notebook-wide
# RANDOM_STATE rather than a repeated hard-coded 42.
_umap_kw = dict(random_state=RANDOM_STATE, metric="cosine", n_neighbors=15, min_dist=0.1)

# Precompute per candidate: HDBSCAN labels + 2D/3D UMAP coordinates
per_cand = {}  # cand -> dict(idx, labels, U2, U3)
for cand in tqdm(cands, desc="[13P] HDBSCAN por candidato"):
    idx = np.where(paras_df["candidate"].values == cand)[0]
    Xc  = Z_para[idx]
    if Xc.shape[0] < max(10, MIN_CLUSTER_SIZE):
        # Too few paragraphs to cluster or project: mark everything as noise (-1).
        per_cand[cand] = {"idx":idx, "labels":np.full(len(idx), -1), "U2":None, "U3":None}
        continue
    lab = hdbscan.HDBSCAN(min_cluster_size=MIN_CLUSTER_SIZE, min_samples=MIN_SAMPLES, metric=METRIC).fit_predict(Xc)
    U2c = umap.UMAP(n_components=2, **_umap_kw).fit_transform(Xc)
    U3c = umap.UMAP(n_components=3, **_umap_kw).fit_transform(Xc)
    per_cand[cand] = {"idx":idx, "labels":lab, "U2":U2c, "U3":U3c}

# 13P.1 — 2D UMAP with a candidate dropdown
def fig_umap_2d_dropdown(per_cand):
    """Build a 2D scatter figure with one trace per candidate and a dropdown to switch between them.

    Args:
        per_cand: mapping candidate -> dict with keys "idx" (row positions into
            ``paras_df``), "labels" (one value per row, used as marker color) and
            "U2" (array with two coordinate columns, or None when no projection exists).

    Returns:
        A Plotly figure in which only the first candidate's trace is initially
        visible. An empty ``per_cand`` yields an empty figure without a dropdown
        (the original raised IndexError when building the title).
    """
    names = list(per_cand)
    traces, buttons = [], []
    for pos, (cand, d) in enumerate(per_cand.items()):
        idx, lab, U2c = d["idx"], d["labels"], d["U2"]
        if U2c is None:
            # No projection for this candidate: place every point at the origin as noise.
            U2c = np.zeros((len(idx), 2)); lab = np.full(len(idx), -1)
        # Fetch the hover columns once per candidate instead of materializing a full
        # row Series twice for every single point.
        rows = paras_df.iloc[idx]
        hover = [f"{cand} — {fname} — ¶{int(pid)}"
                 for fname, pid in zip(rows["filename"], rows["para_id"])]
        traces.append(go.Scatter(
            x=U2c[:,0], y=U2c[:,1],
            mode="markers",
            marker=dict(size=6, color=lab, colorscale="Viridis"),
            name=cand,
            text=hover,
            hovertemplate="%{text}<extra></extra>",
            visible=(pos == 0)
        ))
        # Each button shows exactly one trace and updates the title to match.
        buttons.append(dict(label=cand,
                            method="update",
                            args=[{"visible":[k == pos for k in range(len(names))]},
                                  {"title":f"UMAP 2D — {cand} (HDBSCAN: -1=ruido)"}]))

    first = names[0] if names else "sin candidatos"
    fig = go.Figure(data=traces)
    fig.update_layout(
        title=f"UMAP 2D — {first} (HDBSCAN: -1=ruido)",
        updatemenus=[dict(type="dropdown", x=1.02, y=1.0, showactive=True, buttons=buttons)] if buttons else [],
        xaxis_title="UMAP1", yaxis_title="UMAP2"
    )
    return fig

fig_2d = fig_umap_2d_dropdown(per_cand)
fig_2d.show()

# 13P.2 — 3D UMAP with a candidate dropdown
def fig_umap_3d_dropdown(per_cand):
    """Build a 3D scatter figure with one trace per candidate and a dropdown to switch between them.

    Args:
        per_cand: mapping candidate -> dict with keys "idx" (row positions into
            ``paras_df``), "labels" (one value per row, used as marker color) and
            "U3" (array with three coordinate columns, or None when no projection exists).

    Returns:
        A Plotly figure in which only the first candidate's trace is initially
        visible. An empty ``per_cand`` yields an empty figure without a dropdown
        (the original raised IndexError when building the title).
    """
    names = list(per_cand)
    traces, buttons = [], []
    for pos, (cand, d) in enumerate(per_cand.items()):
        idx, lab, U3c = d["idx"], d["labels"], d["U3"]
        if U3c is None:
            # No projection for this candidate: place every point at the origin as noise.
            U3c = np.zeros((len(idx), 3)); lab = np.full(len(idx), -1)
        # Fetch the hover columns once per candidate instead of materializing a full
        # row Series twice for every single point.
        rows = paras_df.iloc[idx]
        hover = [f"{cand} — {fname} — ¶{int(pid)}"
                 for fname, pid in zip(rows["filename"], rows["para_id"])]
        traces.append(go.Scatter3d(
            x=U3c[:,0], y=U3c[:,1], z=U3c[:,2],
            mode="markers",
            marker=dict(size=3, color=lab, colorscale="Viridis"),
            name=cand,
            text=hover,
            hovertemplate="%{text}<extra></extra>",
            visible=(pos == 0)
        ))
        # Each button shows exactly one trace and updates the title to match.
        buttons.append(dict(label=cand,
                            method="update",
                            args=[{"visible":[k == pos for k in range(len(names))]},
                                  {"title":f"UMAP 3D — {cand} (HDBSCAN: -1=ruido)"}]))

    first = names[0] if names else "sin candidatos"
    fig = go.Figure(data=traces)
    fig.update_layout(
        title=f"UMAP 3D — {first} (HDBSCAN: -1=ruido)",
        updatemenus=[dict(type="dropdown", x=1.05, y=1.0, showactive=True, buttons=buttons)] if buttons else [],
        scene=dict(xaxis_title="UMAP1", yaxis_title="UMAP2", zaxis_title="UMAP3")
    )
    return fig

fig_3d = fig_umap_3d_dropdown(per_cand)
fig_3d.show()


[13P] HDBSCAN por candidato:   0%|          | 0/8 [00:00<?, ?it/s]


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting 

In [10]:
# %% BERTopic por candidato — HDBSCAN + c-TF-IDF + jerarquía + resúmenes + distribuciones (Plotly)
!pip -q install bertopic hdbscan umap-learn scikit-learn tqdm plotly

import numpy as np, pandas as pd
from tqdm.auto import tqdm
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances
import umap, hdbscan
import plotly.express as px
import plotly.graph_objects as go

# --------- PRECONDITIONS ---------
# The kernel must already hold:
#   - paras_df: DataFrame with columns ["candidate","filename","para_id","text"]
#   - Z_para  : np.ndarray of shape (n_paragraphs, d) with normalized embeddings
assert all(name in globals() for name in ('paras_df', 'Z_para')), "Faltan 'paras_df' y/o 'Z_para' en memoria."

# --------- PARAMETERS ---------
RANDOM_STATE = 42
MIN_CLUSTER_SIZE = 10     # HDBSCAN per candidate; tune to the length of the programme
MIN_SAMPLES      = 5
UMAP_N_NEIGHBORS = 15
UMAP_MIN_DIST    = 0.10

TOP_TERMS = 12            # top c-TF-IDF terms shown per topic
TOP_DOCS  = 3             # representative paragraphs shown per topic
MAX_TOPICS_PLOT = 25      # cap for the larger visualizations

# Vectorizer used for c-TF-IDF (unigrams and bigrams, accents stripped)
vectorizer = CountVectorizer(lowercase=True, strip_accents='unicode', ngram_range=(1,2), min_df=2, max_df=0.95)

# Base UMAP/HDBSCAN models handed to BERTopic: embeddings are reduced to 5
# components with UMAP and the reduced vectors are then clustered by HDBSCAN.
umap_model = umap.UMAP(
    n_components=5,
    n_neighbors=UMAP_N_NEIGHBORS,
    min_dist=UMAP_MIN_DIST,
    metric="cosine",
    random_state=RANDOM_STATE,
)
hdb_model = hdbscan.HDBSCAN(
    metric="euclidean",
    min_cluster_size=MIN_CLUSTER_SIZE,
    min_samples=MIN_SAMPLES,
    prediction_data=True,
)

# Qualitative palette: Dark24 when this Plotly version provides it, otherwise the default one.
palette = getattr(px.colors.qualitative, "Dark24", px.colors.qualitative.Plotly)

# --------- FUNCIÓN UTIL — construir tablas/visuales por candidato ---------
def run_bertopic_for_candidate(cand_name, show_hierarchy=True):
    """Fit a BERTopic model on one candidate's paragraphs and display its topic reports.

    The candidate's rows of ``Z_para`` are passed as precomputed embeddings, so
    BERTopic does not re-embed the texts. The function displays, in order: the
    head of the topic/term table, the head of the extractive summaries, the
    paragraph distribution per topic (table and bar chart), a compact
    "topic -> terms" table, optionally the topic hierarchy figure, and a 2D UMAP
    scatter colored by topic.

    Args:
        cand_name: value of ``paras_df["candidate"]`` selecting the paragraphs.
        show_hierarchy: when True, also try to render BERTopic's hierarchy figure.

    Returns:
        dict with keys "model", "topic_info", "topic_terms", "topic_summary" and
        "mix_agenda"; or None when the candidate has too few paragraphs or every
        paragraph was assigned to the noise topic (-1).
    """
    import copy

    def _snippet(txt, limit=420):
        """Return ``txt`` cut to ``limit`` characters, with an ellipsis when truncated."""
        return (txt[:limit] + "…") if len(txt) > limit else txt

    def _closest_to_centroid(sel):
        """Return up to TOP_DOCS positions from ``sel`` nearest (cosine) to the mean embedding of ``sel``."""
        centroid = X[sel].mean(axis=0, keepdims=True)
        dist = pairwise_distances(X[sel], centroid, metric="cosine").ravel()
        return sel[np.argsort(dist)[:TOP_DOCS]]

    # Candidate subset
    idx = np.where(paras_df["candidate"].values == cand_name)[0]
    if len(idx) < max(20, MIN_CLUSTER_SIZE + 5):
        print(f"[BERTopic] {cand_name}: muy pocos párrafos ({len(idx)}); omito.")
        return

    subset = paras_df.iloc[idx]
    texts = subset["text"].tolist()
    files = subset["filename"].tolist()
    para_ids = subset["para_id"].tolist()
    X = Z_para[idx]

    # The module-level vectorizer / umap_model / hdb_model are shared by every call.
    # BERTopic fits the sub-models it is given, so each candidate gets private copies;
    # otherwise every returned model would reference the same estimator objects,
    # last fitted on whichever candidate ran most recently.
    topic_model = BERTopic(
        embedding_model=None,             # embeddings are supplied at fit time
        vectorizer_model=copy.deepcopy(vectorizer),
        umap_model=copy.deepcopy(umap_model),
        hdbscan_model=copy.deepcopy(hdb_model),
        calculate_probabilities=True,     # used to rank paragraphs in the summary fallback
        language="multilingual",
        nr_topics="auto",                 # automatic topic reduction
        verbose=False
    )

    topics, probs = topic_model.fit_transform(texts, embeddings=X)
    topics_arr = np.asarray(topics)

    # --- Topic table (general info) ---
    info = topic_model.get_topic_info()
    # Topic -1 (noise) is excluded from summaries and distributions.
    info_nz = info[info["Topic"]!=-1].copy().reset_index(drop=True)
    if info_nz.empty:
        print(f"[BERTopic] {cand_name}: solo ruido; revisa parámetros.")
        return
    topic_ids = info_nz["Topic"].tolist()

    # One lookup per topic, reused by both term tables: [(term, score), ...]
    topic_words = {t_id: (topic_model.get_topic(t_id) or []) for t_id in topic_ids}

    # --- Table "topic -> top c-TF-IDF terms" ---
    rows = []
    for t_id in topic_ids:
        for rank, (term, score) in enumerate(topic_words[t_id][:TOP_TERMS], start=1):
            rows.append({"candidate": cand_name, "topic_id": t_id,
                         "rank": rank, "term": term, "ctfidf": float(score)})
    topic_terms = pd.DataFrame(rows)
    display(topic_terms.head(min(10, len(topic_terms))))

    # --- Extractive summary per topic: most representative paragraphs ---
    # Best effort: if the native lookup fails, the fallbacks below are used instead.
    try:
        rep_map = {t: topic_model.get_representative_docs(t) for t in topic_ids}
    except Exception:
        rep_map = {}

    summary_rows = []
    for t_id in topic_ids:
        sel = np.where(topics_arr == t_id)[0]
        if len(sel) == 0:
            continue

        reps = rep_map.get(t_id)
        if reps:
            for j, txt in enumerate(reps[:TOP_DOCS], start=1):
                summary_rows.append({"candidate": cand_name, "topic_id": t_id, "order": j,
                                     "snippet": _snippet(txt)})
            continue

        # Fallback 1: rank the topic's paragraphs by their probability for this topic,
        # when a 2-D probability matrix with a column for the topic is available.
        has_prob_col = (probs is not None and getattr(probs, "ndim", 0) == 2
                        and t_id < probs.shape[1])
        if has_prob_col:
            ranked = sorted(((int(i), float(probs[i, t_id])) for i in sel),
                            key=lambda pair: pair[1], reverse=True)[:TOP_DOCS]
            for j, (i, p) in enumerate(ranked, start=1):
                summary_rows.append({"candidate": cand_name, "topic_id": t_id, "order": j,
                                     "prob": p, "snippet": _snippet(texts[i])})
        else:
            # Fallback 2: paragraphs closest to the topic's mean embedding.
            for j, i in enumerate(_closest_to_centroid(sel), start=1):
                summary_rows.append({"candidate": cand_name, "topic_id": t_id, "order": j,
                                     "snippet": _snippet(texts[i])})

    topic_summ = pd.DataFrame(summary_rows)
    if not topic_summ.empty:
        print(f"=== Resúmenes extractivos — {cand_name} ===")
        display(topic_summ.head(min(10, len(topic_summ))))

    # --- Percentage of paragraphs per topic (agenda mix), noise excluded ---
    mask = topics_arr != -1
    vc = pd.Series(topics_arr[mask]).value_counts().rename_axis("topic_id").reset_index(name="n")
    vc["pct"] = 100 * vc["n"] / vc["n"].sum()
    vc = vc.sort_values("pct", ascending=False)
    print(f"=== Distribución de párrafos por tópico — {cand_name} ===")
    display(vc)

    # Bar chart of the distribution (largest topics only)
    vc_top = vc.head(MAX_TOPICS_PLOT)
    fig_mix = px.bar(
        vc_top, x="topic_id", y="pct",
        title=f"Mix de agenda — {cand_name} (porcentaje de párrafos por tópico)",
        labels={"topic_id":"Tópico", "pct":"% de párrafos"},
        text=vc_top["pct"].round(1).astype(str)+"%"
    )
    fig_mix.update_traces(marker_color=palette[0], textposition="outside")
    # Head-room above the tallest bar so the outside labels are not clipped.
    fig_mix.update_layout(yaxis=dict(range=[0, max(10, vc["pct"].max()*1.15)]))
    fig_mix.show()

    # --- Compact "topic -> terms" table (top TOP_TERMS) ---
    cards = [{"topic_id": t_id,
              "terms": ", ".join([w for w, _ in topic_words[t_id]][:TOP_TERMS])}
             for t_id in topic_ids]
    topic_cards = pd.DataFrame(cards).sort_values("topic_id")
    print(f"=== Términos (c-TF-IDF) por tópico — {cand_name} ===")
    display(topic_cards)

    # --- Topic hierarchy ---
    if show_hierarchy:
        try:
            # BERTopic returns a native Plotly figure
            fig_h = topic_model.visualize_hierarchy(top_n_topics=min(MAX_TOPICS_PLOT, len(info_nz)))
            fig_h.update_layout(title_text=f"Jerarquía de tópicos — {cand_name}")
            fig_h.show()
        except Exception as e:
            print(f"[BERTopic] Jerarquía no disponible ({e})")

    # --- Optional: 2D UMAP of the candidate's paragraphs colored by topic ---
    try:
        # A fresh 2D UMAP fitted directly on the candidate's embeddings; this is separate
        # from the 5-component reduction configured for BERTopic's clustering step.
        umap2 = umap.UMAP(n_components=2, random_state=RANDOM_STATE, metric="cosine").fit_transform(X)
        df_u2 = pd.DataFrame({"UMAP1": umap2[:,0], "UMAP2": umap2[:,1], "topic": topics, "file": files, "para_id": para_ids})
        df_u2 = df_u2[df_u2["topic"]!=-1]
        fig_u2 = px.scatter(
            df_u2, x="UMAP1", y="UMAP2", color="topic",
            hover_data=["file","para_id"],
            title=f"UMAP 2D — {cand_name} (color = tópico HDBSCAN)",
            color_continuous_scale="Viridis"
        )
        fig_u2.update_traces(marker=dict(size=6, opacity=0.8))
        fig_u2.show()
    except Exception as e:
        print(f"[BERTopic] UMAP 2D por candidato no disponible ({e})")

    # In-memory artifacts for later reuse
    return {
        "model": topic_model,
        "topic_info": info,
        "topic_terms": topic_terms,
        "topic_summary": topic_summ,
        "mix_agenda": vc
    }

# --------- EXECUTION: one BERTopic run per candidate ---------
candidatos = paras_df["candidate"].unique().tolist()
candidatos.sort()
artefactos_por_candidato = {}

banner = "=" * 80
for cand in tqdm(candidatos, desc="[BERTopic] candidatos"):
    # Header block separating each candidate's output.
    print(f"\n{banner}")
    print(f"BERTopic — {cand}")
    print(banner)
    # The stored value is None when the candidate was skipped.
    artefactos_por_candidato[cand] = run_bertopic_for_candidate(cand, show_hierarchy=True)



[notice] A new release of pip is available: 24.1.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


[BERTopic] candidatos:   0%|          | 0/8 [00:00<?, ?it/s]


BERTopic — artes


Unnamed: 0,candidate,topic_id,rank,term,ctfidf
0,artes,0,1,politica,0.020488
1,artes,0,2,pueblos,0.01849
2,artes,0,3,los pueblos,0.017236
3,artes,0,4,oro,0.017216
4,artes,0,5,patria,0.0166
5,artes,0,6,sistema,0.015964
6,artes,0,7,refundacion,0.013757
7,artes,0,8,en chile,0.013453
8,artes,0,9,refundacion de,0.013293
9,artes,0,10,proceso,0.013222


=== Resúmenes extractivos — artes ===


Unnamed: 0,candidate,topic_id,order,snippet
0,artes,0,1,• El Gobierno Patriótico Popular promoverá un ...
1,artes,0,2,• El Gobierno Patriótico Popular se empeñará d...
2,artes,0,3,• Establecer relaciones diplomáticas basadas e...
3,artes,1,1,La cantidad de empresas afectas durante la pri...
4,artes,1,2,• El Primer Sistema Nacional de Planificación ...
5,artes,1,3,El Primer Sistema Nacional de Planificación Qu...
6,artes,2,1,Tal posibilidad se abre cuando las grandes mas...
7,artes,2,2,"En cambio, las grandes mayorías se han quedado..."
8,artes,2,3,"Asimismo, el sólo hecho de tomarse el poder po..."
9,artes,3,1,"Los trabajadores del sector público, por ejemp..."


=== Distribución de párrafos por tópico — artes ===


Unnamed: 0,topic_id,n,pct
0,0,96,37.209302
1,1,66,25.581395
2,2,29,11.24031
3,3,23,8.914729
4,4,17,6.589147
5,5,14,5.426357
6,6,13,5.03876


=== Términos (c-TF-IDF) por tópico — artes ===


Unnamed: 0,topic_id,terms
0,0,"politica, pueblos, los pueblos, oro, patria, s..."
1,1,"de produccion, empresas, sistema, de planifica..."
2,2,"poder, politico, poder politico, ninos, social..."
3,3,"trabajo, trabajadores, los trabajadores, salar..."
4,4,"pensiones, sistema, de pensiones, sistema de, ..."
5,5,"crimen, organizado, crimen organizado, del cri..."
6,6,"energia, la energia, de energia, generacion, g..."



BERTopic — harold


Unnamed: 0,candidate,topic_id,rank,term,ctfidf
0,harold,0,1,salud,0.026067
1,harold,0,2,seguridad,0.023018
2,harold,0,3,de salud,0.017932
3,harold,0,4,personas,0.017868
4,harold,0,5,que no,0.016981
5,harold,0,6,prevencion,0.016405
6,harold,0,7,de seguridad,0.015912
7,harold,0,8,familias,0.015912
8,harold,0,9,temprana,0.015497
9,harold,0,10,en el,0.015452


=== Resúmenes extractivos — harold ===


Unnamed: 0,candidate,topic_id,order,snippet
0,harold,0,1,Esta estrategia contará con financiamiento ded...
1,harold,0,2,"Frente a este clamor ciudadano, nuestro gobier..."
2,harold,0,3,Proponemos un Programa Nacional de Salud Menta...
3,harold,1,1,42 Esta es una invitación a quienes no se resi...
4,harold,1,2,Esto permitirá incluir a los migrantes de buen...
5,harold,1,3,______________________________________________...
6,harold,2,1,Propondremos un cambio estructural radical en ...
7,harold,2,2,Se crearán más ligas y campeonatos femeninos e...
8,harold,2,3,"Por ello, proponemos también incorporar pausas..."
9,harold,3,1,Chile enfrenta serios desafíos en esta materia...


=== Distribución de párrafos por tópico — harold ===


Unnamed: 0,topic_id,n,pct
0,0,32,30.47619
1,1,27,25.714286
2,2,18,17.142857
3,3,15,14.285714
4,4,13,12.380952


=== Términos (c-TF-IDF) por tópico — harold ===


Unnamed: 0,topic_id,terms
0,0,"salud, seguridad, de salud, personas, que no, ..."
1,1,"todos, de chile, donde, nuestro, queremos, chi..."
2,2,"deporte, nacional de, educacion, en el, datos,..."
3,3,"obesidad, salud, chilenos, fisica, vida, activ..."
4,4,"viviendas, de viviendas, territorial, construc..."



BERTopic — jara


Unnamed: 0,candidate,topic_id,rank,term,ctfidf
0,jara,0,1,como,0.072641
1,jara,0,2,participacion,0.056403
2,jara,0,3,el derecho,0.056403
3,jara,0,4,vivir,0.054714
4,jara,0,5,derecho,0.054714
5,jara,0,6,vida,0.042176
6,jara,0,7,fortaleceremos,0.042176
7,jara,0,8,cuidados,0.04212
8,jara,0,9,sistema nacional,0.04212
9,jara,0,10,humanos,0.04212


=== Resúmenes extractivos — jara ===


Unnamed: 0,candidate,topic_id,order,snippet
0,jara,0,1,"sin participación, sin derechos garantizados, ..."
1,jara,0,2,Impulsaremos trayectorias educativas continuas...
2,jara,0,3,Esta articulación no sólo ofrece una estrategi...
3,jara,1,1,Eso no es justo . Nuestro compromiso es que la...
4,jara,1,2,Todo esto ha sido posible gracias a una férrea...
5,jara,1,3,"A creer que es posible un Chile distinto, cons..."


=== Distribución de párrafos por tópico — jara ===


Unnamed: 0,topic_id,n,pct
0,0,10,50.0
1,1,10,50.0


=== Términos (c-TF-IDF) por tópico — jara ===


Unnamed: 0,topic_id,terms
0,0,"como, participacion, el derecho, vivir, derech..."
1,1,"vida, las familias, todas, la vida, anos, que ..."



BERTopic — kaiser


Unnamed: 0,candidate,topic_id,rank,term,ctfidf
0,kaiser,0,1,que,0.029303
1,kaiser,0,2,los,0.026243
2,kaiser,0,3,se,0.02087
3,kaiser,0,4,un,0.019269
4,kaiser,0,5,por,0.015876
5,kaiser,0,6,al,0.015669
6,kaiser,0,7,no,0.015097
7,kaiser,0,8,como,0.012891
8,kaiser,0,9,mas,0.012845
9,kaiser,0,10,su,0.012084


=== Resúmenes extractivos — kaiser ===


Unnamed: 0,candidate,topic_id,order,snippet
0,kaiser,0,1,Reducir el costo de vida de las personas no es...
1,kaiser,0,2,Este pilar reúne un conjunto de iniciativas qu...
2,kaiser,0,3,Nuestro modelo de desarrollo económico será ex...
3,kaiser,1,1,"• A los grandes inversionistas, que verán en C..."
4,kaiser,1,2,Que exige un país donde el esfuerzo valga la p...
5,kaiser,1,3,Este no es un beneficio para “los grandes”. Es...
6,kaiser,2,1,para lo que se propone la creación de un Conse...
7,kaiser,2,2,"Luego, se ha visualizado el diseño de un Siste..."
8,kaiser,2,3,PROPUESTA S PARA EL SISTEMA JUDICIAL Resultado...
9,kaiser,3,1,• Baja capacidad para liderar políticas de cre...


=== Distribución de párrafos por tópico — kaiser ===


Unnamed: 0,topic_id,n,pct
0,0,203,36.25
1,1,88,15.714286
2,2,56,10.0
3,3,35,6.25
4,4,27,4.821429
5,5,26,4.642857
6,6,25,4.464286
7,7,23,4.107143
8,8,16,2.857143
9,9,15,2.678571


=== Términos (c-TF-IDF) por tópico — kaiser ===


Unnamed: 0,topic_id,terms
0,0,"que, los, se, un, por, al, no, como, mas, su"
1,1,"que, chile, un, no, los, por, al, se, estado, mas"
2,2,"judicial, poder, que, los, poder judicial, del..."
3,3,"mineras, mineria, minero, crecimiento, sector,..."
4,4,"transporte, metro, publico, transporte publico..."
5,5,"hombre, que, un, como, lo, se, realidad, no, i..."
6,6,"salud, de salud, atencion, pacientes, los, de ..."
7,7,"energia, de energia, que, plantas, residuos, f..."
8,8,"deportivas, deportistas, deportiva, deporte, n..."
9,9,"desarrollo, plan, trabajo, chile, 21, nacional..."



BERTopic — kast


Unnamed: 0,candidate,topic_id,rank,term,ctfidf
0,kast,0,1,chile,0.068777
1,kast,0,2,es,0.042174
2,kast,0,3,pais,0.037513
3,kast,0,4,chilenos,0.030064
4,kast,0,5,un pais,0.024452
5,kast,0,6,se ha,0.022571
6,kast,0,7,nos,0.021845
7,kast,0,8,orden,0.020991
8,kast,0,9,hoy,0.020154
9,kast,0,10,fuerza,0.019242


=== Resúmenes extractivos — kast ===


Unnamed: 0,candidate,topic_id,order,snippet
0,kast,0,1,"El desarrollo, la oportunidad de un mejor Chil..."
1,kast,0,2,"Juntos, con decisión y fe, vamos a construir u..."
2,kast,0,3,a las pymes que no bajan la cortina; al campo ...
3,kast,1,1,La decadencia no se circunscribe a lo estricta...
4,kast,1,2,"Lo haremos con metas exigentes, pero alcanzabl..."
5,kast,1,3,"Implementaremos, desde el día uno, medidas adm..."
6,kast,2,1,Esto es esencial para recuperar el orden públi...
7,kast,2,2,"sin orden, no hay seguridad; y sin seguridad, ..."
8,kast,2,3,Para una estrategia de seguridad integral y ef...
9,kast,3,1,Consolidaremos una red de atención oncológica ...


=== Distribución de párrafos por tópico — kast ===


Unnamed: 0,topic_id,n,pct
0,0,38,36.538462
1,1,21,20.192308
2,2,17,16.346154
3,3,15,14.423077
4,4,13,12.5


=== Términos (c-TF-IDF) por tópico — kast ===


Unnamed: 0,topic_id,terms
0,0,"chile, es, pais, chilenos, un pais, se ha, nos..."
1,1,"inversion, empleo, gasto publico, central, par..."
2,2,"es, pais, crimen, plan de, la seguridad, accio..."
3,3,"atencion, salud, de salud, red, listas, de esp..."
4,4,"escolar, ninos, padres, la educacion, puedan, ..."



BERTopic — matthei


Unnamed: 0,candidate,topic_id,rank,term,ctfidf
0,matthei,0,1,de los,0.036004
1,matthei,0,2,educacion,0.03163
2,matthei,0,3,programas,0.028095
3,matthei,0,4,proyectos,0.023782
4,matthei,0,5,para que,0.023599
5,matthei,0,6,calidad,0.022736
6,matthei,0,7,plan,0.022115
7,matthei,0,8,laboral,0.020417
8,matthei,0,9,chilenos,0.020008
9,matthei,0,10,uso,0.020008


=== Resúmenes extractivos — matthei ===


Unnamed: 0,candidate,topic_id,order,snippet
0,matthei,0,1,iii) Se creará un Plan Nacional de Accesibilid...
1,matthei,0,2,ii) Se promoverá el uso de la madera como mate...
2,matthei,0,3,El crecimiento sostenible de Chile dependerá d...
3,matthei,1,1,b) Crearemos una Unidad de Cumplimiento (UNICO...
4,matthei,1,2,También reforzaremos la ciberinteligencia en c...
5,matthei,1,3,"estableceremos modelos de trabajo permanentes,..."


=== Distribución de párrafos por tópico — matthei ===


Unnamed: 0,topic_id,n,pct
0,0,130,79.754601
1,1,33,20.245399


=== Términos (c-TF-IDF) por tópico — matthei ===


Unnamed: 0,topic_id,terms
0,0,"de los, educacion, programas, proyectos, para ..."
1,1,"crimen, organizado, crimen organizado, nuestra..."



BERTopic — meo


Unnamed: 0,candidate,topic_id,rank,term,ctfidf
0,meo,0,1,nueva,0.033866
1,meo,0,2,en la,0.031558
2,meo,0,3,empleo,0.031391
3,meo,0,4,seguridad,0.029719
4,meo,0,5,sector,0.028378
5,meo,0,6,ello,0.026318
6,meo,0,7,nacional de,0.026298
7,meo,0,8,el sector,0.025642
8,meo,0,9,ha,0.025113
9,meo,0,10,desde,0.02463


=== Resúmenes extractivos — meo ===


Unnamed: 0,candidate,topic_id,order,snippet
0,meo,0,1,1 PROGRAMA PRESIDENCIAL MARCO ENRÍQUEZ-OMINAMI...
1,meo,0,2,No es así como se logrará crecer de manera ráp...
2,meo,0,3,3 PROGRAMA PRESIDENCIAL MARCO ENRÍQUEZ-OMINAMI...
3,meo,1,1,El impuesto afectará toda extracción consuntiv...
4,meo,1,2,La agricultura captura alrededor de 70% de esa...
5,meo,1,3,Garantizar una administración pública y transp...
6,meo,2,1,El objetivo central será pasar de un modelo re...
7,meo,2,2,es la presencia activa del Estado en el territ...
8,meo,2,3,NUESTRAS SIETE PRIMERAS MEDIDAS SERÁN PILAR 1 ...


=== Distribución de párrafos por tópico — meo ===


Unnamed: 0,topic_id,n,pct
0,0,29,43.283582
1,1,26,38.80597
2,2,12,17.910448


=== Términos (c-TF-IDF) por tópico — meo ===


Unnamed: 0,topic_id,terms
0,0,"nueva, en la, empleo, seguridad, sector, ello,..."
1,1,"impuesto, ambiental, medicamentos, salud, haci..."
2,2,"inteligencia, criminal, recuperar, presencia, ..."



BERTopic — parisi


Unnamed: 0,candidate,topic_id,rank,term,ctfidf
0,parisi,0,1,agua,0.025203
1,parisi,0,2,riesgo,0.022582
2,parisi,0,3,ambiental,0.021663
3,parisi,0,4,suelos,0.020939
4,parisi,0,5,cultivos,0.019825
5,parisi,0,6,agricola,0.019825
6,parisi,0,7,agricultura,0.019084
7,parisi,0,8,inia,0.016863
8,parisi,0,9,agricolas,0.016656
9,parisi,0,10,local,0.016555


=== Resúmenes extractivos — parisi ===


Unnamed: 0,candidate,topic_id,order,snippet
0,parisi,0,1,La red aprovecha la infraestructura y equipos ...
1,parisi,0,2,4. Se p ublicará un tablero comunal con adopci...
2,parisi,0,3,Instalar y operar una red territorial de parce...
3,parisi,1,1,"No más diagnósticos eternos, no más agencias q..."
4,parisi,1,2,d) La capacitación básica contemplaría materia...
5,parisi,1,3,"Por otro lado, las limitadas capacidades de in..."
6,parisi,2,1,"Sin embargo, estos instrumentos presentan supe..."
7,parisi,2,2,Existe un déficit de iniciativas tecnológicas ...
8,parisi,2,3,"En efecto, nuestro país cuenta con Fondos ambi..."
9,parisi,3,1,Por su parte Reino Unido (NHS) incorporó AMG c...


=== Distribución de párrafos por tópico — parisi ===


Unnamed: 0,topic_id,n,pct
0,0,64,25.396825
1,1,52,20.634921
2,2,41,16.269841
3,3,36,14.285714
4,4,30,11.904762
5,5,29,11.507937


=== Términos (c-TF-IDF) por tópico — parisi ===


Unnamed: 0,topic_id,terms
0,0,"agua, riesgo, ambiental, suelos, cultivos, agr..."
1,1,"inteligencia, seguridad, operaciones, carabine..."
2,2,"fondos, inversion, pais, proyectos, vivienda, ..."
3,3,"salud, de salud, pacientes, atencion, espera, ..."
4,4,"cooperativas, fondef, capital, empresas, otl, ..."
5,5,"ninos, violencia, familia, ninas, cuidadores, ..."


In [17]:
# %% Macro-taxonomía por prototipos (Versión A CORREGIDA)
import numpy as np, pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px, plotly.graph_objects as go
import torch
from sentence_transformers import SentenceTransformer

# Guard: this cell relies on objects created by earlier cells of the notebook.
assert {"paras_df", "Z_para"} <= set(globals()), "Faltan paras_df o Z_para en memoria."

# 1) Macro taxonomy — the list order is the column order of the score matrices
#    built from it further down in this cell.
CATEGORIES = [
    "Economía",
    "Trabajo",
    "Educación",
    "Salud",
    "Seguridad",
    "Justicia",
    "Medioambiente",
    "Vivienda",
    "Transporte",
    "Descentralización",
    "Ciencia y Tecnología",
    "Cultura",
    "Género",
    "Relaciones Internacionales",
    "Inclusión Social",
    "Pueblos Originarios",
]

def prompts_for(cat):
    """Return the prototype sentences used to represent macro category ``cat``.

    Three generic templates are always produced. Categories present in the
    keyword table get one additional sentence listing domain vocabulary;
    any other value of ``cat`` yields only the three templates.

    Returns a fresh list of strings on every call.
    """
    templates = (
        "Este párrafo trata sobre {}.",
        "Política pública de {}.",
        "Tema principal: {}.",
    )
    keywords = {
        "Economía": "Crecimiento, inversión, impuestos, presupuesto y gasto público.",
        "Trabajo": "Empleo, sindicatos, salarios, formalización laboral.",
        "Educación": "Escuelas, universidades, acceso educativo, formación docente.",
        "Salud": "Sistema de salud, hospitales, FONASA, ISAPRE, atención primaria.",
        "Seguridad": "Delincuencia, Carabineros, crimen organizado, orden público.",
        "Justicia": "Tribunales, Ministerio Público, derechos humanos, Gendarmería.",
        "Medioambiente": "Cambio climático, energía renovable, contaminación.",
        "Vivienda": "Déficit habitacional, urbanismo, barrios, arriendo.",
        "Transporte": "Infraestructura vial, metro, movilidad sustentable.",
        "Ciencia y Tecnología": "Innovación, investigación, desarrollo digital, IA.",
        "Cultura": "Patrimonio, artes, identidad, expresiones culturales.",
        "Género": "Equidad, derechos de las mujeres, violencia de género.",
        "Descentralización": "Gobiernos regionales, autonomía local, participación.",
        "Inclusión Social": "Reducción de pobreza, políticas sociales, equidad.",
        "Relaciones Internacionales": "Diplomacia, comercio exterior, cooperación.",
        "Pueblos Originarios": "Reconocimiento, autonomía, interculturalidad.",
    }
    prompts = [template.format(cat) for template in templates]
    if cat in keywords:
        prompts.append(keywords[cat])
    return prompts

# 2) Prototype embeddings
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model_st = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    device=DEVICE,
)

# Flatten every category's prompts into one list, in CATEGORIES order, and encode them in one go.
proto_texts = []
for cat in CATEGORIES:
    proto_texts.extend(prompts_for(cat))
E_prompts = model_st.encode(
    proto_texts,
    normalize_embeddings=True,
    batch_size=128,
    show_progress_bar=False,
)

# One prototype per category: the mean of that category's consecutive rows in E_prompts.
E_cats = []
offset = 0
for cat in CATEGORIES:
    n = len(prompts_for(cat))
    stop = offset + n
    E_cats.append(E_prompts[offset:stop].mean(0))
    offset = stop
E_cats = np.vstack(E_cats)

# 3) Similitud y umbral
def l2norm(x, eps=1e-9):
    """Scale each row of the 2-D array ``x`` by its Euclidean norm.

    ``eps`` is added to every row norm before dividing, so an all-zero row
    comes back as zeros instead of triggering a division by zero.
    """
    row_norms = np.linalg.norm(x, axis=1, keepdims=True)
    return x / (row_norms + eps)
# Row-normalise both sides, then map cosine similarity from [-1, 1] onto [0, 1].
Zp = l2norm(Z_para)
Ec = l2norm(E_cats)
S = (cosine_similarity(Zp, Ec) + 1.0) / 2.0

# Hard assignment: 1.0 where the rescaled similarity reaches the threshold, else 0.0.
THRESH = 0.55
S_bin = np.where(S >= THRESH, 1.0, 0.0)

# 4) Agregación por candidato
def mix_table(scores):
    """Turn a (rows x categories) score matrix into a percentage table.

    Column sums are expressed as a percentage of the grand total. When the
    grand total is not strictly positive, 1.0 is used as the denominator.

    Returns a DataFrame with columns ``category`` (taken from ``CATEGORIES``)
    and ``pct``, sorted by ``pct`` in descending order.
    """
    col_totals = scores.sum(0)
    grand_total = float(col_totals.sum())
    if not grand_total > 0:
        grand_total = 1.0
    table = pd.DataFrame({"category": CATEGORIES, "pct": 100.0 * col_totals / grand_total})
    return table.sort_values("pct", ascending=False)

# Per-candidate artefacts: paragraph table, category mix, soft scores and hard flags.
#
# Rows of S / S_bin correspond positionally to rows of paras_df. Each candidate's rows
# are selected with a mask instead of a running offset, so the result no longer depends
# on paras_df being grouped by candidate in sorted order.
assert S.shape[0] == len(paras_df), "S y paras_df no están alineados (distinto número de filas)."

arte = {}
cand_values = paras_df["candidate"].to_numpy()
for cand in tqdm(sorted(paras_df["candidate"].unique())):
    rows = np.flatnonzero(cand_values == cand)   # positional row numbers of this candidate
    if rows.size == 0:
        continue
    dfc = paras_df.iloc[rows].reset_index(drop=True)
    Sc = S[rows]
    Sb = S_bin[rows]
    arte[cand] = {"df": dfc, "mix": mix_table(Sc), "scores": Sc, "hard": Sb}

# 5) Visualisations — first stack every candidate's mix into one long table,
#    then pivot to candidate x category with the columns in CATEGORIES order.
mix_df_list = [v["mix"].assign(candidate=cand) for cand, v in arte.items()]
mix_df = pd.concat(mix_df_list, ignore_index=True)
mix_pivot = (
    mix_df
    .pivot(index="candidate", columns="category", values="pct")
    .fillna(0.0)
    .reindex(columns=CATEGORIES, fill_value=0.0)
)

# 5.a Absolute heatmap; the colour scale tops out at the largest value in the table
if mix_pivot.size:
    zmax = float(mix_pivot.values.max())
else:
    zmax = 100.0
fig_heat_abs = px.imshow(
    mix_pivot.values,
    x=mix_pivot.columns,
    y=mix_pivot.index,
    zmin=0,
    zmax=zmax,
    color_continuous_scale="Viridis",
    aspect="auto",
    origin="upper",
    title=f"Heatmap — % real por macro (zmax={zmax:.1f})",
    labels={"x": "Categoría", "y": "Candidato", "color": "% énfasis"},
)
fig_heat_abs.update_xaxes(side="top")
fig_heat_abs.show()

# 5.b Row-normalised heatmap: every candidate is rescaled so its own maximum is 100.
#     A row maximum of 0 is replaced by NaN before dividing; the resulting NaNs become 0.
rowmax = mix_pivot.max(axis=1).replace(0, np.nan)
mix_row = mix_pivot.div(rowmax, axis=0).fillna(0.0).mul(100.0)
fig_heat_row = px.imshow(
    mix_row.values,
    x=mix_row.columns,
    y=mix_row.index,
    zmin=0,
    zmax=100,
    color_continuous_scale="Plasma",
    aspect="auto",
    origin="upper",
    title="Heatmap — % relativo por candidato (normalización por fila)",
    labels={"x": "Categoría", "y": "Candidato", "color": "% relativo (fila)"},
)
fig_heat_row.update_xaxes(side="top")
fig_heat_row.show()

# 5.c Radar chart; the radial axis ends at the largest value in the table
if mix_pivot.size:
    rad_max = float(mix_pivot.values.max())
else:
    rad_max = 100.0

fig_radar = go.Figure()
for c in mix_pivot.index:
    profile = mix_pivot.loc[c, CATEGORIES].tolist()
    # Repeat the first point at the end so the polygon closes.
    trace = go.Scatterpolar(
        r=profile + [profile[0]],
        theta=CATEGORIES + [CATEGORIES[0]],
        mode="lines+markers",
        fill="toself",
        name=c,
    )
    fig_radar.add_trace(trace)

fig_radar.update_layout(
    title=f"Radar — Perfil temático macro (rango 0–{rad_max:.1f})",
    polar={"radialaxis": {"range": [0, rad_max]}},
    legend_title="Candidato",
)
fig_radar.show()


  0%|          | 0/8 [00:00<?, ?it/s]

In [18]:
# %% Jerárquico macro → sub (Versión B CORREGIDA)
import numpy as np, pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
import plotly.graph_objects as go, plotly.express as px
import torch
from sentence_transformers import SentenceTransformer

# Guard: this cell relies on objects created by earlier cells of the notebook.
assert {"paras_df", "Z_para"} <= set(globals()), "Faltan paras_df o Z_para en memoria."

# 1) Hierarchy: macro area -> list of sub-topics. Key order is the macro order used below.
# NOTE(review): the flat taxonomy in the previous cell also lists "Pueblos Originarios",
# which has no entry here — confirm the omission is intentional.
HIERARCHY = {
    "Economía": [
        "Política fiscal", "Inversión", "Productividad", "Impuestos", "Pymes",
    ],
    "Salud": [
        "Atención primaria", "Hospitales", "FONASA", "ISAPRE", "Salud mental",
    ],
    "Educación": [
        "Escuela", "Universidad", "Docencia", "Infraestructura escolar",
    ],
    "Seguridad": [
        "Carabineros", "Crimen organizado", "Prevención del delito",
    ],
    "Trabajo": [
        "Empleo", "Salario mínimo", "Sindicatos", "Formalización",
    ],
    "Medioambiente": [
        "Cambio climático", "Energías renovables", "Contaminación",
    ],
    "Justicia": [
        "Tribunales", "Gendarmería", "Reinserción",
    ],
    "Vivienda": [
        "Déficit habitacional", "Urbanismo", "Arriendo",
    ],
    "Género": [
        "Violencia de género", "Equidad", "Cuidados",
    ],
    "Ciencia y Tecnología": [
        "Innovación", "Transformación digital", "IA", "Investigación",
    ],
    "Cultura": [
        "Patrimonio", "Artes", "Identidad cultural",
    ],
    "Descentralización": [
        "Gobiernos regionales", "Autonomía local",
    ],
    "Transporte": [
        "Movilidad", "Infraestructura", "Transporte público",
    ],
    "Inclusión Social": [
        "Pobreza", "Políticas sociales", "Migración",
    ],
    "Relaciones Internacionales": [
        "Diplomacia", "Comercio exterior", "Integración regional",
    ],
}

# 2) Prompts
def prompts_macro(m):
    """Return the prototype sentences used to represent macro area ``m``.

    Three generic templates are always produced; the macro areas present in
    the keyword table get one extra sentence with domain vocabulary.

    Returns a fresh list of strings on every call.
    """
    templates = (
        "Este párrafo trata sobre {}.",
        "Política pública de {}.",
        "Tema principal: {}.",
    )
    keywords = {
        "Economía": "Crecimiento, inversión, impuestos y presupuesto público.",
        "Salud": "Sistema sanitario, hospitales, FONASA/ISAPRE, atención primaria.",
        "Seguridad": "Delincuencia, orden público, crimen organizado, policía.",
    }
    prompts = [template.format(m) for template in templates]
    if m in keywords:
        prompts.append(keywords[m])
    return prompts

def prompts_sub(m, s):
    """Return three prototype sentences that place sub-topic ``s`` inside macro area ``m``."""
    templates = (
        "{s}, parte del área {m}.",
        "Subtema de {m}: {s}.",
        "El párrafo aborda {s} dentro de {m}.",
    )
    return [template.format(m=m, s=s) for template in templates]

# 3) Prototype embeddings
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model_st = SentenceTransformer(
    "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
    device=DEVICE,
)

# Macro level: collect all prompts in one list and remember each macro's [start, end) span.
macros = list(HIERARCHY)
macro_proto_texts, macro_slices = [], []
for m in macros:
    start = len(macro_proto_texts)
    macro_proto_texts += prompts_macro(m)
    macro_slices.append((start, len(macro_proto_texts)))

Em_prompts = model_st.encode(
    macro_proto_texts,
    normalize_embeddings=True,
    batch_size=128,
    show_progress_bar=False,
)
# One prototype per macro: the mean of its prompt embeddings.
E_macro = np.vstack([Em_prompts[lo:hi].mean(0) for lo, hi in macro_slices])

# Sub level: same bookkeeping, plus the (macro, sub) pair that each span belongs to.
subs_pairs, sub_proto_texts, sub_slices = [], [], []
for m, sub_names in HIERARCHY.items():
    for sub in sub_names:
        start = len(sub_proto_texts)
        sub_proto_texts += prompts_sub(m, sub)
        sub_slices.append((start, len(sub_proto_texts)))
        subs_pairs.append((m, sub))

Es_prompts = model_st.encode(
    sub_proto_texts,
    normalize_embeddings=True,
    batch_size=128,
    show_progress_bar=False,
)
E_sub = np.vstack([Es_prompts[lo:hi].mean(0) for lo, hi in sub_slices])

# 4) Similitudes y asignación jerárquica
def l2norm(x, eps=1e-9):
    """Scale each row of the 2-D array ``x`` by its Euclidean norm.

    ``eps`` is added to every row norm before dividing, so an all-zero row
    comes back as zeros instead of triggering a division by zero.
    """
    row_norms = np.linalg.norm(x, axis=1, keepdims=True)
    return x / (row_norms + eps)
# Row-normalise everything, then map cosine similarity from [-1, 1] onto [0, 1].
Zp = l2norm(Z_para)
Em = l2norm(E_macro)
Es = l2norm(E_sub)
Sm = (cosine_similarity(Zp, Em) + 1.0) / 2.0
Ss = (cosine_similarity(Zp, Es) + 1.0) / 2.0

# Best macro and best sub-topic are chosen independently for every paragraph.
best_macro_idx = Sm.argmax(axis=1)
best_sub_idx = Ss.argmax(axis=1)

# Lookup arrays so the winning sub-topic index maps to its parent macro and its name.
sub_parent = np.array([pair[0] for pair in subs_pairs], dtype=object)
sub_name = np.array([pair[1] for pair in subs_pairs], dtype=object)

paras_df = paras_df.copy()
paras_df["macro"] = np.array(macros, dtype=object)[best_macro_idx]
paras_df["macro_score"] = Sm.max(axis=1)
paras_df["sub_macro"] = sub_parent[best_sub_idx]
paras_df["subcat"] = sub_name[best_sub_idx]
paras_df["sub_score"] = Ss.max(axis=1)

# Macro/sub coherence: when the winning sub-topic belongs to a different macro than the
# winning macro, the sub-level assignment is blanked out.
mask_incoh = paras_df["macro"].ne(paras_df["sub_macro"])
paras_df.loc[mask_incoh, ["subcat","sub_score","sub_macro"]] = [None, np.nan, None]

# (optional) "hard" threshold flags; a blanked sub_score counts as 0.0
THR = 0.55
paras_df["macro_hard"] = paras_df["macro_score"].ge(THR)
paras_df["sub_hard"] = paras_df["sub_score"].fillna(0.0).ge(THR)

# 5) Aggregations
# Paragraph counts per (candidate, macro), as a percentage of the candidate's paragraphs.
macro_counts = paras_df.groupby(["candidate", "macro"], as_index=False).size()
mix_macro = macro_counts.rename(columns={"size": "count"})
macro_totals = mix_macro.groupby("candidate")["count"].transform("sum")
mix_macro["pct"] = 100.0 * mix_macro["count"] / macro_totals

# Same at sub-topic level, restricted to paragraphs that kept a sub-topic; the percentage
# is therefore relative to those paragraphs only.
with_sub = paras_df.dropna(subset=["subcat"])
sub_counts = with_sub.groupby(["candidate", "macro", "subcat"], as_index=False).size()
mix_sub = sub_counts.rename(columns={"size": "count"})
sub_totals = mix_sub.groupby("candidate")["count"].transform("sum")
mix_sub["pct"] = 100.0 * mix_sub["count"] / sub_totals

# 6) Visuals — candidate x macro table of percentages, columns in hierarchy order
macro_order = macros
pivot_macro = (
    mix_macro
    .pivot(index="candidate", columns="macro", values="pct")
    .fillna(0.0)
    .reindex(columns=macro_order, fill_value=0.0)
)

# 6.a Absolute heatmap; the colour scale tops out at the largest value in the table
if pivot_macro.size:
    zmax = float(pivot_macro.values.max())
else:
    zmax = 100.0
fig_heat_abs = px.imshow(
    pivot_macro.values,
    x=pivot_macro.columns,
    y=pivot_macro.index,
    zmin=0,
    zmax=zmax,
    color_continuous_scale="Viridis",
    aspect="auto",
    origin="upper",
    title=f"Heatmap — % real por macro (zmax={zmax:.1f})",
    labels={"x": "Macro-área", "y": "Candidato", "color": "% énfasis"},
)
fig_heat_abs.update_xaxes(side="top")
fig_heat_abs.show()

# 6.b Row-normalised heatmap: every candidate is rescaled so its own maximum is 100.
#     A row maximum of 0 is replaced by NaN before dividing; the resulting NaNs become 0.
rowmax = pivot_macro.max(axis=1).replace(0, np.nan)
pivot_row = pivot_macro.div(rowmax, axis=0).fillna(0.0).mul(100.0)
fig_heat_row = px.imshow(
    pivot_row.values,
    x=pivot_row.columns,
    y=pivot_row.index,
    zmin=0,
    zmax=100,
    color_continuous_scale="Plasma",
    aspect="auto",
    origin="upper",
    title="Heatmap — % relativo por candidato (normalización por fila)",
    labels={"x": "Macro-área", "y": "Candidato", "color": "% relativo (fila)"},
)
fig_heat_row.update_xaxes(side="top")
fig_heat_row.show()

# 6.c Radar chart; the radial axis ends at the largest value in the table
if pivot_macro.size:
    rad_max = float(pivot_macro.values.max())
else:
    rad_max = 100.0

fig_radar = go.Figure()
for c in pivot_macro.index:
    profile = pivot_macro.loc[c, macro_order].tolist()
    # Repeat the first point at the end so the polygon closes.
    trace = go.Scatterpolar(
        r=profile + [profile[0]],
        theta=macro_order + [macro_order[0]],
        mode="lines+markers",
        fill="toself",
        name=c,
    )
    fig_radar.add_trace(trace)

fig_radar.update_layout(
    title=f"Radar jerárquico — Perfil macro (rango 0–{rad_max:.1f})",
    polar={"radialaxis": {"range": [0, rad_max]}},
    legend_title="Candidato",
)
fig_radar.show()

# 6.d Hierarchical sunburst: one trace per candidate, switched with a dropdown.
#
# Every node gets an explicit, path-like id ("candidate/macro/sub") and `parents` refers to
# those ids. Without ids, labels double as identifiers, so a sub-topic sharing its name with
# another node would be attached to the wrong parent.
candidatos = pivot_macro.index.tolist()
fig_sun = go.Figure()
has_data = []
for c in candidatos:
    dfc = mix_sub[mix_sub["candidate"] == c]
    if dfc.empty:
        # Placeholder trace keeps trace i aligned with candidatos[i], which the
        # dropdown visibility masks below rely on.
        fig_sun.add_trace(go.Sunburst(labels=[], parents=[], values=[]))
        has_data.append(False)
        continue

    root_id = str(c)
    ids, labels, parents, values = [root_id], [c], [""], [dfc["pct"].sum()]
    for m in dfc["macro"].unique():
        subm = dfc[dfc["macro"] == m]
        macro_id = f"{root_id}/{m}"
        ids.append(macro_id)
        labels.append(m)
        parents.append(root_id)
        values.append(subm["pct"].sum())
        for sub, pct in zip(subm["subcat"], subm["pct"]):
            ids.append(f"{macro_id}/{sub}")
            labels.append(sub)
            parents.append(macro_id)
            values.append(pct)
    fig_sun.add_trace(go.Sunburst(
        ids=ids, labels=labels, parents=parents, values=values, branchvalues="total"
    ))
    has_data.append(True)

# Start on the first candidate that actually has sub-topic data, so the figure is not blank
# when the first candidate has none; title and dropdown selection follow the same index.
first_shown = has_data.index(True) if any(has_data) else 0
visible = [shown and i == first_shown for i, shown in enumerate(has_data)]

buttons = []
for i, c in enumerate(candidatos):
    vis = [False] * len(candidatos)
    vis[i] = True
    buttons.append(dict(label=c, method="update",
                        args=[{"visible": vis},
                              {"title": f"Sunburst — Jerarquía temática de {c}"}]))
fig_sun.update_layout(
    title=f"Sunburst — Jerarquía temática de {candidatos[first_shown] if candidatos else ''}",
    updatemenus=[dict(type="dropdown", x=1.02, y=1.0, buttons=buttons,
                      showactive=True, active=first_shown)]
)
for t, v in zip(fig_sun.data, visible):
    t.visible = v
fig_sun.show()


In [20]:
# ==== BLOQUE COMPLETO CORREGIDO — CARGA ROBUSTA DE ANALIZADORES ABSA (offline/online) ====
# Uso: ejecuta este bloque ANTES de calcular sentimientos/emociones.
# Expone: sent_an, emo_an, absa_sent_predict(texts), absa_emo_predict(texts)

import os, sys, subprocess, torch
from pathlib import Path

# ---------- util: asegurar dependencias mínimas ----------
def _ensure(pkgs):
    """Make sure every requirement in ``pkgs`` is importable, installing it with pip if not.

    Parameters
    ----------
    pkgs : iterable of str
        pip requirement strings such as ``"transformers>=4.43"``. The module name to
        import is the part before any extras, version specifier, marker or URL
        (``[``, ``<``, ``>``, ``=``, ``!``, ``~``, ``;``, ``@``), with ``-`` mapped to ``_``.

    Notes
    -----
    Only importability is checked; an already-installed package that does not satisfy the
    version constraint is left as is. The full requirement string is what gets passed to pip.

    Raises
    ------
    subprocess.CalledProcessError
        If the pip invocation exits with a non-zero status.
    """
    import importlib
    import re

    for req in pkgs:
        dist = re.split(r"[\s\[<>=!~;@(]", req.strip(), maxsplit=1)[0]
        if not dist:
            continue
        module = dist.replace("-", "_")
        try:
            importlib.import_module(module)
        except Exception:
            # Deliberately broad: any failure to import is treated as "needs (re)install".
            subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", req])

# Install-on-demand must run before the pysentimiento import below; keep this order.
_ensure(["pysentimiento>=0.7.2", "transformers>=4.43", "huggingface_hub>=0.23"])

from pysentimiento import create_analyzer

# Device index: 0 when CUDA is available, -1 otherwise.
# NOTE: this rebinds the notebook-global DEVICE, which the prototype cells above set to a
# "cuda"/"cpu" string; those cells reassign it themselves when re-run.
DEVICE = 0 if torch.cuda.is_available() else -1
BASE_MODELS = Path("hf_models")  # where models were pre-downloaded (optional, but recommended)
sent_local = BASE_MODELS / "robertuito-sentiment"
emo_local  = BASE_MODELS / "robertuito-emotion"

# Active backends; build_analyzers() overwrites both according to what it managed to load.
_ABSA_SENT_BACKEND = "pysent"   # or "cardiff"
_ABSA_EMO_BACKEND  = "pysent"   # or "stub"

def build_analyzers():
    """Build the sentiment and emotion analyzers, trying three strategies in order.

    1. Local pysentimiento models under ``sent_local`` / ``emo_local`` (no network),
       attempted only when both directories exist.
    2. pysentimiento's default models (may need to download them).
    3. Minimal fallback: a ``cardiffnlp/twitter-xlm-roberta-base-sentiment`` pipeline for
       sentiment plus a stub emotion analyzer that always returns zero scores.

    Side effects
    ------------
    Sets the module-level ``_ABSA_SENT_BACKEND`` / ``_ABSA_EMO_BACKEND`` flags to describe
    what was loaded, and prints a short message whenever a strategy fails.

    Returns
    -------
    tuple
        ``(sentiment_analyzer, emotion_analyzer)``.

    Raises
    ------
    RuntimeError
        If even the fallback cannot be initialised; the underlying error is chained.
    """
    global _ABSA_SENT_BACKEND, _ABSA_EMO_BACKEND

    # ---- 1) Local (no network) ----
    if sent_local.exists() and emo_local.exists():
        try:
            sa = create_analyzer(task="sentiment", lang="es", model_name=str(sent_local), device=DEVICE)
            ea = create_analyzer(task="emotion",   lang="es", model_name=str(emo_local),  device=DEVICE)
            _ABSA_SENT_BACKEND, _ABSA_EMO_BACKEND = "pysent", "pysent"
            return sa, ea
        except Exception as e:
            print(f"[ABSA] Carga local falló, probando online… ({e})")

    # ---- 2) Online (can fail under network restrictions) ----
    try:
        sa = create_analyzer(task="sentiment", lang="es", device=DEVICE)
        ea = create_analyzer(task="emotion",   lang="es", device=DEVICE)
        _ABSA_SENT_BACKEND, _ABSA_EMO_BACKEND = "pysent", "pysent"
        return sa, ea
    except Exception as e_online:
        print("[ABSA] Descarga online no disponible:", e_online)

    # ---- 3) Minimal fallback (real sentiment + stub emotions) ----
    from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
    try:
        mdl = AutoModelForSequenceClassification.from_pretrained(
            "cardiffnlp/twitter-xlm-roberta-base-sentiment"
        )
        tok = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment")
        # top_k=None makes the pipeline return the score of every label. With the default
        # (top-1 only) the sentiment wrapper would see a single label and report it as 1.0.
        sa = TextClassificationPipeline(model=mdl, tokenizer=tok, device=DEVICE, top_k=None)

        class _DummyEmotion:
            """Stand-in emotion analyzer: same ``predict`` shape, all scores zero."""

            def predict(self, texts):
                from types import SimpleNamespace
                # placeholder probabilities, one object per input text
                return [SimpleNamespace(probas={"anger":0,"fear":0,"joy":0,"sadness":0,"disgust":0,"surprise":0}) for _ in texts]

        ea = _DummyEmotion()
        _ABSA_SENT_BACKEND, _ABSA_EMO_BACKEND = "cardiff", "stub"
        print("[ABSA] Fallback activado: sentimiento 'cardiffnlp', emociones 'stub'.")
        return sa, ea
    except Exception as e_fallback:
        raise RuntimeError(
            f"[ABSA] No se pudieron inicializar analizadores (ni local, ni online, ni fallback): {e_fallback}"
        ) from e_fallback

# Build both analyzers once when the cell runs; the wrapper functions below read these globals.
sent_an, emo_an = build_analyzers()

# ---------- Wrappers UNIFORMES para el resto del pipeline ----------
def absa_sent_predict(texts):
    """Predict sentiment for ``texts`` with whichever backend is active.

    Parameters
    ----------
    texts : iterable of str
        Texts to score; consumed once. An empty iterable returns ``[]`` without
        calling the model.

    Returns
    -------
    list of dict
        One ``{'POS', 'NEU', 'NEG'}`` dict per text, values in [0, 1], regardless of backend.
        In the pipeline backend the three values are renormalised to sum to 1 when the
        pipeline returned the full label distribution. If it returned only its top label,
        that score is reported as is and the other two are 0.0.
    """
    texts = list(texts)
    if not texts:
        return []

    out = []
    if _ABSA_SENT_BACKEND == "pysent":
        for o in sent_an.predict(texts):
            p = o.probas
            out.append({
                "POS": float(p.get("POS", p.get("pos", 0.0))),
                "NEU": float(p.get("NEU", p.get("neu", 0.0))),
                "NEG": float(p.get("NEG", p.get("neg", 0.0))),
            })
        return out

    # Pipeline backend (labels: negative/neutral/positive). top_k=None asks for the score
    # of every label; the default returns only the best one, which would otherwise be
    # turned into a probability of 1.0 by the renormalisation below.
    res = sent_an(texts, truncation=True, top_k=None)
    for r in res:
        full_distribution = isinstance(r, list)
        items = r if full_distribution else [r]
        d = {x["label"].lower(): x["score"] for x in items}
        pos = float(d.get("positive", d.get("pos", 0.0)))
        neu = float(d.get("neutral",  d.get("neu", 0.0)))
        neg = float(d.get("negative", d.get("neg", 0.0)))
        s = pos + neu + neg
        if full_distribution and s > 0:
            pos, neu, neg = pos/s, neu/s, neg/s
        out.append({"POS": pos, "NEU": neu, "NEG": neg})
    return out

def absa_emo_predict(texts):
    """Predict emotion scores for ``texts`` with the active emotion analyzer.

    Parameters
    ----------
    texts : iterable of str
        Texts to score; consumed once. An empty iterable returns ``[]`` without
        calling the analyzer, so an empty batch never reaches the model.

    Returns
    -------
    list of dict
        One ``{emotion: float}`` dict per text. With the stub backend every score is 0.0.
        A result object without a usable ``probas`` mapping yields an empty dict.
    """
    texts = list(texts)
    if not texts:
        return []

    out = []
    for o in emo_an.predict(texts):
        p = getattr(o, "probas", None) or {}   # results expose their scores as `.probas`
        out.append({k: float(v) for k, v in p.items()})
    return out

# ---------- Diagnóstico opcional ----------
def absa_selfcheck(n=3):
    """Print the active backends and the first ``n`` predictions for three fixed sentences.

    Runs both wrapper functions on a small built-in sample; returns ``None``.
    """
    sample = [
        "La delincuencia ha aumentado y exigimos más seguridad.",
        "Las pymes requieren incentivos tributarios y acceso a crédito.",
        "La atención primaria de salud debe fortalecerse.",
    ]
    print(f">> Backend sentimiento: {_ABSA_SENT_BACKEND}")
    print(f">> Backend emociones : {_ABSA_EMO_BACKEND}")
    print(">> Test rápido:")
    sent_preview = absa_sent_predict(sample)[:n]
    print(sent_preview)
    emo_preview = absa_emo_predict(sample)[:n]
    print(emo_preview)

# Llama absa_selfcheck() si quieres verificar los backends y salidas.
# ==== FIN BLOQUE ====


model.safetensors:   0%|          | 0.00/435M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/167 [00:00<?, ?B/s]