In [3]:

import re, numpy as np, pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer

# Plotly (renderizador para notebooks)
import plotly.io as pio
pio.renderers.default = "vscode"  # usa "colab" o "vscode" si lo prefieres

# ---------- parámetros ----------
DOCS = Path("documents.parquet")
PAGES = Path("pages.parquet")
assert DOCS.exists() and PAGES.exists(), "Se requieren documents.parquet y pages.parquet"

MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
PARA_MAX_TOK = 120      # ~ palabras por párrafo
MIN_PARA_LEN = 8
BATCH_SIZE   = 64
RANDOM_STATE = 42

def norm_space(s):
    s = re.sub(r"\s+", " ", s)
    s = re.sub(r"(\.)([A-ZÁÉÍÓÚÑ])", r"\1 \2", s)
    return s.strip()

def split_paragraphs(text):
    parts = [p.strip() for p in re.split(r"\n\s*\n+", text) if p.strip()] or [text]
    out = []
    for p in parts:
        toks = p.split()
        if len(toks) <= PARA_MAX_TOK:
            out.append(norm_space(p))
        else:
            sents = re.split(r"(?<=[\.\?\!;:])\s+", p)
            buf, count = [], 0
            for s in sents:
                n = len(s.split())
                if count + n > PARA_MAX_TOK and buf:
                    out.append(norm_space(" ".join(buf))); buf, count = [], 0
                buf.append(s); count += n
            if buf: out.append(norm_space(" ".join(buf)))
    return [p for p in out if len(p.split()) >= MIN_PARA_LEN]

# 11P.1 — leer y segmentar
pages = pd.read_parquet(PAGES).sort_values(["candidate","filename","page"])
para_rows = []
for (cand, fname), g in tqdm(pages.groupby(["candidate","filename"], sort=False), desc="[11P] párrafos por doc"):
    txt = "\n".join([str(x) for x in g["text"] if isinstance(x,str)])
    for i, p in enumerate(split_paragraphs(txt), 1):
        para_rows.append({"candidate":cand, "filename":fname, "para_id":i, "text":p})
paras_df = pd.DataFrame(para_rows)
print("[11P] párrafos construidos:", paras_df.shape)

# 11P.2 — embeddings BERT
model = SentenceTransformer(MODEL_NAME)
def encode_texts(texts):
    return model.encode(texts, batch_size=BATCH_SIZE, show_progress_bar=True, normalize_embeddings=True).astype("float32")

Z_para = encode_texts(paras_df["text"].tolist())  # (N_parrafos, d)

# 11P.3 — promedios por documento y por candidato (sólo en memoria)
key_doc = paras_df[["candidate","filename"]].astype(str).agg("||".join, axis=1).to_numpy()
doc_keys = np.unique(key_doc)
E_docs = np.vstack([Z_para[key_doc==k].mean(axis=0) for k in tqdm(doc_keys, desc="[11P] avg doc")])
meta_docs = pd.DataFrame([dict(zip(["candidate","filename"], k.split("||",1))) for k in doc_keys])

cands = sorted(paras_df["candidate"].unique().tolist())
E_cand = np.vstack([E_docs[meta_docs["candidate"]==c].mean(axis=0) for c in tqdm(cands, desc="[11P] avg cand")])

print("[11P] shapes — Z_para:", Z_para.shape, " E_docs:", E_docs.shape, " E_cand:", E_cand.shape)


[11P] párrafos por doc:   0%|          | 0/8 [00:00<?, ?it/s]

[11P] párrafos construidos: (2021, 4)


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

[11P] avg doc:   0%|          | 0/8 [00:00<?, ?it/s]

[11P] avg cand:   0%|          | 0/8 [00:00<?, ?it/s]

[11P] shapes — Z_para: (2021, 384)  E_docs: (8, 384)  E_cand: (8, 384)


In [8]:
# %% MÓDULO 12-P — Radar + UMAP 2D/3D interactivos
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd, numpy as np, umap
import plotly.graph_objects as go
import plotly.express as px

# 12-P.1 — matriz coseno por candidato
S = cosine_similarity(E_cand, E_cand)
labels = cands

# 12-P.2 — radar interactivo con leyenda explicativa
S01 = (S - S.min()) / (S.max() - S.min() + 1e-9)
S100 = 100.0 * S01
theta = labels + [labels[0]]

fig_radar = go.Figure()
for i, name in enumerate(labels):
    vals = S100[i, :].tolist() + [S100[i, 0]]
    fig_radar.add_trace(go.Scatterpolar(
        r=vals, theta=theta,
        mode="lines+markers", fill="toself",
        name=name,
        hovertemplate="<b>Referencia:</b> "+name+
                      "<br>%{theta}: %{r:.1f}<extra></extra>"
    ))
fig_radar.update_layout(
    title="Afinidad BERT entre candidatos (radar interactivo)",
    polar=dict(radialaxis=dict(range=[0,100], tick0=0, dtick=20)),
    legend_title_text="Color = candidato de referencia"
)
fig_radar.show()

# 12-P.3 — UMAP 2D global
cand_codes = paras_df["candidate"].astype("category")
cand_idx = cand_codes.cat.codes
cand_names = cand_codes.cat.categories.tolist()

um2 = umap.UMAP(
    n_components=2, random_state=42,
    metric="cosine", n_neighbors=15, min_dist=0.1
).fit_transform(Z_para)

df_um2 = pd.DataFrame({
    "UMAP1": um2[:, 0],
    "UMAP2": um2[:, 1],
    "candidate": paras_df["candidate"],
    "filename": paras_df["filename"],
    "para_id": paras_df["para_id"],
    "text": [t[:280]+"…" if isinstance(t,str) and len(t)>280 else t
             for t in paras_df["text"]]
})

# paleta cualitativa segura
try:
    palette = px.colors.qualitative.Dark24
except AttributeError:
    palette = px.colors.qualitative.Plotly

fig_u2 = px.scatter(
    df_um2, x="UMAP1", y="UMAP2",
    color="candidate",
    hover_data=["filename", "para_id", "text"],
    title="UMAP 2D — párrafos coloreados por candidato",
    color_discrete_sequence=palette
)
fig_u2.update_traces(marker=dict(size=6, opacity=0.8))
fig_u2.show()

# 12-P.4 — UMAP 3D global
um3 = umap.UMAP(
    n_components=3, random_state=42,
    metric="cosine", n_neighbors=15, min_dist=0.1
).fit_transform(Z_para)

df_um3 = pd.DataFrame({
    "UMAP1": um3[:, 0],
    "UMAP2": um3[:, 1],
    "UMAP3": um3[:, 2],
    "candidate": paras_df["candidate"],
    "filename": paras_df["filename"],
    "para_id": paras_df["para_id"],
    "text": [t[:200]+"…" if isinstance(t,str) and len(t)>200 else t
             for t in paras_df["text"]]
})

fig_u3 = px.scatter_3d(
    df_um3, x="UMAP1", y="UMAP2", z="UMAP3",
    color="candidate",
    hover_data=["filename", "para_id", "text"],
    title="UMAP 3D — párrafos coloreados por candidato",
    color_discrete_sequence=palette
)
fig_u3.update_traces(marker=dict(size=3, opacity=0.8))
fig_u3.show()



n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.




n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [9]:
# %% MÓDULO 13-P — HDBSCAN por candidato + UMAP 2D/3D con selector
import numpy as np, pandas as pd
import hdbscan, umap
import plotly.express as px
import plotly.graph_objects as go

MIN_CLUSTER_SIZE = 8
MIN_SAMPLES      = 5
METRIC           = "euclidean"   # con embeddings normalizados, "cosine" también funciona

# precomputamos por candidato: labels HDBSCAN + UMAP 2D/3D
per_cand = {}  # cand -> dict(X_idx, labels, U2, U3)
for cand in tqdm(cands, desc="[13P] HDBSCAN por candidato"):
    idx = np.where(paras_df["candidate"].values == cand)[0]
    Xc  = Z_para[idx]
    if Xc.shape[0] < max(10, MIN_CLUSTER_SIZE):
        per_cand[cand] = {"idx":idx, "labels":np.full(len(idx), -1), "U2":None, "U3":None}
        continue
    lab = hdbscan.HDBSCAN(min_cluster_size=MIN_CLUSTER_SIZE, min_samples=MIN_SAMPLES, metric=METRIC).fit_predict(Xc)
    U2c = umap.UMAP(n_components=2, random_state=42, metric="cosine", n_neighbors=15, min_dist=0.1).fit_transform(Xc)
    U3c = umap.UMAP(n_components=3, random_state=42, metric="cosine", n_neighbors=15, min_dist=0.1).fit_transform(Xc)
    per_cand[cand] = {"idx":idx, "labels":lab, "U2":U2c, "U3":U3c}

# 13P.1 — UMAP 2D con dropdown de candidato
def fig_umap_2d_dropdown(per_cand):
    buttons = []
    data0 = None
    i = 0
    for cand, d in per_cand.items():
        idx, lab, U2c = d["idx"], d["labels"], d["U2"]
        if U2c is None:
            U2c = np.zeros((len(idx), 2)); lab = np.full(len(idx), -1)
        scat = go.Scatter(
            x=U2c[:,0], y=U2c[:,1],
            mode="markers",
            marker=dict(size=6, color=lab, colorscale="Viridis"),
            name=cand,
            text=[f"{cand} — {paras_df.iloc[j]['filename']} — ¶{int(paras_df.iloc[j]['para_id'])}" for j in idx],
            hovertemplate="%{text}<extra></extra>",
            visible=(i==0)
        )
        if data0 is None: data0 = [scat]
        else: data0.append(scat)
        buttons.append(dict(label=cand,
                            method="update",
                            args=[{"visible":[k==i for k in range(len(per_cand))]},
                                  {"title":f"UMAP 2D — {cand} (HDBSCAN: -1=ruido)"}]))
        i += 1

    fig = go.Figure(data=data0)
    fig.update_layout(
        title=f"UMAP 2D — {list(per_cand.keys())[0]} (HDBSCAN: -1=ruido)",
        updatemenus=[dict(type="dropdown", x=1.02, y=1.0, showactive=True, buttons=buttons)],
        xaxis_title="UMAP1", yaxis_title="UMAP2"
    )
    return fig

fig_2d = fig_umap_2d_dropdown(per_cand)
fig_2d.show()

# 13P.2 — UMAP 3D con dropdown de candidato
def fig_umap_3d_dropdown(per_cand):
    buttons = []
    data0 = None
    i = 0
    for cand, d in per_cand.items():
        idx, lab, U3c = d["idx"], d["labels"], d["U3"]
        if U3c is None:
            U3c = np.zeros((len(idx), 3)); lab = np.full(len(idx), -1)
        scat = go.Scatter3d(
            x=U3c[:,0], y=U3c[:,1], z=U3c[:,2],
            mode="markers",
            marker=dict(size=3, color=lab, colorscale="Viridis"),
            name=cand,
            text=[f"{cand} — {paras_df.iloc[j]['filename']} — ¶{int(paras_df.iloc[j]['para_id'])}" for j in idx],
            hovertemplate="%{text}<extra></extra>",
            visible=(i==0)
        )
        if data0 is None: data0 = [scat]
        else: data0.append(scat)
        buttons.append(dict(label=cand,
                            method="update",
                            args=[{"visible":[k==i for k in range(len(per_cand))]},
                                  {"title":f"UMAP 3D — {cand} (HDBSCAN: -1=ruido)"}]))
        i += 1

    fig = go.Figure(data=data0)
    fig.update_layout(
        title=f"UMAP 3D — {list(per_cand.keys())[0]} (HDBSCAN: -1=ruido)",
        updatemenus=[dict(type="dropdown", x=1.05, y=1.0, showactive=True, buttons=buttons)],
        scene=dict(xaxis_title="UMAP1", yaxis_title="UMAP2", zaxis_title="UMAP3")
    )
    return fig

fig_3d = fig_umap_3d_dropdown(per_cand)
fig_3d.show()


[13P] HDBSCAN por candidato:   0%|          | 0/8 [00:00<?, ?it/s]


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


n_jobs value 1 overridden to 1 by setting 

In [10]:
# %% BERTopic por candidato — HDBSCAN + c-TF-IDF + jerarquía + resúmenes + distribuciones (Plotly)
!pip -q install bertopic hdbscan umap-learn scikit-learn tqdm plotly

import numpy as np, pandas as pd
from tqdm.auto import tqdm
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances
import umap, hdbscan
import plotly.express as px
import plotly.graph_objects as go

# --------- PRECONDICIONES ---------
# Debes tener en el entorno:
#   - paras_df: DataFrame con columnas ["candidate","filename","para_id","text"]
#   - Z_para  : np.ndarray de shape (n_parrafos, d) con embeddings normalizados
assert 'paras_df' in globals() and 'Z_para' in globals(), "Faltan 'paras_df' y/o 'Z_para' en memoria."

# --------- PARÁMETROS ---------
RANDOM_STATE = 42
MIN_CLUSTER_SIZE = 10     # HDBSCAN por candidato; ajusta según longitud del programa
MIN_SAMPLES      = 5
UMAP_N_NEIGHBORS = 15
UMAP_MIN_DIST    = 0.10

TOP_TERMS = 12            # top términos c-TF-IDF para mostrar por tópico
TOP_DOCS  = 3             # nº de párrafos representativos a mostrar por tópico
MAX_TOPICS_PLOT = 25      # cap visualizaciones extensas

# Vectorizador para c-TF-IDF (n-gramas cortos en español)
vectorizer = CountVectorizer(
    lowercase=True,
    strip_accents='unicode',
    ngram_range=(1,2),
    min_df=2,
    max_df=0.95
)

# UMAP/HDBSCAN base (BERTopic reducirá embeddings con UMAP antes de clusterizar)
umap_model = umap.UMAP(
    n_neighbors=UMAP_N_NEIGHBORS,
    n_components=5,          # compresión previa antes de HDBSCAN
    min_dist=UMAP_MIN_DIST,
    metric="cosine",
    random_state=RANDOM_STATE
)
hdb_model = hdbscan.HDBSCAN(
    min_cluster_size=MIN_CLUSTER_SIZE,
    min_samples=MIN_SAMPLES,
    metric="euclidean",
    prediction_data=True
)

# Paleta cualitativa segura
try:
    palette = px.colors.qualitative.Dark24
except AttributeError:
    palette = px.colors.qualitative.Plotly

# --------- FUNCIÓN UTIL — construir tablas/visuales por candidato ---------
def run_bertopic_for_candidate(cand_name, show_hierarchy=True):
    # subset del candidato
    idx = np.where(paras_df["candidate"].values == cand_name)[0]
    if len(idx) < max(20, MIN_CLUSTER_SIZE + 5):
        print(f"[BERTopic] {cand_name}: muy pocos párrafos ({len(idx)}); omito.")
        return

    texts = paras_df.iloc[idx]["text"].tolist()
    files = paras_df.iloc[idx]["filename"].tolist()
    para_ids = paras_df.iloc[idx]["para_id"].tolist()
    X = Z_para[idx]

    # Modelo BERTopic (sin re-embed: usamos embeddings precomputados)
    topic_model = BERTopic(
        embedding_model=None,             # ya tenemos Z_para
        vectorizer_model=vectorizer,
        umap_model=umap_model,
        hdbscan_model=hdb_model,
        calculate_probabilities=True,     # para resúmenes por probabilidad
        language="multilingual",
        nr_topics="auto",                 # reducción jerárquica automática
        verbose=False
    )

    topics, probs = topic_model.fit_transform(texts, embeddings=X)

    # --- Tabla de tópicos (info general) ---
    info = topic_model.get_topic_info()      # columns: Topic, Count, Name
    # excluimos -1 (ruido) para resúmenes/distribuciones
    info_nz = info[info["Topic"]!=-1].copy().reset_index(drop=True)
    if info_nz.empty:
        print(f"[BERTopic] {cand_name}: solo ruido; revisa parámetros.")
        return

    # --- Tabla “Tema → top términos c-TF-IDF” ---
    rows = []
    for t_id in info_nz["Topic"].tolist():
        words = topic_model.get_topic(t_id) or []   # [(term, score), ...]
        for rank, (term, score) in enumerate(words[:TOP_TERMS], start=1):
            rows.append({"candidate": cand_name, "topic_id": t_id,
                         "rank": rank, "term": term, "ctfidf": float(score)})
    topic_terms = pd.DataFrame(rows)
    display(topic_terms.head(min(10, len(topic_terms))))

    # --- Resumen extractivo por tópico: top párrafos representativos ---
    # Si el método nativo está presente (v.0.16+):
    try:
        rep_map = {t: topic_model.get_representative_docs(t) for t in info_nz["Topic"].tolist()}
    except Exception:
        rep_map = {}

    summary_rows = []
    for t_id in info_nz["Topic"].tolist():
        # fallback si no hay método nativo: ordenar párrafos del tópico por probabilidad
        sel = np.where(np.array(topics) == t_id)[0]
        if len(sel) == 0:
            continue
        if rep_map.get(t_id):
            reps = rep_map[t_id][:TOP_DOCS]
            for j, txt in enumerate(reps, start=1):
                summary_rows.append({"candidate": cand_name, "topic_id": t_id, "order": j,
                                     "snippet": (txt[:420]+"…") if len(txt)>420 else txt})
        else:
            if probs is not None:
                # probabilidad topic-wise
                t_col = t_id if t_id < probs.shape[1] else None
                if t_col is not None:
                    local = sorted([(int(i), float(probs[i, t_col])) for i in sel],
                                   key=lambda x: x[1], reverse=True)[:TOP_DOCS]
                    for j,(i,p) in enumerate(local, start=1):
                        txt = texts[i]
                        summary_rows.append({"candidate": cand_name, "topic_id": t_id, "order": j,
                                             "prob": p, "snippet": (txt[:420]+"…") if len(txt)>420 else txt})
                else:
                    # si no hay prob col — usa distancia al centroide semántico del tópico
                    # centroide con promedio de embeddings del tópico:
                    centroid = X[sel].mean(axis=0, keepdims=True)
                    d = pairwise_distances(X[sel], centroid, metric="cosine").ravel()
                    topk = np.argsort(d)[:TOP_DOCS]
                    for j, k in enumerate(topk, start=1):
                        i = sel[k]
                        txt = texts[i]
                        summary_rows.append({"candidate": cand_name, "topic_id": t_id, "order": j,
                                             "snippet": (txt[:420]+"…") if len(txt)>420 else txt})
            else:
                centroid = X[sel].mean(axis=0, keepdims=True)
                d = pairwise_distances(X[sel], centroid, metric="cosine").ravel()
                topk = np.argsort(d)[:TOP_DOCS]
                for j, k in enumerate(topk, start=1):
                    i = sel[k]
                    txt = texts[i]
                    summary_rows.append({"candidate": cand_name, "topic_id": t_id, "order": j,
                                         "snippet": (txt[:420]+"…") if len(txt)>420 else txt})

    topic_summ = pd.DataFrame(summary_rows)
    if not topic_summ.empty:
        print(f"=== Resúmenes extractivos — {cand_name} ===")
        display(topic_summ.head(min(10, len(topic_summ))))

    # --- Distribución porcentual de párrafos por tópico (mix de agenda) ---
    # (excluye -1)
    mask = np.array(topics) != -1
    vc = pd.Series(np.array(topics)[mask]).value_counts().rename_axis("topic_id").reset_index(name="n")
    vc["pct"] = 100 * vc["n"] / vc["n"].sum()
    vc = vc.sort_values("pct", ascending=False)
    print(f"=== Distribución de párrafos por tópico — {cand_name} ===")
    display(vc)

    # Visual: barras de distribución
    fig_mix = px.bar(
        vc.head(MAX_TOPICS_PLOT), x="topic_id", y="pct",
        title=f"Mix de agenda — {cand_name} (porcentaje de párrafos por tópico)",
        labels={"topic_id":"Tópico", "pct":"% de párrafos"},
        text=vc.head(MAX_TOPICS_PLOT)["pct"].round(1).astype(str)+"%"
    )
    fig_mix.update_traces(marker_color=palette[0], textposition="outside")
    fig_mix.update_layout(yaxis=dict(range=[0, max(10, vc["pct"].max()*1.15)]))
    fig_mix.show()

    # --- Tabla compacta de términos por tópico (top TOP_TERMS) ---
    # Render en forma "Topic → términos"
    cards = []
    for t_id in info_nz["Topic"].tolist():
        terms = [w for w,_ in (topic_model.get_topic(t_id) or [])][:TOP_TERMS]
        cards.append({"topic_id": t_id, "terms": ", ".join(terms)})
    topic_cards = pd.DataFrame(cards).sort_values("topic_id")
    print(f"=== Términos (c-TF-IDF) por tópico — {cand_name} ===")
    display(topic_cards)

    # --- Jerarquía de temas (reducción jerárquica) ---
    if show_hierarchy:
        try:
            # BERTopic produce una figura Plotly nativa
            fig_h = topic_model.visualize_hierarchy(top_n_topics=min(MAX_TOPICS_PLOT, len(info_nz)))
            fig_h.update_layout(title_text=f"Jerarquía de tópicos — {cand_name}")
            fig_h.show()
        except Exception as e:
            print(f"[BERTopic] Jerarquía no disponible ({e})")

    # --- Opcional: UMAP 2D embebido del candidato coloreado por tópico ---
    try:
        # coordenadas internas de BERTopic (si se desea reutilizar)
        umap2 = umap.UMAP(n_components=2, random_state=RANDOM_STATE, metric="cosine").fit_transform(X)
        df_u2 = pd.DataFrame({"UMAP1": umap2[:,0], "UMAP2": umap2[:,1], "topic": topics, "file": files, "para_id": para_ids})
        df_u2 = df_u2[df_u2["topic"]!=-1]
        fig_u2 = px.scatter(
            df_u2, x="UMAP1", y="UMAP2", color="topic",
            hover_data=["file","para_id"],
            title=f"UMAP 2D — {cand_name} (color = tópico HDBSCAN)",
            color_continuous_scale="Viridis"
        )
        fig_u2.update_traces(marker=dict(size=6, opacity=0.8))
        fig_u2.show()
    except Exception as e:
        print(f"[BERTopic] UMAP 2D por candidato no disponible ({e})")

    # Devuelve artefactos en memoria por si quieres reusarlos después
    return {
        "model": topic_model,
        "topic_info": info,
        "topic_terms": topic_terms,
        "topic_summary": topic_summ,
        "mix_agenda": vc
    }

# --------- EJECUCIÓN: por cada candidato ---------
candidatos = sorted(paras_df["candidate"].unique().tolist())
artefactos_por_candidato = {}

for cand in tqdm(candidatos, desc="[BERTopic] candidatos"):
    print("\n" + "="*80)
    print(f"BERTopic — {cand}")
    print("="*80)
    artefactos_por_candidato[cand] = run_bertopic_for_candidate(cand, show_hierarchy=True)



[notice] A new release of pip is available: 24.1.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


[BERTopic] candidatos:   0%|          | 0/8 [00:00<?, ?it/s]


BERTopic — artes


Unnamed: 0,candidate,topic_id,rank,term,ctfidf
0,artes,0,1,politica,0.020488
1,artes,0,2,pueblos,0.01849
2,artes,0,3,los pueblos,0.017236
3,artes,0,4,oro,0.017216
4,artes,0,5,patria,0.0166
5,artes,0,6,sistema,0.015964
6,artes,0,7,refundacion,0.013757
7,artes,0,8,en chile,0.013453
8,artes,0,9,refundacion de,0.013293
9,artes,0,10,proceso,0.013222


=== Resúmenes extractivos — artes ===


Unnamed: 0,candidate,topic_id,order,snippet
0,artes,0,1,• El Gobierno Patriótico Popular promoverá un ...
1,artes,0,2,• El Gobierno Patriótico Popular se empeñará d...
2,artes,0,3,• Establecer relaciones diplomáticas basadas e...
3,artes,1,1,La cantidad de empresas afectas durante la pri...
4,artes,1,2,• El Primer Sistema Nacional de Planificación ...
5,artes,1,3,El Primer Sistema Nacional de Planificación Qu...
6,artes,2,1,Tal posibilidad se abre cuando las grandes mas...
7,artes,2,2,"En cambio, las grandes mayorías se han quedado..."
8,artes,2,3,"Asimismo, el sólo hecho de tomarse el poder po..."
9,artes,3,1,"Los trabajadores del sector público, por ejemp..."


=== Distribución de párrafos por tópico — artes ===


Unnamed: 0,topic_id,n,pct
0,0,96,37.209302
1,1,66,25.581395
2,2,29,11.24031
3,3,23,8.914729
4,4,17,6.589147
5,5,14,5.426357
6,6,13,5.03876


=== Términos (c-TF-IDF) por tópico — artes ===


Unnamed: 0,topic_id,terms
0,0,"politica, pueblos, los pueblos, oro, patria, s..."
1,1,"de produccion, empresas, sistema, de planifica..."
2,2,"poder, politico, poder politico, ninos, social..."
3,3,"trabajo, trabajadores, los trabajadores, salar..."
4,4,"pensiones, sistema, de pensiones, sistema de, ..."
5,5,"crimen, organizado, crimen organizado, del cri..."
6,6,"energia, la energia, de energia, generacion, g..."



BERTopic — harold


Unnamed: 0,candidate,topic_id,rank,term,ctfidf
0,harold,0,1,salud,0.026067
1,harold,0,2,seguridad,0.023018
2,harold,0,3,de salud,0.017932
3,harold,0,4,personas,0.017868
4,harold,0,5,que no,0.016981
5,harold,0,6,prevencion,0.016405
6,harold,0,7,de seguridad,0.015912
7,harold,0,8,familias,0.015912
8,harold,0,9,temprana,0.015497
9,harold,0,10,en el,0.015452


=== Resúmenes extractivos — harold ===


Unnamed: 0,candidate,topic_id,order,snippet
0,harold,0,1,Esta estrategia contará con financiamiento ded...
1,harold,0,2,"Frente a este clamor ciudadano, nuestro gobier..."
2,harold,0,3,Proponemos un Programa Nacional de Salud Menta...
3,harold,1,1,42 Esta es una invitación a quienes no se resi...
4,harold,1,2,Esto permitirá incluir a los migrantes de buen...
5,harold,1,3,______________________________________________...
6,harold,2,1,Propondremos un cambio estructural radical en ...
7,harold,2,2,Se crearán más ligas y campeonatos femeninos e...
8,harold,2,3,"Por ello, proponemos también incorporar pausas..."
9,harold,3,1,Chile enfrenta serios desafíos en esta materia...


=== Distribución de párrafos por tópico — harold ===


Unnamed: 0,topic_id,n,pct
0,0,32,30.47619
1,1,27,25.714286
2,2,18,17.142857
3,3,15,14.285714
4,4,13,12.380952


=== Términos (c-TF-IDF) por tópico — harold ===


Unnamed: 0,topic_id,terms
0,0,"salud, seguridad, de salud, personas, que no, ..."
1,1,"todos, de chile, donde, nuestro, queremos, chi..."
2,2,"deporte, nacional de, educacion, en el, datos,..."
3,3,"obesidad, salud, chilenos, fisica, vida, activ..."
4,4,"viviendas, de viviendas, territorial, construc..."



BERTopic — jara


Unnamed: 0,candidate,topic_id,rank,term,ctfidf
0,jara,0,1,como,0.072641
1,jara,0,2,participacion,0.056403
2,jara,0,3,el derecho,0.056403
3,jara,0,4,vivir,0.054714
4,jara,0,5,derecho,0.054714
5,jara,0,6,vida,0.042176
6,jara,0,7,fortaleceremos,0.042176
7,jara,0,8,cuidados,0.04212
8,jara,0,9,sistema nacional,0.04212
9,jara,0,10,humanos,0.04212


=== Resúmenes extractivos — jara ===


Unnamed: 0,candidate,topic_id,order,snippet
0,jara,0,1,"sin participación, sin derechos garantizados, ..."
1,jara,0,2,Impulsaremos trayectorias educativas continuas...
2,jara,0,3,Esta articulación no sólo ofrece una estrategi...
3,jara,1,1,Eso no es justo . Nuestro compromiso es que la...
4,jara,1,2,Todo esto ha sido posible gracias a una férrea...
5,jara,1,3,"A creer que es posible un Chile distinto, cons..."


=== Distribución de párrafos por tópico — jara ===


Unnamed: 0,topic_id,n,pct
0,0,10,50.0
1,1,10,50.0


=== Términos (c-TF-IDF) por tópico — jara ===


Unnamed: 0,topic_id,terms
0,0,"como, participacion, el derecho, vivir, derech..."
1,1,"vida, las familias, todas, la vida, anos, que ..."



BERTopic — kaiser


Unnamed: 0,candidate,topic_id,rank,term,ctfidf
0,kaiser,0,1,que,0.029303
1,kaiser,0,2,los,0.026243
2,kaiser,0,3,se,0.02087
3,kaiser,0,4,un,0.019269
4,kaiser,0,5,por,0.015876
5,kaiser,0,6,al,0.015669
6,kaiser,0,7,no,0.015097
7,kaiser,0,8,como,0.012891
8,kaiser,0,9,mas,0.012845
9,kaiser,0,10,su,0.012084


=== Resúmenes extractivos — kaiser ===


Unnamed: 0,candidate,topic_id,order,snippet
0,kaiser,0,1,Reducir el costo de vida de las personas no es...
1,kaiser,0,2,Este pilar reúne un conjunto de iniciativas qu...
2,kaiser,0,3,Nuestro modelo de desarrollo económico será ex...
3,kaiser,1,1,"• A los grandes inversionistas, que verán en C..."
4,kaiser,1,2,Que exige un país donde el esfuerzo valga la p...
5,kaiser,1,3,Este no es un beneficio para “los grandes”. Es...
6,kaiser,2,1,para lo que se propone la creación de un Conse...
7,kaiser,2,2,"Luego, se ha visualizado el diseño de un Siste..."
8,kaiser,2,3,PROPUESTA S PARA EL SISTEMA JUDICIAL Resultado...
9,kaiser,3,1,• Baja capacidad para liderar políticas de cre...


=== Distribución de párrafos por tópico — kaiser ===


Unnamed: 0,topic_id,n,pct
0,0,203,36.25
1,1,88,15.714286
2,2,56,10.0
3,3,35,6.25
4,4,27,4.821429
5,5,26,4.642857
6,6,25,4.464286
7,7,23,4.107143
8,8,16,2.857143
9,9,15,2.678571


=== Términos (c-TF-IDF) por tópico — kaiser ===


Unnamed: 0,topic_id,terms
0,0,"que, los, se, un, por, al, no, como, mas, su"
1,1,"que, chile, un, no, los, por, al, se, estado, mas"
2,2,"judicial, poder, que, los, poder judicial, del..."
3,3,"mineras, mineria, minero, crecimiento, sector,..."
4,4,"transporte, metro, publico, transporte publico..."
5,5,"hombre, que, un, como, lo, se, realidad, no, i..."
6,6,"salud, de salud, atencion, pacientes, los, de ..."
7,7,"energia, de energia, que, plantas, residuos, f..."
8,8,"deportivas, deportistas, deportiva, deporte, n..."
9,9,"desarrollo, plan, trabajo, chile, 21, nacional..."



BERTopic — kast


Unnamed: 0,candidate,topic_id,rank,term,ctfidf
0,kast,0,1,chile,0.068777
1,kast,0,2,es,0.042174
2,kast,0,3,pais,0.037513
3,kast,0,4,chilenos,0.030064
4,kast,0,5,un pais,0.024452
5,kast,0,6,se ha,0.022571
6,kast,0,7,nos,0.021845
7,kast,0,8,orden,0.020991
8,kast,0,9,hoy,0.020154
9,kast,0,10,fuerza,0.019242


=== Resúmenes extractivos — kast ===


Unnamed: 0,candidate,topic_id,order,snippet
0,kast,0,1,"El desarrollo, la oportunidad de un mejor Chil..."
1,kast,0,2,"Juntos, con decisión y fe, vamos a construir u..."
2,kast,0,3,a las pymes que no bajan la cortina; al campo ...
3,kast,1,1,La decadencia no se circunscribe a lo estricta...
4,kast,1,2,"Lo haremos con metas exigentes, pero alcanzabl..."
5,kast,1,3,"Implementaremos, desde el día uno, medidas adm..."
6,kast,2,1,Esto es esencial para recuperar el orden públi...
7,kast,2,2,"sin orden, no hay seguridad; y sin seguridad, ..."
8,kast,2,3,Para una estrategia de seguridad integral y ef...
9,kast,3,1,Consolidaremos una red de atención oncológica ...


=== Distribución de párrafos por tópico — kast ===


Unnamed: 0,topic_id,n,pct
0,0,38,36.538462
1,1,21,20.192308
2,2,17,16.346154
3,3,15,14.423077
4,4,13,12.5


=== Términos (c-TF-IDF) por tópico — kast ===


Unnamed: 0,topic_id,terms
0,0,"chile, es, pais, chilenos, un pais, se ha, nos..."
1,1,"inversion, empleo, gasto publico, central, par..."
2,2,"es, pais, crimen, plan de, la seguridad, accio..."
3,3,"atencion, salud, de salud, red, listas, de esp..."
4,4,"escolar, ninos, padres, la educacion, puedan, ..."



BERTopic — matthei


Unnamed: 0,candidate,topic_id,rank,term,ctfidf
0,matthei,0,1,de los,0.036004
1,matthei,0,2,educacion,0.03163
2,matthei,0,3,programas,0.028095
3,matthei,0,4,proyectos,0.023782
4,matthei,0,5,para que,0.023599
5,matthei,0,6,calidad,0.022736
6,matthei,0,7,plan,0.022115
7,matthei,0,8,laboral,0.020417
8,matthei,0,9,chilenos,0.020008
9,matthei,0,10,uso,0.020008


=== Resúmenes extractivos — matthei ===


Unnamed: 0,candidate,topic_id,order,snippet
0,matthei,0,1,iii) Se creará un Plan Nacional de Accesibilid...
1,matthei,0,2,ii) Se promoverá el uso de la madera como mate...
2,matthei,0,3,El crecimiento sostenible de Chile dependerá d...
3,matthei,1,1,b) Crearemos una Unidad de Cumplimiento (UNICO...
4,matthei,1,2,También reforzaremos la ciberinteligencia en c...
5,matthei,1,3,"estableceremos modelos de trabajo permanentes,..."


=== Distribución de párrafos por tópico — matthei ===


Unnamed: 0,topic_id,n,pct
0,0,130,79.754601
1,1,33,20.245399


=== Términos (c-TF-IDF) por tópico — matthei ===


Unnamed: 0,topic_id,terms
0,0,"de los, educacion, programas, proyectos, para ..."
1,1,"crimen, organizado, crimen organizado, nuestra..."



BERTopic — meo


Unnamed: 0,candidate,topic_id,rank,term,ctfidf
0,meo,0,1,nueva,0.033866
1,meo,0,2,en la,0.031558
2,meo,0,3,empleo,0.031391
3,meo,0,4,seguridad,0.029719
4,meo,0,5,sector,0.028378
5,meo,0,6,ello,0.026318
6,meo,0,7,nacional de,0.026298
7,meo,0,8,el sector,0.025642
8,meo,0,9,ha,0.025113
9,meo,0,10,desde,0.02463


=== Resúmenes extractivos — meo ===


Unnamed: 0,candidate,topic_id,order,snippet
0,meo,0,1,1 PROGRAMA PRESIDENCIAL MARCO ENRÍQUEZ-OMINAMI...
1,meo,0,2,No es así como se logrará crecer de manera ráp...
2,meo,0,3,3 PROGRAMA PRESIDENCIAL MARCO ENRÍQUEZ-OMINAMI...
3,meo,1,1,El impuesto afectará toda extracción consuntiv...
4,meo,1,2,La agricultura captura alrededor de 70% de esa...
5,meo,1,3,Garantizar una administración pública y transp...
6,meo,2,1,El objetivo central será pasar de un modelo re...
7,meo,2,2,es la presencia activa del Estado en el territ...
8,meo,2,3,NUESTRAS SIETE PRIMERAS MEDIDAS SERÁN PILAR 1 ...


=== Distribución de párrafos por tópico — meo ===


Unnamed: 0,topic_id,n,pct
0,0,29,43.283582
1,1,26,38.80597
2,2,12,17.910448


=== Términos (c-TF-IDF) por tópico — meo ===


Unnamed: 0,topic_id,terms
0,0,"nueva, en la, empleo, seguridad, sector, ello,..."
1,1,"impuesto, ambiental, medicamentos, salud, haci..."
2,2,"inteligencia, criminal, recuperar, presencia, ..."



BERTopic — parisi


Unnamed: 0,candidate,topic_id,rank,term,ctfidf
0,parisi,0,1,agua,0.025203
1,parisi,0,2,riesgo,0.022582
2,parisi,0,3,ambiental,0.021663
3,parisi,0,4,suelos,0.020939
4,parisi,0,5,cultivos,0.019825
5,parisi,0,6,agricola,0.019825
6,parisi,0,7,agricultura,0.019084
7,parisi,0,8,inia,0.016863
8,parisi,0,9,agricolas,0.016656
9,parisi,0,10,local,0.016555


=== Resúmenes extractivos — parisi ===


Unnamed: 0,candidate,topic_id,order,snippet
0,parisi,0,1,La red aprovecha la infraestructura y equipos ...
1,parisi,0,2,4. Se p ublicará un tablero comunal con adopci...
2,parisi,0,3,Instalar y operar una red territorial de parce...
3,parisi,1,1,"No más diagnósticos eternos, no más agencias q..."
4,parisi,1,2,d) La capacitación básica contemplaría materia...
5,parisi,1,3,"Por otro lado, las limitadas capacidades de in..."
6,parisi,2,1,"Sin embargo, estos instrumentos presentan supe..."
7,parisi,2,2,Existe un déficit de iniciativas tecnológicas ...
8,parisi,2,3,"En efecto, nuestro país cuenta con Fondos ambi..."
9,parisi,3,1,Por su parte Reino Unido (NHS) incorporó AMG c...


=== Distribución de párrafos por tópico — parisi ===


Unnamed: 0,topic_id,n,pct
0,0,64,25.396825
1,1,52,20.634921
2,2,41,16.269841
3,3,36,14.285714
4,4,30,11.904762
5,5,29,11.507937


=== Términos (c-TF-IDF) por tópico — parisi ===


Unnamed: 0,topic_id,terms
0,0,"agua, riesgo, ambiental, suelos, cultivos, agr..."
1,1,"inteligencia, seguridad, operaciones, carabine..."
2,2,"fondos, inversion, pais, proyectos, vivienda, ..."
3,3,"salud, de salud, pacientes, atencion, espera, ..."
4,4,"cooperativas, fondef, capital, empresas, otl, ..."
5,5,"ninos, violencia, familia, ninas, cuidadores, ..."


In [17]:
# %% Macro-taxonomía por prototipos (Versión A CORREGIDA)
import numpy as np, pandas as pd
from tqdm.auto import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px, plotly.graph_objects as go
import torch
from sentence_transformers import SentenceTransformer

assert 'paras_df' in globals() and 'Z_para' in globals(), "Faltan paras_df o Z_para en memoria."

# 1) Taxonomía macro
CATEGORIES = [
    "Economía","Trabajo","Educación","Salud","Seguridad","Justicia",
    "Medioambiente","Vivienda","Transporte","Descentralización",
    "Ciencia y Tecnología","Cultura","Género","Relaciones Internacionales",
    "Inclusión Social","Pueblos Originarios"
]

def prompts_for(cat):
    base = [
        f"Este párrafo trata sobre {cat}.",
        f"Política pública de {cat}.",
        f"Tema principal: {cat}."
    ]
    extras = {
        "Economía": ["Crecimiento, inversión, impuestos, presupuesto y gasto público."],
        "Trabajo": ["Empleo, sindicatos, salarios, formalización laboral."],
        "Educación": ["Escuelas, universidades, acceso educativo, formación docente."],
        "Salud": ["Sistema de salud, hospitales, FONASA, ISAPRE, atención primaria."],
        "Seguridad": ["Delincuencia, Carabineros, crimen organizado, orden público."],
        "Justicia": ["Tribunales, Ministerio Público, derechos humanos, Gendarmería."],
        "Medioambiente": ["Cambio climático, energía renovable, contaminación."],
        "Vivienda": ["Déficit habitacional, urbanismo, barrios, arriendo."],
        "Transporte": ["Infraestructura vial, metro, movilidad sustentable."],
        "Ciencia y Tecnología": ["Innovación, investigación, desarrollo digital, IA."],
        "Cultura": ["Patrimonio, artes, identidad, expresiones culturales."],
        "Género": ["Equidad, derechos de las mujeres, violencia de género."],
        "Descentralización": ["Gobiernos regionales, autonomía local, participación."],
        "Inclusión Social": ["Reducción de pobreza, políticas sociales, equidad."],
        "Relaciones Internacionales": ["Diplomacia, comercio exterior, cooperación."],
        "Pueblos Originarios": ["Reconocimiento, autonomía, interculturalidad."]
    }
    return base + extras.get(cat, [])

# 2) Embeddings de prototipos
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model_st = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", device=DEVICE)

proto_texts = [p for cat in CATEGORIES for p in prompts_for(cat)]
E_prompts = model_st.encode(proto_texts, normalize_embeddings=True, batch_size=128, show_progress_bar=False)

# promedio por categoría
offset = 0
E_cats = []
for cat in CATEGORIES:
    n = len(prompts_for(cat))
    E_cats.append(E_prompts[offset:offset+n].mean(0))
    offset += n
E_cats = np.vstack(E_cats)

# 3) Similitud y umbral
def l2norm(x, eps=1e-9): return x / (np.linalg.norm(x, axis=1, keepdims=True) + eps)
Zp = l2norm(Z_para); Ec = l2norm(E_cats)
S = (cosine_similarity(Zp, Ec) + 1.0) / 2.0   # [0,1]
THRESH = 0.55
S_bin = (S >= THRESH).astype(float)

# 4) Agregación por candidato
def mix_table(scores):
    sums = scores.sum(0)
    tot = float(sums.sum()) if float(sums.sum())>0 else 1.0
    pct = 100.0 * sums / tot
    return pd.DataFrame({"category": CATEGORIES, "pct": pct}).sort_values("pct", ascending=False)

arte = {}
row_ptr = 0
for cand in tqdm(sorted(paras_df["candidate"].unique())):
    dfc = paras_df[paras_df["candidate"]==cand].reset_index(drop=True)
    n = len(dfc)
    if n == 0: 
        continue
    Sc = S[row_ptr:row_ptr+n]; Sb = S_bin[row_ptr:row_ptr+n]
    arte[cand] = {"df": dfc, "mix": mix_table(Sc), "scores": Sc, "hard": Sb}
    row_ptr += n

# 5) Visualizaciones (corregido el concat y con escalas mejoradas)
mix_df_list = []
for cand, v in arte.items():
    df = v["mix"].copy()
    df["candidate"] = cand
    mix_df_list.append(df)
mix_df = pd.concat(mix_df_list, ignore_index=True)
mix_pivot = mix_df.pivot(index="candidate", columns="category", values="pct").fillna(0.0)
mix_pivot = mix_pivot.reindex(columns=CATEGORIES, fill_value=0.0)

# 5.a Heatmap absoluto con zmax dinámico
zmax = float(mix_pivot.values.max()) if mix_pivot.size else 100.0
fig_heat_abs = px.imshow(
    mix_pivot.values, x=mix_pivot.columns, y=mix_pivot.index,
    aspect="auto", origin="upper",
    color_continuous_scale="Viridis", zmin=0, zmax=zmax,
    labels=dict(x="Categoría", y="Candidato", color="% énfasis"),
    title=f"Heatmap — % real por macro (zmax={zmax:.1f})"
)
fig_heat_abs.update_xaxes(side="top")
fig_heat_abs.show()

# 5.b Heatmap normalizado por fila (0–100 relativo al máximo por candidato)
rowmax = mix_pivot.max(axis=1).replace(0, np.nan)
mix_row = mix_pivot.div(rowmax, axis=0).fillna(0.0) * 100.0
fig_heat_row = px.imshow(
    mix_row.values, x=mix_row.columns, y=mix_row.index,
    aspect="auto", origin="upper",
    color_continuous_scale="Plasma", zmin=0, zmax=100,
    labels=dict(x="Categoría", y="Candidato", color="% relativo (fila)"),
    title="Heatmap — % relativo por candidato (normalización por fila)"
)
fig_heat_row.update_xaxes(side="top")
fig_heat_row.show()

# 5.c Radar con rango radial automático
rad_max = float(mix_pivot.values.max()) if mix_pivot.size else 100.0
fig_radar = go.Figure()
for c in mix_pivot.index:
    vals = mix_pivot.loc[c, CATEGORIES].tolist()
    vals += [vals[0]]
    fig_radar.add_trace(go.Scatterpolar(
        r=vals, theta=CATEGORIES + [CATEGORIES[0]],
        mode="lines+markers", fill="toself", name=c
    ))
fig_radar.update_layout(
    title=f"Radar — Perfil temático macro (rango 0–{rad_max:.1f})",
    polar=dict(radialaxis=dict(range=[0, rad_max])),
    legend_title="Candidato"
)
fig_radar.show()


  0%|          | 0/8 [00:00<?, ?it/s]

In [18]:
# %% Jerárquico macro → sub (Versión B CORREGIDA)
import numpy as np, pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.auto import tqdm
import plotly.graph_objects as go, plotly.express as px
import torch
from sentence_transformers import SentenceTransformer

assert 'paras_df' in globals() and 'Z_para' in globals(), "Faltan paras_df o Z_para en memoria."

# 1) Jerarquía
HIERARCHY = {
    "Economía": ["Política fiscal","Inversión","Productividad","Impuestos","Pymes"],
    "Salud": ["Atención primaria","Hospitales","FONASA","ISAPRE","Salud mental"],
    "Educación": ["Escuela","Universidad","Docencia","Infraestructura escolar"],
    "Seguridad": ["Carabineros","Crimen organizado","Prevención del delito"],
    "Trabajo": ["Empleo","Salario mínimo","Sindicatos","Formalización"],
    "Medioambiente": ["Cambio climático","Energías renovables","Contaminación"],
    "Justicia": ["Tribunales","Gendarmería","Reinserción"],
    "Vivienda": ["Déficit habitacional","Urbanismo","Arriendo"],
    "Género": ["Violencia de género","Equidad","Cuidados"],
    "Ciencia y Tecnología": ["Innovación","Transformación digital","IA","Investigación"],
    "Cultura": ["Patrimonio","Artes","Identidad cultural"],
    "Descentralización": ["Gobiernos regionales","Autonomía local"],
    "Transporte": ["Movilidad","Infraestructura","Transporte público"],
    "Inclusión Social": ["Pobreza","Políticas sociales","Migración"],
    "Relaciones Internacionales": ["Diplomacia","Comercio exterior","Integración regional"]
}

# 2) Prompts
def prompts_macro(m):
    base = [
        f"Este párrafo trata sobre {m}.",
        f"Política pública de {m}.",
        f"Tema principal: {m}."
    ]
    extras = {
        "Economía": ["Crecimiento, inversión, impuestos y presupuesto público."],
        "Salud": ["Sistema sanitario, hospitales, FONASA/ISAPRE, atención primaria."],
        "Seguridad": ["Delincuencia, orden público, crimen organizado, policía."]
    }
    return base + extras.get(m, [])

def prompts_sub(m, s):
    return [
        f"{s}, parte del área {m}.",
        f"Subtema de {m}: {s}.",
        f"El párrafo aborda {s} dentro de {m}."
    ]

# 3) Embeddings de prototipos
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model_st = SentenceTransformer("sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", device=DEVICE)

macros = list(HIERARCHY.keys())
macro_proto_texts, macro_slices = [], []
for m in macros:
    ps = prompts_macro(m)
    s = len(macro_proto_texts); macro_proto_texts.extend(ps); e = len(macro_proto_texts)
    macro_slices.append((s, e))

Em_prompts = model_st.encode(macro_proto_texts, normalize_embeddings=True, batch_size=128, show_progress_bar=False)
E_macro = np.vstack([Em_prompts[s:e].mean(0) for (s, e) in macro_slices])

subs_pairs, sub_proto_texts, sub_slices = [], [], []
for m in macros:
    for s in HIERARCHY[m]:
        ps = prompts_sub(m, s)
        st = len(sub_proto_texts); sub_proto_texts.extend(ps); en = len(sub_proto_texts)
        sub_slices.append((st, en)); subs_pairs.append((m, s))

Es_prompts = model_st.encode(sub_proto_texts, normalize_embeddings=True, batch_size=128, show_progress_bar=False)
E_sub = np.vstack([Es_prompts[s:e].mean(0) for (s, e) in sub_slices])

# 4) Similitudes y asignación jerárquica
def l2norm(x, eps=1e-9): return x / (np.linalg.norm(x, axis=1, keepdims=True) + eps)
Zp = l2norm(Z_para); Em = l2norm(E_macro); Es = l2norm(E_sub)
Sm = (cosine_similarity(Zp, Em) + 1.0) / 2.0
Ss = (cosine_similarity(Zp, Es) + 1.0) / 2.0

best_macro_idx = Sm.argmax(1)
best_sub_idx   = Ss.argmax(1)

paras_df = paras_df.copy()
paras_df["macro"]       = np.array(macros, dtype=object)[best_macro_idx]
paras_df["macro_score"] = Sm[np.arange(Sm.shape[0]), best_macro_idx]
paras_df["sub_macro"]   = np.array([subs_pairs[i][0] for i in best_sub_idx], dtype=object)
paras_df["subcat"]      = np.array([subs_pairs[i][1] for i in best_sub_idx], dtype=object)
paras_df["sub_score"]   = Ss[np.arange(Ss.shape[0]), best_sub_idx]

# coherencia macro-sub
mask_incoh = paras_df["macro"] != paras_df["sub_macro"]
paras_df.loc[mask_incoh, ["subcat","sub_score","sub_macro"]] = [None, np.nan, None]

# (opcional) umbral “duro”
THR = 0.55
paras_df["macro_hard"] = paras_df["macro_score"] >= THR
paras_df["sub_hard"]   = paras_df["sub_score"].fillna(0.0) >= THR

# 5) Agregaciones
mix_macro = (paras_df.groupby(["candidate","macro"], as_index=False)
             .size().rename(columns={"size":"count"}))
mix_macro["pct"] = 100.0 * mix_macro["count"] / mix_macro.groupby("candidate")["count"].transform("sum")

mix_sub = (paras_df.dropna(subset=["subcat"]).groupby(["candidate","macro","subcat"], as_index=False)
           .size().rename(columns={"size":"count"}))
mix_sub["pct"] = 100.0 * mix_sub["count"] / mix_sub.groupby("candidate")["count"].transform("sum")

# 6) Visuales — heatmaps con mejor contraste + radar auto + sunburst por candidato
macro_order = macros
pivot_macro = mix_macro.pivot(index="candidate", columns="macro", values="pct").fillna(0.0)
pivot_macro = pivot_macro.reindex(columns=macro_order, fill_value=0.0)

# 6.a Heatmap absoluto con escala dinámica
zmax = float(pivot_macro.values.max()) if pivot_macro.size else 100.0
fig_heat_abs = px.imshow(
    pivot_macro.values, x=pivot_macro.columns, y=pivot_macro.index,
    aspect="auto", origin="upper",
    color_continuous_scale="Viridis", zmin=0, zmax=zmax,
    labels=dict(x="Macro-área", y="Candidato", color="% énfasis"),
    title=f"Heatmap — % real por macro (zmax={zmax:.1f})"
)
fig_heat_abs.update_xaxes(side="top")
fig_heat_abs.show()

# 6.b Heatmap normalizado por fila
rowmax = pivot_macro.max(axis=1).replace(0, np.nan)
pivot_row = pivot_macro.div(rowmax, axis=0).fillna(0.0) * 100.0
fig_heat_row = px.imshow(
    pivot_row.values, x=pivot_row.columns, y=pivot_row.index,
    aspect="auto", origin="upper",
    color_continuous_scale="Plasma", zmin=0, zmax=100,
    labels=dict(x="Macro-área", y="Candidato", color="% relativo (fila)"),
    title="Heatmap — % relativo por candidato (normalización por fila)"
)
fig_heat_row.update_xaxes(side="top")
fig_heat_row.show()

# 6.c Radar con rango radial automático
rad_max = float(pivot_macro.values.max()) if pivot_macro.size else 100.0
fig_radar = go.Figure()
for c in pivot_macro.index:
    vals = pivot_macro.loc[c, macro_order].tolist()
    vals += [vals[0]]
    fig_radar.add_trace(go.Scatterpolar(
        r=vals, theta=macro_order + [macro_order[0]],
        mode="lines+markers", fill="toself", name=c
    ))
fig_radar.update_layout(
    title=f"Radar jerárquico — Perfil macro (rango 0–{rad_max:.1f})",
    polar=dict(radialaxis=dict(range=[0, rad_max])),
    legend_title="Candidato"
)
fig_radar.show()

# 6.d Sunburst jerárquico con dropdown de candidato
candidatos = pivot_macro.index.tolist()
fig_sun = go.Figure()
visible = []
for i, c in enumerate(candidatos):
    dfc = mix_sub[mix_sub["candidate"]==c]
    if dfc.empty:
        fig_sun.add_trace(go.Sunburst(labels=[], parents=[], values=[]))
        visible.append(False); continue
    labels = [c]; parents = [""]; values = [dfc["pct"].sum()]
    for m in dfc["macro"].unique():
        subm = dfc[dfc["macro"]==m]
        labels.append(m); parents.append(c); values.append(subm["pct"].sum())
        for _, r in subm.iterrows():
            labels.append(r["subcat"]); parents.append(m); values.append(r["pct"])
    fig_sun.add_trace(go.Sunburst(labels=labels, parents=parents, values=values, branchvalues="total"))
    visible.append(i==0)

buttons=[]
for i, c in enumerate(candidatos):
    vis = [False]*len(candidatos); vis[i]=True
    buttons.append(dict(label=c, method="update",
                        args=[{"visible": vis},
                              {"title": f"Sunburst — Jerarquía temática de {c}"}]))
fig_sun.update_layout(
    title=f"Sunburst — Jerarquía temática de {candidatos[0] if candidatos else ''}",
    updatemenus=[dict(type="dropdown", x=1.02, y=1.0, buttons=buttons, showactive=True)]
)
for t, v in zip(fig_sun.data, visible): t.visible = v
fig_sun.show()


In [1]:
# ==== BLOQUE COMPLETO CORREGIDO — CARGA ROBUSTA DE ANALIZADORES ABSA (offline/online) ====
# Uso: ejecuta este bloque ANTES de calcular sentimientos/emociones.
# Expone: sent_an, emo_an, absa_sent_predict(texts), absa_emo_predict(texts)

import os, sys, subprocess, torch
from pathlib import Path

# ---------- util: asegurar dependencias mínimas ----------
def _ensure(pkgs):
    for p in pkgs:
        try:
            __import__(p.split("==")[0].split(">=")[0])
        except Exception:
            subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", p])

_ensure(["pysentimiento>=0.7.2", "transformers>=4.43", "huggingface_hub>=0.23"])

from pysentimiento import create_analyzer

DEVICE = 0 if torch.cuda.is_available() else -1
BASE_MODELS = Path("hf_models")  # donde predescargaste modelos (opcional, pero recomendado)
sent_local = BASE_MODELS / "robertuito-sentiment"
emo_local  = BASE_MODELS / "robertuito-emotion"

_ABSA_SENT_BACKEND = "pysent"   # o "cardiff"
_ABSA_EMO_BACKEND  = "pysent"   # o "stub"

def build_analyzers():
    """
    1) Intenta cargar modelos LOCALES de pysentimiento (sin red).
    2) Si falla, intenta descarga online por defecto de pysentimiento.
    3) Si falla, fallback: sentimiento con 'cardiffnlp/twitter-xlm-roberta-base-sentiment' y emociones stub.
    """
    global _ABSA_SENT_BACKEND, _ABSA_EMO_BACKEND

    # ---- 1) Local (sin red) ----
    if sent_local.exists() and emo_local.exists():
        try:
            sa = create_analyzer(task="sentiment", lang="es", model_name=str(sent_local), device=DEVICE)
            ea = create_analyzer(task="emotion",   lang="es", model_name=str(emo_local),  device=DEVICE)
            _ABSA_SENT_BACKEND, _ABSA_EMO_BACKEND = "pysent", "pysent"
            return sa, ea
        except Exception as e:
            print(f"[ABSA] Carga local falló, probando online… ({e})")

    # ---- 2) Online (puede fallar por restricciones de red) ----
    try:
        sa = create_analyzer(task="sentiment", lang="es", device=DEVICE)
        ea = create_analyzer(task="emotion",   lang="es", device=DEVICE)
        _ABSA_SENT_BACKEND, _ABSA_EMO_BACKEND = "pysent", "pysent"
        return sa, ea
    except Exception as e_online:
        print("[ABSA] Descarga online no disponible:", e_online)

    # ---- 3) Fallback mínimo (sentimiento real + emociones stub) ----
    from transformers import AutoTokenizer, AutoModelForSequenceClassification, TextClassificationPipeline
    try:
        mdl = AutoModelForSequenceClassification.from_pretrained(
            "cardiffnlp/twitter-xlm-roberta-base-sentiment"
        )
        tok = AutoTokenizer.from_pretrained("cardiffnlp/twitter-xlm-roberta-base-sentiment")
        sa = TextClassificationPipeline(model=mdl, tokenizer=tok, device=DEVICE)

        class _DummyEmotion:
            def predict(self, texts):
                from types import SimpleNamespace
                # devuelve probabilidades nulas (placeholder)
                return [SimpleNamespace(probas={"anger":0,"fear":0,"joy":0,"sadness":0,"disgust":0,"surprise":0}) for _ in texts]

        ea = _DummyEmotion()
        _ABSA_SENT_BACKEND, _ABSA_EMO_BACKEND = "cardiff", "stub"
        print("[ABSA] Fallback activado: sentimiento 'cardiffnlp', emociones 'stub'.")
        return sa, ea
    except Exception as e_fallback:
        raise RuntimeError(f"[ABSA] No se pudieron inicializar analizadores (ni local, ni online, ni fallback): {e_fallback}")

sent_an, emo_an = build_analyzers()

# ---------- Wrappers UNIFORMES para el resto del pipeline ----------
def absa_sent_predict(texts):
    """
    Devuelve lista de dicts con claves: {'POS','NEU','NEG'} en [0,1],
    independientemente del backend real.
    """
    out = []
    if _ABSA_SENT_BACKEND == "pysent":
        res = sent_an.predict(list(texts))
        for o in res:
            p = o.probas
            out.append({
                "POS": float(p.get("POS", p.get("pos", 0.0))),
                "NEU": float(p.get("NEU", p.get("neu", 0.0))),
                "NEG": float(p.get("NEG", p.get("neg", 0.0))),
            })
    else:  # cardiffnlp pipeline (labels: negative/neutral/positive)
        res = sent_an(list(texts), truncation=True)
        for r in res:
            # r puede ser una lista (top-k) o un dict; normalizamos
            if isinstance(r, list):
                d = {x["label"].lower(): x["score"] for x in r}
            else:
                d = {r["label"].lower(): r["score"]}
            pos = float(d.get("positive", d.get("pos", 0.0)))
            neu = float(d.get("neutral",  d.get("neu", 0.0)))
            neg = float(d.get("negative", d.get("neg", 0.0)))
            s = pos + neu + neg
            if s > 0:
                pos, neu, neg = pos/s, neu/s, neg/s
            out.append({"POS": pos, "NEU": neu, "NEG": neg})
    return out

def absa_emo_predict(texts):
    """
    Devuelve lista de dicts con emociones en [0,1]. Si backend es 'stub',
    retornará ceros (no bloquea el flujo).
    """
    res = emo_an.predict(list(texts))
    out = []
    for o in res:
        p = getattr(o, "probas", {})  # pysentimiento -> SimpleNamespace(probas=...)
        out.append({k: float(v) for k, v in p.items()})
    return out

# ---------- Diagnóstico opcional ----------
def absa_selfcheck(n=3):
    sample = ["La delincuencia ha aumentado y exigimos más seguridad.",
              "Las pymes requieren incentivos tributarios y acceso a crédito.",
              "La atención primaria de salud debe fortalecerse."]
    print(">> Backend sentimiento:", _ABSA_SENT_BACKEND)
    print(">> Backend emociones :", _ABSA_EMO_BACKEND)
    print(">> Test rápido:")
    print(absa_sent_predict(sample)[:n])
    print(absa_emo_predict(sample)[:n])

# Llama absa_selfcheck() si quieres verificar los backends y salidas.
# ==== FIN BLOQUE ====





In [2]:
absa_selfcheck()

>> Backend sentimiento: pysent
>> Backend emociones : pysent
>> Test rápido:


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

[{'POS': 0.008695165626704693, 'NEU': 0.05370123311877251, 'NEG': 0.9376037120819092}, {'POS': 0.16647596657276154, 'NEU': 0.6617985367774963, 'NEG': 0.17172549664974213}, {'POS': 0.13309623301029205, 'NEU': 0.626047670841217, 'NEG': 0.2408560961484909}]


Map:   0%|          | 0/3 [00:00<?, ? examples/s]

[{'others': 0.5704341530799866, 'joy': 0.0032083981204777956, 'sadness': 0.06744280457496643, 'anger': 0.16594739258289337, 'surprise': 0.011281581595540047, 'disgust': 0.05660318210721016, 'fear': 0.12508246302604675}, {'others': 0.9923608899116516, 'joy': 0.0024516666308045387, 'sadness': 0.0010753668611869216, 'anger': 0.0011240122839808464, 'surprise': 0.0013099713250994682, 'disgust': 0.0005138636915944517, 'fear': 0.00116440886631608}, {'others': 0.9904582500457764, 'joy': 0.0045673903077840805, 'sadness': 0.0010258723050355911, 'anger': 0.0010272541549056768, 'surprise': 0.0009321919060312212, 'disgust': 0.0005717985914088786, 'fear': 0.0014172060182318091}]


In [3]:
# %% ABSA — pipeline completo usando los WRAPPERS ya cargados (absa_sent_predict / absa_emo_predict)
# Requiere que hayas ejecutado antes el "BLOQUE COMPLETO CORREGIDO" (sent_an, emo_an, wrappers)

import re, math, numpy as np, pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
import plotly.express as px
import plotly.graph_objects as go
import spacy

# ---------- 0) Carga ----------
PAGES = Path("pages.parquet"); DOCS = Path("documents.parquet")
assert PAGES.exists() or DOCS.exists(), "Falta pages.parquet o documents.parquet"

if PAGES.exists():
    df_pages = pd.read_parquet(PAGES)
    assert {"candidate","filename","page","text"}.issubset(df_pages.columns)
else:
    df_docs = pd.read_parquet(DOCS)
    assert {"candidate","filename","text"}.issubset(df_docs.columns)
    df_pages = df_docs.assign(page=1)

df_pages["text"] = df_pages["text"].fillna("")
tqdm.write(f"[ABSA] páginas: {len(df_pages):,}")

# ---------- 1) Segmentación de oraciones ----------
try:
    nlp = spacy.load("es_core_news_lg")
except Exception:
    try:
        nlp = spacy.load("es_core_news_md")
    except Exception:
        nlp = spacy.blank("es")
        if "sentencizer" not in nlp.pipe_names:
            nlp.add_pipe("sentencizer")

rows = []
for _, r in tqdm(df_pages.iterrows(), total=len(df_pages), desc="[ABSA] sentencizar"):
    doc = nlp(r["text"])
    for i, s in enumerate(doc.sents):
        st = s.text.strip()
        if st:
            rows.append({"candidate":r["candidate"],"filename":r["filename"],
                         "page":r["page"], "sent_id":i, "sent_text":st})
sents = pd.DataFrame(rows)
tqdm.write(f"[ABSA] oraciones: {len(sents):,}")

# ---------- 2) Aspectos (ajústalos a tu dominio) ----------
ASPECTS = {
    "delincuencia":  [r"\bdelincuen", r"crimen organiz", r"seguridad ciudad", r"narco", r"violenc"],
    "pymes":         [r"\bpymes?\b", r"pequeñ[ao]s? empres", r"emprendim"],
    "salud_primaria":[r"atenci[oó]n primaria", r"\bcesfam\b", r"f(onasa|onasa)\b", r"isapre"],
    "educ_superior": [r"universidad", r"educaci[oó]n superior", r"gratuidad", r"arancel"],
    "impuestos":     [r"\biva\b", r"impuest", r"tribut", r"recaudaci[oó]n", r"renta"],
    "pensiones":     [r"\bafp\b", r"pensio", r"\bjubilaci[oó]n\b", r"\bpgu\b"],
    "energía":       [r"energ", r"electricidad", r"renovable", r"hidrógeno", r"gas"],
    "medioambiente": [r"medio ambiente|medioambiente", r"contaminaci[oó]n", r"emisiones", r"clim[aá]tico"],
    "transporte":    [r"transporte p[uú]blico", r"metro", r"\bbus(es)?\b", r"movilidad"],
    "vivienda":      [r"vivienda", r"subsidio habit", r"d[eé]ficit habit", r"arriendo"],
    "carabineros":   [r"\bcarabiner", r"polic[ií]a"],
    "gendarmería":   [r"gendarmer", r"c[aá]rcel", r"penitenciar"],
    "migración":     [r"migraci[oó]n", r"inmigraci[oó]n", r"frontera", r"regularizaci[oó]n"]
}
ASP_RX = {a: re.compile("|".join(p), flags=re.I) for a,p in ASPECTS.items()}

def aspects_for(text):
    return [a for a,rx in ASP_RX.items() if rx.search(text)]

absa_rows = []
for _, r in tqdm(sents.iterrows(), total=len(sents), desc="[ABSA] detectar aspectos"):
    hits = aspects_for(r["sent_text"])
    for a in hits:
        absa_rows.append({**r, "aspect": a})
absa = pd.DataFrame(absa_rows)
tqdm.write(f"[ABSA] sentencias con aspecto: {len(absa):,}")

if absa.empty:
    raise RuntimeError("No se detectaron aspectos con los patrones actuales; amplía ASPECTS.")

# ---------- 3) Sentimiento + Emociones (wrappers) ----------
BATCH = 128
def batched(lst, n=BATCH):
    for i in range(0, len(lst), n):
        yield lst[i:i+n]

pos, neu, neg = [], [], []
for chunk in tqdm(list(batched(absa["sent_text"].tolist())), desc="[ABSA] sentimiento"):
    sc = absa_sent_predict(chunk)                # <- wrapper ya cargado
    pos += [d["POS"] for d in sc]
    neu += [d["NEU"] for d in sc]
    neg += [d["NEG"] for d in sc]
absa["pos"], absa["neu"], absa["neg"] = pos, neu, neg
absa["polarity"] = absa["pos"] - absa["neg"]

emo_all = []
for chunk in tqdm(list(batched(absa["sent_text"].tolist())), desc="[ABSA] emociones"):
    emo_all += absa_emo_predict(chunk)           # <- wrapper ya cargado

# emociones: normalizamos claves y rellenamos faltantes
emo_keys = sorted({k for d in emo_all for k in d.keys()})
for k in emo_keys:
    absa[k] = [d.get(k, 0.0) for d in emo_all]

# ---------- 4) Agregados por candidato × aspecto ----------
agg_pol = (absa
           .groupby(["candidate","aspect"])
           .agg(n=("polarity","size"),
                polarity_mean=("polarity","mean"),
                polarity_std=("polarity","std"),
                pos_mean=("pos","mean"),
                neg_mean=("neg","mean"))
           .reset_index()
           .sort_values(["candidate","polarity_mean"], ascending=[True, False]))

emo_cols = [k for k in emo_keys if k != "others"]
agg_emo = (absa
           .groupby(["candidate","aspect"])[emo_cols]
           .mean()
           .reset_index())

# fragmentos extremos
def extremos(df, k=5):
    top = df.nlargest(k, "polarity")[["filename","page","sent_text","polarity"]].copy()
    bot = df.nsmallest(k, "polarity")[["filename","page","sent_text","polarity"]].copy()
    top["which"]="máx"; bot["which"]="mín"
    return pd.concat([top,bot], ignore_index=True)

ext_rows = []
for (cand, asp), grp in absa.groupby(["candidate","aspect"]):
    ex = extremos(grp, k=5)
    ex.insert(0,"candidate",cand); ex.insert(1,"aspect",asp)
    ext_rows.append(ex)
extreme_table = pd.concat(ext_rows, ignore_index=True)

# ---------- 5) Visualizaciones (Plotly) ----------
# 5.1 Heatmap de polaridad media
pv = agg_pol.pivot(index="candidate", columns="aspect", values="polarity_mean").fillna(0.0)
asp_order = sorted(pv.columns.tolist())
zmax = float(np.abs(pv.values).max()) or 1.0
fig_hm = px.imshow(
    pv[asp_order].to_numpy(),
    x=asp_order, y=pv.index.tolist(),
    color_continuous_scale="RdBu_r", zmin=-zmax, zmax=zmax,
    labels=dict(x="Aspecto", y="Candidato", color="Polaridad media"),
    title="ABSA — Polaridad media por candidato × aspecto"
)
fig_hm.update_xaxes(side="top"); fig_hm.show()

# 5.2 Barras por candidato (dropdown)
fig_bar = go.Figure()
cands = sorted(absa["candidate"].unique().tolist())
if cands:
    c0 = cands[0]
    data0 = agg_pol[agg_pol["candidate"]==c0].sort_values("polarity_mean")
    fig_bar.add_trace(go.Bar(x=data0["aspect"], y=data0["polarity_mean"], name=c0))
    btn=[]
    for c in cands:
        d = agg_pol[agg_pol["candidate"]==c].sort_values("polarity_mean")
        btn.append(dict(label=c, method="update",
                        args=[{"x":[d["aspect"]], "y":[d["polarity_mean"]]},
                              {"title": f"ABSA — Polaridad media por aspecto ({c})"}]))
    fig_bar.update_layout(title=f"ABSA — Polaridad media por aspecto ({c0})",
                          xaxis_title="Aspecto", yaxis_title="Polaridad (pos - neg)",
                          updatemenus=[dict(type="dropdown", x=1.02, y=1.0, buttons=btn, showactive=True)],
                          showlegend=False)
fig_bar.show()

# 5.3 Trayectorias emocionales (candidato × aspecto × emoción)
def trajectory(df, emotion="anger", win=12):
    d = df.sort_values(["page","sent_id"]).reset_index(drop=True)
    y = d[emotion].rolling(win, min_periods=max(1,win//2)).mean()
    return pd.DataFrame({"idx":np.arange(len(d)),
                         "page":d["page"].values,
                         "sent_id":d["sent_id"].values,
                         "value":y.values,
                         "text":d["sent_text"].values})

EMO_SHOW = [e for e in emo_cols if e in {"anger","fear","joy","sadness","disgust","surprise"}] or emo_cols[:6]
asp0 = asp_order[0] if asp_order else list(ASPECTS.keys())[0]
cand0 = cands[0] if cands else None
emo0  = EMO_SHOW[0] if EMO_SHOW else None

def build_trace(cand, emo, asp):
    sub = absa[(absa["candidate"]==cand) & (absa["aspect"]==asp)]
    if sub.empty: return go.Scatter(x=[],y=[],mode="lines")
    tr = trajectory(sub, emotion=emo, win=12)
    return go.Scatter(x=tr["idx"], y=tr["value"], mode="lines",
                      name=f"{cand}-{asp}", hovertext=tr["text"], hoverinfo="text+x+y")

fig_traj = go.Figure()
if cand0 and emo0 and asp0:
    fig_traj.add_trace(build_trace(cand0, emo0, asp0))

buttons=[]
for c in cands:
    for e in EMO_SHOW:
        for a in asp_order:
            t = build_trace(c, e, a)
            buttons.append(dict(label=f"{c} | {e} | {a}", method="update",
                                args=[{"data":[t]},
                                      {"title":f"Trayectoria emocional — {c} | emoción={e} | aspecto={a}"}]))
fig_traj.update_layout(title=f"Trayectoria emocional — {cand0} | emoción={emo0} | aspecto={asp0}",
                       xaxis_title="Índice de oración (orden por página)",
                       yaxis_title="Media móvil prob(emoción)",
                       updatemenus=[dict(type="dropdown", x=1.02, y=1.0, buttons=buttons, showactive=True)],
                       showlegend=False)
fig_traj.show()

# ---------- 6) Entregables en memoria ----------
from IPython.display import display
tqdm.write("[ABSA] listo — tablas y figuras generadas")
display(agg_pol.head(20))
display(agg_emo.head(20))
display(extreme_table.head(12))


[ABSA] páginas: 542


[ABSA] sentencizar:   0%|          | 0/542 [00:00<?, ?it/s]

[ABSA] oraciones: 9,270


[ABSA] detectar aspectos:   0%|          | 0/9270 [00:00<?, ?it/s]

[ABSA] sentencias con aspecto: 2,204


[ABSA] sentimiento:   0%|          | 0/18 [00:00<?, ?it/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

[ABSA] emociones:   0%|          | 0/18 [00:00<?, ?it/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

Map:   0%|          | 0/28 [00:00<?, ? examples/s]

[ABSA] listo — tablas y figuras generadas


Unnamed: 0,candidate,aspect,n,polarity_mean,polarity_std,pos_mean,neg_mean
2,artes,educ_superior,7,-0.052052,0.440677,0.168368,0.22042
12,artes,vivienda,23,-0.106331,0.48946,0.203254,0.309585
9,artes,pymes,8,-0.152765,0.403851,0.119108,0.271873
3,artes,energía,53,-0.168853,0.529634,0.212563,0.381416
11,artes,transporte,6,-0.213392,0.61224,0.174744,0.388136
0,artes,carabineros,6,-0.255261,0.558134,0.19846,0.453721
8,artes,pensiones,26,-0.342034,0.356134,0.067695,0.409729
4,artes,gendarmería,6,-0.392928,0.524446,0.08647,0.479398
5,artes,impuestos,49,-0.49455,0.450709,0.078085,0.572635
6,artes,medioambiente,5,-0.568721,0.326823,0.043374,0.612095


Unnamed: 0,candidate,aspect,anger,disgust,fear,joy,sadness,surprise
0,artes,carabineros,0.106639,0.008459,0.003676,0.006638,0.045165,0.003095
1,artes,delincuencia,0.423883,0.06191,0.004324,0.003003,0.05717,0.001271
2,artes,educ_superior,0.093544,0.005161,0.001194,0.002973,0.010474,0.001048
3,artes,energía,0.096003,0.011765,0.003757,0.005186,0.02752,0.002368
4,artes,gendarmería,0.100155,0.010938,0.003373,0.001429,0.023022,0.001445
5,artes,impuestos,0.191364,0.019036,0.006342,0.010365,0.042183,0.002413
6,artes,medioambiente,0.270801,0.019198,0.00225,0.001594,0.030515,0.001239
7,artes,migración,0.325708,0.046173,0.016294,0.003266,0.11409,0.00248
8,artes,pensiones,0.105241,0.011972,0.001256,0.002218,0.015851,0.001534
9,artes,pymes,0.001952,0.000538,0.000999,0.001902,0.003569,0.001476


Unnamed: 0,candidate,aspect,filename,page,sent_text,polarity,which
0,artes,carabineros,artes.pdf,35,policial de las fuerzas de orden refundadas y ...,0.479099,máx
1,artes,carabineros,artes.pdf,34,Desde el\nMinisterio del Interior se deberá fo...,0.235758,máx
2,artes,carabineros,artes.pdf,34,• Las fuerzas policiales refundadas deberán te...,-0.028991,máx
3,artes,carabineros,artes.pdf,36,• Retiro urgente de todas las fuerzas militare...,-0.613858,máx
4,artes,carabineros,artes.pdf,7,"Por eso, nos\nsumamos al reclamo generalizado ...",-0.79894,máx
5,artes,carabineros,artes.pdf,24,• La destitución de los magistrados de la Cort...,-0.804631,mín
6,artes,carabineros,artes.pdf,7,"Por eso, nos\nsumamos al reclamo generalizado ...",-0.79894,mín
7,artes,carabineros,artes.pdf,36,• Retiro urgente de todas las fuerzas militare...,-0.613858,mín
8,artes,carabineros,artes.pdf,34,• Las fuerzas policiales refundadas deberán te...,-0.028991,mín
9,artes,carabineros,artes.pdf,34,Desde el\nMinisterio del Interior se deberá fo...,0.235758,mín


In [4]:
# %% MÓDULO 5 — NER + grafos de actores/territorios por candidato
import pandas as pd, numpy as np, re
from pathlib import Path
from tqdm.auto import tqdm
import spacy, networkx as nx
import plotly.graph_objects as go

# ---------- 0) Carga + oraciones (reutiliza sents si existe) ----------
PAGES = Path("pages.parquet")
DOCS  = Path("documents.parquet")
assert PAGES.exists() or DOCS.exists()

if "sents" in globals():
    tqdm.write("[M5] reutilizando sents del módulo previo")
    sents_m5 = sents.copy()
else:
    tqdm.write("[M5] construyendo sents desde páginas")
    if PAGES.exists():
        df_pages = pd.read_parquet(PAGES)
        assert {"candidate","filename","page","text"}.issubset(df_pages.columns)
    else:
        df_docs = pd.read_parquet(DOCS)
        df_pages = df_docs.assign(page=1)
    df_pages["text"] = df_pages["text"].fillna("")

    try:
        nlp_m5 = spacy.load("es_core_news_lg")
    except Exception:
        try:
            nlp_m5 = spacy.load("es_core_news_md")
        except Exception:
            nlp_m5 = spacy.blank("es")
            if "sentencizer" not in nlp_m5.pipe_names:
                nlp_m5.add_pipe("sentencizer")

    rows = []
    for _, r in tqdm(df_pages.iterrows(), total=len(df_pages), desc="[M5] sentencizar"):
        doc = nlp_m5(r["text"])
        for i, s in enumerate(doc.sents):
            st = s.text.strip()
            if st:
                rows.append({
                    "candidate": r["candidate"],
                    "filename":  r["filename"],
                    "page":      r["page"],
                    "sent_id":   i,
                    "sent_text": st
                })
    sents_m5 = pd.DataFrame(rows)

tqdm.write(f"[M5] oraciones: {len(sents_m5):,}")

# ---------- 1) NER con spaCy ----------
try:
    nlp_ner = spacy.load("es_core_news_lg")
except Exception:
    nlp_ner = spacy.load("es_core_news_md")

ENT_TYPES = {"PER","ORG","LOC","GPE","NORP"}  # actores, instituciones, colectivos, territorios

ner_rows = []
for _, r in tqdm(sents_m5.iterrows(), total=len(sents_m5), desc="[M5] NER"):
    doc = nlp_ner(r["sent_text"])
    ents = [e for e in doc.ents if e.label_ in ENT_TYPES]
    if not ents:
        continue
    for e in ents:
        ner_rows.append({
            "candidate": r["candidate"],
            "filename":  r["filename"],
            "page":      r["page"],
            "sent_id":   r["sent_id"],
            "sent_text": r["sent_text"],
            "ent_text":  e.text.strip(),
            "ent_label": e.label_
        })

ner_df = pd.DataFrame(ner_rows)
tqdm.write(f"[M5] entidades extraídas: {len(ner_df):,}")

# ---------- 2) Grafo de co-ocurrencia por candidato ----------
# Co-ocurrencia: entidades que aparecen en la misma oración
def build_graph_for_candidate(cand_df, min_freq=2):
    G = nx.Graph()
    # contar frecuencia individual
    freq = cand_df["ent_text"].value_counts()
    # nodos
    for ent, f in freq.items():
        if f < min_freq:
            continue
        G.add_node(ent, freq=f)
    # aristas por co-ocurrencia en la misma oración
    for (fname, page, sid), grp in cand_df.groupby(["filename","page","sent_id"]):
        ents = [e for e in grp["ent_text"].unique() if e in G.nodes]
        if len(ents) < 2:
            continue
        for i in range(len(ents)):
            for j in range(i+1, len(ents)):
                u, v = ents[i], ents[j]
                if G.has_edge(u, v):
                    G[u][v]["weight"] += 1
                else:
                    G.add_edge(u, v, weight=1)
    if G.number_of_edges() == 0:
        return G, pd.DataFrame(), pd.DataFrame()
    # centralidades
    deg = nx.degree_centrality(G)
    btw = nx.betweenness_centrality(G, weight="weight", normalized=True)
    cen_df = pd.DataFrame({
        "entity": list(G.nodes()),
        "freq":   [G.nodes[n].get("freq", freq.get(n,0)) for n in G.nodes()],
        "deg":    [deg[n] for n in G.nodes()],
        "betw":   [btw[n] for n in G.nodes()]
    }).sort_values("deg", ascending=False)
    # listado de aristas
    edges_df = pd.DataFrame([
        {"source":u, "target":v, "weight":d.get("weight",1)}
        for u,v,d in G.edges(data=True)
    ]).sort_values("weight", ascending=False)
    return G, cen_df, edges_df

graphs = {}
for cand, grp in ner_df.groupby("candidate"):
    Gc, cen, ed = build_graph_for_candidate(grp, min_freq=2)
    graphs[cand] = {"G":Gc, "centrality":cen, "edges":ed}
    tqdm.write(f"[M5] {cand}: nodos={Gc.number_of_nodes()}, aristas={Gc.number_of_edges()}")

# ---------- 3) Tabla top entidades por candidato ----------
top_entities = []
for cand, obj in graphs.items():
    cen = obj["centrality"]
    if cen.empty: 
        continue
    top = cen.head(20).copy()
    top.insert(0, "candidate", cand)
    top_entities.append(top)
top_entities_df = pd.concat(top_entities, ignore_index=True) if top_entities else pd.DataFrame()
display(top_entities_df.head(40))

# ---------- 4) Grafo interactivo Plotly (un candidato a la vez) ----------
def plot_graph_candidate(cand, k=0.5):
    obj = graphs.get(cand)
    if obj is None or obj["centrality"].empty:
        print(f"[M5] Sin grafo significativo para {cand}")
        return
    G = obj["G"]
    pos = nx.spring_layout(G, k=k, weight="weight", seed=42)
    nodes = list(G.nodes())
    x = [pos[n][0] for n in nodes]
    y = [pos[n][1] for n in nodes]
    sizes = [5 + 30 * obj["centrality"].set_index("entity").loc[n,"deg"] for n in nodes]
    # aristas
    edge_x, edge_y = [], []
    for u,v in G.edges():
        edge_x += [pos[u][0], pos[v][0], None]
        edge_y += [pos[u][1], pos[v][1], None]
    edge_trace = go.Scatter(
        x=edge_x, y=edge_y,
        mode="lines", line=dict(width=0.5, color="rgba(150,150,150,0.5)"),
        hoverinfo="none", showlegend=False
    )
    node_trace = go.Scatter(
        x=x, y=y, mode="markers+text",
        text=nodes, textposition="top center",
        marker=dict(size=sizes, color="rgba(31,119,180,0.8)"),
        hovertext=[f"{n}<br>freq={G.nodes[n].get('freq',0)}" for n in nodes],
        hoverinfo="text", name=f"Grafo actores/territorios — {cand}"
    )
    fig = go.Figure(data=[edge_trace, node_trace])
    fig.update_layout(
        title=f"Grafo de actores / territorios — {cand}",
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        showlegend=False
    )
    fig.show()

# Ejemplo de uso:
# plot_graph_candidate("kast")


[M5] reutilizando sents del módulo previo
[M5] oraciones: 9,270


[M5] NER:   0%|          | 0/9270 [00:00<?, ?it/s]

[M5] entidades extraídas: 5,738
[M5] artes: nodos=53, aristas=99
[M5] harold: nodos=24, aristas=7
[M5] jara: nodos=5, aristas=0
[M5] kaiser: nodos=288, aristas=450
[M5] kast: nodos=13, aristas=8
[M5] matthei: nodos=31, aristas=39
[M5] meo: nodos=27, aristas=27
[M5] parisi: nodos=169, aristas=251


Unnamed: 0,candidate,entity,freq,deg,betw
0,artes,Chile,144,0.596154,0.59399
1,artes,Gobierno Patriótico Popular,65,0.326923,0.266913
2,artes,Estado,88,0.153846,0.091755
3,artes,Unidad Popular,8,0.134615,0.051147
4,artes,golpe de Estado,2,0.134615,0.051147
5,artes,Cuba,4,0.134615,0.011804
6,artes,Bolivia,2,0.115385,0.000724
7,artes,Nicaragua,2,0.115385,0.0
8,artes,Venezuela,4,0.115385,0.0
9,artes,China,4,0.096154,0.011077


In [11]:
plot_graph_candidate("artes")

In [12]:
# %% MÓDULO 6 — Extracción de compromisos y tipo de política
import pandas as pd, numpy as np, re, math
from pathlib import Path
from tqdm.auto import tqdm
import spacy

from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# ---------- 0) Oraciones (reutilizar sents si existe) ----------
if "sents" in globals():
    sents_m6 = sents.copy()
elif "sents_m5" in globals():
    sents_m6 = sents_m5.copy()
else:
    PAGES = Path("pages.parquet"); DOCS = Path("documents.parquet")
    assert PAGES.exists() or DOCS.exists()
    if PAGES.exists():
        df_pages = pd.read_parquet(PAGES)
    else:
        df_docs = pd.read_parquet(DOCS)
        df_pages = df_docs.assign(page=1)
    df_pages["text"] = df_pages["text"].fillna("")

    try:
        nlp_s6 = spacy.load("es_core_news_lg")
    except Exception:
        try:
            nlp_s6 = spacy.load("es_core_news_md")
        except Exception:
            nlp_s6 = spacy.blank("es")
            if "sentencizer" not in nlp_s6.pipe_names:
                nlp_s6.add_pipe("sentencizer")
    rows = []
    for _, r in tqdm(df_pages.iterrows(), total=len(df_pages), desc="[M6] sentencizar"):
        doc = nlp_s6(r["text"])
        for i, s in enumerate(doc.sents):
            st = s.text.strip()
            if st:
                rows.append({"candidate":r["candidate"],"filename":r["filename"],
                             "page":r["page"],"sent_id":i,"sent_text":st})
    sents_m6 = pd.DataFrame(rows)

tqdm.write(f"[M6] oraciones: {len(sents_m6):,}")

# ---------- 1) Patrones propositivos (supervisados por léxico) ----------
# Verbos performativos/modales
ACTION_LEMMAS = {
    "proponer","proponeremos","propondremos","proponeremos",
    "crear","crearemos","establecer","estableceremos",
    "aumentar","disminuir","reducir","reformar","implementar","garantizar",
    "fortalecer","mejorar","promover","fomentar","eliminar","derogar","subir","bajar"
}
# patrones de texto adicionales
ACTION_REGEX = re.compile(
    r"(propondremos|proponemos|se propone|nos comprometemos|"
    r"crearemos|crearemos|se crear[aá]|vamos a\s+\w+)",
    flags=re.I
)

try:
    nlp_m6 = spacy.load("es_core_news_lg")
except Exception:
    nlp_m6 = spacy.load("es_core_news_md")

def is_propositive(doc):
    if ACTION_REGEX.search(doc.text):
        return True
    for t in doc:
        if t.pos_ == "VERB" and t.lemma_.lower() in ACTION_LEMMAS:
            return True
    return False

prop_rows = []
for _, r in tqdm(sents_m6.iterrows(), total=len(sents_m6), desc="[M6] detectar propositivas"):
    doc = nlp_m6(r["sent_text"])
    if not is_propositive(doc):
        continue
    prop_rows.append({**r})
prop_df = pd.DataFrame(prop_rows)
tqdm.write(f"[M6] candidatos a compromiso (sin NLI): {len(prop_df):,}")

# ---------- 2) NLI para filtrar verdaderos compromisos (opcional) ----------
# Modelo multinlingüe XNLI
try:
    nli_tok = AutoTokenizer.from_pretrained("joeddav/xlm-roberta-large-xnli")
    nli_mdl = AutoModelForSequenceClassification.from_pretrained("joeddav/xlm-roberta-large-xnli")
    nli_pipe = pipeline("text-classification", model=nli_mdl, tokenizer=nli_tok,
                        return_all_scores=True, truncation=True)
    HAS_NLI = True
    tqdm.write("[M6] NLI XLM-R cargado")
except Exception as e:
    HAS_NLI = False
    tqdm.write(f"[M6] NLI no disponible ({e}), se omite filtrado NLI.")

def nli_commit_prob(sent):
    """
    Probabilidad de que la oración exprese un compromiso/propuesta concreta.
    Hipótesis genérica: 'Esta frase expresa un compromiso de política pública.'
    """
    if not HAS_NLI:
        return 0.5
    premise = sent
    hypothesis = "Esta frase expresa un compromiso de política pública."
    res = nli_pipe(f"{premise} </s></s> {hypothesis}")[0]  # lista de dicts
    scores = {x["label"].lower(): x["score"] for x in res}
    # etiquetas típicas: 'entailment','neutral','contradiction'
    return float(scores.get("entailment", 0.0))

NLI_THRESH = 0.55 if HAS_NLI else 0.0  # si no hay NLI, no filtramos

nli_scores = []
for txt in tqdm(prop_df["sent_text"].tolist(), desc="[M6] NLI scoring", disable=not HAS_NLI):
    nli_scores.append(nli_commit_prob(txt))
prop_df["nli_score"] = nli_scores
if HAS_NLI:
    prop_df = prop_df[prop_df["nli_score"] >= NLI_THRESH].reset_index(drop=True)
    tqdm.write(f"[M6] compromisos tras NLI (≥{NLI_THRESH}): {len(prop_df):,}")

# ---------- 3) Extracción (acción, objeto, cuantificador) ----------
def extract_triplet(text):
    doc = nlp_m6(text)
    # acción = verbo raíz/finito más saliente
    verb = None
    for t in doc:
        if t.pos_ == "VERB" and t.dep_ in {"ROOT","ccomp","xcomp"}:
            verb = t; break
    if verb is None:
        for t in doc:
            if t.pos_ == "VERB":
                verb = t; break
    action_lemma = verb.lemma_ if verb is not None else ""
    # objeto: subárbol del objeto directo / complemento
    obj_span = ""
    quant = ""
    if verb is not None:
        objs = [c for c in verb.children if c.dep_ in {"obj","dobj","obl","iobj","nsubj:pass","nsubjpass"}]
        if objs:
            head = objs[0]
            # expandimos a la frase nominal
            tokens = [t for t in head.subtree]
            tokens = sorted(tokens, key=lambda x: x.i)
            obj_span = doc[tokens[0].i : tokens[-1].i+1].text
    # cuantificador: números, porcentajes, montos
    nums = [t for t in doc if t.like_num or t.text.endswith("%")]
    if nums:
        # tomamos la primera frase que contiene el número
        n = nums[0]
        q_tokens = [t for t in n.subtree]
        q_tokens = sorted(q_tokens, key=lambda x: x.i)
        quant = doc[q_tokens[0].i : q_tokens[-1].i+1].text
    return action_lemma, obj_span.strip(), quant.strip()

triplets = [extract_triplet(t) for t in tqdm(prop_df["sent_text"].tolist(), desc="[M6] extraer tripletas")]
prop_df["action_lemma"] = [a for a,_,_ in triplets]
prop_df["object"]       = [o for _,o,_ in triplets]
prop_df["quantifier"]   = [q for *_,q in triplets]

# ---------- 4) Tipo de política (reglas heurísticas) ----------
def classify_policy(action, obj, text):
    t = f"{action} {obj} {text}".lower()
    if re.search(r"impuest|tribut|iva|contribuci[oó]n|subsidio|bono|transferencia", t):
        return "fiscal"
    if re.search(r"obligatori|prohibir|regular|regulaci[oó]n|ley|norma|c[oó]digo", t):
        return "regulatoria"
    if re.search(r"programa|plan|servicio|pol[ií]tica p[uú]blica|beneficio", t):
        return "programática"
    if re.search(r"ministerio|agencia|instituci[oó]n|superintendencia|organismo|ente", t):
        return "institucional"
    return "otro"

prop_df["policy_type"] = [
    classify_policy(a, o, txt) for a,o,txt in zip(prop_df["action_lemma"], prop_df["object"], prop_df["sent_text"])
]

# ---------- 5) Catálogo de compromisos + tablas resumen ----------
commitments = prop_df[[
    "candidate","filename","page","sent_id","sent_text",
    "action_lemma","object","quantifier","policy_type","nli_score"
]].copy()

display(commitments.head(30))

# mix por candidato x tipo
mix_pol = (commitments
           .groupby(["candidate","policy_type"])
           .size().reset_index(name="n"))
mix_pol["pct"] = mix_pol.groupby("candidate")["n"].transform(lambda x: x/x.sum()*100)
display(mix_pol)

# ---------- 6) Visualizaciones (Plotly) ----------
import plotly.express as px

# Barras apiladas candidato × tipo de política
fig_mix = px.bar(
    mix_pol, x="candidate", y="pct", color="policy_type",
    title="Distribución de compromisos por tipo de política",
    labels={"pct":"% compromisos","candidate":"Candidato","policy_type":"Tipo de política"},
    barmode="stack"
)
fig_mix.show()

# Treemap por candidato (volumen de compromisos)
fig_tree = px.treemap(
    mix_pol, path=["candidate","policy_type"], values="n",
    title="Catálogo de compromisos por candidato y tipo"
)
fig_tree.show()

# commitments queda en memoria para usar en módulos 8 y 9
tqdm.write("[M6] catálogo de compromisos disponible en 'commitments'")


[M6] oraciones: 9,270


[M6] detectar propositivas:   0%|          | 0/9270 [00:00<?, ?it/s]

[M6] candidatos a compromiso (sin NLI): 1,706
[M6] NLI XLM-R cargado



`return_all_scores` is now deprecated,  if want a similar functionality use `top_k=None` instead of `return_all_scores=True` or `top_k=1` instead of `return_all_scores=False`.



[M6] NLI scoring:   0%|          | 0/1706 [00:00<?, ?it/s]

[M6] compromisos tras NLI (≥0.55): 929


[M6] extraer tripletas:   0%|          | 0/929 [00:00<?, ?it/s]

Unnamed: 0,candidate,filename,page,sent_id,sent_text,action_lemma,object,quantifier,policy_type,nli_score
0,artes,artes.pdf,2,3,Como consecuencia\nmisma del desarrollo del ca...,aumentar,Como consecuencia\nmisma del desarrollo del ca...,,fiscal,0.880165
1,artes,artes.pdf,2,26,Una industria nacional sólida y orientada a la...,fomentar,el trabajo,,institucional,0.997149
2,artes,artes.pdf,3,30,"Para ello, consideramos altamente necesario qu...",considerar,"Para ello,",,institucional,0.995851
3,artes,artes.pdf,5,19,Porque permiten a la patria altos grados de in...,permitir,a la patria,,institucional,0.99925
4,artes,artes.pdf,8,0,PROGRAMA DE PATRIA NUEV A Y POPULAR\nPrograma\...,realizar él,PROGRAMA DE PATRIA NUEV A Y POPULAR\nPrograma,,fiscal,0.999002
5,artes,artes.pdf,8,13,"En conclusión, los comunistas creemos que las ...",creer,"En conclusión,",,institucional,0.989972
6,artes,artes.pdf,8,17,• Establecer relaciones diplomáticas basadas e...,establecer,relaciones diplomáticas basadas en el respeto ...,,institucional,0.883834
7,artes,artes.pdf,8,18,• Establecer relaciones diplomáticas basadas e...,establecer,relaciones diplomáticas basadas en la solidari...,,institucional,0.997392
8,artes,artes.pdf,9,2,"• El Gobierno Patriótico Popular, a través de ...",intentar,,,institucional,0.99892
9,artes,artes.pdf,9,6,• El Gobierno Patriótico Popular establecerá r...,establecer,"relaciones\ndiplomáticas, de solidaridad, de l...",,institucional,0.999083


Unnamed: 0,candidate,policy_type,n,pct
0,artes,fiscal,15,21.428571
1,artes,institucional,27,38.571429
2,artes,otro,19,27.142857
3,artes,programática,7,10.0
4,artes,regulatoria,2,2.857143
5,harold,fiscal,29,32.222222
6,harold,institucional,19,21.111111
7,harold,otro,32,35.555556
8,harold,programática,9,10.0
9,harold,regulatoria,1,1.111111


[M6] catálogo de compromisos disponible en 'commitments'


In [13]:
# %% MÓDULO 7 — Framing (moral foundations) por candidato
import pandas as pd, numpy as np, re
from tqdm.auto import tqdm
import plotly.express as px
import plotly.graph_objects as go

# ---------- 0) Texto base ----------
# Usaremos sents_m6 (oraciones) como unidad
base_sents = sents_m6.copy()
base_sents["sent_text"] = base_sents["sent_text"].fillna("")

# ---------- 1) Lexicón básico de marcos (expandible) ----------
MF_LEX = {
    "cuidado": [
        r"cuidar", r"cuidado", r"derecho a la salud", r"protecci[oó]n social",
        r"solidaridad", r"apoyo", r"inclusi[oó]n"
    ],
    "equidad": [
        r"justicia social", r"equidad", r"igualdad", r"redistribuci[oó]n",
        r"desigualdad", r"fair", r"imparcial"
    ],
    "libertad": [
        r"libertad", r"libre elecci[oó]n", r"autonom[ií]a", r"derechos individuales",
        r"libre mercado"
    ],
    "autoridad": [
        r"autoridad", r"orden", r"disciplina", r"obediencia", r"respeto a la ley",
        r"seguridad", r"mano dura"
    ],
    "lealtad": [
        r"patria", r"naci[oó]n", r"lealtad", r"unidad nacional", r"cohesi[oó]n",
        r"nuestro pueblo"
    ],
    "pureza": [
        r"corrupci[oó]n", r"moral", r"valores tradicionales", r"familia tradicional",
        r"decencia"
    ]
}
MF_RX = {k: re.compile("|".join(v), flags=re.I) for k,v in MF_LEX.items()}
MF_LABELS = list(MF_LEX.keys())

def mf_scores(text):
    t = text.lower()
    return {k: int(bool(rx.search(t))) for k,rx in MF_RX.items()}

mf_rows = []
for _, r in tqdm(base_sents.iterrows(), total=len(base_sents), desc="[M7] scoring MF"):
    sc = mf_scores(r["sent_text"])
    row = dict(r)
    for k in MF_LABELS:
        row[k] = sc.get(k, 0)
    mf_rows.append(row)

mf_df = pd.DataFrame(mf_rows)

# ---------- 2) Agregación por candidato ----------
agg_mf = (mf_df
          .groupby("candidate")[MF_LABELS]
          .sum()
          .reset_index())
# normalizamos a proporciones
tot = agg_mf[MF_LABELS].sum(axis=1)
for k in MF_LABELS:
    agg_mf[k] = agg_mf[k] / tot.replace(0,np.nan)

display(agg_mf)

# ---------- 3) Radar comparativo por candidato ----------
radar_rows = []
for _, r in agg_mf.iterrows():
    for k in MF_LABELS:
        radar_rows.append({"candidate":r["candidate"], "frame":k, "value":r[k]})

radar_df = pd.DataFrame(radar_rows)

fig_rad = px.line_polar(
    radar_df, r="value", theta="frame", color="candidate",
    line_close=True,
    title="Framing — Peso relativo de marcos morales por candidato",
    range_r=[0, radar_df["value"].max() or 1]
)
fig_rad.update_traces(fill="toself", opacity=0.5)
fig_rad.show()

# ---------- 4) Fragmentos más representativos por marco/candidato ----------
top_fragments = []
for cand, grp_c in mf_df.groupby("candidate"):
    for f in MF_LABELS:
        sub = grp_c[grp_c[f] > 0]
        # orden aproximada: páginas/posición
        sub = sub.sort_values(["page","sent_id"]).head(5)
        for _, r in sub.iterrows():
            top_fragments.append({
                "candidate": cand,
                "frame": f,
                "filename": r["filename"],
                "page": r["page"],
                "sent_id": r["sent_id"],
                "sent_text": r["sent_text"]
            })
top_frag_df = pd.DataFrame(top_fragments)
display(top_frag_df.head(40))

tqdm.write("[M7] framing listo — radar y fragmentos representativos en 'top_frag_df'")


[M7] scoring MF:   0%|          | 0/9270 [00:00<?, ?it/s]

Unnamed: 0,candidate,cuidado,equidad,libertad,autoridad,lealtad,pureza
0,artes,0.058201,0.021164,0.05291,0.097884,0.746032,0.02381
1,harold,0.165179,0.133929,0.075893,0.241071,0.334821,0.049107
2,jara,0.152174,0.173913,0.108696,0.173913,0.304348,0.086957
3,kaiser,0.09009,0.070946,0.167793,0.156532,0.484234,0.030405
4,kast,0.104575,0.03268,0.169935,0.385621,0.27451,0.03268
5,matthei,0.217228,0.041199,0.048689,0.307116,0.348315,0.037453
6,meo,0.072917,0.145833,0.03125,0.208333,0.53125,0.010417
7,parisi,0.130081,0.01897,0.0271,0.214092,0.590786,0.01897


Unnamed: 0,candidate,frame,filename,page,sent_id,sent_text
0,artes,cuidado,artes.pdf,3,7,"Controlan la\neconomía, la prensa y otros medi..."
1,artes,cuidado,artes.pdf,3,30,"Para ello, consideramos altamente necesario qu..."
2,artes,cuidado,artes.pdf,6,3,"Sin embargo, especialmente la generación de el..."
3,artes,cuidado,artes.pdf,6,4,Este hecho puede representar\nun significativo...
4,artes,cuidado,artes.pdf,7,0,colaboración y de solidaridad entre ellos.
5,artes,equidad,artes.pdf,5,11,Vemos necesarias una fuerte participación del\...
6,artes,equidad,artes.pdf,9,20,• Rechazamos categóricamente la discriminación...
7,artes,equidad,artes.pdf,16,12,Las grandes empresas agrícolas privadas que es...
8,artes,equidad,artes.pdf,18,16,• El gobierno debe incluir medidas que adecúen...
9,artes,equidad,artes.pdf,25,16,Existe una profunda\ndesigualdad entre la cali...


[M7] framing listo — radar y fragmentos representativos en 'top_frag_df'


In [1]:
# %% MÓDULO 8 — Afinidad por candidato (BERT embeddings)
import pandas as pd, numpy as np
from pathlib import Path
from tqdm.auto import tqdm
import plotly.express as px
import plotly.graph_objects as go

from sentence_transformers import SentenceTransformer, util

DOCS = Path("documents.parquet")
assert DOCS.exists(), "Se requiere documents.parquet para afinidad a nivel programa"

docs = pd.read_parquet(DOCS)
assert {"candidate","filename","text"}.issubset(docs.columns)
docs["text"] = docs["text"].fillna("")

# ---------- 1) Embeddings SBERT ----------
MODEL_NAME = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
tqdm.write(f"[M8] cargando modelo SBERT: {MODEL_NAME}")
sbert = SentenceTransformer(MODEL_NAME)

doc_texts = docs["text"].tolist()
Z_docs = sbert.encode(doc_texts, batch_size=32, show_progress_bar=True, convert_to_tensor=True)

docs_emb = docs.copy()
docs_emb["emb_idx"] = np.arange(len(docs_emb))

# embedding por candidato = promedio de sus documentos
cand_embs = {}
for cand, grp in docs_emb.groupby("candidate"):
    idx = grp["emb_idx"].values
    cand_embs[cand] = Z_docs[idx].mean(dim=0)

candidates = sorted(cand_embs.keys())
C = len(candidates)
M_cand = np.zeros((C,C))
for i, ci in enumerate(candidates):
    for j, cj in enumerate(candidates):
        M_cand[i,j] = util.cos_sim(cand_embs[ci], cand_embs[cj]).item()

# ---------- 2) Heatmap de similitud entre candidatos ----------
fig_cand_sim = px.imshow(
    M_cand, x=candidates, y=candidates,
    color_continuous_scale="Viridis",
    zmin=0, zmax=1,
    labels={"x":"Candidato", "y":"Candidato", "color":"Similitud coseno"},
    title="Afinidad programática — similitud entre candidatos (SBERT)"
)
fig_cand_sim.update_xaxes(side="top")
fig_cand_sim.show()

# ---------- 3) Afinidad candidato × temas (macro categorías) ----------
THEMES = [
    "Economía y trabajo",
    "Salud y seguridad social",
    "Educación y cultura",
    "Seguridad y delincuencia",
    "Medioambiente y cambio climático",
    "Género y diversidades",
    "Instituciones y reforma del Estado",
    "Desarrollo productivo e innovación",
    "Cabotaje Nacional y transporte",
    "Vivienda y urbanismo",
    "Minería y recursos naturales",
    "Política internacional y relaciones exteriores",
    "Pensiones",
    "PRAIS",
    "Financiamiento educacional",
    "CAE o Crédito Universitario",
    "Jubilación de personas que no tienen fondos previsionales"
]

theme_embs = sbert.encode(THEMES, batch_size=8, show_progress_bar=False, convert_to_tensor=True)

M_ct = np.zeros((C, len(THEMES)))
for i, ci in enumerate(candidates):
    for j in range(len(THEMES)):
        M_ct[i,j] = util.cos_sim(cand_embs[ci], theme_embs[j]).item()

# Heatmap candidato × tema
fig_ct = px.imshow(
    M_ct,
    x=THEMES, y=candidates,
    color_continuous_scale="Plasma",
    zmin=float(M_ct.min()), zmax=float(M_ct.max()),
    labels={"x":"Tema", "y":"Candidato", "color":"Similitud coseno"},
    title="Afinidad candidato × temas macro (SBERT)"
)
fig_ct.update_xaxes(side="top")
fig_ct.show()

# Radar por candidato
radar_rows = []
for i, c in enumerate(candidates):
    for j, t in enumerate(THEMES):
        radar_rows.append({"candidate":c, "theme":t, "value":M_ct[i,j]})
radar_df = pd.DataFrame(radar_rows)

fig_rad_ct = px.line_polar(
    radar_df, r="value", theta="theme", color="candidate",
    line_close=True,
    title="Perfil temático — afinidad con temas macro por candidato"
)
fig_rad_ct.update_traces(fill="toself", opacity=0.4)
fig_rad_ct.show()

tqdm.write("[M8] afinidad candidata ↔ candidatas y candidata ↔ temas generada")



[M8] cargando modelo SBERT: sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[M8] afinidad candidata ↔ candidatas y candidata ↔ temas generada


In [2]:
# %% MÓDULO 9 — Compromisos + Coherencia / contradicciones internas (NLI)
import re, itertools
import pandas as pd, numpy as np, math
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import spacy
import plotly.express as px

tqdm.write("[M9] Inicio módulo integral (pages → compromisos → NLI)")

# ============================================================
# 1) Carga de páginas y segmentación en oraciones
# ============================================================
PAGES_FILE = "pages.parquet"   # ajusta si está en otro path

pages = pd.read_parquet(PAGES_FILE)
expected_cols = {"candidate","filename","page","text"}
missing = expected_cols - set(pages.columns)
if missing:
    raise ValueError(f"[M9] Faltan columnas en pages.parquet: {missing}")

tqdm.write(f"[M9] pages.shape={pages.shape}")

# spaCy sólo para segmentar oraciones (y NER si quieres reutilizar luego)
try:
    nlp_s = spacy.load("es_core_news_lg")
except OSError:
    try:
        nlp_s = spacy.load("es_core_news_md")
    except OSError:
        nlp_s = spacy.blank("es")
        nlp_s.add_pipe("sentencizer")

tqdm.write(f"[M9] spaCy cargado para segmentación: {nlp_s.lang}")

sent_records = []
for _, row in tqdm(pages.iterrows(), total=len(pages), desc="[M9] segmentando oraciones"):
    txt = row["text"]
    if not isinstance(txt, str) or not txt.strip():
        continue
    doc = nlp_s(txt)
    for sid, sent in enumerate(doc.sents):
        st = sent.text.strip()
        if not st:
            continue
        sent_records.append({
            "candidate": row["candidate"],
            "filename":  row["filename"],
            "page":      row["page"],
            "sent_id":   sid,
            "sent_text": st,
        })

sent_df = pd.DataFrame(sent_records)
tqdm.write(f"[M9] sent_df.shape={sent_df.shape}")

# ============================================================
# 2) Heurística de compromisos/propuestas → DataFrame 'commitments'
# ============================================================
VERB_PATTERNS = [
    r"propondr[ea]mos?", r"proponemos", r"propondremos",
    r"crearemos?", r"implementaremos?", r"impulsaremos?",
    r"reformaremos?", r"reformar", r"modernizaremos?",
    r"aumentaremos?", r"reduciremos?", r"disminuiremos?",
    r"garantizaremos?", r"aseguraremos?", r"fortaleceremos?",
    r"estableceremos?", r"desarrollaremos?", r"promoveremos?",
    r"apoyaremos?", r"subiremos?", r"bajaremos?"
]
pattern_commit = re.compile(r"\b(" + "|".join(VERB_PATTERNS) + r")\b", re.IGNORECASE)

MIN_LEN = 40  # filtrar frases demasiado cortas

def is_commitment(s):
    if not isinstance(s, str):
        return False
    s2 = s.strip()
    if len(s2) < MIN_LEN:
        return False
    return bool(pattern_commit.search(s2))

sent_df["is_commitment"] = sent_df["sent_text"].apply(is_commitment)
commitments = (
    sent_df[sent_df["is_commitment"]]
    .drop(columns=["is_commitment"])
    .reset_index(drop=True)
)

tqdm.write(f"[M9] commitments.shape={commitments.shape}")
if commitments.empty:
    raise RuntimeError("[M9] No se detectaron compromisos con la heurística actual. Ajusta VERB_PATTERNS/MIN_LEN.")

# ============================================================
# 3) Modelo NLI (más liviano y sólo CPU para evitar crashes)
# ============================================================
MODEL_NLI = "joeddav/xlm-roberta-large-xnli"  # base (no large) para reducir RAM
DEVICE = 0                                  # forzar CPU; cambia a 0 si tu GPU está estable

try:
    nli_tok2 = AutoTokenizer.from_pretrained(MODEL_NLI)
    nli_mdl2 = AutoModelForSequenceClassification.from_pretrained(MODEL_NLI)
    nli2 = pipeline(
        "text-classification",
        model=nli_mdl2,
        tokenizer=nli_tok2,
        return_all_scores=True,   # misma lógica que tu código original
        truncation=True,
        device=DEVICE
    )
    tqdm.write(f"[M9] NLI XLM-R (base) cargado en device={DEVICE}")
except Exception as e:
    raise RuntimeError(f"[M9] No se pudo cargar el modelo NLI '{MODEL_NLI}': {e}")

LABEL_ORDER = ["contradiction", "neutral", "entailment"]

def nli_pair(premise, hypothesis):
    """
    Devuelve dict con P(contradiction), P(neutral), P(entailment)
    usando el pipeline NLI (return_all_scores=True) como en tu módulo original.
    """
    res = nli2(f"{premise} </s></s> {hypothesis}")[0]  # lista de dicts
    d = {x["label"].lower(): x["score"] for x in res}
    return {k: float(d.get(k, 0.0)) for k in LABEL_ORDER}

# ============================================================
# 4) Construcción de pares de compromisos por candidato
# ============================================================
MAX_PER_CAND = 35  # puedes subir/bajar; n*(n-1)/2 por candidato

cand_pairs = []
for cand, grp in commitments.groupby("candidate"):
    grp = grp.sort_values(["filename", "page", "sent_id"]).head(MAX_PER_CAND)
    texts = grp["sent_text"].tolist()
    idxs  = grp.index.tolist()
    for i, j in itertools.combinations(range(len(texts)), 2):
        cand_pairs.append({
            "candidate": cand,
            "idx1": idxs[i],
            "idx2": idxs[j],
            "text1": texts[i],
            "text2": texts[j],
        })

pairs_df = pd.DataFrame(cand_pairs)
tqdm.write(f"[M9] pares a evaluar: {len(pairs_df):,}")

if pairs_df.empty:
    raise RuntimeError("[M9] No hay pares de compromisos para evaluar.")

# ============================================================
# 5) Scoring NLI para cada par
# ============================================================
contr, ent, neu = [], [], []
for _, r in tqdm(pairs_df.iterrows(), total=len(pairs_df), desc="[M9] NLI pares"):
    scores = nli_pair(r["text1"], r["text2"])
    contr.append(scores["contradiction"])
    ent.append(scores["entailment"])
    neu.append(scores["neutral"])

pairs_df["p_contr"] = contr
pairs_df["p_ent"]   = ent
pairs_df["p_neu"]   = neu

# ============================================================
# 6) Filtrar tensiones internas (contradicciones)
# ============================================================
THRESH_CONTR = 0.65
tensions = pairs_df[pairs_df["p_contr"] >= THRESH_CONTR].copy()
tensions = tensions.sort_values(["candidate", "p_contr"], ascending=[True, False])
tqdm.write(f"[M9] tensiones detectadas (p_contr ≥ {THRESH_CONTR}): {len(tensions):,}")

# ============================================================
# 7) Trazabilidad: archivo, página, sent_id para cada frase
# ============================================================
meta = commitments[["sent_text", "candidate", "filename", "page", "sent_id"]].copy()
meta = meta.rename(columns={
    "sent_text": "sent_text_m",
    "filename":  "filename_m",
    "page":      "page_m",
    "sent_id":   "sent_id_m",
})

tensions = tensions.merge(
    meta.add_suffix("1"),
    left_on=["candidate", "text1"],
    right_on=["candidate1", "sent_text_m1"],
    how="left"
).merge(
    meta.add_suffix("2"),
    left_on=["candidate", "text2"],
    right_on=["candidate2", "sent_text_m2"],
    how="left"
)

cols_show = [
    "candidate", "p_contr", "p_ent", "p_neu",
    "text1", "filename_m11", "page_m11", "sent_id_m11",
    "text2", "filename_m22", "page_m22", "sent_id_m22",
]

# algunas columnas pueden faltar si no hubo match; hacemos intersección segura
cols_show = [c for c in cols_show if c in tensions.columns]

tensions_view = tensions[cols_show].rename(columns={
    "filename_m11": "filename1", "page_m11": "page1", "sent_id_m11": "sent_id1",
    "filename_m22": "filename2", "page_m22": "page2", "sent_id_m22": "sent_id2",
})

tqdm.write("[M9] ejemplo de tensiones (primeras 30 filas):")
display(tensions_view.head(30))

# ============================================================
# 8) Estadístico por candidato
# ============================================================
if not tensions_view.empty:
    summary_tens = (
        tensions_view
        .groupby("candidate")
        .agg(
            n_tensiones=("p_contr", "size"),
            p_contr_med=("p_contr", "mean"),
            p_contr_max=("p_contr", "max"),
        )
        .reset_index()
    )
else:
    summary_tens = pd.DataFrame(
        columns=["candidate", "n_tensiones", "p_contr_med", "p_contr_max"]
    )

tqdm.write("[M9] resumen por candidato:")
display(summary_tens)

# ============================================================
# 9) Visualización rápida (histograma por candidato)
# ============================================================
if not tensions_view.empty:
    fig_tens = px.histogram(
        tensions_view,
        x="p_contr",
        color="candidate",
        nbins=20,
        barmode="overlay",
        opacity=0.6,
        title="Distribución de P(contradicción) entre compromisos (por candidato)",
        labels={"p_contr": "P(contradicción)"},
    )
    fig_tens.show()

tqdm.write("[M9] FIN módulo integral (pages → compromisos → NLI)")


[M9] Inicio módulo integral (pages → compromisos → NLI)
[M9] pages.shape=(542, 6)
[M9] spaCy cargado para segmentación: es


[M9] segmentando oraciones:   0%|          | 0/542 [00:00<?, ?it/s]

[M9] sent_df.shape=(9270, 5)
[M9] commitments.shape=(455, 5)


: 