In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import re
from tqdm import tqdm
from gensim.corpora import Dictionary
from gensim.models import LdaModel, CoherenceModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import silhouette_score
import networkx as nx
import community as community_louvain
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.io as pio


In [6]:
df = pd.read_parquet("01_bbdd_think_tanks_no_stopwords.parquet" , engine="fastparquet")
df = df[(df["FechaPublicacion"] >= "2019-01-01") & (df["FechaPublicacion"] <= "2023-12-31")]
df["Dia"] = df["FechaPublicacion"].dt.to_period("D")

with open("stopwords.txt", "r", encoding="utf-8") as f:
    stopwords_custom = set(line.strip().lower() for line in f if line.strip())
def preprocess(text):
    text = re.sub(r"[^a-zA-ZáéíóúüñÁÉÍÓÚÜÑ\s]", "", text.lower())
    return [w for w in text.split() if len(w) > 2 and w not in stopwords_custom]



In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 17432 entries, 205 to 31640
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   ID                    17432 non-null  int64         
 1   Think Tank            17432 non-null  object        
 2   Tipo de Think Tank    17432 non-null  object        
 3   Orientación Política  17432 non-null  object        
 4   Autor                 8390 non-null   object        
 5   Título                17425 non-null  object        
 6   Medio                 2498 non-null   object        
 7   Corpus                16690 non-null  object        
 8   Producto              10353 non-null  object        
 9   Enlace                17432 non-null  object        
 10  CorpusPDF             237 non-null    object        
 11  FechaPublicacion      17432 non-null  datetime64[ns]
 12  Año                   17432 non-null  float64       
 13  Mes                

In [8]:
df["tokens"] = df["Texto"].dropna().apply(preprocess)

In [9]:
global_texts = df["tokens"].tolist()
global_dict = Dictionary(global_texts)
global_dict.filter_extremes(no_below=5, no_above=0.5)

In [10]:
def best_lda_model(texts, dictionary, corpus, min_topics=1, max_topics=2):
    best_model, best_coh = None, -1
    for k in range(min_topics, max_topics + 1):
        lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=k, random_state=42)
        coh = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence='c_v').get_coherence()
        if coh > best_coh:
            best_model, best_coh = lda, coh
    return best_model

In [11]:
topics_all = []

for dia, grupo in tqdm(df.groupby("Dia")):
    textos = grupo["tokens"].dropna().tolist()
    if not textos: continue
    corpus = [global_dict.doc2bow(text) for text in textos]
    if all(len(doc) == 0 for doc in corpus): continue

    lda = best_lda_model(textos, global_dict, corpus)
    for idx, topic in enumerate(lda.show_topics(num_words=3, formatted=False)):
        palabras = [w for w,_ in topic[1]]
        vector = np.zeros(len(global_dict))
        for w_id, prob in lda.get_topic_terms(idx, topn=len(global_dict)):
            vector[w_id] = prob

        peso = np.mean([prob[idx] for prob in lda.get_document_topics(corpus, minimum_probability=0)])
        topics_all.append({
            "dia": str(dia),
            "topico_id": idx,
            "palabras": ", ".join(palabras),
            "vector": vector,
            "peso": peso
        })

  0%|          | 0/1797 [00:00<?, ?it/s]

  m_lr_i = np.log(numerator / denominator)
  return cv1.T.dot(cv2)[0, 0] / (_magnitude(cv1) * _magnitude(cv2))
  perwordbound, np.exp2(-perwordbound), len(chunk), corpus_words
 62%|██████▏   | 1122/1797 [7:47:43<4:41:22, 25.01s/it]


AttributeError: 'NoneType' object has no attribute 'show_topics'

In [None]:
df_topics = pd.DataFrame(topics_all)
df_topics.to_parquet("lda_topics_day.parquet", index=False)


In [None]:
def evaluar_thresholds(df_topics, thresholds):
    elbows, sils = [], []
    vecs = np.vstack(df_topics["vector"].to_numpy())

    for thresh in thresholds:
        G = nx.Graph()
        for i, v in enumerate(vecs):
            G.add_node(i)
        for i in range(len(vecs)):
            for j in range(i+1, len(vecs)):
                sim = cosine_similarity([vecs[i]], [vecs[j]])[0][0]
                if sim > thresh:
                    G.add_edge(i, j)

        if G.number_of_edges()==0:
            elbows.append(0); sils.append(-1)
            continue

        part = community_louvain.best_partition(G)
        labels = list(part.values())
        elbows.append(len(set(labels)))
        try:
            sils.append(silhouette_score(vecs, labels))
        except:
            sils.append(-1)

    return elbows, sils

thresholds = np.linspace(0.4, 0.9, 10)
elbows, sils = evaluar_thresholds(df_topics, thresholds)

# Graficar resultados
plt.figure(figsize=(12, 5))
plt.subplot(1,2,1)
plt.plot(thresholds, elbows, marker='o')
plt.title("Elbow: # de clústeres vs threshold")
plt.xlabel("Threshold")
plt.ylabel("# clústeres")
plt.subplot(1,2,2)
plt.plot(thresholds, sils, marker='o', color='purple')
plt.title("Silhouette Score vs threshold")
plt.xlabel("Threshold")
plt.ylabel("Silhouette")
plt.tight_layout()
plt.show()


In [None]:
manual_threshold = 0.75  # placeholder para definir manualmente
similarity_threshold = manual_threshold

# Construir grafo y agrupar
G = nx.Graph()
for i1, row1 in df_topics.iterrows():
    for i2, row2 in df_topics.iterrows():
        if i1>=i2: continue
        if cosine_similarity([row1["vector"]],[row2["vector"]])[0][0] > similarity_threshold:
            G.add_edge(i1, i2)
partition = community_louvain.best_partition(G)
df_topics["cluster"] = df_topics.index.map(partition)


In [None]:
df_plot = df_topics.copy()
df_plot["Fecha"] = pd.to_datetime(df_plot["dia"])

fig = px.scatter(
    df_plot,
    x="Fecha", y="peso", color="cluster",
    hover_data=["palabras"], title="Evolución diaria de tópicos emergentes",
    labels={"peso":"Importancia", "Fecha":"Día"}, template="plotly_white"
)
fig.update_traces(marker=dict(size=8, line=dict(width=0.5, color='DarkSlateGrey')))
fig.show()

# Timeline
timeline = df_topics.groupby(["dia", "cluster"])["peso"].sum().reset_index()
timeline["Fecha"] = pd.to_datetime(timeline["dia"])
fig2 = px.line(
    timeline, x="Fecha", y="peso", color="cluster", markers=True,
    title="Línea de tiempo diaria por clúster", template="plotly_white"
)
fig2.show()


In [None]:
from collections import Counter

def top_words(df, top_n=10):
    cw = {}
    for c in df["cluster"].unique():
        allwords = [w for txt in df[df["cluster"]==c]["palabras"] for w in txt.split(", ")]
        cw[c] = [w for w,_ in Counter(allwords).most_common(top_n)]
    return pd.DataFrame.from_dict(cw, orient="index")

hw = top_words(df_topics, top_n=10)
plt.figure(figsize=(12,6))
sns.heatmap(pd.DataFrame([[1]*10]*len(hw), index=hw.index, columns=hw.columns),
            annot=hw.values, fmt="", cbar=False, cmap="viridis")
plt.title("Palabras por clúster")
plt.yticks(rotation=0)
plt.show()
