In [1]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [2]:
# --- Funciones de limpieza ---
def remove_numbers(text):
    return " ".join(re.sub(r"[0-9]", " ", text).split())

def remove_unprintable_(text):
    printable = set(string.printable + "ñáéíóúüÑÁÉÍÓÚÜ")
    return "".join(filter(lambda x: x in printable, text))

def remove_punctuation(text):
    pattern = re.compile(r"[^\w\sáéíóúüñÁÉÍÓÚÜÑ]")
    return re.sub(" +", " ", pattern.sub(" ", text))

def reduce_spam(text):
    text = re.sub(r"(\w+)(\s+\1){2,}", r"\1", text)
    text = re.sub(r"(\w+\s+\w+)(\s+\1){2,}", r"\1", text)
    return text

def remove_vowels_accents(text):
    return (
        text.replace("á", "a").replace("é", "e")
            .replace("í", "i").replace("ó", "o")
            .replace("ú", "u").replace("ü", "u")
    )

def remove_stopwords(text, stopwords_list):
    return " ".join([word for word in text.split() if word not in stopwords_list])

def clean_text(text, stopwords_list):
    text = text.lower()
    text = remove_numbers(text)
    text = remove_unprintable_(text)
    text = remove_punctuation(text)
    text = reduce_spam(text)
    text = remove_stopwords(text, stopwords_list)
    text = remove_vowels_accents(text)
    return text.strip()


In [3]:
# --- Carga del corpus ---
df = pd.read_excel("01_bbdd_think_tanks.xlsx")  # Ajusta la ruta real

# --- Carga de stopwords mejoradas ---
with open("stopwords.txt", "r", encoding="utf-8") as f:
    stopwords_list = [line.strip().lower() for line in f if line.strip()]

# --- Preprocesamiento ---
df = df.dropna(subset=["Corpus"])
df["FechaPublicación"] = pd.to_datetime(df["FechaPublicación"], errors="coerce")
df["TextoLimpio"] = df["Corpus"].apply(lambda x: clean_text(str(x), stopwords_list))


In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np

def compute_lda_and_tfidf(documents, n_topics=6, n_top_words=10):
    if len(documents) < 3:
        return [], []

    # LDA
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(documents)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda_matrix = lda.fit_transform(X)
    topics = [list(np.where(row > 0.1)[0]) for row in lda_matrix]

    # TF-IDF
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    X_tfidf = tfidf_vectorizer.fit_transform(documents)
    tfidf_sum = X_tfidf.sum(axis=0).A1
    feature_names = tfidf_vectorizer.get_feature_names_out()
    top_indices = tfidf_sum.argsort()[::-1][:n_top_words]
    tfidf_top_words = [feature_names[i] for i in top_indices]

    return topics, tfidf_top_words


In [None]:
def aplicar_modelado_por_grupo(df, columna_categoria, n_topics=6):
    df_result = df.copy()
    col_lda = f"LDA_Topics_{columna_categoria}"
    col_tfidf = f"TFIDF_TopWords_{columna_categoria}"
    
    df_result[col_lda] = None
    df_result[col_tfidf] = None

    for (grupo, fecha), subgrupo in df.groupby([columna_categoria, "FechaPublicación"]):
        textos = subgrupo["TextoLimpio"].dropna().tolist()
        textos = [t for t in textos if len(t.split()) >= 3]
        if len(textos) < 3:
            continue

        try:
            lda_topics, tfidf_words = compute_lda_and_tfidf(textos, n_topics=n_topics)
            idx = subgrupo.index
            df_result.loc[idx, col_lda] = [lda_topics] * len(idx)
            df_result.loc[idx, col_tfidf] = [tfidf_words] * len(idx)
        except Exception as e:
            print(f"Error en grupo {grupo} - {fecha}: {e}")
            continue

    return df_result


In [None]:
# Aplicar por Think Tank
df = aplicar_modelado_por_grupo(df, "Think Tank")

# Aplicar por Orientación Política
df = aplicar_modelado_por_grupo(df, "Orientación Política")


Error en grupo (CDC, 2021-11-26 00:00:00): setting an array element with a sequence.
Error en grupo (CDC, 2021-12-03 00:00:00): setting an array element with a sequence.
Error en grupo (CDC, 2021-12-24 00:00:00): setting an array element with a sequence.
Error en grupo (CDC, 2022-01-21 00:00:00): setting an array element with a sequence.
Error en grupo (CDC, 2022-03-06 00:00:00): setting an array element with a sequence.
Error en grupo (CDC, 2022-09-30 00:00:00): setting an array element with a sequence.
Error en grupo (CDC, 2022-10-14 00:00:00): setting an array element with a sequence.
Error en grupo (CED, 2022-02-10 00:00:00): setting an array element with a sequence.
Error en grupo (CED, 2022-06-15 00:00:00): setting an array element with a sequence.
Error en grupo (CED, 2022-07-25 00:00:00): setting an array element with a sequence.
Error en grupo (CED, 2022-08-04 00:00:00): setting an array element with a sequence.
Error en grupo (CED, 2023-03-15 00:00:00): setting an array eleme

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 30240 entries, 0 to 31646
Data columns (total 25 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   ID                                   30240 non-null  int64         
 1   Think Tank                           30240 non-null  object        
 2   Tipo de Think Tank                   30240 non-null  object        
 3   Orientación Política                 30240 non-null  object        
 4   Autor                                13818 non-null  object        
 5   Título                               30237 non-null  object        
 6   FechaPublicación                     30068 non-null  datetime64[ns]
 7   Unnamed: 7                           75 non-null     object        
 8   Medio                                5762 non-null   object        
 9   Corpus                               30240 non-null  object        
 10  Producto       