## Librerías

In [1]:
import pandas as pd
import re
import string

## Ingesta

In [2]:
# Load the raw workbook into the working DataFrame.
df = pd.read_excel(io="df.xlsx")

## Ajustes DateTime

In [3]:
def parsear_fecha(fecha):
    """Parse a date value written in one of the two supported layouts.

    The value is tried against ``%Y-%m-%d %H:%M:%S`` first and, if that
    fails, against ``%d-%m-%Y``.

    Args:
        fecha: Value to convert; forwarded unchanged to ``pd.to_datetime``.

    Returns:
        Whatever ``pd.to_datetime`` returns for the first format that
        succeeds, or ``pd.NaT`` when neither format can be applied.
    """
    for formato in ("%Y-%m-%d %H:%M:%S", "%d-%m-%Y"):
        try:
            return pd.to_datetime(fecha, format=formato)
        except Exception:
            # Best-effort parsing: any ordinary failure moves on to the next
            # format. Unlike a bare ``except:``, this lets KeyboardInterrupt
            # and SystemExit propagate, so interrupting the kernel during a
            # long ``.apply`` is not silently turned into NaT values.
            continue
    return pd.NaT

In [4]:
# Parse the raw publication date once, store it under the unaccented column
# name, and derive the calendar components from the parsed values.
parsed_dates = df["FechaPublicación"].apply(parsear_fecha)
df["FechaPublicacion"] = parsed_dates
df["Año"] = parsed_dates.dt.year
df["Mes"] = parsed_dates.dt.month
df["Dia"] = parsed_dates.dt.day

### Eliminación de la columna de fecha original

In [5]:
# The accented source column is dropped now that "FechaPublicacion" exists.
df = df.drop(["FechaPublicación"], axis="columns")

## Concatenación de las variables con texto informativo

In [6]:
# Replace missing values with empty strings so the concatenation below
# never propagates NaN into the combined text.
for text_column in ("Título", "Corpus", "CorpusPDF"):
    df[text_column] = df[text_column].fillna("")

# Title, body and PDF body joined into one field, separated by ". ".
df["Texto"] = df["Título"] + ". " + df["Corpus"] + ". " + df["CorpusPDF"]

## Funciones de limpieza de texto

In [7]:
def load_stopwords(file_path):
    """Read a stopword file with one entry per line.

    Args:
        file_path: Path of a UTF-8 text file.

    Returns:
        A list with every non-blank line, stripped of surrounding whitespace
        and lower-cased, in file order.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        lines = file.read().splitlines()
    # Strip before storing: an entry that keeps a trailing space or tab could
    # never equal a token produced by ``str.split()`` downstream.
    stripped = (line.strip() for line in lines)
    return [word.lower() for word in stripped if word]

# Stopword list used by clean_text; read once from the working directory.
stopwords_list = load_stopwords(file_path="stopwords.txt")

In [8]:
def remove_numbers(text):
    """Blank out ASCII digits and normalise whitespace.

    Every character in 0-9 becomes a space; the result is then split on
    whitespace and re-joined with single spaces, which also trims both ends.
    """
    digit_pattern = re.compile(r"[0-9]")
    tokens = digit_pattern.sub(" ", text).split()
    return " ".join(tokens)

def remove_unprintable_(text):
    """Drop every character outside the allowed set.

    The allowed set is ``string.printable`` plus the Spanish accented vowels,
    "ü" and "ñ" in both cases. Disallowed characters are deleted, not
    replaced.
    """
    allowed = set(string.printable + "ñáéíóúü" + "ÑÁÉÍÓÚÜ")
    kept_chars = [char for char in text if char in allowed]
    return "".join(kept_chars)

def remove_punctuation(text):
    """Replace punctuation with spaces and squeeze runs of spaces.

    Any character that is neither a word character, whitespace, nor one of
    the listed Spanish letters is turned into a space; consecutive space
    characters are then collapsed to one. Other whitespace (tabs, newlines)
    is left untouched and the result is not trimmed.
    """
    spaced = re.sub(r"[^\w\sáéíóúüñÁÉÍÓÚÜÑ]", " ", text)
    return re.sub(" +", " ", spaced)

def reduce_spam(text):
    """Collapse a word or two-word phrase repeated three or more times in a row.

    Two passes are applied in order: first single words, then pairs of
    words. A run of at least three consecutive occurrences, separated by
    whitespace, is replaced by a single occurrence.

    Args:
        text: Input string.

    Returns:
        The string with such runs reduced to one occurrence.
    """
    # The \b anchors force the repeated unit to start and end on word
    # boundaries. Without them the backreference can match inside a longer
    # word, e.g. "banana na na" -> "banana" or "la la lata" -> "lata".
    text = re.sub(r"\b(\w+)(?:\s+\1\b){2,}", r"\1", text)
    text = re.sub(r"\b(\w+\s+\w+)(?:\s+\1\b){2,}", r"\1", text)
    return text

def remove_vowels_accents(text):
    """Map lower-case accented vowels (and "ü") to their plain ASCII vowel.

    Only the lower-case characters á, é, í, ó, ú and ü are converted; every
    other character, including "ñ" and upper-case accented letters, is
    returned unchanged.
    """
    accent_table = str.maketrans("áéíóúü", "aeiouu")
    return text.translate(accent_table)

def remove_stopwords(text, stopwords_list):
    """Remove every whitespace-separated token found in ``stopwords_list``.

    Args:
        text: Value to filter; it is converted with ``str()`` first.
        stopwords_list: Container of tokens to discard. Matching is exact
            (case- and accent-sensitive).

    Returns:
        The surviving tokens joined by single spaces.
    """
    lookup = stopwords_list
    if isinstance(stopwords_list, (list, tuple)):
        # Hash-based lookup avoids scanning the whole list for every token.
        try:
            lookup = frozenset(stopwords_list)
        except TypeError:
            # Unhashable entries: keep the original sequence semantics.
            lookup = stopwords_list
    return " ".join(word for word in str(text).split() if word not in lookup)

def clean_text(text):
    """Run the full cleaning pipeline on one string.

    Steps, in order: lower-case, remove_numbers, remove_unprintable_,
    remove_punctuation, reduce_spam, remove_stopwords (against the
    module-level ``stopwords_list``), remove_vowels_accents, and a final
    strip of surrounding whitespace.
    """
    cleaned = text.lower()
    for step in (remove_numbers, remove_unprintable_, remove_punctuation, reduce_spam):
        cleaned = step(cleaned)
    # Stopwords are removed before accents are stripped, so accented entries
    # in the stopword list are compared against still-accented tokens.
    cleaned = remove_stopwords(cleaned, stopwords_list)
    return remove_vowels_accents(cleaned).strip()

## Se aplica la función

In [9]:
# Cast to str first so clean_text always receives a string.
raw_text = df["Texto"].astype(str)
df["TextoLimpio"] = raw_text.apply(clean_text)

In [10]:
# Columns no longer needed once "TextoLimpio" has been built.
discarded_columns = [
    "Texto",
    "FechaPublicación2",
    "CorpusPDF",
    "EnlacePDF",
    "año",
    "Corpus",
]
df = df.drop(columns=discarded_columns)

## Se exporta

In [11]:
# Persist the cleaned table as Parquet, without the index.
df.to_parquet(path="Database/01_bbdd_think_tanks.parquet", index=False)

In [12]:
# Persist the same table as an Excel workbook, without the index.
df.to_excel(excel_writer="Database/01_bbdd_think_tanks.xlsx", index=False)