In [8]:
import pandas as pd
import os

# Load the single-file export `Chile_all(1).csv`; fields are separated by ';'.
first_csv = pd.read_csv(
    filepath_or_buffer='Chile_all(1).csv',
    sep=';',
)

In [9]:
# Read every numbered part file under ./data and stitch them into one frame.
# NOTE: the number of parts is taken from the number of directory entries, so
# ./data is expected to hold only the Chile_all1_part{i}.csv files (1-based).
size = len(os.listdir('./data'))

# Collect the parts first and concatenate once: calling pd.concat inside the
# loop re-copies every previously accumulated row on each iteration.
parts = [
    pd.read_csv(f'./data/Chile_all1_part{i}.csv', sep=';')
    for i in range(1, size + 1)
]

# pd.concat raises on an empty list, so keep the empty-directory result an
# empty DataFrame, as before.
second_csv = pd.concat(parts, ignore_index=True) if parts else pd.DataFrame()


In [None]:
def read_tripadvisor_data(path: str) -> pd.DataFrame:
    """Load all `Chile_all1_part<N>.csv` files in `path` into one DataFrame.

    The part files actually present in the directory are discovered by name and
    read in ascending numeric order of `<N>`, so the function works whether the
    numbering starts at 0 or at 1 and is not thrown off by unrelated entries in
    the directory (other files, checkpoint folders, ...).

    Args:
        path: Directory containing the semicolon-separated part files.

    Returns:
        The row-wise concatenation of all parts with a fresh 0..n-1 index, or
        an empty DataFrame when the directory holds no part files.

    Raises:
        FileNotFoundError: If `path` does not exist (from `os.listdir`).
    """
    import re  # local import: this cell does not rely on `re` being imported elsewhere

    part_name = re.compile(r"Chile_all1_part(\d+)\.csv")

    # (numeric index, file name) for every directory entry that is a part file.
    numbered_parts = []
    for entry in os.listdir(path):
        match = part_name.fullmatch(entry)
        if match:
            numbered_parts.append((int(match.group(1)), entry))

    # Sort on the integer index so that part10 comes after part2.
    frames = [
        pd.read_csv(os.path.join(path, entry), sep=';')
        for _, entry in sorted(numbered_parts)
    ]

    if not frames:
        return pd.DataFrame()

    # Single concat instead of growing the frame inside the loop.
    return pd.concat(frames, ignore_index=True)

In [None]:
# --- Instalación (una sola vez) ---
# !pip install -U bertopic sentence-transformers umap-learn hdbscan

from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
import hdbscan
import pandas as pd
import numpy as np
import re, unicodedata, random

# ---------- 1) Data (example) ----------
# Small hard-coded list of Spanish-language sentences used as the input
# documents for the rest of this cell.
docs = [
  "La habitación estaba limpia, pero el check-in fue lentísimo.",
  "Excelente ubicación, aunque hubo ruido por las noches.",
  "El desayuno frío y poco variado. El personal muy amable.",
  "Wifi intermitente, imposible trabajar. La vista al mar, increíble.",
  "Todo bien salvo el baño: poca presión de agua.",
  "Atención fantástica, volvería sin dudarlo.",
  "Demasiadas escaleras y mala señalización para llegar.",
  "La ducha no calentaba y la habitación olía a humedad."
]

# ---------- 2) Limpieza ligera recomendada ----------
def normalize(text):
    """Return `text` in Unicode NFKC form with whitespace tidied.

    Every run of whitespace is collapsed to a single space and leading and
    trailing whitespace is removed.
    """
    nfkc = unicodedata.normalize("NFKC", text)
    single_spaced = re.sub(r"\s+", " ", nfkc)
    return single_spaced.strip()

docs = list(map(normalize, docs))

# ---------- 3) Multilingual embeddings ----------
embedder = SentenceTransformer("paraphrase-multilingual-mpnet-base-v2")
# Encode the documents up front so the vectors can be passed to BERTopic below.
embeddings = embedder.encode(
    docs,
    batch_size=32,
    show_progress_bar=False,
    normalize_embeddings=True,
)

# ---------- 4) Reduce dimensionality and cluster ----------
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric="cosine",
    random_state=42,
)
hdb = hdbscan.HDBSCAN(
    min_cluster_size=2,
    metric='euclidean',
    prediction_data=True,
)

topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdb,
    language="multilingual",
    calculate_probabilities=True,
    verbose=False,
    nr_topics=None,  # let the model find the topics on its own
)

topics, probs = topic_model.fit_transform(docs, embeddings=embeddings)

# ---------- 5) Key results ----------
info = topic_model.get_topic_info()   # summary of the topics
print(info)                           # Topic -1 = outliers

# Keywords and representative examples for each topic.
# The topic ids are in the capitalised "Topic" column of the summary frame;
# attribute access as `info.topic` does not resolve to that column.
for t in info["Topic"].unique():
    if t == -1:
        continue  # skip the outlier bucket
    print("\n=== Tema", t, "===")
    print(topic_model.get_topic(t))                 # [(word, weight), ...]
    reps = topic_model.get_representative_docs(t)   # representative quotes
    for r in reps[:2]:
        print(" •", r)

# ---------- 6) Export to CSV (handy for the report) ----------
# One row per document: its text, its assigned topic, and the largest value in
# that document's row of `probs`.
df = pd.DataFrame({"doc": docs, "topic": topics}).assign(prob=probs.max(axis=1))
df.to_csv(path_or_buf="temas_bertopic.csv", index=False)
print("\nGuardado: temas_bertopic.csv")


  from .autonotebook import tqdm as notebook_tqdm


  [2m2025-09-05T15:53:24.397580Z[0m [33m WARN[0m  [33mReqwest(reqwest::Error { kind: Request, url: "https://transfer.xethub.hf.co/xorbs/default/5540a67de9600e578e6f5ce3090b5720e4b25abea1c262524524acce6dc47fd1?X-Xet-Signed-Range=bytes%3D0-63740847&Expires=1757090343&Policy=eyJTdGF0ZW1lbnQiOlt7IlJlc291cmNlIjoiaHR0cHM6Ly90cmFuc2Zlci54ZXRodWIuaGYuY28veG9yYnMvZGVmYXVsdC81NTQwYTY3ZGU5NjAwZTU3OGU2ZjVjZTMwOTBiNTcyMGU0YjI1YWJlYTFjMjYyNTI0NTI0YWNjZTZkYzQ3ZmQxP1gtWGV0LVNpZ25lZC1SYW5nZT1ieXRlcyUzRDAtNjM3NDA4NDciLCJDb25kaXRpb24iOnsiRGF0ZUxlc3NUaGFuIjp7IkFXUzpFcG9jaFRpbWUiOjE3NTcwOTAzNDN9fX1dfQ__&Signature=qOmasBdRxJqoCC7HhH9BHXfQYlwAC3Yxmse3f6hpa7dL~trY0p4dCCjmohet0pjgKSoqXIDM4zOP12l6G4ksuInS7zwxvXtiDnNyvLubF7uLHmn--2~qg0SYUzQUwzCu700QReg4U199d4ufMDxZegyaDlwaOWMX8p38NApJx3CBpbMQNNfILDr~Olb2sX70P~5-3CmfX~kmcCAEsgj5GXWp7UOBahwnwGW8iF-96onL6l-csh95YXhrgj8xMLetaYZDxMFGAYxnR8~EIaP275FnYloe3qh59S53tg0szK4Mpcpl7MvV6aAM~PFU7zmTVCUROnXu1A~mTz6oFUdw~Q__&Key-Pair-Id=K2L8F4GPSG1IFC", source: hyper_util::c

OSError: Can't load the model for 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'sentence-transformers/paraphrase-multilingual-mpnet-base-v2' is the correct path to a directory containing a file named pytorch_model.bin, tf_model.h5, model.ckpt or flax_model.msgpack.

  [2m2025-09-05T16:11:38.149427Z[0m [31mERROR[0m  [31mFatal Client Error: s3::get_range api call failed (retry 1): error sending request for url (https://transfer.xethub.hf.co/xorbs/default/bd1a3ba9747305e78d25499a36699543febdb140eb42d282ca6f03ad73796ed1?X-Xet-Signed-Range=bytes%3D0-63916296&Expires=1757090343&Policy=eyJTdGF0ZW1lbnQiOlt7IlJlc291cmNlIjoiaHR0cHM6Ly90cmFuc2Zlci54ZXRodWIuaGYuY28veG9yYnMvZGVmYXVsdC9iZDFhM2JhOTc0NzMwNWU3OGQyNTQ5OWEzNjY5OTU0M2ZlYmRiMTQwZWI0MmQyODJjYTZmMDNhZDczNzk2ZWQxP1gtWGV0LVNpZ25lZC1SYW5nZT1ieXRlcyUzRDAtNjM5MTYyOTYiLCJDb25kaXRpb24iOnsiRGF0ZUxlc3NUaGFuIjp7IkFXUzpFcG9jaFRpbWUiOjE3NTcwOTAzNDN9fX1dfQ__&Signature=aryAsG6LjWTAoS1Ta8tJ9hW3WSJgDe6oqyT0fBcarHwXR70lGyoAYM06yzT219xAbCtLVoQivGtOF7BJGtQFNdxSoGSLf~mYKzq3EAdFfQZzOMw0zdS76rBgJVLIN3TqVCyf2ru~9XgWnDT~NKZARz78nEDRkTQTtPezZdnikoekpbGbx78lYY7z73JogdZ4UDBHNVo9X3lIO0ikwugG5km4FveSPei1BoTHMto4HNHd2MmkZ55eLs4TE2NCSVS0SfP99SLNAWOu3snFBwFiJF9j5gmUeEDRDlchODhfAbyOUXaM2V-RkfE~TzC-kN05uRSWhcQhhyMtduOlbETtUg__&Key-