# Bibliotecas

In [22]:
import os
import json

from huggingface_hub import InferenceClient
from sklearn.cluster import KMeans
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate

from dotenv import load_dotenv

In [23]:
# Load the variables declared in `.env`
load_dotenv()

# Stop right away when the Hugging Face token is not configured
assert "HF_API_TOKEN" in os.environ, "Defina a variável de ambiente HF_API_TOKEN!"

# Keep the token in a variable
hf_token = os.environ.get("HF_API_TOKEN")

# Problema

Temos diversos documentos falando sobre diversos tópicos. O objetivo é agrupar esses documentos por temáticas.

Aqui vamos usar o LangChain (um framework bastante comum) para gerar os embeddings dos documentos, e o `InferenceClient` do `huggingface_hub` para chamar uma LLM que cria um label para cada grupo.

# Dados Base

In [24]:
# Toy corpus: five short document titles to be grouped by theme
docs = [
    "Python vs JavaScript: An in-depth comparison of two programming languages.",
    "10 tips for Python coding beginners.",
    "How to bake the perfect sourdough bread at home.",
    "Mastering JavaScript closures and callbacks.",
    "Sourdough starter maintenance and bread recipes.",
]


# Solução
A ideia vai ser:
1. Transformar os documentos num banco vetorial (embeddings)
2. Fazer uma clusterização simples nos vetores (k-means)
3. Nomear os clusters com uma LLM

## Carregando modelos
Vamos usar:
- `HuggingFaceTB/SmolLM3-3B` para fazer o labeling via API.
- `sentence-transformers/all-MiniLM-L6-v2` para fazer o embedding.

In [25]:

# Client for the hosted chat model used to label the clusters.
# Reuses `hf_token`, which the configuration cell validated and read from the
# environment, instead of looking the variable up a second time.
client = InferenceClient(
    model="HuggingFaceTB/SmolLM3-3B",
    provider="hf-inference",
    api_key=hf_token,
)

In [26]:
# Embedding model used to vectorize the documents
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

## Construindo o Banco Vetorial

In [27]:
# Embed every document in `docs`
doc_vectors = embedding_model.embed_documents(docs)

# There is one vector per document; print each of them on its own line
for vector in doc_vectors:
    print(vector)

[-0.1299819052219391, -0.03272879868745804, -0.005013075191527605, 0.038952793926000595, -0.010549793019890785, -0.13975493609905243, -0.04043034091591835, 0.10483346879482269, -0.006779563147574663, -0.09836924076080322, -0.048221636563539505, 0.04107325151562691, 0.031375203281641006, 0.0038871546275913715, 0.09986567497253418, -0.049420882016420364, -0.039418697357177734, -0.0466584749519825, 0.001095207640901208, -0.0873037800192833, -0.03551401197910309, -0.035559091717004776, -0.004482835065573454, 0.0047917780466377735, 0.06876824796199799, 0.012685302644968033, -0.038620490580797195, -0.018657272681593895, -0.008617821149528027, -0.0031979032792150974, -0.045820657163858414, 0.0021065499167889357, -0.033459898084402084, 0.015275835990905762, -0.00968517828732729, 0.07830029726028442, 0.01569613441824913, -0.08963347226381302, -0.05270892381668091, -0.04664050415158272, -0.11525322496891022, 0.02294735796749592, -0.00944540649652481, 0.005314378067851067, -0.03083278425037861, 0

## Clusterizando com K-Means
Vamos clusterizar os vetores em 2 grupos.

In [28]:
# The number of groups is fixed up front: we assume we already know there are 2
num_clusters = 2

# Run k-means on the document vectors with a fixed random seed
kmeans = KMeans(random_state=42, n_clusters=num_clusters)
labels = kmeans.fit_predict(doc_vectors)

In [29]:
# Start with one empty bucket per cluster id, then drop each document
# into the bucket named by its k-means label
clusters = {cluster_id: [] for cluster_id in range(num_clusters)}
for doc, lab in zip(docs, labels):
    bucket = clusters[lab]
    bucket.append(doc)

# JSON with indentation is used only to make the printed mapping easier to read
print("Clusters brutos:", json.dumps(clusters, indent=4), sep="\n")


Clusters brutos:
{
    "0": [
        "Python vs JavaScript: An in-depth comparison of two programming languages.",
        "10 tips for Python coding beginners.",
        "Mastering JavaScript closures and callbacks."
    ],
    "1": [
        "How to bake the perfect sourdough bread at home.",
        "Sourdough starter maintenance and bread recipes."
    ]
}


## Rotulando os Clusters


In [30]:
# Prompt template for the labeling step; `{documents}` is filled with the
# documents of one cluster
prompt = PromptTemplate.from_template(
    "Here are some documents about the same topic:\n\n{documents}\n\n"
    "Provide a short label with one or two words, describing their common theme, "
    "all lower case and with only letters and space:"
)

In [31]:
# Ask the chat model for a short label for each cluster and print it.
for cid, docs_in_cluster in clusters.items():
    # Join the cluster's documents into a bullet list for the prompt
    docs_text = "\n".join(f"- {d}" for d in docs_in_cluster)
    full_label = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": prompt.invoke({"documents": docs_text}).to_string()
            }
        ]
    ).choices[0].message.content or ""  # fall back to "" if the reply has no text
    # Keep only what follows the closing `</think>` tag. `rpartition` returns the
    # whole string as its last element when the tag is absent, so a reply without
    # a reasoning section no longer raises IndexError (as `split(...)[1]` did).
    label = full_label.rpartition("</think>")[2].strip()
    print(f"Cluster {cid} label: {label}")

Cluster 0 label: programming languages
Cluster 1 label: sourdough
