# Inspect Chroma Database


In [1]:
import chromadb
from chromadb.config import Settings
from pathlib import Path
import pandas as pd
import json
from src.app.config.settings import settings


## Conectar ao ChromaDB

In [2]:
# Connect to the Chroma store located at the configured directory.
# The settings object is built first so the telemetry flag is easy to spot.
chroma_settings = Settings(anonymized_telemetry=False)
client = chromadb.PersistentClient(
    path=str(settings.chroma_dir),
    settings=chroma_settings,
)
client

<chromadb.api.client.Client at 0x10af3e570>

## Listar coleções disponíveis

In [3]:
# Ask the client for the collections it holds and display the result.
collections = client.list_collections()
collections

[Collection(name=documents)]

## Abrir coleção principal

In [13]:
# Open the collection whose name comes from the project settings
# (settings.chroma_collection); every later cell queries this handle.
collection = client.get_collection(settings.chroma_collection)
collection

Collection(name=documents)

##  Contar quantos documentos existem no banco

In [14]:
# Number of records reported by the collection.
count = collection.count()
count

539

##  Carregar todos os documentos (cuidado em base muito grande)

In [15]:
# Fetch records with no filter, asking for texts, metadata and embedding
# vectors; the whole result is kept in `all_docs` and reused by later cells.
all_docs = collection.get(include=["documents", "metadatas", "embeddings"])
# Sanity check: how many texts came back.
len(all_docs["documents"])

539

##  Mostrar preview dos primeiros 5 documentos

In [16]:
# Preview table: the first five chunk texts side by side with their metadata.
# Each output column is fed from one field of the `all_docs` result.
df_preview = pd.DataFrame(
    {
        column: all_docs[field][:5]
        for column, field in (("text", "documents"), ("metadata", "metadatas"))
    }
)
df_preview

Unnamed: 0,text,metadata
0,(Mark One) \n‚òí ANNUAL REPORT PURSUANT TO SECT...,{'doc_id': 'fd8502ebf9d3a72c3a5e95308a59589d14...
1,| Title of each class |...,{'doc_id': 'fd8502ebf9d3a72c3a5e95308a59589d14...
2,| 3.600% Notes due 2042 |...,"{'section_path': '', 'order': 2, 'doc_id': 'fd..."
3,## UNITED STATES SECURITIES AND EXCHANGE COMMI...,{'section_path': 'UNITED STATES SECURITIES AND...
4,## FORM 10-K \nIndicate by check mark whether...,"{'doc_date': '', 'section_path': 'FORM 10-K', ..."


## Ver todas as chaves de metadados existentes

In [8]:
# Union of every key that appears in at least one chunk's metadata dict.
metadata_keys = set().union(*(meta.keys() for meta in all_docs["metadatas"]))
metadata_keys

{'chunk_kind',
 'company',
 'doc_date',
 'doc_id',
 'doc_type',
 'order',
 'section_path'}

## Filtrar documentos por doc_id

In [9]:
def find_by_doc_id(doc_id):
    """Return the chunks whose ``doc_id`` metadata equals ``doc_id``.

    Queries the notebook-level ``collection`` with an ``$eq`` filter on the
    ``doc_id`` metadata field, requesting texts and metadata only.

    Parameters
    ----------
    doc_id
        Value to match against the ``doc_id`` metadata field.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``metadata``, one row per matching chunk.
    """
    where_clause = {"doc_id": {"$eq": doc_id}}
    result = collection.get(
        where=where_clause,
        include=["documents", "metadatas"],
    )
    columns = {
        "text": result["documents"],
        "metadata": result["metadatas"],
    }
    return pd.DataFrame(columns)

# Exemplo:
# find_by_doc_id("nvidia-q3-2024")

## Amostrar chunks aleatórios

In [10]:
import random

def sample_chunks(n=3, seed=None):
    """Return a random sample of the loaded chunks as a DataFrame.

    Draws without replacement from the notebook-level ``all_docs`` result,
    keeping each text paired with its own metadata entry.

    Parameters
    ----------
    n : int, default 3
        Number of chunks requested; capped at the number of loaded chunks.
    seed : int or None, default None
        When given, sampling uses a private ``random.Random(seed)`` so the
        same rows come back on every call (reproducible re-runs). When None,
        the global ``random`` module state is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``metadata``, one row per sampled chunk. The
        columns are present even when no chunk is sampled.
    """
    documents = all_docs["documents"]
    metadatas = all_docs["metadatas"]
    # Fall back to the module-level generator so unseeded calls behave as before.
    rng = random if seed is None else random.Random(seed)
    picked = rng.sample(range(len(documents)), min(n, len(documents)))
    rows = [{"text": documents[i], "metadata": metadatas[i]} for i in picked]
    # Explicit columns keep the frame's shape stable for an empty sample.
    return pd.DataFrame(rows, columns=["text", "metadata"])

sample_chunks(3)

Unnamed: 0,text,metadata
0,## Comit√™ de Pol√≠tica Monet√°ria -Copom \n10 e...,"{'chunk_kind': 'text', 'section_path': 'Comit√™..."
1,| iTunes K.K. ...,"{'doc_type': '10-Q Filing', 'order': 388, 'com..."
2,## Macroeconomic and Industry Risks,"{'company': 'Apple', 'doc_date': '', 'doc_id':..."


## 🔟 Ver dados agrupados por seção (section_path)

In [11]:
def find_by_section(section: str):
    """Return the chunks whose ``section_path`` metadata equals ``section``.

    Queries the notebook-level ``collection`` with an ``$eq`` filter on the
    ``section_path`` metadata field, requesting texts and metadata only.

    Parameters
    ----------
    section : str
        Exact ``section_path`` value to match.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``metadata``, one row per matching chunk.
    """
    where_clause = {"section_path": {"$eq": section}}
    result = collection.get(
        where=where_clause,
        include=["documents", "metadatas"],
    )
    columns = {
        "text": result["documents"],
        "metadata": result["metadatas"],
    }
    return pd.DataFrame(columns)

# Exemplo:
# find_by_section("Financial Statements > Revenue")

## 1️⃣1️⃣ Ver formato dos embeddings

In [12]:
# Take the embedding of the first loaded chunk and show its length together
# with its first ten components.
emb = all_docs["embeddings"][0]
len(emb), emb[:10]

(1536,
 array([ 0.02699038,  0.01287768,  0.03478646,  0.03010367,  0.01716166,
        -0.00547398,  0.00225295, -0.00515236, -0.01037547,  0.01525767]))

## 1️⃣2️⃣ Análise Completa dos Metadados

In [17]:
# Per-key profile of the metadata: how many chunks carry the key, how many
# distinct values it takes, a few example values, and the type of the first
# value seen.
metadata_analysis = {}

for key in sorted(metadata_keys):
    values = [meta[key] for meta in all_docs["metadatas"] if key in meta]
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # examples below are the same on every kernel restart (iterating a set of
    # strings is not, because string hashing is randomized per process).
    unique_values = list(dict.fromkeys(values))

    metadata_analysis[key] = {
        "total_ocorr√™ncias": len(values),
        "valores_√∫nicos": len(unique_values),
        "exemplos": unique_values[:5],  # up to 5 examples, in first-seen order
        "tipo_dados": type(values[0]).__name__ if values else "None",
    }

# Print the profile, one block per key.
for key, info in metadata_analysis.items():
    print(f"\nüìå '{key}':")
    print(f"   Tipo: {info['tipo_dados']}")
    print(f"   Ocorr√™ncias: {info['total_ocorr√™ncias']}")
    print(f"   Valores √∫nicos: {info['valores_√∫nicos']}")
    print(f"   Exemplos: {info['exemplos']}")



üìå 'chunk_kind':
   Tipo: str
   Ocorr√™ncias: 539
   Valores √∫nicos: 2
   Exemplos: ['text', 'table']

üìå 'company':
   Tipo: str
   Ocorr√™ncias: 539
   Valores √∫nicos: 6
   Exemplos: ['', 'Microsoft', 'Apple', 'Banco Central do Brasil', 'NVIDIA']

üìå 'doc_date':
   Tipo: str
   Ocorr√™ncias: 539
   Valores √∫nicos: 3
   Exemplos: ['', 'November 2023', '2023-11-21']

üìå 'doc_id':
   Tipo: str
   Ocorr√™ncias: 539
   Valores √∫nicos: 6
   Exemplos: ['c013e7147789ebe5be4b8b2d52d4083a1134056d', '76069ec81d26753e4be5042f2ef3ee6b7d1d6e88', '588ad141573701c6a100a73ecfc8a0d2a4f2436a', 'fd8502ebf9d3a72c3a5e95308a59589d14c91c7d', '25fa12860dc1573585f15942e9b1b6d0620581d4']

üìå 'doc_type':
   Tipo: str
   Ocorr√™ncias: 539
   Valores √∫nicos: 3
   Exemplos: ['10-Q Filing', 'Research Report', 'Meeting Minutes']

üìå 'order':
   Tipo: int
   Ocorr√™ncias: 539
   Valores √∫nicos: 399
   Exemplos: [0, 1, 2, 3, 4]

üìå 'section_path':
   Tipo: str
   Ocorr√™ncias: 539
   Valores √∫ni

## 1️⃣3️⃣ Tabela Resumida dos Metadados

In [None]:
# Build a one-row-per-key summary table from `metadata_analysis`.
# Coverage is the share of all loaded chunks that carry the key.
total_chunks = len(all_docs["metadatas"])
summary_rows = []
for key, info in metadata_analysis.items():
    occurrences = info["total_ocorr√™ncias"]
    summary_rows.append(
        {
            "Chave": key,
            "Tipo": info["tipo_dados"],
            "Ocorr√™ncias": occurrences,
            "Valores √önicos": info["valores_√∫nicos"],
            "Cobertura %": f"{(occurrences / total_chunks * 100):.1f}%",
        }
    )
df_metadata_summary = pd.DataFrame(summary_rows)

# Display with the most frequent keys first.
df_metadata_summary.sort_values("Ocorr√™ncias", ascending=False)

## 1️⃣4️⃣ Exemplos Reais de Metadados Completos

In [None]:
# Print the full metadata of the first few chunks (at most three), each with
# a short excerpt of its text.
banner = "=" * 80
print(banner)
print("EXEMPLOS DE METADADOS COMPLETOS:")
print(banner)

for idx, meta in enumerate(all_docs["metadatas"][:3]):
    excerpt = all_docs["documents"][idx][:150]
    print(f"\nüìÑ Documento #{idx+1}:")
    print(f"Texto (primeiros 150 chars): {excerpt}...")
    print("\nMetadados:")
    for field, field_value in meta.items():
        print(f"  ‚Ä¢ {field}: {field_value}")
    print("-" * 80)

## 1️⃣5️⃣ Exportar Metadados para JSON

In [None]:
# Export a metadata report as JSON for offline analysis.
# NOTE: the path is relative, so the file lands under the kernel's current
# working directory.
output_file = Path("data/staged/metadados_completos.json")
output_file.parent.mkdir(parents=True, exist_ok=True)

# Each stored record is one chunk; source documents are told apart by their
# "doc_id" metadata value, so the document total is the number of distinct ids
# (previously this field just repeated the chunk count).
distinct_doc_ids = {
    meta["doc_id"] for meta in all_docs["metadatas"] if meta and "doc_id" in meta
}

export_data = {
    "total_documentos": len(distinct_doc_ids),
    "total_chunks": len(all_docs["documents"]),
    "chaves_metadados": sorted(metadata_keys),
    "analise_chaves": metadata_analysis,
    "amostra_metadados": all_docs["metadatas"][:10],  # metadata of the first 10 chunks
}

# ensure_ascii=False writes non-ASCII characters as-is, so the file encoding
# must be pinned to UTF-8 instead of depending on the platform default.
with output_file.open("w", encoding="utf-8") as f:
    json.dump(export_data, f, indent=2, ensure_ascii=False)

# Short report of what was written.
print(f"‚úÖ Metadados exportados para: {output_file}")
print(f"Arquivo criado com {len(export_data)} se√ß√µes principais")
print(f"\nConte√∫do resumido:")
for section, value in export_data.items():
    if isinstance(value, (list, dict)):
        print(f"  ‚Ä¢ {section}: {len(value)} itens")
    else:
        print(f"  ‚Ä¢ {section}: {value}")