**Resumen del flujo**
1. Cargar todos los `_chunks.txt` de `data/preprocessed/processed_*`  
2. Extraer metadata y construir `formatted_chunk_id` (ej: `twilight_400_100_chunk_3`)  
3. Crear DataFrame de control y verificar distribuci√≥n de chunks/tokens  
4. Construir embeddings y poblar ChromaDB  
5. Ejecutar consultas de prueba y validar resultados

In [2]:
# 1.1 ‚Äî Importaciones y configuraci√≥n de rutas
import os, sys, pprint
project_root = os.path.abspath("..") 
src_path = os.path.join(project_root, "src")
if src_path not in sys.path:
    sys.path.append(src_path)

print("Project root:", project_root)
print("Src path added:", src_path)

Project root: c:\Users\Sofia\RAGModel_MineriaMultimedia_202520
Src path added: c:\Users\Sofia\RAGModel_MineriaMultimedia_202520\src


In [3]:
from utils import load_chunks_from_folder
import pandas as pd
from tqdm.auto import tqdm

# Folder base de preprocesados
BASE_PREPROCESSED = os.path.join(project_root, "data", "preprocessed")

# Mostramos las carpetas detectadas (sanity check)
folders = sorted([os.path.join(BASE_PREPROCESSED, f) for f in os.listdir(BASE_PREPROCESSED) if f.startswith("processed_")])
print("Carpetas procesadas detectadas:")
pprint.pprint(folders)


Carpetas procesadas detectadas:
['c:\\Users\\Sofia\\RAGModel_MineriaMultimedia_202520\\data\\preprocessed\\processed_400_100',
 'c:\\Users\\Sofia\\RAGModel_MineriaMultimedia_202520\\data\\preprocessed\\processed_600_150',
 'c:\\Users\\Sofia\\RAGModel_MineriaMultimedia_202520\\data\\preprocessed\\processed_800_200']


  from .autonotebook import tqdm as notebook_tqdm


### 3 ‚Äî Cargar todos los `_chunks.txt` y crear un DataFrame unificado
La funci√≥n `load_chunks_from_folder` devuelve una lista de diccionarios con campos:
- chunk_id (num√©rico seg√∫n utils.py v1)
- text
- book_name
- chunk_size
- overlap
- chunk_number
- word_count

En esta celda combinaremos todo y construiremos un `formatted_chunk_id` con el esquema:
`{book_name}_{chunk_size}_{overlap}_chunk_{chunk_number:02d}`.

In [4]:
# Cargar datos
records = []
for folder in folders:
    recs = load_chunks_from_folder(folder)
    print(f"Le√≠dos {len(recs)} registros desde {folder}")
    records.extend(recs)

# Convertir a DataFrame
df = pd.DataFrame.from_records(records)
print("Total chunks cargados:", len(df))
df.head()

Le√≠dos 28 registros desde c:\Users\Sofia\RAGModel_MineriaMultimedia_202520\data\preprocessed\processed_400_100
Le√≠dos 21 registros desde c:\Users\Sofia\RAGModel_MineriaMultimedia_202520\data\preprocessed\processed_600_150
Le√≠dos 17 registros desde c:\Users\Sofia\RAGModel_MineriaMultimedia_202520\data\preprocessed\processed_800_200
Total chunks cargados: 66


Unnamed: 0,chunk_id,text,book_name,chunk_size,overlap,chunk_number,word_count
0,1,"1 Bella Swan moves from Phoenix, Arizona to th...",data_summary,400,100,1,400
1,2,"Meanwhile, Victoria, a vengeful vampire, build...",data_summary,400,100,2,400
2,3,"their sparkling skin. 21 Charlie Swan, Bella‚Äôs...",data_summary,400,100,3,253
3,4,"After Bella's pickup truck dies a ""natural dea...",breakingdawn_bookone,400,100,1,400
4,5,love cautiously and Edward does not inflict an...,breakingdawn_bookone,400,100,2,186


In [5]:
# Crear formatted_chunk_id
def make_formatted_id(row):
    return f"{row['book_name']}_{row['chunk_size']}_{row['overlap']}_chunk_{int(row['chunk_number']):02d}"

df['formatted_chunk_id'] = df.apply(make_formatted_id, axis=1)

# Mostrar distribuci√≥n b√°sica
print("Ejemplo de formatted_chunk_id:")
display(df[['formatted_chunk_id', 'book_name', 'chunk_number', 'word_count']].head(6))

print("\nConteo por chunk_size:")
display(df['chunk_size'].value_counts())

print("\nTop 5 libros por n√∫mero de chunks:")
display(df['book_name'].value_counts().head(8))

Ejemplo de formatted_chunk_id:


Unnamed: 0,formatted_chunk_id,book_name,chunk_number,word_count
0,data_summary_400_100_chunk_01,data_summary,1,400
1,data_summary_400_100_chunk_02,data_summary,2,400
2,data_summary_400_100_chunk_03,data_summary,3,253
3,breakingdawn_bookone_400_100_chunk_01,breakingdawn_bookone,1,400
4,breakingdawn_bookone_400_100_chunk_02,breakingdawn_bookone,2,186
5,breakingdawn_bookthree_400_100_chunk_01,breakingdawn_bookthree,1,400



Conteo por chunk_size:


chunk_size
400    28
600    21
800    17
Name: count, dtype: int64


Top 5 libros por n√∫mero de chunks:


book_name
newmoon                   16
breakingdawn_bookthree    12
breakingdawn_booktwo       9
data_summary               7
twilight                   7
eclipse                    7
breakingdawn_bookone       5
midnightsun                3
Name: count, dtype: int64

### 3 ‚Äî Cargar modelo de embeddings (SentenceTransformers: all-MiniLM-L6-v2)
Este modelo es r√°pido y produce embeddings peque√±os y pr√°cticos para pruebas.

In [6]:
from sentence_transformers import SentenceTransformer
embed_model_name = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
print("Cargando modelo de embeddings:", embed_model_name)
embed_model = SentenceTransformer(embed_model_name)

# Token count (subword) -> utilidad informativa
def token_count(text):
    tokens = embed_model.tokenize(text)
    return tokens['input_ids'].shape[1]

# Calculamos token_count en una muestra
df['token_count_sample'] = df['text'].apply(lambda x: token_count(x) if len(x.split()) < 1000 else None)

display(df[['formatted_chunk_id','word_count','token_count_sample']].head(6))

Cargando modelo de embeddings: sentence-transformers/multi-qa-MiniLM-L6-cos-v1


Unnamed: 0,formatted_chunk_id,word_count,token_count_sample
0,data_summary_400_100_chunk_01,400,3
1,data_summary_400_100_chunk_02,400,3
2,data_summary_400_100_chunk_03,253,3
3,breakingdawn_bookone_400_100_chunk_01,400,3
4,breakingdawn_bookone_400_100_chunk_02,186,3
5,breakingdawn_bookthree_400_100_chunk_01,400,3


### 5 ‚Äî Preparar listas para insertar en ChromaDB
ChromaDB recibir√°:
- ids: lista de `formatted_chunk_id`
- documents: lista de textos (`text`)
- metadatas: lista de dicts

In [7]:
# Preparar listas para Chroma / FAISS
ids = df['formatted_chunk_id'].astype(str).tolist()
documents = df['text'].astype(str).tolist()
metadatas = df.apply(lambda r: {
    "book_name": r['book_name'],
    "chunk_size": int(r['chunk_size']) if pd.notnull(r['chunk_size']) else None,
    "overlap": int(r['overlap']) if pd.notnull(r['overlap']) else None,
    "chunk_number": int(r['chunk_number']),
    "word_count": int(r['word_count'])
}, axis=1).tolist()

print("Documentos a indexar:", len(ids))

Documentos a indexar: 66


### 6 ‚Äî Construir √≠ndice ChromaDB y poblarlo con embeddings
Usaremos `chromadb` con persistencia local en `data/index/chroma`.  

In [11]:
import os
import chromadb
from chromadb import PersistentClient

# Ruta persistente del √≠ndice
project_root = os.getcwd()  # << Mantengo tu l√≥gica
CHROMA_PERSIST_DIR = os.path.join(project_root, "data", "index", "chroma")
os.makedirs(CHROMA_PERSIST_DIR, exist_ok=True)

# Inicializar Chroma (modo persistente)
client = PersistentClient(path=CHROMA_PERSIST_DIR)
print("‚úÖ Chroma PersistentClient inicializado en:", CHROMA_PERSIST_DIR)

COLLECTION_NAME = "chunks_collection"

# Cargar o crear colecci√≥n
try:
    collection = client.get_collection(COLLECTION_NAME)
    print(f"üìÇ Colecci√≥n existente cargada: {COLLECTION_NAME}")
except Exception:
    collection = client.create_collection(name=COLLECTION_NAME)
    print(f"üÜï Colecci√≥n creada: {COLLECTION_NAME}")

# Evitar duplicados
try:
    existing_docs = collection.get(include=["ids"])
    existing_ids = set(existing_docs["ids"])
except Exception:
    existing_ids = set()

to_add_indices = [i for i, _id in enumerate(ids) if _id not in existing_ids]

print(f"üîç Nuevos docs a insertar: {len(to_add_indices)} / {len(ids)}")

# Insertar nuevos chunks en batches
BATCH = 100
for start in range(0, len(to_add_indices), BATCH):
    end = start + BATCH
    batch_idx = to_add_indices[start:end]

    collection.add(
        ids=[ids[i] for i in batch_idx],
        documents=[documents[i] for i in batch_idx],
        metadatas=[metadatas[i] for i in batch_idx]
    )
    print(f"‚úÖ Insertados: {len(batch_idx)} (Total progreso: {end}/{len(to_add_indices)})")

print("\nüìå Indexaci√≥n completada")
print("üìÅ Total en colecci√≥n ahora:", collection.count())

‚úÖ Chroma PersistentClient inicializado en: c:\Users\Sofia\RAGModel_MineriaMultimedia_202520\notebooks\data\index\chroma
üÜï Colecci√≥n creada: chunks_collection
üîç Nuevos docs a insertar: 66 / 66


C:\Users\Sofia\.cache\chroma\onnx_models\all-MiniLM-L6-v2\onnx.tar.gz: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 79.3M/79.3M [00:03<00:00, 26.6MiB/s]


‚úÖ Insertados: 66 (Total progreso: 100/66)

üìå Indexaci√≥n completada
üìÅ Total en colecci√≥n ahora: 66


In [12]:
client.list_collections()

[Collection(name=chunks_collection)]

### 5 ‚Äî Construir un √≠ndice FAISS local con las mismas embeddings
Si `faiss` est√° disponible, construimos un √≠ndice IndexFlatIP (producto interno) sobre embeddings L2-normalizados.

In [None]:
import numpy as np

use_faiss = True
try:
    import faiss
except Exception:
    print("faiss no est√° disponible; se omitir√° la construcci√≥n de FAISS.")
    use_faiss = False

faiss_index_path = os.path.join(project_root, "data", "index", "faiss_index.bin")
faiss_meta_path = os.path.join(project_root, "data", "index", "faiss_metadata.parquet")

if use_faiss:
    # calcular embeddings para TODO el set
    print("Calculando embeddings para FAISS (todo el conjunto)...")
    all_embeddings = embed_model.encode(documents, show_progress_bar=True, convert_to_numpy=True)

    # normalizar (IP con vectores unitarios ‚âà cos sim)
    def normalize(x):
        norms = np.linalg.norm(x, axis=1, keepdims=True)
        norms[norms==0] = 1e-9
        return x / norms
    all_embeddings = normalize(all_embeddings).astype('float32')

    # crear √≠ndice
    d = all_embeddings.shape[1]
    index = faiss.IndexFlatIP(d)
    index.add(all_embeddings)
    print("FAISS index creado. N¬∞ vectores:", index.ntotal)

    # guardar √≠ndice y metadata (ids, metadatas)
    faiss.write_index(index, faiss_index_path)
    print("FAISS index guardado en:", faiss_index_path)

    # guardar metadata (ids, formatted id, book_name, etc.)
    import pyarrow as pa  # optional, but pandas.to_parquet usa pyarrow
    df_meta = df.copy()
    df_meta['faiss_id'] = range(len(df_meta))  # mapping index position -> metadata
    # Convertir columnas a tipos "seguros" para Parquet
    df_meta_clean = df_meta.copy()
    for col in df_meta_clean.columns:
        if df_meta_clean[col].dtype == "Int64":     # Int64 nullable ‚Üí int32 o int64 normal
            df_meta_clean[col] = df_meta_clean[col].astype("int64")
        elif df_meta_clean[col].dtype == "string":  # String Arrow type ‚Üí str (object)
            df_meta_clean[col] = df_meta_clean[col].astype(str)

    df_meta_clean.to_parquet(faiss_meta_path, index=False)
    print("‚úÖ Metadata FAISS guardada sin Arrow extension types en:", faiss_meta_path)

    print("Metadata FAISS guardada en:", faiss_meta_path)
else:
    print("FAISS no construido.")

Calculando embeddings para FAISS (todo el conjunto)...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:18<00:00,  6.13s/it]


FAISS index creado. N¬∞ vectores: 66
FAISS index guardado en: c:\Users\Sofia\RAGModel_MineriaMultimedia_202520\notebooks\data\index\faiss_index.bin


ArrowKeyError: No type extension with name arrow.py_extension_type found

In [None]:
# 6 ‚Äî Funciones de b√∫squeda (Chroma)
def chroma_search(query, top_k=5):
    q_emb = embed_model.encode([query], convert_to_numpy=True)[0].tolist()
    res = collection.query(query_embeddings=[q_emb], n_results=top_k, include=['ids','documents','metadatas','distances'])
    return res

# 7 ‚Äî Funci√≥n de b√∫squeda FAISS (usa embeddings normalizados)
if use_faiss:
    # recargar √≠ndice y metadata (por si se ejecuta en otra sesi√≥n)
    idx = faiss.read_index(faiss_index_path)
    import pandas as pd
    faiss_meta = pd.read_parquet(faiss_meta_path)

    def faiss_search(query, top_k=5):
        q_emb = embed_model.encode([query], convert_to_numpy=True)
        # normalizar
        q_emb = q_emb / (np.linalg.norm(q_emb, axis=1, keepdims=True) + 1e-9)
        D, I = idx.search(q_emb.astype('float32'), top_k)
        results = []
        for dist, ind in zip(D[0], I[0]):
            meta = faiss_meta.iloc[ind].to_dict()
            doc = faiss_meta.iloc[ind]['text']
            results.append({"index": int(ind), "distance": float(dist), "meta": meta, "doc": doc})
        return results
else:
    def faiss_search(query, top_k=5):
        raise RuntimeError("FAISS no est√° disponible en este entorno.")

In [None]:
queries = [
    "Who saves Bella from the van?",
    "Which Cullen family member is a doctor?",
    "Where does Bella move to?",
    "What family feeds on animal blood?"
]

print("=== Chroma results ===")
for q in queries:
    print("\n>>> Query:", q)
    r = chroma_search(q, top_k=5)
    for i in range(len(r['ids'][0])):
        print(f"Rank {i+1} ‚Äî id: {r['ids'][0][i]} dist={r['distances'][0][i]:.4f}")
        meta = r['metadatas'][0][i]
        print("  book:", meta.get('book_name'), "chunk:", meta.get('chunk_number'), "words:", meta.get('word_count'))
        print("  snippet:", r['documents'][0][i][:200].replace('\n',' '), "...")
    print("-"*60)

if use_faiss:
    print("\n=== FAISS results ===")
    for q in queries:
        print("\n>>> Query:", q)
        res = faiss_search(q, top_k=5)
        for i, r in enumerate(res):
            print(f"Rank {i+1} ‚Äî faiss_index: {r['index']} dist={r['distance']:.4f} book={r['meta']['book_name']} chunk={r['meta']['chunk_number']}")
            print("  snippet:", r['doc'][:200].replace('\n',' '), "...")
        print("-"*60)

In [None]:
OUT_META_PATH = os.path.join(project_root, "data", "index", "chunks_metadata.parquet")
os.makedirs(os.path.dirname(OUT_META_PATH), exist_ok=True)
df.to_parquet(OUT_META_PATH, index=False)
print("Metadata guardada en:", OUT_META_PATH)