In [3]:
# 02 - Indexaci√≥n Unificada

# Objetivo: Crear √≠ndices TF-IDF, ChromaDB y FAISS a partir de chunks preprocesados.

# Flujo del notebook:
# 1. Cargar chunks desde data/preprocessed/processed_*
# 2. Crear √≠ndice TF-IDF (baseline)
# 3. Generar embeddings y poblar ChromaDB
# 4. Construir √≠ndice FAISS
# 5. Realizar consultas de prueba
# 6. Comparar resultados entre m√©todos

# ===================================
# 1. Configuraci√≥n y Setup
# ===================================

import os
import sys
import pprint
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# Configurar rutas del proyecto
project_root = os.path.abspath("..")
src_path = os.path.join(project_root, "src")
if src_path not in sys.path:
    sys.path.append(src_path)

print("‚úÖ Project root:", project_root)
print("‚úÖ Src path added:", src_path)

# -----------------------------------

from utils import load_chunks_from_folder

BASE_PREPROCESSED = os.path.join(project_root, "data", "preprocessed")
folders = sorted([
    os.path.join(BASE_PREPROCESSED, f) 
    for f in os.listdir(BASE_PREPROCESSED) 
    if f.startswith("processed_")
])

print("üìÅ Carpetas procesadas detectadas:")
pprint.pprint(folders)

# ===================================
# 2. Carga de Datos y Metadata
# ===================================

records = []
for folder in folders:
    recs = load_chunks_from_folder(folder)
    print(f"üìÑ Le√≠dos {len(recs)} registros desde {folder}")
    records.extend(recs)

df = pd.DataFrame.from_records(records)
print(f"\n‚úÖ Total chunks cargados: {len(df)}")
df.head()

# -----------------------------------

def make_formatted_id(row):
    return f"{row['book_name']}_{row['chunk_size']}_{row['overlap']}_chunk_{int(row['chunk_number']):02d}"

df['formatted_chunk_id'] = df.apply(make_formatted_id, axis=1)

print("üìä Ejemplo de formatted_chunk_id:")
display(df[['formatted_chunk_id', 'book_name', 'chunk_number', 'word_count']].head(6))

print("\nüìà Conteo por chunk_size:")
display(df['chunk_size'].value_counts())

print("\nüìö Top libros por n√∫mero de chunks:")
display(df['book_name'].value_counts())

# ===================================
# 3. Indexaci√≥n TF-IDF (Baseline)
# ===================================

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def create_tfidf_index(chunks):
    if not chunks:
        raise ValueError("‚ùå La lista de chunks est√° vac√≠a. No se puede crear el √≠ndice TF-IDF.")
    
    print("‚öôÔ∏è Generando representaciones TF-IDF de los chunks...")
    vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
    X = vectorizer.fit_transform(chunks)
    print(f"üìà Indexados {len(chunks)} chunks.")
    return vectorizer, X

def query_tfidf(query, vectorizer, X, chunks, top_k=3):
    if not query.strip():
        raise ValueError("‚ùå La consulta est√° vac√≠a.")
    
    query_vec = vectorizer.transform([query])
    similarities = cosine_similarity(query_vec, X).flatten()
    top_indices = np.argsort(similarities)[::-1][:top_k]
    
    top_chunks = [(chunks[i], similarities[i]) for i in top_indices]
    return top_chunks

# -----------------------------------

# IMPORTANTE: Guardar documentos para TF-IDF antes de cualquier reasignaci√≥n
documents_tfidf = df['text'].astype(str).tolist()
vectorizer_tfidf, X_tfidf = create_tfidf_index(documents_tfidf)

print(f"\n‚úÖ √çndice TF-IDF creado con {X_tfidf.shape[0]} documentos")
print(f"üìä Dimensionalidad: {X_tfidf.shape[1]} features")

# ===================================
# 4. Indexaci√≥n Vectorial (ChromaDB + FAISS)
# ===================================

from sentence_transformers import SentenceTransformer

embed_model_name = "sentence-transformers/multi-qa-MiniLM-L6-cos-v1"
print(f"ü§ñ Cargando modelo de embeddings: {embed_model_name}")
embed_model = SentenceTransformer(embed_model_name)

def token_count(text):
    tokens = embed_model.tokenize(text)
    return tokens['input_ids'].shape[1]

df['token_count_sample'] = df['text'].apply(
    lambda x: token_count(x) if len(x.split()) < 1000 else None
)

print("\nüìä Informaci√≥n de tokenizaci√≥n:")
display(df[['formatted_chunk_id', 'word_count', 'token_count_sample']].head(6))

# -----------------------------------

ids = df['formatted_chunk_id'].astype(str).tolist()
documents = df['text'].astype(str).tolist()
metadatas = df.apply(lambda r: {
    "book_name": r['book_name'],
    "chunk_size": int(r['chunk_size']) if pd.notnull(r['chunk_size']) else None,
    "overlap": int(r['overlap']) if pd.notnull(r['overlap']) else None,
    "chunk_number": int(r['chunk_number']),
    "word_count": int(r['word_count'])
}, axis=1).tolist()

print(f"‚úÖ Preparados {len(ids)} documentos para indexaci√≥n")

‚úÖ Project root: c:\Users\Sofia\RAGModel_MineriaMultimedia_202520
‚úÖ Src path added: c:\Users\Sofia\RAGModel_MineriaMultimedia_202520\src
üìÅ Carpetas procesadas detectadas:
['c:\\Users\\Sofia\\RAGModel_MineriaMultimedia_202520\\data\\preprocessed\\processed_400_100',
 'c:\\Users\\Sofia\\RAGModel_MineriaMultimedia_202520\\data\\preprocessed\\processed_600_150',
 'c:\\Users\\Sofia\\RAGModel_MineriaMultimedia_202520\\data\\preprocessed\\processed_800_200']
üìÑ Le√≠dos 28 registros desde c:\Users\Sofia\RAGModel_MineriaMultimedia_202520\data\preprocessed\processed_400_100
üìÑ Le√≠dos 21 registros desde c:\Users\Sofia\RAGModel_MineriaMultimedia_202520\data\preprocessed\processed_600_150
üìÑ Le√≠dos 17 registros desde c:\Users\Sofia\RAGModel_MineriaMultimedia_202520\data\preprocessed\processed_800_200

‚úÖ Total chunks cargados: 66
üìä Ejemplo de formatted_chunk_id:


Unnamed: 0,formatted_chunk_id,book_name,chunk_number,word_count
0,data_summary_400_100_chunk_01,data_summary,1,400
1,data_summary_400_100_chunk_02,data_summary,2,400
2,data_summary_400_100_chunk_03,data_summary,3,253
3,breakingdawn_bookone_400_100_chunk_01,breakingdawn_bookone,1,400
4,breakingdawn_bookone_400_100_chunk_02,breakingdawn_bookone,2,186
5,breakingdawn_bookthree_400_100_chunk_01,breakingdawn_bookthree,1,400



üìà Conteo por chunk_size:


chunk_size
400    28
600    21
800    17
Name: count, dtype: int64


üìö Top libros por n√∫mero de chunks:


book_name
newmoon                   16
breakingdawn_bookthree    12
breakingdawn_booktwo       9
data_summary               7
twilight                   7
eclipse                    7
breakingdawn_bookone       5
midnightsun                3
Name: count, dtype: int64

'(ProtocolError('Connection aborted.', RemoteDisconnected('Remote end closed connection without response')), '(Request ID: 689b1d7a-b644-44cc-a683-f8f30c565100)')' thrown while requesting HEAD https://huggingface.co/sentence-transformers/multi-qa-MiniLM-L6-cos-v1/resolve/main/./modules.json
Retrying in 1s [Retry 1/5].


‚öôÔ∏è Generando representaciones TF-IDF de los chunks...
üìà Indexados 66 chunks.

‚úÖ √çndice TF-IDF creado con 66 documentos
üìä Dimensionalidad: 1538 features
ü§ñ Cargando modelo de embeddings: sentence-transformers/multi-qa-MiniLM-L6-cos-v1

üìä Informaci√≥n de tokenizaci√≥n:


Unnamed: 0,formatted_chunk_id,word_count,token_count_sample
0,data_summary_400_100_chunk_01,400,3
1,data_summary_400_100_chunk_02,400,3
2,data_summary_400_100_chunk_03,253,3
3,breakingdawn_bookone_400_100_chunk_01,400,3
4,breakingdawn_bookone_400_100_chunk_02,186,3
5,breakingdawn_bookthree_400_100_chunk_01,400,3


‚úÖ Preparados 66 documentos para indexaci√≥n
