# Cargar el libro 

Hacer RAG con el libro proporcionado

In [1]:
%pip install pymupdf

Note: you may need to restart the kernel to use updated packages.


In [2]:
import fitz  # PyMuPDF

# Use raw string (r prefix) or forward slashes to avoid escape sequence issues
doc = fitz.open(r"C:\Users\roble\OneDrive\Documentos\GitHub\Recuperacion\ir25a-main\ir25a-main\libro\irbookonlinereading.pdf")

In [3]:
import pandas as pd

In [4]:
data = []
for i, page in enumerate(doc):
    text = page.get_text().encode('utf-8')
    if text.strip():
        data.append({
            "page": i + 1,
            "raw": str(text).strip()
        })

df = pd.DataFrame(data)
df 

Unnamed: 0,page,raw
0,1,b'Online edition (c)\n2009 Cambridge UP\nAn\nI...
1,2,b'Online edition (c)\n2009 Cambridge UP\n'
2,3,b'Online edition (c)\n2009 Cambridge UP\nAn\nI...
3,4,b'Online edition (c)\n2009 Cambridge UP\nDRAFT...
4,5,b'Online edition (c)\n2009 Cambridge UP\nDRAFT...
...,...,...
576,577,b'Online edition (c)\n2009 Cambridge UP\n540\n...
577,578,b'Online edition (c)\n2009 Cambridge UP\nIndex...
578,579,b'Online edition (c)\n2009 Cambridge UP\n542\n...
579,580,b'Online edition (c)\n2009 Cambridge UP\nIndex...


In [5]:
# Verificar si el PDF tiene tabla de contenidos (TOC)
toc = doc.get_toc()

if toc:
    print(f"¡El PDF tiene {len(toc)} elementos en la tabla de contenidos!")
    print("\nPrimeras 10 entradas:")
    for i, (level, title, page) in enumerate(toc[:100]):
        indent = "  " * (level - 1)
        print(f"{indent}Nivel {level}: {title} (Página {page})")
else:
    print("El PDF no tiene tabla de contenidos estructurada")

¡El PDF tiene 256 elementos en la tabla de contenidos!

Primeras 10 entradas:
Nivel 1: List of Tables (Página 15)
Nivel 1: List of Figures (Página 19)
Nivel 1: Table of Notation (Página 27)
Nivel 1: Preface (Página 31)
Nivel 1: Boolean retrieval (Página 38)
  Nivel 2: An example information retrieval problem (Página 40)
  Nivel 2: A first take at building an inverted index (Página 43)
  Nivel 2: Processing Boolean queries (Página 47)
  Nivel 2: The extended Boolean model versus ranked retrieval (Página 51)
  Nivel 2: References and further reading (Página 54)
Nivel 1: The term vocabulary and postings lists (Página 56)
  Nivel 2: Document delineation and character sequence decoding (Página 56)
    Nivel 3: Obtaining the character sequence in a document (Página 56)
    Nivel 3: Choosing a document unit (Página 57)
  Nivel 2: Determining the vocabulary of terms (Página 59)
    Nivel 3: Tokenization (Página 59)
    Nivel 3: Dropping common terms: stop words (Página 64)
    Nivel 3: Normali

In [6]:
def extract_hierarchical_documents(doc, start_page=38):
    """
    Extrae documentos jerárquicos desde una página específica
    Cada elemento de TOC se convierte en un documento separado
    """
    toc = doc.get_toc()
    
    if not toc:
        print("No hay tabla de contenidos disponible")
        return None
    
    # Filtrar TOC desde la página especificada
    filtered_toc = [(level, title, page) for level, title, page in toc if page >= start_page]
    
    documents = []
    
    for i, (level, title, start_page_toc) in enumerate(filtered_toc):
        # Determinar página final del documento
        if i + 1 < len(filtered_toc):
            end_page = filtered_toc[i + 1][2] - 1
        else:
            end_page = len(doc)
        
        # Extraer texto del rango de páginas
        content = ""
        for page_num in range(start_page_toc - 1, min(end_page, len(doc))):
            if page_num < len(doc):
                page_text = doc[page_num].get_text()
                content += page_text + "\n"
        
        documents.append({
            'doc_id': i + 1,
            'level': level,
            'title': title.strip(),
            'start_page': start_page_toc,
            'end_page': end_page,
            'content': content.strip(),
            'word_count': len(content.split()),
            'char_count': len(content),
            'parent_level': level - 1 if level > 1 else None
        })
    
    return pd.DataFrame(documents)

# Extraer documentos jerárquicos desde página 38
hierarchical_docs_df = extract_hierarchical_documents(doc, start_page=38)

if hierarchical_docs_df is not None:
    print(f"Se extrajeron {len(hierarchical_docs_df)} documentos desde la página 38")
    print("\nPrimeros 10 documentos:")
    print(hierarchical_docs_df[['doc_id', 'level', 'title', 'start_page', 'end_page', 'word_count']].head(10))

Se extrajeron 252 documentos desde la página 38

Primeros 10 documentos:
   doc_id  level                                              title  \
0       1      1                                  Boolean retrieval   
1       2      2           An example information retrieval problem   
2       3      2         A first take at building an inverted index   
3       4      2                         Processing Boolean queries   
4       5      2  The extended Boolean model versus ranked retri...   
5       6      2                     References and further reading   
6       7      1             The term vocabulary and postings lists   
7       8      2  Document delineation and character sequence de...   
8       9      3     Obtaining the character sequence in a document   
9      10      3                           Choosing a document unit   

   start_page  end_page  word_count  
0          38        39         869  
1          40        42        1276  
2          43        46        

In [7]:
hierarchical_docs_df

Unnamed: 0,doc_id,level,title,start_page,end_page,content,word_count,char_count,parent_level
0,1,1,Boolean retrieval,38,39,Online edition (c)\n2009 Cambridge UP\nDRAFT! ...,869,5449,
1,2,2,An example information retrieval problem,40,42,Online edition (c)\n2009 Cambridge UP\n1.1\nAn...,1276,7296,1.0
2,3,2,A first take at building an inverted index,43,46,Online edition (c)\n2009 Cambridge UP\n6\n1\nB...,1732,9452,1.0
3,4,2,Processing Boolean queries,47,50,Online edition (c)\n2009 Cambridge UP\n10\n1\n...,1402,8273,1.0
4,5,2,The extended Boolean model versus ranked retri...,51,53,Online edition (c)\n2009 Cambridge UP\n14\n1\n...,1387,8516,1.0
...,...,...,...,...,...,...,...,...,...
247,248,2,Hubs and Authorities,511,513,Online edition (c)\n2009 Cambridge UP\n474\n21...,1261,7061,1.0
248,249,3,Choosing the subset of the Web,514,516,Online edition (c)\n2009 Cambridge UP\n21.3\nH...,1066,5945,2.0
249,250,2,References and further reading,517,519,Online edition (c)\n2009 Cambridge UP\n480\n21...,628,3784,1.0
250,251,1,Bibliography,520,557,Online edition (c)\n2009 Cambridge UP\nDRAFT! ...,14866,105679,


In [8]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import numpy as np
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\roble\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\roble\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [9]:
import re

stop_words = set(stopwords.words('english'))

def preprocess_docs(docs):
    # Primero, vamos a ver qué contiene realmente el texto
    # print(f"Texto original (primeros 200 chars): {repr(docs[:200])}")
    
    # Lista de textos específicos a eliminar con diferentes variaciones
    unwanted_texts = [
        "b'Online edition (c)\\n2009 Cambridge UP\\n",
        "b'Online edition (c)\\\\n2009 Cambridge UP\\\\n",
        "Online edition (c) 2009 Cambridge UP",
        "Online edition (c)\\n2009 Cambridge UP",
        "Cambridge University Press",
        "All rights reserved",
        "2009 Cambridge UP",
    ]
    
    # Eliminar textos específicos
    for unwanted in unwanted_texts:
        docs = docs.replace(unwanted, "")
    
    # Usar regex para patrones más flexibles
    patterns_to_remove = [
        r"b'Online edition.*?Cambridge UP.*?'",
        r"Online edition.*?Cambridge UP",
        r"Cambridge University Press",
        r"All rights reserved",
        r"2009 Cambridge UP",
        r"\\\\x[0-9a-fA-F]{2}",  # Caracteres hexadecimales con doble escape
    ]
    
    for pattern in patterns_to_remove:
        docs = re.sub(pattern, "", docs, flags=re.IGNORECASE | re.DOTALL)
    
    # Limpiar caracteres de escape y formateo
    docs = re.sub(r"b'", "", docs)  # Eliminar b'
    docs = re.sub(r"'", "", docs)   # Eliminar comillas finales
    docs = re.sub(r"\\\\n", " ", docs)  # Doble escape de saltos de línea
    docs = re.sub(r"\\n", " ", docs)  # Saltos de línea simples
    docs = re.sub(r"\\\\t", " ", docs)  # Doble escape de tabs
    docs = re.sub(r"\\t", " ", docs)  # Tabs simples
    docs = re.sub(r"\\\\x\w{2}", "", docs)  # Caracteres hexadecimales con doble escape
    docs = re.sub(r"\\x\w{2}", "", docs)  # Caracteres hexadecimales simples
    
    # Eliminar números de página aislados
    docs = re.sub(r'\b\d{1,3}\b', '', docs)
    
    # Eliminar caracteres especiales excesivos
    docs = re.sub(r'[^\w\s]', ' ', docs)
    
    # Limpiar espacios múltiples
    docs = re.sub(r'\s+', ' ', docs).strip()
    
    # Tokenizar y filtrar
    words = word_tokenize(docs)
    word_filtered = [w for w in words if w not in stop_words and w.isalpha() and len(w) > 2]
    return ' '.join(word_filtered)

In [10]:
def preprocess_docs_aggressive(docs):
    """Versión más agresiva de limpieza"""
    
    # Convertir a string si no lo es
    docs = str(docs)
    
    # Eliminar todo lo que empiece con "b'" y termine con una comilla
    docs = re.sub(r"b'.*?'", "", docs, flags=re.DOTALL)
    
    # Eliminar todas las menciones de Cambridge UP y related
    cambridge_patterns = [
        r".*Cambridge.*UP.*",
        r".*Online edition.*",
        r".*All rights reserved.*",
        r".*2009.*Cambridge.*"
    ]
    
    for pattern in cambridge_patterns:
        docs = re.sub(pattern, "", docs, flags=re.IGNORECASE)
    
    # Limpiar todos los tipos de escape
    escape_patterns = [
        r"\\\\n", r"\\n",      # Saltos de línea
        r"\\\\t", r"\\t",      # Tabulaciones  
        r"\\\\x[0-9a-fA-F]{2}", r"\\x[0-9a-fA-F]{2}",  # Hex
        r"\\\\", r"\\"         # Backslashes
    ]
    
    for pattern in escape_patterns:
        docs = re.sub(pattern, " ", docs)
    
    # Eliminar números aislados (páginas)
    docs = re.sub(r'\b\d{1,3}\b', '', docs)
    
    # Limpiar caracteres especiales
    docs = re.sub(r'[^\w\s]', ' ', docs)
    
    # Limpiar espacios múltiples
    docs = re.sub(r'\s+', ' ', docs).strip()
    
    # Tokenizar y filtrar
    words = word_tokenize(docs)
    word_filtered = [w for w in words if w not in stop_words and w.isalpha() and len(w) > 2]
    return ' '.join(word_filtered)

In [11]:
# Aplicar la nueva función de preprocesamiento
if hierarchical_docs_df is not None:
    print("Aplicando preprocesamiento mejorado...")
    
    # Probar con el primer documento para ver si funciona
    test_doc = hierarchical_docs_df.iloc[0]['content']
    print("Antes del preprocesamiento:")
    print(repr(test_doc[:200]))
    
    cleaned_test = preprocess_docs_aggressive(test_doc)
    print("\nDespués del preprocesamiento:")
    print(cleaned_test[:200])
    
    # Aplicar a todos los documentos
    hierarchical_docs_df['preprocessed'] = hierarchical_docs_df['content'].apply(preprocess_docs_aggressive)
    
    # Filtrar documentos con contenido válido
    hierarchical_docs_df = hierarchical_docs_df[hierarchical_docs_df['preprocessed'].str.len() > 50]
    
    print(f"\nDocumentos finales después del preprocesamiento: {len(hierarchical_docs_df)}")

Aplicando preprocesamiento mejorado...
Antes del preprocesamiento:
'Online edition (c)\n2009 Cambridge UP\nDRAFT! © April 1, 2009 Cambridge University Press. Feedback welcome.\n1\n1\nBoolean retrieval\nThe meaning of the term information retrieval can be very broad. Just ge'

Después del preprocesamiento:
Boolean retrieval The meaning term information retrieval broad Just getting credit card wallet type card number form information retrieval However academic ﬁeld study information retrieval might deﬁne

Documentos finales después del preprocesamiento: 215


In [12]:
hierarchical_docs_df

Unnamed: 0,doc_id,level,title,start_page,end_page,content,word_count,char_count,parent_level,preprocessed
0,1,1,Boolean retrieval,38,39,Online edition (c)\n2009 Cambridge UP\nDRAFT! ...,869,5449,,Boolean retrieval The meaning term information...
1,2,2,An example information retrieval problem,40,42,Online edition (c)\n2009 Cambridge UP\n1.1\nAn...,1276,7296,1.0,example information retrieval problem chapter ...
2,3,2,A first take at building an inverted index,43,46,Online edition (c)\n2009 Cambridge UP\n6\n1\nB...,1732,9452,1.0,Boolean retrieval Detailed discussion relevanc...
3,4,2,Processing Boolean queries,47,50,Online edition (c)\n2009 Cambridge UP\n10\n1\n...,1402,8273,1.0,Boolean retrieval Brutus Calpurnia Intersectio...
4,5,2,The extended Boolean model versus ranked retri...,51,53,Online edition (c)\n2009 Cambridge UP\n14\n1\n...,1387,8516,1.0,Boolean retrieval Term Postings size eyes kale...
...,...,...,...,...,...,...,...,...,...,...
247,248,2,Hubs and Authorities,511,513,Online edition (c)\n2009 Cambridge UP\n474\n21...,1261,7061,1.0,Link analysis Exercise Suppose web graph store...
248,249,3,Choosing the subset of the Web,514,516,Online edition (c)\n2009 Cambridge UP\n21.3\nH...,1066,5945,2.0,Hubs Authorities Output top scoring hubs top s...
249,250,2,References and further reading,517,519,Online edition (c)\n2009 Cambridge UP\n480\n21...,628,3784,1.0,Link analysis Exercise How would interpret ent...
250,251,1,Bibliography,520,557,Online edition (c)\n2009 Cambridge UP\nDRAFT! ...,14866,105679,,Bibliography use following abbreviated journal...


In [13]:
from sentence_transformers import SentenceTransformer
import numpy as np

In [14]:
model = SentenceTransformer('all-MiniLM-L6-v2')

In [15]:
embeddings = model.encode(hierarchical_docs_df['preprocessed'].tolist())

In [16]:
embeddings

array([[ 0.00876109, -0.01737127, -0.04260701, ..., -0.03232197,
         0.01516353, -0.02024754],
       [ 0.01462434,  0.01935861, -0.03255737, ..., -0.00875811,
         0.00207664,  0.02770639],
       [-0.00840228, -0.03102122, -0.04935666, ...,  0.00183343,
        -0.00840126,  0.02426352],
       ...,
       [-0.00951959, -0.05244111, -0.09500014, ...,  0.01779133,
        -0.04317988, -0.01395471],
       [-0.03801242, -0.06153281, -0.05434936, ..., -0.04515737,
        -0.0508875 , -0.01516703],
       [ 0.04286065, -0.05765121, -0.03717588, ..., -0.00721208,
         0.0131214 , -0.04086863]], dtype=float32)

In [17]:
hierarchical_docs_df['embedding'] = list(embeddings[:1000])
hierarchical_docs_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hierarchical_docs_df['embedding'] = list(embeddings[:1000])


Unnamed: 0,doc_id,level,title,start_page,end_page,content,word_count,char_count,parent_level,preprocessed,embedding
0,1,1,Boolean retrieval,38,39,Online edition (c)\n2009 Cambridge UP\nDRAFT! ...,869,5449,,Boolean retrieval The meaning term information...,"[0.008761087, -0.01737127, -0.042607006, 0.022..."
1,2,2,An example information retrieval problem,40,42,Online edition (c)\n2009 Cambridge UP\n1.1\nAn...,1276,7296,1.0,example information retrieval problem chapter ...,"[0.014624343, 0.01935861, -0.03255737, -0.0307..."
2,3,2,A first take at building an inverted index,43,46,Online edition (c)\n2009 Cambridge UP\n6\n1\nB...,1732,9452,1.0,Boolean retrieval Detailed discussion relevanc...,"[-0.008402278, -0.031021217, -0.049356658, 0.0..."
3,4,2,Processing Boolean queries,47,50,Online edition (c)\n2009 Cambridge UP\n10\n1\n...,1402,8273,1.0,Boolean retrieval Brutus Calpurnia Intersectio...,"[-0.020248102, -0.038778815, -0.07978982, 0.01..."
4,5,2,The extended Boolean model versus ranked retri...,51,53,Online edition (c)\n2009 Cambridge UP\n14\n1\n...,1387,8516,1.0,Boolean retrieval Term Postings size eyes kale...,"[0.032437306, -0.03302262, -0.023949256, 0.046..."
...,...,...,...,...,...,...,...,...,...,...,...
247,248,2,Hubs and Authorities,511,513,Online edition (c)\n2009 Cambridge UP\n474\n21...,1261,7061,1.0,Link analysis Exercise Suppose web graph store...,"[0.02554892, -0.024918225, -0.06523086, -0.009..."
248,249,3,Choosing the subset of the Web,514,516,Online edition (c)\n2009 Cambridge UP\n21.3\nH...,1066,5945,2.0,Hubs Authorities Output top scoring hubs top s...,"[0.025388028, -0.03269337, -0.059302106, -0.01..."
249,250,2,References and further reading,517,519,Online edition (c)\n2009 Cambridge UP\n480\n21...,628,3784,1.0,Link analysis Exercise How would interpret ent...,"[-0.009519593, -0.05244111, -0.09500014, -0.03..."
250,251,1,Bibliography,520,557,Online edition (c)\n2009 Cambridge UP\nDRAFT! ...,14866,105679,,Bibliography use following abbreviated journal...,"[-0.038012423, -0.061532807, -0.05434936, 0.02..."


In [18]:
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
query = "Benchmark"
query_emb = model.encode(query)
print(query_emb)

[-3.56612131e-02 -1.75286066e-02 -4.20964509e-02 -4.82783746e-03
  7.46160513e-03 -4.96165156e-02  2.50388943e-02  1.12307603e-02
 -5.43479212e-02 -1.36764301e-02 -1.60393212e-02 -6.74523115e-02
  5.85131394e-03 -2.74366476e-02 -4.30077128e-02 -5.87044060e-02
  1.35599896e-01  8.07881262e-03 -5.46144135e-02 -1.17088497e-01
 -8.14201981e-02 -9.00198519e-02  3.49609964e-02 -2.93366350e-02
  1.47342965e-01  8.41984618e-03 -4.04698066e-02  6.24017008e-02
  3.69590730e-03 -6.01331554e-02  3.09085324e-02 -1.41630759e-02
  3.47727649e-02  7.90730584e-03 -5.61596341e-02 -3.24862003e-02
 -3.14878160e-03 -3.45449522e-02  8.17072466e-02  1.95760708e-02
 -2.75278371e-02 -1.09901866e-02  5.54863364e-02  2.31004152e-02
  5.48508726e-02  1.78303197e-02 -6.53773360e-03 -6.62332680e-03
 -5.36817499e-02 -5.86747564e-03 -1.19842485e-01  3.91858630e-03
 -1.03120841e-02 -1.57604143e-02 -1.82269942e-02  7.70807713e-02
  4.85998280e-02 -6.73391372e-02  3.68102565e-02 -1.14895124e-02
  9.18866470e-02 -1.17844

In [20]:
similarities = cosine_similarity([query_emb], hierarchical_docs_df['embedding'].tolist())[0]

In [21]:
# Obtener los índices ordenados por similitud (de mayor a menor)
sorted_indices = similarities.argsort()[::-1]

print("Top 10 documentos más similares:")
for i in range(10):
    idx = sorted_indices[i]
    similarity = similarities[idx]
    title = hierarchical_docs_df.iloc[idx]['title']
    print(f"{i+1}. Doc {idx+1}: {similarity:.4f} - {title}")

Top 10 documentos más similares:
1. Doc 39: 0.3310 - Heaps' law: Estimating the number of terms
2. Doc 187: 0.3094 - References and further reading
3. Doc 81: 0.2841 - System issues
4. Doc 41: 0.2804 - Dictionary compression
5. Doc 76: 0.2746 - Standard test collections
6. Doc 143: 0.2741 - Time complexity and optimality of kNN
7. Doc 148: 0.2624 - Exercises
8. Doc 195: 0.2615 - Near-duplicates and shingling
9. Doc 68: 0.2478 - Tiered indexes
10. Doc 201: 0.2470 - The URL frontier


In [22]:
from openai import OpenAI

In [26]:
def rag_query_with_openai(query, top_k=10):

    # Obtener top k documentos más relevantes
    sorted_indices = similarities.argsort()[::-1]
    
    # Preparar contexto con los documentos más relevantes
    context_docs = []
    for i in range(min(top_k, len(sorted_indices))):
        idx = sorted_indices[i]
        doc_info = hierarchical_docs_df.iloc[idx]
        context_docs.append({
            'doc_id': doc_info['doc_id'],  # Agregar el doc_id real
            'title': doc_info['title'],
            'content': doc_info['content'][:2000],  # Limitar contenido para no exceder tokens
            'similarity': similarities[idx]
        })
    
    # Construir el prompt con contexto
    context_text = "\n\n".join([
        f"DOCUMENTO {doc['doc_id']}: {doc['title']}\n{doc['content']}"
        for i, doc in enumerate(context_docs)
    ])
    
    # 6. Hacer la consulta a OpenAI
    completion = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[
            {
                "role": "system",
                "content": "Eres un asistente especializado en Information Retrieval. "
                "Responde únicamente basándote en los documentos proporcionados. "
                "Si no encuentras información relevante en los documentos, di claramente que no sabes o "
                "que la información no está disponible en los documentos. "
                "NO inventes ni agregues información externa."
            },
            {
                "role": "user",
                "content": f"""Basándote ÚNICAMENTE en los siguientes documentos, 
                responde a la pregunta: "{query}"

DOCUMENTOS DE REFERENCIA:
{context_text}

PREGUNTA: {query}

INSTRUCCIONES:
- Responde solo con información que esté explícitamente en los documentos
- Si no hay información suficiente, di "No encuentro información suficiente en los documentos proporcionados"
- Cita el número de documento cuando sea relevante
- Sé preciso y conciso"""
            }
        ],
        max_tokens=2000,
        temperature=0.1
    )
    
    return {
        'query': query,
        'answer': completion.choices[0].message.content,
        'relevant_docs': context_docs,
        'num_docs_used': len(context_docs)
    }

# Ejemplo de uso
query = "What is the benchmark for information retrieval?"
result = rag_query_with_openai(query, top_k=10)

print(f"PREGUNTA: {result['query']}")
print(f"\nRESPUESTA:")
print(result['answer'])
print(f"\nDOCUMENTOS UTILIZADOS: {result['num_docs_used']}")

# Mostrar qué documentos específicos se usaron
print(f"\nDOCUMENTOS CONSULTADOS:")
for i, doc in enumerate(result['relevant_docs']):
    print(f"{i+1}. Documento {doc['doc_id']}: {doc['title']} (Similitud: {doc['similarity']:.4f})")

PREGUNTA: What is the benchmark for information retrieval?

RESPUESTA:
El benchmark para information retrieval se refiere a colecciones de prueba estándar y evaluaciones que permiten medir cuantitativamente la efectividad de los sistemas de recuperación de información. Un ejemplo destacado es la Text Retrieval Conference (TREC), organizada por el National Institute of Standards and Technology (NIST) desde 1992, que proporciona una serie de colecciones de prueba y pistas para evaluar sistemas de recuperación de información (Documento 89). También se menciona la colección Cranfield, pionera en permitir medidas cuantitativas precisas, aunque actualmente es demasiado pequeña para experimentos avanzados (Documento 89).

Por lo tanto, el benchmark para information retrieval son estas colecciones de prueba estándar como TREC y Cranfield, que permiten evaluar y comparar el rendimiento de los sistemas de recuperación de información.

DOCUMENTOS UTILIZADOS: 10

DOCUMENTOS CONSULTADOS:
1. Documen

In [None]:
# Mostrar el contenido del documento en la posición 45
print("Título:", hierarchical_docs_df.iloc[89]['title'])
print("\nContenido (primeros 500 caracteres):")
print(hierarchical_docs_df.iloc[89]['content'][:500])

Título: Indirect relevance feedback

Contenido (primeros 500 caracteres):
Online edition (c)
2009 Cambridge UP
9.1
Relevance feedback and pseudo relevance feedback
187
Precision at k = 50
Term weighting
no RF
pseudo RF
lnc.ltc
64.2%
72.7%
Lnu.ltu
74.2%
87.0%
◮Figure 9.5
Results showing pseudo relevance feedback greatly improving perfor-
mance. These results are taken from the Cornell SMART system at TREC 4 (Buckley
et al. 1995), and also contrast the use of two different length normalization schemes
(L vs. l); cf. Figure 6.15 (page 128). Pseudo relevance feedback cons
