# Recuperacion de la Información - Examen IIB 
---
**Nombre:** Anthony Reinoso


In [2]:
! pip install rank_bm25 sentence-transformers faiss-cpu transformers
import nltk
nltk.download('punkt')
nltk.download('stopwords')

Defaulting to user installation because normal site-packages is not writeable


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pituf\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pituf\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
def contar_documentos(path):
    with open(path, 'r', encoding='utf-8') as f:
        total = sum(1 for _ in f)
    return total

ruta = 'arxiv-metadata-oai-snapshot.json'
total_docs = contar_documentos(ruta)
print(f"El archivo contiene {total_docs} documentos.")


El archivo contiene 2792339 documentos.


# Implementación de la arquitectura.

## 1. Preprocesamiento del corpus


In [None]:
import json
import string
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english')) # Configuración de stopword

# Función de preprocesamiento
def preprocess(text):
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    return ' '.join(tokens)

def load_corpus(path, max_docs=28000):
    corpus = []
    with open(path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= max_docs:
                break
            doc = json.loads(line)
            doc['text'] = preprocess(doc['title'] + ' ' + doc['abstract'])
            corpus.append(doc)
    return corpus

corpus = load_corpus('arxiv-metadata-oai-snapshot.json')


### Mostrar en tabla el corpus

In [None]:

import pandas as pd
import json
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def preprocess(text):
    text = text.lower().translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    return ' '.join(tokens)

# Carga de corpus con textos originales y preprocesados
def load_corpus_for_display(path, max_docs=10):
    original = []
    procesado = []
    with open(path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= max_docs:
                break
            doc = json.loads(line)
            title = doc.get('title', '')
            abstract = doc.get('abstract', '')
            combined_text = title + ' ' + abstract
            original.append(combined_text)
            procesado.append(preprocess(combined_text))
    return pd.DataFrame({
        "Texto original": original,
        "Texto preprocesado": procesado
    })

df_corpus_display = load_corpus_for_display("arxiv-metadata-oai-snapshot.json", max_docs=10)
df_corpus_display.head(10)

Unnamed: 0,Texto original,Texto preprocesado
0,Calculation of prompt diphoton production cros...,calculation prompt diphoton production cross s...
1,Sparsity-certifying Graph Decompositions We ...,sparsitycertifying graph decompositions descri...
2,The evolution of the Earth-Moon system based o...,evolution earthmoon system based dark matter f...
3,A determinant of Stirling cycle numbers counts...,determinant stirling cycle numbers counts unla...
4,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,dyadic lambdaalpha lambdaalpha paper show comp...
5,Bosonic characters of atomic Cooper pairs acro...,bosonic characters atomic cooper pairs across ...
6,Polymer Quantum Mechanics and its Continuum Li...,polymer quantum mechanics continuum limit rath...
7,Numerical solution of shock and ramp compressi...,numerical solution shock ramp compression gene...
8,"The Spitzer c2d Survey of Large, Nearby, Inste...",spitzer c2d survey large nearby insterstellar ...
9,"Partial cubes: structures, characterizations, ...",partial cubes structures characterizations con...


## 2. Indexación

### TF-IDF

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([doc['text'] for doc in corpus])

### BM25

In [7]:
from rank_bm25 import BM25Okapi

tokenized_corpus = [doc['text'].split() for doc in corpus]
bm25 = BM25Okapi(tokenized_corpus)

### Embeddings

In [8]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode([doc['text'] for doc in corpus], show_progress_bar=True)
index_faiss = faiss.IndexFlatL2(embeddings.shape[1])
index_faiss.add(np.array(embeddings))

  from .autonotebook import tqdm as notebook_tqdm
Batches: 100%|██████████| 875/875 [12:22<00:00,  1.18it/s] 


## 3. Recuperacion

### TF-IDF

In [10]:
def search_tfidf(query, top_k=10):
    query_vec = tfidf_vectorizer.transform([preprocess(query)])
    scores = tfidf_matrix.dot(query_vec.T).toarray().ravel()
    top_indices = scores.argsort()[::-1][:top_k]
    return [corpus[i] for i in top_indices]

### BM25

In [11]:

def search_bm25(query, top_k=10):
    tokens = preprocess(query).split()
    scores = bm25.get_scores(tokens)
    top_indices = np.argsort(scores)[::-1][:top_k]
    return [corpus[i] for i in top_indices]

### FAISS

In [12]:

def search_faiss(query, top_k=10):
    vec = model.encode([preprocess(query)])
    scores, indices = index_faiss.search(np.array(vec), top_k)
    return [corpus[i] for i in indices[0]]

## 4. RAG

In [13]:
from transformers import pipeline
rag_pipeline = pipeline("text2text-generation", model="google/flan-t5-base")

def generate_rag_response(query, top_k=3):
    top_docs = search_faiss(query, top_k)
    context = " ".join([doc['abstract'] for doc in top_docs])
    prompt = f"Query: {query}\nContext: {context}\nAnswer:"
    response = rag_pipeline(prompt, max_length=256)[0]['generated_text']
    return response

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cpu


## 5. Evaluación

### Comparación de resultados

In [42]:
def compare_top10_ids(query):
    tfidf_ids = [doc['id'] for doc in search_tfidf(query)]
    bm25_ids = [doc['id'] for doc in search_bm25(query)]
    faiss_ids = [doc['id'] for doc in search_faiss(query)]

    common_ids = {
        'TF-IDF - BM25': list(set(tfidf_ids) & set(bm25_ids)),
        'TF-IDF - FAISS': list(set(tfidf_ids) & set(faiss_ids)),
        'BM25 - FAISS': list(set(bm25_ids) & set(faiss_ids)),
        'Todos en común': list(set(tfidf_ids) & set(bm25_ids) & set(faiss_ids))
    }

    return {
        'TF-IDF': tfidf_ids,
        'BM25': bm25_ids,
        'FAISS': faiss_ids,
        'Coincidencias': common_ids
    }

query = "machine learning for particle physics"
resultados = compare_top10_ids(query)
print("Coincidencias entre modelos:\n", resultados['Coincidencias'])

Coincidencias entre modelos:
 {'TF-IDF - BM25': ['0704.3905', '0704.3453', '0708.1564'], 'TF-IDF - FAISS': [], 'BM25 - FAISS': ['0707.0930'], 'Todos en común': []}


#### **¿Cuáles documentos aparecen en común?**
Al comparar los documentos recuperados por los tres modelos para una misma consulta, se encontró lo siguiente:

- TF-IDF y BM25 comparten 3 documentos en común: 0704.3905, 0704.3453 y 0708.1564. Esto muestra que ambos modelos, al basarse en estadísticas de frecuencia de palabras, tienden a coincidir cuando se usan términos específicos en la búsqueda.

- BM25 y FAISS solo comparten 1 documento (0707.0930), lo cual sugiere que FAISS recupera resultados con un enfoque más semántico, mientras que BM25 sigue más fiel a las coincidencias de términos.

- TF-IDF y FAISS no tienen documentos en común, lo que indica una diferencia clara en cómo interpretan la consulta.

- No hubo ningún documento que aparezca en los tres modelos a la vez, lo que refuerza la idea de que cada modelo prioriza distintos aspectos del contenido.

#### **¿Qué diferencias hay en el ordenamiento?**
- TF-IDF y BM25 tienden a priorizar los documentos donde las palabras clave aparecen con más frecuencia o en posiciones relevantes dentro del texto.

- FAISS, al usar embeddings, ordena los resultados según el significado general del texto, aunque no contenga las palabras exactas. Esto hace que el orden de resultados sea completamente diferente a los modelos clásicos.

Por eso, aunque dos artículos traten el mismo tema, si usan diferentes palabras o sinónimos, FAISS los puede ubicar más arriba que TF-IDF o BM25.

### Medir similitud entre ranking

In [40]:
def count_common_documents(query):
    tfidf_ids = set([doc['id'] for doc in search_tfidf(query)])
    bm25_ids = set([doc['id'] for doc in search_bm25(query)])
    faiss_ids = set([doc['id'] for doc in search_faiss(query)])

    inter_tfidf_bm25 = tfidf_ids & bm25_ids
    inter_tfidf_faiss = tfidf_ids & faiss_ids
    inter_bm25_faiss = bm25_ids & faiss_ids

    all_common = tfidf_ids & bm25_ids & faiss_ids

    return {
        "TF-IDF - BM25": len(inter_tfidf_bm25),
        "TF-IDF - FAISS": len(inter_tfidf_faiss),
        "BM25 - FAISS": len(inter_bm25_faiss),
        "Todos": len(all_common)
    }

print("Cantidad de coincidencias (top 10):")
print(count_common_documents(query))


Cantidad de coincidencias (top 10):
{'TF-IDF - BM25': 3, 'TF-IDF - FAISS': 0, 'BM25 - FAISS': 1, 'Todos': 0}


In [29]:
def show_rag_context_and_answer(query):
    top_docs = search_faiss(query, top_k=3)
    print("Títulos de documentos recuperados:")
    for i, doc in enumerate(top_docs):
        print(f"{i+1}. {doc['title']}")
    
    print("\nResumen combinado (contexto):")
    for doc in top_docs:
        print("- ", doc['abstract'][:300], "...\n")

    respuesta = generate_rag_response(query)
    print("\n Respuesta generada por RAG:\n", respuesta)

show_rag_context_and_answer("machine learning for particle physics")

Títulos de documentos recuperados:
1. A threshold-improved narrow-width approximation for BSM physics
2. What can we learn from fluctuations of particle ratios?
3. Bayesian Learning of Neural Networks for Signal/Background
  Discrimination in Particle Physics

Resumen combinado (contexto):
-    A modified narrow-width approximation that allows for O(Gamma/M)-accurate
predictions for resonant particle decay with similar intermediate masses is
proposed and applied to MSSM processes to demonstrate its importance for
searches for particle physics beyond the Standard Model.
 ...

-    We explain how fluctuations of ratios can constrain and falsify the
statistical model of particle production in heavy ion collisions, using $K/\pi$
fluctuations as an example. We define an observable capable of determining
which statistical model, if any, governs freeze-out in ultrarelativistic he ...

-    Neural networks are used extensively in classification problems in particle
physics research. Since the 

Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



 Respuesta generada por RAG:
 A modified narrow-width approximation that allows for O(Gamma/M)-accurate predictions for resonant particle decay with similar intermediate masses is proposed and applied to MSSM processes to demonstrate its importance for searches for particle physics beyond the Standard Model. We explain how fluctuations of ratios can constrain and falsify the statistical model of particle production in heavy ion collisions, using $K/pi$ fluctuations as an example. We define an observable capable of determining which statistical model, if any, governs freeze-out in ultrarelativistic heavy ion collisions, using $K/pi$ fluctuations as an example. We calculate this observable for $K/pi$ fluctuations, and show that it should be the same for RHIC and LHC energies, as well as independent of centrality, if the Grand-Canonical statistical model is an appropriate description and chemical equilibrium applies. We also introduce a similar observable capable, together with the publi


# Tabla comparativa (Benchmark) de resultados entre modelos.

### Calidad

In [41]:
import pandas as pd
# Leer queries desde el archivo txt
with open("queries.txt", "r") as f:
    queries = [line.strip() for line in f.readlines() if line.strip()]

# Función para evaluar calidad de recuperación
def benchmark_models(queries):
    records = []
    for query in queries:
        tfidf_ids = set([doc["id"] for doc in search_tfidf(query)])
        bm25_ids = set([doc["id"] for doc in search_bm25(query)])
        faiss_ids = set([doc["id"] for doc in search_faiss(query)])

        records.append({
            "Query": query,
            "TF-IDF - BM25": len(tfidf_ids & bm25_ids),
            "TF-IDF - FAISS": len(tfidf_ids & faiss_ids),
            "BM25 - FAISS": len(bm25_ids & faiss_ids),
            "Todos comunes": len(tfidf_ids & bm25_ids & faiss_ids)
        })
    return pd.DataFrame(records)

# Ejecutar el benchmark y mostrar la tabla
df_benchmark = benchmark_models(queries)
df_benchmark

Unnamed: 0,Query,TF-IDF - BM25,TF-IDF - FAISS,BM25 - FAISS,Todos comunes
0,diphoton production cross sections,9,4,3,3
1,quantum chromodynamics,8,4,4,4
2,higgs boson decay,8,2,4,2
3,machine learning for particle physics,3,0,1,0
4,top quark production,5,5,5,3


La tabla nos dice qué tan parecidos son los resultados que devuelve cada modelo.
Vemos que TF-IDF y BM25 suelen coincidir bastante, porque ambos buscan las palabras tal como las escribiste.
FAISS, en cambio, busca más por el sentido o el significado de lo que preguntas, así que encuentra cosas distintas.
Por eso, casi no hay documentos que aparezcan en los tres modelos a la vez. Cada uno entiende la búsqueda a su manera.



# Ejemplo de una consulta y su respuesta generada con RAG


In [21]:
query = "machine learning for particle physics"
respuesta = generate_rag_response(query)
print("Respuesta generada con RAG:\n", respuesta)

Both `max_new_tokens` (=256) and `max_length`(=256) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Respuesta generada con RAG:
 A modified narrow-width approximation that allows for O(Gamma/M)-accurate predictions for resonant particle decay with similar intermediate masses is proposed and applied to MSSM processes to demonstrate its importance for searches for particle physics beyond the Standard Model. We explain how fluctuations of ratios can constrain and falsify the statistical model of particle production in heavy ion collisions, using $K/pi$ fluctuations as an example. We define an observable capable of determining which statistical model, if any, governs freeze-out in ultrarelativistic heavy ion collisions, using $K/pi$ fluctuations as an example. We calculate this observable for $K/pi$ fluctuations, and show that it should be the same for RHIC and LHC energies, as well as independent of centrality, if the Grand-Canonical statistical model is an appropriate description and chemical equilibrium applies. We also introduce a similar observable capable, together with the publish

# Diferencias entre modelos y utilidad del RAG.

In [None]:
import pandas as pd

comparacion_modelos = pd.DataFrame({
    "Modelo": ["TF-IDF", "BM25", "FAISS", "RAG (con Flan-T5)"],
    "Ventajas": [
        "Es rápido, fácil de programar y te devuelve buenos resultados si buscas palabras exactas.",
        "Es más listo que TF-IDF porque da más peso a las palabras que realmente importan.",
        "Captura similitud semántica entre frases o sinónimos gracias a embeddings.",
        "Es como un asistente que busca y te explica. Resume la info y contesta con base en lo que encontró."
    ],
    "Limitaciones": [
        "No entiende el significado de las palabras. Si usas un sinónimo, no lo capta.",
        "Aun así, sigue sin entender ideas, solo se enfoca en cuántas veces aparece algo.",
        "Necesita mayor poder computacional y puede recuperar documentos no tan precisos.",
        "Depende de la calidad del índice vectorial y del modelo generativo usado."
    ]
})
comparacion_modelos


Unnamed: 0,Modelo,Ventajas,Limitaciones
0,TF-IDF,"Rápido, fácil de implementar y eficiente en bú...",No capta el significado de las palabras; no de...
1,BM25,Mejor manejo de la frecuencia de términos y má...,"Aunque mejora a TF-IDF, aún es un modelo estad..."
2,FAISS,Captura similitud semántica entre frases o sin...,Necesita mayor poder computacional y puede rec...
3,RAG (con Flan-T5),Genera respuestas completas y justificadas usa...,Depende de la calidad del índice vectorial y d...


## Utilidad del RAG

Es útil cuando no se tiene tiempo (por ciertas circuntancias) para leer todo o cuando los textos son muy técnicos. RAG da una respuesta clara, directa y con contexto, sin tener que estar adivinando qué dice cada paper.

