## Examen Segundo Bimestre

### CARGA Y PREPROCESAMIENTO

In [None]:
#Importar librerias
import json
import pandas as pd
import re
import nltk
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Download the NLTK stopwords corpus
nltk.download('stopwords')
# English stopwords kept as a set; preprocess() tests token membership against it
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\steve\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [None]:
# --- Load the corpus --- the sample file is already the 1% subset.
# The file holds one JSON object per line. UTF-8 is requested explicitly so the
# read does not depend on the platform's default encoding, and blank lines are
# skipped so they never reach json.loads (which would raise on them).
with open('arxiv_sample.json', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

In [15]:
# Keep only the title and abstract of every record that carries both fields
subset_cleaned = [
    {"title": record["title"], "abstract": record["abstract"]}
    for record in data
    if "title" in record and "abstract" in record
]

In [97]:
# Peek at the first three title/abstract records
subset_cleaned[:3]

[{'title': 'Role of electron correlations in transport through domain walls in\n  magnetic nanowires',
  'abstract': "  The transmission of correlated electrons through a domain wall in\nferromagnetic quasi-one-dimensional systems is studied theoretically in the\ncase when the domain wall width is comparable with the Fermi wavelength of the\ncharge carriers. The wall gives rise to both potential and spin dependent\nscattering. Using a poor man's renormalization group approach, we obtain\nscaling equations for the scattering amplitudes. For repulsive interactions,\nthe wall is shown to reflect all incident electrons at the zero temperature\nfixed points. In one of the fixed points the wall additionally flips the spin\nof all incident electrons, generating a finite spin current without associated\ncharge current.\n"},
 {'title': 'SU(3) Polyakov linear-sigma model: bulk and shear viscosity of QCD\n  matter in finite magnetic field',
  'abstract': '  Due to off-center relativistic motion o

In [None]:
# Normalize and clean text
def preprocess(text):
    """Lower-case ``text``, strip ASCII punctuation and drop stopwords.

    Args:
        text: Raw string (a document or a query).

    Returns:
        The remaining tokens joined by single spaces. Punctuation is deleted,
        not replaced by a space, so "text-to-image" becomes "texttoimage".
    """
    text = text.lower()
    # string.punctuation must be escaped before going inside [...]: unescaped,
    # its "\]" pair is read as an escaped bracket and the backslash itself is
    # never removed (LaTeX commands would survive as "\mathcal...").
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    # split() with no argument also collapses the newlines/indentation that
    # appear inside titles and abstracts.
    tokens = [word for word in text.split() if word not in stop_words]
    return " ".join(tokens)

In [None]:
# Save the title/abstract subset to disk as indented JSON
with open("arxiv_subset_1pct.json", "w") as out_f:
    json.dump(subset_cleaned, out_f, indent=2)

In [43]:
# Build the corpus DataFrame: raw fields, combined text, cleaned text and a row id
corpus_df = pd.DataFrame(subset_cleaned).assign(
    # Title and abstract joined into the single field that gets indexed
    text=lambda frame: frame['title'] + ". " + frame['abstract'],
    # Normalized version consumed by the TF-IDF, BM25 and embedding indexes
    text_clean=lambda frame: frame['text'].apply(preprocess),
    # Explicit id column mirroring the positional index
    id=lambda frame: frame.index,
)

In [26]:
# Normalize and clean text
# NOTE(review): this cell redefines preprocess(), which is already defined
# earlier in the notebook; the later definition is the one in effect afterwards.
def preprocess(text):
    """Lower-case ``text``, strip ASCII punctuation and drop stopwords.

    Args:
        text: Raw string (a document or a query).

    Returns:
        The remaining tokens joined by single spaces. Punctuation is deleted,
        not replaced by a space, so "text-to-image" becomes "texttoimage".
    """
    text = text.lower()
    # string.punctuation must be escaped before going inside [...]: unescaped,
    # its "\]" pair is read as an escaped bracket and the backslash itself is
    # never removed (LaTeX commands would survive as "\mathcal...").
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    # split() with no argument also collapses the newlines/indentation that
    # appear inside titles and abstracts.
    tokens = [word for word in text.split() if word not in stop_words]
    return " ".join(tokens)

# Recompute the cleaned-text column with the preprocess() defined in this cell
corpus_df['text_clean'] = corpus_df['text'].apply(preprocess)

In [98]:
# Show the first five rows of the corpus DataFrame
corpus_df[:5]

Unnamed: 0,title,abstract,text,text_clean,id
0,Role of electron correlations in transport thr...,The transmission of correlated electrons thr...,Role of electron correlations in transport thr...,role electron correlations transport domain wa...,0
1,SU(3) Polyakov linear-sigma model: bulk and sh...,Due to off-center relativistic motion of the...,SU(3) Polyakov linear-sigma model: bulk and sh...,su3 polyakov linearsigma model bulk shear visc...,1
2,Maximal Abelian gauge and a generalized BRST t...,We apply a generalized Becchi-Rouet-Stora-Ty...,Maximal Abelian gauge and a generalized BRST t...,maximal abelian gauge generalized brst transfo...,2
3,Stable Diffusion is Unstable,"Recently, text-to-image models have been thr...","Stable Diffusion is Unstable. Recently, text...",stable diffusion unstable recently texttoimage...,3
4,A New Proof of the New Intersection Theorem,In 1987 Roberts completed the proof of the N...,A New Proof of the New Intersection Theorem. ...,new proof new intersection theorem 1987 robert...,4


### Indexación TF-IDF

In [28]:
# Fit a TF-IDF vocabulary on the cleaned texts and build the document-term matrix.
# The fitted vectorizer is reused by search_tfidf() to transform queries.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus_df['text_clean'])

### Indexación BM25

In [29]:
# BM25Okapi is given tokenized documents: split each cleaned text on whitespace
bm25_corpus = list(map(str.split, corpus_df['text_clean']))
bm25 = BM25Okapi(bm25_corpus)

### Indexación Vectorial con FAISS

In [30]:
# Embed every cleaned document with the sentence-transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(list(corpus_df['text_clean']), show_progress_bar=True)

# Flat L2 index whose width matches the embedding vectors
dimension = embeddings.shape[-1]
index_faiss = faiss.IndexFlatL2(dimension)
index_faiss.add(np.asarray(embeddings))

  return forward_call(*args, **kwargs)
Batches: 100%|██████████| 873/873 [14:39<00:00,  1.01s/it]


## FUNCIONES DE BÚSQUEDA

In [89]:
# TF-IDF search function
def search_tfidf(query, top_k=11):
    """Rank corpus documents against ``query`` using the TF-IDF matrix.

    The query goes through the same preprocess() as the documents, is
    projected with the fitted vectorizer, and each document is scored by its
    dot product with the query vector.

    Args:
        query: Free-text query.
        top_k: Number of rows to return.

    Returns:
        DataFrame with columns id, title and abstract, best score first.
    """
    cleaned_query = preprocess(query)
    query_vec = tfidf_vectorizer.transform([cleaned_query])
    doc_scores = tfidf_matrix.dot(query_vec.T).toarray().ravel()
    # Ascending argsort, reversed so the highest score comes first
    ranked_ids = doc_scores.argsort()[::-1]
    best_rows = corpus_df.iloc[ranked_ids[:top_k]]
    return best_rows[['id', 'title', 'abstract']]


In [88]:
# BM25 search function
def search_bm25(query, top_k=11):
    """Rank corpus documents against ``query`` using the BM25 index.

    Args:
        query: Free-text query; it is preprocessed and whitespace-tokenized
            the same way as the indexed documents.
        top_k: Number of rows to return.

    Returns:
        DataFrame with columns id, title and abstract, best score first.
    """
    doc_scores = bm25.get_scores(preprocess(query).split())
    # Ascending argsort, reversed so the highest score comes first
    ranked_ids = np.argsort(doc_scores)[::-1]
    best_rows = corpus_df.iloc[ranked_ids[:top_k]]
    return best_rows[['id', 'title', 'abstract']]

In [87]:
# Vector search function
def search_faiss(query, top_k=11):
    """Return the nearest corpus documents to ``query`` in embedding space.

    The query is preprocessed like the indexed documents, encoded with the
    sentence-transformer model, and searched against the FAISS L2 index.

    Args:
        query: Free-text query.
        top_k: Maximum number of rows to return.

    Returns:
        DataFrame with columns id, title and abstract, nearest first. It has
        fewer than ``top_k`` rows only when the index holds fewer vectors.
    """
    query_embedding = model.encode([preprocess(query)])
    _, indices = index_faiss.search(query_embedding, top_k)
    hits = indices[0]
    # FAISS fills missing neighbours with -1; without this filter .iloc[-1]
    # would silently return the last corpus row as if it were a match.
    hits = hits[hits >= 0]
    return corpus_df.iloc[hits][['id', 'title', 'abstract']]

### Comparación de Rankings

In [106]:
import pandas as pd

def comparar_rankings(query):
    """Print a side-by-side comparison of the three retrievers for ``query``.

    Runs search_tfidf, search_bm25 and search_faiss with their default
    ``top_k``, prints the ranked titles in one table, then prints how many
    titles are shared by all three rankings and by each pair.

    Args:
        query: Free-text query forwarded unchanged to each search function.

    Returns:
        None. All results are written to standard output.
    """
    print(f"\n Evaluación para la consulta: \"{query}\"\n")

    # Ranked titles per retriever, keyed by the column label used in the table
    rankings = {
        'TF-IDF': search_tfidf(query)['title'].tolist(),
        'BM25': search_bm25(query)['title'].tolist(),
        'FAISS': search_faiss(query)['title'].tolist(),
    }

    # Building from Series (not raw lists) lets pandas pad a shorter ranking
    # with NaN instead of raising when the rankings differ in length.
    df_comparacion = pd.DataFrame(
        {name: pd.Series(titles) for name, titles in rankings.items()}
    )
    print(df_comparacion)

    tfidf_set = set(rankings['TF-IDF'])
    bm25_set = set(rankings['BM25'])
    faiss_set = set(rankings['FAISS'])

    # Titles present in all three rankings
    comunes = tfidf_set & bm25_set & faiss_set
    print(f"\n Documentos comunes en los 3 modelos: {len(comunes)}")
    # Print in TF-IDF rank order: iterating the set directly would give an
    # order that changes between interpreter runs (string hash randomization).
    for doc in dict.fromkeys(rankings['TF-IDF']):
        if doc in comunes:
            print(f"• {doc}")

    print("\n Coincidencias entre pares:")
    print(" - TF-IDF ∩ BM25:", len(tfidf_set & bm25_set))
    print(" - TF-IDF ∩ FAISS:", len(tfidf_set & faiss_set))
    print(" - BM25 ∩ FAISS:", len(bm25_set & faiss_set))

In [107]:
# Compare the three retrievers on a single Spanish-language query
comparar_rankings("¿Qué describe la ecuación de Swift-Hohenberg cerca de una inestabilidad?")


 Evaluación para la consulta: "¿Qué describe la ecuación de Swift-Hohenberg cerca de una inestabilidad?"

                                               TF-IDF  \
0                                               Selex   
1   Sur les automorphismes et la rigidite des grou...   
2     De Rham prismatic crystals over $\mathcal{O}_K$   
3         Quantum Global Structure of de Sitter Space   
4             Blood Pulsation Intensity Video Mapping   
5   De Sitter Holography with a Finite Number of S...   
6   Real or Imaginary? (On pair creation in de Sit...   
7                       The Fall of Stringy de Sitter   
8   Compact spacelike surfaces in the 3-dimensiona...   
9     The Swift-Hohenberg equation on conic manifolds   
10  Anisotropic generalizations of de Sitter space...   

                                                 BM25  \
0   Sur les automorphismes et la rigidite des grou...   
1                                               Selex   
2   An infinitely differentiable func

  return forward_call(*args, **kwargs)


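Los modelos TF-IDF y BM25 compartieron 4 documentos. FAISS, que se basa en la similitud semántica de los embeddings, solo coincidió en uno con TF-IDF y en ninguno con BM25. Esto sugiere que FAISS capta mejor el contexto de la consulta y recupera artículos relevantes, como los de la ecuación de Swift-Hohenberg, mientras que los otros modelos se limitan a la coincidencia exacta de palabras. Además, la consulta está en español y los textos indexados están en inglés, por lo que los modelos léxicos coinciden sobre todo en términos como «de» (p. ej., los artículos sobre «de Sitter»), lo que explica buena parte de sus resultados poco relevantes.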

###  Módulo RAG 

In [None]:
#Libreria y clave API
from openai import OpenAI
# No key literal in the notebook: when api_key is omitted the OpenAI client
# reads it from the OPENAI_API_KEY environment variable, which keeps the secret
# out of the saved file. An explicit empty string would override that fallback.
client = OpenAI()

In [None]:
# Question-answering function for the RAG module
def generate_rag_answer(query, top_k=3, llm_model="gpt-4o"):
    """Answer ``query`` with a chat model grounded on retrieved abstracts.

    Args:
        query: User question; used both for retrieval and in the prompt.
        top_k: Number of documents fetched with search_faiss whose abstracts
            form the context (default 3).
        llm_model: Chat model name passed to the OpenAI client
            (default "gpt-4o").

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string when the response carries no text content.
    """
    # Retrieval step; swap search_faiss for another search function to change it
    docs = search_faiss(query, top_k=top_k)
    context = "\n".join(docs['abstract'].tolist())

    prompt = f"""
    Utiliza el siguiente contexto extraído de artículos científicos para responder a la pregunta del usuario de forma clara y relevante.
    Contexto:
    {context}
    Pregunta: {query}
    Respuesta:
    """

    response = client.chat.completions.create(
        model=llm_model,
        messages=[
            {"role": "system", "content": "Eres un asistente experto en artículos científicos."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.3,
        max_tokens=450
    )

    # message.content is optional in the API response; guard before .strip()
    answer = response.choices[0].message.content
    return answer.strip() if answer else ""

In [84]:
# Ask the RAG pipeline the same question used in the ranking comparison
generate_rag_answer("¿Qué describe la ecuación de Swift-Hohenberg cerca de una inestabilidad?")

  return forward_call(*args, **kwargs)


'La ecuación de Swift-Hohenberg, en su variante conservativa compleja, describe una amplia gama de soluciones cerca de una inestabilidad, especialmente en el contexto de soluciones uniformes, periódicas y localizadas. En particular, las soluciones uniformes en el modelo conservativo son inherentemente inestables. Las soluciones periódicas también tienden a ser inestables, excepto dentro de un estrecho intervalo de parámetros que permite la existencia de múltiples estados localizados. Además, se establece un criterio generalizado de Vakhitov-Kolokolov para determinar la estabilidad de los estados localizados en la ecuación conservativa, lo cual se relaciona con las propiedades de estabilidad del modelo disipativo. Estos análisis y criterios ayudan a entender las características complejas de la ecuación cerca de una inestabilidad.'

### Prueba con queries.txt

In [94]:
# Read one query per line, dropping blank lines. UTF-8 is requested explicitly
# so accented characters decode the same way on every platform, and the file
# is iterated directly instead of materializing readlines() first.
with open('queries.txt', encoding='utf-8') as f:
    queries = [line.strip() for line in f if line.strip()]

In [109]:
# Run the three-way ranking comparison for every query read from queries.txt
for q in queries:
    print("\n=== Resultados para:", q)
    comparar_rankings(q)


=== Resultados para: electron correlations in magnetic nanowires

 Evaluación para la consulta: "electron correlations in magnetic nanowires"

                                               TF-IDF  \
0   Temperature dependence of coercivity for isola...   
1   Analysis of magic lengths in growth of support...   
2   The growth mechanism of CuO nanowires synthesi...   
3   Magnetic Moment Softening and Domain Wall Resi...   
4   Thermally assisted domain wall nucleation in p...   
5   Magnon contribution to the magnetoresistance o...   
6   Nucleation, growth, and dissolution of Ag nano...   
7   Dominance of quantum over classical correlatio...   
8   Concept of a laser-plasma based electron sourc...   
9   Current-phase Relationship, Thermal and Quantu...   
10  Almost-quantum correlations violate the isotro...   

                                                 BM25  \
0   Temperature dependence of coercivity for isola...   
1   Role of electron correlations in transport thr...   


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


                                               TF-IDF  \
0   Maximal Abelian gauge and a generalized BRST t...   
1   A Way of Separating Dynamics and Gauge Transfo...   
2   A Theory of Transformation Monoids: Combinator...   
3   The off-shell expansion relation of the Yang-M...   
4   Topological Vector Symmetry of BRSTQFT and Con...   
5   Integral transformation and Darboux transforma...   
6     Classes of confining gauge field configurations   
7   Global Regularity for the Yang-Mills Equations...   
8   Half-monopoles and half-vortices in the Yang-M...   
9   Exact Unitary Transformation of the One-Dimens...   
10  Translation and Rotation of Transformation Med...   

                                                 BM25  \
0   Maximal Abelian gauge and a generalized BRST t...   
1   A Way of Separating Dynamics and Gauge Transfo...   
2   A note on dual superconformal symmetry of the ...   
3   Lax Pair for Strings in Lunin-Maldacena Backgr...   
4   The off-shell expansion re

  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)
