# Recuperación de la Información
# Examen Segundo Bimestre RI

**Nombre:** Paul Lora

In [20]:
# imports

from sklearn.feature_extraction.text import TfidfVectorizer
import json
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import faiss
from openai import OpenAI
from dotenv import load_dotenv
import os



## Corpus

In [21]:
# Load the corpus from a JSON Lines file: one JSON object per non-blank line.
# Three parallel lists are filled in file order so that position i in each
# list refers to the same document.
corpus = []
titles = []
identifiers = []
# JSON text is UTF-8; pin the encoding so the read does not depend on the
# platform's default locale encoding.
with open("../data/examen/data.json", "r", encoding="utf-8") as f:
    for line in f:
        if line.strip():  # ignore blank lines
            entry = json.loads(line)
            # Missing fields fall back to "" so the lists stay aligned.
            title = entry.get("title", "")
            abstract = entry.get("abstract", "")
            identifier = entry.get("id", "")
            corpus.append(abstract)
            titles.append(title)
            identifiers.append(identifier)

In [22]:
# Assemble the three parallel lists into one table, one row per document.
corpus_df = pd.DataFrame(
    dict(identifier=identifiers, title=titles, raw=corpus)
)

In [23]:
# Text normalisation helper and the stop-word set it filters against.
stop_words = set(stopwords.words('english'))


def preprocess_doc(doc):
    """Lower-case and tokenize ``doc``, keeping only alphabetic non-stop-words.

    Args:
        doc: Text of a single document.

    Returns:
        The surviving tokens joined by single spaces.
    """
    tokens = word_tokenize(doc.lower())
    return ' '.join(
        token for token in tokens
        if token.isalpha() and token not in stop_words
    )

In [24]:
# Build the 'processed' column in one chain: tokenize and filter each raw
# abstract, then normalise whitespace (newlines -> spaces, collapse runs, trim).
corpus_df['processed'] = (
    corpus_df['raw']
    .apply(preprocess_doc)
    .str.replace("\n", " ")
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)
corpus_df

Unnamed: 0,identifier,title,raw,processed
0,0704.0001,Calculation of prompt diphoton production cros...,A fully differential calculation in perturba...,fully differential calculation perturbative qu...
1,0704.0002,Sparsity-certifying Graph Decompositions,"We describe a new algorithm, the $(k,\ell)$-...",describe new algorithm k game colors use obtai...
2,0704.0003,The evolution of the Earth-Moon system based o...,The evolution of Earth-Moon system is descri...,evolution system described dark matter field f...
3,0704.0004,A determinant of Stirling cycle numbers counts...,We show that a determinant of Stirling cycle...,show determinant stirling cycle numbers counts...
4,0704.0005,From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...,In this paper we show how to compute the $\L...,paper show compute norm using dyadic grid resu...
...,...,...,...,...
27918,0710.0971,The Extending for Composite Skyrme Model,"In this paper, we have extended the composit...",paper extended composite skyrme model proposed...
27919,0710.0972,A Floer homology for exact contact embeddings,In this paper we construct the Floer homolog...,paper construct floer homology action function...
27920,0710.0973,Modulation invariant bilinear T(1) theorem,We prove a T(1) theorem for bilinear singula...,prove theorem bilinear singular integral opera...
27921,0710.0974,Hawking radiation in GHS and non-extremal D1-D...,We apply the method of Banerjee and Kulkarni...,apply method banerjee kulkarni provide derivat...


## TFIDF

In [25]:
# Plain Python list of the preprocessed abstracts, in corpus order.
corpus_processed_list = corpus_df['processed'].to_list()

In [26]:
# Fit the TF-IDF vocabulary on the preprocessed corpus and vectorize it in the
# same pass; the result holds one row per document.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(
    corpus_processed_list
)

### Indexación TF-IDF

In [27]:
# Term-by-document TF-IDF table: rows are vocabulary terms, columns are
# document positions. The matrix is almost entirely zeros, so keep it
# sparse-backed; calling .todense() on a vocabulary x corpus matrix of this
# size materialises every zero as a float and can exhaust memory.
df = pd.DataFrame.sparse.from_spmatrix(
    tfidf_vectorizer_vectors.T,
    index=tfidf_vectorizer.get_feature_names_out(),
)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,27913,27914,27915,27916,27917,27918,27919,27920,27921,27922
aa,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aab,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aac,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aadl,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
aadt,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zz,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzgam,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzgamma,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zzh,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
def similar_docs_tfidf(query, top_k=10):
    """Return the raw abstracts of the documents most similar to ``query``.

    The query is vectorized with the already-fitted ``tfidf_vectorizer`` and
    compared against every corpus vector by cosine similarity.

    Args:
        query: Free-text query string.
        top_k: Maximum number of documents to return.

    Returns:
        A list of raw abstracts ordered from most to least similar. The list
        is empty when ``top_k`` is not positive.
    """
    if top_k <= 0:
        # Without this guard, ``argsort()[-0:]`` selects *every* document and
        # a negative value slices from the wrong end of the ranking.
        return []
    vectorized_query = tfidf_vectorizer.transform([query])
    cosine_similarities = cosine_similarity(vectorized_query, tfidf_vectorizer_vectors).flatten()
    # argsort is ascending: take the last top_k positions and reverse them.
    top_indices = cosine_similarities.argsort()[-top_k:][::-1]
    return corpus_df.iloc[top_indices]['raw'].tolist()

## Faiss

In [None]:
# Embed every preprocessed abstract with the sentence-transformer model.
model = SentenceTransformer('all-MiniLM-L6-v2')
processed_texts = corpus_df["processed"].tolist()
embeddings = model.encode(processed_texts, convert_to_numpy=True)

# Flat L2 index sized to the embedding width; vectors are added in corpus
# order, so FAISS ids line up with corpus_df row positions.
embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings)

# Keep each document's vector next to its text, stored as a plain Python list.
corpus_df['faiss_embeddings'] = embeddings.tolist()
corpus_df

In [None]:
def similar_docs_faiss(index, query, top_k=10):
    """Return the raw abstracts of the nearest neighbours of ``query``.

    The query is embedded with the module-level ``model`` and searched against
    ``index``; the returned ids are used as row positions into ``corpus_df``.

    Args:
        index: FAISS index to search.
        query: Free-text query string.
        top_k: Maximum number of documents to return.

    Returns:
        A list of raw abstracts in the order returned by the index. The list
        is empty when ``top_k`` is not positive and may be shorter than
        ``top_k`` when the index holds fewer vectors.
    """
    if top_k <= 0:
        return []
    query_embedding = model.encode([query], convert_to_numpy=True)
    _, indices = index.search(query_embedding, top_k)
    hits = indices.flatten()
    # FAISS fills missing neighbours with -1; iloc would silently read that as
    # "last row", so drop those placeholders before the lookup.
    hits = hits[hits >= 0]
    return corpus_df.iloc[hits]['raw'].tolist()

## RAG

In [None]:
# User question shared by both retrievers and interpolated into both prompts.
# NOTE(review): the query is Spanish, while preprocessing uses the English
# stop-word list — confirm this mismatch is intended.
query = "Impacto del cambio climatico"

In [None]:
# Retrieve the five best-matching abstracts from each retriever for the same query.
contexto_tfidf = similar_docs_tfidf(query=query, top_k=5)
contexto_faiss = similar_docs_faiss(index=index, query=query, top_k=5)

In [None]:
# load_dotenv() runs first so the key can be supplied through a .env file;
# the key is then read from the environment instead of being hard-coded.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
client = OpenAI(api_key=OPENAI_API_KEY)


### Respuesta TFIDF

In [None]:
# Build the RAG prompt (written in Spanish): instructions, the TF-IDF context
# and the user's query. `contexto_tfidf` is a list of abstracts, so the
# f-string embeds it as its Python list representation.
prompt = f"""
Eres una aplicacion de Retrieval Augmented Generation que siempre responde en español.
Usa el siguiente contexto para responder a la pregunta del usuario.
Si la respuesta no se encuentra en el contexto, responde "No tengo suficiente información para responder a esa pregunta".

Contexto:
{contexto_tfidf}

Pregunta:
El usuario esta preguntando sobre "{query}".
"""

# Send the prompt to the model and print the text of the reply.
response = client.responses.create(
    model="gpt-4.1-mini",
    input=prompt
)
print(response.output_text)

No tengo suficiente información para responder a esa pregunta.


### Respuesta FAISS

In [None]:
# Build the RAG prompt (written in Spanish): instructions, the FAISS context
# and the user's query. `contexto_faiss` is a list of abstracts, so the
# f-string embeds it as its Python list representation.
prompt = f"""
Eres una aplicacion de Retrieval Augmented Generation que siempre responde en español.
Usa el siguiente contexto para responder a la pregunta del usuario.
Si la respuesta no se encuentra en el contexto, responde "No tengo suficiente información para responder a esa pregunta".

Contexto:
{contexto_faiss}

Pregunta:
El usuario esta preguntando sobre "{query}".
"""

# Send the prompt to the model and print the text of the reply.
response = client.responses.create(
    model="gpt-4.1-mini",
    input=prompt
)
print(response.output_text)

El cambio climático tiene un impacto negativo en la biodiversidad, ya que puede llevar a la pérdida de muchas especies debido a cambios en las zonas climáticas, acidificación de los océanos y la desintegración de glaciares que afectan el suministro de agua para millones de personas. Además, el calentamiento global y sus efectos asociados pueden provocar la extinción masiva de especies, alterando profundamente el "árbol de la vida" en un futuro cercano. Por lo tanto, es crucial mantener los niveles de CO2 bajos para evitar estos daños significativos a la biodiversidad.


## Evaluación

### Documentos presentados por tfidf y faiss

In [None]:
def get_relevant_docs_df(similar_docs):
    """Look up the corpus rows whose raw abstract appears in ``similar_docs``.

    Args:
        similar_docs: Collection of raw abstract strings to match.

    Returns:
        A DataFrame with columns ``identifier``, ``title`` and ``abstract``
        (the ``raw`` column renamed), in corpus order.
    """
    is_retrieved = corpus_df['raw'].isin(similar_docs)
    selected = corpus_df.loc[is_retrieved, ['identifier', 'title', 'raw']]
    return selected.rename(columns={'raw': 'abstract'})

# Resolve the context returned by each retriever back to its corpus rows.
relevant_tfidf_df = get_relevant_docs_df(contexto_tfidf)
relevant_faiss_df = get_relevant_docs_df(contexto_faiss)

print(relevant_tfidf_df)
print(relevant_faiss_df)

      identifier                                              title  \
10358  0706.1672  Resonant phenomena in extended chaotic systems...   
12307  0706.3621                     Climate Change: The Sun's Role   
15742  0707.2572  Noise effects in extended chaotic system: stud...   
19998  0708.2147  The logistic equation and a critique of the th...   
24375  0709.2110  The climate version of the Eta regional foreca...   

                                                abstract  
10358    We investigate the effects of a time-correla...  
12307    The sun's role in the earth's recent warming...  
15742    We investigate the effects of a time-correla...  
19998    Species coexistence is one of the central th...  
24375    The regional climate model prepared from Eta...  
      identifier                                              title  \
2648   0704.2649  Hedging our bets: the expected contribution of...   
12307  0706.3621                     Climate Change: The Sun's Role   
12406 