In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/libreriajames/libJames.py


In [4]:
pip install ir_datasets


Note: you may need to restart the kernel to use updated packages.


#### Importación del dataset

In [5]:
import ir_datasets

ds = ir_datasets.load("beir/fiqa/test")

# Proyecto de 1er Bimestre
## Sistema de Recuperación de Información
por: Ozzy Loachamín

## A) Construcción del índice
1. Leer un corpus de documentos en texto plano.
2. Procesamiento básico: tokenización, normalización y remoción de stopwords.
3. Construcción de un índice invertido que almacene, para cada término, los documentos en los que aparece y su frecuencia.

### 1. Leer un corpus de documentos en texto plano.

In [6]:
docs = [
    {"doc_id": d.doc_id, "textD": d.text}
    for d in ds.docs_iter()
]

print(len(docs), docs[0])

docs_df = pd.DataFrame(docs)
docs_df.head()

[INFO] [starting] building docstore
[INFO] [starting] opening zip file                                              
[INFO] If you have a local copy of https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip, you can symlink it here to avoid downloading it again: /root/.ir_datasets/downloads/17918ed23cd04fb15047f73e6c3bd9d9
[INFO] [starting] https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip
docs_iter:   0%|                                     | 0/57638 [00:00<?, ?doc/s]
https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip: 0.0%| 0.00/17.9M [00:00<?, ?B/s][A
https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/fiqa.zip: 34.0%| 6.11M/17.9M [00:00<00:01, 7.50MB/s][A
                                                                                1%| 12.2M/17.9M [00:01<00:00, 7.56MB/s][A
[A                                                                                                                    [

57638 {'doc_id': '3', 'textD': "I'm not saying I don't like the idea of on-the-job training too, but you can't expect the company to do that. Training workers is not their job - they're building software. Perhaps educational systems in the U.S. (or their students) should worry a little about getting marketable skills in exchange for their massive investment in education, rather than getting out with thousands in student debt and then complaining that they aren't qualified to do anything."}


Unnamed: 0,doc_id,textD
0,3,I'm not saying I don't like the idea of on-the...
1,31,So nothing preventing false ratings besides ad...
2,56,You can never use a health FSA for individual ...
3,59,Samsung created the LCD and other flat screen ...
4,63,Here are the SEC requirements: The federal sec...


In [7]:
queries = [
    {"query_id": q.query_id, "textQ": q.text}
    for q in ds.queries_iter()
]

queries_df = pd.DataFrame(queries)
queries_df.head()


[INFO] [starting] opening zip file
[INFO] [finished] opening zip file [1ms]
[INFO] [starting] opening zip file
[INFO] [finished] opening zip file [0ms]


Unnamed: 0,query_id,textQ
0,4641,Where should I park my rainy-day / emergency f...
1,5503,Tax considerations for selling a property belo...
2,7803,Can the Delta be used to calculate the option ...
3,7017,Basic Algorithmic Trading Strategy
4,10152,What does a high operating margin but a small ...


In [8]:
qrels = [
    {
        "query_id": r.query_id,
        "doc_id": r.doc_id,
        "relevance": r.relevance
    }
    for r in ds.qrels_iter()
]

qrels_df = pd.DataFrame(qrels)
qrels_df.head()


Unnamed: 0,query_id,doc_id,relevance
0,8,566392,1
1,8,65404,1
2,15,325273,1
3,18,88124,1
4,26,285255,1


### 2. Procesamiento básico: tokenización, normalización y remoción de stopwords.

In [9]:
# Importar la librería
import sys
if "/kaggle/input/libreriajames" not in sys.path:
    sys.path.append("/kaggle/input/libreriajames")

import libJames

In [10]:
import importlib
importlib.reload(libJames)

<module 'libJames' from '/kaggle/input/libreriajames/libJames.py'>

In [11]:
# Limpieza
docs_df['text_cleaned'] = docs_df['textD'].apply(libJames.clean_text_extended)
docs_df.head()

Unnamed: 0,doc_id,textD,text_cleaned
0,3,I'm not saying I don't like the idea of on-the...,I m not saying I don t like the idea of on the...
1,31,So nothing preventing false ratings besides ad...,So nothing preventing false ratings besides ad...
2,56,You can never use a health FSA for individual ...,You can never use a health FSA for individual ...
3,59,Samsung created the LCD and other flat screen ...,Samsung created the LCD and other flat screen ...
4,63,Here are the SEC requirements: The federal sec...,Here are the SEC requirements The federal secu...


In [12]:
# Normalización
docs_df['text_norm'] = docs_df['text_cleaned'].apply(libJames.normalize_text)
docs_df.head()

Unnamed: 0,doc_id,textD,text_cleaned,text_norm
0,3,I'm not saying I don't like the idea of on-the...,I m not saying I don t like the idea of on the...,i m not saying i don t like the idea of on the...
1,31,So nothing preventing false ratings besides ad...,So nothing preventing false ratings besides ad...,so nothing preventing false ratings besides ad...
2,56,You can never use a health FSA for individual ...,You can never use a health FSA for individual ...,you can never use a health fsa for individual ...
3,59,Samsung created the LCD and other flat screen ...,Samsung created the LCD and other flat screen ...,samsung created the lcd and other flat screen ...
4,63,Here are the SEC requirements: The federal sec...,Here are the SEC requirements The federal secu...,here are the sec requirements the federal secu...


In [13]:
# Tokenización
docs_df['text_tokenized'] = docs_df['text_norm'].apply(libJames.tokenize_whitespace)
docs_df.head()

Unnamed: 0,doc_id,textD,text_cleaned,text_norm,text_tokenized
0,3,I'm not saying I don't like the idea of on-the...,I m not saying I don t like the idea of on the...,i m not saying i don t like the idea of on the...,"[i, m, not, saying, i, don, t, like, the, idea..."
1,31,So nothing preventing false ratings besides ad...,So nothing preventing false ratings besides ad...,so nothing preventing false ratings besides ad...,"[so, nothing, preventing, false, ratings, besi..."
2,56,You can never use a health FSA for individual ...,You can never use a health FSA for individual ...,you can never use a health fsa for individual ...,"[you, can, never, use, a, health, fsa, for, in..."
3,59,Samsung created the LCD and other flat screen ...,Samsung created the LCD and other flat screen ...,samsung created the lcd and other flat screen ...,"[samsung, created, the, lcd, and, other, flat,..."
4,63,Here are the SEC requirements: The federal sec...,Here are the SEC requirements The federal secu...,here are the sec requirements the federal secu...,"[here, are, the, sec, requirements, the, feder..."


In [14]:
# Eliminación de Stop Words
libJames.init_nltk()
docs_df['toks_no_stopW'] = docs_df['text_tokenized'].apply(libJames.remove_stopwords)
docs_df.head()

Unnamed: 0,doc_id,textD,text_cleaned,text_norm,text_tokenized,toks_no_stopW
0,3,I'm not saying I don't like the idea of on-the...,I m not saying I don t like the idea of on the...,i m not saying i don t like the idea of on the...,"[i, m, not, saying, i, don, t, like, the, idea...","[saying, like, idea, job, training, expect, co..."
1,31,So nothing preventing false ratings besides ad...,So nothing preventing false ratings besides ad...,so nothing preventing false ratings besides ad...,"[so, nothing, preventing, false, ratings, besi...","[nothing, preventing, false, ratings, besides,..."
2,56,You can never use a health FSA for individual ...,You can never use a health FSA for individual ...,you can never use a health fsa for individual ...,"[you, can, never, use, a, health, fsa, for, in...","[never, use, health, fsa, individual, health, ..."
3,59,Samsung created the LCD and other flat screen ...,Samsung created the LCD and other flat screen ...,samsung created the lcd and other flat screen ...,"[samsung, created, the, lcd, and, other, flat,...","[samsung, created, lcd, flat, screen, technolo..."
4,63,Here are the SEC requirements: The federal sec...,Here are the SEC requirements The federal secu...,here are the sec requirements the federal secu...,"[here, are, the, sec, requirements, the, feder...","[sec, requirements, federal, securities, laws,..."


In [15]:
# Stemming
docs_df['toks_stemming'] = docs_df['toks_no_stopW'].apply(libJames.apply_stemming)
docs_df.head()

Unnamed: 0,doc_id,textD,text_cleaned,text_norm,text_tokenized,toks_no_stopW,toks_stemming
0,3,I'm not saying I don't like the idea of on-the...,I m not saying I don t like the idea of on the...,i m not saying i don t like the idea of on the...,"[i, m, not, saying, i, don, t, like, the, idea...","[saying, like, idea, job, training, expect, co...","[say, like, idea, job, train, expect, compani,..."
1,31,So nothing preventing false ratings besides ad...,So nothing preventing false ratings besides ad...,so nothing preventing false ratings besides ad...,"[so, nothing, preventing, false, ratings, besi...","[nothing, preventing, false, ratings, besides,...","[noth, prevent, fals, rate, besid, addit, scru..."
2,56,You can never use a health FSA for individual ...,You can never use a health FSA for individual ...,you can never use a health fsa for individual ...,"[you, can, never, use, a, health, fsa, for, in...","[never, use, health, fsa, individual, health, ...","[never, use, health, fsa, individu, health, in..."
3,59,Samsung created the LCD and other flat screen ...,Samsung created the LCD and other flat screen ...,samsung created the lcd and other flat screen ...,"[samsung, created, the, lcd, and, other, flat,...","[samsung, created, lcd, flat, screen, technolo...","[samsung, creat, lcd, flat, screen, technolog,..."
4,63,Here are the SEC requirements: The federal sec...,Here are the SEC requirements The federal secu...,here are the sec requirements the federal secu...,"[here, are, the, sec, requirements, the, feder...","[sec, requirements, federal, securities, laws,...","[sec, requir, feder, secur, law, defin, term, ..."


In [16]:
# clean_text
docs_df['text_processed'] = docs_df['textD'].apply(libJames.clean_text, remove_stopwords_flag=True, apply_stemming_flag=True)
docs_df.head()

Unnamed: 0,doc_id,textD,text_cleaned,text_norm,text_tokenized,toks_no_stopW,toks_stemming,text_processed
0,3,I'm not saying I don't like the idea of on-the...,I m not saying I don t like the idea of on the...,i m not saying i don t like the idea of on the...,"[i, m, not, saying, i, don, t, like, the, idea...","[saying, like, idea, job, training, expect, co...","[say, like, idea, job, train, expect, compani,...",say like idea job train expect compani train w...
1,31,So nothing preventing false ratings besides ad...,So nothing preventing false ratings besides ad...,so nothing preventing false ratings besides ad...,"[so, nothing, preventing, false, ratings, besi...","[nothing, preventing, false, ratings, besides,...","[noth, prevent, fals, rate, besid, addit, scru...",noth prevent fals rate besid addit scrutini ma...
2,56,You can never use a health FSA for individual ...,You can never use a health FSA for individual ...,you can never use a health fsa for individual ...,"[you, can, never, use, a, health, fsa, for, in...","[never, use, health, fsa, individual, health, ...","[never, use, health, fsa, individu, health, in...",never use health fsa individu health insur pre...
3,59,Samsung created the LCD and other flat screen ...,Samsung created the LCD and other flat screen ...,samsung created the lcd and other flat screen ...,"[samsung, created, the, lcd, and, other, flat,...","[samsung, created, lcd, flat, screen, technolo...","[samsung, creat, lcd, flat, screen, technolog,...",samsung creat lcd flat screen technolog like o...
4,63,Here are the SEC requirements: The federal sec...,Here are the SEC requirements The federal secu...,here are the sec requirements the federal secu...,"[here, are, the, sec, requirements, the, feder...","[sec, requirements, federal, securities, laws,...","[sec, requir, feder, secur, law, defin, term, ...",sec requir feder secur law defin term accredit...


Visualización final, comparación de los resultados

In [17]:
docs_df[['textD','text_processed']]

Unnamed: 0,textD,text_processed
0,I'm not saying I don't like the idea of on-the...,say like idea job train expect compani train w...
1,So nothing preventing false ratings besides ad...,noth prevent fals rate besid addit scrutini ma...
2,You can never use a health FSA for individual ...,never use health fsa individu health insur pre...
3,Samsung created the LCD and other flat screen ...,samsung creat lcd flat screen technolog like o...
4,Here are the SEC requirements: The federal sec...,sec requir feder secur law defin term accredit...
...,...,...
57633,"&gt;Well, first off, the roads are more than j...",gt well first road hobbi realli go place lot r...
57634,Yes they do. There are billions and billions s...,ye billion billion spent subsidi pharmaceut co...
57635,&gt;It's biggly sad you don't understand human...,gt biggli sad understand human natur noth huma...
57636,"""Did your CTO let a major group use """"admin/ad...",cto let major group use admin admin administr ...


Ahora, se hace el mismo procesado para el DataFrame de Queries

In [18]:
queries_df['text_cleaned'] = queries_df['textQ'].apply(libJames.clean_text_extended)
queries_df['text_norm'] = queries_df['text_cleaned'].apply(libJames.normalize_text)
queries_df['text_tokenized'] = queries_df['text_norm'].apply(libJames.tokenize_whitespace)
queries_df['toks_no_stopW'] = queries_df['text_tokenized'].apply(libJames.remove_stopwords)
queries_df['toks_stemming'] = queries_df['toks_no_stopW'].apply(libJames.apply_stemming)
queries_df['text_processed'] = queries_df['textQ'].apply(libJames.clean_text, remove_stopwords_flag=True, apply_stemming_flag=True)

In [19]:
queries_df.head()

Unnamed: 0,query_id,textQ,text_cleaned,text_norm,text_tokenized,toks_no_stopW,toks_stemming,text_processed
0,4641,Where should I park my rainy-day / emergency f...,Where should I park my rainy day emergency fund,where should i park my rainy day emergency fund,"[where, should, i, park, my, rainy, day, emerg...","[park, rainy, day, emergency, fund]","[park, raini, day, emerg, fund]",park raini day emerg fund
1,5503,Tax considerations for selling a property belo...,Tax considerations for selling a property belo...,tax considerations for selling a property belo...,"[tax, considerations, for, selling, a, propert...","[tax, considerations, selling, property, appra...","[tax, consider, sell, properti, apprais, valu,...",tax consider sell properti apprais valu famili
2,7803,Can the Delta be used to calculate the option ...,Can the Delta be used to calculate the option ...,can the delta be used to calculate the option ...,"[can, the, delta, be, used, to, calculate, the...","[delta, used, calculate, option, premium, give...","[delta, use, calcul, option, premium, given, c...",delta use calcul option premium given certain ...
3,7017,Basic Algorithmic Trading Strategy,Basic Algorithmic Trading Strategy,basic algorithmic trading strategy,"[basic, algorithmic, trading, strategy]","[basic, algorithmic, trading, strategy]","[basic, algorithm, trade, strategi]",basic algorithm trade strategi
4,10152,What does a high operating margin but a small ...,What does a high operating margin but a small ...,what does a high operating margin but a small ...,"[what, does, a, high, operating, margin, but, ...","[high, operating, margin, small, positive, roe...","[high, oper, margin, small, posit, roe, impli,...",high oper margin small posit roe impli compani


### 3. Construcción de un índice invertido que almacene, para cada término, los documentos en los que aparece y su frecuencia.

In [20]:
inv_index = libJames.build_inverted_index(
    docs=docs_df['text_processed'],
    doc_ids=docs_df['doc_id']
)
len(inv_index)

46440

In [21]:
# Mostrar el primer elemento del indice invertido (clave, valor)
# list(inv_index.items())[0]


In [22]:
# doc_id_buscado = "593" # en 593 aparecia 4 veces la palabra 'say'
# texto = docs_df.loc[docs_df["doc_id"] == doc_id_buscado, "text_processed"].iloc[0]
# print(texto)

## B) Modelo de recuperación
1. Implementar recuperación basada en similitud Jaccard utilizando vectores binarios.
2. Implementar recuperación basada en similitud de coseno utilizando TF-IDF.
3. Implementar recuperación con BM25.
4. Permitir la ejecución de consultas de texto libre.
5. Mostrar un ranking de documentos ordenados por relevancia.

### 1. Implementar recuperación basada en similitud Jaccard utilizando vectores binarios.

In [23]:
doc_texts = dict(zip(docs_df["doc_id"], docs_df["text_processed"]))


In [24]:
# Inserción de consulta y limpieza
query = "Where should I park my emergency fund?"
query_clean = libJames.clean_text(query)
print(query_clean)

where should i park my emergency fund


In [25]:
jaccard_results = libJames.jaccard_rank(
    query=query_clean,
    docs=docs_df["text_processed"],
    doc_ids=docs_df["doc_id"],
    top_k=10,
    doc_texts= doc_texts
)
jaccard_results


Unnamed: 0,doc_id,score,snippet
0,290830,0.142857,fund
1,589544,0.105263,invest exist would bank park overnight fund fe...
2,372677,0.090909,fund prospectu good place start
3,264740,0.090909,ishar jantzi social index fund
4,274859,0.083333,own physic gold assum coin own gold fund
5,551764,0.083333,option park money bank work best
6,490397,0.083333,per op request slush fund mayb
7,69915,0.076923,year comparison fund p say sure need
8,144054,0.076923,googl unclaim fund might need state separ
9,135164,0.071429,report e cash flow select mutual fund account


### 2. Implementar recuperación basada en similitud de coseno utilizando TF-IDF.

In [26]:
vect, tfidf_matrix, vocab_tfidf = libJames.fit_tfidf_vectorizer(
    docs_df["text_processed"],
    lowercase=False  # ya se procesaron los datos
)

In [27]:
tfidf_results = libJames.rank_documents_tfidf(
    query=query_clean,
    vectorizer=vect,
    tfidf_matrix=tfidf_matrix,
    doc_labels=docs_df["doc_id"],
    top_k=10,
    doc_texts=doc_texts
)
tfidf_results


Unnamed: 0,doc_label,score,snippet
0,178386,0.314745,look like use employe benefit pay park near ho...
1,551764,0.306865,option park money bank work best
2,73700,0.29032,inugo park space finder help find earli bird p...
3,298053,0.277022,thought find auckland park get stress level pe...
4,105340,0.265113,best tl dr could make origin http qz com your ...
5,290830,0.25113,fund
6,589544,0.233553,invest exist would bank park overnight fund fe...
7,71253,0.233026,usual approach set friend x park chariti chari...
8,303639,0.225598,factor coupl thing first occasion opportun get...
9,450228,0.208527,never invest money need short term alreadi sug...


### 3. Implementar recuperación con BM25.

In [28]:
bm25_results = libJames.bm25_rank(
    query=query_clean,
    inverted_index=inv_index,
    doc_texts=doc_texts,
    top_k=10
)
bm25_results


Unnamed: 0,doc_id,score,snippet
0,537111,12.053415,know free sourc year histor data larg set comp...
1,589544,11.622308,invest exist would bank park overnight fund fe...
2,376148,11.486115,bond necessarili safer stock market ultim thin...
3,241085,11.312113,go talk benefit offic understand deadlin rule ...
4,10374,10.945611,exactli newer set better cheaper past basic ai...
5,101543,10.836006,deduct talk flexibl spend plan otherwis known ...
6,226519,10.803354,would take item other consid would count expen...
7,178386,10.723835,look like use employe benefit pay park near ho...
8,73700,10.653935,inugo park space finder help find earli bird p...
9,298053,10.580504,thought find auckland park get stress level pe...


### 4. Permitir la ejecución de consultas de texto libre.

In [29]:
query = "How much money should I keep in my checking account?"
query_clean = libJames.clean_text(query)

df_results = libJames.search_query(
    query_text=query_clean,
    docs_clean=docs_df["text_processed"],
    doc_ids=docs_df["doc_id"],
    inverted_index=inv_index,
    vectorizer=vect,
    tfidf_matrix=tfidf_matrix,
    top_k=10,
    doc_texts=doc_texts
)


### 5. Mostrar un ranking de documentos ordenados por relevancia.

In [30]:
df_results

Unnamed: 0,query_id,model,doc_id,score,rank,snippet
0,manual,jaccard,157972,0.176471,1,would put money high interest save account ear...
1,manual,jaccard,535931,0.153846,2,truli emerg fund keep money safe unfortun inte...
2,manual,jaccard,517428,0.15,3,bottom line keep money account check privileg ...
3,manual,jaccard,396810,0.148148,4,save peopl click edit straight averag account ...
4,manual,jaccard,211767,0.142857,5,lot option person avoid keep money bank accoun...
5,manual,jaccard,34997,0.142857,6,answer money got bounc back account
6,manual,jaccard,339448,0.136364,7,person use earlier date quicken look like lose...
7,manual,jaccard,573713,0.136364,8,ever need money three year imagin today need m...
8,manual,jaccard,93463,0.133333,9,pretti much shot credibl possess follow money
9,manual,jaccard,444351,0.130435,10,like year ago besid point save save save notio...


## C) Interfaz básica
1. Interfaz de línea de comandos (CLI).
2. Funcionalidades mínimas:\
    2.1. Realizar consultas. \
    2.2. Visualizar los resultados.

In [31]:
!pip install nltk --quiet

In [32]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [33]:
!pip install scipy --quiet


In [34]:
# CELDA 2: guardar assets por separado (usa esto)
import pickle
import os
import joblib
import scipy.sparse as sp

# Ajusta estos nombres a tus variables en memoria
# docs_clean: list o pd.Series de textos limpios
# doc_ids: list o pd.Series de ids
# inverted_index: dict
# vectorizer: sklearn TfidfVectorizer
# tfidf_matrix: puede ser numpy array o scipy.sparse matrix (de preferencia sparse)

# 1) guarda objetos pequeños (sin la matriz)
doc_texts_map = dict(zip(docs_df['doc_id'].astype(str), docs_df['text_processed']))

small_assets = {
    "docs_clean": docs_df['text_processed'],
    "doc_ids": docs_df['doc_id'],
    "inverted_index": inv_index,
    "doc_texts": doc_texts_map
}
with open("retrieval_assets_small.pkl", "wb") as f:
    pickle.dump(small_assets, f)

# 2) guarda vectorizer con joblib (compacto)
joblib.dump(vect, "vectorizer.joblib", compress=3)

# 3) guarda la matriz TF-IDF como sparse .npz (si tfidf_matrix es numpy, convierte a sparse)
if not sp.issparse(tfidf_matrix):
    tfidf_matrix = sp.csr_matrix(tfidf_matrix)

sp.save_npz("tfidf_matrix.npz", tfidf_matrix, compressed=True)

# 4) mostrar tamaños
print("Saved small pickle:", os.path.getsize("retrieval_assets_small.pkl")/1024/1024, "MB")
print("Saved vectorizer.joblib:", os.path.getsize("vectorizer.joblib")/1024/1024, "MB")
print("Saved tfidf_matrix.npz:", os.path.getsize("tfidf_matrix.npz")/1024/1024, "MB")


Saved small pickle: 49.18948936462402 MB
Saved vectorizer.joblib: 0.5459012985229492 MB
Saved tfidf_matrix.npz: 25.370241165161133 MB


Crear el archivo cli_search.py

In [35]:
%%writefile cli_search.py
import argparse
import scipy.sparse as sp
import pickle
import joblib
import numpy as np
from libJames import search_query  # Asegúrate que esté definido en libJames

def load_assets(path_small="retrieval_assets_small.pkl",
                vectorizer_path="vectorizer.joblib",
                tfidf_path="tfidf_matrix.npz"):

    # 1) cargar assets pequeños (docs_clean, doc_ids, inverted_index)
    with open(path_small, "rb") as f:
        small = pickle.load(f)

    docs_clean = small["docs_clean"]
    doc_ids = small["doc_ids"]
    inverted_index = small["inverted_index"]
    doc_texts = small["doc_texts"]

    # 2) cargar vectorizer
    vectorizer = joblib.load(vectorizer_path)

    # 3) cargar matriz TF-IDF
    tfidf_matrix = sp.load_npz(tfidf_path)

    return docs_clean, doc_ids, inverted_index, doc_texts, vectorizer, tfidf_matrix


def main():
    parser = argparse.ArgumentParser(description="Realizar búsqueda semántica + lexical")
    parser.add_argument("query", type=str, help="Consulta del usuario")
    args = parser.parse_args()

    print("Cargando assets…")
    docs_clean, doc_ids, inverted_index, doc_texts, vectorizer, tfidf_matrix = load_assets()

    print("Procesando consulta…")
    results_df = search_query(
        query_text=args.query,
        vectorizer=vectorizer,
        tfidf_matrix=tfidf_matrix,
        docs_clean=docs_clean,
        doc_ids=doc_ids,
        inverted_index=inverted_index,
        doc_texts=doc_texts,
        top_k=5
    )

    # Convertimos cada fila a diccionario para iterar
    results = results_df.to_dict(orient="records")

    print("\n=== Resultados de búsqueda (agrupados por modelo) ===\n")

    # Agrupamos por modelo
    model_groups = {}
    for r in results:
        model_groups.setdefault(r["model"], []).append(r)
    
    # Función para recortar snippet a N caracteres
    def short_snippet(s, length=60):
        if not s:
            return ""
        s = s.replace("\n", " ").strip()
        return (s[:length] + "…") if len(s) > length else s
    
    # Imprimir resultados por modelo
    for model, items in model_groups.items():
        print(f"\n### Modelo: {model.upper()} ###")
    
        for i, r in enumerate(items, start=1):
            snippet_short = short_snippet(r.get("snippet", ""), 60)
            print(f"{i:>2}) ID: {r['doc_id']:<8}  Score: {r['score']:<8.4f}  Snippet: {snippet_short}")
    
        print("-" * 40)




if __name__ == "__main__":
    main()


Writing cli_search.py


Ejecutar desde la CLI de Kaggle

In [36]:
!cp /kaggle/input/libreriajames/libJames.py ./

### 1. Interfaz de línea de comandos (CLI).

In [37]:
!python cli_search.py <consulta>

/bin/bash: -c: line 1: syntax error near unexpected token `newline'
/bin/bash: -c: line 1: `python cli_search.py <consulta>'


### 2. Funcionalidades mínimas: realizar consultas y visualizar resultados

## Interfaz de Línea de Comandos

In [38]:
!python cli_search.py "Where should I park my emergency fund?"


Cargando assets…
Procesando consulta…

=== Resultados de búsqueda (agrupados por modelo) ===


### Modelo: JACCARD ###
 1) ID: 290830    Score: 0.1429    Snippet: fund
 2) ID: 589544    Score: 0.1053    Snippet: invest exist would bank park overnight fund feder reserv int…
 3) ID: 372677    Score: 0.0909    Snippet: fund prospectu good place start
 4) ID: 264740    Score: 0.0909    Snippet: ishar jantzi social index fund
 5) ID: 274859    Score: 0.0833    Snippet: own physic gold assum coin own gold fund
----------------------------------------

### Modelo: BM25 ###
 1) ID: 537111    Score: 12.0534   Snippet: know free sourc year histor data larg set compani singl comp…
 2) ID: 589544    Score: 11.6223   Snippet: invest exist would bank park overnight fund feder reserv int…
 3) ID: 376148    Score: 11.4861   Snippet: bond necessarili safer stock market ultim thing low risk mut…
 4) ID: 241085    Score: 11.3121   Snippet: go talk benefit offic understand deadlin rule program ir enf…
 5)

In [65]:
!python cli_search.py "pleas explain use histor exampl"

Cargando assets…
Procesando consulta…

=== Resultados de búsqueda (agrupados por modelo) ===


### Modelo: JACCARD ###
 1) ID: 169028    Score: 0.2500    Snippet: pleas explain use histor exampl would purchas debt solid fin…
 2) ID: 584801    Score: 0.1667    Snippet: use stockchart spread chart take question exampl chart appl …
 3) ID: 381310    Score: 0.1429    Snippet: folk explain comment
 4) ID: 425250    Score: 0.1304    Snippet: gt challeng view pleas provid exampl countri anywher world p…
 5) ID: 421112    Score: 0.1304    Snippet: dafuq read think would better explain excel chart idea wrote…
----------------------------------------

### Modelo: BM25 ###
 1) ID: 169028    Score: 25.3023   Snippet: pleas explain use histor exampl would purchas debt solid fin…
 2) ID: 5360      Score: 17.6643   Snippet: problem comment e declin uk manufactur foreign polici crude …
 3) ID: 82021     Score: 16.5534   Snippet: fair point histor fuck lol pleas explain sinc layman mayb di…
 4) ID: 425

In [64]:
doc_id_buscado = "169028" # en 593 aparecia 4 veces la palabra 'say'
texto = docs_df.loc[docs_df["doc_id"] == doc_id_buscado, "text_processed"].iloc[0]
print(texto)

pleas explain use histor exampl would purchas debt solid financi decis histor success anoth countri oh right mayb understand econom loser


## D) Evaluación de resultados
1. Usar un conjunto de consultas de prueba y documentos relevantes (qrels).
2. Calcular para cada consulta:\
    2.1. Precision.\
    2.2. Recall.
3. Calcular para todo el sistema:\
    3.1. MAP.

### 1. Usar un conjunto de consultas de prueba y documentos relevantes (qrels).

In [46]:
# Celda A: importar y cargar assets
import pickle, joblib
import scipy.sparse as sp
# import pandas as pd
# import numpy as np
# import libJames

# Ajusta rutas si es necesario
SMALL_PATH = "retrieval_assets_small.pkl"
VECT_PATH  = "vectorizer.joblib"
TFIDF_PATH = "tfidf_matrix.npz"

with open(SMALL_PATH, "rb") as f:
    small = pickle.load(f)

docs_clean = small["docs_clean"]   # pd.Series o list de textos "procesados"
doc_ids    = small["doc_ids"]      # pd.Series o list con doc ids (mismo orden que tfidf)
inverted_index = small["inverted_index"]
doc_texts = small.get("doc_texts", None)  # opcional, para snippets

vectorizer = joblib.load(VECT_PATH)
tfidf_matrix = sp.load_npz(TFIDF_PATH)   # scipy.sparse matrix

print("Assets cargados: docs:", len(docs_clean), "doc_ids:", len(doc_ids))
print("TF-IDF shape:", tfidf_matrix.shape)


Assets cargados: docs: 57638 doc_ids: 57638
TF-IDF shape: (57638, 46420)


In [47]:
print("Queries:", queries_df.shape)
print("Qrels:", qrels_df.shape)

Queries: (648, 8)
Qrels: (1706, 3)


In [48]:
# Celda C: helpers para ejecutar modelos y obtener ranked lists
def get_ranked_list_for_query(query_id, query_text, top_k=100, doc_texts=None):
    """
    Ejecuta search_query para una consulta y devuelve:
    - dict de DataFrames por modelo (jaccard, bm25, tfidf) con columnas estándar:
      ['query_id','model','doc_id','score','rank', 'snippet' (opt)]
    """
    df_all = libJames.search_query(
        query_text=query_text,
        docs_clean=docs_clean,
        doc_ids=doc_ids,
        inverted_index=inverted_index,
        vectorizer=vectorizer,
        tfidf_matrix=tfidf_matrix,
        top_k=top_k,
        return_all_models=True,
        doc_texts=(doc_texts if doc_texts is not None else None)
    )
    # df_all tiene concatenado jaccard/bm25/tfidf; convertir a dict por modelo
    models = {}
    for m in df_all['model'].unique():
        dfm = df_all[df_all['model'] == m].sort_values("score", ascending=False).reset_index(drop=True)
        # ranked_list como lista de doc_id (mantener tipo consistente con qrels)
        ranked_list = dfm['doc_id'].astype(str).tolist()
        models[m] = {
            "df": dfm,
            "ranked_list": ranked_list
        }
    return models


### 2. Calcular para cada consulta: precision y recall

In [49]:
# Celda D: evaluar para un conjunto de queries (las query_id que aparecen en qrels_df)
from collections import defaultdict

# Parámetros de evaluación
TOP_K = 10   # k para P@k y R@k
RANK_DEPTH = 100  # cuántos devolver para ranking (suficiente para calcular AP)
query_ids_to_eval = sorted(qrels_df['query_id'].astype(str).unique().tolist())

# Mapear qrels: query_id -> set(doc_id) (usar strings para evitar mismatch)
qid_to_relevant = {}
for _, row in qrels_df.iterrows():
    qid = str(row['query_id'])
    docid = str(row['doc_id'])
    if qid not in qid_to_relevant:
        qid_to_relevant[qid] = set()
    # considerar solo relevancia > 0 si tienes campo relevance
    if int(row.get('relevance', 1)) > 0:
        qid_to_relevant[qid].add(docid)

# DataFrame para guardar métricas por query y por modelo
metrics_rows = []

for qid in query_ids_to_eval:
    # obtener texto de la query; si no existiera saltarlo con warning
    try:
        # ajusta nombre de columna si tu df usa otra
        qtext = queries_df.loc[queries_df['query_id'].astype(str) == qid, 'textQ'].iloc[0]
    except Exception:
        # intenta otra columna: text_processed
        qtext = queries_df.loc[queries_df['query_id'].astype(str) == qid, 'text_processed'].iloc[0]
    relevant = qid_to_relevant.get(qid, set())

    # Ejecutar modelos y obtener ranked lists
    models = get_ranked_list_for_query(qid, qtext, top_k=RANK_DEPTH, doc_texts=doc_texts)

    for model_name, info in models.items():
        ranked_list = info['ranked_list']   # lista de doc_id (strings)
        # métricas
        p_at_k = libJames.precision_at_k(ranked_list, relevant, TOP_K)
        r_at_k = libJames.recall_at_k(ranked_list, relevant, TOP_K)
        ap = libJames.average_precision(ranked_list, relevant)

        metrics_rows.append({
            "query_id": qid,
            "model": model_name,
            "precision@{}".format(TOP_K): p_at_k,
            "recall@{}".format(TOP_K): r_at_k,
            "AP": ap,
            "n_relevant": len(relevant)
        })

metrics_df = pd.DataFrame(metrics_rows)
metrics_df.head(20)


  return pd.concat([df_j, df_bm25, df_tfidf], ignore_index=True)
  return pd.concat([df_j, df_bm25, df_tfidf], ignore_index=True)


Unnamed: 0,query_id,model,precision@10,recall@10,AP,n_relevant
0,10034,jaccard,0.0,0.0,0.0,2
1,10034,bm25,0.1,0.5,0.25,2
2,10034,tfidf,0.0,0.0,0.041667,2
3,10039,jaccard,0.0,0.0,0.0,1
4,10039,bm25,0.0,0.0,0.0,1
5,10039,tfidf,0.0,0.0,0.0,1
6,10109,jaccard,0.1,0.142857,0.071429,7
7,10109,bm25,0.2,0.285714,0.285714,7
8,10109,tfidf,0.2,0.285714,0.285714,7
9,10122,jaccard,0.0,0.0,0.0,4


### 3. Calcular para todo el sistema: MAP

In [50]:
# Celda E: MAP y promedios por modelo
summary = metrics_df.groupby("model").agg({
    "precision@{}".format(TOP_K): "mean",
    "recall@{}".format(TOP_K): "mean",
    "AP": "mean",
    "n_relevant": "sum"
}).rename(columns={
    "precision@{}".format(TOP_K): "mean_precision@{}".format(TOP_K),
    "recall@{}".format(TOP_K): "mean_recall@{}".format(TOP_K),
    "AP": "MAP"
})

print(summary)
# también guardar resultados
metrics_df.to_csv("evaluation_per_query_model.csv", index=False)
summary.to_csv("evaluation_summary_by_model.csv")
print("Guardado: evaluation_per_query_model.csv  and evaluation_summary_by_model.csv")


         mean_precision@10  mean_recall@10       MAP  n_relevant
model                                                           
bm25              0.028638        0.138506  0.072424        1700
jaccard           0.008333        0.037242  0.019926        1706
tfidf             0.020062        0.087760  0.045794        1706
Guardado: evaluation_per_query_model.csv  and evaluation_summary_by_model.csv
