In [8]:
import pandas as pd


In [9]:
# Load the relevance judgments: a header-less, tab-separated file whose two
# columns are named here as the query id and its raw judgment string.
df = pd.read_csv(
    'data/relevance-judgments/relevance-judgments.tsv',
    sep='\t',
    header=None,
    names=['query', 'value'],
)
df.head(3)

Unnamed: 0,query,value
0,q01,"d186:4,d254:5,d016:5"
1,q02,"d136:2,d139:2,d143:4,d283:4,d228:4,d164:4,d318..."
2,q03,"d152:3,d291:4,d283:4,d147:3,d318:2,d105:2"


In [10]:
def count_documents(value):
    """Count the comma-separated entries in a judgment string.

    Parameters
    ----------
    value : str
        Comma-separated entries, e.g. ``"d186:4,d254:5,d016:5"``.

    Returns
    -------
    int
        Number of entries. An empty string counts as one (empty) entry,
        exactly as ``len(value.split(','))`` would report.
    """
    # N separators delimit N + 1 entries.
    return value.count(',') + 1

# M = number of comma-separated document entries listed for each query.
df['M'] = df['value'].map(count_documents)

df.head()

Unnamed: 0,query,value,M
0,q01,"d186:4,d254:5,d016:5",3
1,q02,"d136:2,d139:2,d143:4,d283:4,d228:4,d164:4,d318...",11
2,q03,"d152:3,d291:4,d283:4,d147:3,d318:2,d105:2",6
3,q04,"d275:3,d010:3,d286:2,d019:2,d049:2,d330:2,d270:3",7
4,q06,"d069:2,d233:3,d257:2,d297:3,d026:4,d329:5",6


In [11]:
# Display the judgments table together with the per-query entry count M.
df

Unnamed: 0,query,value,M
0,q01,"d186:4,d254:5,d016:5",3
1,q02,"d136:2,d139:2,d143:4,d283:4,d228:4,d164:4,d318...",11
2,q03,"d152:3,d291:4,d283:4,d147:3,d318:2,d105:2",6
3,q04,"d275:3,d010:3,d286:2,d019:2,d049:2,d330:2,d270:3",7
4,q06,"d069:2,d233:3,d257:2,d297:3,d026:4,d329:5",6
5,q07,"d004:3,d077:3,d266:2,d179:3",4
6,q08,"d205:2,d005:4,d110:4,d108:3,d117:3,d081:2,d292...",12
7,q09,"d205:3,d199:5,d198:3,d223:2,d217:2,d177:2",6
8,q10,"d068:2,d100:2,d065:3,d076:3,d231:4,d199:4,d052...",8
9,q12,"d239:4,d277:4,d258:3,d250:4",4


In [12]:

# Load the relevance judgments file (tab-separated, no header row): each row
# holds a query id and a comma-separated list of `doc_id:grade` pairs.
relevance_judgments = pd.read_csv('data/relevance-judgments/relevance-judgments.tsv', sep='\t', header=None, names=['query', 'value'])

# Build the relevance dictionary: query id -> {doc id -> integer relevance grade}.
relevance_dict = {}
# Iterate the two columns directly; iterrows() builds a Series per row, which
# is unnecessary overhead when only two plain values are needed.
for query_id, value in zip(relevance_judgments['query'], relevance_judgments['value']):
    relevances = {}
    for pair in value.split(','):
        # Split each `doc_id:grade` pair once instead of twice.
        fields = pair.split(':')
        relevances[fields[0]] = int(fields[1])
    relevance_dict[query_id] = relevances


In [13]:
def precision_at_m(retrieved_docs, relevant_docs, M):
    """Precision at cutoff M: share of the top-M retrieved documents that are relevant.

    Parameters
    ----------
    retrieved_docs : sequence
        Retrieved document ids in rank order (best first).
    relevant_docs : container
        Relevant document ids; anything supporting ``in`` (e.g. a dict keyed
        by document id).
    M : int
        Rank cutoff.

    Returns
    -------
    float
        ``(number of relevant ids in retrieved_docs[:M]) / M``. The divisor is
        always ``M``, even when fewer than ``M`` documents were retrieved.
        Returns ``0.0`` when ``M <= 0``.
    """
    # A non-positive cutoff selects nothing: return 0.0 instead of dividing by
    # zero (M == 0) or letting a negative M slice from the end of the ranking.
    if M <= 0:
        return 0.0
    hits = sum(1 for doc in retrieved_docs[:M] if doc in relevant_docs)
    return hits / M

def recall_at_m(retrieved_docs, relevant_docs, M):
    """Recall at cutoff M: share of the relevant documents found in the top M.

    Parameters
    ----------
    retrieved_docs : sequence
        Retrieved document ids in rank order (best first).
    relevant_docs : sized container
        Relevant document ids; must support ``in`` and ``len`` (e.g. a dict
        keyed by document id).
    M : int
        Rank cutoff.

    Returns
    -------
    float
        ``(number of relevant ids in retrieved_docs[:M]) / len(relevant_docs)``.
        Returns ``0.0`` when there are no relevant documents or ``M <= 0``.
    """
    # With no relevant documents the ratio is undefined; report 0.0 rather
    # than raising ZeroDivisionError. A non-positive cutoff selects nothing
    # (and a negative M would otherwise slice from the end of the ranking).
    if M <= 0 or len(relevant_docs) == 0:
        return 0.0
    hits = sum(1 for doc in retrieved_docs[:M] if doc in relevant_docs)
    return hits / len(relevant_docs)

# Usage example for one particular query:
query_id = 'q01'
retrieved_docs = ['d016', 'd085', 'd259']  # sample list of retrieved documents
relevant_docs = relevance_dict[query_id]  # relevant documents according to the relevance judgments

M = 3
p_at_m = precision_at_m(retrieved_docs, relevant_docs, M)
r_at_m = recall_at_m(retrieved_docs, relevant_docs, M)

print(f"P@{M}: {p_at_m}")
print(f"R@{M}: {r_at_m}")

import numpy as np

def dcg_at_m(retrieved_docs, relevance_scores, M):
    """Discounted cumulative gain over the top-M retrieved documents.

    Uses the exponential gain ``2**rel - 1`` and a ``log2(rank + 1)`` discount
    with 1-based ranks.

    Parameters
    ----------
    retrieved_docs : sequence
        Retrieved document ids in rank order (best first).
    relevance_scores : dict
        Maps document id to its relevance grade; ids missing from the dict
        contribute a grade of 0.
    M : int
        Rank cutoff. If fewer than ``M`` documents were retrieved, only the
        available ones are scored.

    Returns
    -------
    float
        The DCG value; ``0.0`` when nothing is scored.
    """
    dcg = 0.0
    # Slice rather than index over range(M): a ranking shorter than M no
    # longer raises IndexError. max(M, 0) keeps a negative cutoff empty, as
    # range(M) did.
    for position, doc in enumerate(retrieved_docs[:max(M, 0)]):
        rel_score = relevance_scores.get(doc, 0)
        # position is 0-based, so the discount log2(rank + 1) is log2(position + 2).
        dcg += (2**rel_score - 1) / np.log2(position + 2)
    return dcg

def ndcg_at_m(retrieved_docs, relevance_scores, M):
    """Normalised DCG at cutoff M.

    Divides the DCG of the retrieved ranking by the DCG of an ideal ranking
    built from the M highest grades in ``relevance_scores``.

    Parameters
    ----------
    retrieved_docs : sequence
        Retrieved document ids in rank order (best first).
    relevance_scores : dict
        Maps document id to its relevance grade.
    M : int
        Rank cutoff.

    Returns
    -------
    float or int
        ``dcg / ideal_dcg``, or ``0`` when the ideal DCG is not positive.
    """
    achieved = dcg_at_m(retrieved_docs, relevance_scores, M)

    # Ideal ranking: the judged grades in descending order, cut at M.
    top_grades = sorted(relevance_scores.values(), reverse=True)[:M]
    ideal_dcg = 0
    for position, grade in enumerate(top_grades):
        ideal_dcg += (2**grade - 1) / np.log2(position + 2)

    if ideal_dcg > 0:
        return achieved / ideal_dcg
    return 0

P@3: 0.3333333333333333
R@3: 0.3333333333333333


In [18]:
# Query ids to evaluate, in the order they appear in the judgments table.
queries = df['query'].to_list()

In [19]:

# Parse the results file into {query id -> ranked list of document ids}.
# Each non-blank line is read as: <query id> <doc:score,doc:score,...>;
# only the part of each entry before the first ':' is kept.
gensim_results = {}
with open('results/GENSIM-consultas_resultado.txt', 'r') as f:
    for line in f:
        parts = line.split()
        # Skip blank lines (e.g. a trailing newline at the end of the file)
        # instead of failing on parts[0].
        if not parts:
            continue
        query = parts[0]
        # A query line without a ranked list yields an empty result list
        # rather than an IndexError on parts[1].
        ranked_entries = parts[1].split(',') if len(parts) > 1 else []
        docs = [doc.split(':')[0] for doc in ranked_entries]
        gensim_results[query] = docs

# Compute the metrics for every query
for query in queries:
    # NOTE(review): plain indexing raises KeyError if a query is missing from
    # gensim_results, while the judgments lookup below falls back to {} —
    # confirm which behavior is intended.
    retrieved_docs = gensim_results[query]
    relevant_docs = relevance_dict.get(query, {})
    M = len(relevant_docs)  # Total number of relevant documents for this query

    # relevant_docs is the {doc id -> grade} dict, so it serves both as the
    # membership container for P/R and as the grade lookup for NDCG.
    p_at_m = precision_at_m(retrieved_docs, relevant_docs, M)
    r_at_m = recall_at_m(retrieved_docs, relevant_docs, M)
    ndcg_at_m_score = ndcg_at_m(retrieved_docs, relevant_docs, M)

    print(f"Query: {query} -> P@M: {p_at_m}, R@M: {r_at_m}, NDCG@M: {ndcg_at_m_score}")

Query: q01 -> P@M: 0.3333333333333333, R@M: 0.3333333333333333, NDCG@M: 0.5339412468168797
Query: q02 -> P@M: 0.6363636363636364, R@M: 0.6363636363636364, NDCG@M: 0.5219698408684085
Query: q03 -> P@M: 1.0, R@M: 1.0, NDCG@M: 0.9863710356171108
Query: q04 -> P@M: 0.7142857142857143, R@M: 0.7142857142857143, NDCG@M: 0.7263161387878696
Query: q06 -> P@M: 0.6666666666666666, R@M: 0.6666666666666666, NDCG@M: 0.7138583623930603
Query: q07 -> P@M: 0.25, R@M: 0.25, NDCG@M: 0.2159355773460337
Query: q08 -> P@M: 0.6666666666666666, R@M: 0.6666666666666666, NDCG@M: 0.8203388022728647
Query: q09 -> P@M: 0.8333333333333334, R@M: 0.8333333333333334, NDCG@M: 0.927691174011215
Query: q10 -> P@M: 0.375, R@M: 0.375, NDCG@M: 0.33897841901011766
Query: q12 -> P@M: 1.0, R@M: 1.0, NDCG@M: 0.9541999451448003
Query: q13 -> P@M: 0.8, R@M: 0.8, NDCG@M: 0.8303201738476969
Query: q14 -> P@M: 0.5833333333333334, R@M: 0.5833333333333334, NDCG@M: 0.4680675085146631
Query: q16 -> P@M: 0.5, R@M: 0.5, NDCG@M: 0.49663925