In [2]:
import pyterrier as pt
from tqdm import tqdm
if not pt.started():
    pt.init()

from tira.rest_api_client import Client
from wows_eval import evaluate as wows_evaluate
import pandas as pd
from jnius import autoclass
import numpy as np

# For measuring consumed resources (e.g., GPU, CPU, RAM, etc.)
from tirex_tracker import tracking, ExportFormat

# Do not truncate long cell values when pandas displays a frame
# (None removes the column-width limit).
pd.set_option('display.max_colwidth', None)

# Dataset that is loaded here and passed to the evaluation call further down.
# The commented-out line is an alternative dataset id; enable exactly one.
DATASET_ID = 'wows-eval/pairwise-smoke-test-20250210-training'
#DATASET_ID = 'wows-eval/pairwise-20250309-test'

# TIRA client used to fetch the inputs of DATASET_ID.
tira = Client()
# Later cells read the 'id', 'query', 'relevant' and 'unknown' columns of this
# frame and iterate over it row by row.
input_data = tira.pd.inputs(DATASET_ID)

In [3]:
class QueryByRelevantDocument:
    """Score unknown documents by issuing every known-relevant document as a query.

    For each relevant document the tokenised text is sent to the retrieval
    system; the rank at which each unknown document is returned is recorded and
    then min-max normalised to [0, 1].

    Results are collected in ``self.results`` with the layout
    ``{query: {relevant document text: {unknown document text: score}}}``.
    """

    def __init__(self):
        self.results = {}

    @staticmethod
    def _min_max_normalise(scores):
        """Linearly rescale the values of ``scores`` to the interval [0, 1].

        Degenerate inputs are handled explicitly instead of raising:
        an empty mapping yields an empty mapping, and a mapping whose values
        are all equal (e.g. a single entry) yields 0.0 for every key.
        """
        if not scores:
            return {}
        lowest = min(scores.values())
        spread = max(scores.values()) - lowest
        if spread == 0:
            # (v - min) / (max - min) would divide by zero here.
            return {key: 0.0 for key in scores}
        return {key: (value - lowest) / spread for key, value in scores.items()}

    def process(self, query, retrieval_system, rel, unk):
        """Compute and store the scores for one query.

        Args:
            query: Identifier under which the result is stored in ``self.results``.
            retrieval_system: Object whose ``search(text)`` returns a frame with
                ``docno`` and ``rank`` columns, ordered by strictly increasing rank.
            rel: Mapping ``docno -> text`` of the known-relevant documents.
            unk: Mapping ``docno -> text`` of the unknown documents.

        Raises:
            ValueError: If ``query`` was already processed by this instance.
        """
        if query in self.results:
            raise ValueError('This query was already processed: ' + query)

        tokeniser = autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()
        per_relevant_doc = {}
        for doc in rel.values():
            # Re-join the tokens so that the document text is a plain bag of terms.
            doc_text = " ".join(tokeniser.getTokens(doc))
            run = retrieval_system.search(doc_text)

            last_rank = -1
            ranks = {}
            for _, hit in run.iterrows():
                # The run must be sorted by strictly increasing rank.
                assert last_rank < hit['rank']
                last_rank = hit['rank']
                if hit['docno'] in unk:
                    # Keyed by the unknown document's text, not its docno.
                    ranks[unk[hit['docno']]] = hit['rank']

            # NOTE(review): the stored value is the rank itself, so after
            # normalisation the best-ranked unknown document gets 0.0 and the
            # worst-ranked gets 1.0 -- confirm this orientation is intended.
            per_relevant_doc[doc] = self._min_max_normalise(ranks)

        self.results[query] = per_relevant_doc


class QueryByUnknownDocument:
    """Score unknown documents by issuing each of them as a query.

    For every unknown document the tokenised text is sent to the retrieval
    system and a DCG with binary gains is accumulated over the known-relevant
    documents found among the top ``_DCG_CUTOFF`` results. The DCG values of
    all unknown documents are then min-max normalised to [0, 1].

    Results are collected in ``self.results`` with the layout
    ``{query: {relevant document text: {unknown document text: score}}}``;
    every relevant document of a query maps to the same score dictionary.
    """

    # Number of top-ranked results (ranks 0 .. _DCG_CUTOFF - 1) that contribute to the DCG.
    _DCG_CUTOFF = 20

    def __init__(self):
        self.results = {}

    @staticmethod
    def _min_max_normalise(scores):
        """Linearly rescale the values of ``scores`` to the interval [0, 1].

        Degenerate inputs are handled explicitly instead of raising:
        an empty mapping yields an empty mapping, and a mapping whose values
        are all equal yields 0.0 for every key.
        """
        if not scores:
            return {}
        lowest = min(scores.values())
        spread = max(scores.values()) - lowest
        if spread == 0:
            # (v - min) / (max - min) would divide by zero here.
            return {key: 0.0 for key in scores}
        return {key: (value - lowest) / spread for key, value in scores.items()}

    def process(self, query, retrieval_system, rel, unk):
        """Compute and store the scores for one query.

        Args:
            query: Identifier under which the result is stored in ``self.results``.
            retrieval_system: Object whose ``search(text)`` returns a frame with
                ``docno`` and ``rank`` columns, ordered by strictly increasing
                rank. Ranks are expected to start at 0.
            rel: Mapping ``docno -> text`` of the known-relevant documents.
            unk: Mapping ``docno -> text`` of the unknown documents.

        Raises:
            ValueError: If ``query`` was already processed by this instance.
        """
        if query in self.results:
            raise ValueError('This query was already processed: ' + query)

        tokeniser = autoclass("org.terrier.indexing.tokenisation.Tokeniser").getTokeniser()
        dcg_by_doc = {}
        for doc in unk.values():
            # Re-join the tokens so that the document text is a plain bag of terms.
            doc_text = " ".join(tokeniser.getTokens(doc))
            run = retrieval_system.search(doc_text)

            last_rank = -1
            dcg = 0.0
            for _, hit in run.iterrows():
                # The run must be sorted by strictly increasing rank.
                assert last_rank < hit['rank']
                last_rank = hit['rank']
                if hit['rank'] >= self._DCG_CUTOFF:
                    break
                if hit['docno'] in rel:
                    # Formula follows
                    # https://github.com/joaopalotti/trectools/blob/master/trectools/trec_eval.py#L499C28-L499C56
                    # which uses 1-based ranks. Ranks here start at 0, so the
                    # discount is log2(rank + 2); log2(rank + 1) would divide
                    # by zero for a relevant hit at the first position.
                    dcg += 1. / np.log2(hit['rank'] + 2)

            dcg_by_doc[doc] = dcg

        normalised = self._min_max_normalise(dcg_by_doc)
        # Every relevant document shares the same per-unknown-document scores.
        self.results[query] = {relevant_text: normalised for relevant_text in rel.values()}


In [4]:
system_name = 'query-by-relevant-doc'
#system_name = 'query-by-unknown-doc'

!rm -Rf tmp
with tracking(export_file_path='tmp/.metadata.yml', export_format=ExportFormat.IR_METADATA) as tracked_experiment:
    queries = set(input_data['query'].unique())

    def known_relevant_documents(query):
        docs = set(input_data[input_data['query'] == query]['relevant'].unique())
        return {f'{i[0]}-rel': i[1] for i in zip(range(len(docs)), docs)}

    def unknown_documents(query):
        docs = set(input_data[input_data['query'] == query]['unknown'].unique())
        return {f'{i[0]}-unkn': i[1] for i in zip(range(len(docs)), docs)}

    if system_name == 'query-by-relevant-doc':
        processor = QueryByRelevantDocument()
    elif system_name == 'query-by-unknown-doc':
        processor = QueryByUnknownDocument()
    else:
        raise ValueError('foo')

    for query in tqdm(queries):
        rel = known_relevant_documents(query)
        unk = unknown_documents(query)

        docs = [{'docno': k, 'text': v} for k, v in rel.items()]+[{'docno': k, 'text': v} for k, v in unk.items()]
        indexer = pt.IterDictIndexer("/tmp/index", overwrite=True, meta={'docno': 100, 'text': 20480})
        index_ref = indexer.index(docs)
        bm25 = pt.BatchRetrieve(index_ref, wmodel='BM25')
        processor.process(query, bm25, rel, unk)

    predictions = []
    for _, i in input_data.iterrows():
        res = processor.results[i['query']]
        res = res[i['relevant']]
        predictions.append({
            'id': i['id'],
            'probability_relevant': res.get(i['unknown'], -1)
        })
    predictions = pd.DataFrame(predictions)


PCM Info: setrlimit for file limit 1000000 failed with error Operation not permitted

=====  Processor information  =====
Linux arch_perfmon flag  : yes
Hybrid processor         : yes
IBRS and IBPB supported  : yes
STIBP supported          : yes
Spec arch caps supported : yes
Max CPUID level          : 32
CPU model number         : 154
ERROR: Can not open /sys/module/msr/parameters/allow_writes file.
PCM Error: can't open MSR handle for core 0 (No such file or directory)
Try no-MSR mode by setting env variable PCM_NO_MSR=1
Can not access CPUs Model Specific Registers (MSRs).
execute 'modprobe msr' as root user, then execute pcm as root user.
100%|██████████| 2/2 [00:00<00:00,  2.34it/s]


In [6]:
# Evaluate the predictions frame (columns 'id' and 'probability_relevant')
# against DATASET_ID. The tracked experiment from the previous cell is passed
# along as the environment. The call is the last expression of the cell, so its
# return value is displayed as the cell output.
wows_evaluate(
    predictions,
    DATASET_ID,
    environment=tracked_experiment,
    system_name=system_name,
    # NOTE(review): presumably uploads the run when enabled -- confirm
    # against the wows_eval documentation before switching it on.
    #upload=True,
)

Unnamed: 0,system,tau_ap,kendall,spearman,pearson
0,query-by-relevant-doc,0.526667,0.485714,0.621429,0.621429
