In [86]:
from tira.third_party_integrations import ensure_pyterrier_is_loaded, persist_and_normalize_run, ir_datasets
from tira.rest_api_client import Client

# This method ensures that that PyTerrier is loaded so that it also works in the TIRA sandbox
ensure_pyterrier_is_loaded()
import pyterrier as pt

tira = Client()

In [2]:
dataset_id = 'longeval-long-september-20230513-training'

In [25]:
def similar_documents(dataset_ids):
    import gzip
    import json
    ret = []
    file_name = tira.get_run_output('ir-benchmarks/ows/isobaric-warbler', dataset_id) + '/corpus-graph-over-time.jsonl.gz'
    with gzip.open(file_name, 'rt') as f:
        for l in f:
            l = json.loads(l)
            if not dataset_ids or l['dataset'] in dataset_ids:
                ret.append(l)
    return ret

In [20]:
def top_2_multiplicator(h):
    if h['rank'] == 0:
        return 2
    elif h['rank'] == 1:
        return 1
    else:
        return 0

In [23]:
assert 2 == top_2_multiplicator({ 'rank': 0 })
assert 1 == top_2_multiplicator({ 'rank': 1 })
assert 0 == top_2_multiplicator({ 'rank': 4 })

In [30]:
def document_texts_in_reverted_index(similar_documents,multiplicator):
    ret = {}

    for i in similar_documents:
        q = i['query']
        if i['relevance'] < 1:
            continue

        for hit in i['top_bm25_results']:
            m = multiplicator(hit)
            additional_text = ' '.join([q]*m)
            if hit['docno'] not in ret:
                ret[hit['docno']] = ''
            
            ret[hit['docno']] += ' ' + additional_text
    
    return [{'docno': k, 'text': v.strip()} for k,v in ret.items() if len(v.strip()) > 0]


In [113]:
sim_docs = similar_documents((
    'longeval-train-20230513-training', 'longeval-heldout-20230513-training', 'longeval-short-july-20230513-training'
))
docs = document_texts_in_reverted_index(sim_docs, top_2_multiplicator)

In [114]:
len(docs)

6829

In [115]:
iter_indexer = pt.IterDictIndexer("./index", meta={'docno': 50, 'text': 4096}, overwrite=True)
index = iter_indexer.index(docs)

16:43:11.343 [ForkJoinPool-7-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 11 empty documents


In [116]:
bm25 = pt.BatchRetrieve(index, wmodel="BM25", verbose=True)

In [117]:
overlapping_queries = set([i['query'].lower().strip() for i in sim_docs])
queries = {i.query_id:i.text for i in ir_datasets.load(f'ir-benchmarks/{dataset_id}').queries_iter()}
overlapping_queries = set([k for k,v in queries.items() if v.lower().strip() in overlapping_queries])
del queries
print(len(overlapping_queries))


372


In [118]:
print('Create run')
pt_dataset = pt.get_dataset(f'irds:ir-benchmarks/{dataset_id}')
run = bm25(pt_dataset.get_topics("title"))
persist_and_normalize_run(run, 'reverted-index')
print('Done, run was created')


Create run


BR(BM25): 100%|██████████| 923/923 [00:14<00:00, 65.39q/s] 


I use the environment variable "TIRA_OUTPUT_DIR" to determine where I should store the run file using "." as default.
Done. run file is stored under "./run.txt".
Done, run was created


In [119]:
topics = pt_dataset.get_topics("title")
topics = topics[topics['qid'].isin(overlapping_queries)]
topics

Unnamed: 0,qid,query
0,q092223595,car shelter
4,q092244,airport
9,q0922131,free antivirus
10,q092232659,colloidal silver
11,q0922171,silverwood
...,...,...
908,q092236192,second world war
911,q092236371,download video facebook
913,q092236385,farmland
917,q092236647,video projector


In [120]:
# This assumes we have execited the retrieve-with-pyterrier.ipynb notebook before to create the run.txt file
bm25 = pt.io.read_results('run.txt')

pt.Experiment(
    [
        tira.pt.from_submission('ir-benchmarks/tira-ir-starter/BM25 Re-Rank (tira-ir-starter-pyterrier)', dataset_id),
        tira.pt.from_submission('ir-benchmarks/fschlatt/sparse-cross-encoder-4-512', dataset_id),
        bm25, 
    ],
    topics,
    pt_dataset.get_qrels(),
    ["ndcg_cut.10", "recip_rank", "recall_1000"],
    names=["BM25", "Sparse Cross Encoder", "BM25 (reverted index)"]
)

Unnamed: 0,name,ndcg_cut.10,recip_rank,recall_1000
0,BM25,0.180857,0.269993,0.516569
1,Sparse Cross Encoder,0.216468,0.322682,0.516569
2,BM25 (reverted index),0.376305,0.534087,0.640151


In [61]:
# This assumes we have execited the retrieve-with-pyterrier.ipynb notebook before to create the run.txt file
bm25 = pt.io.read_results('run.txt')

pt.Experiment(
    [bm25],
    pt_dataset.get_topics('title'),
    pt_dataset.get_qrels(),
    ["ndcg_cut.10", "recip_rank", "recall_1000"],
    names=["BM25"]
)

Unnamed: 0,name,ndcg_cut.10,recip_rank,recall_1000
0,BM25,0.820021,0.900293,0.952645
