# Helper functions. Run first:

In [6]:
%%capture

%pip install python-terrier
%pip install pandas
%pip install numpy

import os
import shutil

import numpy as np
import pandas as pd
import pyterrier as pt

# Initialise PyTerrier only if it has not been started yet, so that re-running
# this cell does not call pt.init() a second time.
if not pt.started():
    pt.init()

# Helper function to initialize multiple indices.
# Prepares the index path and avoids errors with already existing indices.
index_count = 0


def prepare_index_path(indexName):
    """Return an absolute, unoccupied path for a new index directory.

    Every call increments the module-level ``index_count`` and derives the
    path ``indices/<indexName><index_count>`` (relative to the current working
    directory). Anything already present at that path is deleted first:

    * a directory is removed together with everything inside it, including
      nested sub-directories;
    * any other entry (regular file, symlink, ...) is unlinked.

    The directory itself is NOT created here; only the path is prepared.

    Args:
        indexName: Name prefix of the index directory.

    Returns:
        The absolute form of the prepared index path.

    Raises:
        OSError: If an existing entry at the path cannot be removed.
    """
    global index_count
    index_count += 1
    index_path = 'indices/' + indexName + str(index_count)

    if os.path.isdir(index_path) and not os.path.islink(index_path):
        # rmtree also handles nested directories, which a flat
        # listdir/remove/rmdir sequence cannot.
        shutil.rmtree(index_path)
    elif os.path.lexists(index_path):
        # Plain file, symlink (even a dangling one) or other non-directory entry.
        os.remove(index_path)

    return os.path.abspath(index_path)


def build_index(indexName, dataset):
    """Index the corpus of ``dataset`` into a freshly prepared index path.

    The target path comes from ``prepare_index_path(indexName)``. The indexer
    is created with ``overwrite=True`` and ``blocks=True``, and its
    ``stopwords.filename`` property is pointed at ``en.txt`` resolved against
    the current working directory.

    Args:
        indexName: Name prefix forwarded to ``prepare_index_path``.
        dataset: Object providing ``get_corpus_iter()``; the documents it
            yields are indexed on the ``title``, ``doi`` and ``abstract``
            fields with ``docno`` kept as metadata.

    Returns:
        Whatever ``IterDictIndexer.index`` returns (callers in this notebook
        hand it to ``pt.IndexFactory.of``).
    """
    target_dir = prepare_index_path(indexName)
    stopword_file = os.path.abspath("en.txt")

    iter_indexer = pt.IterDictIndexer(target_dir, overwrite=True, blocks=True)
    iter_indexer.setProperty("stopwords.filename", stopword_file)

    return iter_indexer.index(
        dataset.get_corpus_iter(),
        fields=['title', 'doi', 'abstract'],
        meta=('docno',),
    )

# Question 5

In [8]:
# Fetch the 'irds:cord19/trec-covid' dataset, index its corpus with the
# build_index helper (index name prefix 'q5testindex'), then open the returned
# index reference via pt.IndexFactory.of so later cells can query the index.
dataset = pt.datasets.get_dataset('irds:cord19/trec-covid')
trec_covid_index_ref = build_index('q5testindex', dataset)
trec_covid_index = pt.IndexFactory.of(trec_covid_index_ref)

  index_created = indexer.index(dataset.get_corpus_iter(),
cord19/trec-covid documents: 100%|██████████| 192509/192509 [01:38<00:00, 1958.16it/s]


In [11]:
# Example queries: five single-term queries, chosen to be as unrelated to each
# other as possible, each paired with an integer qid.
# The frame is built directly from a list of [qid, query] rows.
example_tokens_array = [[0, 'picture'],
                        [1, 'virus'],
                        [2, 'plant'],
                        [3, 'jesus'],
                        [4, 'covid']]
example_tokens_df = pd.DataFrame(example_tokens_array, columns=["qid", "query"])
# Pipeline: BM25 retrieval -> Bo1 query expansion -> BM25 retrieval again,
# all over the same trec_covid_index.
bm25 = pt.BatchRetrieve(trec_covid_index, wmodel='BM25')
bo1 = pt.rewrite.Bo1QueryExpansion(trec_covid_index)
pipelineQE = bm25 >> bo1 >> bm25

# Run every example query through the pipeline, then sort all returned rows
# (across every qid) by descending score before displaying them.
res = pipelineQE.transform(example_tokens_df)
sorted_results = res.sort_values(by="score", ascending=False)
display(sorted_results)

# TODO: What does this result tell me? What are the most relevant terms?

Unnamed: 0,qid,docid,docno,rank,score,query
2683,3,22445,6g48atpz,0,23.533972,jesus
2684,3,119530,lc1n8ne0,1,19.332264,jesus
2685,3,183686,0cresfn8,2,19.033026,jesus
2686,3,13948,up7dcv9u,3,17.587281,jesus
2687,3,169420,dt0jku7v,4,16.268702,jesus
...,...,...,...,...,...,...
3684,4,111616,gsrbt037,995,1.271724,covid
3685,4,111617,kxwdansc,996,1.271724,covid
3686,4,121375,lf92u94q,997,1.271724,covid
3687,4,121376,80mm4gtq,998,1.271724,covid


Are those the top terms?
How do I find the top terms?

| Rank | Score | Query_0 |
|------|-------|---------|
| 0    | 53.87 | jesus   |
| 0    | 16.89 | plant   |
| 0    | 15.95 | picture |
| 0    | 7.29  | virus   |
| 0    | 2.4   | covid   |

(Full table in ./test.csv)