# Helper functions. Run first:

In [2]:
%%capture

%pip install python-terrier
%pip install pandas
%pip install numpy

import os
import shutil

import numpy as np
import pandas as pd
import pyterrier as pt

# Call pt.init() only when pt.started() reports that PyTerrier is not running
# yet; the guard skips a second initialisation when this cell is re-executed.
if not pt.started():
    pt.init()

# Helper for initializing multiple indices.
# Prepares a fresh index path and avoids errors caused by already existing indices.
index_count = 0  # number of index paths handed out so far in this kernel session


def prepare_index_path(indexName):
    """Return an absolute, unused path for a new index under ``indices/``.

    Every call increments the module-level ``index_count`` and appends it to
    ``indexName``, so repeated calls with the same name yield distinct paths.
    Whatever already exists at the chosen path (a directory tree, a plain
    file, or a symlink) is deleted first.

    Args:
        indexName: Prefix of the index directory name.

    Returns:
        The absolute form of ``indices/<indexName><index_count>``, resolved
        against the current working directory. Nothing exists at that path
        when the function returns.
    """
    global index_count
    index_count += 1
    index_path = os.path.join('indices', indexName + str(index_count))

    if os.path.isdir(index_path) and not os.path.islink(index_path):
        # rmtree also copes with nested sub-directories inside an old index.
        shutil.rmtree(index_path)
    elif os.path.lexists(index_path):
        # A plain file or a (possibly dangling) symlink is in the way.
        os.remove(index_path)

    return os.path.abspath(index_path)


def build_index(indexName, dataset):
    """Index the corpus of ``dataset`` into a fresh on-disk index.

    Args:
        indexName: Name prefix handed to ``prepare_index_path``.
        dataset: Object exposing ``get_corpus_iter()``; the documents it
            yields are indexed on the ``title``, ``doi`` and ``abstract``
            fields, with ``docno`` kept as metadata.

    Returns:
        Whatever ``IterDictIndexer.index`` returns for the finished index.
    """
    target_dir = prepare_index_path(indexName)

    iter_indexer = pt.IterDictIndexer(target_dir, overwrite=True, blocks=True)
    # Point the indexer at the stopword list en.txt, resolved against the
    # current working directory.
    stopword_file = os.path.abspath("en.txt")
    iter_indexer.setProperty("stopwords.filename", stopword_file)

    # NOTE(review): the recorded notebook output shows a warning raised at
    # this call - presumably about passing fields/meta here; confirm against
    # the installed PyTerrier version.
    return iter_indexer.index(
        dataset.get_corpus_iter(),
        fields=['title', 'doi', 'abstract'],
        meta=('docno',),
    )

# Question 5

In [4]:
# Fetch the 'irds:cord19/trec-covid' dataset and index its corpus.
# In the recorded run, indexing 192,509 documents took just under two minutes.
dataset = pt.datasets.get_dataset('irds:cord19/trec-covid')
trec_covid_index_ref = build_index('q5testindex', dataset)
# Open the index from the reference returned by build_index.
trec_covid_index = pt.IndexFactory.of(trec_covid_index_ref)

  index_created = indexer.index(dataset.get_corpus_iter(),
cord19/trec-covid documents: 100%|██████████| 192509/192509 [01:51<00:00, 1721.31it/s]


In [6]:
# Example queries: one single-term query per row, given as [qid, query].
# The list deliberately includes some of the most unrelated terms I could think of.
# Building the frame from a list of rows with explicit column names works reliably.
example_tokens_array = [[0, 'picture'],
                        [1, 'virus'],
                        [2, 'plant'],
                        [3, 'jesus'],
                        [4, 'covid'],
                        [5, 'patient']]
example_tokens_df = pd.DataFrame(
    example_tokens_array, columns=["qid", "query"])
# Retrieval models over the TREC-COVID index.
# NOTE(review): bm25 is defined here but not used anywhere else in this cell.
tf_idf = pt.BatchRetrieve(trec_covid_index, wmodel="TF_IDF")
bm25 = pt.BatchRetrieve(trec_covid_index, wmodel='BM25')
bo1 = pt.rewrite.Bo1QueryExpansion(trec_covid_index)
# Pipeline: TF_IDF retrieval, then Bo1 query rewriting, then TF_IDF retrieval again.
pipelineQE = tf_idf >> bo1 >> tf_idf

res = pipelineQE.transform(example_tokens_df)
# Sort by score, highest first. The sort runs over all rows at once, so the
# rows of different queries end up interleaved.
sorted_results = res.sort_values(by="score", ascending=False)
display(sorted_results)
# Persist the sorted table.
# NOTE(review): the markdown notes refer to ./test.csv, but this writes
# test2.csv - confirm which file is meant.
sorted_results.to_csv('test2.csv')
# Open question: what does this result tell me? What are the most relevant terms?


Unnamed: 0,qid,docid,docno,rank,score,query_0,query
3000,3,119530,lc1n8ne0,0,28.152275,jesus,applypipeline:off jesu^1.955487492 korea^0.789...
3001,3,183686,0cresfn8,1,27.758642,jesus,applypipeline:off jesu^1.955487492 korea^0.789...
3002,3,22445,6g48atpz,2,12.936580,jesus,applypipeline:off jesu^1.955487492 korea^0.789...
3004,3,117644,1046cs2b,4,10.627188,jesus,applypipeline:off jesu^1.955487492 korea^0.789...
3003,3,117643,pt2hmfzw,3,10.627188,jesus,applypipeline:off jesu^1.955487492 korea^0.789...
...,...,...,...,...,...,...,...
995,0,951,r7y1cqou,995,0.253875,picture,applypipeline:off pictur^1.623535424 patient^0...
996,0,170200,ektqb4z5,996,0.253843,picture,applypipeline:off pictur^1.623535424 patient^0...
997,0,50475,hrbzgclc,997,0.253831,picture,applypipeline:off pictur^1.623535424 patient^0...
998,0,28439,v3cnggf5,998,0.253830,picture,applypipeline:off pictur^1.623535424 patient^0...


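**Open questions:** Are these the top terms? How do I find the top terms for each query?

(Hint: the `query` column of the output above holds the Bo1-expanded query as `term^weight` pairs, e.g. `jesu^1.955487492 korea^0.789...` for the original query `jesus`.)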

| Rank | Score | Query_0 |
|------|-------|---------|
| 0    | 53.87 | jesus   |
| 0    | 16.89 | plant   |
| 0    | 15.95 | picture |
| 0    | 7.29  | virus   |
| 0    | 2.4   | covid   |

(Full table in ./test.csv)