# Helper functions. Run first:

In [None]:
%%capture

%pip install python-terrier
%pip install pandas
%pip install numpy

import pyterrier as pt
import pandas as pd
import numpy as np

if not pt.started():
    pt.init()

# Helper function to intialize multiple indices
# Prepares the index path and avoid errors with already existing indices
index_count = 0
def prepare_index_path(indexName):
    global index_count
    index_count = index_count + 1
    index_path = 'indices/' + indexName + str(index_count)

    if os.path.exists(index_path) & os.path.isdir(index_path):
        files = os.listdir(index_path)
        for file in files:
            file_name = index_path + '/' + file
            os.remove(file_name)
        os.rmdir(index_path)
    elif os.path.exists(index_path) & (not os.path.isdir(index_path)):
        os.rmove(index_path)

    return os.path.abspath(index_path)


def build_index(indexName, dataset):
    index_path = prepare_index_path(indexName)
    indexer = pt.IterDictIndexer(
        index_path, overwrite=True, blocks=True)
    indexer.setProperty(
        "stopwords.filename", os.path.abspath("en.txt"))
    index_created = indexer.index(dataset.get_corpus_iter(),
                                  fields=['title', 'doi', 'abstract'],
                                  meta=('docno',))
    return index_created

# Question 4

In [None]:
#  Get Dataset and build index
dataset = pt.datasets.get_dataset('irds:cord19/trec-covid')
trec_covid_index_ref = build_index('testindex', dataset)
trec_covid_index = pt.IndexFactory.of(trec_covid_index_ref)

# Get top-3 documents
title_queries = dataset.get_topics('title')
bm25 = pt.BatchRetrieve(trec_covid_index, wmodel='BM25', num_results=10)
res = bm25.transform(title_queries)
sorted_results = res.sort_values(by="score", ascending=False)
top_3_docs = sorted_results[:3]

In [None]:
# Get all titles of top-3 documents and add them to an array
docs = pd.DataFrame(dataset.get_corpus_iter())
tokens = []
for index, row in top_3_docs.iterrows():
    row = docs.loc[docs["docno"] == row.docno]
    #row_tokens = regex_tokenizer.tokenize(row["title"].item())
    #tokens = row_tokens + tokens
    row_titles = row["title"].item()
    tokens.append(row_titles)

In [10]:
# Put the array into a pd.dataframe for bo1-QE with column 'qid' and 'query' 
tokens_df = pd.DataFrame(tokens, columns=["query"])
tokens_df['qid'] = np.arange(1, len(tokens_df)+1) # start qid index at 1 because in title_queries its the same
tokens_df = tokens_df[['qid', 'query']]
tokens_df['qid'] = tokens_df['qid'].astype(object) # had to convert because it crashed before

  qid                                              query
0   1  A super-spreader of COVID-19 in Ningbo city in...
1   2  Significance of super spreader events in COVID-19
2   3  Significance of super spreader events in COVID-19
   qid                                          query
0    1                             coronavirus origin
1    2        coronavirus response to weather changes
2    3                           coronavirus immunity
3    4         how do people die from the coronavirus
4    5                      animal models of covid 19
5    6                 coronavirus test rapid testing
6    7              serological tests for coronavirus
7    8                    coronavirus under reporting
8    9                          coronavirus in canada
9   10           coronavirus social distancing impact
10  11                 coronavirus hospital rationing
11  12                         coronavirus quarantine
12  13                    how does coronavirus spread
13  14          

In [11]:
# Retrievel mit bm25 >> bo1 >> bm25 Pipeline and title_queries
# This should work
bm25 = pt.BatchRetrieve(trec_covid_index, wmodel='BM25')
bo1 = pt.rewrite.Bo1QueryExpansion(trec_covid_index)
pipelineQE = bm25 >> bo1 >> bm25

title_queries = dataset.get_topics('title')
res = pipelineQE.transform(title_queries)
sorted_results = res.sort_values(by="score", ascending=False)
print(sorted_results)

      qid   docid     docno  rank      score  \
1001   10  119890  km4qijqj     1  44.506971   
1000   10  119889  po2c65nb     0  44.506971   
1002   10  189615  o9ii9fj3     2  44.230909   
31000  38  111061  09lw7d2p     0  41.133142   
31001  38  126416  zvzfgdkt     1  41.133142   
...    ..     ...       ...   ...        ...   
36995  42    1197  vjxmrmfc   995   2.226237   
36996  42  163873  gabcaqfu   996   2.225578   
36997  42  124173  m7hcyyc3   997   2.225474   
36998  42   76050  4uxwojzo   998   2.225423   
36999  42   76544  4bnje5un   999   2.221218   

                                    query_0  \
1001   coronavirus social distancing impact   
1000   coronavirus social distancing impact   
1002   coronavirus social distancing impact   
31000           covid inflammatory response   
31001           covid inflammatory response   
...                                     ...   
36995                vitamin d and covid 19   
36996                vitamin d and covid 19   


In [12]:
# Retrievel mit bm25 >> bo1 >> bm25 Pipeline and title_queries
# This wont work
bm25 = pt.BatchRetrieve(trec_covid_index, wmodel='BM25')
bo1 = pt.rewrite.Bo1QueryExpansion(trec_covid_index)
pipelineQE = bm25 >> bo1 >> bm25

res = pipelineQE.transform(tokens_df)
sorted_results = res.sort_values(by="score", ascending=False)
print(sorted_results)

Empty DataFrame
Columns: [docid, docno, rank, score, query_0, qid, query]
Index: []
