# Setup Code (Run first)


In [1]:
from gensim.models import Word2Vec
from tqdm import tqdm
from pyterrier.measures import *
from ir_utils import *
import requests
import os
import pandas as pd
import numpy as np
import nltk
import time
import string

In [2]:
# Obtain the PyTerrier handle from the project helper, then load the
# TREC-COVID (CORD-19) dataset through the ir_datasets integration.
pt = get_pyterrier_instance()
irds_trec_covid_dataset = pt.datasets.get_dataset('irds:cord19/trec-covid')
# Materialise the corpus iterator into a DataFrame so later cells can
# look documents up repeatedly.
irds_trec_covid_dataset_docs = pd.DataFrame(irds_trec_covid_dataset.get_corpus_iter())
print('Got dataset and docs...')
# Metadata CSV as cached on disk by ir_datasets (2020-07-16 snapshot path).
irds_trec_covid_dataset_metadata = pd.read_csv('~/.ir_datasets/cord19/2020-07-16/metadata.csv', low_memory=False)
print('Creating index...')
# Alternative: build the index from scratch instead of reusing one on disk.
#irds_trec_covid_dataset_index = build_index('qetest', irds_trec_covid_dataset)
# Raw string: the Windows path contains backslash sequences (\B, \D, \S, \p,
# \i, \q, \d) that are invalid escapes in a normal literal and trigger a
# DeprecationWarning/SyntaxWarning. The resulting path value is unchanged.
# NOTE(review): this absolute path is machine-specific; point it at your own
# index location before running.
irds_trec_covid_dataset_index = pt.IndexRef.of(r"F:\Bibliotheken\Desktop\Skripte\packgaabwir2022\indices\qetest1\data.properties")

# Title-only topics and the relevance judgements used by the experiments.
irds_trec_covid_dataset_topics_titles = irds_trec_covid_dataset.get_topics('title')
irds_trec_covid_dataset_topics_qrels = irds_trec_covid_dataset.get_qrels()
print('Got topics and qrels...')

PyTerrier 0.9.2 has loaded Terrier 5.7 (built by craigm on 2022-11-10 18:30) and terrier-helper 0.0.7

cord19/trec-covid documents: 100%|██████████| 192509/192509 [00:01<00:00, 136531.17it/s]


Got dataset and docs...
Creating index...
Got topics and qrels...


In [3]:
# Preprocess the corpus documents with the project helper.
print('Starting  preprocessing...')
irds_trec_covid_dataset_en_docs_preprocessed = preprocess_data(
    irds_trec_covid_dataset_docs
)

# Two embedding models: one obtained from the preprocessed corpus
# ("initial") and one obtained without any corpus argument ("global").
print('Loading models ...')
initial_word2vec_model = get_initial_word2vec_model(
    irds_trec_covid_dataset_en_docs_preprocessed
)
global_word2vec_model = get_global_word2vec_model()

# BM25 retrieval over the title topics; the trailing 1 is the helper's k
# argument (presumably "top-k per query" -- confirm in ir_utils).
top_k_docs = get_top_k_docs_bm25(
    irds_trec_covid_dataset_index,
    irds_trec_covid_dataset_topics_titles,
    1,
)
# 2.1.1 from procedure
top_k_docs_title_tokens_df = get_query_qid_df_tokens_from_docs_by_field(
    irds_trec_covid_dataset_docs,
    top_k_docs,
)  # [qid, query]
print("Got top", len(top_k_docs), "docs...")

Starting  preprocessing...
Loading preprocessed data...
Loading models ...
Loading word2vec model...
Downloading word2vec model...
Got top 3 docs...


In [4]:
print('Getting refence titles from sm and pubmed...')
# 2.1.2 from procedure
top_k_docs_references_titles = get_all_titles_from_references_of_docs_from_sm_and_pubmed(
    irds_trec_covid_dataset_metadata,
    irds_trec_covid_dataset_docs,
    top_k_docs,
)

print('Getting relevant papers from authors...')
# 2.1.3 from procedure
top_k_docs_all_authors_relevant_papers_abstracts = get_field_for_all_relevant_authors(
    irds_trec_covid_dataset_metadata,
    top_k_docs,
    2,
    field="paperAbstract",
)

# Combine the top-k docs, their reference titles and the author abstracts
# into a single token frame via the project helper.
print('Tokens from top k docs...')
top_k_docs_tokens_df = get_query_qid_df_from_all_preprocessed_tokens_of_docs(
    irds_trec_covid_dataset_metadata,
    top_k_docs,
    top_k_docs_references_titles,
    top_k_docs_all_authors_relevant_papers_abstracts,
    field="abstract",
)

print(top_k_docs_tokens_df)

print('Ranking tokens...')
# 2.1.4 from procedure
top_k_docs_top_k_tokens_ranked = get_top_k_ranked_tokens_from_dataframe_with_bm25_and_bo1(
    irds_trec_covid_dataset_index,
    top_k_docs_tokens_df,
    2,
)

Getting refence titles from sm and pubmed...
172 titles found

Getting relevant papers from authors...
Searching current author: Lin, Jie
Query: coronavirus super spreaders
Searching current author: Yan, Kun
Query: coronavirus super spreaders
Searching current author: Zhang, Jingfeng
Query: coronavirus super spreaders
Searching current author: Cai, Ting
Query: coronavirus super spreaders
Searching current author: Zheng, Jianjun
Query: coronavirus super spreaders
Searching current author: Jing, Jane Lee Jia
Query: what alcohol sanitizer kills coronavirus
Searching current author: Pei Yi, Thong
Query: what alcohol sanitizer kills coronavirus
Searching current author: Bose, Rajendran J C
Query: what alcohol sanitizer kills coronavirus
Searching current author: McCarthy, Jason R
Query: what alcohol sanitizer kills coronavirus
Searching current author: Tharmalingam, Nagendran
Query: what alcohol sanitizer kills coronavirus
Searching current author: Madheswaran, Thiagarajan
Query: what alcoh

In [5]:
# Build the sentence input for word2vec from the same three sources used for
# the token frame (top-k docs, reference titles, author abstracts).
print('Preparing tokens for word2vec...')
top_k_docs_all_tokenized_sentences = get_tokenized_sentences_for_word2vec(
    irds_trec_covid_dataset_metadata,
    top_k_docs,
    top_k_docs_references_titles,
    top_k_docs_all_authors_relevant_papers_abstracts,
)

# Continue from the initial model using the sentences gathered above.
print('Retraining word2vec...')
retrained_word2vec_model = retrain_word2vec_model(
    initial_word2vec_model,
    top_k_docs_all_tokenized_sentences,
)

top_k_docs_tokens_array = query_qid_df_to_array(top_k_docs_tokens_df)
print(len(top_k_docs_tokens_array))

# Expand the tokens with the retrained model, passing a similarity threshold
# of 0.8 and a cap of 5 expansion words to the helper.
print('Expanding search terms...')
top_k_docs_relevant_tokens_expanded_df = expand_search_terms(
    top_k_docs_tokens_array,
    retrained_word2vec_model,
    threshold=0.8,
    max_expansion_word_count=5,
)

print('Ranking expanded tokens...')
# 2.2.1 from procedure
top_k_docs_top_k_tokens_expanded_ranked = get_top_k_ranked_tokens_from_dataframe_with_bm25_and_bo1(
    irds_trec_covid_dataset_index,
    top_k_docs_relevant_tokens_expanded_df,
    2,
)

Preparing tokens for word2vec...
preparing abstracts...
preparing abstracts done.
all_terms_per_sentences done.
top_k_docs_references_titles done.
all_authors_relevant_papers_abstracts done.
Retraining word2vec...
1662
Expanding search terms...
Ranking expanded tokens...


In [162]:
# 2.2.2 from procedure
print('Merging tokens...')
# Concatenate the originally ranked tokens with the ranked expansion tokens,
# re-rank the merged list, then attach the result to the title topics.
top_k_docs_tokens_final = (
    top_k_docs_top_k_tokens_ranked.tolist()
    + top_k_docs_top_k_tokens_expanded_ranked.tolist()
)
top_k_docs_tokens_final_ranked = get_top_k_ranked_tokens_from_dataframe_with_bm25_and_bo1(
    irds_trec_covid_dataset_index,
    get_qid_query_df_from_list(top_k_docs_tokens_final),
    2,
)
final_expanded_title_queries = expand_queries_with_array_of_tokens(
    irds_trec_covid_dataset_topics_titles,
    top_k_docs_tokens_final_ranked,
)

print('Expanding queries...')
# Baseline expansions straight from each embedding model's most_similar.
# The two local models are called with threshold 0.3, the global model
# with threshold 0.8.
local_expanded_queries = expand_queries_with_model(
    irds_trec_covid_dataset_topics_titles,
    initial_word2vec_model.wv.most_similar,
    threshold=0.3,
    max_expansion_word_count=5,
)
local_retrained_expanded_queries = expand_queries_with_model(
    irds_trec_covid_dataset_topics_titles,
    retrained_word2vec_model.wv.most_similar,
    threshold=0.3,
    max_expansion_word_count=5,
)
global_expanded_queries = expand_queries_with_model(
    irds_trec_covid_dataset_topics_titles,
    global_word2vec_model.most_similar,
    threshold=0.8,
    max_expansion_word_count=5,
)

Merging tokens...
Expanding queries...


In [167]:
print('Running experiments...')
# Experiments
# Each entry is (label printed before the table, query set to evaluate,
# CSV file the result table is written to). All runs share the same index
# and qrels, so the five previously copy-pasted stanzas are driven from
# this table. Labels and file names are unchanged; only the first run's
# file name carries the "_experiment" suffix.
experiment_runs = [
    ("title_queries", irds_trec_covid_dataset_topics_titles, 'title_queries_experiment.csv'),
    ("final_expanded_title_queries", final_expanded_title_queries, 'final_expanded_title_queries.csv'),
    ("local_expanded_queries", local_expanded_queries, 'local_expanded_queries.csv'),
    ("local_retrained_expanded_queries", local_retrained_expanded_queries, 'local_retrained_expanded_queries.csv'),
    ("global_expanded_queries", global_expanded_queries, 'global_expanded_queries.csv'),
]

for run_label, run_queries, run_csv_path in experiment_runs:
    # Same order as before: evaluate, print the label, persist, print table.
    experiment = run_experiment(
        irds_trec_covid_dataset_index,
        run_queries,
        irds_trec_covid_dataset_topics_qrels,
    )
    print(run_label)
    experiment.to_csv(run_csv_path)
    print(experiment)

Running experiments...
title_queries
                  name    P@5   P@10   nDCG@10      nDCG  RR(rel=2)       map
0               TF_IDF  0.688  0.680  0.610112  0.411849   0.688159  0.211321
1                 BM25  0.692  0.674  0.600675  0.409234   0.660762  0.206401
2                 InL2  0.660  0.634  0.571136  0.411746   0.661016  0.210294
3  bm25 >> bo1 >> bm25  0.700  0.662  0.594535  0.425644   0.660942  0.215591
final_expanded_title_queries
                  name    P@5   P@10   nDCG@10      nDCG  RR(rel=2)       map
0               TF_IDF  0.472  0.484  0.380664  0.364996   0.380071  0.174685
1                 BM25  0.420  0.430  0.342929  0.360201   0.356497  0.169918
2                 InL2  0.448  0.444  0.348149  0.362049   0.373465  0.170820
3  bm25 >> bo1 >> bm25  0.452  0.478  0.395623  0.386447   0.454776  0.190425
local_expanded_queries
                  name    P@5   P@10   nDCG@10      nDCG  RR(rel=2)       map
0               TF_IDF  0.460  0.472  0.400135  0.345