## Search queries from Topics

### imports

In [6]:
import pandas as pd
from elasticsearch import Elasticsearch
from credentials import username, password

### Read information need from xlm file

In [7]:
xml_filepath = "data/topics-rnd5_covid-complete.xml"
topics = pd.read_xml(xml_filepath)

In [8]:
# set index to number
topics.set_index("number", inplace=True)

In [9]:
topics.head(10)

Unnamed: 0_level_0,query,question,narrative
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,coronavirus origin,what is the origin of COVID-19,seeking range of information about the SARS-Co...
2,coronavirus response to weather changes,how does the coronavirus respond to changes in...,seeking range of information about the SARS-Co...
3,coronavirus immunity,will SARS-CoV2 infected people develop immunit...,seeking studies of immunity developed due to i...
4,how do people die from the coronavirus,what causes death from Covid-19?,Studies looking at mechanisms of death from Co...
5,animal models of COVID-19,what drugs have been active against SARS-CoV o...,Papers that describe the results of testing d...
6,coronavirus test rapid testing,what types of rapid testing for Covid-19 have ...,Looking for studies identifying ways to diagno...
7,serological tests for coronavirus,are there serological tests that detect antibo...,Looking for assays that measure immune respons...
8,coronavirus under reporting,how has lack of testing availability led to un...,Looking for studies answering questions of imp...
9,coronavirus in Canada,how has COVID-19 affected Canada,"seeking data related to infections (confirm, s..."
10,coronavirus social distancing impact,has social distancing had an impact on slowing...,seeking specific information on studies that h...


### Get elasticsearch instance

In [10]:
es = Elasticsearch("http://localhost:9200",
                   basic_auth=(username, password))

In [11]:
from datetime import datetime

In [15]:
def get_q(search_text: str):
    q = {
        "function_score": {
        "query": {
            "multi_match": {
                    "query": f"{search_text}",
                    "type": "cross_fields",
                    "fields": [
                        "source_x",
                        "title^3",
                        "abstract^2",
                        "journal",
                        "authors"
                ]},
            "functions": [
                {
                    "gauss": {
                    "publish_time": {
                        "origin": datetime.strftime(datetime.now(), "%Y-%m-%d"),
                        "scale": "100Y",
                        "offset": "3Y",
                        "decay": 0.2
                    }
                }}]
            }
        }}
        

    return q

run = "no_operator" + datetime.strftime(datetime.now(), "%d.%m-%H:%M")
with open(f"data/runs/{run}.txt", "w") as f:
    for idx, doc in topics.iterrows():
        q = get_q(doc["query"])
        result = es.search(index="test", body=q, size=1000)
        hits = result["hits"]["hits"]

        for rank, hit in enumerate(hits):
            f.write(" ".join([str(idx), str(0), hit["_source"]["cord_uid"], str(rank), str(hit["_score"]), run, "\n"]))

  result = es.search(index="test", body=q, size=1000)


TypeError: Elasticsearch.search() got an unexpected keyword argument 'function_score'

### Trec eval

If trec_eval is in /usr/local/bin

In [49]:
!trec_eval data/qrels-covid_d5_j0.5-5_covid-complete.txt data/runs/no_operator26.11-13\:06.txt

runid                 	all	no_operator26.11-13:06
num_q                 	all	50
num_ret               	all	50000
num_rel               	all	26664
num_rel_ret           	all	9832
map                   	all	0.1865
gm_map                	all	0.1000
Rprec                 	all	0.2802
bpref                 	all	0.3220
recip_rank            	all	0.7837
iprec_at_recall_0.00  	all	0.8473
iprec_at_recall_0.10  	all	0.4703
iprec_at_recall_0.20  	all	0.3778
iprec_at_recall_0.30  	all	0.2758
iprec_at_recall_0.40  	all	0.2050
iprec_at_recall_0.50  	all	0.1455
iprec_at_recall_0.60  	all	0.0716
iprec_at_recall_0.70  	all	0.0193
iprec_at_recall_0.80  	all	0.0071
iprec_at_recall_0.90  	all	0.0000
iprec_at_recall_1.00  	all	0.0000
P_5                   	all	0.6160
P_10                  	all	0.6240
P_15                  	all	0.6067
P_20                  	all	0.5780
P_30                  	all	0.5587
P_100                 	all	0.4646
P_200                 	all	0.3929
P_500                 	all	0.2814
P_1000

In [50]:
!trec_eval data/qrels-covid_d5_j0.5-5_covid-complete.txt data/runs/baseline.txt

runid                 	all	baseline
num_q                 	all	50
num_ret               	all	50000
num_rel               	all	26664
num_rel_ret           	all	9420
map                   	all	0.1645
gm_map                	all	0.0846
Rprec                 	all	0.2688
bpref                 	all	0.3101
recip_rank            	all	0.6215
iprec_at_recall_0.00  	all	0.7414
iprec_at_recall_0.10  	all	0.4357
iprec_at_recall_0.20  	all	0.3546
iprec_at_recall_0.30  	all	0.2389
iprec_at_recall_0.40  	all	0.1804
iprec_at_recall_0.50  	all	0.1102
iprec_at_recall_0.60  	all	0.0598
iprec_at_recall_0.70  	all	0.0173
iprec_at_recall_0.80  	all	0.0062
iprec_at_recall_0.90  	all	0.0000
iprec_at_recall_1.00  	all	0.0000
P_5                   	all	0.5120
P_10                  	all	0.5100
P_15                  	all	0.4640
P_20                  	all	0.4520
P_30                  	all	0.4453
P_100                 	all	0.4200
P_200                 	all	0.3678
P_500                 	all	0.2677
P_1000              