## Index Data

import packages

In [5]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import bulk
import pandas as pd
from credentials import username, password

Connect to elasticsearch

In [6]:
es = Elasticsearch(
    "http://localhost:9200",
    basic_auth=(username, password)
)

### Create index with analysers

In [7]:
mappings = {
    "properties": {
        "cord_uid": {"type": "keyword"},
        "sha": {"type": "keyword"},
        "source_x": {"type": "keyword"},
        "title":{"type": "text"},
        "doi": {"type": "keyword"},
        "pmcid": {"type": "keyword"},
        "pubmed_id": {"type": "integer"},
        "license": {"type": "keyword"},
        "abstract": {"type": "text", "normalizer": "my-normalizer"},
        "publish_time": {"type": "date"},
        "authors": {"type": "text"},
        "journal": {"type": "text"},
        "mag_id": {"type": "keyword"},
        "who_covidence_id": {"type": "keyword"},
        "arxiv_id": {"type": "keyword"},
        "pdf_json_files": {"type": "keyword"},
        "pmc_json_files": {"type": "keyword"},
        "url": {"type": "keyword"},
        "s2_id": {"type": "keyword"}
    }
}

In [11]:
settings = {
    "analysis": {
        "analyzer": {
            "my-analyzer": {
                "type": "custom",
                "tokenizer": "standard",
                "filter": ["stop"]
          } 
        },
        "normalizer": {
            "my-normalizer": {
                "type": "custom",
                "char_filter": [],
                "filter": ["lowercase", "asciifolding"]
            }
        }
    }
}

In [12]:
index = "test2"
# es.indices.delete(index=index)
es.indices.create(index=index, mappings=mappings, settings=settings)

BadRequestError: BadRequestError(400, 'mapper_parsing_exception', 'unknown parameter [normalizer] on mapper [abstract] of type [text]')

### Index the data

load data from [Moodle](https://elearning.iws.th-koeln.de/moodle/mod/resource/view.php?id=62943) and place into data

In [22]:
md = pd.read_csv("data/metadata.csv")
md.head(5)

  md = pd.read_csv("data/metadata.csv")


Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,ug7v899j,d1aafb70c066a2068b02786f8929fd9c900897fb,PMC,Clinical features of culture-proven Mycoplasma...,10.1186/1471-2334-1-6,PMC35282,11472636.0,no-cc,OBJECTIVE: This retrospective chart review des...,2001-07-04,"Madani, Tariq A; Al-Ghamdi, Aisha A",BMC Infect Dis,,,,document_parses/pdf_json/d1aafb70c066a2068b027...,document_parses/pmc_json/PMC35282.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3...,
1,02tnwd4m,6b0567729c2143a66d737eb0a2f63f2dce2e5a7d,PMC,Nitric oxide: a pro-inflammatory mediator in l...,10.1186/rr14,PMC59543,11667967.0,no-cc,Inflammatory diseases of the respiratory tract...,2000-08-15,"Vliet, Albert van der; Eiserich, Jason P; Cros...",Respir Res,,,,document_parses/pdf_json/6b0567729c2143a66d737...,document_parses/pmc_json/PMC59543.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
2,ejv2xln0,06ced00a5fc04215949aa72528f2eeaae1d58927,PMC,Surfactant protein-D and pulmonary host defense,10.1186/rr19,PMC59549,11667972.0,no-cc,Surfactant protein-D (SP-D) participates in th...,2000-08-25,"Crouch, Erika C",Respir Res,,,,document_parses/pdf_json/06ced00a5fc04215949aa...,document_parses/pmc_json/PMC59549.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
3,2b73a28n,348055649b6b8cf2b9a376498df9bf41f7123605,PMC,Role of endothelin-1 in lung disease,10.1186/rr44,PMC59574,11686871.0,no-cc,Endothelin-1 (ET-1) is a 21 amino acid peptide...,2001-02-22,"Fagan, Karen A; McMurtry, Ivan F; Rodman, David M",Respir Res,,,,document_parses/pdf_json/348055649b6b8cf2b9a37...,document_parses/pmc_json/PMC59574.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,
4,9785vg6d,5f48792a5fa08bed9f56016f4981ae2ca6031b32,PMC,Gene expression in epithelial cells in respons...,10.1186/rr61,PMC59580,11686888.0,no-cc,Respiratory syncytial virus (RSV) and pneumoni...,2001-05-11,"Domachowske, Joseph B; Bonville, Cynthia A; Ro...",Respir Res,,,,document_parses/pdf_json/5f48792a5fa08bed9f560...,document_parses/pmc_json/PMC59580.xml.json,https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5...,


In [23]:
md_sorted = md.iloc[md.isnull().sum(1).sort_values(ascending=1).index]

In [24]:
md_unique = md_sorted.drop_duplicates(subset=["cord_uid"], keep="first")
md_unique.reset_index(inplace=True, drop=True)

In [25]:
md_unique

Unnamed: 0,cord_uid,sha,source_x,title,doi,pmcid,pubmed_id,license,abstract,publish_time,authors,journal,mag_id,who_covidence_id,arxiv_id,pdf_json_files,pmc_json_files,url,s2_id
0,nc8cdgqo,591d913cad0522dcf942f213523920ae786b8a0c; d772...,ArXiv; Medline; PMC; WHO,Using Early Data to Estimate the Actual Infect...,10.3390/biology9050097,PMC7284549,32397286,cc-by,The number of screening tests carried out in F...,2020-05-08,"Roques, Lionel; Klein, Etienne K; Papaïx, Juli...",Biology (Basel),,#209944,2003.1072,document_parses/pdf_json/591d913cad0522dcf942f...,document_parses/pmc_json/PMC7284549.xml.json,https://arxiv.org/pdf/2003.10720v3.pdf; https:...,218619416.0
1,5jvuxob7,6c370a406b393ff0c6c1a4c44b8886050a34c2ba,ArXiv; Elsevier; Medline; PMC; WHO,Investigating the dynamics of COVID-19 pandemi...,10.1016/j.chaos.2020.109988,PMC7284270,32536763,no-cc,"In this paper, we investigate the ongoing dyna...",2020-06-10,"Pai, Chintamani; Bhaskar, Ankush; Rawoot, Vaibhav",Chaos Solitons Fractals,,#601738,2004.13337,document_parses/pdf_json/6c370a406b393ff0c6c1a...,document_parses/pmc_json/PMC7284270.xml.json,https://www.ncbi.nlm.nih.gov/pubmed/32536763/;...,216562300.0
2,8mj8gjv9,0a4379cb820735fce01ca648ecdab069f91d284d,ArXiv; Medline; PMC; WHO,Modeling the role of respiratory droplets in C...,10.1063/5.0015984,PMC7327718,32624650,cc-by,"In this paper, we develop a first principles m...",2020-06-01,"Chaudhuri, Swetaprovo; Basu, Saptarshi; Kabi, ...",Phys Fluids (1994),,#634223,2004.10929,document_parses/pdf_json/0a4379cb820735fce01ca...,document_parses/pmc_json/PMC7327718.xml.json,https://arxiv.org/pdf/2004.10929v3.pdf; https:...,216562586.0
3,okjm7kjo,4ca24acb1b198b40e74baa74a97067562417e7fc,ArXiv; Elsevier; Medline; PMC; WHO,How to reduce epidemic peaks keeping under con...,10.1016/j.chaos.2020.109940,PMC7274126,32518474,no-cc,One of the main challenges of the measures aga...,2020-06-05,"Cadoni, Mariano",Chaos Solitons Fractals,,#594353,2004.02189,document_parses/pdf_json/4ca24acb1b198b40e74ba...,document_parses/pmc_json/PMC7274126.xml.json,https://api.elsevier.com/content/article/pii/S...,214802754.0
4,m6479wyv,fd05a8fff57b7100589390bcd74f1752cbf5ddcd; 1267...,ArXiv; Elsevier; Medline; PMC; WHO,Analysis and forecast of COVID-19 spreading in...,10.1016/j.chaos.2020.109761,PMC7156225,32308258,els-covid,Abstract In this note we analyze the temporal ...,2020-05-31,"Fanelli, Duccio; Piazza, Francesco","Chaos, Solitons & Fractals",,#100477,2003.06031,document_parses/pdf_json/fd05a8fff57b710058939...,document_parses/pmc_json/PMC7156225.xml.json,https://api.elsevier.com/content/article/pii/S...,212718235.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
191170,4iunmx2w,,WHO,Prevention of Fogging of Protective Eyewear fo...,,,,unk,,2020,,,,#601490,,,,,218538953.0
191171,t6gqa48n,,WHO,Hypothesis for potential pathogenesis of SARS-...,,,,unk,Coronavirus disease 2019 (COVID-19) is an infe...,,,,,#11362,,,,,214601393.0
191172,w6r7nje8,,WHO,Mental Well-Being of Nursing Staff During the ...,,,,unk,,2020,,,,#32418672,,,,,215801036.0
191173,4e3enu6z,,WHO,Correction: Clinicopathological characteristic...,,,,unk,,2020,,,,#598124,,,,,218872883.0


### Use function index_data to send data to elastic

In [129]:
def index_data(df, index):
    bulk_data = []
    
    for idx, doc in df.iterrows():
        doc_dict = dict(doc)
        if type(doc_dict["pubmed_id"]) != float:
            doc_dict["pubmed_id"] = None
        if type(doc_dict["arxiv_id"]) != float:
            doc_dict["arxiv_id"] = None
        for key in doc_dict.keys():
            if pd.isna(doc_dict[key]):
                doc_dict[key] = None
    
        data = {
            "_index": index,
            "_id": idx,
            "_source": doc_dict
        }
        
        bulk_data.append(data)
    bulk(es, bulk_data)

In [151]:
start_at_entry = 0
index_data(md_unique.iloc[start_at_entry:], "test")

In [136]:
es.count(index="test")

ObjectApiResponse({'count': 191175, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}})

In [155]:
es.indices.analyze(analyzer="standard", text="covid 19")

ObjectApiResponse({'tokens': [{'token': 'covid', 'start_offset': 0, 'end_offset': 5, 'type': '<ALPHANUM>', 'position': 0}, {'token': '19', 'start_offset': 6, 'end_offset': 8, 'type': '<NUM>', 'position': 1}]})