## Indexing and Search

In [2]:
import time
import json as json
import re
import sys
import os
from collections import defaultdict
from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search, Q

### Connect to Elasticsearch

In [3]:
# Read the connection settings from a local JSON file so that no host name
# or credentials are hardcoded in the notebook.
login_path = "data/login.json"
with open(login_path, "r") as fin:
    cc = json.load(fin)

In [4]:
# Build the Elasticsearch client from the settings loaded into `cc`:
# a single remote host reached over HTTPS on port 443 with basic auth.
es_hosts = [cc["remote_host"]]
es_credentials = (cc["user"], cc["password"])
es = Elasticsearch(es_hosts, http_auth=es_credentials, scheme="https", port=443)

In [6]:
# Optional connectivity check: uncomment to query the server through the client.
#es.info()

### Initialize the Elasticsearch Index

In [135]:
# Name and sizing of the index used throughout this notebook.
INDEX_NAME = "covid19"
NUMBER_SHARDS = 5
NUMBER_REPLICAS = 0

# Index-level settings passed to the create-index call.
index_settings = {
    "number_of_shards": NUMBER_SHARDS,
    "number_of_replicas": NUMBER_REPLICAS,
}

# Field mappings: `id` is declared as a `keyword` field, while `title` and
# `abstract` are declared as `text` fields.
field_mappings = {
    "id": {"type": "keyword"},
    "title": {"type": "text"},
    "abstract": {"type": "text"},
}

# Full request body for index creation.
request_body = {
    "settings": index_settings,
    "mappings": {"properties": field_mappings},
}

In [136]:
# Start from a clean slate: drop any existing index with this name, then
# recreate it with the settings and mappings in `request_body`.
# This must go through `es`, the client created above and the same one the
# bulk-indexing cell writes to; no other client is defined in this notebook.
if es.indices.exists(index=INDEX_NAME):
    res = es.indices.delete(index=INDEX_NAME)
    print("Deleting index %s , Response: %s" % (INDEX_NAME, res))

res = es.indices.create(index=INDEX_NAME, body=request_body)
print("Create index %s , Response: %s" % (INDEX_NAME, res))

Deleting index covid19 , Response: {'acknowledged': True}
Create index covid19 , Response: {'acknowledged': True, 'shards_acknowledged': True, 'index': 'covid19'}


------

In [137]:
# Load the documents to index; later cells read `id`, `title` and `abstract`
# from each entry of DATA.
data_path = "data/DATA.json"
with open(data_path) as fdata:
    DATA = json.load(fdata)

In [138]:
# Sanity check: number of loaded documents and a look at the first record.
(len(DATA), DATA[0])

(13202,
 {'abstract': 'A 5-year-old male castrated Lhasa Apso cross was evaluated for a 1-month history of inappetence, lethargy, gagging, and progressive right thoracic limb lameness. Synovial fluid analysis revealed nonseptic suppurative inflammation, and a diagnosis of immune-mediated polyarthritis (IMPA) was made. After 3 months of treatment with prednisone and later cyclosporine, the dog developed multiple firm cutaneous and subcutaneous masses and a focal mass within the jejunum. Cultures of blood, urine, skin lesions, and the jejunal mass identified Nocardia veterana by matrix-absorption laser desorption ionization-time-of-flight mass spectrometry (MALDI-TOF MS) and allowed for earlier identification of the organism compared to more traditional secA1 gene sequencing. Immunosuppressive drug treatment was discontinued, and the dog was treated for 3 months by administration of trimethoprim-sulfamethoxazole (TMS). No recurrence of clinical signs was reported 1 year later. This case 

### Populating the Index

In [139]:
INDEX_NAME = "covid19"  # must match the index created earlier in the notebook
logFilePath = "log.txt"
ic = 0  # number of full-size bulk requests sent
ir = 0  # number of trailing (partial) bulk requests sent: 0 or 1
failed_docs = 0  # documents reported as failed across all bulk requests


def _send_bulk(actions, doc_count, started_at, log):
    """Send one bulk request, log it, and report how it went.

    The bulk response is inspected because per-document failures are
    reported in the response body (``errors`` flag plus an ``error`` entry
    on each failed item) and would otherwise go unnoticed.

    Parameters
    ----------
    actions : list
        Alternating action / document dicts, as built by the loop below.
    doc_count : int
        Number of documents processed so far; used only in the log line.
    started_at : float
        ``time.time()`` value captured when indexing began.
    log : file object
        Open, writable log file.

    Returns
    -------
    tuple
        ``(elapsed_seconds, failed_item_count)`` where the elapsed time is
        measured after the request has completed.
    """
    response = es.bulk(index=INDEX_NAME, body=actions, request_timeout=300)
    elapsed = time.time() - started_at
    log.write("bulk indexing... %s, escaped time %s (seconds) \n"
              % (doc_count, elapsed))

    failures = []
    if response.get("errors"):
        # Each item is a single-key dict ({"index": {...}}); keep the ones
        # that carry an "error" entry.
        failures = [
            result
            for entry in response.get("items", [])
            for result in entry.values()
            if "error" in result
        ]
    for result in failures:
        log.write("bulk item failed: id %s, status %s, error %s \n"
                  % (result.get("_id"), result.get("status"), result.get("error")))
    return elapsed, len(failures)


with open(logFilePath, "w") as fout:
    start = time.time()
    bulk_size = 50  # number of documents sent in each bulk request
    bulk_data = []  # pending action/document pairs for the next bulk request

    cnt = 0  # stays 0 when DATA is empty
    for cnt, item in enumerate(DATA, start=1):  # each item is a single document
        data_dict = {
            "id": item["id"],
            "title": item["title"],
            "abstract": item["abstract"],
        }

        # The action line tells Elasticsearch where to put the document that
        # follows it; using the document id as `_id` makes re-runs overwrite
        # rather than duplicate.
        op_dict = {
            "index": {
                "_index": INDEX_NAME,
                "_id": data_dict["id"],
            }
        }
        bulk_data.append(op_dict)
        bulk_data.append(data_dict)

        # Flush once a full batch has been collected.
        if cnt % bulk_size == 0:
            ic += 1
            elapsed, n_failed = _send_bulk(bulk_data, cnt, start, fout)
            failed_docs += n_failed

            if ic % 1000 == 0:
                print(" i bulk indexing... %s, escaped time %s (seconds) "
                      % (cnt, elapsed))

            bulk_data = []

    # Index the documents left over after the last full batch.
    if bulk_data:
        ir += 1
        elapsed, n_failed = _send_bulk(bulk_data, cnt, start, fout)
        failed_docs += n_failed
        bulk_data = []

    end = time.time()
    fout.write("Finish  meta-data indexing. Total escaped time %s (seconds) \n"
               % (end - start))
    print("Finish meta-data indexing. Total escaped time %s (seconds) "
          % (end - start))

    # Surface rejected documents instead of letting them disappear silently.
    if failed_docs:
        fout.write("%s document(s) failed to index \n" % failed_docs)
        print("WARNING: %s document(s) failed to index; see %s for details"
              % (failed_docs, logFilePath))

Finish meta-data indexing. Total escaped time 11.975986242294312 (seconds) 


### Search of Virus Proteins