In [7]:
import json
import os 
import elasticsearch
import nltk
import string
from elasticsearch import helpers
from elasticsearch import Elasticsearch
from nltk.corpus import stopwords

Import Json file to elasticsearch

In [3]:
path = "/Users/shengjieliu/Desktop/bia667/project/pdf_json/"
files = os.listdir(path)
files = files[:5000] # Just for a demo, due to large number (120000) corpus

In [4]:
corpus = []
for idx, file in enumerate(files):
    with open(path + file) as f:
        data = json.load(f)
    if (idx % 1000) == 0:
        print(f"read into {idx} files")
    body = data['body_text']
    combine_body = ""
    for para in body:
        combine_body += para['text']
    data['body_text'] = combine_body
    append_dic = {}
    append_dic['metadata'] = data['metadata']['title']
    append_dic['body_text'] = data['body_text']
    append_dic["_index"] = "covid"
    corpus.append(append_dic)

read into 0 files
read into 1000 files
read into 2000 files
read into 3000 files
read into 4000 files


In [5]:
corpus[0] # sample data 

{'metadata': 'Clinical and epidemiological characteristics of pediatric SARS-CoV-2 infections in China: A multicenter case series',
 'body_text': "a1111111111 a1111111111 a1111111111 a1111111111 a1111111111 available data; 78%). In addition, significant increases in the levels of lactate dehydrogenase and α-hydroxybutyrate dehydrogenase were detected in 28 patients (among 34 patients with available data; 82%) and 25 patients (among 34 patients with available data; 74%), respectively. Patchy lesions in lobules were detected by chest computed tomographic scans in 28 patients (82%). Ground-glass opacities, which were a typical feature in adults, were rare in pediatric patients (3%). Rapid radiologic progression and a late-onset pattern of lesions in the lobules were also noticed. Lesions in lobules still existed in 24 (among 32 patients with lesions; 75%) patients that were discharged, although the main symptoms disappeared a few days after treatment. All patients were discharged, and the

In [8]:
# create elasticsearch 
es = Elasticsearch(HOST = "http://localhost", PORT = 9200)

In [9]:
es.indices.delete("covid")

{'acknowledged': True}

In [10]:
# import json files into elasticsearch
res = helpers.bulk(es, corpus)

Preprocess query

In [11]:
stopwords = list(set(stopwords.words('english')))

In [12]:
def preprocess_query(query):
    query = query.lower()
    # remove punctuation
    pro_query = query.translate(str.maketrans('', '', string.punctuation))
    # tokenize query
    tokens = nltk.word_tokenize(pro_query)
    # filter stop words
    return list(set([token for token in tokens 
                if len(token) > 1
               and not token in stopwords 
               and not token.isnumeric()]))

In [13]:
# example query
query = "What do we know about COVID-19 risk factors ?"
pre = preprocess_query(query)

Search Documents through elasticsearch

In [14]:
def create_search(query, return_number = 1):
    
    # create empty body text
    body = {
        "from":0,
        "size": return_number,
        "query" : {"bool" : {"must" : [
    
]}            
                  }
       
       }
    # add requirment
    for word in query:
        
        Cap_word = word[0].upper() + word[1:]
        total_words = word + " " + Cap_word + " " + word.upper()
        body["query"]["bool"]["must"].append({"match": {"body_text": total_words}})
    
    return body

In [15]:
# example body

body = create_search(pre)
body

{'from': 0,
 'size': 1,
 'query': {'bool': {'must': [{'match': {'body_text': 'covid19 Covid19 COVID19'}},
    {'match': {'body_text': 'know Know KNOW'}},
    {'match': {'body_text': 'risk Risk RISK'}},
    {'match': {'body_text': 'factors Factors FACTORS'}}]}}}

In [16]:
# get search results (only return 1 result for the demo)

results = es.search(index = "covid", body = body)
results = results["hits"]["hits"][0]["_source"]

In [19]:
# exports results to json files

with open('data.json', 'w') as f:
    json.dump(results, f)