In [None]:
#!pip install unidecode tqdm pandas

In [None]:
# First parse the data

In [None]:
def read_csv(path_header = r'CORD-19-research-challenge/2020-03-13/'):
    import pandas as pd
    path_to_csv = path_header + r"all_sources_metadata_2020-03-13.csv"
    csv_df = pd.read_csv(path_to_csv)
    return csv_df
    

In [None]:
csv_df = read_csv()

In [None]:
def read_jsons(path_header = r'CORD-19-research-challenge/2020-03-13/'):
    import os, json
    import pandas as pd

    # this finds our json files
    path_to_json = path_header + r'biorxiv_medrxiv/biorxiv_medrxiv'
    path_to_json_2 = path_header + r"comm_use_subset/comm_use_subset"
    path_to_json_3 = path_header + r"noncomm_use_subset/noncomm_use_subset"
    path_to_json_4 = path_header + r"pmc_custom_license/pmc_custom_license"


    list_of_jsons= [path_to_json, path_to_json_2, path_to_json_3,path_to_json_4]

    json_files = []

    for i in list_of_jsons:
        json_files.extend([os.path.join(i,pos_json) for pos_json in os.listdir(i) if pos_json.endswith('.json')])



    #json_files = [pos_json for pos_json in os.listdir(path_to_json) if pos_json.endswith('.json')]


    # here I define my pandas Dataframe with the columns I want to get from the json
    jsons_data = pd.DataFrame(columns=['id', "title", "paper_abstract","paper_body"])

    # we need both the json and an index number so use enumerate()

    for index, js in enumerate(json_files):
        with open(js) as json_file:
            json_text = json.load(json_file)
            #print(json_text)
            # here you need to know the layout of your json and each json has to have
            # the same structure (obviously not the structure I have here)
            title = json_text['metadata']['title']

            paper_id = json_text["paper_id"]

            #reduces the list only if there is content
            if json_text["abstract"] != []:
                paper_abstract = (json_text["abstract"])
            else:
                paper_abstract = (json_text["abstract"])

            #paper_abstract = (json_text["abstract"])
            paper_body = (json_text["body_text"])
            #print(title)
            # here I push a list of data into a pandas DataFrame at row given by 'index'
            jsons_data.loc[index] = [paper_id, title, paper_abstract, paper_body]

    return jsons_data


In [None]:
jsons = read_jsons()

In [None]:
jsons

In [None]:
# join the frames together
combined_data = csv_df.merge(jsons, left_on='sha',right_on='id',how='left')

In [None]:
combined_data.to_csv("combined_data.csv")

In [None]:
#used this to buffer the extraction
import pandas as pd
combined_data = pd.read_csv("combined_data.csv")

In [None]:
combined_data

In [None]:
# make a fulltest column
# for this use the title_x (if Not NaN), title_x, all fields from the abstract, and all fields from the paper body together

def parse_json_section(text):
    import json
    if isinstance(text, float) :
        return ""
    else :
        sep = r" "
        whole_text_as_string = ""
        for section in text:
            #print(section['text'])
            whole_text_as_string += section['text'] + sep
        return whole_text_as_string
    

def make_catchall_field(row):
    sep = r" "
    title_x = row['title_x']
    title_y = row['title_y']
    abstract = row['abstract']
    abstract_fields = row['paper_abstract']
    body_fields = row['paper_body']
    abstract_text = parse_json_section(abstract_fields)
    body_text = parse_json_section(body_fields)
    
    catch_all = str(title_x) + sep + str(title_y) + sep + str(abstract) + sep + str(abstract_text) + sep + str(body_text)
    return catch_all



In [None]:
from tqdm import tqdm
tqdm.pandas()
combined_data['catchall'] = combined_data.progress_apply(lambda row: make_catchall_field(row), axis = 1)

In [None]:
combined_data.to_csv("combined_data_with_catchall.csv")

In [1]:
#used this to buffer the extraction
import pandas as pd
combined_data = pd.read_csv("combined_data_with_catchall.csv")

In [2]:
# Then we need to annotate that data

In [3]:
from unidecode import unidecode

def remove_non_ascii(text):
    return str(text).encode("ascii", errors="ignore").decode()

def analyze_text(text):
    import requests 
    # do some cleansing of this text    
    # defining the api-endpoint 
    API_ENDPOINT = 'http://nlu:8080/factextraction/analyze'
    headers = {"accept": "application/json", "content-type": "application/json"}
    # data to be sent to api
    #print("Text sent to ambiverse is: ", remove_non_ascii(text))
    data = r'{"docId": "doc2", "text":"' + remove_non_ascii(text) + r'", "extractConcepts": "true", "language": "en" }'
    # sending post request and saving response as response object 
    r = requests.post(url = API_ENDPOINT, data = data, headers = headers) 
    #print("returned text is: ", r.text)
    return r.text


In [4]:
# debugging only 
#from tqdm import tqdm
#import swifter
#jsons['out'] = jsons['title'].swifter.progress_bar(True).set_npartitions(npartitions=4).apply(analyze_text, axis=1)
#jsons['title_annotated'] = jsons.swifter.allow_dask_on_strings().progress_bar(True).set_npartitions(npartitions=8).apply(lambda row: analyze_text(row['title']), axis = 1)

#tqdm.pandas()
#jsons['title_annotated'] = jsons[0:12].apply(lambda row: analyze_text(row['title']), axis = 1) 

In [5]:
# adapted from here: https://www.kaggle.com/mlwhiz/parallelization-kernel
from multiprocessing import  Pool
import numpy as np
import pandas as pd


def parallelized_apply(df, func, numProcs=4):
    df_split = np.array_split(df, numProcs)
    pool = Pool(numProcs) 
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df
 

In [6]:
from tqdm import tqdm
tqdm.pandas()

def runAnnotate(df):
    # add here also the functions for all other columns
    df['catchall_annotated'] =  df.progress_apply(lambda row: analyze_text(row['catchall']), axis = 1)
    return df

# In theory every document annotation runs in a seperate thread, they all go to a single db (this db could handle a lot of parallel annotation threads)
# Every annotation thread should have 8 GB of memory, so running 10 annotation threads in parallel would need 80 GB of memory. 
# Be careful with the memory monitor (e.g. htop), the db just shows the in memory indices as cached pages (yellor/orange bars in htop), 
# if you consume more memory elsewhere, then these pages are stored back to disk and you loose performance.

# data_annotated = parallelized_apply(combined_data[0:399], runAnnotate, numProcs = 10) 

  from pandas import Panel


In [8]:
# do it first for the abstracts only
abstracts_only = combined_data[combined_data.has_full_text != True]
abstracts_annotated = parallelized_apply(abstracts_only, runAnnotate, numProcs = 12) 
abstracts_annotated.to_csv('abstracts_annotated.csv', index=True)

100%|██████████| 1356/1356 [2:45:57<00:00,  7.34s/it]  
100%|██████████| 1356/1356 [3:15:54<00:00,  8.67s/it]t]
100%|██████████| 1356/1356 [3:30:07<00:00,  9.30s/it]  
100%|██████████| 1357/1357 [3:39:45<00:00,  9.72s/it]t]
100%|██████████| 1357/1357 [3:41:04<00:00,  9.78s/it]t]
100%|██████████| 1357/1357 [3:48:04<00:00, 10.08s/it]  
100%|██████████| 1357/1357 [3:56:31<00:00, 10.46s/it]  
100%|██████████| 1357/1357 [3:57:40<00:00, 10.51s/it]
100%|██████████| 1357/1357 [4:13:19<00:00, 11.20s/it]  
100%|██████████| 1357/1357 [4:20:42<00:00, 11.53s/it]
100%|██████████| 1357/1357 [4:26:33<00:00, 11.79s/it]
100%|██████████| 1357/1357 [4:27:19<00:00, 11.82s/it]


In [9]:
abstracts_annotated.to_csv('abstracts_annotated.csv', index=True)
abstracts_annotated

Unnamed: 0.1,Unnamed: 0,sha,source_x,title_x,doi,pmcid,pubmed_id,license,abstract,publish_time,...,journal,Microsoft Academic Paper ID,WHO #Covidence,has_full_text,id,title_y,paper_abstract,paper_body,catchall,catchall_annotated
4,4,92c2c9839304b4f2bc1276d41b1aa885d8b364fd,CZI,Imaging changes in severe COVID-19 pneumonia,10.1007/s00134-020-05976-w,,32125453.0,cc-by-nc,,2020,...,Intensive Care Med,3.006643e+09,#3242,False,,,,,Imaging changes in severe COVID-19 pneumonia n...,"{""docId"":""doc2"",""language"":""en"",""matches"":[{""c..."
7,7,d13a685f861b0f1ba05afa6e005311ad1820fd3a,CZI,RETRACTED: Chinese medical staff request inter...,10.1016/s2214-109x(20)30065-6,,32105614.0,cc-by,,2020,...,The Lancet. Global health,2.627046e+09,#5386,False,,,,,RETRACTED: Chinese medical staff request inter...,"{""docId"":""doc2"",""language"":""en"",""matches"":[{""c..."
27,27,3c42431c5b95e707486ce7441ac139071e07b706,CZI,RETRACTION: Retraction-Chinese medical staff r...,10.1016/s2214-109x(20)30076-0,,32113504.0,cc-by,,2020,...,The Lancet. Global health,2.794879e+09,#5486,False,,,,,RETRACTION: Retraction-Chinese medical staff r...,"{""docId"":""doc2"",""language"":""en"",""matches"":[{""c..."
29,29,6fc52ed878c271020a2a375bb6e4b12943a7666c,CZI,Effectiveness for the Response to COVID-19: Th...,10.24171/j.phrp.2020.11.1.01,,,cc-by-nc-nd,In the current issue of Osong Public Health an...,2020,...,Osong Public Health and Research Perspectives,3.005888e+09,#2662,False,,,,,Effectiveness for the Response to COVID-19: Th...,"{""docId"":""doc2"",""language"":""en"",""matches"":[{""c..."
38,38,8819d114fe2dd2335a0545d53fea17deb6aa3943,CZI,Technical guidance for laboratory testing of 2...,10.1016/j.bsheal.2020.02.001,,,cc-by-nc-nd,,2020,...,Biosafety and Health,2.992772e+09,#351,False,,,,,Technical guidance for laboratory testing of 2...,"{""docId"":""doc2"",""language"":""en"",""matches"":[{""c..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29470,29470,083dd400b7854ffa908a21641197855aee981ea5,medrxiv,,doi.org/10.1101/2020.03.05.20032003,,,See https://www.medrxiv.org/submit-a-manuscript,,,...,,,,False,,,,,nan nan nan,"{""docId"":""doc2"",""language"":""en"",""matches"":[{""c..."
29476,29476,088d5c4a4f212be9590b6ce3d70a8e01847e76cf,medrxiv,,doi.org/10.1101/2020.03.06.20031880,,,See https://www.medrxiv.org/submit-a-manuscript,,,...,,,,False,,,,,nan nan nan,"{""docId"":""doc2"",""language"":""en"",""matches"":[{""c..."
29479,29479,bd08e2cbbc561e52823aedd0180f844e0c6cd2a6,medrxiv,,doi.org/10.1101/2020.03.06.20032177,,,See https://www.medrxiv.org/submit-a-manuscript,,,...,,,,False,,,,,nan nan nan,"{""docId"":""doc2"",""language"":""en"",""matches"":[{""c..."
29482,29482,3ba79e5d013c7c3cccd1449ccd16b93c61a4c576,medrxiv,,doi.org/10.1101/2020.03.06.20032425,,,See https://www.medrxiv.org/submit-a-manuscript,,,...,,,,False,,,,,nan nan nan,"{""docId"":""doc2"",""language"":""en"",""matches"":[{""c..."


In [None]:
# then for the fulltexts
full_texts_only = combined_data[combined_data.has_full_text == True]
full_texts_annotated = parallelized_apply(full_texts_only, runAnnotate, numProcs = 8)
full_texts_annotated.to_csv('full_texts_annotated.csv', index=True)

  3%|▎         | 50/1652 [1:54:14<85:59:49, 193.25s/it] 

In [None]:
full_texts_annotated.to_csv('full_texts_annotated.csv', index=True)
full_texts_annotated

In [None]:
data_annotated = pd.concat(abstracts_annotated, full_texts_annotated)

In [None]:
# save it to disk temporarly
data_annotated.to_csv('data_annotated.csv', index=True)

In [None]:
#print(jsons_annotated['title_annotated'].iloc[0:1][0])

In [None]:
# And ask the questions on it

In [None]:
#print(jsons_annotated.iloc[13000:13001].to_string())

In [None]:
# give me the articles talking about "behaviour" : https://www.wikidata.org/wiki/Q9332 in their headline:
behaviour_in_title = jsons_annotated[jsons_annotated['title_annotated'].str.contains("Q9332")]
print(behaviour_in_title[['id', 'title']].to_string())

In [None]:
# give me the articles talking about "social distancing" : https://www.wikidata.org/wiki/Q30314010 in their headline:
social_distancing_in_title = jsons_annotated[jsons_annotated['title_annotated'].str.contains("Q30314010")]
print(social_distancing_in_title[['id', 'title']].to_string())

In [None]:
# give me the articles talking about "Wuhan" : https://www.wikidata.org/wiki/Q11746 in their headline:
wuhan_in_title = jsons_annotated[jsons_annotated['title_annotated'].str.contains("Q11746")]
print(wuhan_in_title[['id', 'title']].to_string())