In [9]:
!pip install unidecode tqdm pandas rdflib SPARQLWrapper Unidecode



In [None]:
# First parse the data

In [None]:
def read_csv(path_header = r'CORD-19-research-challenge/2020-03-13/'):
    """Read the CORD-19 metadata CSV located directly below ``path_header``.

    Args:
        path_header: Directory prefix (including the trailing separator) that
            is prepended verbatim to the metadata file name.

    Returns:
        pandas.DataFrame with the contents of
        ``all_sources_metadata_2020-03-13.csv``.
    """
    from pandas import read_csv as _load_csv

    return _load_csv(path_header + r"all_sources_metadata_2020-03-13.csv")
    

In [None]:
# Load the metadata table using read_csv's default dataset path.
csv_df = read_csv()

In [None]:
def read_jsons(path_header = r'CORD-19-research-challenge/2020-03-13/'):
    """Load every CORD-19 full-text JSON file below ``path_header`` into one frame.

    The four subset directories (biorxiv_medrxiv, comm_use_subset,
    noncomm_use_subset, pmc_custom_license) are scanned for ``*.json`` files.

    Args:
        path_header: Directory prefix (including the trailing separator) that
            is prepended verbatim to each subset directory name.

    Returns:
        pandas.DataFrame with one row per JSON file and the columns
        ``id`` (the file's ``paper_id``), ``title`` (``metadata.title``),
        ``paper_abstract`` (the raw ``abstract`` section list, ``[]`` when the
        key is absent) and ``paper_body`` (the raw ``body_text`` section list).

    Raises:
        FileNotFoundError: if one of the subset directories does not exist.
        KeyError: if a file lacks ``metadata.title``, ``paper_id`` or ``body_text``.
    """
    import os, json
    import pandas as pd

    # the directories that hold our json files
    subset_dirs = [
        path_header + r'biorxiv_medrxiv/biorxiv_medrxiv',
        path_header + r"comm_use_subset/comm_use_subset",
        path_header + r"noncomm_use_subset/noncomm_use_subset",
        path_header + r"pmc_custom_license/pmc_custom_license",
    ]

    json_files = []
    for subset_dir in subset_dirs:
        # os.listdir order is arbitrary; sorting makes the row order reproducible
        json_files.extend(
            os.path.join(subset_dir, file_name)
            for file_name in sorted(os.listdir(subset_dir))
            if file_name.endswith('.json')
        )

    # Collect plain records and build the frame once; assigning row by row via
    # .loc re-allocates the frame on every insert and gets slow for many files.
    records = []
    for js in json_files:
        # JSON text is UTF-8; do not depend on the platform's default encoding
        with open(js, encoding='utf-8') as json_file:
            json_text = json.load(json_file)
        records.append({
            'id': json_text["paper_id"],
            'title': json_text['metadata']['title'],
            # some files carry no abstract at all; treat that like an empty one
            'paper_abstract': json_text.get("abstract", []),
            'paper_body': json_text["body_text"],
        })

    return pd.DataFrame(records, columns=['id', "title", "paper_abstract", "paper_body"])


In [None]:
# Parse all full-text JSON files from read_jsons' default dataset path.
jsons = read_jsons()

In [None]:
# Display the parsed JSON frame for a visual check.
jsons

In [None]:
# Join the frames together: a left join keeps every metadata row and attaches
# the JSON columns wherever the metadata 'sha' equals the JSON 'id'.
combined_data = csv_df.merge(jsons, left_on='sha',right_on='id',how='left')

In [None]:
# Cache the merged frame on disk. index=False keeps the row index out of the
# file, so reading it back does not add a stray "Unnamed: 0" column.
combined_data.to_csv("combined_data.csv", index=False)

In [None]:
# Used to buffer the extraction: reload the merged frame from the CSV cache
# instead of re-parsing all JSON files.
import pandas as pd
combined_data = pd.read_csv("combined_data.csv")

In [None]:
# Display the reloaded frame for a visual check.
combined_data

In [None]:
# make a full-text ("catchall") column
# for this use title_x and title_y (NaN values are not filtered out), the abstract, all fields from the parsed abstract, and all fields from the paper body together

def parse_json_section(text):
    """Concatenate the ``'text'`` entries of a JSON section list into one string.

    The section may arrive either as the original list of dicts or, after the
    frame has been cached with ``to_csv`` and read back, as the *string* form
    of that list. Strings are parsed back into a list first (JSON syntax is
    tried first, then a Python literal); iterating the raw string would walk
    over single characters and fail on ``section['text']``.

    Args:
        text: list of dicts with a ``'text'`` key, the string form of such a
            list, or a missing value (``None`` / float NaN).

    Returns:
        The section texts, each followed by a single space; ``""`` for missing
        or empty input. A string that cannot be parsed into a list is returned
        unchanged so that its content is not lost.
    """
    import ast
    import json

    sep = r" "

    # pandas represents missing cells as float NaN
    if text is None or isinstance(text, float):
        return ""

    if isinstance(text, str):
        stripped = text.strip()
        if not stripped:
            return ""
        try:
            sections = json.loads(stripped)
        except ValueError:
            try:
                # handles Python-literal text such as "[{'text': '...'}]"
                sections = ast.literal_eval(stripped)
            except (ValueError, SyntaxError):
                return text
        if not isinstance(sections, (list, tuple)):
            return text
    else:
        sections = text

    return "".join(section['text'] + sep for section in sections)
    

def make_catchall_field(row):
    """Build the space-separated "catchall" text for one row of the merged frame.

    The pieces are, in order: ``title_x``, ``title_y``, ``abstract``, the
    flattened ``paper_abstract`` sections and the flattened ``paper_body``
    sections. Every piece goes through ``str()``, so a missing (NaN) value
    shows up as the literal text ``nan``.

    Args:
        row: mapping/Series providing the five columns named above.

    Returns:
        str: the five pieces joined with single spaces.
    """
    wanted = ('title_x', 'title_y', 'abstract', 'paper_abstract', 'paper_body')
    first_title, second_title, abstract, abstract_sections, body_sections = [
        row[column] for column in wanted
    ]
    flattened = [parse_json_section(abstract_sections), parse_json_section(body_sections)]
    pieces = [first_title, second_title, abstract] + flattened
    return r" ".join(str(piece) for piece in pieces)



In [None]:
# Register tqdm's progress_apply on pandas objects, then build the 'catchall'
# text row by row (axis=1) and store it as a new column on combined_data.
from tqdm import tqdm
tqdm.pandas()
combined_data['catchall'] = combined_data.progress_apply(lambda row: make_catchall_field(row), axis = 1)

In [None]:
# Cache the frame including the 'catchall' column. index=False keeps the row
# index out of the file, so reading it back does not add an "Unnamed: 0" column.
combined_data.to_csv("combined_data_with_catchall.csv", index=False)

In [1]:
# Used to buffer the extraction: reload the frame with the 'catchall' column
# from the CSV cache.
import pandas as pd
combined_data = pd.read_csv("combined_data_with_catchall.csv")

In [2]:
# Then we need to annotate that data

In [3]:
from unidecode import unidecode

# Compile a regex that matches every character that is NOT alphanumeric or "_"
# (i.e. everything we want to blank out).
import re
pattern = re.compile(r'\W')  # raw string: '\W' is an invalid escape in a plain literal


def remove_non_ascii(text):
    """Reduce ``text`` to ASCII word characters and spaces.

    Every non-word character (punctuation, quotes, whitespace, ...) is replaced
    by a single space; afterwards any remaining non-ASCII character (e.g. an
    accented letter, which the regex counts as a word character) is dropped.

    Args:
        text: str to clean.

    Returns:
        str containing only ASCII letters, digits, underscores and spaces.

    Raises:
        TypeError: if ``text`` is not a string.
    """
    spaced = pattern.sub(' ', text)
    return spaced.encode("ascii", errors="ignore").decode()

def analyze_text(text, api_endpoint='http://nlu:8080/factextraction/analyze', timeout=None):
    """Send ``text`` to the fact-extraction service and return the raw response body.

    The text is cleaned with ``remove_non_ascii`` before it is sent. The
    request body is serialized with ``json.dumps`` rather than assembled by
    string concatenation, so its validity no longer depends on the cleaning
    step having removed every quote and backslash.

    Args:
        text: str, the document text to annotate.
        api_endpoint: URL of the analyze endpoint.
        timeout: seconds passed to ``requests.post``; ``None`` waits
            indefinitely, as before.

    Returns:
        str: the response body as text, whatever the HTTP status was.
    """
    import json
    import requests

    headers = {"accept": "application/json", "content-type": "application/json"}
    # data to be sent to the api
    payload = json.dumps({
        "docId": "doc2",
        "text": remove_non_ascii(text),
        "extractConcepts": "true",
        "language": "en",
    })
    # send the post request and hand back the response text
    r = requests.post(url = api_endpoint, data = payload, headers = headers, timeout = timeout)
    return r.text


In [4]:
# debugging only 
#from tqdm import tqdm
#import swifter
#jsons['out'] = jsons['title'].swifter.progress_bar(True).set_npartitions(npartitions=4).apply(analyze_text, axis=1)
#jsons['title_annotated'] = jsons.swifter.allow_dask_on_strings().progress_bar(True).set_npartitions(npartitions=8).apply(lambda row: analyze_text(row['title']), axis = 1)

#tqdm.pandas()
#jsons['title_annotated'] = jsons[0:12].apply(lambda row: analyze_text(row['title']), axis = 1) 

In [5]:
# adapted from here: https://www.kaggle.com/mlwhiz/parallelization-kernel
from multiprocessing import  Pool
import numpy as np
import pandas as pd


def parallelized_apply(df, func, numProcs=4):
    """Apply ``func`` to ``numProcs`` row-wise chunks of ``df`` in worker processes.

    Args:
        df: pandas.DataFrame to process.
        func: picklable callable taking a DataFrame chunk and returning a
            DataFrame.
        numProcs: number of chunks and of worker processes.

    Returns:
        pandas.DataFrame: the chunk results concatenated in chunk order.
    """
    # Split the row *positions* and slice with iloc: this keeps the original
    # index labels and avoids passing the DataFrame itself to np.array_split.
    row_groups = np.array_split(np.arange(len(df)), numProcs)
    df_split = [df.iloc[rows] for rows in row_groups]

    pool = Pool(numProcs)
    try:
        results = pool.map(func, df_split)
    finally:
        # always release the workers, also when func raises in a worker
        pool.close()
        pool.join()
    return pd.concat(results)
 

In [6]:
from tqdm import tqdm
tqdm.pandas()

def runAnnotate(df):
    """Annotate the 'catchall' text of every row via ``analyze_text``.

    Works on a copy, so the chunk handed in by the caller is not modified
    (assigning a column on a slice of another frame triggers pandas'
    SettingWithCopyWarning). Applying over the column instead of row-wise over
    the frame also yields a proper empty result for an empty chunk.

    Args:
        df: pandas.DataFrame with a 'catchall' column.

    Returns:
        pandas.DataFrame: copy of ``df`` with an added 'catchall_annotated'
        column holding the raw service response per row.
    """
    # add here also the functions for all other columns
    annotated = df.copy()
    annotated['catchall_annotated'] = annotated['catchall'].progress_apply(analyze_text)
    return annotated

# In theory every document annotation runs in a separate thread; they all go to a single db (this db can handle a lot of parallel annotation threads).
# Every annotation thread should have 8 GB of memory, so running 10 annotation threads in parallel would need 80 GB of memory.
# Be careful with the memory monitor (e.g. htop): the db just shows the in-memory indices as cached pages (yellow/orange bars in htop);
# if you consume more memory elsewhere, these pages are written back to disk and you lose performance.

# data_annotated = parallelized_apply(combined_data[0:399], runAnnotate, numProcs = 10) 

  from pandas import Panel


In [None]:
# Do it first for the abstracts only: select the rows whose has_full_text is
# not True, annotate them with 12 worker processes and cache the result.
abstracts_only = combined_data[combined_data.has_full_text != True]
abstracts_annotated = parallelized_apply(abstracts_only, runAnnotate, numProcs = 12) 
abstracts_annotated.to_csv('abstracts_annotated.csv', index=True)

In [None]:
# Display the annotated abstract rows for a visual check.
abstracts_annotated

In [None]:
# Then for the full texts, on an E32s v3 with 32 vCPUs and 256 GB of memory: we need around 50 GB for the DB,
# then we have around 100 GB for NLU, which should give us enough parallelism for 28 threads with large documents
# and enough spare for Python and the OS.

# NOTE(review): this comment used to say "do the first 100 for a test", but no
# slice is applied below -- every row with has_full_text == True is annotated.
full_texts_only = combined_data[combined_data.has_full_text == True]
full_texts_annotated = parallelized_apply(full_texts_only, runAnnotate, numProcs = 28)
full_texts_annotated.to_csv('full_texts_annotated.csv', index=True)

  0%|          | 0/472 [00:00<?, ?it/s]

In [None]:
# Display the annotated full-text rows for a visual check.
full_texts_annotated

In [None]:
# Used to buffer the extraction: reload both annotated frames from their CSV
# caches instead of re-running the annotation.
import pandas as pd
abstracts_annotated = pd.read_csv("abstracts_annotated.csv")
full_texts_annotated = pd.read_csv("full_texts_annotated.csv")

In [None]:
# pd.concat expects ONE iterable of frames as its first argument; passing the
# two frames positionally raises a TypeError. ignore_index=True gives the
# stacked frame a fresh 0..n-1 index instead of two overlapping ranges.
data_annotated = pd.concat([abstracts_annotated, full_texts_annotated], ignore_index=True)

In [None]:
# Save it to disk temporarily (the row index is written as well).
data_annotated.to_csv('data_annotated.csv', index=True)

In [1]:
# Load the combined annotated data back from its CSV cache.
import pandas as pd
data_annotated = pd.read_csv("data_annotated.csv")

In [2]:
# we will want to get all further wikidata entity information to it, so we will need to load them. 

In [3]:
# disassemble the JSON annotations first
import json
from tqdm import tqdm
tqdm.pandas()
from rdflib import URIRef, BNode, Literal, Namespace, Graph
from rdflib.namespace import RDF, FOAF, RDFS
import urllib.parse





# Base IRI under which document, mention and fact nodes are minted, plus the
# Dublin Core namespaces used to describe a document.
graph_root = "http://www.trivadis.com/kg/"
DCTYPE = Namespace("http://purl.org/dc/dcmitype/")
DCTERMS = Namespace("http://purl.org/dc/terms/")

# links from a document to its mentions and facts
hasMention = URIRef("http://www.trivadis.com/kg/ontology/hasMention")
hasFact = URIRef("http://www.trivadis.com/kg/ontology/hasFact")

# links from a mention to the outside (the linked concept IRI)
hasConcept = URIRef("http://www.trivadis.com/kg/ontology/hasConcept")

# node types (used as rdf:type objects)
mentionType = URIRef("http://www.trivadis.com/kg/ontology/Mention")
docMentionType = URIRef("http://www.trivadis.com/kg/ontology/DocMention")
factType = URIRef("http://www.trivadis.com/kg/ontology/Fact")
conceptType = URIRef("http://www.trivadis.com/kg/ontology/Concept")

# mention properties
hasCharLength = URIRef("http://www.trivadis.com/kg/ontology/hasCharLength")
hasCharOffset = URIRef("http://www.trivadis.com/kg/ontology/hasCharOffset")
hasText = URIRef("http://www.trivadis.com/kg/ontology/hasText")
hasConfidence = URIRef("http://www.trivadis.com/kg/ontology/hasConfidence")
hasName = URIRef("http://www.trivadis.com/kg/ontology/hasName")
hasURL = URIRef("http://www.trivadis.com/kg/ontology/hasURL")
hasType = URIRef("http://www.trivadis.com/kg/ontology/hasType")

# fact specifics: the three parts of a fact
hasSubject = URIRef("http://www.trivadis.com/kg/ontology/hasSubject")
hasRelation = URIRef("http://www.trivadis.com/kg/ontology/hasRelation")
hasObject = URIRef("http://www.trivadis.com/kg/ontology/hasObject")


# top-level keys looked up in the annotation JSON
match_key = 'matches'
entity_key = 'entities'
fact_key = 'facts'


# module-level lists filled while parsing, for further analysis or debugging:
# every linked entity IRI seen, and the ids of documents whose annotation failed to parse
wikidata_uris = []
error_parse = []

def _kg_iri(*parts):
    """Return the IRI below ``graph_root`` for the percent-encoded concatenation of ``parts``."""
    return URIRef(graph_root + urllib.parse.quote("".join(parts)))


def _add_text_span(graph, iri, rdf_types, char_length, char_offset, text):
    """Describe a text-span node: its rdf:type(s), character length/offset and surface text."""
    for rdf_type in rdf_types:
        graph.add((iri, RDF.type, rdf_type))
    graph.add((iri, hasCharLength, Literal(int(char_length))))
    graph.add((iri, hasCharOffset, Literal(int(char_offset))))
    graph.add((iri, hasText, Literal(text)))


def _add_matches(graph, theDoc, unique_id, matches):
    """Add every in-text match as a Mention node, linked to its concept when one is given."""
    for match in matches:
        # A match is: {'charLength': 7, 'charOffset': 0, 'text': 'Imaging', 'entity': {'id': 'http://www.wikidata.org/entity/Q931309', 'confidence': 0.12630685029203295}}
        charLength = match['charLength']
        charOffset = match['charOffset']
        match_text = match['text']

        match_iri = _kg_iri(unique_id, "/mention/", str(charLength), str(charOffset), str(match_text))

        graph.add((theDoc, hasMention, match_iri))
        _add_text_span(graph, match_iri, (mentionType,), charLength, charOffset, match_text)

        if 'entity' in match:
            linked = match['entity']
            if 'id' in linked and 'confidence' in linked:
                match_entity = linked['id']
                wikidata_uris.append(match_entity)
                graph.add((match_iri, hasConcept, URIRef(match_entity)))
                graph.add((URIRef(match_entity), RDF.type, conceptType))
                graph.add((match_iri, hasConfidence, Literal(float(linked['confidence']))))


def _add_entities(graph, theDoc, unique_id, entities):
    """Add every document-level entity as a DocMention node linked to its concept."""
    for entity in entities:
        # An entity is: {'id': 'http://www.wikidata.org/entity/Q931309', 'name': 'Medical imaging', 'url': 'http://en.wikipedia.org/wiki/Medical%20imaging', 'type': 'CONCEPT', 'salience': 0.0}
        match_entity = entity['id']
        wikidata_uris.append(match_entity)
        match_name = entity['name']
        match_url = entity['url']
        match_type = entity['type']
        match_confidence = entity['salience']  # the salience is stored as the confidence
        match_iri = _kg_iri(unique_id, "/entity/", str(match_name), str(match_type))

        graph.add((theDoc, hasMention, match_iri))
        graph.add((match_iri, RDF.type, docMentionType))
        graph.add((match_iri, hasConfidence, Literal(float(match_confidence))))
        graph.add((match_iri, hasName, Literal(match_name)))
        graph.add((match_iri, hasConcept, URIRef(match_entity)))
        graph.add((URIRef(match_entity), RDF.type, conceptType))
        graph.add((match_iri, hasURL, Literal(match_url)))
        graph.add((match_iri, hasType, Literal(match_type)))


def _add_facts(graph, theDoc, unique_id, facts):
    """Add every fact as a Fact node with subject, relation and (optional) object spans."""
    # NOTE(review): the span nodes are typed as both Mention and Fact, exactly
    # as before -- confirm that the Fact type on the spans is intended.
    span_types = (mentionType, factType)
    for fact in facts:
        # A fact is: {'subject': {'text': ..., 'charOffset': 0, 'charLength': 32}, 'relation': {...}, 'object': {...}}
        # The fact carries no entity annotations; those have to be matched
        # later via the character offsets.
        fact_subject_text = fact['subject']['text']
        fact_subject_charLength = fact['subject']['charLength']
        fact_subject_charOffset = fact['subject']['charOffset']

        fact_relation_text = fact['relation']['text']
        fact_relation_charLength = fact['relation']['charLength']
        fact_relation_charOffset = fact['relation']['charOffset']

        fact_iri = _kg_iri(unique_id, "/fact/", fact_relation_text, str(fact_relation_charLength), str(fact_relation_charOffset))
        fact_subject_iri = _kg_iri(unique_id, "/fact/mention/", fact_subject_text, str(fact_subject_charLength), str(fact_subject_charOffset))
        fact_relation_iri = _kg_iri(unique_id, "/fact/mention/", fact_relation_text, str(fact_relation_charLength), str(fact_relation_charOffset))

        graph.add((theDoc, hasFact, fact_iri))
        graph.add((fact_iri, RDF.type, factType))
        graph.add((fact_iri, hasSubject, fact_subject_iri))
        graph.add((fact_iri, hasRelation, fact_relation_iri))

        _add_text_span(graph, fact_subject_iri, span_types, fact_subject_charLength, fact_subject_charOffset, fact_subject_text)
        _add_text_span(graph, fact_relation_iri, span_types, fact_relation_charLength, fact_relation_charOffset, fact_relation_text)

        # adding the object if it exists
        if 'object' in fact:
            fact_object = fact['object']
            if 'text' in fact_object and 'charLength' in fact_object and 'charOffset' in fact_object:
                fact_object_text = fact_object['text']
                fact_object_charLength = fact_object['charLength']
                fact_object_charOffset = fact_object['charOffset']
                fact_object_iri = _kg_iri(unique_id, "/fact/mention/", fact_object_text, str(fact_object_charLength), str(fact_object_charOffset))
                graph.add((fact_iri, hasObject, fact_object_iri))
                _add_text_span(graph, fact_object_iri, span_types, fact_object_charLength, fact_object_charOffset, fact_object_text)


def parse_row_to_graph(graph, row):
    """Add one annotated document (a row of the annotated frame) to ``graph``.

    A document node is created and described with its DOI and catchall text.
    The row's ``catchall_annotated`` JSON is then decoded and its matches,
    entities and facts are added as nodes linked to the document.

    Changes compared with the previous version:
      * triples go into the ``graph`` argument (the global ``g`` was used
        before, silently ignoring the parameter);
      * the document id is a SHA-1 digest instead of the built-in ``hash()``,
        whose value for strings differs between interpreter sessions, so the
        generated IRIs are now reproducible;
      * a missing (NaN) catchall no longer raises while building the id;
      * only ``Exception`` is caught, so KeyboardInterrupt still interrupts.

    Args:
        graph: rdflib Graph that receives the triples.
        row: Series/mapping with the keys 'catchall_annotated', 'sha', 'doi'
            and 'catchall'.

    Side effects:
        Appends every linked entity IRI to the module-level ``wikidata_uris``
        and, when the annotation cannot be processed, the document's 'sha' to
        ``error_parse`` (unless the sha itself is missing). Triples added
        before the failure stay in the graph.
    """
    import hashlib

    annotation = row['catchall_annotated']
    doc_id = str(row['sha'])
    doi = row['doi']
    catchall = row['catchall']

    # create a unique, reproducible document id
    id_source = str(catchall) + str(doc_id) + str(doi)
    unique_id = hashlib.sha1(id_source.encode("utf-8")).hexdigest()

    theDoc = _kg_iri(unique_id)
    graph.add((theDoc, RDF.type, DCTYPE.Text))
    graph.add((theDoc, DCTERMS.identifier, Literal(doi)))
    graph.add((theDoc, DCTERMS.description, Literal(catchall)))

    try:
        json_annotation = json.loads(annotation)

        if match_key in json_annotation:
            _add_matches(graph, theDoc, unique_id, json_annotation[match_key])

        if entity_key in json_annotation:
            _add_entities(graph, theDoc, unique_id, json_annotation[entity_key])

        if fact_key in json_annotation:
            _add_facts(graph, theDoc, unique_id, json_annotation[fact_key])

    except Exception:
        # best effort: remember which document failed and carry on
        if doc_id != "nan":
            error_parse.append(doc_id)

            
# Graph that collects the triples of all documents.
g = Graph()
# just using two documents
# abstracts_annotated[0:2].progress_apply(lambda row: parse_row_to_graph(g, row), axis = 1)

# running on all rows of the annotated frame (one document per row):
data_annotated.progress_apply(lambda row: parse_row_to_graph(g, row), axis = 1)

# Report the graph size and the parse statistics collected in the module-level
# lists, then write the graph to disk in N-Triples format.
print("Parsed: ", len(g), " triples into the graph")
print("Parsing failed on ", len(error_parse), " items")
print("Found ", len(wikidata_uris), " wikidata links")
g.serialize(destination='full_annotated.nt', format='nt')


  from pandas import Panel
100%|██████████| 16281/16281 [09:34<00:00, 28.32it/s] 


Parsed:  14733263  triples into the graph
Parsing failed on  69  items
Found  1467541  wikidata links


In [10]:
# Disassemble the wikidata links to be able to load them from wikidata.
from collections import Counter
wikidata_uris_testing = ['http://www.wikidata.org/entity/Q931309', 'http://www.wikidata.org/entity/Q11426']

occurence_by_item = Counter(wikidata_uris)
# print out the most common 10
print("The top 10 Items by occurence are: ", occurence_by_item.most_common(10))

# We are interested in just the distinct items, in first-seen order. Keep them
# as a real list: a dict keys view supports neither indexing nor slicing, so
# it could not be split into fixed-size chunks.
distinct_wd_items = list(occurence_by_item)

# split it to fixed size chunks so we do not overload wikidata
# copied from: https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
def chunks(lst, n):
    """Yield successive n-sized chunks from lst.

    Sliceable inputs (list, tuple, str, ...) are sliced directly, so each chunk
    has the input's own type. Inputs that cannot be sliced (dict keys views,
    sets, generators, ...) are materialized into a list first.

    Args:
        lst: the items to split.
        n: chunk size, must be >= 1.

    Yields:
        Consecutive chunks of at most ``n`` items; the last one may be shorter.

    Raises:
        ValueError: if ``n`` is smaller than 1 (raised on first iteration).
    """
    if n < 1:
        raise ValueError("chunk size n must be a positive integer")
    try:
        total = len(lst)
        lst[0:0]  # probe: does the input support slicing?
    except (TypeError, KeyError):
        lst = list(lst)
        total = len(lst)
    for i in range(0, total, n):
        yield lst[i:i + n]



The top 10 Items by occurence are:  [('http://www.wikidata.org/entity/Q808', 26906), ('http://www.wikidata.org/entity/Q166231', 20885), ('http://www.wikidata.org/entity/Q7868', 14800), ('http://www.wikidata.org/entity/Q8054', 14509), ('http://www.wikidata.org/entity/Q12136', 10279), ('http://www.wikidata.org/entity/Q101965', 9686), ('http://www.wikidata.org/entity/Q103177', 7514), ('http://www.wikidata.org/entity/Q221673', 7311), ('http://www.wikidata.org/entity/Q7187', 6920), ('http://www.wikidata.org/entity/Q42848', 6556)]


In [17]:
# Preview the first ten distinct wikidata entity IRIs.
list(distinct_wd_items)[:10]

['http://www.wikidata.org/entity/Q931309',
 'http://www.wikidata.org/entity/Q12192',
 'http://www.wikidata.org/entity/Q167918',
 'http://www.wikidata.org/entity/Q7316896',
 'http://www.wikidata.org/entity/Q6813432',
 'http://www.wikidata.org/entity/Q14970638',
 'http://www.wikidata.org/entity/Q41117',
 'http://www.wikidata.org/entity/Q884',
 'http://www.wikidata.org/entity/Q11746',
 'http://www.wikidata.org/entity/Q148']

In [None]:
# json_normalize lives at the pandas top level since pandas 1.0; the old
# pandas.io.json location is deprecated and no longer importable in current releases.
from pandas import json_normalize
from SPARQLWrapper import SPARQLWrapper, JSON
from string import Template

# Shared SPARQL client for the public Wikidata query service, returning JSON.
sparql_endpoint_url = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"
sparql = SPARQLWrapper(sparql_endpoint_url)
sparql.setReturnFormat(JSON)


def QuerySparql(sparql_query):
    """Run ``sparql_query`` on the shared endpoint and return the bindings as a frame.

    Args:
        sparql_query: str, the complete SPARQL query text.

    Returns:
        pandas.DataFrame: the flattened ``results.bindings`` list of the JSON
        response (one row per result binding).
    """
    sparql.setQuery(sparql_query)
    results = sparql.query().convert()
    return json_normalize(results["results"]["bindings"])



# SPARQL query templates for Wikidata. $ITEMS is replaced with a space-separated
# list of entity IRIs in angle brackets, e.g. "<http://www.wikidata.org/entity/Q931309>".
# (The previous draft of this cell carried prefixes from an unrelated project,
# an unterminated query body and references to undefined names -- query1,
# country, amount, limit -- so it raised a NameError when run.)

# direct "instance of" (P31) statements of the given items
wd_get_instance_of = Template("""
PREFIX wdt: <http://www.wikidata.org/prop/direct/>

SELECT ?item ?class WHERE {
  VALUES ?item { $ITEMS }
  ?item wdt:P31 ?class .
}
""")

# direct "subclass of" (P279) statements of the given items
wd_get_subclass_of = Template("""
PREFIX wdt: <http://www.wikidata.org/prop/direct/>

SELECT ?item ?superclass WHERE {
  VALUES ?item { $ITEMS }
  ?item wdt:P279 ?superclass .
}
""")

# English labels of the given items
wd_get_label_of = Template("""
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

SELECT ?item ?label WHERE {
  VALUES ?item { $ITEMS }
  ?item rdfs:label ?label .
  FILTER(LANG(?label) = "en")
}
""")

# Smoke test on the first ten distinct items only, to keep the request small.
sample_items = " ".join("<" + uri + ">" for uri in list(distinct_wd_items)[:10])
wd_instance_of_sample = QuerySparql(wd_get_instance_of.substitute(ITEMS=sample_items))
wd_instance_of_sample.head()


In [4]:
# TODO: add some analytics after this cell


In [None]:
# load all the instance of (P31) and the subclass of (P279) relations along each path from wikidata, we need all intermediate steps

In [None]:
# And ask the questions on it

In [None]:
#print(jsons_annotated.iloc[13000:13001].to_string())

In [None]:
# give me the articles talking about "behaviour" : https://www.wikidata.org/wiki/Q9332 in their headline:
# NOTE(review): `jsons_annotated` / 'title_annotated' only appear in commented-out
# code in the visible part of this notebook -- make sure they exist before running.
# The trailing \b keeps "Q9332" from also matching longer ids such as Q93320;
# na=False treats rows without an annotation as "no match" instead of NaN.
behaviour_in_title = jsons_annotated[jsons_annotated['title_annotated'].str.contains(r"Q9332\b", regex=True, na=False)]
print(behaviour_in_title[['id', 'title']].to_string())

In [None]:
# give me the articles talking about "social distancing" : https://www.wikidata.org/wiki/Q30314010 in their headline:
# NOTE(review): `jsons_annotated` / 'title_annotated' only appear in commented-out
# code in the visible part of this notebook -- make sure they exist before running.
# The trailing \b keeps the id from also matching longer ids that merely start
# with it; na=False treats rows without an annotation as "no match" instead of NaN.
social_distancing_in_title = jsons_annotated[jsons_annotated['title_annotated'].str.contains(r"Q30314010\b", regex=True, na=False)]
print(social_distancing_in_title[['id', 'title']].to_string())

In [None]:
# give me the articles talking about "Wuhan" : https://www.wikidata.org/wiki/Q11746 in their headline:
# NOTE(review): `jsons_annotated` / 'title_annotated' only appear in commented-out
# code in the visible part of this notebook -- make sure they exist before running.
# The trailing \b keeps "Q11746" from also matching longer ids such as Q117460;
# na=False treats rows without an annotation as "no match" instead of NaN.
wuhan_in_title = jsons_annotated[jsons_annotated['title_annotated'].str.contains(r"Q11746\b", regex=True, na=False)]
print(wuhan_in_title[['id', 'title']].to_string())