In [14]:
# @name neo4j_import
# @description notebook to build the Neo4j graph 
# @author Núria Queralt Rosinach
# @date 20-10-2020

# Cytokines data import into Neo4j graph

### Steps

    1. Read data, URIs (id mappings) and semantic model types (owl)
    2. Prepare graph files for Neo4j
    3. Import into Neo4j server

### Import

In [15]:
import datetime
import os
import pprint
from hashlib import md5

import pandas as pd
import rdflib

### Workflow
#### 1. Read data,  semantic model, id_mapping

In [16]:
# Load the cytokine data export (a tab-separated file) into a DataFrame.
data_file = '/home/nur/workspace/beat-covid/fair-data-model/cytokine/synthetic-data/BEAT-COVID1_excel_export_2020-05-28_Luminex_synthetic-data.csv'
data = pd.read_csv(data_file, delimiter='\t')
print("Citokines data dimension: {}".format(data.shape))
# Preview the first rows (head() defaults to 5 rows).
data.head()

Citokines data dimension: (9, 141)


Unnamed: 0,record_id,institute_abbreviation,record_creation_date,order,date_sampling,ward,age,beat_id,clinical_id,lum_date_meas,...,lum_sTNFR1_3,lum_sTNFR2_3,lum_TSLP_3,lum_TWEAK_3,lum_IL17F_4,lum_IL21_4,lum_IL23_4,lum_IL25_4,lum_IL31_4,lum_IL33_4
0,201708869,LUMC,26-05-2020 15:31:31,BEATVR,08-05-2020,ICUQ,59,BEAT-018,1100207.0,04-05-2020,...,963,263,OOR <,329,OOR <,OOR <,OOR <,OOR <,OOR <,597
1,201903364,LUMC,26-05-2020 15:42:21,BEATMA,01-05-2020,VIG1,30,BEAT-007,1100125.0,22-04-2020,...,1230,272,10,111,OOR <,OOR <,OOR <,5,OOR <,OOR <
2,201803814,LUMC,26-05-2020 15:45:16,BEATWO,04-05-2020,LOKL,54,BEAT-026,1100154.0,11-05-2020,...,1046,335,4,76,OOR <,OOR <,OOR <,OOR <,OOR <,OOR <
3,202000138,LUMC,26-05-2020 15:38:41,BEATAD,06-05-2020,KVVL,58,BEAT-001,,04-05-2020,...,997,269,9,123,OOR <,OOR <,OOR <,OOR <,OOR <,OOR <
4,202000131,LUMC,26-05-2020 15:42:26,BEATWO,04-05-2020,KVVL,54,BEAT-023,1100103.0,25-05-2020,...,1374,332,9,181,OOR <,OOR <,OOR <,OOR <,OOR <,794


In [17]:
# read id_mapping csv file (comma-separated)
id_file = '/home/nur/workspace/beat-covid/fair-data-model/cytokine/id-mapping/cytokines_lab_terms_pilot_mapping_clean.csv'
mappings = pd.read_csv(id_file, sep=',')
# Label the shape as belonging to the mapping table; the previous label was
# copied from the data cell and described this table as the cytokine data.
print("Id mappings dimension: {}".format(mappings.shape))
mappings.head(5)

Citokines data dimension: (103, 9)


Unnamed: 0,lab data (ensembl gene name),ensembl gene,uniprot,ncit protein,ncit gene,ncit measurement (protein) = PROCESS or ACTIVITY,Clinical Measurement Ontology (CMO) (protein) = Measurement value which is the result of a process/activity,Unnamed: 7,Unnamed: 8
0,CX3CL1,ENSG00000006210,P78423,http://purl.obolibrary.org/obo/NCIT_C20489,http://purl.obolibrary.org/obo/NCIT_C24796,http://purl.obolibrary.org/obo/NCIT_C161361,http://www.ebi.ac.uk/efo/EFO_0009419,,
1,CCL26,ENSG00000006606,Q9Y258,http://purl.obolibrary.org/obo/NCIT_C28739,http://purl.obolibrary.org/obo/NCIT_C24780,http://purl.obolibrary.org/obo/NCIT_C81954,http://www.ebi.ac.uk/efo/EFO_0008122,,
2,IL32,ENSG00000008517,P24001,http://purl.obolibrary.org/obo/NCIT_C127926,http://purl.obolibrary.org/obo/NCIT_C127924,http://purl.obolibrary.org/obo/NCIT_C74830,http://purl.obolibrary.org/obo/CMO_0001925,,
3,HGF,ENSG00000019991,P14210,http://purl.obolibrary.org/obo/NCIT_C20428,http://purl.obolibrary.org/obo/NCIT_C24467,http://purl.obolibrary.org/obo/NCIT_C135426,http://www.ebi.ac.uk/efo/EFO_0006903,,
4,CXCL2,ENSG00000081041,P19875,http://purl.obolibrary.org/obo/NCIT_C20474,http://purl.obolibrary.org/obo/NCIT_C18426,http://purl.obolibrary.org/obo/NCIT_C81867,http://purl.obolibrary.org/obo/NCIT_C74804,,


In [18]:
# read semantic model
owl_file = '/home/nur/workspace/beat-covid/fair-data-model/cytokine/owl/cytokine_semantic_model.owl'
owl = rdflib.Graph()
# Graph.load() was removed in rdflib 6; parse() with format='xml' is what
# load() did by default, so the same RDF/XML file is read the same way.
owl.parse(owl_file, format='xml')
print("* The ontology has {} statements.".format(len(owl)))
# Dump every (subject, predicate, object) statement of the graph.
for stmt in owl:
    pprint.pprint(stmt)

* The ontology has 583 statements.
(rdflib.term.URIRef('http://semanticscience.org/resource/SIO_001049'),
 rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
 rdflib.term.URIRef('http://www.w3.org/2002/07/owl#Class'))
(rdflib.term.URIRef('http://semanticscience.org/resource/SIO_000132'),
 rdflib.term.URIRef('http://purl.org/dc/terms/description'),
 rdflib.term.Literal('has participant is a relation that describes the participation of the object in the (processual) subject.', lang='en'))
(rdflib.term.URIRef('http://semanticscience.org/resource/SIO_000229'),
 rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#range'),
 rdflib.term.URIRef('http://semanticscience.org/resource/SIO_000004'))
(rdflib.term.URIRef('http://semanticscience.org/resource/SIO_000008'),
 rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#domain'),
 rdflib.term.URIRef('http://semanticscience.org/resource/SIO_001024'))
(rdflib.term.URIRef('http://www.w3.org/ns/prov#wasInformedBy'),
 rdfl

(rdflib.term.URIRef('http://semanticscience.org/resource/SIO_010003'),
 rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
 rdflib.term.URIRef('http://www.w3.org/2002/07/owl#NamedIndividual'))
(rdflib.term.URIRef('http://purl.obolibrary.org/obo/OBI_0000968'),
 rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label'),
 rdflib.term.Literal('device', lang='en'))
(rdflib.term.URIRef('http://semanticscience.org/resource/SIO_000715'),
 rdflib.term.URIRef('http://purl.org/dc/elements/1.1/identifier'),
 rdflib.term.Literal('SIO_000715'))
(rdflib.term.URIRef('http://semanticscience.org/resource/SIO_010056'),
 rdflib.term.URIRef('http://purl.org/dc/terms/description'),
 rdflib.term.Literal('a phenotype is an observable characteristic of an individual.', lang='en'))
(rdflib.term.URIRef('http://semanticscience.org/resource/SIO_000628'),
 rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#domain'),
 rdflib.term.URIRef('http://semanticscience.org/resource/SIO_001024

#### 2. Prepare graph files for Neo4j

__Edges file format__

| s_id:START_ID | p_id:TYPE | o_id:END_ID

other columns: 
   * p_label | p_description | p_uri
   * reference_uri | reference_sentence | reference_date
   * evidence_uri | dataset_quality_uri (synthetic/real)


__Nodes file format__

| id:ID | :LABEL | label | description | uri

_NB column_name:IGNORE_
_NULL data as 'NA' (.fillna('NA'))_

##### Identifier scheme 

We will use the cytokine ontology to provide:

    * edges: id ( p_uri )
    
We will use the manually curated cytokine id mapping file to provide:

    * nodes: id ( id:ID | uri )

##### Semantics

We will use the cytokine ontology to provide:

    * edges: id and type ( p_id:TYPE )
    * nodes: type ( :LABEL )
    
We will use the NCIT and CMO ontologies to provide ids for gene, protein, measurement and data measurement nodes. 

In [19]:
# Show the first record of the data file; the test triple is built from it.
data.head(1)

Unnamed: 0,record_id,institute_abbreviation,record_creation_date,order,date_sampling,ward,age,beat_id,clinical_id,lum_date_meas,...,lum_sTNFR1_3,lum_sTNFR2_3,lum_TSLP_3,lum_TWEAK_3,lum_IL17F_4,lum_IL21_4,lum_IL23_4,lum_IL25_4,lum_IL31_4,lum_IL33_4
0,201708869,LUMC,26-05-2020 15:31:31,BEATVR,08-05-2020,ICUQ,59,BEAT-018,1100207.0,04-05-2020,...,963,263,OOR <,329,OOR <,OOR <,OOR <,OOR <,OOR <,597


In [20]:
# EDGES
# triple: :process_ has-output :qt_
# :process_ (a ncit:IL6 measurement,
#            dc:conforms-to :kit_)
# has-output(sio:has-output)
# :qt_ (a cmo:serum IL6 level,
#       sio:has-value "12"^^xsd:float,
#       sio:has-unit uo:pg/ml)
# :qt_ sio:refers-to ncit:IL6_protein
# ncit:IL6_protein sio:encodes ncit:IL6_gene
base = "http://purl.org/lumc-research/"


def _hashed_id(id_str):
    """Return the identifier 'BEATCOVID_' + md5 hex digest of `id_str` (UTF-8 encoded)."""
    return "BEATCOVID_" + md5(id_str.encode('utf-8')).hexdigest()


# measurement process: the data column under test, named once and reused below
process = 'lum_IL33_4'

# cytokine info: the column name is '<device>_<protein name>_<kit>'
device, protein_name, kit = process.split('_')
print("device: {}, cytokine label: {}, kit: {}".format(device, protein_name, kit))
# data records: take the first row, addressing the record id by column name
# instead of by column position
record_id = str(data['record_id'].iloc[0])
print("recor_id value: {}".format(record_id))
print("measurement process: {}".format(process))
# qt value of the first row for this measurement
# NOTE(review): the data preview also shows 'OOR <' entries in these columns,
# so the value is not guaranteed to be numeric - confirm before casting.
qt_value = data[process].iloc[0]
print("cytokine level value: {}".format(qt_value))

# S :process_
# URI: hash of "<record_id>-<process>"
subject_id_str = "{}-{}".format(record_id,process)
subject_id = _hashed_id(subject_id_str)
s_uri = rdflib.URIRef(base + subject_id)
print(subject_id,s_uri)

# S type: protein name -> ncit:protein name_measurement

# O :qt_
# URI: hash of "<record_id>-<process>-<qt_value>"
object_id_str = "{}-{}-{}".format(record_id,process,qt_value)
object_id = _hashed_id(object_id_str)
o_uri = rdflib.URIRef(base + object_id)
print(object_id,o_uri)

# O type: protein name -> cmo:protein name_level


# triple as neo4j edge:  s_id:START_ID | p_id:TYPE | o_id:END_ID
prefix = "lumc:"
# s_id = lumc:BEATCOVID_<hash of record + measurement process>
s_id = prefix + subject_id
# p_id = sio:SIO_id
has_output = "SIO_000229"
p_id = "sio:" + has_output
# o_id = lumc:BEATCOVID_<hash of record + measurement process + qt value>
o_id = prefix + object_id

# df neo4j format: one record per edge
edge = {':START_ID': s_id, ':TYPE': p_id, ':END_ID': o_id}
edges_l = [edge]
df = pd.DataFrame(edges_l)
df

device: lum, cytokine label: IL33, kit: 4
recor_id value: 201708869
measurement process: lum_IL33_4
cytokine level value: 597
BEATCOVID_da4bbb0d60764bb94b5e48dd8e2eeeae http://purl.org/lumc-research/BEATCOVID_da4bbb0d60764bb94b5e48dd8e2eeeae
BEATCOVID_15b392e12cd2286030e13cf2f2b98aaf http://purl.org/lumc-research/BEATCOVID_15b392e12cd2286030e13cf2f2b98aaf


Unnamed: 0,:START_ID,:TYPE,:END_ID
0,lumc:BEATCOVID_da4bbb0d60764bb94b5e48dd8e2eeeae,sio:SIO_000229,lumc:BEATCOVID_15b392e12cd2286030e13cf2f2b98aaf


In [21]:
# create EDGES file 'covid19_statements.csv'
# NOTE: `df` must still hold the edges table here; a later cell rebinds `df`
# to the nodes table, so this cell has to be run in notebook order.

# create dirs: <cwd>/neo4j/import plus a dated copy in <cwd>/neo4j/import/v<today>
today = datetime.date.today()
graph_version = 'v{}'.format(today)
neo4j_path = os.path.join(os.getcwd(), 'neo4j')
path_to_import = os.path.join(neo4j_path, 'import')
path_to_version = os.path.join(path_to_import, graph_version)
# exist_ok=True replaces the check-then-create pattern and keeps the cell re-runnable
os.makedirs(path_to_import, exist_ok=True)
os.makedirs(path_to_version, exist_ok=True)

# create edge file with neo4j format, once per output directory
edges = df.copy()
for out_dir in (path_to_import, path_to_version):
    edges.to_csv(os.path.join(out_dir, 'covid19_statements.csv'), index=False, na_rep='NA')

In [22]:
# Show the first record of the data file again (same test record as above).
data.head(1)

Unnamed: 0,record_id,institute_abbreviation,record_creation_date,order,date_sampling,ward,age,beat_id,clinical_id,lum_date_meas,...,lum_sTNFR1_3,lum_sTNFR2_3,lum_TSLP_3,lum_TWEAK_3,lum_IL17F_4,lum_IL21_4,lum_IL23_4,lum_IL25_4,lum_IL31_4,lum_IL33_4
0,201708869,LUMC,26-05-2020 15:31:31,BEATVR,08-05-2020,ICUQ,59,BEAT-018,1100207.0,04-05-2020,...,963,263,OOR <,329,OOR <,OOR <,OOR <,OOR <,OOR <,597


In [23]:
# NODES
# columns produced here: | id:ID | :LABEL | preflabel
# Node attributes keyed by node id: the subject (measurement process) first,
# then the object (measured quantity). Built in one step instead of writing
# placeholder values that are overwritten straight away.
ids = {
    s_id: {'label': 'ASSAY', 'preflabel': process},
    o_id: {'label': 'MEASUREMENT', 'preflabel': protein_name},
}
print(ids.keys())

# one record per node; a fresh dict per node, so no shared dict is reused
nodes_l = [
    {
        'id:ID': node_id,
        ':LABEL': attrs['label'],
        'preflabel': attrs['preflabel'],
    }
    for node_id, attrs in ids.items()
]

dict_keys(['lumc:BEATCOVID_da4bbb0d60764bb94b5e48dd8e2eeeae', 'lumc:BEATCOVID_15b392e12cd2286030e13cf2f2b98aaf'])


In [24]:
# Build the nodes table from the node records (one row per node).
# NOTE: this rebinds `df`, which held the edges table up to this point.
df = pd.DataFrame(nodes_l)
df

Unnamed: 0,id:ID,:LABEL,preflabel
0,lumc:BEATCOVID_da4bbb0d60764bb94b5e48dd8e2eeeae,ASSAY,lum_IL33_4
1,lumc:BEATCOVID_15b392e12cd2286030e13cf2f2b98aaf,MEASUREMENT,IL33


In [25]:
# Write the nodes table in neo4j format to both output directories.
nodes = df.copy()
for out_dir in (path_to_import, path_to_version):
    nodes.to_csv('{}/covid19_concepts.csv'.format(out_dir), index=False, na_rep='NA')

In [26]:
# 