In [None]:
# @name neo4j_import
# @description notebook to build the Neo4j graph 
# @author Núria Queralt Rosinach
# @date 20-10-2020

# Cytokines data import into Neo4j graph

### Steps

    1. Read data and semantic model
    2. Prepare graph files for Neo4j
    3. Import into Neo4j server

### Import

In [7]:
import pandas as pd
import rdflib

### Workflow
#### Read data and semantic model

In [6]:
# Load the synthetic Luminex cytokine export; the file is tab-separated
# even though it carries a .csv suffix.
data_file = '/home/queralt/Documents/nuria/workspace/beat-covid-no-git/fair-data-model/cytokine/synthetic-data/BEAT-COVID1_excel_export_2020-05-28_Luminex_synthetic-data.csv'
data = pd.read_csv(data_file, sep='\t')

# Report the (rows, columns) shape, then preview the first five records
# as the cell's rich output.
data_dimension = data.shape
print("Citokines data dimension: {}".format(data_dimension))
data.head(n=5)

Citokines data dimension: (9, 141)


Unnamed: 0,record_id,institute_abbreviation,record_creation_date,order,date_sampling,ward,age,beat_id,clinical_id,lum_date_meas,...,lum_sTNFR1_3,lum_sTNFR2_3,lum_TSLP_3,lum_TWEAK_3,lum_IL17F_4,lum_IL21_4,lum_IL23_4,lum_IL25_4,lum_IL31_4,lum_IL33_4
0,201708869,LUMC,26-05-2020 15:31:31,BEATVR,08-05-2020,ICUQ,59,BEAT-018,1100207.0,04-05-2020,...,963,263,OOR <,329,OOR <,OOR <,OOR <,OOR <,OOR <,597
1,201903364,LUMC,26-05-2020 15:42:21,BEATMA,01-05-2020,VIG1,30,BEAT-007,1100125.0,22-04-2020,...,1230,272,10,111,OOR <,OOR <,OOR <,5,OOR <,OOR <
2,201803814,LUMC,26-05-2020 15:45:16,BEATWO,04-05-2020,LOKL,54,BEAT-026,1100154.0,11-05-2020,...,1046,335,4,76,OOR <,OOR <,OOR <,OOR <,OOR <,OOR <
3,202000138,LUMC,26-05-2020 15:38:41,BEATAD,06-05-2020,KVVL,58,BEAT-001,,04-05-2020,...,997,269,9,123,OOR <,OOR <,OOR <,OOR <,OOR <,OOR <
4,202000131,LUMC,26-05-2020 15:42:26,BEATWO,04-05-2020,KVVL,54,BEAT-023,1100103.0,25-05-2020,...,1374,332,9,181,OOR <,OOR <,OOR <,OOR <,OOR <,794


In [12]:
# read semantic model
import pprint

owl_file = '/home/queralt/Documents/nuria/workspace/beat-covid-no-git/fair-data-model/cytokine/owl/cytokine_semantic_model.owl'
owl = rdflib.Graph()
# Graph.load() was only a deprecated alias of Graph.parse() (default
# format "xml") and no longer exists in rdflib >= 6, so call parse()
# directly with the same format to keep the cell working on current rdflib.
owl.parse(owl_file, format='xml')
print("* The ontology has {} statements.".format(len(owl)))

# Pretty-print every (subject, predicate, object) triple of the model
# for visual inspection.
for stmt in owl:
    pprint.pprint(stmt)

* The ontology has 583 statements.
(rdflib.term.URIRef('http://semanticscience.org/resource/SIO_000672'),
 rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#range'),
 rdflib.term.URIRef('http://semanticscience.org/resource/SIO_001050'))
(rdflib.term.URIRef('http://semanticscience.org/resource/SIO_000011'),
 rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
 rdflib.term.URIRef('http://www.w3.org/2002/07/owl#ObjectProperty'))
(rdflib.term.URIRef('http://semanticscience.org/resource/SIO_010056'),
 rdflib.term.URIRef('http://purl.org/dc/terms/description'),
 rdflib.term.Literal('a phenotype is an observable characteristic of an individual.', lang='en'))
(rdflib.term.URIRef('http://purl.org/dc/terms/conformsTo'),
 rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'),
 rdflib.term.URIRef('http://www.w3.org/2002/07/owl#ObjectProperty'))
(rdflib.term.BNode('Nfafaafff6794414da38e4afa03a65886'),
 rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-sy

(rdflib.term.URIRef('http://semanticscience.org/resource/SIO_000672'),
 rdflib.term.URIRef('http://purl.org/dc/elements/1.1/identifier'),
 rdflib.term.Literal('SIO_000672'))
(rdflib.term.URIRef('http://purl.org/dc/terms/date'),
 rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label'),
 rdflib.term.Literal('date', lang='en'))
(rdflib.term.URIRef('http://www.bioassayontology.org/bao#BAO_0002854'),
 rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subPropertyOf'),
 rdflib.term.URIRef('http://www.w3.org/2002/07/owl#topObjectProperty'))
(rdflib.term.BNode('Nfafaafff6794414da38e4afa03a65886'),
 rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#rest'),
 rdflib.term.BNode('N0b5d0c9f7927498faa167a93cf92e034'))
(rdflib.term.URIRef('http://semanticscience.org/resource/SIO_010056'),
 rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#subClassOf'),
 rdflib.term.URIRef('http://semanticscience.org/resource/SIO_000005'))
(rdflib.term.URIRef('http://purl.obolibrary.o

#### Prepare graph files for Neo4j

__Edges file format__

s_id:START_ID | p_id:TYPE | o_id:END_ID

other columns: 
   * p_label | p_description | p_uri
   * reference_uri | reference_sentence | reference_date


__Nodes file format__

id:ID | :LABEL | label | description | uri

_NB: mark a column as `column_name:IGNORE` to exclude it from the import._
_NULL data is written as 'NA' (`.fillna('NA')`)._