In [1]:
import requests
import numpy as np
import pandas as pd
from shortid import ShortId
from rdflib import Graph, URIRef, BNode, Literal
from rdflib.namespace import RDF, XSD, RDFS, SKOS
from  urllib.parse import quote
from scipy.stats import ttest_ind
sid = ShortId()

'''
To avoid that rdflib Graph create a new namespace (happened when the generated ID starts with a number or -)
'''
def generate_sid():
    r = sid.generate()
    while r[0].isdigit() or r.startswith(('-', '_')):
        r = sid.generate()
    return r

In [2]:
# ORKG API endpoints
api = 'http://localhost:8000/api/'
api_resources = '{}resources/'.format(api)
api_predicates = '{}predicates/'.format(api)
api_literals = '{}literals/'.format(api)
api_statements = '{}statements/'.format(api)
api_classes = '{}classes/'.format(api)
# The RDF Data Cube Vocabulary index
cube = dict()
cube['DataSet'] = URIRef('http://purl.org/linked-data/cube#DataSet')
cube['DataStructureDefinition'] = URIRef('http://purl.org/linked-data/cube#DataStructureDefinition')
cube['Observation'] = URIRef('http://purl.org/linked-data/cube#Observation')
cube['ComponentSpecification'] = URIRef('http://purl.org/linked-data/cube#ComponentSpecification')
cube['ComponentProperty'] = URIRef('http://purl.org/linked-data/cube#ComponentProperty')
cube['DimensionProperty'] = URIRef('http://purl.org/linked-data/cube#DimensionProperty')
cube['MeasureProperty'] = URIRef('http://purl.org/linked-data/cube#MeasureProperty')
cube['AttributeProperty'] = URIRef('http://purl.org/linked-data/cube#AttributeProperty')
cube['dataSet'] = URIRef('http://purl.org/linked-data/cube#dataSet')
cube['structure'] = URIRef('http://purl.org/linked-data/cube#structure')
cube['component'] = URIRef('http://purl.org/linked-data/cube#component')
cube['componentProperty'] = URIRef('http://purl.org/linked-data/cube#componentProperty')
cube['componentAttachment'] = URIRef('http://purl.org/linked-data/cube#componentAttachment')
cube['dimension'] = URIRef('http://purl.org/linked-data/cube#dimension')
cube['attribute'] = URIRef('http://purl.org/linked-data/cube#attribute')
cube['measure'] = URIRef('http://purl.org/linked-data/cube#measure')
cube['order'] = URIRef('http://purl.org/linked-data/cube#order')
cube['codeList'] = URIRef('http://purl.org/linked-data/cube#codeList')
# Vocabulary Classes
cube_classes = [cube['DataSet'],cube['DataStructureDefinition'],cube['Observation'],cube['ComponentSpecification'],
          cube['ComponentProperty'],cube['DimensionProperty'],cube['MeasureProperty'],cube['AttributeProperty'],
          SKOS.Concept, SKOS.ConceptScheme]  
obo = dict()
obo['STATO_0000304'] = URIRef('http://purl.obolibrary.org/obo/STATO_0000304')
obo['OBI_0000299'] = URIRef('http://purl.obolibrary.org/obo/OBI_0000299')
obo['OBI_0000175'] = URIRef('http://purl.obolibrary.org/obo/OBI_0000175')
obo['OBI_0000293'] = URIRef('http://purl.obolibrary.org/obo/OBI_0000293')
obo['OBI_0001931'] = URIRef('http://purl.obolibrary.org/obo/OBI_0001931')
obo['OBI_0001937'] = URIRef('http://purl.obolibrary.org/obo/OBI_0001937')
obo['OBI_0001938'] = URIRef('http://purl.obolibrary.org/obo/OBI_0001938')

In [3]:
def represent(dataset, tt):
    orkg = dict()

    g = Graph()
    
    g.bind('orkg', 'http://orkg.org/vocab/')
    g.bind('lsuc', 'http://orkg.org/vocab/lsuc/')
    g.bind('qb', 'http://purl.org/linked-data/cube#')
    g.bind('skos', 'http://www.w3.org/2004/02/skos/core#')
    
    n1 = URIRef('https://doi.org/10.1093/eurheartj/ehw333-R1')
    n2 = URIRef('http://orkg.org/vocab/lsuc/{}'.format(generate_sid()))
    n3 = URIRef('http://orkg.org/vocab/lsuc/{}'.format(generate_sid())) # theDataset
    n4 = URIRef('http://orkg.org/vocab/lsuc/{}'.format(generate_sid()))
    n5 = URIRef('http://orkg.org/vocab/lsuc/{}'.format(generate_sid())) # FOR Measure : MeasureComponentSpecification
    n6 = URIRef('http://orkg.org/vocab/lsuc/{}'.format(generate_sid())) # FOR Dimesion : DimesionComponentSpecification
    n7 = URIRef('http://orkg.org/vocab/lsuc/{}'.format(generate_sid())) # FOR Measure : theMeasureProperty
    n8 = URIRef('http://orkg.org/vocab/lsuc/{}'.format(generate_sid())) # Code list of failing Heart Dimension : failingHeartScheme
    n9 = URIRef('http://orkg.org/vocab/lsuc/{}'.format(generate_sid())) # theDataStructureDefinition
    n10 = URIRef('http://orkg.org/vocab/lsuc/{}'.format(generate_sid())) # theDimension
    n11 = URIRef('http://orkg.org/vocab/lsuc/{}'.format(generate_sid())) # failingHeart Concept
    n12 = URIRef('http://orkg.org/vocab/lsuc/{}'.format(generate_sid())) # nonFailingHeart Concept
    
    g.add((n1, RDF.type, obo['STATO_0000304']))
    g.add((obo['STATO_0000304'], RDFS.label, Literal('two sample t-test with unequal variance')))
    g.add((n1, RDFS.label, Literal('Statistically significant hypothesis test with IRE binding dependent variable on failing and non-failing hearts')))
    g.add((n1, obo['OBI_0000299'], n2))
    g.add((n2, RDFS.label, Literal('the p-value of the statistical hypothesis test')))
    g.add((n2, RDF.type, obo['OBI_0000175']))
    g.add((obo['OBI_0000175'], RDFS.label, Literal('p-value')))
    g.add((obo['OBI_0000299'], RDFS.label, Literal('has specified output')))
    g.add((obo['OBI_0000293'], RDFS.label, Literal('has specified input')))
    g.add((n1, obo['OBI_0000293'], n3))
    g.add((obo['OBI_0001931'], RDFS.label, Literal('scalar value specification')))
    g.add((obo['OBI_0001938'], RDFS.label, Literal('has value specification')))
    g.add((obo['OBI_0001937'], RDFS.label, Literal('has specified numeric value')))
    g.add((n2, obo['OBI_0001938'], n4))
    g.add((n4, RDFS.label, Literal('the value specification of the p-value')))
    g.add((n4, RDF.type, obo['OBI_0001931']))
    g.add((n4, obo['OBI_0001937'], Literal(tt.pvalue, datatype=XSD.double)))
    
    # Vocabulary properties labels
    g.add((RDF.type, RDFS.label, Literal('type')))
    g.add((RDFS.label, RDFS.label, Literal('label')))
    g.add((cube['dataSet'] , RDFS.label, Literal('dataSet')))
    g.add((cube['structure'] , RDFS.label, Literal('structure')))
    g.add((cube['component'] , RDFS.label, Literal('component')))  
    g.add((cube['componentProperty'] , RDFS.label, Literal('component Property')))
    g.add((cube['componentAttachment'] , RDFS.label, Literal('component Attachment')))
    g.add((cube['dimension'] , RDFS.label, Literal('dimension')))
    g.add((cube['attribute'] , RDFS.label, Literal('attribute')))
    g.add((cube['measure'] , RDFS.label, Literal('measure')))
    g.add((cube['order'] , RDFS.label, Literal('order')))
    g.add((cube['codeList'] , RDFS.label, Literal('code List')))
    g.add((SKOS.Concept , RDFS.label, Literal('Concept')))
    g.add((SKOS.inScheme, RDFS.label, Literal('in Scheme')))
    g.add((SKOS.topConceptOf, RDFS.label, Literal('top Concept Of')))
    g.add((RDFS.range, RDFS.label, Literal('Range')))

    '''
    Dataset
    '''
    g.add((n3, RDF.type, cube['DataSet']))
    g.add((n3, RDFS.label, Literal('LSUC Dataset')))
    g.add((n3, cube['structure'], n9))
    '''
    DataStructureDefinition
    '''
    g.add((n9, RDF.type, cube['DataStructureDefinition']))
    g.add((n9, RDFS.label, Literal('Data Structure Definition')))
    g.add((n9, cube['component'], n5))  
    g.add((n9, cube['component'], n6)) 
    '''
    Measures
    '''
    g.add((n5, RDF.type, cube['ComponentSpecification']))
    g.add((n5, RDFS.label, Literal('Component Specification Measure')))
    g.add((n5, cube['measure'], n7))

    g.add((n7, RDF.type, cube['MeasureProperty']))
    g.add((n7, RDF.type, cube['ComponentProperty']))
    g.add((n7, RDFS.label, Literal('iron-responsive element binding activity')))
    '''
    Dimensions
    '''
    g.add((n6, RDF.type, cube['ComponentSpecification']))
    g.add((n6, RDFS.label, Literal('Component Specification Dimension')))
    g.add((n6, cube['dimension'], n10))
    
    g.add((n10, RDF.type, cube['DimensionProperty']))
    g.add((n10, RDF.type, cube['ComponentProperty']))
    g.add((n10, RDFS.range, SKOS.Concept))
    g.add((n10, RDFS.label, Literal('left ventricular tissue sample')))
    g.add((n10, cube['codeList'], n8))
    # failingHeart Scheme
    g.add((n8, RDF.type, SKOS.ConceptScheme))
    g.add((n8, RDFS.label, Literal('Concept Scheme')))
    g.add((n8, SKOS.topConceptOf, n11))                                                     
    g.add((n8, SKOS.topConceptOf, n12))  
    # failingHeart Concept
    g.add((n11, RDF.type, SKOS.Concept))
    g.add((n11, RDFS.label, Literal('failing heart')))
    g.add((n11, SKOS.inScheme, n8))
    # non-failingHeart Concept
    g.add((n12, RDF.type, SKOS.Concept))
    g.add((n12, RDFS.label, Literal('non-failing heart')))
    g.add((n12, SKOS.inScheme, n8))
    
    conceptScheme = dict()
    conceptScheme['non-failing heart'] = n12
    conceptScheme['failing heart'] = n11
    k = 1
    for dimension in dataset:
        for observation in dataset[dimension]:
            # bno = BNode() # Observation
            bno = URIRef('http://orkg.org/vocab/lsuc/{}'.format(generate_sid()))
            g.add((bno, RDF.type, cube['Observation']))
            g.add((bno, RDFS.label, Literal('Observation #{}'.format(k))))
            g.add((bno, cube['dataSet'], n3))
            g.add((bno, n10, conceptScheme[dimension]))
            g.add((bno, n7, Literal(str(observation), datatype=XSD.double)))
            k = k + 1
    return g

    
def store(g):
    resources = {}
    predicates = {}
    for s, p, o in g:    
        if(p.n3() in [RDF.type.n3()]) and o in cube_classes:
            continue 
            
        resources, s_id = get_id(resources, api_resources, s, g)
        resources, p_id = get_id(resources, api_predicates, p, g)
        
        if type(o) is Literal:
            cls = 'literal'
            resources, o_id = get_id(resources, api_literals, o, g)
        else:
            cls = 'resource'
            resources, o_id = get_id(resources, api_resources, o, g)
        
        requests.post(api_statements,
                      json={'subject_id': s_id, 'predicate_id': p_id, 'object': {'id': o_id, '_class': cls}}, 
                      headers={'Content-Type':'application/json'})
    dataset_node = str([s for s, p, o in g.triples((None, RDF.type, cube['DataSet']))][0])
    print(resources[dataset_node])


def get_id(resources, api, r, g):
    if not str(r) in resources or api==api_predicates:
        l = None
        l_classes = []
        if type(r) is Literal:
            l = r   
        else:
            l = g.value(r, RDFS.label)
            Ts = [o for s, p , o in g.triples( (r, RDF.type, None))]
            # Set the classes of a ressource
            for T in Ts:
                if(T in cube_classes):
                    lc = 'qb:'+T.split('#')[-1]
                    l_class = requests.get(api_classes, params={'q':lc, 'exact': 'true'}, headers={'Content-Type':'application/json', 'Accept':'application/json'}).json()
                    if len(l_class) == 0:
                        l_classes.append(requests.post(api_classes, json={'label':lc}, headers={'Content-Type':'application/json'}).json()['id'])
                    if len(l_class) == 1:
                        l_classes.append(l_class[0]['id'])

            # Name a predicate with the ressource ID
            if (api==api_predicates):
                if len(Ts) > 0:
                    if not str(r) in resources:
                        l = requests.post(api_resources, json={'label':l, 'classes':l_classes}, headers={'Content-Type':'application/json'}).json()['id']
                        resources[str(r)] = l
                    else:
                        l = resources[str(r)]

        if l is None:
            raise Exception('Label is none for resource {}'.format(r))
        if (api==api_predicates):
            j = requests.get(api, params={'q':l, 'exact': 'true'}, headers={'Content-Type':'application/json', 'Accept':'application/json'}).json()
            if len(j) == 0:
                return resources, requests.post(api, json={'label':l, 'classes':l_classes}, headers={'Content-Type':'application/json'}).json()['id']
            if len(j) == 1:
                return resources, j[0]['id']
        else:
            resource_id = requests.post(api, json={'label':l, 'classes':l_classes}, headers={'Content-Type':'application/json'}).json()['id']
            resources[str(r)] = resource_id
            return resources, resource_id
    else:
        return resources, resources[str(r)]

In [6]:
labels = ['non-failing heart', 'failing heart']
data = [(99, 52), 
        (96, 40), 
        (100, 38), 
        (105, 18), 
        (np.nan, 11), 
        (np.nan, 5), 
        (np.nan, 42), 
        (np.nan, 55), 
        (np.nan, 53), 
        (np.nan, 39),
        (np.nan, 42), 
        (np.nan, 50)]

d = pd.DataFrame.from_records(data, columns=labels)
t = ttest_ind(d['non-failing heart'], 
              d['failing heart'], 
              equal_var=False, nan_policy='omit')
g = represent(d,t)
print(g.serialize(format='turtle').decode('utf-8'))
store(g)

@prefix lsuc: <http://orkg.org/vocab/lsuc/> .
@prefix ns1: <http://purl.obolibrary.org/obo/> .
@prefix orkg: <http://orkg.org/vocab/> .
@prefix qb: <http://purl.org/linked-data/cube#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix skos: <http://www.w3.org/2004/02/skos/core#> .
@prefix xml: <http://www.w3.org/XML/1998/namespace> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .

lsuc:U9Rnnc39zu a qb:Observation ;
    rdfs:label "Observation #19" ;
    lsuc:W9j77R4zt lsuc:U4R73CE1k ;
    lsuc:WEj33L4zD 4.2e+01 ;
    qb:dataSet lsuc:U9RYY746u .

lsuc:U9j3Y5EEzt a qb:Observation ;
    rdfs:label "Observation #8" ;
    lsuc:W9j77R4zt lsuc:UdRnYad1k ;
    lsuc:WEj33L4zD "NaN"^^xsd:double ;
    qb:dataSet lsuc:U9RYY746u .

lsuc:U9j3nVd96u a qb:Observation ;
    rdfs:label "Observation #12" ;
    lsuc:W9j77R4zt lsuc:UdRnYad1k ;
    lsuc:WEj33L4zD "NaN"^^xsd:double ;
    qb:dataSet lsuc:U9RYY746u .

lsuc:UdRn7J44