In [1]:
import requests
import numpy as np
import pandas as pd
from  urllib.parse import quote
from scipy.stats import ttest_ind
from rdflib import Graph, URIRef, BNode, Literal
from rdflib.namespace import RDF, XSD, RDFS

# Lookup table from OBO term id (e.g. 'OBI_0000175') to its full IRI.
# Every IRI is the shared OBO PURL prefix followed by the term id.
obo = {
    term_id: URIRef('http://purl.obolibrary.org/obo/' + term_id)
    for term_id in (
        'STATO_0000304',
        'OBI_0000299',
        'OBI_0000175',
        'OBI_0000293',
        'STATO_0000251',
        'BFO_0000051',
        'IAO_0000032',
        'OBI_0001938',
        'OBI_0001933',
        'OBI_0001931',
        'OBI_0002135',
        'OBI_0001937',
        'OBI_0000751',
        'STATO_0000205',
        'STATO_0000019',
        'STATO_0000129',
        'GO_0030350',
        'NCIT_C50577',
        'NCIT_C25168',
    )
}

# Base URL of the REST API the graph is pushed to, followed by the
# collection endpoints derived from it.
api = 'http://localhost:8000/api/'

api_resources = api + 'resources/'
api_predicates = api + 'predicates/'
api_literals = api + 'literals/'
api_statements = api + 'statements/'


def represent(df, tt):
    """Describe a hypothesis test, its p-value and its samples as an RDF graph.

    The graph holds one node for the test, a p-value node with a scalar value
    specification, a dependent-variable node and, for every column of ``df``,
    a continuous-variable node with one scalar measurement datum per
    non-missing cell. Every term used is also given an ``rdfs:label``.

    Args:
        df: Table of samples. ``list(df)`` must yield the column labels and
            ``df[label]`` must iterate over that column's cells (e.g. a
            pandas DataFrame).
        tt: Test result exposing a ``pvalue`` attribute.

    Returns:
        The populated ``rdflib.Graph``.
    """
    g = Graph()
    g.bind('obo', 'http://purl.obolibrary.org/obo/')

    test = URIRef('https://doi.org/10.1093/eurheartj/ehw333-R1')
    p_value = BNode()
    p_value_spec = BNode()
    dependent_variable = BNode()

    # Human-readable labels for the vocabulary terms, added in a fixed order.
    term_labels = (
        (RDFS.label, 'label'),
        (obo['STATO_0000304'], 'two sample t-test with unequal variance'),
        (obo['OBI_0000299'], 'has specified output'),
        (obo['OBI_0000293'], 'has specified input'),
        (obo['OBI_0000175'], 'p-value'),
        (obo['OBI_0001938'], 'has value specification'),
        (obo['OBI_0001931'], 'scalar value specification'),
        (obo['OBI_0001937'], 'has specified numeric value'),
        (obo['OBI_0000751'], 'study design dependent variable'),
        (obo['OBI_0001933'], 'value specification'),
        (obo['OBI_0002135'], 'has specified value'),
        (obo['GO_0030350'], 'iron-responsive element binding'),
        (obo['STATO_0000251'], 'continuous variable'),
        (obo['BFO_0000051'], 'has part'),
        (obo['IAO_0000032'], 'scalar measurement datum'),
    )
    for term, text in term_labels:
        g.add((term, RDFS.label, Literal(text)))

    # The hypothesis test itself.
    g.add((test, RDF.type, obo['STATO_0000304']))
    g.add((test, RDFS.label, Literal('Statistically significant hypothesis test with IRE binding dependent variable on failing and non-failing hearts')))
    g.add((RDF.type, RDFS.label, Literal('type')))

    # Output of the test: the p-value and its typed numeric value.
    g.add((test, obo['OBI_0000299'], p_value))
    g.add((p_value, RDFS.label, Literal('the p-value of the statistical hypothesis test')))
    g.add((p_value, RDF.type, obo['OBI_0000175']))
    g.add((p_value, obo['OBI_0001938'], p_value_spec))
    g.add((p_value_spec, RDFS.label, Literal('the value specification of the p-value')))
    g.add((p_value_spec, RDF.type, obo['OBI_0001931']))
    g.add((p_value_spec, obo['OBI_0001937'], Literal(tt.pvalue, datatype=XSD.double)))

    # Input of the test: the dependent variable.
    g.add((test, obo['OBI_0000293'], dependent_variable))
    g.add((dependent_variable, RDFS.label, Literal('the study design dependent variable')))
    g.add((dependent_variable, RDF.type, obo['OBI_0000751']))
    g.add((dependent_variable, RDF.type, obo['GO_0030350']))

    # Input of the test: one variable per column, one datum per present cell.
    # The counter runs across all columns so datum labels are unique.
    datum_number = 1
    for column in list(df):
        variable = BNode()
        g.add((test, obo['OBI_0000293'], variable))
        g.add((variable, RDFS.label, Literal(column)))
        g.add((variable, RDF.type, obo['STATO_0000251']))
        for cell in df[column]:
            # pd.isna also recognises None and tolerates non-numeric cells,
            # where np.isnan would raise TypeError.
            if pd.isna(cell):
                continue
            datum = BNode()
            datum_spec = BNode()
            g.add((variable, obo['BFO_0000051'], datum))
            g.add((datum, RDFS.label, Literal('the scalar measurement datum #{}'.format(datum_number))))
            g.add((datum, RDF.type, obo['IAO_0000032']))
            g.add((datum, obo['OBI_0001938'], datum_spec))
            g.add((datum_spec, RDFS.label, Literal('the value specification #{}'.format(datum_number))))
            g.add((datum_spec, RDF.type, obo['OBI_0001931']))
            # NOTE(review): stored as a plain string literal, unlike the
            # xsd:double p-value above -- confirm whether that is intended.
            g.add((datum_spec, obo['OBI_0001937'], Literal(str(cell))))
            datum_number += 1
    return g


def store(g):
    """Send every triple of ``g`` to the statements endpoint of the API.

    Subject, predicate and object are first resolved to API ids with
    ``get_id``; objects that are ``Literal`` instances are resolved against
    the literals endpoint, all other objects against the resources endpoint.

    Args:
        g: Iterable of ``(subject, predicate, object)`` triples that also
            supports the label lookup ``get_id`` performs (e.g. an
            ``rdflib.Graph``).

    Raises:
        requests.HTTPError: if the API answers a statement POST with an
            error status.
    """
    for subject, predicate, obj in g:
        subject_id = get_id(api_resources, subject, g)
        predicate_id = get_id(api_predicates, predicate, g)

        if isinstance(obj, Literal):
            object_class = 'literal'
            object_id = get_id(api_literals, obj, g)
        else:
            object_class = 'resource'
            object_id = get_id(api_resources, obj, g)

        response = requests.post(api_statements,
                                 json={'subject_id': subject_id,
                                       'predicate_id': predicate_id,
                                       'object': {'id': object_id, '_class': object_class}},
                                 headers={'Content-Type':'application/json'})
        # Surface rejected statements instead of silently dropping them.
        response.raise_for_status()


def get_id(api, r, g):
    """Return the API id for ``r``, creating the entry if none exists yet.

    The entry is looked up by label with an exact-match query. A ``Literal``
    is its own label; for anything else the label is the ``rdfs:label``
    value found in ``g``.

    Args:
        api: URL of the collection endpoint to query and, if needed, post to.
        r: The term to resolve.
        g: Graph used to look up the label of a non-literal ``r``.

    Returns:
        The ``id`` field of the single matching entry, or of the entry
        created when the lookup returned nothing.

    Raises:
        Exception: if ``r`` has no label in ``g``, or the lookup returns
            more than one entry.
        requests.HTTPError: if the API answers with an error status.
    """
    label = r if isinstance(r, Literal) else g.value(r, RDFS.label)

    if label is None:
        raise Exception('Label is none for resource {}'.format(r))

    lookup = requests.get(api, params={'q':label, 'exact': 'true'}, headers={'Content-Type':'application/json', 'Accept':'application/json'})
    # Fail here rather than treating an error body as a search result.
    lookup.raise_for_status()
    matches = lookup.json()

    if len(matches) == 0:
        created = requests.post(api, json={'label':label}, headers={'Content-Type':'application/json'})
        created.raise_for_status()
        return created.json()['id']

    if len(matches) == 1:
        return matches[0]['id']

    raise Exception('Lookup for {}, expected only one result {}'.format(label, matches))

In [2]:
# Measurements for the two groups, one column per group. The first group has
# only four values, so it is padded with NaN to the length of the second.
labels = ['non-failing heart', 'failing heart']
data = list(zip(
    [99, 96, 100, 105] + [np.nan] * 8,
    [52, 40, 38, 18, 11, 5, 42, 55, 53, 39, 42, 50],
))

d = pd.DataFrame.from_records(data, columns=labels)

# equal_var=False selects the unequal-variance test; nan_policy='omit'
# drops the NaN padding before the statistic is computed.
t = ttest_ind(d[labels[0]], d[labels[1]], equal_var=False, nan_policy='omit')

# Build the graph for this test and push it to the API.
store(represent(d, t))

t.pvalue

1.3111247517411591e-08