# Transformation of tabular dataset into CARE-SM RDF data

In [24]:
import pandas as pd
import numpy as np
import joblib
import re
import uuid
from rdflib import ConjunctiveGraph
from string import Template
from itertools import accumulate
from scipy.stats import norm, genextreme, exponweib
from datetime import datetime, date, timedelta

In [25]:
df = pd.read_csv("./syn_data.csv", index_col=0)
df.rename(columns={"output": "outcome"}, inplace=True)

# Shuffling dataframe with a random seed
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
df

Unnamed: 0,hospital_stay_length,gcs,nb_acte,gender,entry,outcome,entry_code,ica,ttt,ica_therapy,...,ivh,age,nimodipine,paracetamol,nad,corotrop,morphine,dve,atl,iot
0,11.525542,18.050848,3.561477,0,1,0.0,2,1,1,0,...,0,46.653842,22,-1,-1,-1,-1,-1,-1,-1
1,4.096719,17.164788,20.830227,0,5,0.0,3,8,2,0,...,0,62.536000,25,-1,101,-1,-1,73,-1,49
2,92.015036,18.158804,29.897650,1,1,1.0,7,10,1,0,...,0,49.631746,-1,-1,-1,-1,-1,24,-1,54
3,66.217942,17.936781,45.870606,0,2,1.0,6,6,2,0,...,1,68.491810,-1,-1,-1,-1,-1,18,-1,44
4,25.694681,18.088936,4.813020,0,1,1.0,1,6,2,0,...,0,73.454985,23,-1,-1,-1,57,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9.744466,19.372323,106.752080,1,2,1.0,4,6,1,0,...,0,50.624381,13,-1,74,-1,-1,57,-1,35
9996,10.676763,15.452208,45.423368,0,1,0.0,0,0,1,0,...,0,39.705397,-1,-1,-1,-1,-1,18,-1,41
9997,33.567485,16.731739,18.703820,0,6,2.0,2,3,1,0,...,0,51.617016,22,86,-1,-1,-1,67,-1,49
9998,9.812358,18.074602,7.796590,0,4,0.0,1,2,2,0,...,0,72.462350,22,-1,102,-1,-1,83,-1,44


In [26]:
#size_train = int(len(df) * 0.8)

In [27]:
numerical = ['hospital_stay_length', 'gcs', 'nb_acte', 'age']
categorical = ['gender', 'entry', 'entry_code', 'ica', 'ttt', 'ica_therapy', 'fever', 'o2_clinic', 'o2', 'hta', 'hct', 'tabagisme', 'etOH', 'diabete', 'headache', 'instable', 'vasospasme', 'ivh', 'outcome']
events = ['nimodipine',  'paracetamol', 'nad', 'corotrop', 'morphine', 'dve', 'atl', 'iot']

events_codes = {
    'nimodipine': 'C08CA06', # ACT
    'paracetamol': 'N02BE01', # ACT
    'nad': 'C01CA03', # ACT
    'corotrop': 'C01CE02', # ACT
    'morphine': 'N02AA01', # ACT
    'dve': '00P6X0Z', # Removal of Drainage Device from Cerebral Ventricle External Approach (ICD-10)
    'atl': 'Z98.6', # ICD-10
    'iot': '0BH17EZ' # ICD-10
}

#['hospital_stay_length', 'gcs', 'nb_acte', 'age']
#['gender', 'entry', 'entry_code', 'ica', 'ttt', 'ica_therapy', 'fever', 'o2_clinic', 'o2', 'hta', 'hct', 'tabagisme', 'etOH', 'diabete', 'headache', 'instable', 'vasospasme', 'ivh']
other_codes = {
    'hospital_stay_length': '1', 
    'gcs':'2', 
    'nb_acte':'3', 
    'age':'4',
    'gender':'5',
    'entry':'6',
    'entry_code':'7',
    'ica':'8',
    'ttt':'9',
    'ica_therapy':'10',
    'fever':'11',
    'o2_clinic':'12',
    'o2':'13',
    'hta':'14',
    'hct':'15',
    'tabagisme':'16',
    'etOH':'17',
    'diabete':'18',
    'headache':'19',
    'instable':'20',
    'vasospasme':'21',
    'ivh':'22'
}

In [28]:
prefix  = """   
@prefix sio: <http://semanticscience.org/resource/> .
@prefix ncit: <http://purl.obolibrary.org/obo/NCIT_> .
@prefix nvasc: <http://nvasc.org/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
"""

caresm_diagnosis_code_date_template = Template("""
    
        nvasc:synth_patient_$patient_id a sio:Person ;  
            sio:has-role nvasc:role_$patient_id .
        
        nvasc:role_$patient_id a sio:Role, <http://purl.obolibrary.org/obo/OBI_0000093> ; 
            sio:is-realized-in nvasc:diag_$diag_id . 
            
        nvasc:diag_$diag_id a sio:Process, <http://purl.obolibrary.org/obo/NCIT_C18020> ;
            sio:has-output nvasc:diag_output_$diag_out_id ;
            nvasc:hasTimePoint nvasc:context_$context_id .
            
        nvasc:diag_output_$diag_out_id a sio:InformationContentEntity, 
            ncit:$diag_code. 
    
        nvasc:context_$context_id sio:SIO_000680 "$diag_start_date"^^xsd:dateTime ; # start date
                sio:SIO_000681 "$diag_end_date"^^xsd:dateTime ; # end date
                sio:SIO_000068 nvasc:timeline_$patient_id . # part-of 
                              
        nvasc:timeline_$patient_id a sio:SIO_000417, <http://purl.obolibrary.org/obo/NCIT_C54576> ;
                sio:SIO_000332 nvasc:synth_patient_$patient_id ; # is-about
                sio:SIO_000028 nvasc:context_$context_id . # this is a materialization of the has-part property
    """)

caresm_diagnosis_code_template = Template("""
    nvasc:synth_patient_$patient_id a sio:Person ;
        sio:has-role nvasc:role_$patient_id .
    nvasc:role_$patient_id a sio:Role, <http://purl.obolibrary.org/obo/OBI_0000093> ;
        sio:is-realized-in nvasc:diag_$diag_id .
        
    nvasc:diag_$diag_id a sio:Process, <http://purl.obolibrary.org/obo/NCIT_C18020> ;
        sio:has-output nvasc:diag_output_$diag_out_id .
        
    nvasc:diag_output_$diag_out_id a sio:InformationContentEntity, 
        ncit:$diag_code. 
    """)

caresm_diagnosis_quantity_template = Template("""
    nvasc:synth_patient_$patient_id a sio:Person ;
        sio:has-role nvasc:role_$patient_id .
    nvasc:role_$patient_id a sio:Role, <http://purl.obolibrary.org/obo/OBI_0000093> ;
        sio:is-realized-in nvasc:diag_$diag_id .
        
    nvasc:diag_$diag_id a sio:Process, <http://purl.obolibrary.org/obo/NCIT_C18020> ;
        sio:has-output nvasc:diag_output_$diag_out_id .
        
    nvasc:diag_output_$diag_out_id a sio:InformationContentEntity, 
        ncit:$diag_code ; # TODO should be a proper dereferenceable URI
        sio:has-value "$diag_value"^^xsd:float ;
        sio:has-unit  "$diag_unit" . # TODO should be a URI, subClassOf sio:Unit
    """)

nvasc_outcome = Template(
    """
    nvasc:synth_patient_$patient_id nvasc:hasOutcome nvasc:outcome_$outcome .
    """
)

def gen_start_event(y_min=2020, y_max=2023):
    n_days = (y_max - y_min) * 365
    d0 = datetime.fromisoformat(f"{y_min}-01-01")
    day_rand = round(np.random.uniform(n_days))
    delta = timedelta(days=day_rand, hours=round(norm.rvs(12, 5)), minutes=round(np.random.uniform(60)))
    d_out = d0 + delta
    return(d_out)


def gen_patient_rdf(row, kg):
    _i = row.name
    d_start = gen_start_event()
    for f in row.index:
        if f in events:
            if row[f] != -1:
                h = row[f]
                # make sure that there is no multiple events with the same timestamp
                if h == 0:
                    h = 1
                d_event = d_start + timedelta(hours=h)
                print(d_event)
                # diag_label = f
                # diag_code = events_codes[f]
                rdf = caresm_diagnosis_code_date_template.substitute(
                    diag_id=uuid.uuid4(),
                    diag_out_id=uuid.uuid4(),
                    context_id=uuid.uuid4(),
                    diag_label=f,
                    diag_code=events_codes[f],
                    diag_start_date=d_event.isoformat(),
                    diag_end_date=d_event.isoformat(),
                    patient_id=_i,
                )
                # print(row[f])
                # print(d_start)
                # print(rdf)
                kg.parse(data=prefix+rdf, format="turtle")

        elif f in numerical:
            value = None
            unit = None 
            if f == "hospital_stay_length":
                value = round(row[f])
                unit = "days"
            elif  f == "gcs":
                value = round(row[f],2)
                unit = "gcs"
            elif  f == "nb_acte":
                value = round(row[f])
                unit = "received medical treatments"
            elif  f == "age":
                value = round(row[f])
                unit = "age"

            rdf = caresm_diagnosis_quantity_template.substitute(
                diag_id=uuid.uuid4(), 
                diag_out_id=uuid.uuid4(),
                diag_label=f,  
                diag_code=str(f) + "_" + str(row[f]),
                diag_value=value, 
                diag_unit=unit, 
                patient_id=_i)
            kg.parse(data=prefix + rdf, format="turtle")

        # ['gender', 'entry', 'entry_code', 'ica', 'ttt', 'ica_therapy', 'fever', 'o2_clinic', 'o2', 'hta', 'hct', 'tabagisme', 'etOH', 'diabete', 'headache', 'instable', 'vasospasme', 'ivh']
        elif f in categorical:
            if f in ["outcome"]:
                outcome_value = row[f]
                rdf = nvasc_outcome.substitute(outcome=outcome_value, patient_id=_i)
                print(rdf)
                kg.parse(data=prefix + rdf, format="turtle")
            else:
                rdf = caresm_diagnosis_code_template.substitute(
                    diag_id=uuid.uuid4(), 
                    diag_out_id=uuid.uuid4(),
                    diag_label=f, 
                    diag_code=str(f) + "_" + str(row[f]), 
                    patient_id=_i)
                kg.parse(data=prefix + rdf, format="turtle")

In [39]:
#date_entrance = gen_start_event()
#print(date_entrance)
#date_2 = date_entrance + timedelta(hours=36)
#print(date_2)

## 1.0 Timestamped graph

In [None]:
N = 2
N_train = int(N * 0.95)
print("size training set: " + str(N_train))

N_test = N - N_train
print("size test set: " + str(N_test))
assert N == N_train + N_test

train_df = df.iloc[0:N]
with_outcome = df.iloc[0:N_train]
display(with_outcome)

print(df.columns)

no_outcome = df.iloc[N_train:N]
no_outcome = no_outcome.drop(columns=["outcome"])
display(no_outcome)

kg = ConjunctiveGraph()
with_outcome.apply(gen_patient_rdf, axis=1, kg=kg)
no_outcome.apply(gen_patient_rdf, axis=1, kg=kg)
print(f"Generated {len(kg)} RDF triples")

## SERIALIZATION with timestamps
kg.serialize(f"caresm_3ples_TS_{N_train}_{N_test}.ttl", format="turtle")
kg.serialize(f"caresm_3ples_TS_{N_train}_{N_test}.nt", format="ntriples")
joblib.dump(
    train_df["outcome"].astype(int).to_list(),
    f"caresm_3ples_TS{N_train}_{N_test}.outcomes" + ".joblib",
)
display(train_df["outcome"])
display(train_df["outcome"].value_counts())

size training set: 1
size test set: 1


Unnamed: 0,hospital_stay_length,gcs,nb_acte,gender,entry,outcome,entry_code,ica,ttt,ica_therapy,...,ivh,age,nimodipine,paracetamol,nad,corotrop,morphine,dve,atl,iot
0,11.525542,18.050848,3.561477,0,1,0.0,2,1,1,0,...,0,46.653842,22,-1,-1,-1,-1,-1,-1,-1


Index(['hospital_stay_length', 'gcs', 'nb_acte', 'gender', 'entry', 'outcome',
       'entry_code', 'ica', 'ttt', 'ica_therapy', 'fever', 'o2_clinic', 'o2',
       'hta', 'hct', 'tabagisme', 'etOH', 'diabete', 'headache', 'instable',
       'vasospasme', 'ivh', 'age', 'nimodipine', 'paracetamol', 'nad',
       'corotrop', 'morphine', 'dve', 'atl', 'iot'],
      dtype='object')


Unnamed: 0,hospital_stay_length,gcs,nb_acte,gender,entry,entry_code,ica,ttt,ica_therapy,fever,...,ivh,age,nimodipine,paracetamol,nad,corotrop,morphine,dve,atl,iot
1,4.096719,17.164788,20.830227,0,5,3,8,2,0,0,...,0,62.536,25,-1,101,-1,-1,73,-1,49



    nvasc:synth_patient_0 nvasc:hasOutcome nvasc:outcome_0.0 .
    
2021-08-23 06:22:00
2021-07-18 11:43:00
2021-07-21 15:43:00
2021-07-20 11:43:00
2021-07-19 11:43:00
Generated 350 RDF triples


In [34]:
before_query = """
CONSTRUCT {
    ?context1 time:before ?context2 .
} WHERE {
#SELECT * WHERE {
    ?timeline rdf:type <http://purl.obolibrary.org/obo/NCIT_C54576> .
    ?timeline sio:SIO_000028 ?context1, ?context2 .
    ?context1 sio:SIO_000680 ?start1 .
    ?context2 sio:SIO_000680 ?start2 .
    
    filter((?start1 < ?start2) && (?context1 != ?context2))
}
"""
res = kg.query(before_query)
print(len(res))
for t in res:
    # print(t["start1"], t["start2"])
    print(t)
    kg.add(t)
print(f"Generated {len(kg)} RDF triples")

6
(rdflib.term.URIRef('http://nvasc.org/context_b5bd8f4d-e3d3-4167-88c1-5c9267ec30c8'), rdflib.term.URIRef('http://www.w3.org/2006/time#before'), rdflib.term.URIRef('http://nvasc.org/context_5b69cfe0-edeb-4d43-830a-32c4f8d8b482'))
(rdflib.term.URIRef('http://nvasc.org/context_ec522650-2844-438e-aae6-5306b103fdb0'), rdflib.term.URIRef('http://www.w3.org/2006/time#before'), rdflib.term.URIRef('http://nvasc.org/context_5b69cfe0-edeb-4d43-830a-32c4f8d8b482'))
(rdflib.term.URIRef('http://nvasc.org/context_b5bd8f4d-e3d3-4167-88c1-5c9267ec30c8'), rdflib.term.URIRef('http://www.w3.org/2006/time#before'), rdflib.term.URIRef('http://nvasc.org/context_ec522650-2844-438e-aae6-5306b103fdb0'))
(rdflib.term.URIRef('http://nvasc.org/context_b5bd8f4d-e3d3-4167-88c1-5c9267ec30c8'), rdflib.term.URIRef('http://www.w3.org/2006/time#before'), rdflib.term.URIRef('http://nvasc.org/context_d6d3aa3c-52a6-487b-b15f-79dfe2e9b0bc'))
(rdflib.term.URIRef('http://nvasc.org/context_ec522650-2844-438e-aae6-5306b103fdb0

In [37]:
delete_ts_query = """
DELETE {
    ?context1 sio:SIO_000680 ?start .
    ?context2 sio:SIO_000681 ?end .
} WHERE {
    ?context1 sio:SIO_000680 ?start .
    ?context2 sio:SIO_000681 ?end .
}
"""
kg.update(delete_ts_query)
# print(len(res))
# for t in res:
# print(t["start1"], t["start2"])
##    print(t)
# kg.add(t)
print(f"KG length = {len(kg)} RDF triples")
kg.serialize(f"caresm_3ples_TR_{N_train}_{N_test}.ttl", format="turtle")
kg.serialize(f"caresm_3ples_TR_{N_train}_{N_test}.nt", format="ntriples")

KG length = 346 RDF triples




<Graph identifier=N289f8a579cf9462eb60c15cebbf39899 (<class 'rdflib.graph.ConjunctiveGraph'>)>

In [38]:
delete_tr_query = """
DELETE {
    ?x time:before ?y .
} WHERE {
    ?x time:before ?y .
}
"""
kg.update(delete_tr_query)
print(f"KG length = {len(kg)} RDF triples")
kg.serialize(f"caresm_3ples_{N_train}_{N_test}.ttl", format="turtle")
kg.serialize(f"caresm_3ples_{N_train}_{N_test}.nt", format="ntriples")

KG length = 340 RDF triples


<Graph identifier=N289f8a579cf9462eb60c15cebbf39899 (<class 'rdflib.graph.ConjunctiveGraph'>)>

In [13]:
kg.serialize(f"caresm_3ples_TS_TR_{N_train}_{N_test}.ttl", format="turtle")
kg.serialize(f"caresm_3ples_TS_TR_{N_train}_{N_test}.nt", format="ntriples")
joblib.dump(train_df["outcome"].astype(int).to_list(), "caresm_3ples__TS_TR_{N_train}_{N_test}.outcomes" + ".joblib")
display(train_df["outcome"])
display(train_df["outcome"].value_counts())



0    0.0
1    0.0
Name: outcome, dtype: float64

outcome
0.0    2
Name: count, dtype: int64

# Computing the duration of care events

In [95]:
## Understanding this code ... 
events = ['nimodipine',  'paracetamol', 'nad', 'corotrop', 'morphine', 'dve', 'atl', 'iot']
events_end = events + ['finish']

transitions = pd.read_csv('./care_transitions_probs.csv', index_col=0)
transitions

start_probs = [0.47381546, 0.09476309, 0.00997506, 0, 0.00997506, 0.24189526, 0.00249377, 0.16708229, 0]

# Generate a sequence of care events 
# The sequence is generated by starting with an initial event and then randomly selecting the next event based on the transition probabilities
def generate_care_path():
    event = np.random.choice(events_end, size=1, p=start_probs)[0]
    path = [event]

    while event != 'finish':
        event = np.random.choice(events_end, size=1, p=transitions[event].values)[0]
        # ensure that their is no duplicate event
        if event in path:
            event = 'finish'
        path += [event]
        
    # remove the finish event and return the path
    return path[:-1]
    

def generate_times_path(path):
    # Generate the time taken (in hour) for each event with a normal distribution with parameters mean=24 and std=5
    indv_times = map(round, norm.rvs(24, 5, len(path)))
    acc_times = list(accumulate(indv_times))
    # print(list(acc_times))

    sol = [-1] * len(events)
    # print(sol)
    for i, e in enumerate(path):
        sol[events.index(e)] = acc_times[i]
    
    # print(sol)
    return sol

In [96]:
generate_care_path()

['dve', 'nad', 'paracetamol', 'nimodipine']

In [98]:
df_events = pd.DataFrame([generate_times_path(generate_care_path()) for _ in range(5)], columns=events)
df_events

Unnamed: 0,nimodipine,paracetamol,nad,corotrop,morphine,dve,atl,iot
0,24,-1,-1,-1,-1,-1,-1,-1
1,24,50,-1,-1,-1,-1,-1,-1
2,-1,-1,-1,-1,-1,18,-1,-1
3,-1,-1,-1,50,-1,22,-1,62
4,29,-1,-1,-1,-1,70,-1,46
