# Transformation of tabular dataset into CARE-SM RDF data

In [60]:
import pandas as pd
import numpy as np
import joblib
import re
import uuid
from rdflib import ConjunctiveGraph
from string import Template
from itertools import accumulate
from scipy.stats import norm, genextreme, exponweib
from datetime import datetime, date, timedelta

In [61]:
df = pd.read_csv('./syn_data.csv', index_col=0)
df

Unnamed: 0,hospital_stay_length,gcs,nb_acte,gender,entry,output,entry_code,ica,ttt,ica_therapy,...,ivh,age,nimodipine,paracetamol,nad,corotrop,morphine,dve,atl,iot
0,41.089445,17.086233,34.307297,0,0,0.0,0,0,0,0,...,0,38.712762,-1,-1,-1,-1,-1,-1,-1,25
1,21.702298,18.805639,133.523169,0,1,1.0,2,2,1,0,...,0,58.565461,89,58,26,-1,-1,116,-1,-1
2,4.627752,19.516216,85.648533,0,2,0.0,1,2,2,0,...,1,76.432889,12,-1,-1,-1,-1,40,-1,-1
3,12.830087,19.940518,17.982208,1,1,2.0,3,4,2,0,...,1,87.351874,29,-1,-1,-1,-1,-1,-1,53
4,75.675201,21.665547,132.859962,0,3,1.0,4,5,2,0,...,0,75.440254,26,-1,-1,-1,-1,79,-1,52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,23.668646,18.302874,41.950003,1,5,2.0,3,3,2,0,...,1,83.381334,22,-1,-1,-1,-1,-1,-1,40
9996,33.643450,20.634914,84.554423,0,1,1.0,1,5,1,0,...,0,49.631746,-1,-1,-1,-1,-1,24,-1,56
9997,18.122234,15.604823,4.792264,1,5,2.0,1,6,1,0,...,0,53.602286,24,-1,-1,-1,-1,-1,-1,55
9998,43.964242,18.368290,4.802513,1,2,1.0,1,3,2,0,...,1,82.388699,26,-1,-1,-1,-1,-1,-1,48


In [62]:
size_train = int(len(df) * 0.8)

In [63]:
numerical = ['hospital_stay_length', 'gcs', 'nb_acte', 'age']
categorical = ['gender', 'entry', 'entry_code', 'ica', 'ttt', 'ica_therapy', 'fever', 'o2_clinic', 'o2', 'hta', 'hct', 'tabagisme', 'etOH', 'diabete', 'headache', 'instable', 'vasospasme', 'ivh']
events = ['nimodipine',  'paracetamol', 'nad', 'corotrop', 'morphine', 'dve', 'atl', 'iot']

events_codes = {
    'nimodipine': 'C08CA06', # ACT
    'paracetamol': 'N02BE01', # ACT
    'nad': 'C01CA03', # ACT
    'corotrop': 'C01CE02', # ACT
    'morphine': 'N02AA01', # ACT
    'dve': '00P6X0Z', # Removal of Drainage Device from Cerebral Ventricle External Approach (ICD-10)
    'atl': 'Z98.6', # ICD-10
    'iot': '0BH17EZ' # ICD-10
}

#['hospital_stay_length', 'gcs', 'nb_acte', 'age']
#['gender', 'entry', 'entry_code', 'ica', 'ttt', 'ica_therapy', 'fever', 'o2_clinic', 'o2', 'hta', 'hct', 'tabagisme', 'etOH', 'diabete', 'headache', 'instable', 'vasospasme', 'ivh']
other_codes = {
    'hospital_stay_length': '1', 
    'gcs':'2', 
    'nb_acte':'3', 
    'age':'4',
    'gender':'5',
    'entry':'6',
    'entry_code':'7',
    'ica':'8',
    'ttt':'9',
    'ica_therapy':'10',
    'fever':'11',
    'o2_clinic':'12',
    'o2':'13',
    'hta':'14',
    'hct':'15',
    'tabagisme':'16',
    'etOH':'17',
    'diabete':'18',
    'headache':'19',
    'instable':'20',
    'vasospasme':'21',
    'ivh':'22'
}

In [67]:
prefix  = """   
@prefix sio: <http://semanticscience.org/resource/> .
@prefix ncit: <http://purl.obolibrary.org/obo/NCIT_> .
@prefix nvasc: <http://nvasc.org/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
"""

caresm_diagnosis_code_date_template = Template("""
    nvasc:context_$context_id {                                           
        nvasc:synth_patient_$patient_id a sio:Person ;
            sio:has-role nvasc:role_$patient_id .
        nvasc:role_$patient_id a sio:Role, <http://purl.obolibrary.org/obo/OBI_0000093> ;
            sio:is-realized-in nvasc:diag_$diag_id .
            
        nvasc:diag_$diag_id a sio:Process, <http://purl.obolibrary.org/obo/NCIT_C18020> ;
            sio:has-output nvasc:diag_output_$diag_out_id .
            
        nvasc:diag_output_$diag_out_id a sio:InformationContentEntity, 
            ncit:$diag_code. 
    }
    
    {
        nvasc:context_$context_id sio:SIO_000680 "$diag_start_date"^^xsd:dateTime ;
                              sio:SIO_000681 "$diag_end_date"^^xsd:dateTime ;
                              sio:SIO_000068 nvasc:timeline_$patient_id . # part-of 
                              
        nvasc:timeline_$patient_id a sio:SIO_000417, <http://purl.obolibrary.org/obo/NCIT_C54576> ;
                                sio:SIO_000332 nvasc:synth_patient_$patient_id ; # is-about
                                sio:SIO_000028 nvasc:context_$context_id . # this is a materialization of the has-part property
    }
    """)

caresm_diagnosis_code_template = Template("""
    nvasc:synth_patient_$patient_id a sio:Person ;
        sio:has-role nvasc:role_$patient_id .
    nvasc:role_$patient_id a sio:Role, <http://purl.obolibrary.org/obo/OBI_0000093> ;
        sio:is-realized-in nvasc:diag_$diag_id .
        
    nvasc:diag_$diag_id a sio:Process, <http://purl.obolibrary.org/obo/NCIT_C18020> ;
        sio:has-output nvasc:diag_output_$diag_out_id .
        
    nvasc:diag_output_$diag_out_id a sio:InformationContentEntity, 
        ncit:$diag_code. 
    """)
    
caresm_diagnosis_quantity_template = Template("""
    nvasc:synth_patient_$patient_id a sio:Person ;
        sio:has-role nvasc:role_$patient_id .
    nvasc:role_$patient_id a sio:Role, <http://purl.obolibrary.org/obo/OBI_0000093> ;
        sio:is-realized-in nvasc:diag_$diag_id .
        
    nvasc:diag_$diag_id a sio:Process, <http://purl.obolibrary.org/obo/NCIT_C18020> ;
        sio:has-output nvasc:diag_output_$diag_out_id .
        
    nvasc:diag_output_$diag_out_id a sio:InformationContentEntity, 
        ncit:$diag_code ; # TODO should be a proper dereferenceable URI
        sio:has-value "$diag_value"^^xsd:float ;
        sio:has-unit  "$diag_unit" . # TODO should be a URI, subClassOf sio:Unit
    """)

def gen_start_event(y_min=2020, y_max=2023):
    n_days = (y_max - y_min) * 365
    d0 = datetime.fromisoformat(f"{y_min}-01-01")
    day_rand = round(np.random.uniform(n_days))
    delta = timedelta(days=day_rand, hours=round(norm.rvs(12, 5)), minutes=round(np.random.uniform(60)))
    d_out = d0 + delta
    return(d_out)


def gen_patient_rdf(row, kg):
    _i = row.name
    d_start = gen_start_event()
    for f in row.index:
        if f in events:
            if row[f] != -1:
                h = row[f]
                d_event = d_start + timedelta(hours=h)
                diag_label = f
                diag_code = events_codes[f]
                rdf = caresm_diagnosis_code_date_template.substitute(diag_id=uuid.uuid4(),
                                                        diag_out_id=uuid.uuid4(),
                                                        context_id=uuid.uuid4(),
                                                        diag_label = f,
                                                        diag_code = events_codes[f],
                                                        diag_start_date = d_start.isoformat(),
                                                        diag_end_date = d_event.isoformat(),
                                                        patient_id = _i)
                print(row[f])
                print(d_start)
                print(rdf)
                kg.parse(data=prefix+rdf, format="trig")
        
        elif f in numerical:
            value = None
            unit = None 
            if f == "hospital_stay_length":
                value = round(row[f])
                unit = "days"
            elif  f == "gcs":
                value = round(row[f],2)
                unit = "gcs"
            elif  f == "nb_acte":
                value = round(row[f])
                unit = "received medical treatments"
            elif  f == "age":
                value = round(row[f])
                unit = "age"
            
            rdf = caresm_diagnosis_quantity_template.substitute(
                diag_id=uuid.uuid4(), 
                diag_out_id=uuid.uuid4(),
                diag_label=f,  
                diag_code=str(f) + "_" + str(row[f]),
                diag_value=value, 
                diag_unit=unit, 
                patient_id=_i)
            kg.parse(data=prefix+rdf, format="trig")
            
        # ['gender', 'entry', 'entry_code', 'ica', 'ttt', 'ica_therapy', 'fever', 'o2_clinic', 'o2', 'hta', 'hct', 'tabagisme', 'etOH', 'diabete', 'headache', 'instable', 'vasospasme', 'ivh']
        elif f in categorical:
             diag_label = f
             diag_code = row[f]
             rdf = caresm_diagnosis_code_template.substitute(
                diag_id=uuid.uuid4(), 
                diag_out_id=uuid.uuid4(),
                diag_label=f, 
                diag_code=str(f) + "_" + str(row[f]), 
                patient_id=_i)
        
             kg.parse(data=prefix+rdf, format="trig")

In [68]:
date_entrance = gen_start_event()
print(date_entrance)
date_2 = date_entrance + timedelta(hours=36)
print(date_2)

2020-10-25 17:44:00
2020-10-27 05:44:00


In [69]:
kg = ConjunctiveGraph()
test_df = df.head(100)
# print(test_df)
test_df.apply(gen_patient_rdf, axis=1, kg = kg)
print(f"Generated {len(kg)} RDF triples")
kg.serialize("caresm.trig", format="trig")
kg.serialize("caresm.nq", format="nquads")

25.0
2020-04-03 04:38:00

    nvasc:context_fd5f3740-809c-4783-89af-14517d67cb03 {                                           
        nvasc:synth_patient_0 a sio:Person ;
            sio:has-role nvasc:role_0 .
        nvasc:role_0 a sio:Role, <http://purl.obolibrary.org/obo/OBI_0000093> ;
            sio:is-realized-in nvasc:diag_febf8326-6502-42a7-9194-82b4a4917472 .
            
        nvasc:diag_febf8326-6502-42a7-9194-82b4a4917472 a sio:Process, <http://purl.obolibrary.org/obo/NCIT_C18020> ;
            sio:has-output nvasc:diag_output_886a3c2b-da13-499f-935d-4766e9a19510 .
            
        nvasc:diag_output_886a3c2b-da13-499f-935d-4766e9a19510 a sio:InformationContentEntity, 
            ncit:0BH17EZ. 
    }
    
    {
        nvasc:context_fd5f3740-809c-4783-89af-14517d67cb03 sio:SIO_000680 "2020-04-03T04:38:00"^^xsd:dateTime ;
                              sio:SIO_000681 "2020-04-04T05:38:00"^^xsd:dateTime ;
                              sio:SIO_000068 nvasc:timeline_0 . #

<Graph identifier=Nca67fb9562004e8eaa404b82b89da96d (<class 'rdflib.graph.ConjunctiveGraph'>)>

# Computing the duration of care events

In [95]:
## Understanding this code ... 
events = ['nimodipine',  'paracetamol', 'nad', 'corotrop', 'morphine', 'dve', 'atl', 'iot']
events_end = events + ['finish']

transitions = pd.read_csv('./care_transitions_probs.csv', index_col=0)
transitions

start_probs = [0.47381546, 0.09476309, 0.00997506, 0, 0.00997506, 0.24189526, 0.00249377, 0.16708229, 0]

# Generate a sequence of care events 
# The sequence is generated by starting with an initial event and then randomly selecting the next event based on the transition probabilities
def generate_care_path():
    event = np.random.choice(events_end, size=1, p=start_probs)[0]
    path = [event]

    while event != 'finish':
        event = np.random.choice(events_end, size=1, p=transitions[event].values)[0]
        # ensure that their is no duplicate event
        if event in path:
            event = 'finish'
        path += [event]
        
    # remove the finish event and return the path
    return path[:-1]
    

def generate_times_path(path):
    # Generate the time taken (in hour) for each event with a normal distribution with parameters mean=24 and std=5
    indv_times = map(round, norm.rvs(24, 5, len(path)))
    acc_times = list(accumulate(indv_times))
    # print(list(acc_times))

    sol = [-1] * len(events)
    # print(sol)
    for i, e in enumerate(path):
        sol[events.index(e)] = acc_times[i]
    
    # print(sol)
    return sol

In [96]:
generate_care_path()

['dve', 'nad', 'paracetamol', 'nimodipine']

In [98]:
df_events = pd.DataFrame([generate_times_path(generate_care_path()) for _ in range(5)], columns=events)
df_events

Unnamed: 0,nimodipine,paracetamol,nad,corotrop,morphine,dve,atl,iot
0,24,-1,-1,-1,-1,-1,-1,-1
1,24,50,-1,-1,-1,-1,-1,-1
2,-1,-1,-1,-1,-1,18,-1,-1
3,-1,-1,-1,50,-1,22,-1,62
4,29,-1,-1,-1,-1,70,-1,46
