# Transformation of tabular dataset into SPHN RDF data

In [102]:
import pandas as pd
import numpy as np
import joblib
import re
import uuid
from rdflib import ConjunctiveGraph
from string import Template
from itertools import accumulate
from scipy.stats import norm, genextreme, exponweib
from datetime import datetime, date, timedelta

In [3]:
# Load the tabular dataset; the first CSV column becomes the row index.
df = pd.read_csv('./syn_data.csv', index_col=0)
df

Unnamed: 0,hospital_stay_length,gcs,nb_acte,gender,entry,output,entry_code,ica,ttt,ica_therapy,...,ivh,age,nimodipine,paracetamol,nad,corotrop,morphine,dve,atl,iot
0,41.089445,17.086233,34.307297,0,0,0.0,0,0,0,0,...,0,38.712762,-1,-1,-1,-1,-1,-1,-1,25
1,21.702298,18.805639,133.523169,0,1,1.0,2,2,1,0,...,0,58.565461,89,58,26,-1,-1,116,-1,-1
2,4.627752,19.516216,85.648533,0,2,0.0,1,2,2,0,...,1,76.432889,12,-1,-1,-1,-1,40,-1,-1
3,12.830087,19.940518,17.982208,1,1,2.0,3,4,2,0,...,1,87.351874,29,-1,-1,-1,-1,-1,-1,53
4,75.675201,21.665547,132.859962,0,3,1.0,4,5,2,0,...,0,75.440254,26,-1,-1,-1,-1,79,-1,52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,23.668646,18.302874,41.950003,1,5,2.0,3,3,2,0,...,1,83.381334,22,-1,-1,-1,-1,-1,-1,40
9996,33.643450,20.634914,84.554423,0,1,1.0,1,5,1,0,...,0,49.631746,-1,-1,-1,-1,-1,24,-1,56
9997,18.122234,15.604823,4.792264,1,5,2.0,1,6,1,0,...,0,53.602286,24,-1,-1,-1,-1,-1,-1,55
9998,43.964242,18.368290,4.802513,1,2,1.0,1,3,2,0,...,1,82.388699,26,-1,-1,-1,-1,-1,-1,48


In [4]:
# 80 % of the number of rows, truncated to an integer.
# NOTE(review): not referenced by any other visible cell — confirm it is still needed.
size_train = int(len(df) * 0.8)

In [27]:
# Columns exported as quantities (value + unit).
numerical = [
    'hospital_stay_length',
    'gcs',
    'nb_acte',
    'age',
]

# Columns exported as coded values.
categorical = [
    'gender',
    'entry',
    'entry_code',
    'ica',
    'ttt',
    'ica_therapy',
    'fever',
    'o2_clinic',
    'o2',
    'hta',
    'hct',
    'tabagisme',
    'etOH',
    'diabete',
    'headache',
    'instable',
    'vasospasme',
    'ivh',
]

# Care-event columns; a cell holds an hour offset, or -1 when the event
# did not occur for that patient.
events = [
    'nimodipine',
    'paracetamol',
    'nad',
    'corotrop',
    'morphine',
    'dve',
    'atl',
    'iot',
]

# Terminology code attached to each care event.
# Drugs use ATC codes; procedures use ICD-10 family codes.
# NOTE(review): the 7-character procedure codes look like ICD-10-PCS — confirm.
events_codes = {
    # --- drugs (ATC) ---
    'nimodipine': 'C08CA06',
    'paracetamol': 'N02BE01',
    'nad': 'C01CA03',
    'corotrop': 'C01CE02',
    'morphine': 'N02AA01',
    # --- procedures (ICD-10) ---
    # Removal of Drainage Device from Cerebral Ventricle, External Approach
    'dve': '00P6X0Z',
    'atl': 'Z98.6',
    'iot': '0BH17EZ',
}

In [158]:
# Turtle @prefix declarations. This text is concatenated in front of the
# generated Turtle before parsing, so the prefixed names used by the
# templates below (sphn:, nvasc:, rdfs:, xsd:, rdf:) resolve.
prefix  = """   
@prefix sphn: <http://sphn.org/> .
@prefix nvasc: <http://nvasc.org/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
"""

# Coded entry with a timestamp.
# Placeholders: $diag_id, $diag_label, $diag_code, $diag_date (xsd:dateTime
# lexical form), $patient_id.
sphn_diagnosis_code_date_template = Template("""
    nvasc:$diag_id a sphn:Diagnosis ;
        rdfs:label "$diag_label"^^xsd:string ;
        sphn:hasCode "$diag_code"^^xsd:string ;
        sphn:hasRecordDateTime "$diag_date"^^xsd:dateTime ;
        sphn:hasSubjectPseudoIdentifier nvasc:synth_patient_$patient_id .
    """)

# Coded entry without a timestamp.
# Placeholders: $diag_id, $diag_label, $diag_code, $patient_id.
sphn_diagnosis_code_template = Template("""
    nvasc:$diag_id a sphn:Diagnosis ;
        rdfs:label "$diag_label"^^xsd:string ;
        sphn:hasCode "$diag_code"^^xsd:string ;
        sphn:hasSubjectPseudoIdentifier nvasc:synth_patient_$patient_id .
    """)
    
# Entry carrying a quantity (value + unit) on a blank node.
# Placeholders: $diag_id, $diag_label, $diag_value, $diag_unit, $patient_id.
# Unlike the two templates above, the literals here carry no explicit
# datatype, so the value is written as a plain string literal.
sphn_diagnosis_quantity_template = Template("""
    nvasc:$diag_id a sphn:Diagnosis ;
        rdfs:label "$diag_label" ;
        sphn:hasQuantity [ rdf:type sphn:Quantity ;
                            sphn:hasValue "$diag_value" ;
                            sphn:hasUnit "$diag_unit" ] ;
        sphn:hasSubjectPseudoIdentifier nvasc:synth_patient_$patient_id .
    """)

def gen_start_event(y_min=2020, y_max=2023):
    """Draw a random start timestamp between two calendar years.

    The day is uniform over the calendar days from 1 January ``y_min`` up to
    (but excluding) 1 January ``y_max``. The hour of day is drawn from a
    normal distribution centred on noon (standard deviation 5 h) and clipped
    to 0-23 so the result never leaves the requested range. The minute is
    uniform over 0-59.

    Args:
        y_min: First year of the range (inclusive).
        y_max: Year at which the range ends (exclusive).

    Returns:
        A naive ``datetime`` in ``[y_min-01-01 00:00, y_max-01-01 00:00)``.

    Raises:
        ValueError: If ``y_max`` is not strictly greater than ``y_min``.
    """
    if y_max <= y_min:
        raise ValueError(f"y_max ({y_max}) must be greater than y_min ({y_min})")

    d0 = datetime(y_min, 1, 1)
    # Derive the day count from the calendar so leap years are accounted for.
    n_days = (datetime(y_max, 1, 1) - d0).days

    # randint's upper bound is exclusive. (A single positional argument to
    # np.random.uniform sets `low`, not `high`, so it must not be used as
    # "uniform between 0 and n".)
    day_rand = int(np.random.randint(0, n_days))
    hour_rand = min(max(round(norm.rvs(12, 5)), 0), 23)
    minute_rand = int(np.random.randint(0, 60))

    return d0 + timedelta(days=day_rand, hours=hour_rand, minutes=minute_rand)


# How each numerical feature is exported as a quantity:
# feature name -> (rounding digits, unit label). ``None`` digits means
# "round to the nearest integer".
_QUANTITY_RULES = {
    "hospital_stay_length": (None, "days"),
    "gcs": (2, "gcs"),
    "nb_acte": (None, "received medical treatments"),
    "age": (None, "age"),
}


def gen_patient_rdf(row, kg, verbose=True):
    """Add the RDF description of one patient (one dataframe row) to ``kg``.

    Every column of ``row`` is dispatched on the module-level feature lists:

    * ``events``: the cell is an hour offset from a randomly drawn start
      timestamp; ``-1`` means the event did not occur and is skipped.
      Emitted with the timestamped code template.
    * ``numerical``: emitted with the quantity template, rounded and given a
      unit according to ``_QUANTITY_RULES``.
    * ``categorical``: emitted with the code template, using the raw cell
      value as the code.

    Columns belonging to none of the lists are ignored. Each emitted entry
    gets a fresh ``uuid4`` identifier. All fragments of the row are parsed
    in a single Turtle document, so the parser runs once per patient rather
    than once per feature.

    Args:
        row: A pandas Series (one dataframe row); ``row.name`` is used as the
            patient identifier.
        kg: rdflib graph that receives the triples (mutated in place).
        verbose: When true (default), print the offset, the start timestamp
            and the generated Turtle for every care event.

    Returns:
        None.
    """
    patient_id = row.name
    d_start = gen_start_event()
    fragments = []

    for feature in row.index:
        raw = row[feature]

        if feature in events:
            if raw == -1:
                continue
            # float(): timedelta rejects NumPy integer scalars.
            d_event = d_start + timedelta(hours=float(raw))
            rdf = sphn_diagnosis_code_date_template.substitute(
                diag_id=uuid.uuid4(),
                diag_label=feature,
                diag_code=events_codes[feature],
                diag_date=d_event.isoformat(),
                patient_id=patient_id,
            )
            if verbose:
                print(raw)
                print(d_start)
                print(rdf)

        elif feature in numerical:
            rule = _QUANTITY_RULES.get(feature)
            if rule is None:
                # No rounding/unit rule known: skip rather than emit the
                # literal strings "None" as value and unit.
                continue
            ndigits, unit = rule
            rdf = sphn_diagnosis_quantity_template.substitute(
                diag_id=uuid.uuid4(),
                diag_label=feature,
                diag_value=round(raw, ndigits),
                diag_unit=unit,
                patient_id=patient_id,
            )

        elif feature in categorical:
            rdf = sphn_diagnosis_code_template.substitute(
                diag_id=uuid.uuid4(),
                diag_label=feature,
                diag_code=raw,
                patient_id=patient_id,
            )

        else:
            continue

        fragments.append(rdf)

    if fragments:
        kg.parse(data=prefix + "".join(fragments), format="turtle")

In [159]:
# Sanity check: draw one start timestamp and shift it by 36 hours.
date_entrance = gen_start_event()
date_2 = date_entrance + timedelta(hours=36)
print(date_entrance)
print(date_2)


2020-10-07 15:24:00
2020-10-09 03:24:00


In [160]:
# Build a graph from the first 10 rows only. apply() is used purely for its
# side effect: gen_patient_rdf adds each row's triples to `kg`.
kg = ConjunctiveGraph()
test_df = df.head(10)
# print(test_df)
test_df.apply(gen_patient_rdf, axis=1, kg = kg)
print(f"Generated {len(kg)} RDF triples")
# Write the same graph twice: as Turtle and as N-Triples.
kg.serialize("sphn.ttl", format="turtle")
kg.serialize("sphn.nt", format="nt")

25.0
2022-07-17 13:11:00

    nvasc:29e93833-c92b-411f-b94c-e351a5ff4e61 a sphn:Diagnosis ;
        rdfs:label "iot"^^xsd:string ;
        sphn:hasCode "0BH17EZ"^^xsd:string ;
        sphn:hasRecordDateTime "2022-07-18T14:11:00"^^xsd:dateTime ;
        sphn:hasSubjectPseudoIdentifier nvasc:synth_patient_0 .
    
89.0
2020-12-28 15:19:00

    nvasc:2105db2f-8218-4d14-bbf5-91c641501a38 a sphn:Diagnosis ;
        rdfs:label "nimodipine"^^xsd:string ;
        sphn:hasCode "C08CA06"^^xsd:string ;
        sphn:hasRecordDateTime "2021-01-01T08:19:00"^^xsd:dateTime ;
        sphn:hasSubjectPseudoIdentifier nvasc:synth_patient_1 .
    
58.0
2020-12-28 15:19:00

    nvasc:1816d55c-81d3-44c8-a97f-aeb9eb22009d a sphn:Diagnosis ;
        rdfs:label "paracetamol"^^xsd:string ;
        sphn:hasCode "N02BE01"^^xsd:string ;
        sphn:hasRecordDateTime "2020-12-31T01:19:00"^^xsd:dateTime ;
        sphn:hasSubjectPseudoIdentifier nvasc:synth_patient_1 .
    
26.0
2020-12-28 15:19:00

    nvasc:240c963



<Graph identifier=Nf2efb0d6b1764112890d9470efa67711 (<class 'rdflib.graph.ConjunctiveGraph'>)>

# Computing the duration of care events

In [95]:
# Inputs of the care-path generator.
# Care-event names; generate_times_path() returns one value per name, in this order.
events = ['nimodipine',  'paracetamol', 'nad', 'corotrop', 'morphine', 'dve', 'atl', 'iot']
# Same names plus the terminal pseudo-event that ends a care path.
events_end = events + ['finish']

# Transition table; generate_care_path() uses the column `transitions[event]`
# as the probability of each entry of `events_end` being the next event.
transitions = pd.read_csv('./care_transitions_probs.csv', index_col=0)
transitions

# Probability of each entry of `events_end` (same order) being the first event.
start_probs = [0.47381546, 0.09476309, 0.00997506, 0, 0.00997506, 0.24189526, 0.00249377, 0.16708229, 0]

# Generate a sequence of care events.
# The first event is drawn from `start_probs`; each following event is drawn
# from the transition probabilities of the current event.
def generate_care_path():
    """Sample one care path as a list of event names.

    The walk stops when 'finish' is drawn, or as soon as an event that is
    already in the path is drawn again, so there is no duplicate event in
    the result. The terminating 'finish' marker is not part of the returned
    list.
    """
    current = np.random.choice(events_end, size=1, p=start_probs)[0]
    visited = [current]

    while current != 'finish':
        candidate = np.random.choice(events_end, size=1, p=transitions[current].values)[0]
        # A repeated event ends the path instead of being recorded twice.
        current = 'finish' if candidate in visited else candidate
        visited.append(current)

    # Drop the trailing 'finish' marker.
    return visited[:-1]
    

def generate_times_path(path, mean_hours=24, std_hours=5, event_names=None):
    """Turn a care path into one cumulative hour offset per known event.

    The time taken by each event of ``path`` is drawn from a normal
    distribution and rounded to a whole number of hours. The offset of an
    event is the running sum of these durations up to and including it.

    Args:
        path: Event names in the order they occur.
        mean_hours: Mean of the per-event duration, in hours.
        std_hours: Standard deviation of the per-event duration, in hours.
        event_names: Names defining the layout of the result. Defaults to the
            module-level ``events`` list.

    Returns:
        A list with one entry per name in ``event_names``: the cumulative
        offset of that event, or ``-1`` if the event is not in ``path``.

    Raises:
        ValueError: If ``path`` contains a name missing from ``event_names``.
    """
    names = events if event_names is None else list(event_names)
    sol = [-1] * len(names)
    if len(path) == 0:
        return sol

    indv_times = (round(t) for t in norm.rvs(mean_hours, std_hours, len(path)))
    for event, elapsed in zip(path, accumulate(indv_times)):
        sol[names.index(event)] = elapsed

    return sol

In [96]:
# Sample a single care path to inspect the generator's output.
generate_care_path()

['dve', 'nad', 'paracetamol', 'nimodipine']

In [98]:
# Five sampled care paths, one per row and one column per event: a cell holds
# the cumulative hour offset of the event, or -1 if the event is not in the path.
df_events = pd.DataFrame([generate_times_path(generate_care_path()) for _ in range(5)], columns=events)
df_events

Unnamed: 0,nimodipine,paracetamol,nad,corotrop,morphine,dve,atl,iot
0,24,-1,-1,-1,-1,-1,-1,-1
1,24,50,-1,-1,-1,-1,-1,-1
2,-1,-1,-1,-1,-1,18,-1,-1
3,-1,-1,-1,50,-1,22,-1,62
4,29,-1,-1,-1,-1,70,-1,46
