# Transformation of tabular dataset into SPHN RDF data

In [17]:
import pandas as pd
import numpy as np
import joblib
import re
import uuid
from rdflib import ConjunctiveGraph
from string import Template
from itertools import accumulate
from scipy.stats import norm, genextreme, exponweib
from datetime import datetime, date, timedelta

In [18]:
df = pd.read_csv('./syn_data.csv', index_col=0)
df.rename(columns={'output': 'outcome'}, inplace=True)
df

Unnamed: 0,hospital_stay_length,gcs,nb_acte,gender,entry,outcome,entry_code,ica,ttt,ica_therapy,...,ivh,age,nimodipine,paracetamol,nad,corotrop,morphine,dve,atl,iot
0,41.089445,17.086233,34.307297,0,0,0.0,0,0,0,0,...,0,38.712762,-1,-1,-1,-1,-1,-1,-1,25
1,21.702298,18.805639,133.523169,0,1,1.0,2,2,1,0,...,0,58.565461,89,58,26,-1,-1,116,-1,-1
2,4.627752,19.516216,85.648533,0,2,0.0,1,2,2,0,...,1,76.432889,12,-1,-1,-1,-1,40,-1,-1
3,12.830087,19.940518,17.982208,1,1,2.0,3,4,2,0,...,1,87.351874,29,-1,-1,-1,-1,-1,-1,53
4,75.675201,21.665547,132.859962,0,3,1.0,4,5,2,0,...,0,75.440254,26,-1,-1,-1,-1,79,-1,52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,23.668646,18.302874,41.950003,1,5,2.0,3,3,2,0,...,1,83.381334,22,-1,-1,-1,-1,-1,-1,40
9996,33.643450,20.634914,84.554423,0,1,1.0,1,5,1,0,...,0,49.631746,-1,-1,-1,-1,-1,24,-1,56
9997,18.122234,15.604823,4.792264,1,5,2.0,1,6,1,0,...,0,53.602286,24,-1,-1,-1,-1,-1,-1,55
9998,43.964242,18.368290,4.802513,1,2,1.0,1,3,2,0,...,1,82.388699,26,-1,-1,-1,-1,-1,-1,48


In [19]:
size_train = int(len(df) * 0.8)

In [20]:
numerical = ['hospital_stay_length', 'gcs', 'nb_acte', 'age']
categorical = ['gender', 'entry', 'entry_code', 'ica', 'ttt', 'ica_therapy', 'fever', 'o2_clinic', 'o2', 'hta', 'hct', 'tabagisme', 'etOH', 'diabete', 'headache', 'instable', 'vasospasme', 'ivh', 'outcome']
events = ['nimodipine',  'paracetamol', 'nad', 'corotrop', 'morphine', 'dve', 'atl', 'iot']

events_codes = {
    'nimodipine': 'C08CA06', # ACT
    'paracetamol': 'N02BE01', # ACT
    'nad': 'C01CA03', # ACT
    'corotrop': 'C01CE02', # ACT
    'morphine': 'N02AA01', # ACT
    'dve': '00P6X0Z', # Removal of Drainage Device from Cerebral Ventricle External Approach (ICD-10)
    'atl': 'Z98.6', # ICD-10
    'iot': '0BH17EZ' # ICD-10
}

In [21]:
prefix = """   
@prefix sphn: <http://sphn.org/> .
@prefix nvasc: <http://nvasc.org/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
"""

sphn_diagnosis_code_date_template = Template(
    """
    nvasc:$diag_id a sphn:Diagnosis ;
        rdfs:label "$diag_label"^^xsd:string ;
        sphn:hasCode "$diag_code"^^xsd:string ;
        sphn:hasRecordDateTime "$diag_date"^^xsd:dateTime ;
        sphn:hasSubjectPseudoIdentifier nvasc:synth_patient_$patient_id .
    """
)

sphn_diagnosis_code_template = Template(
    """
    nvasc:$diag_id a sphn:Diagnosis ;
        rdfs:label "$diag_label"^^xsd:string ;
        sphn:hasCode "$diag_code"^^xsd:string ;
        sphn:hasSubjectPseudoIdentifier nvasc:synth_patient_$patient_id .
    """
)

sphn_diagnosis_quantity_template = Template(
    """
    nvasc:$diag_id a sphn:Diagnosis ;
        rdfs:label "$diag_label" ;
        sphn:hasQuantity [ rdf:type sphn:Quantity ;
                            sphn:hasValue "$diag_value" ;
                            sphn:hasUnit "$diag_unit" ] ;
        sphn:hasSubjectPseudoIdentifier nvasc:synth_patient_$patient_id .
    """
)

nvasc_outcome = Template(
    """
    nvasc:synth_patient_$patient_id nvasc:hasOutcome nvasc:outcome_$outcome .
    """
)


def gen_start_event(y_min=2020, y_max=2023):
    n_days = (y_max - y_min) * 365
    d0 = datetime.fromisoformat(f"{y_min}-01-01")
    day_rand = round(np.random.uniform(n_days))
    delta = timedelta(
        days=day_rand,
        hours=round(norm.rvs(12, 5)),
        minutes=round(np.random.uniform(60)),
    )
    d_out = d0 + delta
    return d_out


def gen_patient_rdf(row, kg):
    _i = row.name
    d_start = gen_start_event()
    for f in row.index:
        if f in events:
            if row[f] != -1:
                h = row[f]
                d_event = d_start + timedelta(hours=h)
                diag_label = f
                diag_code = events_codes[f]
                rdf = sphn_diagnosis_code_date_template.substitute(
                    diag_id=uuid.uuid4(),
                    diag_label=f,
                    diag_code=events_codes[f],
                    diag_date=d_event.isoformat(),
                    patient_id=_i,
                )
                # print(row[f])#
                # print(d_start)
                # print(rdf)
                kg.parse(data=prefix + rdf, format="turtle")

        elif f in numerical:
            value = None
            unit = None
            if f == "hospital_stay_length":
                value = round(row[f])
                unit = "days"
            elif f == "gcs":
                value = round(row[f], 2)
                unit = "gcs"
            elif f == "nb_acte":
                value = round(row[f])
                unit = "received medical treatments"
            elif f == "age":
                value = round(row[f])
                unit = "age"

            rdf = sphn_diagnosis_quantity_template.substitute(
                diag_id=uuid.uuid4(),
                diag_label=f,
                diag_value=value,
                diag_unit=unit,
                patient_id=_i,
            )
            kg.parse(data=prefix + rdf, format="turtle")

        # ['gender', 'entry', 'entry_code', 'ica', 'ttt', 'ica_therapy', 'fever', 'o2_clinic', 'o2', 'hta', 'hct', 'tabagisme', 'etOH', 'diabete', 'headache', 'instable', 'vasospasme', 'ivh']
        elif f in categorical:
            if f in ["outcome"]:
                outcome_value = row[f]
                rdf = nvasc_outcome.substitute(outcome=outcome_value, patient_id=_i)
                kg.parse(data=prefix + rdf, format="turtle")
            else:
                diag_label = f
                diag_code = row[f]
                rdf = sphn_diagnosis_code_template.substitute(
                    diag_id=uuid.uuid4(),
                    diag_label=f,
                    diag_code=str(f) + "_" + str(row[f]),
                    patient_id=_i,
                )

                kg.parse(data=prefix + rdf, format="turtle")

In [28]:
N = 102
N_train = int(N * 0.8)
print(N_train)
N_test = N - N_train    
print(N_test)
assert N == N_train + N_test

train_df = df.iloc[0:N]

with_outcome = df.iloc[0:N_train]
with_outcome

81
21


Unnamed: 0,hospital_stay_length,gcs,nb_acte,gender,entry,outcome,entry_code,ica,ttt,ica_therapy,...,ivh,age,nimodipine,paracetamol,nad,corotrop,morphine,dve,atl,iot
0,41.089445,17.086233,34.307297,0,0,0.0,0,0,0,0,...,0,38.712762,-1,-1,-1,-1,-1,-1,-1,25
1,21.702298,18.805639,133.523169,0,1,1.0,2,2,1,0,...,0,58.565461,89,58,26,-1,-1,116,-1,-1
2,4.627752,19.516216,85.648533,0,2,0.0,1,2,2,0,...,1,76.432889,12,-1,-1,-1,-1,40,-1,-1
3,12.830087,19.940518,17.982208,1,1,2.0,3,4,2,0,...,1,87.351874,29,-1,-1,-1,-1,-1,-1,53
4,75.675201,21.665547,132.859962,0,3,1.0,4,5,2,0,...,0,75.440254,26,-1,-1,-1,-1,79,-1,52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,-0.568826,18.115110,51.401853,0,4,0.0,3,1,0,0,...,0,20.845333,-1,-1,-1,-1,-1,27,-1,-1
76,32.940101,19.241165,16.876576,0,1,0.0,6,6,3,0,...,0,64.521270,-1,-1,-1,-1,-1,44,-1,17
77,4.391382,16.736302,17.695417,1,1,1.0,3,1,1,0,...,0,53.602286,29,55,-1,-1,-1,-1,-1,-1
78,45.042998,18.588469,53.636239,0,2,1.0,1,1,2,0,...,0,64.521270,25,-1,-1,-1,-1,72,-1,48


In [23]:
no_outcome = df.iloc[80:100]
no_outcome = no_outcome.drop(columns=["outcome"])
no_outcome

Unnamed: 0,hospital_stay_length,gcs,nb_acte,gender,entry,entry_code,ica,ttt,ica_therapy,fever,...,ivh,age,nimodipine,paracetamol,nad,corotrop,morphine,dve,atl,iot
80,12.788602,18.188725,5.809512,0,1,1,7,2,0,0,...,0,75.440254,-1,27,-1,-1,-1,79,-1,55
81,32.205372,17.733582,9.118919,1,2,13,6,2,0,0,...,0,71.469715,28,-1,-1,-1,-1,73,-1,54
82,14.022996,19.441192,2.98166,1,3,3,6,2,0,0,...,1,77.425524,15,-1,-1,-1,-1,-1,-1,-1
83,11.8113,17.102448,66.785568,0,5,3,5,2,0,0,...,0,65.513905,85,58,-1,-1,-1,29,-1,-1
84,45.368277,16.869555,6.439832,0,1,0,1,1,0,0,...,0,43.675937,16,54,-1,-1,36,-1,-1,-1
85,18.171862,18.527551,14.720079,0,3,5,5,0,0,0,...,0,43.675937,-1,-1,-1,-1,25,-1,-1,-1
86,16.222231,20.206897,78.755852,0,3,1,3,2,0,0,...,0,65.513905,22,-1,-1,-1,-1,44,-1,-1
87,56.48858,18.850064,20.138758,1,3,14,2,2,0,0,...,0,65.513905,17,85,-1,-1,59,39,-1,113
88,103.713839,18.185148,15.063692,0,5,15,3,3,0,1,...,0,48.639111,-1,23,-1,-1,-1,-1,-1,-1
89,9.25091,16.501547,11.841801,1,5,1,3,2,0,0,...,1,79.410794,-1,28,-1,-1,-1,-1,-1,-1


In [24]:
kg = ConjunctiveGraph()
with_outcome.apply(gen_patient_rdf, axis=1, kg=kg)
no_outcome.apply(gen_patient_rdf, axis=1, kg=kg)
print(f"Generated {len(kg)} RDF triples")
kg.serialize("sphn_transductive.ttl", format="turtle")
kg.serialize("sphn_transductive.nt", format="nt")

joblib.dump(train_df["outcome"].astype(int).to_list(), "ouput_" + str(N) + ".joblib")

Generated 11285 RDF triples




['ouput_100.joblib']

# Computing the duration of care events

In [15]:
## Understanding this code ... 
events = ['nimodipine',  'paracetamol', 'nad', 'corotrop', 'morphine', 'dve', 'atl', 'iot']
events_end = events + ['finish']

transitions = pd.read_csv('./care_transitions_probs.csv', index_col=0)
transitions

start_probs = [0.47381546, 0.09476309, 0.00997506, 0, 0.00997506, 0.24189526, 0.00249377, 0.16708229, 0]

# Generate a sequence of care events 
# The sequence is generated by starting with an initial event and then randomly selecting the next event based on the transition probabilities
def generate_care_path():
    event = np.random.choice(events_end, size=1, p=start_probs)[0]
    path = [event]

    while event != 'finish':
        event = np.random.choice(events_end, size=1, p=transitions[event].values)[0]
        # ensure that their is no duplicate event
        if event in path:
            event = 'finish'
        path += [event]
        
    # remove the finish event and return the path
    return path[:-1]
    

def generate_times_path(path):
    # Generate the time taken (in hour) for each event with a normal distribution with parameters mean=24 and std=5
    indv_times = map(round, norm.rvs(24, 5, len(path)))
    acc_times = list(accumulate(indv_times))
    # print(list(acc_times))

    sol = [-1] * len(events)
    # print(sol)
    for i, e in enumerate(path):
        sol[events.index(e)] = acc_times[i]
    
    # print(sol)
    return sol

In [22]:
generate_care_path()

['dve', 'paracetamol', 'nimodipine', 'iot']

In [23]:
df_events = pd.DataFrame([generate_times_path(generate_care_path()) for _ in range(5)], columns=events)
df_events

Unnamed: 0,nimodipine,paracetamol,nad,corotrop,morphine,dve,atl,iot
0,87,66,-1,-1,47,17,-1,104
1,86,117,58,-1,-1,32,-1,-1
2,26,-1,-1,-1,-1,-1,-1,-1
3,30,-1,-1,-1,-1,-1,-1,-1
4,23,-1,-1,-1,-1,-1,-1,53
