# Transform tabular data into a CARE-SM RDF graph

In [1]:
import pandas as pd
import numpy as np
import joblib
import uuid
from rdflib import ConjunctiveGraph
from string import Template
from itertools import accumulate
from scipy.stats import norm, genextreme, exponweib
from datetime import datetime, timedelta

### Load tabular data

In [2]:
# Load the synthetic cohort, shuffle the rows with a fixed seed so the order
# is reproducible, and renumber the index from 0 (the index is later used as
# the patient identifier in gen_patient_rdf).
df = (
    pd.read_csv("../../data/syn_data.csv", index_col=0)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,hospital_stay_length,gcs,nb_acte,gender,entry,outcome,entry_code,ica,ttt,ica_therapy,...,ivh,age,nimodipine,paracetamol,nad,corotrop,morphine,dve,atl,iot
0,11.525542,18.050848,3.561477,0,1,0.0,2,1,1,0,...,0,46.653842,22,-1,-1,-1,-1,-1,-1,-1
1,4.096719,17.164788,20.830227,0,5,0.0,3,8,2,0,...,0,62.536,25,-1,101,-1,-1,73,-1,49
2,92.015036,18.158804,29.89765,1,1,1.0,7,10,1,0,...,0,49.631746,-1,-1,-1,-1,-1,24,-1,54
3,66.217942,17.936781,45.870606,0,2,1.0,6,6,2,0,...,1,68.49181,-1,-1,-1,-1,-1,18,-1,44
4,25.694681,18.088936,4.81302,0,1,1.0,1,6,2,0,...,0,73.454985,23,-1,-1,-1,57,-1,-1,-1


### List of variables

In [3]:
# Continuous features: serialized with a value and a unit.
numerical = [
    'hospital_stay_length',
    'gcs',
    'nb_acte',
    'age',
]

# Discrete features: serialized as "<feature>_<value>" codes.
# 'outcome' is kept last; it is serialized with its own template.
categorical = [
    'gender',
    'entry',
    'entry_code',
    'ica',
    'ttt',
    'ica_therapy',
    'fever',
    'o2_clinic',
    'o2',
    'hta',
    'hct',
    'tabagisme',
    'etOH',
    'diabete',
    'headache',
    'instable',
    'vasospasme',
    'ivh',
    'outcome',
]

# Timestamped events; a cell value of -1 means the event did not occur
# (see gen_patient_rdf), any other value is an offset in hours.
events = [
    'nimodipine',
    'paracetamol',
    'nad',
    'corotrop',
    'morphine',
    'dve',
    'atl',
    'iot',
]

# Terminology code attached to each event.
events_codes = {
    # Drugs (ATC codes)
    'nimodipine': 'C08CA06',
    'paracetamol': 'N02BE01',
    'nad': 'C01CA03',
    'corotrop': 'C01CE02',
    'morphine': 'N02AA01',
    # Procedures / status codes
    'dve': '00P6X0Z',  # Removal of Drainage Device from Cerebral Ventricle, External Approach (ICD-10-PCS)
    'atl': 'Z98.6',  # ICD-10
    'iot': '0BH17EZ',  # ICD-10-PCS
}

# Sequential local codes "1".."22": numerical features first, then the
# categorical ones in list order, with 'outcome' left out.
other_codes = {
    feature: str(position)
    for position, feature in enumerate(
        (name for name in numerical + categorical if name != 'outcome'),
        start=1,
    )
}

### CARE-SM template

In [4]:
# Turtle @prefix declarations. This string is prepended to every rendered
# snippet before it is parsed, so each snippet is a self-contained document.
prefix  = """   
@prefix sio: <http://semanticscience.org/resource/> .
@prefix ncit: <http://purl.obolibrary.org/obo/NCIT_> .
@prefix nvasc: <http://nvasc.org/> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
"""

# Timestamped coded observation.
# Chain: patient -> role -> process -> output typed with ncit:$diag_code.
# The process also points to a context node carrying a start and an end
# xsd:dateTime; that context is linked both ways to a per-patient timeline.
# Placeholders: $patient_id, $diag_id, $diag_out_id, $context_id, $diag_code,
# $diag_start_date, $diag_end_date.
caresm_diagnosis_code_date_template = Template("""
    
        nvasc:synth_patient_$patient_id a sio:Person ;  
            sio:has-role nvasc:role_$patient_id .
        
        nvasc:role_$patient_id a sio:Role, <http://purl.obolibrary.org/obo/OBI_0000093> ; 
            sio:is-realized-in nvasc:diag_$diag_id . 
            
        nvasc:diag_$diag_id a sio:Process, <http://purl.obolibrary.org/obo/NCIT_C18020> ;
            sio:has-output nvasc:diag_output_$diag_out_id ;
            nvasc:hasTimePoint nvasc:context_$context_id .
            
        nvasc:diag_output_$diag_out_id a sio:InformationContentEntity, 
            ncit:$diag_code. 
    
        nvasc:context_$context_id sio:SIO_000680 "$diag_start_date"^^xsd:dateTime ; # start date
                sio:SIO_000681 "$diag_end_date"^^xsd:dateTime ; # end date
                sio:SIO_000068 nvasc:timeline_$patient_id . # part-of 
                              
        nvasc:timeline_$patient_id a sio:SIO_000417, <http://purl.obolibrary.org/obo/NCIT_C54576> ;
                sio:SIO_000332 nvasc:synth_patient_$patient_id ; # is-about
                sio:SIO_000028 nvasc:context_$context_id . # this is a materialization of the has-part property
    """)

# Coded observation without any time information.
# Same patient -> role -> process -> output chain as the template above,
# minus the context/timeline part.
# Placeholders: $patient_id, $diag_id, $diag_out_id, $diag_code.
caresm_diagnosis_code_template = Template("""
    nvasc:synth_patient_$patient_id a sio:Person ;
        sio:has-role nvasc:role_$patient_id .
    nvasc:role_$patient_id a sio:Role, <http://purl.obolibrary.org/obo/OBI_0000093> ;
        sio:is-realized-in nvasc:diag_$diag_id .
        
    nvasc:diag_$diag_id a sio:Process, <http://purl.obolibrary.org/obo/NCIT_C18020> ;
        sio:has-output nvasc:diag_output_$diag_out_id .
        
    nvasc:diag_output_$diag_out_id a sio:InformationContentEntity, 
        ncit:$diag_code. 
    """)

# Quantitative observation: the output node additionally carries a value
# typed as xsd:float and a unit given as a plain string literal.
# Placeholders: $patient_id, $diag_id, $diag_out_id, $diag_code, $diag_value,
# $diag_unit.
caresm_diagnosis_quantity_template = Template("""
    nvasc:synth_patient_$patient_id a sio:Person ;
        sio:has-role nvasc:role_$patient_id .
    nvasc:role_$patient_id a sio:Role, <http://purl.obolibrary.org/obo/OBI_0000093> ;
        sio:is-realized-in nvasc:diag_$diag_id .
        
    nvasc:diag_$diag_id a sio:Process, <http://purl.obolibrary.org/obo/NCIT_C18020> ;
        sio:has-output nvasc:diag_output_$diag_out_id .
        
    nvasc:diag_output_$diag_out_id a sio:InformationContentEntity, 
        ncit:$diag_code ; # TODO should be a proper dereferenceable URI
        sio:has-value "$diag_value"^^xsd:float ;
        sio:has-unit  "$diag_unit" . # TODO should be a URI, subClassOf sio:Unit
    """)

# Single triple linking a patient to an outcome resource.
# Placeholders: $patient_id, $outcome.
nvasc_outcome = Template(
    """
    nvasc:synth_patient_$patient_id nvasc:hasOutcome nvasc:outcome_$outcome .
    """
)

def gen_start_event(y_min=2020, y_max=2023):
    """Draw a random start timestamp for a synthetic patient.

    The timestamp is built from January 1st of ``y_min`` plus:

    * a day offset drawn uniformly from ``[0, (y_max - y_min) * 365]``
      (leap days are ignored),
    * an hour offset drawn from a normal distribution with mean 12 and
      standard deviation 5, rounded to the nearest integer,
    * a minute offset drawn uniformly from ``[0, 60]``, rounded.

    The hour offset is not clipped, so it can be negative or exceed 24; the
    result may therefore fall slightly outside the nominal year window.

    Args:
        y_min: First calendar year of the sampling window.
        y_max: Calendar year at which the sampling window ends.

    Returns:
        A naive ``datetime``.
    """
    n_days = (y_max - y_min) * 365
    d0 = datetime.fromisoformat(f"{y_min}-01-01")
    # Both bounds are passed explicitly: a single positional argument to
    # np.random.uniform is `low` (with `high` defaulting to 1.0), which is an
    # officially undefined high < low call and can never yield offset 0.
    day_rand = round(np.random.uniform(0, n_days))
    hour_rand = round(norm.rvs(12, 5))
    minute_rand = round(np.random.uniform(0, 60))
    return d0 + timedelta(days=day_rand, hours=hour_rand, minutes=minute_rand)


def gen_patient_rdf(row, kg):
    """Render one patient row as Turtle snippets and parse them into ``kg``.

    Every column of ``row`` is dispatched on the module-level feature lists:

    * ``events``: skipped when the cell is -1; otherwise the cell is taken as
      an hour offset from a random start date (0 is bumped to 1) and rendered
      with the timestamped template, using the same instant as start and end.
    * ``numerical``: rendered with the quantity template, using a rounded
      value and a per-feature unit label.
    * ``categorical``: ``outcome`` uses the outcome template, everything else
      the plain code template with ``"<feature>_<value>"`` as code.

    Columns found in none of the lists are ignored.

    Args:
        row: A pandas Series for one patient; ``row.name`` is used as the
            patient identifier.
        kg: Graph object whose ``parse`` method receives each snippet.
    """
    patient = row.name
    admission = gen_start_event()

    # feature -> (ndigits for round(), or None for integer rounding; unit label)
    quantity_spec = {
        "hospital_stay_length": (None, "days"),
        "gcs": (2, "gcs"),
        "nb_acte": (None, "received medical treatments"),
        "age": (None, "age"),
    }

    for feature in row.index:
        if feature in events:
            raw = row[feature]
            if raw == -1:
                continue
            offset_hours = 1 if raw == 0 else raw
            stamp = (admission + timedelta(hours=offset_hours)).isoformat()
            # diag_label is passed along but no template references it.
            snippet = caresm_diagnosis_code_date_template.substitute(
                patient_id=patient,
                diag_id=uuid.uuid4(),
                diag_out_id=uuid.uuid4(),
                context_id=uuid.uuid4(),
                diag_label=feature,
                diag_code=events_codes[feature],
                diag_start_date=stamp,
                diag_end_date=stamp,
            )

        elif feature in numerical:
            raw = row[feature]
            if feature in quantity_spec:
                ndigits, unit = quantity_spec[feature]
                value = round(raw) if ndigits is None else round(raw, ndigits)
            else:
                value = unit = None
            snippet = caresm_diagnosis_quantity_template.substitute(
                patient_id=patient,
                diag_id=uuid.uuid4(),
                diag_out_id=uuid.uuid4(),
                diag_label=feature,
                diag_code=f"{feature}_{raw}",
                diag_value=value,
                diag_unit=unit,
            )

        elif feature in categorical:
            if feature == "outcome":
                snippet = nvasc_outcome.substitute(
                    patient_id=patient,
                    outcome=row[feature],
                )
            else:
                snippet = caresm_diagnosis_code_template.substitute(
                    patient_id=patient,
                    diag_id=uuid.uuid4(),
                    diag_out_id=uuid.uuid4(),
                    diag_label=feature,
                    diag_code=f"{feature}_{row[feature]}",
                )

        else:
            continue

        # One parse call per rendered snippet, each with its own prefix header.
        kg.parse(data=prefix + snippet, format="turtle")

## 1. Timestamped graph

In [None]:
N = 10000 # the number of patients

# The outcome column is left out of the graph; it is saved separately below.
no_outcome = df.iloc[:N].copy()
no_outcome = no_outcome.drop(columns=["outcome"])

# Serialize data: one call of gen_patient_rdf per row, all writing into kg.
kg = ConjunctiveGraph()
no_outcome.apply(gen_patient_rdf, axis=1, kg=kg)

# Add time relations: for every pair of contexts on the same patient timeline,
# emit `?context1 time:before ?context2` when the first one starts strictly
# earlier. Prefixes are declared in the query itself so that it does not
# depend on which namespaces happen to be bound on the graph (`time:` is not
# declared anywhere in the Turtle snippets).
before_query = """
PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
PREFIX sio: <http://semanticscience.org/resource/>
PREFIX time: <http://www.w3.org/2006/time#>

CONSTRUCT {
    ?context1 time:before ?context2 .
} WHERE {
    ?timeline rdf:type <http://purl.obolibrary.org/obo/NCIT_C54576> .
    ?timeline sio:SIO_000028 ?context1, ?context2 .
    ?context1 sio:SIO_000680 ?start1 .
    ?context2 sio:SIO_000680 ?start2 .
    
    filter((?start1 < ?start2) && (?context1 != ?context2))
}
"""

res = kg.query(before_query)
for t in res:    
    kg.add(t)
print(f"Generated {len(kg)} RDF triples")

# kg.serialize(f"../../data/caresm_{N}.ttl", format="turtle")
kg.serialize(f"../../data/caresm_{N}.nt", format="ntriples")
# Outcomes of the same first N rows, selected positionally like df.iloc[:N] above.
joblib.dump(df["outcome"].iloc[:N].astype(int).to_list(), f"../../data/caresm_outcomes_{N}.joblib")