# SPHN KG Data Preprocessing for Node Classification

In [1]:
import os
from datetime import datetime

import numpy as np
import pandas as pd

## 1. Load KG and remove outcomes

In [2]:
num_patients = 1000
df = pd.read_csv(f"../Data Generation/sphn_transductive_{num_patients}_0.nt", sep=" ", header=None)
df.drop(columns=df.columns[-1], axis=1, inplace=True)
df.columns=['s', 'r', 'd']
df

Unnamed: 0,s,r,d
0,<http://nvasc.org/age_304>,<http://sphn.org/hasDeterminationDateTime>,2022-07-27T20:12:00^^<http://www.w3.org/2001/X...
1,<http://nvasc.org/114de294-412d-43a9-8745-e611...,<http://www.w3.org/2000/01/rdf-schema#label>,gcs
2,<http://nvasc.org/d2740e1b-05fb-4d1b-8e84-2375...,<http://sphn.org/hasSubjectPseudoIdentifier>,<http://nvasc.org/synth_patient_286>
3,<http://nvasc.org/efa01174-3617-43a6-8d1d-22df...,<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>,<http://sphn.org/Diagnosis>
4,<http://nvasc.org/85949f5d-21f5-4841-83c5-638e...,<http://www.w3.org/2000/01/rdf-schema#label>,gcs
...,...,...,...
104595,<http://nvasc.org/552a9f45-43d6-4ca3-9954-242d...,<http://www.w3.org/2000/01/rdf-schema#label>,ttt^^<http://www.w3.org/2001/XMLSchema#string>
104596,<http://nvasc.org/6431a192-0c80-4573-8af3-ff22...,<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>,<http://sphn.org/Procedure>
104597,<http://nvasc.org/f4bf5497-6ccc-412b-9fc7-4463...,<http://www.w3.org/2000/01/rdf-schema#label>,etOH^^<http://www.w3.org/2001/XMLSchema#string>
104598,<http://nvasc.org/d582fb82-4498-4262-a4fc-2133...,<http://sphn.org/hasCode>,<http://nvasc.org/code_vasospasme_0.0>


### Remove outcomes from KG

In [3]:
outcome = df['d'].str.contains('outcome_0.0|outcome_1.0|outcome_2.0')
node_df = df[~outcome]
node_df = node_df.reset_index(drop=True)
outcome = node_df['s'].str.contains('outcome_0.0|outcome_1.0|outcome_2.0')
node_df = node_df[~outcome]
node_df = node_df.reset_index(drop=True)

ent_to_id = {k: v for v, k in enumerate(set(node_df['s']).union(set(node_df['d'])), start=0)}
rel_to_id = {k: v for v, k in enumerate(set(node_df['r']), start=0)}

patients = [f"<http://nvasc.org/synth_patient_{i}>" for i in range(num_patients)]
patient_id = []
for patient in patients:
    patient_id.append(ent_to_id[patient])

num_nodes = max(ent_to_id.values()) + 1
num_rels = max(rel_to_id.values()) + 1

events = node_df.copy()
events["s"] = node_df.s.map(ent_to_id)
events["d"] = node_df.d.map(ent_to_id)
events["r"] = node_df.r.map(rel_to_id)

ent_to_id = pd.DataFrame.from_dict(ent_to_id, orient='index')
rel_to_id = pd.DataFrame.from_dict(rel_to_id, orient='index')

### Save events, entities and relations to 'processed_data'

In [4]:
path = 'processed_data'
if not os.path.exists(path):
    os.makedirs(path)

events.to_csv(f'{path}/sphn_events_{num_patients}_noOutcome.tsv', sep='\t', index=False, header=None)
ent_to_id.to_csv(f'{path}/sphn_entities_{num_patients}_noOutcome.tsv', sep='\t', header=None)
rel_to_id.to_csv(f'{path}/sphn_relations_{num_patients}_noOutcome.tsv', sep='\t', header=None)

### Get numerical literals (including timestamps) and save to 'processed_data'

In [5]:
ent_to_id = pd.read_csv(f'{path}/sphn_entities_{num_patients}_noOutcome.tsv', sep='\t', header=None)
entity = pd.DataFrame({'id': ent_to_id[1].values, 'ent': ent_to_id[0].values})

# Get numeric values
numeric = node_df[node_df['r'] == '<http://sphn.org/hasValue>'].copy()


times = node_df[node_df['r'].str.contains('<http://sphn.org/hasStartDateTime>|<http://sphn.org/hasDeterminationDateTime>')].copy()
time = times.d.str.removesuffix('^^<http://www.w3.org/2001/XMLSchema#dateTime>')
for i, t in time.items():
    td = datetime.strptime(t, '%Y-%m-%dT%H:%M:%S') - datetime(2020,1,1)
    times.d.loc[i] = td.total_seconds()

from sklearn.preprocessing import QuantileTransformer, PowerTransformer
qt = QuantileTransformer(n_quantiles=10, random_state=0)
scaled_ages = qt.fit_transform(numeric.d.values.reshape(-1,1))
numeric['age'] = list(scaled_ages.reshape(-1,))

numeric_embedding = np.zeros((len(entity), 1))
for i, v in numeric.d.items():
    idx = entity[entity.ent == v].id
    numeric_embedding[idx] = numeric.age.loc[i]

# pt = PowerTransformer()
# scaled_times = pt.fit_transform(np.asarray(secs).reshape(-1,1))
qt = QuantileTransformer(n_quantiles=10, random_state=0)
scaled_times = qt.fit_transform(times.d.values.reshape(-1,1))
times['sec'] = list(scaled_times.reshape(-1,))
for i, v in times.d.items():
    idx = entity[entity.ent == v].id
    numeric_embedding[idx] = times.sec.loc[i]

print(numeric_embedding)
np.save(f"processed_data/sphn_numeric_{num_patients}.npy", numeric_embedding)

[[0.        ]
 [0.        ]
 [0.27736318]
 ...
 [0.        ]
 [0.        ]
 [0.        ]]
