# Preprocess - CARE-SM graph

In [1]:
import os
from datetime import datetime
import numpy as np
import pandas as pd
from sklearn.preprocessing import QuantileTransformer

### Load KG

In [2]:
# Number of patients; it selects which `.nt` export is read and is reused in
# the names of the files written later in the notebook.
num_patients = 10000

# Read the export as space-separated fields with no header row.
df = pd.read_csv(f"../../data/caresm_{num_patients}.nt", sep=" ", header=None)

# Discard the last parsed column (presumably the statement terminator of each
# line - TODO confirm) and label the remaining three columns h, r, t.
df = df.drop(columns=df.columns[-1])
df.columns = ['h', 'r', 't']

### Map entities and relations

In [3]:
# Directory that receives the files written by this cell.
path = 'processed_data'
os.makedirs(path, exist_ok=True)

node_df = df.copy()

# Map entities and relations to integer ids, numbered in order of first
# appearance in the triples. Iterating a set() of strings follows hash order,
# which Python randomises per process, so ids built from a set change on every
# kernel restart; first-appearance order is stable for a given input file.
entity_labels = pd.concat([node_df['h'], node_df['t']], ignore_index=True).unique()
relation_labels = node_df['r'].unique()
ent_to_id = {label: idx for idx, label in enumerate(entity_labels)}
rel_to_id = {label: idx for idx, label in enumerate(relation_labels)}

# Same rows and column order (h, r, t) as node_df, with every label replaced
# by its integer id.
triples = node_df.copy()
triples["h"] = node_df.h.map(ent_to_id)
triples["t"] = node_df.t.map(ent_to_id)
triples["r"] = node_df.r.map(rel_to_id)

# Two-column (id, label) tables; row order follows the id order.
entity = pd.DataFrame({'id': list(ent_to_id.values()), 'entity': list(ent_to_id)})
relation = pd.DataFrame({'id': list(rel_to_id.values()), 'relation': list(rel_to_id)})

# Save triples, entities and relations as header-less TSV files under `path`.
triples.to_csv(os.path.join(path, f'caresm_triples_{num_patients}.tsv'), sep='\t', index=False, header=None)
entity.to_csv(os.path.join(path, f'caresm_entities_{num_patients}.tsv'), sep='\t', index=False, header=None)
relation.to_csv(os.path.join(path, f'caresm_relations_{num_patients}.tsv'), sep='\t', index=False, header=None)
print("Triples / Entities / Relations saved.")

Triples / Entities / Relations saved.


### Get literals

In [4]:
def _write_literal_features(feature_arr, literal_labels, literal_values, id_by_label):
    """Store one value per literal in the row of ``feature_arr`` given by the literal's entity id.

    Parameters
    ----------
    feature_arr : np.ndarray
        Array with one row per entity; column 0 is written in place.
    literal_labels : pd.Series
        Entity labels of the literals, exactly as they appear in ``entity.entity``.
    literal_values : array-like
        One numeric value per label, in the same order as ``literal_labels``.
    id_by_label : pd.Series
        Entity ids indexed by entity label.

    Labels that are absent from ``id_by_label`` are skipped.
    """
    row_ids = literal_labels.map(id_by_label)
    known = row_ids.notna().to_numpy()
    values = np.asarray(literal_values, dtype=float)
    feature_arr[row_ids[known].astype(int).to_numpy(), 0] = values[known]


# Entity id indexed by entity label. One indexed lookup per literal replaces a
# full boolean scan of `entity` for every literal.
entity_id_by_label = pd.Series(entity['id'].to_numpy(), index=entity['entity'].to_numpy())

# One feature value per entity; rows that are never written stay at 0.
numeric_arr = np.zeros((len(entity), 1))

# Get numeric literals.
numeric_df = node_df[node_df['r'] == '<http://sphn.org/has-value>'].copy()
# Parse the labels with the datatype suffix removed. str.removesuffix returns a
# new Series, so its result has to be used; `t` itself is left untouched
# because it is the key for the entity-id lookup.
numeric_values = pd.to_numeric(
    numeric_df.t.str.removesuffix('^^<http://www.w3.org/2001/XMLSchema#float>').values
)
numeric_df['numeric'] = numeric_values
_write_literal_features(numeric_arr, numeric_df.t, numeric_df.numeric, entity_id_by_label)

# Get time literals: convert each timestamp to seconds elapsed since 2020-01-01.
time_df = node_df[node_df['r'].str.contains('<http://semanticscience.org/resource/SIO_000680>|<http://semanticscience.org/resource/SIO_000681>')].copy()
timestamps = time_df.t.str.removesuffix('^^<http://www.w3.org/2001/XMLSchema#dateTime>')
reference_time = datetime(2020, 1, 1)
time_df['sec'] = [
    (datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%S') - reference_time).total_seconds()
    for stamp in timestamps
]

# Quantile-transform the seconds before storing them. The transformer cannot
# be fitted on zero samples, so the step is skipped when no time literal exists.
if not time_df.empty:
    qt = QuantileTransformer(n_quantiles=10, random_state=0)
    time_df['sec'] = qt.fit_transform(time_df.sec.values.reshape(-1, 1)).ravel()
    _write_literal_features(numeric_arr, time_df.t, time_df.sec, entity_id_by_label)

np.save(f"processed_data/caresm_numeric_{num_patients}.npy", numeric_arr)
print("Literals saved.")

Literals saved.
