# From chatgpt

In [5]:
import glob, json
import pandas as pd

# 1) Gather all FHIR JSON files
files = glob.glob("../output/fhir/*.json")

In [6]:
# 2) Partition them by resource type
encounters = []
observations = []
for path in files:
    with open(path) as f:
        res = json.load(f)
    rtype = res.get("resourceType")
    if rtype == "Encounter":
        encounters.append(res)
    elif rtype == "Observation":
        observations.append(res)
    # (you can extend for Patient, Location, etc.)

UnicodeDecodeError: 'charmap' codec can't decode byte 0x81 in position 17298: character maps to <undefined>

In [None]:

# 3) Build an occupancy DataFrame from Encounters
records = []
for enc in encounters:
    ward = "unknown"
    # if enc["location"] exists, extract ward/location:
    if enc.get("location"):
        ward = enc["location"][0]["location"]["reference"].split("/")[-1]
    start = pd.to_datetime(enc["period"]["start"])
    end   = pd.to_datetime(enc["period"]["end"])
    records.append({"ward": ward, "start": start, "end": end})
df_enc = pd.DataFrame(records)


In [None]:

# 4) Expand to hourly occupancy counts
rows = []
for _, row in df_enc.iterrows():
    rng = pd.date_range(row["start"], row["end"], freq="H", closed="left")
    rows.append(pd.DataFrame({"timestamp": rng, "ward": row["ward"]}))
df_occ = pd.concat(rows, ignore_index=True)
df_occ = df_occ.groupby(["ward", "timestamp"]).size().reset_index(name="occupancy")


In [None]:

# 5) (Optional) Extract vitals from Observation for covariates
obs_rows = []
for obs in observations:
    pid = obs["subject"]["reference"].split("/")[-1]
    ts  = pd.to_datetime(obs["effectiveDateTime"])
    val = obs["valueQuantity"]["value"]
    code = obs["code"]["coding"][0]["code"]
    obs_rows.append({"patient": pid, "timestamp": ts, code: val})
df_obs = pd.DataFrame(obs_rows)
df_obs = df_obs.set_index("timestamp").sort_index()
# You can pivot to get wide table: df_obs.pivot_table(...)


In [None]:

# 6) Merge occupancy + covariates
# e.g. left-join df_occ (grouped per ward‐hour) with aggregated df_obs (e.g. mean per hour)

# 7) Add time features
df_occ["hour"]       = df_occ["timestamp"].dt.hour
df_occ["dow"]        = df_occ["timestamp"].dt.dayofweek
df_occ["is_weekend"] = df_occ["dow"].isin([5,6]).astype(int)
# and any holiday calendar you like

# 8) Final training table
# Columns: ward, timestamp, occupancy (target), plus covariates (hour, dow, is_weekend, vitals…)

df_training = df_occ  # now ready to feed into your TFT pipeline
# Save to Parquet or CSV:
df_training.to_parquet("training_data.parquet")


# Custom

In [7]:
from pandas import json_normalize
import ndjson
import pandas as pd
import json
import torch

In [8]:
def flatten_json(nested_json):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(nested_json)
    return out

In [9]:
# 1) Gather all FHIR JSON files
files = glob.glob("../output/fhir/*.json")

In [20]:
# extract_fhir.py

import argparse
import glob
import json
import pandas as pd
from pathlib import Path

def load_encounters(input_dir: Path):
    """
    Read all .json files in input_dir, parse FHIR bundles,
    and collect Encounter resources with period.start/end & ward.
    """
    encounters = []
    for fp in input_dir.glob("*.json"):
        text = fp.read_text(encoding="utf-8", errors="ignore")
        bundle = json.loads(text)
        for entry in bundle.get("entry", []):
            res = entry.get("resource", {})
            if res.get("resourceType") != "Encounter":
                continue
            period = res.get("period", {})
            if "start" not in period or "end" not in period:
                continue
            # extract ward/location display (fallback to 'unknown')
            loc = res.get("location", [{}])[0].get("location", {})
            ward = loc.get("display") or loc.get("reference", "unknown").split("/")[-1]
            encounters.append({
                "start": pd.to_datetime(period["start"]),
                "end":   pd.to_datetime(period["end"]),
                "ward":  ward
            })
    return pd.DataFrame(encounters)

def build_hourly_occupancy(df_enc: pd.DataFrame):
    """
    Expand each encounter into hourly timestamps, then
    aggregate per (ward, timestamp) to count active Encounters.
    """
    rows = []
    for _, row in df_enc.iterrows():
        rng = pd.date_range(row["start"], row["end"],
                            freq="H", inclusive="left")
        df = pd.DataFrame({
            "timestamp": rng,
            "ward":      row["ward"]
        })
        rows.append(df)
    # concatenate and count
    all_hours = pd.concat(rows, ignore_index=True)
    occ = (all_hours
           .groupby(["ward", "timestamp"])
           .size()
           .reset_index(name="occupancy"))
    return occ

def main():
    IN_PATH = Path("../output/fhir")
    OUT_PATH = Path("../output/cleaned_fhir")

    # 1. Load Encounters
    df_enc = load_encounters(IN_PATH)
    if df_enc.empty:
        print("⚠️  No Encounter resources found.")
        return

    # 2. Build hourly occupancy table
    df_occ = build_hourly_occupancy(df_enc)

    # 3. (Optional) add time features
    df_occ["hour"] = df_occ["timestamp"].dt.hour
    df_occ["dow"]  = df_occ["timestamp"].dt.dayofweek
    df_occ["is_weekend"] = df_occ["dow"].isin([5,6]).astype(int)

    # 4. Write out
    OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    df_occ.to_csv("output.csv")
    print(f"→ Occupancy data written to {OUT_PATH}")

if __name__ == "__main__":
    main()


→ Occupancy data written to ..\output\cleaned_fhir


  rng = pd.date_range(row["start"], row["end"],


# EDA

## Structure exploration

In [None]:
def flatten_json(nested_json):
    out = {}

    def flatten(x, name=''):
        if isinstance(x, dict):
            for a in x:
                flatten(x[a], name + a + '_')
        elif isinstance(x, list):
            for i, a in enumerate(x):
                flatten(a, name + str(i) + '_')
        else:
            out[name[:-1]] = x

    flatten(nested_json)
    return out

# Example usage:
import json


In [21]:
with open(files[0]) as f:
    data = json.load(f)

flat_dict = flatten_json(data)
print(flat_dict)

{'resourceType': 'Bundle', 'type': 'transaction', 'entry_0_fullUrl': 'urn:uuid:4b40465b-ea68-fa3f-b224-d45cbc171913', 'entry_0_resource_resourceType': 'Patient', 'entry_0_resource_id': '4b40465b-ea68-fa3f-b224-d45cbc171913', 'entry_0_resource_meta_profile_0': 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-patient', 'entry_0_resource_text_status': 'generated', 'entry_0_resource_text_div': '<div xmlns="http://www.w3.org/1999/xhtml">Generated by <a href="https://github.com/synthetichealth/synthea">Synthea</a>.Version identifier: c807432\n .   Person seed: 945465854560076454  Population seed: 1750493949303</div>', 'entry_0_resource_extension_0_url': 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-race', 'entry_0_resource_extension_0_extension_0_url': 'ombCategory', 'entry_0_resource_extension_0_extension_0_valueCoding_system': 'urn:oid:2.16.840.1.113883.6.238', 'entry_0_resource_extension_0_extension_0_valueCoding_code': '2106-3', 'entry_0_resource_extension_0_extension_

In [27]:
data.keys()

dict_keys(['resourceType', 'type', 'entry'])

In [34]:
len(data["entry"])

352

so this person has 352 entries

In [38]:
data["entry"][0].keys()

dict_keys(['fullUrl', 'resource', 'request'])

In [41]:
data["entry"][0]["fullUrl"]

'urn:uuid:4b40465b-ea68-fa3f-b224-d45cbc171913'

In [45]:
data["entry"][0]["request"]

{'method': 'POST', 'url': 'Patient'}

In [78]:
methods = {}
all_url = {}
for i in range (352):
    methods[data["entry"][i]["request"]["method"]] = methods.get(data["entry"][i]["request"]["method"], 0) + 1
    all_url[data["entry"][i]["request"]["url"]] = all_url.get(data["entry"][i]["request"]["url"], 0) + 1


In [75]:
methods

{'POST': 352}

In [79]:
all_url # data["entry"][i]["request"]["url"] is the url of the request

{'Patient': 1,
 'Encounter': 20,
 'Condition': 11,
 'DiagnosticReport': 26,
 'DocumentReference': 20,
 'Claim': 26,
 'ExplanationOfBenefit': 26,
 'MedicationRequest': 6,
 'CareTeam': 3,
 'CarePlan': 3,
 'Observation': 116,
 'Immunization': 21,
 'Procedure': 52,
 'SupplyDelivery': 12,
 'Medication': 3,
 'MedicationAdministration': 3,
 'Device': 1,
 'ImagingStudy': 1,
 'Provenance': 1}

and those are the data

for each type in url, find information about it

In [80]:
from collections import defaultdict

report_type_to_index = defaultdict(list)
for i, entry in enumerate(data["entry"]):
    if "request" in entry and "url" in entry["request"]:
        url = entry["request"]["url"]
        report_type = url.split("/")[-1]  # Extract the last part of the URL
        report_type_to_index[report_type].append(i)

In [81]:
none_key_exists = None in report_type_to_index
print("Is there a None key in report_type_to_index?", none_key_exists)

Is there a None key in report_type_to_index? False


In [82]:
patient_index = report_type_to_index.get("Patient")

In [83]:
print(data["entry"][0]["resource"].keys())

dict_keys(['resourceType', 'id', 'meta', 'text', 'extension', 'identifier', 'name', 'telecom', 'gender', 'birthDate', 'address', 'maritalStatus', 'multipleBirthBoolean', 'communication'])


In [86]:
for report_type in all_url.keys():
    print("=" * 20)
    print(f"analyzing {report_type}")
    type_index = report_type_to_index.get(report_type)
    for keys in data["entry"][type_index[0]]["resource"].keys():
        print(f"  {keys} : {data['entry'][type_index[0]]['resource'][keys]}")
    print("=" * 20)
    
    

analyzing Patient
  resourceType : Patient
  id : 4b40465b-ea68-fa3f-b224-d45cbc171913
  meta : {'profile': ['http://hl7.org/fhir/us/core/StructureDefinition/us-core-patient']}
  text : {'status': 'generated', 'div': '<div xmlns="http://www.w3.org/1999/xhtml">Generated by <a href="https://github.com/synthetichealth/synthea">Synthea</a>.Version identifier: c807432\n .   Person seed: 945465854560076454  Population seed: 1750493949303</div>'}
  extension : [{'url': 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-race', 'extension': [{'url': 'ombCategory', 'valueCoding': {'system': 'urn:oid:2.16.840.1.113883.6.238', 'code': '2106-3', 'display': 'White'}}, {'url': 'text', 'valueString': 'White'}]}, {'url': 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-ethnicity', 'extension': [{'url': 'ombCategory', 'valueCoding': {'system': 'urn:oid:2.16.840.1.113883.6.238', 'code': '2186-5', 'display': 'Not Hispanic or Latino'}}, {'url': 'text', 'valueString': 'Not Hispanic or Latino'}

the relevant ones are: <br>
patient <br>
observation (has height, kg, etc. time issued)

In [94]:
data["entry"][report_type_to_index.get("Encounter")[0]]["resource"]

{'resourceType': 'Encounter',
 'id': 'ec9b24ff-7b15-dd4d-3e00-345cecda85d2',
 'meta': {'profile': ['http://hl7.org/fhir/us/core/StructureDefinition/us-core-encounter']},
 'identifier': [{'use': 'official',
   'system': 'https://github.com/synthetichealth/synthea',
   'value': 'ec9b24ff-7b15-dd4d-3e00-345cecda85d2'}],
 'status': 'finished',
 'class': {'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode',
  'code': 'AMB'},
 'type': [{'coding': [{'system': 'http://snomed.info/sct',
     'code': '410620009',
     'display': 'Well child visit (procedure)'}],
   'text': 'Well child visit (procedure)'}],
 'subject': {'reference': 'urn:uuid:4b40465b-ea68-fa3f-b224-d45cbc171913',
  'display': 'Arnoldo445 Cameron381 Lueilwitz711'},
 'participant': [{'type': [{'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/v3-ParticipationType',
       'code': 'PPRF',
       'display': 'primary performer'}],
     'text': 'primary performer'}],
   'period': {'start': '2014-11-20T13:53:15+07:0

In [96]:
data["entry"][report_type_to_index.get("Encounter")[0]]["resource"]["participant"]

[{'type': [{'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/v3-ParticipationType',
      'code': 'PPRF',
      'display': 'primary performer'}],
    'text': 'primary performer'}],
  'period': {'start': '2014-11-20T13:53:15+07:00',
   'end': '2014-11-20T14:08:15+07:00'},
  'individual': {'reference': 'Practitioner?identifier=http://hl7.org/fhir/sid/us-npi|9999951491',
   'display': 'Dr. Marcela739 CasÃ¡rez469'}}]

so take participant: period.start, period.end, location.display <br>
code (AMB, EMER, etc.) <br>
age

In [112]:
import pprint

for i in report_type_to_index.get("Encounter"):
    if data["entry"][i]["resource"]["class"]["code"] == "EMER":
        pprint.pprint(data["entry"][i]["resource"], indent=2)

{ 'class': { 'code': 'EMER',
             'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode'},
  'id': '63ff1421-019f-18a0-8d43-6733a8468f56',
  'identifier': [ { 'system': 'https://github.com/synthetichealth/synthea',
                    'use': 'official',
                    'value': '63ff1421-019f-18a0-8d43-6733a8468f56'}],
  'location': [ { 'location': { 'display': 'BLUEBERRY HILL REHABILITATION AND '
                                           'HEALTHCARE CTR',
                                'reference': 'Location?identifier=https://github.com/synthetichealth/synthea|b64fdbb3-9be7-317c-a90d-fa24b912c836'}}],
  'meta': { 'profile': [ 'http://hl7.org/fhir/us/core/StructureDefinition/us-core-encounter']},
  'participant': [ { 'individual': { 'display': 'Dr. Domonique463 '
                                                'Satterfield305',
                                     'reference': 'Practitioner?identifier=http://hl7.org/fhir/sid/us-npi|9999881292'},
                   