In [0]:
# Root directory under which every dataset has its own sub-directory.
BASE_VOLUME_PATH = "/Volumes/workspace/healthcare_analytics/datastore"

# Map each dataset name to its directory: <BASE_VOLUME_PATH>/<dataset name>.
# The tuple order fixes the iteration order of the resulting dict.
PATHS = {
    dataset_name: f"{BASE_VOLUME_PATH}/{dataset_name}"
    for dataset_name in (
        "patients",
        "encounters",
        "observations",
        "conditions",
        "procedures",
        "allergies",
        "payers",
        "providers",
    )
}

PATHS

{'patients': '/Volumes/workspace/healthcare_analytics/datastore/patients',
 'encounters': '/Volumes/workspace/healthcare_analytics/datastore/encounters',
 'observations': '/Volumes/workspace/healthcare_analytics/datastore/observations',
 'conditions': '/Volumes/workspace/healthcare_analytics/datastore/conditions',
 'procedures': '/Volumes/workspace/healthcare_analytics/datastore/procedures',
 'allergies': '/Volumes/workspace/healthcare_analytics/datastore/allergies',
 'payers': '/Volumes/workspace/healthcare_analytics/datastore/payers',
 'providers': '/Volumes/workspace/healthcare_analytics/datastore/providers'}

In [0]:
# Show the files found in every dataset directory, one labelled listing each.
for name in PATHS:
    path = PATHS[name]
    print(f"\n{name.upper()}")
    display(dbutils.fs.ls(path))


PATIENTS


path,name,size,modificationTime
dbfs:/Volumes/workspace/healthcare_analytics/datastore/patients/patients.csv,patients.csv,33059,1768683233000



ENCOUNTERS


path,name,size,modificationTime
dbfs:/Volumes/workspace/healthcare_analytics/datastore/encounters/encounters.csv,encounters.csv,2124396,1768683184000



OBSERVATIONS


path,name,size,modificationTime
dbfs:/Volumes/workspace/healthcare_analytics/datastore/observations/observations.csv,observations.csv,18001057,1768683208000



CONDITIONS


path,name,size,modificationTime
dbfs:/Volumes/workspace/healthcare_analytics/datastore/conditions/conditions.csv,conditions.csv,677132,1768683170000



PROCEDURES


path,name,size,modificationTime
dbfs:/Volumes/workspace/healthcare_analytics/datastore/procedures/procedures.csv,procedures.csv,4282365,1768683289000



ALLERGIES


path,name,size,modificationTime
dbfs:/Volumes/workspace/healthcare_analytics/datastore/allergies/allergies.csv,allergies.csv,15120,1768683141000



PAYERS


path,name,size,modificationTime
dbfs:/Volumes/workspace/healthcare_analytics/datastore/payers/payers.csv,payers.csv,1775,1768683254000



PROVIDERS


path,name,size,modificationTime
dbfs:/Volumes/workspace/healthcare_analytics/datastore/providers/providers.csv,providers.csv,51049,1768683359000


In [0]:
from pyspark.sql.functions import current_timestamp, lit

def load_to_bronze(dataset):
    """Load one CSV dataset into its Bronze Delta table.

    Reads the CSV data at ``PATHS[dataset]`` (header row, inferred schema),
    adds the ``ingest_ts`` and ``source_system`` metadata columns, and
    overwrites the Delta table ``healthcare_analytics.bronze_<dataset>``.
    A confirmation line is printed once the write has finished.

    Args:
        dataset: Key into ``PATHS`` naming the dataset to ingest.
    """
    csv_reader = spark.read.option("header", "true").option("inferSchema", "true")
    raw_df = csv_reader.csv(PATHS[dataset])

    # Bronze metadata: ingestion timestamp plus a constant source-system tag.
    stamped_df = raw_df.withColumn("ingest_ts", current_timestamp())
    bronze_df = stamped_df.withColumn("source_system", lit("healthcare_csv"))

    # Replace the Bronze Delta table with the freshly read data.
    target_table = f"healthcare_analytics.bronze_{dataset}"
    bronze_writer = bronze_df.write.format("delta").mode("overwrite")
    bronze_writer.saveAsTable(target_table)

    print(f"{target_table} created")

In [0]:
# Ingest every configured dataset into Bronze, in the order PATHS defines them.
for dataset in PATHS:
    load_to_bronze(dataset)

healthcare_analytics.bronze_patients created
healthcare_analytics.bronze_encounters created
healthcare_analytics.bronze_observations created
healthcare_analytics.bronze_conditions created
healthcare_analytics.bronze_procedures created
healthcare_analytics.bronze_allergies created
healthcare_analytics.bronze_payers created
healthcare_analytics.bronze_providers created


In [0]:
# Inspect the column names and types of the encounters Bronze table.
bronze_encounters = spark.table("healthcare_analytics.bronze_encounters")
bronze_encounters.printSchema()


root
 |-- Id: string (nullable = true)
 |-- START: timestamp (nullable = true)
 |-- STOP: timestamp (nullable = true)
 |-- PATIENT: string (nullable = true)
 |-- ORGANIZATION: string (nullable = true)
 |-- PROVIDER: string (nullable = true)
 |-- PAYER: string (nullable = true)
 |-- ENCOUNTERCLASS: string (nullable = true)
 |-- CODE: long (nullable = true)
 |-- DESCRIPTION: string (nullable = true)
 |-- BASE_ENCOUNTER_COST: double (nullable = true)
 |-- TOTAL_CLAIM_COST: double (nullable = true)
 |-- PAYER_COVERAGE: double (nullable = true)
 |-- REASONCODE: long (nullable = true)
 |-- REASONDESCRIPTION: string (nullable = true)
 |-- ingest_ts: timestamp (nullable = true)
 |-- source_system: string (nullable = true)

