## Testing Secret Scope

In [0]:
storage_account_access_key = dbutils.secrets.get(
    scope="nhs-keyvault-scope", 
    key="adls-access-key"
)

## Mount ADLS Gen2 Container

1. Set storage configs using Key Vault secret
2. Retrieve the access key from Key Vault
3. Mount ADLS Gen2 container
4. Verify mount


In [0]:
#import time
import time

# ADLS configs
storage_account_name = "nhsdatalakevenz"
container_name = "raw"
mount_point = "/mnt/nhs_raw"

# Retrieve key from Key Vault
storage_account_access_key = dbutils.secrets.get(
    scope="nhs-keyvault-scope", 
    key="adls-access-key"
)

# Mount container only if not already mounted
if not any(m.mountPoint == mount_point for m in dbutils.fs.mounts()):
    dbutils.fs.mount(
        source = f"wasbs://data@nhsdatalakevenz.blob.core.windows.net/",
        mount_point = mount_point,
        extra_configs = {
            f"fs.azure.account.key.nhsdatalakevenz.blob.core.windows.net": storage_account_access_key
        }
    )
    print(f"{mount_point} mounted successfully")
else:
    print(f"{mount_point} is already mounted")



/mnt/nhs_raw is already mounted


## Copy CSVs to Raw Layer (Bronze)

In [0]:
source_path = f"{mount_point}/raw/"
bronze_path = f"{mount_point}/raw/bronze/"

# Ensure bronze folder exists
dbutils.fs.mkdirs(bronze_path)

def copy_recursive(src):
    items = dbutils.fs.ls(src)
    for f in items:
        #  CRITICAL: Skip bronze folder to avoid infinite recursion
        if f.isDir() and f.path.rstrip("/") != bronze_path.rstrip("/"):
            copy_recursive(f.path)

        elif f.name.lower().endswith(".csv"):
            target_file = bronze_path + f.name

            # Idempotent copy (no duplicates)
            if not any(x.name == f.name for x in dbutils.fs.ls(bronze_path)):
                dbutils.fs.cp(f.path, target_file)
                print(f"Copied {f.name} to bronze")
                time.sleep(0.2)

# Run copy
copy_recursive(source_path)
print("Bronze layer copy completed!")


Copied April-2025-CSV-revised.csv to bronze
Copied August-2025-CSV-revised.csv to bronze
Copied July-2025-CSV-revised.csv to bronze
Copied June-2025-CSV-revised.csv to bronze
Copied May-2025-CSV-revised.csv to bronze
Copied October-2025-CSV-hg6dl.csv to bronze
Copied September-2025-CSV-revised.csv to bronze
Copied November-2025-CSV-G9pr3.csv to bronze
Bronze layer copy completed!


## Read Bronze CSVs into Spark DataFrame

In [0]:
# REQUIRED for Databricks Jobs
import time
time.sleep(5)

# Read bronze safely
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(bronze_path + "*.csv")
)

df.count()  # force materialisation
display(df)


Period,Org Code,Parent Org,Org name,A&E attendances Type 1,A&E attendances Type 2,A&E attendances Other A&E Department,A&E attendances Booked Appointments Type 1,A&E attendances Booked Appointments Type 2,A&E attendances Booked Appointments Other Department,Attendances over 4hrs Type 1,Attendances over 4hrs Type 2,Attendances over 4hrs Other Department,Attendances over 4hrs Booked Appointments Type 1,Attendances over 4hrs Booked Appointments Type 2,Attendances over 4hrs Booked Appointments Other Department,Patients who have waited 4-12 hs from DTA to admission,Patients who have waited 12+ hrs from DTA to admission,Emergency admissions via A&E - Type 1,Emergency admissions via A&E - Type 2,Emergency admissions via A&E - Other A&E department,Other emergency admissions
MSitAE-SEPTEMBER-2025,AQN04,NHS ENGLAND SOUTH EAST,PHL LYMINGTON UTC,0,0,2988,0,0,11,0,0,15,0,0,0,0,0,0,0,0,0
MSitAE-SEPTEMBER-2025,RBQ,NHS ENGLAND NORTH WEST,LIVERPOOL HEART AND CHEST HOSPITAL NHS FOUNDATION TRUST,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,134
MSitAE-SEPTEMBER-2025,RLQ,NHS ENGLAND MIDLANDS,WYE VALLEY NHS TRUST,5972,1130,155,107,0,0,2510,0,0,26,0,0,443,274,1448,0,0,142
MSitAE-SEPTEMBER-2025,Y02615,NHS ENGLAND MIDLANDS,SOUTH BIRMINGHAM GP WALK IN CENTRE,0,0,6084,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
MSitAE-SEPTEMBER-2025,AAH,NHS ENGLAND SOUTH WEST,TETBURY HOSPITAL TRUST LTD,0,0,593,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
MSitAE-SEPTEMBER-2025,C82038,NHS ENGLAND MIDLANDS,LATHAM HOUSE MEDICAL PRACTICE,0,0,363,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
MSitAE-SEPTEMBER-2025,NQTE4,NHS ENGLAND MIDLANDS,SUMMERFIELD URGENT CARE CENTRE,0,0,3150,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
MSitAE-SEPTEMBER-2025,RJ8,NHS ENGLAND SOUTH WEST,CORNWALL PARTNERSHIP NHS FOUNDATION TRUST,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,67
MSitAE-SEPTEMBER-2025,RW1,NHS ENGLAND SOUTH EAST,HAMPSHIRE AND ISLE OF WIGHT HEALTHCARE NHS FOUNDATION TRUST,0,0,2793,0,0,0,0,0,64,0,0,0,0,0,0,0,0,124
MSitAE-SEPTEMBER-2025,RWF,NHS ENGLAND SOUTH EAST,MAIDSTONE AND TUNBRIDGE WELLS NHS TRUST,19916,0,2105,1049,0,157,4113,0,69,141,0,3,839,96,4683,0,0,672


# Data Cleaning and Transformations
## 1. Rename columns using spark

In [0]:
from pyspark.sql.functions import col

# Create a mapping dictionary
rename_dict = {
    "Period": "period",
    "Org Code": "org_code",
    "Parent Org": "parent_org",
    "Org name": "org_name",
    "A&E attendances Type 1": "ae_attendances_type_1",
    "A&E attendances Type 2": "ae_attendances_type_2",
    "A&E attendances Other A&E Department": "ae_attendances_other",
    "A&E attendances Booked Appointments Type 1": "ae_attendances_booked_type_1",
    "A&E attendances Booked Appointments Type 2": "ae_attendances_booked_type_2",
    "A&E attendances Booked Appointments Other Department": "ae_attendances_booked_other",
    "Attendances over 4hrs Type 1": "attendances_over_4hrs_type_1",
    "Attendances over 4hrs Type 2": "attendances_over_4hrs_type_2",
    "Attendances over 4hrs Other Department": "attendances_over_4hrs_other",
    "Attendances over 4hrs Booked Appointments Type 1": "attendances_over_4hrs_booked_type_1",
    "Attendances over 4hrs Booked Appointments Type 2": "attendances_over_4hrs_booked_type_2",
    "Attendances over 4hrs Booked Appointments Other Department": "attendances_over_4hrs_booked_other",
    "Patients who have waited 4-12 hs from DTA to admission": "patients_4_12hrs_dta",
    "Patients who have waited 12+ hrs from DTA to admission": "patients_12hrs_plus_dta",
    "Emergency admissions via A&E - Type 1": "emergency_admissions_ae_type_1",
    "Emergency admissions via A&E - Type 2": "emergency_admissions_ae_type_2",
    "Emergency admissions via A&E - Other A&E department": "emergency_admissions_other_ae",
    "Other emergency admissions": "other_emergency_admissions"
}

# Apply renaming
for old_name, new_name in rename_dict.items():
    df = df.withColumnRenamed(old_name, new_name)

# Verify
df.printSchema()


root
 |-- period: string (nullable = true)
 |-- org_code: string (nullable = true)
 |-- parent_org: string (nullable = true)
 |-- org_name: string (nullable = true)
 |-- ae_attendances_type_1: integer (nullable = true)
 |-- ae_attendances_type_2: integer (nullable = true)
 |-- ae_attendances_other: integer (nullable = true)
 |-- ae_attendances_booked_type_1: integer (nullable = true)
 |-- ae_attendances_booked_type_2: integer (nullable = true)
 |-- ae_attendances_booked_other: integer (nullable = true)
 |-- attendances_over_4hrs_type_1: integer (nullable = true)
 |-- attendances_over_4hrs_type_2: integer (nullable = true)
 |-- attendances_over_4hrs_other: integer (nullable = true)
 |-- attendances_over_4hrs_booked_type_1: integer (nullable = true)
 |-- attendances_over_4hrs_booked_type_2: integer (nullable = true)
 |-- attendances_over_4hrs_booked_other: integer (nullable = true)
 |-- patients_4_12hrs_dta: integer (nullable = true)
 |-- patients_12hrs_plus_dta: integer (nullable = true

### 2. Drop a "TOTAL" row from the dataframe 

In [0]:
# Drop TOTAL row entirely
df = df.filter(~(df["period"] == "TOTAL"))

print("Rows after removing TOTAL row:", df.count())


Rows after removing TOTAL row: 1595


### 3. Add a new column "ingestion_date" using current_timestamp() function

This helps us to track when each dataset was ingested

In [0]:
from pyspark.sql.functions import current_date

df = df.withColumn("ingestion_date", current_date())

In [0]:
# Show result
df.display(5, truncate=False)

period,org_code,parent_org,org_name,ae_attendances_type_1,ae_attendances_type_2,ae_attendances_other,ae_attendances_booked_type_1,ae_attendances_booked_type_2,ae_attendances_booked_other,attendances_over_4hrs_type_1,attendances_over_4hrs_type_2,attendances_over_4hrs_other,attendances_over_4hrs_booked_type_1,attendances_over_4hrs_booked_type_2,attendances_over_4hrs_booked_other,patients_4_12hrs_dta,patients_12hrs_plus_dta,emergency_admissions_ae_type_1,emergency_admissions_ae_type_2,emergency_admissions_other_ae,other_emergency_admissions,ingestion_date
MSitAE-SEPTEMBER-2025,AQN04,NHS ENGLAND SOUTH EAST,PHL LYMINGTON UTC,0,0,2988,0,0,11,0,0,15,0,0,0,0,0,0,0,0,0,2026-01-13
MSitAE-SEPTEMBER-2025,RBQ,NHS ENGLAND NORTH WEST,LIVERPOOL HEART AND CHEST HOSPITAL NHS FOUNDATION TRUST,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,134,2026-01-13
MSitAE-SEPTEMBER-2025,RLQ,NHS ENGLAND MIDLANDS,WYE VALLEY NHS TRUST,5972,1130,155,107,0,0,2510,0,0,26,0,0,443,274,1448,0,0,142,2026-01-13
MSitAE-SEPTEMBER-2025,Y02615,NHS ENGLAND MIDLANDS,SOUTH BIRMINGHAM GP WALK IN CENTRE,0,0,6084,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2026-01-13
MSitAE-SEPTEMBER-2025,AAH,NHS ENGLAND SOUTH WEST,TETBURY HOSPITAL TRUST LTD,0,0,593,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2026-01-13
MSitAE-SEPTEMBER-2025,C82038,NHS ENGLAND MIDLANDS,LATHAM HOUSE MEDICAL PRACTICE,0,0,363,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2026-01-13
MSitAE-SEPTEMBER-2025,NQTE4,NHS ENGLAND MIDLANDS,SUMMERFIELD URGENT CARE CENTRE,0,0,3150,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2026-01-13
MSitAE-SEPTEMBER-2025,RJ8,NHS ENGLAND SOUTH WEST,CORNWALL PARTNERSHIP NHS FOUNDATION TRUST,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,67,2026-01-13
MSitAE-SEPTEMBER-2025,RW1,NHS ENGLAND SOUTH EAST,HAMPSHIRE AND ISLE OF WIGHT HEALTHCARE NHS FOUNDATION TRUST,0,0,2793,0,0,0,0,0,64,0,0,0,0,0,0,0,0,124,2026-01-13
MSitAE-SEPTEMBER-2025,RWF,NHS ENGLAND SOUTH EAST,MAIDSTONE AND TUNBRIDGE WELLS NHS TRUST,19916,0,2105,1049,0,157,4113,0,69,141,0,3,839,96,4683,0,0,672,2026-01-13


## 4. Modify column Period in Month-Year format using regexp_replace()

In [0]:
from pyspark.sql.functions import regexp_replace, col

df = df.withColumn(
    "Period",
    regexp_replace(col("Period"), "MSitAE-", "")
)

In [0]:
df.display()

Period,org_code,parent_org,org_name,ae_attendances_type_1,ae_attendances_type_2,ae_attendances_other,ae_attendances_booked_type_1,ae_attendances_booked_type_2,ae_attendances_booked_other,attendances_over_4hrs_type_1,attendances_over_4hrs_type_2,attendances_over_4hrs_other,attendances_over_4hrs_booked_type_1,attendances_over_4hrs_booked_type_2,attendances_over_4hrs_booked_other,patients_4_12hrs_dta,patients_12hrs_plus_dta,emergency_admissions_ae_type_1,emergency_admissions_ae_type_2,emergency_admissions_other_ae,other_emergency_admissions,ingestion_date
SEPTEMBER-2025,AQN04,NHS ENGLAND SOUTH EAST,PHL LYMINGTON UTC,0,0,2988,0,0,11,0,0,15,0,0,0,0,0,0,0,0,0,2026-01-13
SEPTEMBER-2025,RBQ,NHS ENGLAND NORTH WEST,LIVERPOOL HEART AND CHEST HOSPITAL NHS FOUNDATION TRUST,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,134,2026-01-13
SEPTEMBER-2025,RLQ,NHS ENGLAND MIDLANDS,WYE VALLEY NHS TRUST,5972,1130,155,107,0,0,2510,0,0,26,0,0,443,274,1448,0,0,142,2026-01-13
SEPTEMBER-2025,Y02615,NHS ENGLAND MIDLANDS,SOUTH BIRMINGHAM GP WALK IN CENTRE,0,0,6084,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2026-01-13
SEPTEMBER-2025,AAH,NHS ENGLAND SOUTH WEST,TETBURY HOSPITAL TRUST LTD,0,0,593,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2026-01-13
SEPTEMBER-2025,C82038,NHS ENGLAND MIDLANDS,LATHAM HOUSE MEDICAL PRACTICE,0,0,363,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2026-01-13
SEPTEMBER-2025,NQTE4,NHS ENGLAND MIDLANDS,SUMMERFIELD URGENT CARE CENTRE,0,0,3150,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2026-01-13
SEPTEMBER-2025,RJ8,NHS ENGLAND SOUTH WEST,CORNWALL PARTNERSHIP NHS FOUNDATION TRUST,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,11,67,2026-01-13
SEPTEMBER-2025,RW1,NHS ENGLAND SOUTH EAST,HAMPSHIRE AND ISLE OF WIGHT HEALTHCARE NHS FOUNDATION TRUST,0,0,2793,0,0,0,0,0,64,0,0,0,0,0,0,0,0,124,2026-01-13
SEPTEMBER-2025,RWF,NHS ENGLAND SOUTH EAST,MAIDSTONE AND TUNBRIDGE WELLS NHS TRUST,19916,0,2105,1049,0,157,4113,0,69,141,0,3,839,96,4683,0,0,672,2026-01-13


## Write Clean and Transformed Data to Silver Layer

In [0]:
silver_path = "/mnt/nhs_raw/curated/silver/"

In [0]:
silver_df = df.write \
    .format("delta") \
    .mode("overwrite") \
    .partitionBy("Period") \
    .save(silver_path)


In [0]:
silver_df = spark.read.format("delta").load(silver_path)
silver_df.display(5, truncate=False)


Period,org_code,parent_org,org_name,ae_attendances_type_1,ae_attendances_type_2,ae_attendances_other,ae_attendances_booked_type_1,ae_attendances_booked_type_2,ae_attendances_booked_other,attendances_over_4hrs_type_1,attendances_over_4hrs_type_2,attendances_over_4hrs_other,attendances_over_4hrs_booked_type_1,attendances_over_4hrs_booked_type_2,attendances_over_4hrs_booked_other,patients_4_12hrs_dta,patients_12hrs_plus_dta,emergency_admissions_ae_type_1,emergency_admissions_ae_type_2,emergency_admissions_other_ae,other_emergency_admissions,ingestion_date
SEPTEMBER-2025,RAL,NHS ENGLAND LONDON,ROYAL FREE LONDON NHS FOUNDATION TRUST,26019,567,14950,128,2,149,8625,0,603,46,0,11,1018,1975,6552,2,96,695,2026-01-13
SEPTEMBER-2025,RTD,NHS ENGLAND NORTH EAST AND YORKSHIRE,THE NEWCASTLE UPON TYNE HOSPITALS NHS FOUNDATION TRUST,12773,1735,6244,0,0,1068,5163,79,0,0,0,0,541,12,3792,20,0,2882,2026-01-13
SEPTEMBER-2025,RX1,NHS ENGLAND MIDLANDS,NOTTINGHAM UNIVERSITY HOSPITALS NHS TRUST,12732,2142,0,0,0,0,6930,142,0,0,0,0,255,634,3920,0,815,4519,2026-01-13
SEPTEMBER-2025,RH5,NHS ENGLAND SOUTH WEST,SOMERSET NHS FOUNDATION TRUST,12710,0,8427,0,0,0,6042,0,127,0,0,0,205,100,3119,0,0,1711,2026-01-13
SEPTEMBER-2025,Y02147,NHS ENGLAND LONDON,URGENT CARE CENTRE (QMS),0,0,7801,0,0,336,0,0,123,0,0,0,0,0,0,0,0,0,2026-01-13
SEPTEMBER-2025,RYJ,NHS ENGLAND LONDON,IMPERIAL COLLEGE HEALTHCARE NHS TRUST,11866,3895,7038,0,0,531,5300,197,333,0,0,1,588,0,3523,2,0,1126,2026-01-13
SEPTEMBER-2025,RJ7,NHS ENGLAND LONDON,ST GEORGE'S UNIVERSITY HOSPITALS NHS FOUNDATION TRUST,8302,0,4055,522,0,0,2580,0,172,75,0,0,376,500,2535,0,0,694,2026-01-13
SEPTEMBER-2025,RBL,NHS ENGLAND NORTH WEST,WIRRAL UNIVERSITY TEACHING HOSPITAL NHS FOUNDATION TRUST,8030,0,2802,0,0,124,4600,0,95,0,0,2,563,696,2039,0,0,1799,2026-01-13
SEPTEMBER-2025,RWJ,NHS ENGLAND NORTH WEST,STOCKPORT NHS FOUNDATION TRUST,7974,0,1010,328,0,1398,3249,0,42,125,0,1,687,324,2759,0,67,818,2026-01-13
SEPTEMBER-2025,RMC,NHS ENGLAND NORTH WEST,BOLTON NHS FOUNDATION TRUST,8141,0,3746,26,0,3,3816,0,269,8,0,0,536,213,2612,0,115,947,2026-01-13
