In [0]:
# Configure Spark to reach the ADLS account through a service principal (OAuth).
# Every credential is pulled from the "hc-secret-scope" secret scope at run time.

tenant_id = dbutils.secrets.get("hc-secret-scope", "dir-id")

# Each pair is (Spark/Hadoop conf key, value); they are applied in the order listed.
for conf_key, conf_value in (
    ("fs.azure.account.auth.type.hpadlsacc.dfs.core.windows.net", "OAuth"),
    (
        "fs.azure.account.oauth.provider.type.hpadlsacc.dfs.core.windows.net",
        "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    ),
    (
        "fs.azure.account.oauth2.client.id.hpadlsacc.dfs.core.windows.net",
        dbutils.secrets.get("hc-secret-scope", "app-key"),
    ),
    (
        "fs.azure.account.oauth2.client.secret.hpadlsacc.dfs.core.windows.net",
        dbutils.secrets.get("hc-secret-scope", "service-cred"),
    ),
    (
        "fs.azure.account.oauth2.client.endpoint.hpadlsacc.dfs.core.windows.net",
        # Token endpoint is scoped to the tenant id fetched above.
        f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
    ),
):
    spark.conf.set(conf_key, conf_value)

# Creating encounters table in Silver layer

In [0]:
# Bronze-layer locations of the encounters data, one per hospital.
src_hosa = "abfss://bronze@hpadlsacc.dfs.core.windows.net/hos-a/encounters"
src_hosb = "abfss://bronze@hpadlsacc.dfs.core.windows.net/hos-b/encounters"

# Load each hospital's Parquet data into its own DataFrame.
df_hosa = spark.read.parquet(src_hosa)
df_hosb = spark.read.parquet(src_hosb)

# Stack both hospitals into one DataFrame; unionByName aligns columns by name,
# not by position.
df_merged = df_hosa.unionByName(df_hosb)
df_merged.display()

# Register the combined data as the temp view "encounters" so the %sql cells can query it.
df_merged.createOrReplaceTempView("encounters")

In [0]:
%sql
-- Quality-check view built on top of the "encounters" temp view.
--   * EncounterID     : composite key "<source EncounterID>-<datasource>"
--   * SRC_EncounterID : the EncounterID exactly as delivered by the source
--   * is_quarantined  : TRUE when EncounterID or PatientID is NULL, otherwise FALSE
-- NOTE(review): concat() returns NULL when any argument is NULL, so a row with a
-- NULL source EncounterID also gets a NULL composite key - confirm that the
-- downstream merges are expected to cope with that.

CREATE OR REPLACE TEMP VIEW quality_checks AS
SELECT
  concat(EncounterID, '-', datasource)       AS EncounterID,
  EncounterID                                AS SRC_EncounterID,
  PatientID,
  EncounterDate,
  EncounterType,
  ProviderID,
  DepartmentID,
  ProcedureCode,
  InsertedDate                               AS SRC_InsertedDate,
  ModifiedDate                               AS SRC_ModifiedDate,
  datasource,
  -- IS NULL always yields TRUE/FALSE (never NULL), so this is a plain boolean flag
  (EncounterID IS NULL OR PatientID IS NULL) AS is_quarantined
FROM encounters;


In [0]:
%sql
-- Spot check: rows of quality_checks whose datasource is 'hos-b'
SELECT *
FROM quality_checks
WHERE datasource = 'hos-b'

In [0]:
%sql
-- Silver-layer encounters table, stored as Delta at an explicit (external) location.
-- Business columns mirror the quality_checks view; the last three columns are
-- bookkeeping written by the merge cells (insert/modify timestamps, current-version flag).

CREATE TABLE IF NOT EXISTS silver.encounters (
  EncounterID        STRING,
  SRC_EncounterID    STRING,
  PatientID          STRING,
  EncounterDate      DATE,
  EncounterType      STRING,
  ProviderID         STRING,
  DepartmentID       STRING,
  ProcedureCode      INTEGER,
  SRC_InsertedDate   DATE,
  SRC_ModifiedDate   DATE,
  datasource         STRING,
  is_quarantined     BOOLEAN,
  audit_insertdate   TIMESTAMP,
  audit_modifieddate TIMESTAMP,
  is_current         BOOLEAN
)
USING DELTA
LOCATION "abfss://silver@hpadlsacc.dfs.core.windows.net/encounters/"


In [0]:
%sql
-- Step 1 (SCD Type 2): expire the current version of every encounter that changed.
-- A target row is matched when it has the same EncounterID as a source row and is
-- the active version (is_current = true). If any tracked column differs, the row is
-- turned into history:
--   target.is_current         = false
--   target.audit_modifieddate = current_timestamp()
-- The replacement version is inserted by the next cell (Step 2).
--
-- Columns are compared with the null-safe operator: NOT (a <=> b).
-- A plain `a != b` evaluates to NULL (treated as "no change") whenever either side
-- is NULL, so a value that went from NULL to something, or from something to NULL,
-- would never expire the old row and the new version would never be inserted.

MERGE INTO silver.encounters AS target
USING quality_checks AS source
ON target.EncounterID = source.EncounterID AND target.is_current = true
WHEN MATCHED AND (
    NOT (target.SRC_EncounterID  <=> source.SRC_EncounterID)  OR
    NOT (target.PatientID        <=> source.PatientID)        OR
    NOT (target.EncounterDate    <=> source.EncounterDate)    OR
    NOT (target.EncounterType    <=> source.EncounterType)    OR
    NOT (target.ProviderID       <=> source.ProviderID)       OR
    NOT (target.DepartmentID     <=> source.DepartmentID)     OR
    NOT (target.ProcedureCode    <=> source.ProcedureCode)    OR
    NOT (target.SRC_InsertedDate <=> source.SRC_InsertedDate) OR
    NOT (target.SRC_ModifiedDate <=> source.SRC_ModifiedDate) OR
    NOT (target.datasource       <=> source.datasource)       OR
    NOT (target.is_quarantined   <=> source.is_quarantined)
)
THEN
  UPDATE SET
    target.is_current = false,
    target.audit_modifieddate = current_timestamp();


In [0]:
%sql
-- Step 2: insert every source row that has no active (is_current = true) row with
-- the same EncounterID in silver.encounters. That covers two cases:
--   * encounters that are not in the silver table yet, and
--   * encounters whose previously active row is no longer current
--     (is_current = false), so the match condition below fails for them.
-- Inserted rows become the current version; both audit timestamps are set to now.
-- NOTE(review): a source row whose EncounterID is NULL can never satisfy the
-- equality below, so it is inserted on every run - confirm this is intended.

MERGE INTO silver.encounters AS tgt
USING quality_checks AS src
ON tgt.EncounterID = src.EncounterID
   AND tgt.is_current = true
WHEN NOT MATCHED THEN
INSERT (
  EncounterID, SRC_EncounterID, PatientID,
  EncounterDate, EncounterType,
  ProviderID, DepartmentID, ProcedureCode,
  SRC_InsertedDate, SRC_ModifiedDate,
  datasource, is_quarantined,
  audit_insertdate, audit_modifieddate,
  is_current
)
VALUES (
  src.EncounterID, src.SRC_EncounterID, src.PatientID,
  src.EncounterDate, src.EncounterType,
  src.ProviderID, src.DepartmentID, src.ProcedureCode,
  src.SRC_InsertedDate, src.SRC_ModifiedDate,
  src.datasource, src.is_quarantined,
  current_timestamp(), current_timestamp(),  -- audit_insertdate, audit_modifieddate
  true                                       -- is_current
);


In [0]:
%sql
-- Number of silver rows with a non-NULL patientid per (SRC_EncounterID, datasource),
-- largest groups first. The count covers all rows of the table, not only current ones.

SELECT
  SRC_EncounterID,
  datasource,
  COUNT(patientid) AS patient_count
FROM silver.encounters
GROUP BY
  SRC_EncounterID,
  datasource
ORDER BY
  patient_count DESC;


In [0]:
%sql
-- Manual reset only; intentionally left commented out. Uncomment to drop the table:
-- drop table silver.encounters