In [0]:
# Configure OAuth client-credentials access to the hpadlsacc storage account
# (dfs.core.windows.net endpoint). The client id, client secret and tenant id
# are all read from the "hc-secret-scope" secret scope rather than hardcoded.
spark.conf.set(
    "fs.azure.account.auth.type.hpadlsacc.dfs.core.windows.net",
    "OAuth",
)
spark.conf.set(
    "fs.azure.account.oauth.provider.type.hpadlsacc.dfs.core.windows.net",
    "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
)
spark.conf.set(
    "fs.azure.account.oauth2.client.id.hpadlsacc.dfs.core.windows.net",
    dbutils.secrets.get("hc-secret-scope", "app-key"),
)
spark.conf.set(
    "fs.azure.account.oauth2.client.secret.hpadlsacc.dfs.core.windows.net",
    dbutils.secrets.get("hc-secret-scope", "service-cred"),
)

# The token endpoint is tenant-specific, so the tenant (directory) id is
# fetched first and interpolated into the login URL.
tenant_id = dbutils.secrets.get("hc-secret-scope", "dir-id")
spark.conf.set(
    "fs.azure.account.oauth2.client.endpoint.hpadlsacc.dfs.core.windows.net",
    f"https://login.microsoftonline.com/{tenant_id}/oauth2/token",
)

# Creating the patients table in the silver container

In [0]:
# Bronze-container source paths for each hospital's patient data
src_hosa = "abfss://bronze@hpadlsacc.dfs.core.windows.net/hos-a/patients"
src_hosb = "abfss://bronze@hpadlsacc.dfs.core.windows.net/hos-b/patients"

# Hospital A: read the parquet data and register it as temp view `patients_hosa`
df_hosa = spark.read.parquet(src_hosa)
df_hosa.createOrReplaceTempView("patients_hosa")

# Hospital B: read the parquet data and register it as temp view `patients_hosb`
df_hosb = spark.read.parquet(src_hosb)
df_hosb.createOrReplaceTempView("patients_hosb")

# Both DataFrames are exposed as temp views so that the SQL cells can
# combine them with UNION ALL when building the cdm_patients view.

In [0]:
%sql
-- Preview the Hospital A patient rows (temp view patients_hosa)
SELECT *
FROM patients_hosa;

In [0]:
%sql
-- Preview the Hospital B patient rows (temp view patients_hosb)
SELECT *
FROM patients_hosb;

In [0]:
%sql
-- Common data model (CDM) view for patients.
-- The two hospitals deliver patient data with different column names, so each
-- source is first projected onto one shared column layout, the two sets are
-- stacked with UNION ALL, and a Patient_Key of the form
-- '<SRC_PatientID>-<datasource>' is derived for every row.

CREATE OR REPLACE TEMP VIEW cdm_patients AS
WITH src_a AS (
  -- Hospital A already uses the CDM column names (only PatientID is renamed)
  SELECT
    PatientID AS SRC_PatientID,
    FirstName,
    LastName,
    MiddleName,
    SSN,
    PhoneNumber,
    Gender,
    DOB,
    Address,
    ModifiedDate,
    'hosa' AS datasource
  FROM patients_hosa
),
src_b AS (
  -- Hospital B: rename its ID / name / updated-date columns to the CDM names
  SELECT
    ID AS SRC_PatientID,
    F_Name AS FirstName,
    L_Name AS LastName,
    M_Name AS MiddleName,
    SSN,
    PhoneNumber,
    Gender,
    DOB,
    Address,
    Updated_Date AS ModifiedDate,
    'hosb' AS datasource
  FROM patients_hosb
),
all_patients AS (
  SELECT * FROM src_a
  UNION ALL
  SELECT * FROM src_b
)
SELECT
  CONCAT(SRC_PatientID, '-', datasource) AS Patient_Key,
  SRC_PatientID,
  FirstName,
  LastName,
  MiddleName,
  SSN,
  PhoneNumber,
  Gender,
  DOB,
  Address,
  ModifiedDate,
  datasource
FROM all_patients;


In [0]:
%sql
-- Preview the combined CDM patient rows
SELECT *
FROM cdm_patients

In [0]:
%sql
-- Quality-check view over cdm_patients.
-- Adds the boolean column is_quarantined:
--   TRUE  when SRC_PatientID or DOB is NULL, or FirstName is NULL or the
--         literal text 'null' (compared case-insensitively)
--   FALSE otherwise
-- ModifiedDate is exposed as SRC_ModifiedDate.

CREATE OR REPLACE TEMP VIEW quality_checks AS
SELECT
    Patient_Key,
    SRC_PatientID,
    FirstName,
    LastName,
    MiddleName,
    SSN,
    PhoneNumber,
    Gender,
    DOB,
    Address,
    ModifiedDate AS SRC_ModifiedDate,
    datasource,
    (
        SRC_PatientID IS NULL
        OR DOB IS NULL
        -- COALESCE folds the NULL case into the 'null'-text case, so this
        -- expression is always TRUE or FALSE, never NULL
        OR COALESCE(LOWER(FirstName), 'null') = 'null'
    ) AS is_quarantined
FROM cdm_patients;


In [0]:
%sql
-- Preview the quality-checked rows, quarantined rows (TRUE) first
SELECT *
FROM quality_checks
ORDER BY is_quarantined DESC

In [0]:
%sql
-- Create the silver.patients Delta table if it does not already exist.
-- The explicit LOCATION makes it an external table whose data lives in the
-- silver container. The first 13 columns mirror the quality_checks view;
-- the last three are audit / history columns maintained by the MERGE cell.

CREATE TABLE IF NOT EXISTS silver.patients (
  Patient_Key STRING,
  SRC_PatientID STRING,
  FirstName STRING,
  LastName STRING,
  MiddleName STRING,
  SSN STRING,
  PhoneNumber STRING,
  Gender STRING,
  DOB DATE,
  Address STRING,
  SRC_ModifiedDate TIMESTAMP,
  datasource STRING,
  is_quarantined BOOLEAN,
  -- audit columns: set to current_timestamp() when the row is inserted;
  -- modified_date is set again when the row is expired
  inserted_date TIMESTAMP,
  modified_date TIMESTAMP,
  -- true for the active version of a Patient_Key, false for expired versions
  is_current BOOLEAN
)
USING DELTA
LOCATION "abfss://silver@hpadlsacc.dfs.core.windows.net/patients";

In [0]:
%sql
-- SCD Type 2 load of silver.patients from quality_checks in ONE atomic MERGE.
--
-- Why the source is a UNION ALL ("staged updates"):
--   A source row whose Patient_Key matches the current target row can only
--   take the WHEN MATCHED branch, so with a plain source a changed patient
--   would be expired but its new version would not be inserted until the
--   next run (leaving no current row in between). To do both in the same
--   run, every CHANGED source row is supplied twice:
--     1. with merge_key = Patient_Key -> matches the current target row and
--        expires it (Step 1);
--     2. with merge_key = NULL        -> can never match, so it falls into
--        WHEN NOT MATCHED and is inserted as the new current version (Step 2).
--   Brand-new patients appear only in set 1, match nothing, and are inserted.
--   Unchanged patients appear only in set 1, match, fail the change test,
--   and are left untouched.
--
-- Why <=> instead of <>:
--   `a <> b` evaluates to NULL when either side is NULL, so a change from
--   NULL to a value (or a value to NULL) would never be detected. The
--   null-safe equality operator <=> always returns TRUE/FALSE.
--
-- NOTE(review): Patient_Key comes from CONCAT(SRC_PatientID, '-', datasource)
-- in cdm_patients; if that is NULL for rows with a NULL SRC_PatientID, those
-- rows never match and are re-inserted on every run (same as before this
-- change) - confirm whether quarantined rows without an id should be loaded.

MERGE INTO silver.patients AS target
USING (
    -- Set 1: every incoming row, keyed by its own Patient_Key
    SELECT
        qc.Patient_Key AS merge_key,
        qc.*
    FROM quality_checks AS qc

    UNION ALL

    -- Set 2: only rows that differ from the current target version,
    -- with a NULL merge key so they are always inserted
    SELECT
        CAST(NULL AS STRING) AS merge_key,
        qc.*
    FROM quality_checks AS qc
    INNER JOIN silver.patients AS cur
        ON cur.Patient_Key = qc.Patient_Key
        AND cur.is_current = true
    WHERE
        NOT (cur.SRC_PatientID <=> qc.SRC_PatientID) OR
        NOT (cur.FirstName <=> qc.FirstName) OR
        NOT (cur.LastName <=> qc.LastName) OR
        NOT (cur.MiddleName <=> qc.MiddleName) OR
        NOT (cur.SSN <=> qc.SSN) OR
        NOT (cur.PhoneNumber <=> qc.PhoneNumber) OR
        NOT (cur.Gender <=> qc.Gender) OR
        NOT (cur.DOB <=> qc.DOB) OR
        NOT (cur.Address <=> qc.Address) OR
        NOT (cur.SRC_ModifiedDate <=> qc.SRC_ModifiedDate) OR
        NOT (cur.datasource <=> qc.datasource) OR
        NOT (cur.is_quarantined <=> qc.is_quarantined)
) AS source
ON target.Patient_Key = source.merge_key
AND target.is_current = true

-- Step 1: the active version of a patient whose attributes changed is marked
-- as historical (is_current = false) and stamped with the modification time.
WHEN MATCHED
AND (
    NOT (target.SRC_PatientID <=> source.SRC_PatientID) OR
    NOT (target.FirstName <=> source.FirstName) OR
    NOT (target.LastName <=> source.LastName) OR
    NOT (target.MiddleName <=> source.MiddleName) OR
    NOT (target.SSN <=> source.SSN) OR
    NOT (target.PhoneNumber <=> source.PhoneNumber) OR
    NOT (target.Gender <=> source.Gender) OR
    NOT (target.DOB <=> source.DOB) OR
    NOT (target.Address <=> source.Address) OR
    NOT (target.SRC_ModifiedDate <=> source.SRC_ModifiedDate) OR
    NOT (target.datasource <=> source.datasource) OR
    NOT (target.is_quarantined <=> source.is_quarantined)
)
THEN UPDATE SET
    target.is_current = false,
    target.modified_date = current_timestamp()

-- Step 2: insert brand-new patients and the new versions of changed patients
-- (the NULL-merge_key rows), marking them as the current version.
WHEN NOT MATCHED
THEN INSERT (
    Patient_Key,
    SRC_PatientID,
    FirstName,
    LastName,
    MiddleName,
    SSN,
    PhoneNumber,
    Gender,
    DOB,
    Address,
    SRC_ModifiedDate,
    datasource,
    is_quarantined,
    inserted_date,
    modified_date,
    is_current
)
VALUES (
    source.Patient_Key,
    source.SRC_PatientID,
    source.FirstName,
    source.LastName,
    source.MiddleName,
    source.SSN,
    source.PhoneNumber,
    source.Gender,
    source.DOB,
    source.Address,
    source.SRC_ModifiedDate,
    source.datasource,
    source.is_quarantined,
    current_timestamp(), -- Set inserted_date to current timestamp
    current_timestamp(), -- Set modified_date to current timestamp
    true -- Mark as current
);


In [0]:
%sql
-- List every Patient_Key that has more than one row in silver.patients,
-- highest row count first. The count includes all rows for the key,
-- i.e. expired versions (is_current = false) as well as the current one.

SELECT
    COUNT(*) AS duplicates,
    Patient_Key
FROM silver.patients
GROUP BY Patient_Key
HAVING COUNT(*) > 1
ORDER BY duplicates DESC;


In [0]:
%sql
-- drop table silver.patients