In [0]:
# Load every bronze-layer claims parquet file into a single DataFrame.
# NOTE(review): the path has no leading '/', so it is resolved relative to the
# session's default location -- confirm this points at the intended mount.
df_claims = spark.read.parquet('mnt/bronze/claims/*.parquet')

# Expose the DataFrame to the SQL cells below under the name `claims_data`.
df_claims.createOrReplaceTempView('claims_data')

# Render the loaded rows in the notebook for a quick visual check.
display(df_claims)

In [0]:
%sql
-- Show the column names and data types of the claims_data temp view.
DESCRIBE TABLE claims_data

In [0]:
%sql
-- Temp view over claims_data that:
--   * builds the silver key ClaimID as "<source ClaimID>-<datasource>",
--   * keeps the untouched source key as SRC_ClaimID,
--   * renames the source audit columns with an SRC_ prefix,
--   * flags rows missing any mandatory identifier/date as is_quarantined.
CREATE OR REPLACE TEMP VIEW quality_checks AS
SELECT
    concat(ClaimID, '-', datasource) AS ClaimID,
    ClaimID                          AS SRC_ClaimID,
    TransactionID,
    PatientID,
    EncounterID,
    ProviderID,
    DeptID,
    ServiceDate,
    ClaimDate,
    PayorID,
    ClaimAmount,
    PaidAmount,
    ClaimStatus,
    PayorType,
    Deductible,
    Coinsurance,
    Copay,
    InsertDate                       AS SRC_InsertDate,
    ModifiedDate                     AS SRC_ModifiedDate,
    datasource,
    -- IS NULL always evaluates to TRUE or FALSE, so the OR-chain is itself
    -- the boolean flag: TRUE when any mandatory field is missing.
    (
        ClaimID IS NULL
        OR TransactionID IS NULL
        OR PatientID IS NULL
        OR ServiceDate IS NULL
    )                                AS is_quarantined
FROM claims_data;

In [0]:
%sql
-- Silver-layer claims table. Created only on the first run; later runs leave
-- an existing table (and its data) untouched.
CREATE TABLE IF NOT EXISTS healthcarerevenuecyclemanagement_databricks.silver.claims (
    -- Keys and identifiers
    ClaimID            STRING,          -- silver key populated from quality_checks.ClaimID
    SRC_ClaimID        STRING,          -- claim id as delivered by the source
    TransactionID      STRING,
    PatientID          STRING,
    EncounterID        STRING,
    ProviderID         STRING,
    DeptID             STRING,

    -- Claim attributes
    ServiceDate        DATE,
    ClaimDate          DATE,
    PayorID            STRING,
    ClaimAmount        DECIMAL(10,4),
    PaidAmount         DECIMAL(10,4),
    ClaimStatus        STRING,
    PayorType          STRING,
    Deductible         DECIMAL(10,4),
    Coinsurance        DECIMAL(10,4),
    Copay              DECIMAL(10,4),

    -- Source-side audit columns and lineage
    SRC_InsertDate     DATE,
    SRC_ModifiedDate   DATE,
    datasource         STRING,

    -- Data-quality flag and row-versioning columns maintained by the load
    is_quarantined     BOOLEAN,
    audit_insertdate   TIMESTAMP,
    audit_modifieddate TIMESTAMP,
    is_current         BOOLEAN
)

In [0]:
%sql
-- SCD Type 2 load of silver.claims from the quality_checks temp view.
--
-- Outcome per source claim (matched on ClaimID against the row with
-- is_current = true):
--   * no current row in the target      -> insert it as the current version
--   * current row exists, no difference -> target left untouched
--   * current row exists, any attribute differs
--                                       -> close the old version (is_current = false)
--                                          AND insert the new version in the same run
--
-- A MERGE applies at most one action per source row, so a changed claim is
-- staged twice: once keyed by its ClaimID (matches and closes the old version)
-- and once with a NULL merge_key (can never match, so it falls through to the
-- INSERT branch and becomes the new current version).
--
-- Values are cast to the target column types once, before comparison, so the
-- change check and the inserted values agree. Comparisons use the null-safe
-- operator <=> so NULL -> value and value -> NULL transitions count as changes
-- (plain <> evaluates to NULL in those cases and would hide them).
--
-- NOTE(review): a source row whose ClaimID key is NULL can never match an
-- existing row, so it is inserted again on every run -- confirm whether such
-- rows should be filtered out or keyed differently.
-- NOTE(review): assumes quality_checks has at most one row per ClaimID;
-- duplicates would make several source rows match one target row -- confirm.
MERGE INTO healthcarerevenuecyclemanagement_databricks.silver.claims AS target
USING (
    WITH typed_source AS (
        -- Source rows converted to the silver column types.
        SELECT
            ClaimID,
            SRC_ClaimID,
            TransactionID,
            PatientID,
            EncounterID,
            ProviderID,
            DeptID,
            CAST(ServiceDate AS DATE)            AS ServiceDate,
            CAST(ClaimDate AS DATE)              AS ClaimDate,
            PayorID,
            CAST(ClaimAmount AS DECIMAL(10,4))   AS ClaimAmount,
            CAST(PaidAmount AS DECIMAL(10,4))    AS PaidAmount,
            ClaimStatus,
            PayorType,
            CAST(Deductible AS DECIMAL(10,4))    AS Deductible,
            CAST(Coinsurance AS DECIMAL(10,4))   AS Coinsurance,
            CAST(Copay AS DECIMAL(10,4))         AS Copay,
            CAST(SRC_InsertDate AS DATE)         AS SRC_InsertDate,
            CAST(SRC_ModifiedDate AS DATE)       AS SRC_ModifiedDate,
            datasource,
            is_quarantined
        FROM quality_checks
    ),
    compared AS (
        -- Each source row paired with its current silver version, if any.
        SELECT
            src.*,
            cur.ClaimID IS NOT NULL AS has_current,
            (
                cur.ClaimID IS NOT NULL
                AND NOT (
                    cur.SRC_ClaimID          <=> src.SRC_ClaimID
                    AND cur.TransactionID    <=> src.TransactionID
                    AND cur.PatientID        <=> src.PatientID
                    AND cur.EncounterID      <=> src.EncounterID
                    AND cur.ProviderID       <=> src.ProviderID
                    AND cur.DeptID           <=> src.DeptID
                    AND cur.ServiceDate      <=> src.ServiceDate
                    AND cur.ClaimDate        <=> src.ClaimDate
                    AND cur.PayorID          <=> src.PayorID
                    AND cur.ClaimAmount      <=> src.ClaimAmount
                    AND cur.PaidAmount       <=> src.PaidAmount
                    AND cur.ClaimStatus      <=> src.ClaimStatus
                    AND cur.PayorType        <=> src.PayorType
                    AND cur.Deductible       <=> src.Deductible
                    AND cur.Coinsurance      <=> src.Coinsurance
                    AND cur.Copay            <=> src.Copay
                    AND cur.SRC_InsertDate   <=> src.SRC_InsertDate
                    AND cur.SRC_ModifiedDate <=> src.SRC_ModifiedDate
                    AND cur.datasource       <=> src.datasource
                    AND cur.is_quarantined   <=> src.is_quarantined
                )
            ) AS is_changed
        FROM typed_source AS src
        LEFT JOIN healthcarerevenuecyclemanagement_databricks.silver.claims AS cur
            ON cur.ClaimID = src.ClaimID
            AND cur.is_current = true
    )
    -- Keyed rows: changed claims (will match and close the old version) and
    -- brand-new claims (will not match and are inserted). Unchanged claims are
    -- left out entirely.
    SELECT
        compared.ClaimID AS merge_key,
        compared.*
    FROM compared
    WHERE compared.is_changed OR NOT compared.has_current
    UNION ALL
    -- Changed claims again with a NULL key: never match, so they are inserted
    -- as the new current version.
    SELECT
        CAST(NULL AS STRING) AS merge_key,
        compared.*
    FROM compared
    WHERE compared.is_changed
) AS source
ON target.ClaimID = source.merge_key AND target.is_current = true
WHEN MATCHED AND source.is_changed THEN UPDATE SET
    target.is_current = false,
    target.audit_modifieddate = current_timestamp()
WHEN NOT MATCHED THEN INSERT (
    ClaimID,
    SRC_ClaimID,
    TransactionID,
    PatientID,
    EncounterID,
    ProviderID,
    DeptID,
    ServiceDate,
    ClaimDate,
    PayorID,
    ClaimAmount,
    PaidAmount,
    ClaimStatus,
    PayorType,
    Deductible,
    Coinsurance,
    Copay,
    SRC_InsertDate,
    SRC_ModifiedDate,
    datasource,
    is_quarantined,
    audit_insertdate,
    audit_modifieddate,
    is_current
) VALUES (
    source.ClaimID,
    source.SRC_ClaimID,
    source.TransactionID,
    source.PatientID,
    source.EncounterID,
    source.ProviderID,
    source.DeptID,
    source.ServiceDate,
    source.ClaimDate,
    source.PayorID,
    source.ClaimAmount,
    source.PaidAmount,
    source.ClaimStatus,
    source.PayorType,
    source.Deductible,
    source.Coinsurance,
    source.Copay,
    source.SRC_InsertDate,
    source.SRC_ModifiedDate,
    source.datasource,
    source.is_quarantined,
    current_timestamp(),
    current_timestamp(),
    true
)