In [0]:
# Declare the notebook's text widgets. Each entry is (widget name, label);
# every widget starts out with an empty default value.
for _widget_name, _widget_label in (
    ("p_client_id", "Client ID"),
    ("p_consolidated_schema_name", "Consolidated Schema Name"),
    ("p_environment_name", "Environment Name"),
):
    dbutils.widgets.text(_widget_name, "", _widget_label)

In [0]:
# Read the widget values, trimming surrounding whitespace.
v_clientid = dbutils.widgets.get("p_client_id").strip()
v_consolidated_schema_name = dbutils.widgets.get("p_consolidated_schema_name").strip()
v_environment_name = dbutils.widgets.get("p_environment_name").strip()

# Fail fast on blank identifiers. Both values are concatenated into storage
# paths and SQL identifiers further down, so an empty string would silently
# point the notebook at the storage root or produce malformed SQL.
# The environment name is left to get_locations_by_env to validate.
_missing_params = [
    _param_name
    for _param_name, _param_value in (
        ("p_client_id", v_clientid),
        ("p_consolidated_schema_name", v_consolidated_schema_name),
    )
    if not _param_value
]
if _missing_params:
    raise ValueError(
        "Missing required notebook parameter(s): " + ", ".join(_missing_params)
    )

In [0]:
%run "../../common/ParserCommonFunctions"

In [0]:
# Look up the storage roots and catalog for the selected environment.
# The returned mapping is read for the keys 'source', 'archive' and
# 'unity_catalog'.
v_locations_dict = get_locations_by_env(v_environment_name)

v_file_location = v_locations_dict['source']
v_archive_location = v_locations_dict['archive']
v_unity_catalog = v_locations_dict['unity_catalog']

# Client-specific locations for 837 files: inbound, archived after a
# successful parse, and quarantined after a failure.
v_location_client = v_file_location + v_clientid
v_file_location_client = v_file_location + v_clientid + "/837/"
v_archive_location_client = v_archive_location + v_clientid + "/parsed/837/"
v_quarantined_location_client = v_archive_location + v_clientid + "/quarantined/837/"

# Catalog / schema names used for the client and consolidated tables.
v_client_schema = "db_" + v_clientid
v_catalog = v_unity_catalog
v_consolidated_catalog = v_unity_catalog

# Strip only a leading "db_" from the schema name. str.replace would also
# remove any later occurrence (e.g. "db_my_db_x" -> "my_x") and yield the
# wrong storage folder.
_schema_prefix = "db_"
if v_consolidated_schema_name.startswith(_schema_prefix):
    _consolidated_folder_name = v_consolidated_schema_name[len(_schema_prefix):]
else:
    _consolidated_folder_name = v_consolidated_schema_name
v_healthcare_file_location = v_file_location + _consolidated_folder_name

In [0]:
# List the client's inbound 837 folder and stop the notebook early when it
# contains nothing to process.
# NOTE(review): a `%pip install` cell runs after this one; on Databricks a
# %pip command can reset the Python state, which would discard the variables
# defined so far -- confirm, and consider moving that cell to the top.
v_folder_contents_list = dbutils.fs.ls(v_file_location_client)

if not v_folder_contents_list:
    dbutils.notebook.exit("No files to process")

In [0]:
%pip install ../x12-edi-parser

In [0]:
import json
import pandas as pd

from pyspark.sql.functions import (
    pandas_udf,
    from_json,
    col,
    schema_of_json,
    to_json,
    lit,
    current_timestamp,
    expr,
    split,
    element_at,
    concat,
    coalesce
)

from pyspark.sql.types import StructType, StructField, StringType, ArrayType

from ember import *
from ember.hls.healthcare import HealthcareManager as hm

from pyspark.sql.utils import AnalysisException


In [0]:
# Make sure the consolidated schema exists, backed by its managed location.
_consolidated_schema_fqn = ".".join([v_consolidated_catalog, v_consolidated_schema_name])
create_stmt = "CREATE SCHEMA IF NOT EXISTS {} MANAGED LOCATION '{}'".format(
    _consolidated_schema_fqn,
    v_healthcare_file_location,
)
print("Creating schema: " + create_stmt)
spark.sql(create_stmt)


In [0]:
# Shape of a quarantine record: the offending file and the reason.
quarantine_schema = StructType([
    StructField("filename", StringType(), True),
    StructField("error", StringType(), True)
])

# Paths of everything in the client's inbound 837 folder. These are used
# as-is as read paths further down, i.e. they are already complete paths.
v_files_list = [entry.path for entry in dbutils.fs.ls(v_file_location_client)]
v_good_files_list = []
v_quarantine_files_list = []

# Probe each file individually so one unreadable file does not fail the
# whole batch; failures are recorded with a STORAGE_READ_ERROR reason.
for v_file_path in v_files_list:
    try:
        # Pass the listed path unchanged: prefixing it with
        # v_file_location_client produced a non-existent path, which sent
        # every file to quarantine.
        # NOTE(review): the read is not materialised here, so presumably only
        # path/listing problems surface at this point -- confirm whether a
        # content-level check is wanted.
        spark.read.text(v_file_path, wholetext=True)
        v_good_files_list.append(v_file_path)
    except Exception as e:
        v_quarantine_files_list.append((v_file_path, f"[STORAGE_READ_ERROR] {str(e)}"))

# With an explicit schema an empty list yields an empty DataFrame, so no
# special case is needed when nothing was quarantined.
v_quarantine_files_df = spark.createDataFrame(v_quarantine_files_list, schema=quarantine_schema)

# add_audit_fields / append_or_create_table come from the common notebook
# loaded via %run; the frame is appended even when it is empty.
v_quarantine_files_audit_df = add_audit_fields(v_quarantine_files_df, lit(v_clientid), lit('tbd'))

v_quarantine_target_table_location = f"{v_consolidated_catalog}.{v_consolidated_schema_name}.quarantine"
append_or_create_table(v_quarantine_files_audit_df, v_quarantine_target_table_location)

In [0]:
# Column layout of the parsed-and-exploded frame, used to build an empty
# stand-in when there are no readable files.
# NOTE(review): assumes parse_edi_udf0 returns array<string> (its elements are
# later fed to from_json as JSON text) -- confirm against the UDF definition.
_v_parse_result_schema = StructType([
    StructField("value", StringType(), True),
    StructField("filename", StringType(), True),
    StructField("parsed", ArrayType(StringType()), True),
    StructField("json_str", StringType(), True),
])

if v_good_files_list:
    # Read each file as a single row and keep its source path alongside.
    v_good_files_df = spark.read.text(v_good_files_list, wholetext=True)
    v_good_files_df = v_good_files_df.withColumn("filename", v_good_files_df["_metadata.file_path"])
    # Parse every file, then emit one row per element of the parse result.
    v_good_files_with_results_df = (
        v_good_files_df
        .withColumn("parsed", parse_edi_udf0(col("value"), col("filename")))
        .withColumn("json_str", expr("explode(parsed)"))
    )
else:
    # Nothing readable: use an empty frame so the failure/success frames are
    # always defined and the later cells run as no-ops instead of raising
    # NameError.
    v_good_files_with_results_df = spark.createDataFrame([], schema=_v_parse_result_schema)

# The flag is looked up in the whole `parsed` array, so a file whose result
# contains the marker anywhere is treated as failed in its entirety.
_v_is_quarantined = col("parsed").cast("string").contains('"quarantine": true')

v_parsing_failures_df = v_good_files_with_results_df.filter(_v_is_quarantined)
v_parsing_success_df = v_good_files_with_results_df.filter(~_v_is_quarantined)

In [0]:
# Append files flagged by the parser to the quarantine table.
# The guard is an explicit None check: the truth value of a DataFrame says
# nothing about whether it has rows, so `if df:` only looked like an
# emptiness test.
if v_parsing_failures_df is not None:
    # Keep v_parsing_failures_df untouched and project into a new name.
    # Rebinding it to a frame without `json_str` made this cell fail when it
    # was run a second time.
    v_parsing_failures_quarantine_df = v_parsing_failures_df.select(
        col('filename'),
        col('json_str').alias('error'),
    )
    v_parsing_failures_with_audit_df = add_audit_fields(
        v_parsing_failures_quarantine_df, lit(v_clientid), lit('tbd')
    )

    append_or_create_table(v_parsing_failures_with_audit_df, v_quarantine_target_table_location)

In [0]:
def _str_field(name):
    """Return a string-typed StructField called ``name``."""
    return StructField(name, StringType())


def _str_struct(*names):
    """Return a StructType of string-typed fields, in the order given."""
    return StructType([_str_field(field_name) for field_name in names])


# ---------------- Provider Struct ----------------
# One provider entry; reused for every provider role below.
provider_struct = _str_struct(
    "city",
    "ein",
    "ein_type",
    "entity_type",
    "name",
    "npi",
    "provider_role",
    "state",
    "street",
    "taxonomy",
    "zip",
)

# ---------------- Submitter Contacts ----------------
# Array of string arrays.
submitter_contacts_type = ArrayType(ArrayType(StringType()))

# Field list shared by the `patient` and `subscriber` structs.
# NOTE(review): "subsciber_identifier" is kept exactly as spelled in the
# original schema; presumably it matches the key emitted by the parser --
# confirm before correcting the spelling.
_person_field_names = (
    "city",
    "dob",
    "dob_format",
    "gender_cd",
    "id",
    "mrn",
    "name",
    "patient_relationship_cd",
    "state",
    "street",
    "subsciber_identifier",
    "subscriber_relationship_cd",
    "zip",
)

# Top-level string fields, in schema order: file info, EDI, FunctionalGroup
# and Transaction attributes.
_top_level_string_fields = (
    # ---------------- File Info ----------------
    "filename",
    # ---------------- EDI ----------------
    "EDI.control_number",
    "EDI.date",
    "EDI.recipient_qualifier_id",
    "EDI.sender_qualifier_id",
    "EDI.standard_version",
    "EDI.time",
    # ---------------- FunctionalGroup ----------------
    "FunctionalGroup.control_number",
    "FunctionalGroup.date",
    "FunctionalGroup.receiver",
    "FunctionalGroup.sender",
    "FunctionalGroup.standard_version",
    "FunctionalGroup.time",
    "FunctionalGroup.transaction_type",
    # ---------------- Transaction ----------------
    "Transaction.transaction_type",
)

# Provider roles, each typed as provider_struct.
_provider_roles = (
    "attending",
    "billing",
    "facility",
    "operating",
    "other",
    "referring",
    "service_facility",
    "servicing",
)

# ---------------- Custom Schema ----------------
# Schema applied with from_json to each parsed JSON string.
custom_schema = StructType(
    [_str_field(field_name) for field_name in _top_level_string_fields]
    + [
        # ---------------- Claim Header ----------------
        StructField("claim_header", StructType([
            _str_field("admission_src_cd"),
            _str_field("admission_type"),
            _str_field("claim_amount"),
            StructField("claim_dates", ArrayType(
                _str_struct("date", "date_cd", "date_format")
            )),
            _str_field("claim_id"),
            _str_field("discharge_status_cd"),
            _str_field("drg_cd"),
            _str_field("encounter_id"),
            _str_field("facility_type_code"),
        ])),

        # ---------------- Claim Lines ----------------
        StructField("claim_lines", ArrayType(
            _str_struct(
                "claim_line_number",
                "date_format",
                "dg_cd_pntr",
                "line_chrg_amt",
                "modifier_cds",
                "place_of_service",
                "prcdr_cd",
                "prcdr_cd_type",
                "revenue_cd",
                "service_date",
                "service_date_format",
                "service_time",
                "units",
                "units_measurement",
            )
        )),

        # ---------------- Diagnosis ----------------
        StructField("diagnosis", _str_struct(
            "admitting_dx_cd",
            "external_injury_dx_cd",
            "other_dx_cds",
            "principal_dx_cd",
            "reason_visit_dx_cd",
        )),

        # ---------------- Patient ----------------
        StructField("patient", _str_struct(*_person_field_names)),

        # ---------------- Payer ----------------
        StructField("payer", _str_struct(
            "business_entity_type",
            "payer_identifier",
            "payer_identifier_cd",
            "payer_name",
        )),

        # ---------------- Providers ----------------
        StructField("providers", StructType(
            [StructField(role, provider_struct) for role in _provider_roles]
        )),

        # ---------------- Receiver ----------------
        StructField("receiver", _str_struct("name", "type")),

        # ---------------- Submitter ----------------
        StructField("submitter", StructType([
            _str_field("name"),
            _str_field("sbmtter_contact_name"),
            StructField("sbmtter_contacts", submitter_contacts_type),
            _str_field("type"),
        ])),

        # ---------------- Subscriber ----------------
        StructField("subscriber", _str_struct(*_person_field_names)),
    ]
)


In [0]:
# Turn each successfully parsed JSON string into typed columns and tag the
# rows with audit fields plus a placeholder "837UID".
# NOTE(review): this guard relies on DataFrame truthiness, which does not test
# for emptiness -- confirm the intent.
if v_parsing_success_df:
    _parsed_claim_col = from_json(col("json_str"), custom_schema).alias("data")

    # Flatten the parsed struct so each schema field becomes its own column.
    v_parsed_structured_df = v_parsing_success_df.select(_parsed_claim_col).select("data.*")

    # add_audit_fields comes from the common notebook loaded via %run.
    v_parsed_structured_with_audit_df = (
        add_audit_fields(v_parsed_structured_df, lit(v_clientid), lit("tbd"))
        .withColumn("837UID", lit('tbd'))
    )

In [0]:
# Make sure the client schema exists. IF NOT EXISTS makes the statement safe
# to re-run, so no prior DESCRIBE SCHEMA probe is needed.
_client_schema_fqn = ".".join([v_catalog, v_client_schema])
create_stmt = "CREATE SCHEMA IF NOT EXISTS {} MANAGED LOCATION '{}'".format(
    _client_schema_fqn,
    v_location_client,
)
print("Creating schema: " + create_stmt)
spark.sql(create_stmt)


In [0]:
# Claim header: everything from the parsed frame except the nested lines.
v_claim_header_df = v_parsed_structured_with_audit_df.drop("claim_lines")

# Columns carried onto every claim line: the claim id, the nested lines to
# explode, the source file, and the audit / UID columns.
_claim_line_source_columns = [
    "claim_header.claim_id",
    "claim_lines",
    "filename",
    "client_id",
    "facility_id",
    "created_by_user",
    "process_timestamp",
    "modified_by_user",
    "datetime_last_modified",
    "837UID",
]
temp_claim_line_df = v_parsed_structured_with_audit_df.select(*_claim_line_source_columns)

# Expose the projection to SQL under the name used by the query below.
temp_claim_line_df.createOrReplaceTempView("claim_line")

# One output row per element of claim_lines, with the line's fields
# flattened into top-level columns.
_claim_line_explode_sql = """
SELECT
  claim_id,
  line.claim_line_number AS claim_line_number,
  line.date_format AS date_format,
  line.dg_cd_pntr AS dg_cd_pntr,
  line.line_chrg_amt AS line_chrg_amt,
  line.modifier_cds AS modifier_cds,
  line.place_of_service AS place_of_service,
  line.prcdr_cd AS prcdr_cd,
  line.prcdr_cd_type AS prcdr_cd_type,
  line.revenue_cd AS revenue_cd,
  line.service_date AS service_date,
  line.service_date_format AS service_date_format,
  line.service_time AS service_time,
  line.units AS units,
  line.units_measurement AS units_measurement,
  filename,
  client_id,
  facility_id,
  created_by_user,
  process_timestamp,
  modified_by_user,
  datetime_last_modified,
  837UID
FROM
  claim_line LATERAL VIEW explode(claim_lines) AS line
"""

v_claim_line_df = sql(_claim_line_explode_sql)


In [0]:
# Fully qualified prefixes for the client-specific and consolidated schemas.
_client_namespace = f"{v_catalog}.{v_client_schema}"
_consolidated_namespace = f"{v_consolidated_catalog}.{v_consolidated_schema_name}"

# Target tables: each frame is written once per schema.
v_claim_header_target_table_name = _client_namespace + ".claim_header"
v_consolidated_claim_header_target_table_name = _consolidated_namespace + ".claim_header"

v_claim_lines_target_table_name = _client_namespace + ".claim_lines"
v_consolidated_claim_lines_target_table_name = _consolidated_namespace + ".claim_lines"

# Headers first, then lines; within each, the client table before the
# consolidated one. The results are kept but not inspected here.
v_success_claim_header = append_or_create_table(
    v_claim_header_df,
    v_claim_header_target_table_name,
)
v_success_consolidated_claim_header = append_or_create_table(
    v_claim_header_df,
    v_consolidated_claim_header_target_table_name,
)

v_success_claim_lines = append_or_create_table(
    v_claim_line_df,
    v_claim_lines_target_table_name,
)
v_success_consolidated_claim_lines = append_or_create_table(
    v_claim_line_df,
    v_consolidated_claim_lines_target_table_name,
)

In [0]:
def _with_destination(files_df, destination_dir):
    """Return one (filename, destination) row per distinct file in ``files_df``.

    ``destination`` is ``destination_dir`` followed by the last "/"-separated
    segment of ``filename``.

    The parse-result frames hold one row per exploded element, so a file's
    name can repeat many times. Without de-duplication the move step would
    attempt the same move repeatedly and log an error for every attempt after
    the first.
    """
    file_basename = element_at(split(col("filename"), "/"), -1)
    return (
        files_df.select("filename")
        .distinct()
        .withColumn("destination", concat(lit(destination_dir), file_basename))
    )


# Successfully parsed files go to the archive folder.
# NOTE(review): the results of the earlier table writes are not checked before
# files are scheduled for archiving -- confirm append_or_create_table raises
# on failure.
v_parsed_success_with_dest_df = _with_destination(v_parsing_success_df, v_archive_location_client)

# Files rejected by the parser and files that could not be read both go to
# the quarantine folder.
v_parsing_failures_with_dest_df = _with_destination(v_parsing_failures_df, v_quarantined_location_client)

v_quarantine_files_with_dest_df = _with_destination(v_quarantine_files_df, v_quarantined_location_client)

# Complete move plan: one row per file.
v_movement_tbl_df = (
    v_parsed_success_with_dest_df
    .union(v_parsing_failures_with_dest_df)
    .union(v_quarantine_files_with_dest_df)
)


In [0]:
# Carry out the move plan on the driver. This is best effort: a failed move
# is logged and the loop carries on with the next file.
_move_plan = v_movement_tbl_df.collect()

for _move in _move_plan:
    source, target = _move['filename'], _move['destination']
    try:
        dbutils.fs.mv(source, target, True)
    except Exception as error:
        print(f"[ERROR] Failed to move {source} → {target} | Reason: {error!s}")
    else:
        print(f"[MOVED] {source} → {target}")
