In [1]:
import os, re, json

In [2]:
source_path = "../data/raw"
target_path = "../data/processed"

In [3]:
# Fetch payer list from file
with open(os.path.join(source_path, "payer_list.json"), "r") as f:
    payer_list = json.loads(f.read())

In [4]:
payer_list

[{'name': 'United Healthcare', 'abbr': 'UHC'},
 {'name': 'Elevance Health', 'abbr': 'ANTM'},
 {'name': 'Aetna', 'abbr': 'AET'},
 {'name': 'Cigna Healthcare', 'abbr': 'CI'},
 {'name': 'Humana', 'abbr': 'HUM'},
 {'name': 'Kaiser Permanente', 'abbr': 'KP'},
 {'name': 'Centene Corporation', 'abbr': 'CNC'},
 {'name': 'Molina Healthcare', 'abbr': 'MOH'},
 {'name': 'Blue Cross Blue Shield Association', 'abbr': 'BCBSA'},
 {'name': 'Health Care Service Corporation', 'abbr': 'HCSC'},
 {'name': 'Highmark Health', 'abbr': 'HMI'},
 {'name': 'Geisinger Health Plan', 'abbr': 'GHP'},
 {'name': 'UPMC Health Plan', 'abbr': 'UPMC'},
 {'name': 'Independence Blue Cross', 'abbr': 'IBC'},
 {'name': 'Harvard Pilgrim Health Care', 'abbr': 'HPHC'},
 {'name': 'Tufts Health Plan', 'abbr': 'THP'},
 {'name': 'WellCare Health Plans', 'abbr': 'WCG'},
 {'name': 'Tricare', 'abbr': 'TRI'},
 {'name': 'Medicare', 'abbr': 'MCR'},
 {'name': 'Medicaid', 'abbr': 'MCD'}]

In [5]:
# Get list of relevant files for preprocessing
dd_files = []
ddl_files = []

for f in os.listdir(source_path):
    if f.endswith('_ddl.sql'):
        ddl_files.append(f)
    elif f.endswith('_data_dictionary.json'):
        dd_files.append(f)

files = {"dd": dd_files, "ddl": ddl_files}

In [6]:
files

{'dd': ['mcd_eligibility_mcd_data_dictionary.json',
  'hmi_claims_hmi_data_dictionary.json',
  'upmc_rxclaims_upmc_data_dictionary.json',
  'bcbsa_eligibility_bcbsa_data_dictionary.json',
  'hphc_rxclaims_hphc_data_dictionary.json',
  'aet_claims_aet_data_dictionary.json',
  'uhc_rxclaims_uhc_data_dictionary.json',
  'upmc_claims_upmc_data_dictionary.json',
  'cnc_claims_cnc_data_dictionary.json',
  'uhc_eligibility_uhc_data_dictionary.json',
  'uhc_claims_uhc_data_dictionary.json',
  'hum_eligibility_hum_data_dictionary.json',
  'antm_claims_antm_data_dictionary.json',
  'hmi_rxclaims_hmi_data_dictionary.json',
  'antm_rxclaims_antm_data_dictionary.json',
  'wcg_rxclaims_wcg_data_dictionary.json',
  'kp_claims_kp_data_dictionary.json',
  'ci_claims_ci_data_dictionary.json',
  'claims_standard_claims_data_dictionary.json',
  'upmc_eligibility_upmc_data_dictionary.json',
  'moh_rxclaims_moh_data_dictionary.json',
  'hphc_eligibility_hphc_data_dictionary.json',
  'hmi_eligibility_hmi_dat

In [7]:
def write_to_file(data: list[dict], file_name: str):
    """Write data to given file name."""
    with open(file_name, "w") as fw:
        json.dump(data, fw, indent=2)
    print(f"File written successfully to: {file_name}")

def get_payer_name_abbr(table_name: str) -> dict:
    """Returns payer name and abbreviation for given table name."""
    abbr = table_name.split("_")[-1]
    for payer in payer_list:
        if abbr.upper() == payer["abbr"]:
            return payer.values()
    else:
        return {"name": "Default", "abbr": "default"}.values()

def format_ddl(file_name: str) -> list[dict]:
    """Returns preprocessed list of dictionaries of DDLs."""
    with open(os.path.join(source_path, file_name), 'r') as f:
        content = f.read()
    
    ddl_scripts = [stmt.strip() + ';' for stmt in content.split(';') if stmt.strip()]
    result = []
    for ddl in ddl_scripts:
        match = re.search(r'CREATE\s+TABLE\s+([^\s(]+)', ddl, re.IGNORECASE)
        table_name = match.group(1) if match else None
        client_name, client_abbr = get_payer_name_abbr(table_name)
        metadata = {
            "file_name": file_name,
            "file_path": source_path,
            "table_name": table_name,
            "client_name": client_name,
            "client_abbr": client_abbr
        }
        result.append({"content": ddl, "metadata": metadata})
    return result

def format_dd(file_name: str) -> list[dict]:
    """Returns preprocessed list of dictionaries of data dictionary."""
    with open(os.path.join(source_path, file_name), 'r') as f:
        content = json.load(f)
    
    result = []
    for item in content:
        dd = '\n'.join(f"{k}: {v}" for k, v in item.items())
        table_name = "_".join(file_name.split("data_dictionary")[0].split("_")[1:3])
        client_name, client_abbr = get_payer_name_abbr(table_name)
        metadata = {
            "file_name": file_name,
            "file_path": source_path,
            "table_name": table_name,
            "client_name": client_name,
            "client_abbr": client_abbr
        }
        result.append({"content": dd, "metadata": metadata})
    return result


In [8]:
# Prepare DDLs
processed_ddls = []
for ddl_file in files["ddl"]:
    processed_ddls.extend(format_ddl(ddl_file))
    print(f"Processed File: {ddl_file}")


Processed File: source_layouts_eligibility_ddl.sql
Processed File: source_layouts_claims_ddl.sql
Processed File: source_layouts_rxclaims_ddl.sql
Processed File: standard_layouts_ddl.sql


In [9]:
processed_ddls

[{'content': 'CREATE TABLE eligibility_uhc (\n    sub_member_id VARCHAR2(50),\n    first_name_code VARCHAR2(35),\n    mem_last_name VARCHAR2(60),\n    sub_date_of_birth_val DATE,\n    mem_gender_val CHAR(1),\n    address_id VARCHAR2(100),\n    city_code VARCHAR2(30),\n    state_nm CHAR(2),\n    sub_zip_code_id VARCHAR2(9),\n    sub_plan_id_code VARCHAR2(30),\n    mem_group_number_code VARCHAR2(50),\n    coverage_start_date_val DATE,\n    coverage_end_date_id DATE\n);',
  'metadata': {'file_name': 'source_layouts_eligibility_ddl.sql',
   'file_path': '../data/raw',
   'table_name': 'eligibility_uhc',
   'client_name': 'United Healthcare',
   'client_abbr': 'UHC'}},
 {'content': 'CREATE TABLE eligibility_antm (\n    mem_member_id_nm VARCHAR2(50),\n    first_name_code VARCHAR2(35),\n    last_name_nm VARCHAR2(60),\n    date_of_birth DATE,\n    sub_gender_val CHAR(1),\n    address_val VARCHAR2(100),\n    city VARCHAR2(30),\n    state_nm CHAR(2),\n    zip_code_code VARCHAR2(9),\n    plan_id_

In [10]:
# Write processed DDLs to processed directory
write_to_file(processed_ddls, os.path.join(target_path, "ddls.json"))

File written successfully to: ../data/processed/ddls.json


In [11]:
# Prepare Data Dictionaries
processed_dds = []
for dd_file in files["dd"]:
    processed_dds.extend(format_dd(dd_file))
    print(f"Processed File: {dd_file}")


Processed File: mcd_eligibility_mcd_data_dictionary.json
Processed File: hmi_claims_hmi_data_dictionary.json
Processed File: upmc_rxclaims_upmc_data_dictionary.json
Processed File: bcbsa_eligibility_bcbsa_data_dictionary.json
Processed File: hphc_rxclaims_hphc_data_dictionary.json
Processed File: aet_claims_aet_data_dictionary.json
Processed File: uhc_rxclaims_uhc_data_dictionary.json
Processed File: upmc_claims_upmc_data_dictionary.json
Processed File: cnc_claims_cnc_data_dictionary.json
Processed File: uhc_eligibility_uhc_data_dictionary.json
Processed File: uhc_claims_uhc_data_dictionary.json
Processed File: hum_eligibility_hum_data_dictionary.json
Processed File: antm_claims_antm_data_dictionary.json
Processed File: hmi_rxclaims_hmi_data_dictionary.json
Processed File: antm_rxclaims_antm_data_dictionary.json
Processed File: wcg_rxclaims_wcg_data_dictionary.json
Processed File: kp_claims_kp_data_dictionary.json
Processed File: ci_claims_ci_data_dictionary.json
Processed File: claims

In [12]:
processed_dds

[{'content': 'column_name: sub_member_id_code\ndata_type: varchar\ncolumn_size: 50\ncolumn_description: Unique identifier code for the subscriber member.',
  'metadata': {'file_name': 'mcd_eligibility_mcd_data_dictionary.json',
   'file_path': '../data/raw',
   'table_name': 'eligibility_mcd',
   'client_name': 'Medicaid',
   'client_abbr': 'MCD'}},
 {'content': 'column_name: first_name\ndata_type: varchar\ncolumn_size: 35\ncolumn_description: First name of the subscriber member.',
  'metadata': {'file_name': 'mcd_eligibility_mcd_data_dictionary.json',
   'file_path': '../data/raw',
   'table_name': 'eligibility_mcd',
   'client_name': 'Medicaid',
   'client_abbr': 'MCD'}},
 {'content': 'column_name: mem_last_name_code\ndata_type: varchar\ncolumn_size: 60\ncolumn_description: Last name code of the subscriber member.',
  'metadata': {'file_name': 'mcd_eligibility_mcd_data_dictionary.json',
   'file_path': '../data/raw',
   'table_name': 'eligibility_mcd',
   'client_name': 'Medicaid',
 

In [13]:
# Write processed Data Dictionaries to processed directory
write_to_file(processed_dds, os.path.join(target_path, "dds.json"))

File written successfully to: ../data/processed/dds.json
