In [1]:
import os, re, json
from dotenv import load_dotenv

from langchain_openai import AzureChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser

In [2]:
# Load variables from a local .env file into the process environment so the
# os.getenv(...) lookups used to configure the Azure client can find them.
load_dotenv()

True

In [3]:
# Azure OpenAI chat model used by the data-dictionary chain. Endpoint, deployment
# name and API version are read from environment variables; temperature=0 asks for
# the least-random sampling so repeated runs stay as consistent as possible.
# NOTE(review): os.getenv returns None for an unset variable, and no API key is
# passed explicitly here -- presumably the client picks it up from the
# environment itself; confirm before running on a fresh machine.
llm = AzureChatOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
    openai_api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    temperature=0
)

In [4]:
# Prompt text for the data-dictionary chain.
# `{ddl}` is the single template variable. The doubled braces `{{` / `}}` are
# escaped literal braces, so the JSON example reaches the model with ordinary
# `{` / `}` after the PromptTemplate is formatted.
# Note that the `date_of_birth` example entry carries no "column_size" key.
data_dictionary_gen_prompt = """
Given ddl script please respond a list of dictionary with the following format attached below.
Strictly respond in a list of dictionaries format.

DDL:{ddl}

Format:
[
  {{
    "column_name": "mem_member_id_code",
    "data_type": "varchar",
    "column_size": 50,
    "column_description": "Unique identifier code for the member."
  }},
  {{
    "column_name": "mem_first_name_code",
    "data_type": "varchar",
    "column_size": 35,
    "column_description": "First name code of the member."
  }},
  {{
    "column_name": "last_name_val",
    "data_type": "varchar",
    "column_size": 60,
    "column_description": "Last name of the member."
  }},
  {{
    "column_name": "date_of_birth",
    "data_type": "date",
    "column_description": "Date of birth of the member (YYYY-MM-DD)."
  }}
]

"""

In [5]:
# Wrap the prompt text in a PromptTemplate; its only input variable is `ddl`.
template = PromptTemplate.from_template(data_dictionary_gen_prompt)

In [6]:
# Pipeline: fill the prompt -> call the Azure chat model -> parse the reply as JSON.
# Invoking it with {"ddl": ...} therefore yields parsed JSON data, not raw text.
dd_gen_chain = template | llm | JsonOutputParser()

In [7]:
def format_ddl(file_path: str) -> list[dict]:
    """Split a SQL file into individual statements and tag each with its table name.

    The file content is split on ``;``. Every non-blank fragment is stripped of
    surrounding whitespace and gets its terminating ``;`` restored.

    Args:
        file_path: Path to a UTF-8 text file containing DDL statements
            separated by ``;``.

    Returns:
        One dict per statement, in file order, with the keys:

        * ``"table_name"``: the identifier that follows ``CREATE TABLE``
          (an optional ``IF NOT EXISTS`` clause is skipped), or ``None`` when
          the statement contains no ``CREATE TABLE``.
        * ``"ddl"``: the statement text, ending in ``;``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.

    Note:
        The split is purely textual, so a ``;`` inside a string literal or a
        comment also terminates a statement.
    """
    # Explicit encoding keeps the result independent of the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    ddl_scripts = [stmt.strip() + ';' for stmt in content.split(';') if stmt.strip()]

    # Without the optional group, "CREATE TABLE IF NOT EXISTS foo" would report
    # the keyword "IF" as the table name.
    create_table_pattern = r'CREATE\s+TABLE\s+(?:IF\s+NOT\s+EXISTS\s+)?([^\s(]+)'

    result = []
    for ddl in ddl_scripts:
        match = re.search(create_table_pattern, ddl, re.IGNORECASE)
        table_name = match.group(1) if match else None
        result.append({"table_name": table_name, "ddl": ddl})
    return result

def generate_data_dictionary(data: list[dict], output_dir: str = "../data/raw/", chain=None):
    """Generate a JSON data dictionary for each DDL record and write it to disk.

    For every record the DDL is sent through the chain and the parsed result is
    dumped to ``<output_dir>/<suffix>_<table_name>_data_dictionary.json``, where
    ``<suffix>`` is the part of ``table_name`` after its last underscore.

    Args:
        data: Records shaped like the output of ``format_ddl``: dicts with a
            ``"table_name"`` (str or None) and a ``"ddl"`` (str) entry.
        output_dir: Directory the JSON files are written to. Defaults to the
            notebook's original ``../data/raw/`` location.
        chain: Object with an ``invoke({"ddl": ...})`` method returning
            JSON-serialisable data. Defaults to the notebook-level
            ``dd_gen_chain``.

    Returns:
        None. Progress and failures are reported with ``print``.

    Notes:
        * Records without a table name are skipped with a message instead of
          aborting the whole run.
        * A failure in the chain call or the file write for one table is
          reported, including the exception, and the loop moves on.
    """
    # Resolved lazily so the module-level chain is only required when no chain is injected.
    active_chain = dd_gen_chain if chain is None else chain

    for item in data:
        table_name = item.get("table_name")
        if not table_name:
            # format_ddl reports None for statements that are not CREATE TABLE;
            # there is no sensible file name for those.
            print("Skipped statement without a table name.")
            continue

        suffix = table_name.split("_")[-1]
        file_name = os.path.join(output_dir, f"{suffix}_{table_name}_data_dictionary.json")
        ddl = item["ddl"]
        try:
            ai_message = active_chain.invoke({"ddl": ddl})
            with open(file_name, "w", encoding="utf-8") as fw:
                json.dump(ai_message, fw, indent=2)
            print(f"Written to file: {file_name}")
        except Exception as ex:
            # Best-effort per table: surface the cause rather than hiding it.
            print(f"Failed to write file: {file_name} ({type(ex).__name__}: {ex})")


In [8]:
# Generate Eligibility data dictionary

In [9]:
# Path to the eligibility DDL file that format_ddl reads.
elig_path = "../data/raw/source_layouts_eligibility_ddl.sql"

In [10]:
# One {"table_name", "ddl"} record per statement found in the eligibility file.
elig_ddls = format_ddl(elig_path)

In [11]:
# One chain call and one JSON file per record; failures during the chain call
# or file write are printed rather than raised.
generate_data_dictionary(elig_ddls)

Written to file: ../data/raw/uhc_eligibility_uhc_data_dictionary.json
Written to file: ../data/raw/antm_eligibility_antm_data_dictionary.json
Written to file: ../data/raw/aet_eligibility_aet_data_dictionary.json
Written to file: ../data/raw/ci_eligibility_ci_data_dictionary.json
Written to file: ../data/raw/hum_eligibility_hum_data_dictionary.json
Written to file: ../data/raw/kp_eligibility_kp_data_dictionary.json
Written to file: ../data/raw/cnc_eligibility_cnc_data_dictionary.json
Written to file: ../data/raw/moh_eligibility_moh_data_dictionary.json
Written to file: ../data/raw/bcbsa_eligibility_bcbsa_data_dictionary.json
Written to file: ../data/raw/hcsc_eligibility_hcsc_data_dictionary.json
Written to file: ../data/raw/hmi_eligibility_hmi_data_dictionary.json
Written to file: ../data/raw/ghp_eligibility_ghp_data_dictionary.json
Written to file: ../data/raw/upmc_eligibility_upmc_data_dictionary.json
Written to file: ../data/raw/ibc_eligibility_ibc_data_dictionary.json
Written to fil

In [12]:
# Generate Claims data dictionary

In [13]:
# Path to the claims DDL file that format_ddl reads.
claim_path = "../data/raw/source_layouts_claims_ddl.sql"

In [14]:
# One {"table_name", "ddl"} record per statement found in the claims file.
claim_ddls = format_ddl(claim_path)

In [15]:
# One chain call and one JSON file per record; failures during the chain call
# or file write are printed rather than raised.
generate_data_dictionary(claim_ddls)

Written to file: ../data/raw/uhc_claims_uhc_data_dictionary.json
Written to file: ../data/raw/antm_claims_antm_data_dictionary.json
Written to file: ../data/raw/aet_claims_aet_data_dictionary.json
Written to file: ../data/raw/ci_claims_ci_data_dictionary.json
Written to file: ../data/raw/hum_claims_hum_data_dictionary.json
Written to file: ../data/raw/kp_claims_kp_data_dictionary.json
Written to file: ../data/raw/cnc_claims_cnc_data_dictionary.json
Written to file: ../data/raw/moh_claims_moh_data_dictionary.json
Written to file: ../data/raw/bcbsa_claims_bcbsa_data_dictionary.json
Written to file: ../data/raw/hcsc_claims_hcsc_data_dictionary.json
Written to file: ../data/raw/hmi_claims_hmi_data_dictionary.json
Written to file: ../data/raw/ghp_claims_ghp_data_dictionary.json
Written to file: ../data/raw/upmc_claims_upmc_data_dictionary.json
Written to file: ../data/raw/ibc_claims_ibc_data_dictionary.json
Written to file: ../data/raw/hphc_claims_hphc_data_dictionary.json
Written to file: 

In [16]:
# Generate RxClaims data dictionary

In [17]:
# Path to the rxclaims DDL file that format_ddl reads.
rxclaims_path = "../data/raw/source_layouts_rxclaims_ddl.sql"

In [18]:
# One {"table_name", "ddl"} record per statement found in the rxclaims file.
rxclaims_ddls = format_ddl(rxclaims_path)

In [19]:
# One chain call and one JSON file per record; failures during the chain call
# or file write are printed rather than raised.
generate_data_dictionary(rxclaims_ddls)

Written to file: ../data/raw/uhc_rxclaims_uhc_data_dictionary.json
Written to file: ../data/raw/antm_rxclaims_antm_data_dictionary.json
Written to file: ../data/raw/aet_rxclaims_aet_data_dictionary.json
Written to file: ../data/raw/ci_rxclaims_ci_data_dictionary.json
Written to file: ../data/raw/hum_rxclaims_hum_data_dictionary.json
Written to file: ../data/raw/kp_rxclaims_kp_data_dictionary.json
Written to file: ../data/raw/cnc_rxclaims_cnc_data_dictionary.json
Written to file: ../data/raw/moh_rxclaims_moh_data_dictionary.json
Written to file: ../data/raw/bcbsa_rxclaims_bcbsa_data_dictionary.json
Written to file: ../data/raw/hcsc_rxclaims_hcsc_data_dictionary.json
Written to file: ../data/raw/hmi_rxclaims_hmi_data_dictionary.json
Written to file: ../data/raw/ghp_rxclaims_ghp_data_dictionary.json
Written to file: ../data/raw/upmc_rxclaims_upmc_data_dictionary.json
Written to file: ../data/raw/ibc_rxclaims_ibc_data_dictionary.json
Written to file: ../data/raw/hphc_rxclaims_hphc_data_dic

In [20]:
# Generate standard layouts data dictionary

In [23]:
# Path to the standard-layouts DDL file that format_ddl reads.
std_layouts_path = "../data/raw/standard_layouts_ddl.sql"

In [24]:
# One {"table_name", "ddl"} record per statement found in the standard-layouts file.
std_ddls = format_ddl(std_layouts_path)

In [25]:
# One chain call and one JSON file per record; failures during the chain call
# or file write are printed rather than raised.
generate_data_dictionary(std_ddls)

Written to file: ../data/raw/eligibility_standard_eligibility_data_dictionary.json
Written to file: ../data/raw/claims_standard_claims_data_dictionary.json
Written to file: ../data/raw/rxclaims_standard_rxclaims_data_dictionary.json
