In [7]:
import pandas as pd
import numpy as np
import json
from psycopg2.extras import execute_values
import psycopg2

# PostgreSQL Database Connection
DB_PARAMS = {
    "dbname": "mydatabase",
    "user": "myuser",
    "password": "mypassword",
    "host": "localhost",
    "port": "5433"  # Ensure this matches your running PostgreSQL container
}

conn = psycopg2.connect(**DB_PARAMS)
cur = conn.cursor()

cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
conn.commit()

# meta table creation

In [3]:
def create_metadata_table():
    """Create the metadata table for SQLRAG if it doesn't exist"""
    try:
        # Try to check for vector extension
        cur.execute("SELECT 1 FROM pg_extension WHERE extname = 'vector';")
        vector_installed = cur.fetchone() is not None
        
        # If vector extension isn't available, use JSONB for embeddings instead
        if not vector_installed:
            print("⚠️ pgvector extension not found. Will use JSONB for embeddings instead.")
            create_table_sql = """
            CREATE TABLE IF NOT EXISTS mimic_table_metadata (
                id SERIAL PRIMARY KEY,
                table_name VARCHAR(100) NOT NULL UNIQUE,
                description TEXT,
                table_purpose TEXT,
                columns_info JSONB,
                primary_keys TEXT[],
                foreign_keys JSONB,
                important_considerations TEXT,
                common_joins TEXT[],
                example_questions TEXT[],
                synonyms_and_terms TEXT[],
                embedding JSONB
            );
            """
        else:
            print("✅ pgvector extension is available.")
            create_table_sql = """
            CREATE TABLE IF NOT EXISTS mimic_table_metadata (
                id SERIAL PRIMARY KEY,
                table_name VARCHAR(100) NOT NULL UNIQUE,
                description TEXT,
                table_purpose TEXT,
                columns_info JSONB,
                primary_keys TEXT[],
                foreign_keys JSONB,
                important_considerations TEXT,
                common_joins TEXT[],
                example_questions TEXT[],
                synonyms_and_terms TEXT[],
                embedding VECTOR(1536)
            );
            """
    except Exception as e:
        # If there's an error checking for pgvector, default to JSONB
        print(f"⚠️ Could not check for pgvector extension: {e}")
        print("⚠️ Will use JSONB for embeddings instead.")
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS mimic_table_metadata (
            id SERIAL PRIMARY KEY,
            table_name VARCHAR(100) NOT NULL UNIQUE,
            description TEXT,
            table_purpose TEXT,
            columns_info JSONB,
            primary_keys TEXT[],
            foreign_keys JSONB,
            important_considerations TEXT,
            common_joins TEXT[],
            example_questions TEXT[],
            synonyms_and_terms TEXT[],
            embedding JSONB
        );
        """
    
    # Create the table
    cur.execute(create_table_sql)
    conn.commit()
    print("✅ Table 'mimic_table_metadata' is ready.")

# Run the function
create_metadata_table()

✅ pgvector extension is available.
✅ Table 'mimic_table_metadata' is ready.


# admission

In [8]:
def get_column_statistics(table_name):
    """Get statistics for each column in the table"""
    # First get all columns
    cur.execute(f"""
        SELECT column_name, data_type, character_maximum_length
        FROM information_schema.columns
        WHERE table_name = '{table_name}';
    """)
    columns = cur.fetchall()
    
    # Initialize the result dictionary
    columns_info = {}
    
    for col in columns:
        column_name = col[0]
        data_type = col[1]
        max_length = col[2]
        
        # Create basic column info structure
        column_info = {
            "data_type": data_type,
            "description": get_column_description(table_name, column_name),
            "constraints": get_column_constraints(table_name, column_name)
        }
        
        # Add additional statistics based on data type
        if data_type in ('integer', 'numeric', 'double precision', 'smallint'):
            # For numeric columns
            try:
                cur.execute(f"""
                    SELECT 
                        MIN({column_name}) as min_value,
                        MAX({column_name}) as max_value,
                        AVG({column_name}) as avg_value,
                        PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY {column_name}) as q1,
                        PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY {column_name}) as median,
                        PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY {column_name}) as q3
                    FROM {table_name}
                    WHERE {column_name} IS NOT NULL;
                """)
                stats = cur.fetchone()
                if stats[0] is not None:  # Check if we got valid stats
                    column_info["value_range"] = {"min": stats[0], "max": stats[1]}
                    column_info["statistical_summary"] = {
                        "mean": float(stats[2]) if stats[2] else None,
                        "median": float(stats[4]) if stats[4] else None,
                        "q1": float(stats[3]) if stats[3] else None, 
                        "q3": float(stats[5]) if stats[5] else None
                    }
            except Exception as e:
                print(f"Error getting numeric stats for {column_name}: {e}")
        
        elif data_type in ('character varying', 'varchar', 'text', 'char'):
            # For categorical columns, get value distributions
            # Skip long text fields
            if max_length is None or max_length > 100:
                # This might be a long text field, so skip distribution analysis
                pass
            else:
                try:
                    cur.execute(f"""
                        SELECT 
                            {column_name},
                            COUNT(*) as count,
                            ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 2) as percentage
                        FROM {table_name}
                        WHERE {column_name} IS NOT NULL
                        GROUP BY {column_name}
                        ORDER BY count DESC
                        LIMIT 15;
                    """)
                    value_counts = cur.fetchall()
                    
                    if value_counts:
                        # Get unique values
                        categorical_values = [str(row[0]) for row in value_counts]
                        column_info["categorical_values"] = categorical_values
                        
                        # Get value distribution
                        distribution = {str(row[0]): float(row[2]) for row in value_counts}
                        column_info["value_distribution"] = distribution
                except Exception as e:
                    print(f"Error getting categorical stats for {column_name}: {e}")
        
        elif data_type in ('timestamp', 'date', 'time'):
            # For datetime columns
            try:
                cur.execute(f"""
                    SELECT 
                        MIN({column_name}) as min_date,
                        MAX({column_name}) as max_date
                    FROM {table_name}
                    WHERE {column_name} IS NOT NULL;
                """)
                date_stats = cur.fetchone()
                if date_stats[0]:
                    column_info["date_range"] = {
                        "min": date_stats[0].isoformat() if date_stats[0] else None,
                        "max": date_stats[1].isoformat() if date_stats[1] else None
                    }
            except Exception as e:
                print(f"Error getting date stats for {column_name}: {e}")
        
        # Add the column info to our result
        columns_info[column_name] = column_info
    
    return columns_info

def get_column_description(table_name, column_name):
    """Get description for a specific column based on MIMIC-IV documentation"""
    # This is a placeholder function - in a real implementation you would
    # either have these descriptions in a file or database table
    
    # Descriptions for admissions table columns
    descriptions = {
        "subject_id": "Unique identifier for the patient",
        "hadm_id": "Unique identifier for the hospital admission",
        "admittime": "Date and time the patient was admitted to the hospital",
        "dischtime": "Date and time the patient was discharged from the hospital",
        "deathtime": "Time of in-hospital death, if applicable",
        "admission_type": "Classification of the urgency of admission (e.g., ELECTIVE, URGENT)",
        "admit_provider_id": "Anonymous identifier for the provider who admitted the patient",
        "admission_location": "Location of the patient prior to arriving at the hospital",
        "discharge_location": "Disposition of the patient after discharge from hospital",
        "insurance": "Insurance information for the given hospitalization",
        "language": "Patient's primary language",
        "marital_status": "Patient's marital status",
        "race": "Patient's race",
        "edregtime": "Date and time at which the patient was registered in the emergency department",
        "edouttime": "Date and time at which the patient was discharged from the emergency department",
        "hospital_expire_flag": "Binary flag indicating whether the patient died within the given hospitalization"
    }
    
    return descriptions.get(column_name, "")

def get_column_constraints(table_name, column_name):
    """Get constraints for a specific column"""
    constraints = []
    
    # Check for NOT NULL constraint
    cur.execute(f"""
        SELECT is_nullable
        FROM information_schema.columns
        WHERE table_name = '{table_name}' AND column_name = '{column_name}';
    """)
    is_nullable = cur.fetchone()[0]
    if is_nullable == 'NO':
        constraints.append("NOT NULL")
    
    # Check if column is part of primary key
    cur.execute(f"""
        SELECT a.attname
        FROM pg_index i
        JOIN pg_attribute a ON a.attrelid = i.indrelid AND a.attnum = ANY(i.indkey)
        WHERE i.indrelid = '{table_name}'::regclass AND i.indisprimary AND a.attname = '{column_name}';
    """)
    if cur.fetchone():
        constraints.append("PRIMARY KEY")
    
    # Check if column is part of a foreign key - safer approach
    cur.execute(f"""
        SELECT
            conname AS constraint_name,
            pg_get_constraintdef(oid) AS constraint_def
        FROM
            pg_constraint
        WHERE
            conrelid = '{table_name}'::regclass AND contype = 'f';
    """)
    
    for constraint in cur.fetchall():
        constraint_def = constraint[1]
        # Look for the column name in the constraint definition
        # Example: FOREIGN KEY (subject_id) REFERENCES patients(subject_id)
        if f"({column_name})" in constraint_def or f", {column_name}," in constraint_def:
            constraints.append("FOREIGN KEY")
            break
    
    return " ".join(constraints)

def get_primary_keys(table_name):
    """Get primary key columns for the table"""
    cur.execute(f"""
        SELECT a.attname
        FROM pg_index i
        JOIN pg_attribute a ON a.attrelid = i.indrelid AND a.attnum = ANY(i.indkey)
        WHERE i.indrelid = '{table_name}'::regclass AND i.indisprimary;
    """)
    return [row[0] for row in cur.fetchall()]

def get_foreign_keys(table_name):
    """Get foreign key relationships for the table"""
    cur.execute(f"""
        SELECT
            conname AS constraint_name,
            pg_catalog.pg_get_constraintdef(r.oid, true) AS constraint_definition
        FROM
            pg_catalog.pg_constraint r
        WHERE
            r.conrelid = '{table_name}'::regclass AND r.contype = 'f';
    """)
    
    foreign_keys = {}
    for row in cur.fetchall():
        # Parse constraint definition to extract referenced table and column
        constraint_def = row[1]
        # Example format: FOREIGN KEY (subject_id) REFERENCES patients(subject_id)
        fk_parts = constraint_def.split("REFERENCES ")
        if len(fk_parts) > 1:
            # Extract local column
            local_col = fk_parts[0].split("(")[1].split(")")[0].strip()
            
            # Extract referenced table and column
            ref_table_col = fk_parts[1].strip()
            ref_table = ref_table_col.split("(")[0].strip()
            ref_col = ref_table_col.split("(")[1].split(")")[0].strip()
            
            foreign_keys[local_col] = {"table": ref_table, "column": ref_col}
    
    return foreign_keys

def create_admission_metadata():
    """Create metadata for the admissions table"""
    table_name = "admissions"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Table description and purpose
    description = "Detailed information about hospital stays."
    table_purpose = ("The admissions table gives information regarding a patient's admission to the hospital. "
                    "Since each unique hospital visit for a patient is assigned a unique hadm_id, "
                    "the admissions table can be considered as a definition table for hadm_id.")
    
    # Important considerations about the data
    important_considerations = ("The data is sourced from the admission, discharge and transfer database from the hospital "
                              "(often referred to as 'ADT' data). Organ donor accounts are sometimes created for patients "
                              "who died in the hospital. These are distinct hospital admissions with very short, sometimes "
                              "negative lengths of stay.")
    
    # Common table joins
    common_joins = [
        "JOIN patients ON admissions.subject_id = patients.subject_id",
        "LEFT JOIN icustays ON admissions.hadm_id = icustays.hadm_id"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "How many patients were admitted as urgent cases?",
        "What is the average length of stay for elective admissions?",
        "How many patients died during their hospital stay?",
        "What are the most common admission locations?",
        "What percentage of patients have Medicare insurance?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "hospital stay", 
        "patient admission", 
        "inpatient", 
        "hospitalization", 
        "ADT data", 
        "admission discharge transfer", 
        "hospital visit"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_admission_metadata()

✅ Metadata for 'admissions' table has been created.


In [10]:
# Test query to verify metadata was created
cur.execute("SELECT table_name, description FROM mimic_table_metadata;")
metadata_tables = cur.fetchall()
print("\n📊 Metadata Tables:")
for table in metadata_tables:
    print(f"- {table[0]}: {table[1]}")

# Sample query to test retrieval of column information
cur.execute("""
    SELECT columns_info->'admission_type' 
    FROM mimic_table_metadata 
    WHERE table_name='admissions';
""")
admission_type_info = cur.fetchone()[0]
print("\n🔍 Sample Column Info (admission_type):")
print(json.dumps(admission_type_info, indent=2))  # Remove json.loads()


📊 Metadata Tables:
- admissions: Detailed information about hospital stays.

🔍 Sample Column Info (admission_type):
{
  "data_type": "character varying",
  "constraints": "NOT NULL",
  "description": "Classification of the urgency of admission (e.g., ELECTIVE, URGENT)",
  "categorical_values": [
    "EW EMER.",
    "EU OBSERVATION",
    "OBSERVATION ADMIT",
    "URGENT",
    "SURGICAL SAME DAY ADMISSION",
    "DIRECT EMER.",
    "DIRECT OBSERVATION",
    "ELECTIVE",
    "AMBULATORY OBSERVATION"
  ],
  "value_distribution": {
    "URGENT": 10.36,
    "ELECTIVE": 2.45,
    "EW EMER.": 34.65,
    "DIRECT EMER.": 4.53,
    "EU OBSERVATION": 21.98,
    "OBSERVATION ADMIT": 12.21,
    "DIRECT OBSERVATION": 4.34,
    "AMBULATORY OBSERVATION": 1.54,
    "SURGICAL SAME DAY ADMISSION": 7.94
  }
}


In [11]:
# Get all column info for the admissions table
cur.execute("""
    SELECT columns_info
    FROM mimic_table_metadata 
    WHERE table_name='admissions';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info:")
print(json.dumps(all_columns_info, indent=2))


📋 All Columns Info:
{
  "race": {
    "data_type": "character varying",
    "constraints": "",
    "description": "Patient's race",
    "categorical_values": [
      "WHITE",
      "BLACK/AFRICAN AMERICAN",
      "OTHER",
      "UNKNOWN",
      "HISPANIC/LATINO - PUERTO RICAN",
      "WHITE - OTHER EUROPEAN",
      "HISPANIC OR LATINO",
      "ASIAN",
      "ASIAN - CHINESE",
      "WHITE - RUSSIAN",
      "BLACK/CAPE VERDEAN",
      "HISPANIC/LATINO - DOMINICAN",
      "BLACK/CARIBBEAN ISLAND",
      "BLACK/AFRICAN",
      "PATIENT DECLINED TO ANSWER"
    ],
    "value_distribution": {
      "ASIAN": 1.43,
      "OTHER": 3.5,
      "WHITE": 63.29,
      "UNKNOWN": 2.47,
      "BLACK/AFRICAN": 0.59,
      "ASIAN - CHINESE": 1.3,
      "WHITE - RUSSIAN": 1.17,
      "BLACK/CAPE VERDEAN": 1.1,
      "HISPANIC OR LATINO": 1.8,
      "BLACK/AFRICAN AMERICAN": 13.9,
      "BLACK/CARIBBEAN ISLAND": 0.63,
      "WHITE - OTHER EUROPEAN": 1.84,
      "PATIENT DECLINED TO ANSWER": 0.41,
      "

# omr

In [12]:
def create_omr_metadata():
    """Create metadata for the omr table"""
    table_name = "omr"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Table description and purpose
    description = "The Online Medical Record (OMR) table contains miscellaneous information from the EHR."
    table_purpose = ("The Online Medical Record (OMR) table stores miscellaneous information documented in "
                    "the electronic health record. It is a useful source of outpatient measurements such as "
                    "blood pressure, weight, height, and body mass index.")
    
    # Important considerations about the data
    important_considerations = ("Each row provides detail regarding a single observation in the EHR. "
                               "The seq_num field helps distinguish multiple measurements of the same type "
                               "recorded on the same day.")
    
    # Common table joins
    common_joins = [
        "JOIN patients ON omr.subject_id = patients.subject_id"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What's the average BMI for patients in the dataset?",
        "What's the distribution of blood pressure readings?",
        "What's the weight trend for a specific patient over time?",
        "How many patients have height measurements recorded?",
        "What's the most frequently recorded vital sign?",
        "What's the average weight of patients?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "outpatient measurements", 
        "vital signs", 
        "patient metrics",
        "BMI", 
        "blood pressure", 
        "height", 
        "weight", 
        "patient observations",
        "clinical observations",
        "outpatient vitals"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# You'll want to update the get_column_description function to include OMR column descriptions
def get_column_description(table_name, column_name):
    """Get description for a specific column based on MIMIC-IV documentation"""
    descriptions = {
        # Existing admissions table descriptions...
        
        # OMR table descriptions
        "omr.subject_id": "Unique identifier which specifies an individual patient",
        "omr.chartdate": "The date on which the observation was recorded",
        "omr.seq_num": "An monotonically increasing integer which uniquely distinguishes results of the same type recorded on the same day",
        "omr.result_name": "Human interpretable description of the observation (e.g., 'Blood Pressure', 'BMI', 'Weight')",
        "omr.result_value": "The value associated with the given OMR observation (e.g., '120/80', '25.5', '150')"
    }
    
    # Try with table prefix
    full_key = f"{table_name}.{column_name}"
    if full_key in descriptions:
        return descriptions[full_key]
    
    # Try without table prefix
    return descriptions.get(column_name, "")

# Run the function to create OMR metadata
create_omr_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info->'result_name' 
    FROM mimic_table_metadata 
    WHERE table_name='omr';
""")
result_name_info = cur.fetchone()[0]
print("\n🔍 Sample Column Info (result_name):")
print(json.dumps(result_name_info, indent=2))

✅ Metadata for 'omr' table has been created.

🔍 Sample Column Info (result_name):
{
  "data_type": "character varying",
  "constraints": "NOT NULL PRIMARY KEY",
  "description": "Human interpretable description of the observation (e.g., 'Blood Pressure', 'BMI', 'Weight')",
  "categorical_values": [
    "Blood Pressure",
    "Weight (Lbs)",
    "BMI (kg/m2)",
    "Height (Inches)",
    "Blood Pressure Sitting",
    "Blood Pressure Lying",
    "Blood Pressure Standing (1 min)",
    "Blood Pressure Standing (3 mins)",
    "BMI",
    "Blood Pressure Standing",
    "Weight",
    "eGFR",
    "Height"
  ],
  "value_distribution": {
    "BMI": 0.01,
    "eGFR": 0.0,
    "Height": 0.0,
    "Weight": 0.01,
    "BMI (kg/m2)": 25.81,
    "Weight (Lbs)": 29.34,
    "Blood Pressure": 33.69,
    "Height (Inches)": 10.98,
    "Blood Pressure Lying": 0.04,
    "Blood Pressure Sitting": 0.05,
    "Blood Pressure Standing": 0.01,
    "Blood Pressure Standing (1 min)": 0.04,
    "Blood Pressure Standing (

# d_hcpcs

In [14]:
def create_d_hcpcs_metadata():
    """Create metadata for the d_hcpcs table"""
    table_name = "d_hcpcs"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions since they're missing
    columns_info["code"]["description"] = "A five character code which uniquely represents the event"
    columns_info["category"]["description"] = "Broad classification of the code"
    columns_info["long_description"]["description"] = "Detailed textual description of the code"
    columns_info["short_description"]["description"] = "Brief textual description of the code"
    
    # Table description and purpose
    description = "Dimension table for hcpcsevents; provides a description of CPT codes."
    table_purpose = ("The d_hcpcs table is used to acquire human readable definitions for the codes used in the "
                    "hcpcsevents table. The concepts primarily correspond to hospital billing, and are mostly CPT codes.")
    
    # Important considerations about the data
    important_considerations = ("This is a reference table that helps interpret the codes in the hcpcsevents table. "
                               "Some code definitions may be missing due to licensing restrictions.")
    
    # Common table joins
    common_joins = [
        "JOIN hcpcsevents ON d_hcpcs.code = hcpcsevents.code"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What is the description for CPT code 12345?",
        "What are all the CPT codes in category 1?",
        "How many different CPT codes are in the database?",
        "Which CPT code is used for a specific procedure?",
        "What are the categories of CPT codes available?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "CPT codes", 
        "HCPCS codes",
        "procedure codes",
        "billing codes",
        "hospital billing",
        "medical coding",
        "healthcare common procedure coding system"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_d_hcpcs_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info->'code' 
    FROM mimic_table_metadata 
    WHERE table_name='d_hcpcs';
""")
code_info = cur.fetchone()[0]
print("\n🔍 Sample Column Info (code):")
print(json.dumps(code_info, indent=2))

✅ Metadata for 'd_hcpcs' table has been created.

🔍 Sample Column Info (code):
{
  "data_type": "character",
  "constraints": "NOT NULL PRIMARY KEY",
  "description": "A five character code which uniquely represents the event"
}


In [15]:
# 获取 d_hcpcs 表的所有列信息
cur.execute("""
    SELECT columns_info
    FROM mimic_table_metadata 
    WHERE table_name='d_hcpcs';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for d_hcpcs:")
print(json.dumps(all_columns_info, indent=2))


📋 All Columns Info for d_hcpcs:
{
  "code": {
    "data_type": "character",
    "constraints": "NOT NULL PRIMARY KEY",
    "description": "A five character code which uniquely represents the event"
  },
  "category": {
    "data_type": "smallint",
    "constraints": "",
    "description": "Broad classification of the code",
    "value_range": {
      "max": 3,
      "min": 1
    },
    "statistical_summary": {
      "q1": 1.0,
      "q3": 1.0,
      "mean": 1.0293481956384478,
      "median": 1.0
    }
  },
  "long_description": {
    "data_type": "text",
    "constraints": "",
    "description": "Detailed textual description of the code"
  },
  "short_description": {
    "data_type": "character varying",
    "constraints": "",
    "description": "Brief textual description of the code"
  }
}


# d_icd_diagnoses

In [16]:
def create_d_icd_diagnoses_metadata():
    """Create metadata for the d_icd_diagnoses table"""
    table_name = "d_icd_diagnoses"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'icd_code' in columns_info:
        columns_info['icd_code']['description'] = "International Coding Definitions (ICD) code for diagnosis"
    
    if 'icd_version' in columns_info:
        columns_info['icd_version']['description'] = "Version of the ICD coding system (9 or 10)"
    
    if 'long_title' in columns_info:
        columns_info['long_title']['description'] = "The meaning of the ICD code (e.g., 'Cholera due to vibrio cholerae')"
    
    # Table description and purpose
    description = "Dimension table for diagnoses_icd; provides a description of ICD-9/ICD-10 billed diagnoses."
    table_purpose = ("This table defines International Classification of Diseases (ICD) Version 9 and 10 codes for diagnoses. "
                     "These codes are assigned at the end of the patient's stay and are used by the hospital to bill for care provided.")
    
    # Important considerations about the data
    important_considerations = ("ICD-9 and ICD-10 codes have distinct formats: ICD-9 codes are 5 character long strings which are "
                                "entirely numeric (with the exception of codes prefixed with 'E' or 'V'). ICD-10 codes are 3-7 "
                                "characters long and always prefixed by a letter followed by numeric values. Both versions often "
                                "include a decimal in presentation, but the decimal is not stored in the database.")
    
    # Common table joins
    common_joins = [
        "JOIN diagnoses_icd ON d_icd_diagnoses.icd_code = diagnoses_icd.icd_code AND d_icd_diagnoses.icd_version = diagnoses_icd.icd_version"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What is the description for ICD code '0010'?",
        "How many different ICD-10 diagnoses are in the database?",
        "What are all the diabetes-related diagnoses?",
        "What is the meaning of ICD-9 code 'V30.00'?",
        "Which ICD codes are related to heart failure?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "ICD codes", 
        "diagnosis codes",
        "diagnostic codes",
        "medical coding",
        "ICD-9",
        "ICD-10",
        "billing codes",
        "International Classification of Diseases"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_d_icd_diagnoses_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='d_icd_diagnoses';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for d_icd_diagnoses:")
print(json.dumps(all_columns_info, indent=2))

✅ Metadata for 'd_icd_diagnoses' table has been created.

📋 All Columns Info for d_icd_diagnoses:
{
  "icd_code": {
    "data_type": "character",
    "constraints": "NOT NULL PRIMARY KEY",
    "description": "International Coding Definitions (ICD) code for diagnosis"
  },
  "long_title": {
    "data_type": "character varying",
    "constraints": "",
    "description": "The meaning of the ICD code (e.g., 'Cholera due to vibrio cholerae')"
  },
  "icd_version": {
    "data_type": "integer",
    "constraints": "NOT NULL PRIMARY KEY",
    "description": "Version of the ICD coding system (9 or 10)",
    "value_range": {
      "max": 10,
      "min": 9
    },
    "statistical_summary": {
      "q1": 10.0,
      "q3": 10.0,
      "mean": 9.866399453427466,
      "median": 10.0
    }
  }
}


# d_labitems

In [17]:
def create_d_labitems_metadata():
    """Create metadata for the d_labitems table"""
    table_name = "d_labitems"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'itemid' in columns_info:
        columns_info['itemid']['description'] = "A unique identifier for a laboratory concept"
    
    if 'label' in columns_info:
        columns_info['label']['description'] = "Describes the concept which is represented by the itemid"
    
    if 'fluid' in columns_info:
        columns_info['fluid']['description'] = "Describes the substance on which the measurement was made (e.g., 'BLOOD')"
    
    if 'category' in columns_info:
        columns_info['category']['description'] = "Provides higher level information as to the type of measurement (e.g., 'ABG')"
    
    # Table description and purpose
    description = "Dimension table for labevents provides a description of all lab items."
    table_purpose = ("d_labitems contains definitions for all itemid associated with lab measurements in the MIMIC database. "
                     "All data in labevents link to the d_labitems table. Each unique (fluid, category, label) tuple in the "
                     "hospital database was assigned an itemid in this table, and the use of this itemid facilitates efficient "
                     "storage and querying of the data.")
    
    # Important considerations about the data
    important_considerations = ("This table used to contain a column called loinc_code, which stored standardized identifiers "
                                "for laboratory measurements. To support ongoing improvement of these labels, the assignment "
                                "of LOINC codes is now done in the MIMIC Code Repository.")
    
    # Common table joins
    common_joins = [
        "JOIN labevents ON d_labitems.itemid = labevents.itemid"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What are all the blood chemistry tests available?",
        "What laboratory measurements are performed on urine samples?",
        "How many different categories of lab tests are there?",
        "Which itemid corresponds to a specific lab test like glucose?",
        "What are all the tests in the ABG category?",
        "What fluid types are available for creatinine measurement?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "lab tests", 
        "laboratory measurements",
        "lab items",
        "blood tests",
        "clinical tests",
        "diagnostic tests",
        "lab codes",
        "lab dictionary"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_d_labitems_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='d_labitems';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for d_labitems:")
print(json.dumps(all_columns_info, indent=2))

✅ Metadata for 'd_labitems' table has been created.

📋 All Columns Info for d_labitems:
{
  "fluid": {
    "data_type": "character varying",
    "constraints": "",
    "description": "Describes the substance on which the measurement was made (e.g., 'BLOOD')",
    "categorical_values": [
      "Blood",
      "Urine",
      "Other Body Fluid",
      "Cerebrospinal Fluid",
      "Joint Fluid",
      "Pleural",
      "Ascites",
      "Bone Marrow",
      "Stool",
      "Q",
      "Fluid",
      "I"
    ],
    "value_distribution": {
      "I": 0.06,
      "Q": 0.55,
      "Blood": 49.75,
      "Fluid": 0.31,
      "Stool": 2.65,
      "Urine": 11.96,
      "Ascites": 4.19,
      "Pleural": 4.25,
      "Bone Marrow": 3.88,
      "Joint Fluid": 4.44,
      "Other Body Fluid": 11.84,
      "Cerebrospinal Fluid": 6.1
    }
  },
  "label": {
    "data_type": "character varying",
    "constraints": "",
    "description": "Describes the concept which is represented by the itemid",
    "categorica

# d_icd_procedures

In [18]:
def create_d_icd_procedures_metadata():
    """Create metadata for the d_icd_procedures table"""
    table_name = "d_icd_procedures"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'icd_code' in columns_info:
        columns_info['icd_code']['description'] = "International Coding Definitions (ICD) code for procedures"
    
    if 'icd_version' in columns_info:
        columns_info['icd_version']['description'] = "Version of the ICD coding system (9 or 10)"
    
    if 'long_title' in columns_info:
        columns_info['long_title']['description'] = "The meaning of the ICD procedure code"
    
    # Table description and purpose
    description = "Dimension table for procedures_icd; provides a description of ICD-9/ICD-10 billed procedures."
    table_purpose = ("This table defines International Classification of Diseases (ICD) codes for procedures. "
                     "These codes are assigned at the end of the patient's stay and are used by the hospital to bill for care provided. "
                     "They can further be used to identify if certain procedures have been performed (e.g. surgery).")
    
    # Important considerations about the data
    important_considerations = ("ICD-9 and ICD-10 codes have distinct formats. Both versions are often presented with a decimal, "
                                "but the decimal is not stored in the database (i.e., the icd_code of '0010' is equivalent to '001.0'). "
                                "In general, ICD-10 codes are more detailed, though code mappings exist which convert ICD-9 codes to ICD-10 codes.")
    
    # Common table joins
    common_joins = [
        "JOIN procedures_icd ON d_icd_procedures.icd_code = procedures_icd.icd_code AND d_icd_procedures.icd_version = procedures_icd.icd_version"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What is the description for procedure code '0010'?",
        "How many different ICD-10 procedures are in the database?",
        "What are all the cardiac surgery procedures?",
        "What is the meaning of ICD-9 procedure code '39.61'?",
        "Which ICD codes are related to ventilation procedures?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "ICD procedure codes", 
        "procedure codes",
        "surgical codes",
        "medical procedure coding",
        "ICD-9 procedures",
        "ICD-10 procedures",
        "billing procedure codes",
        "International Classification of Diseases procedures"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_d_icd_procedures_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='d_icd_procedures';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for d_icd_procedures:")
print(json.dumps(all_columns_info, indent=2))

✅ Metadata for 'd_icd_procedures' table has been created.

📋 All Columns Info for d_icd_procedures:
{
  "icd_code": {
    "data_type": "character",
    "constraints": "NOT NULL PRIMARY KEY",
    "description": "International Coding Definitions (ICD) code for procedures"
  },
  "long_title": {
    "data_type": "character varying",
    "constraints": "",
    "description": "The meaning of the ICD procedure code"
  },
  "icd_version": {
    "data_type": "integer",
    "constraints": "NOT NULL PRIMARY KEY",
    "description": "Version of the ICD coding system (9 or 10)",
    "value_range": {
      "max": 10,
      "min": 9
    },
    "statistical_summary": {
      "q1": 10.0,
      "q3": 10.0,
      "mean": 9.954396706428797,
      "median": 10.0
    }
  }
}


# patients

In [19]:
def create_patients_metadata():
    """Create metadata for the patients table"""
    table_name = "patients"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'subject_id' in columns_info:
        columns_info['subject_id']['description'] = "Unique identifier which specifies an individual patient"
    
    if 'gender' in columns_info:
        columns_info['gender']['description'] = "Genotypical sex of the patient"
    
    if 'anchor_age' in columns_info:
        columns_info['anchor_age']['description'] = "Patient's age in the anchor_year (patients over 89 have anchor_age of 91)"
    
    if 'anchor_year' in columns_info:
        columns_info['anchor_year']['description'] = "Shifted year for the patient (for de-identification)"
    
    if 'anchor_year_group' in columns_info:
        columns_info['anchor_year_group']['description'] = "Range of years during which the patient's anchor_year occurred"
    
    if 'dod' in columns_info:
        columns_info['dod']['description'] = "De-identified date of death for the patient, if applicable"
    
    # Table description and purpose
    description = "Patients' gender, age, and date of death if information exists."
    table_purpose = ("Information that is consistent for the lifetime of a patient is stored in this table. "
                     "This is the primary demographic table for patients in MIMIC-IV.")
    
    # Important considerations about the data
    important_considerations = ("Due to de-identification, out of hospital mortality is only available "
                                "up to one year post-hospital discharge. All patient deaths occurring more than "
                                "one year after hospital discharge are censored. Patients with age over 89 in "
                                "the anchor_year have their anchor_age set to 91, regardless of actual age.")
    
    # Common table joins
    common_joins = [
        "JOIN admissions ON patients.subject_id = admissions.subject_id",
        "JOIN omr ON patients.subject_id = omr.subject_id",
        "JOIN labevents ON patients.subject_id = labevents.subject_id"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What is the gender distribution of patients in the database?",
        "How many patients died during their hospital stay?",
        "What is the average age of patients?",
        "How does mortality rate vary by gender?",
        "What percentage of patients are over 65 years old?",
        "How many patients died within one year after discharge?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "demographics", 
        "patient information",
        "patient demographics",
        "mortality data",
        "patient age",
        "patient gender",
        "patient death",
        "patient characteristics"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_patients_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='patients';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for patients:")
print(json.dumps(all_columns_info, indent=2))

✅ Metadata for 'patients' table has been created.

📋 All Columns Info for patients:
{
  "dod": {
    "data_type": "timestamp without time zone",
    "constraints": "",
    "description": "De-identified date of death for the patient, if applicable"
  },
  "gender": {
    "data_type": "character varying",
    "constraints": "NOT NULL",
    "description": "Genotypical sex of the patient",
    "categorical_values": [
      "F",
      "M"
    ],
    "value_distribution": {
      "F": 52.9,
      "M": 47.1
    }
  },
  "anchor_age": {
    "data_type": "integer",
    "constraints": "NOT NULL",
    "description": "Patient's age in the anchor_year (patients over 89 have anchor_age of 91)",
    "value_range": {
      "max": 91,
      "min": 18
    },
    "statistical_summary": {
      "q1": 29.0,
      "q3": 65.0,
      "mean": 48.53988829276105,
      "median": 48.0
    }
  },
  "subject_id": {
    "data_type": "integer",
    "constraints": "NOT NULL PRIMARY KEY",
    "description": "Unique ide