In [1]:
import pandas as pd
import numpy as np
import json
from psycopg2.extras import execute_values
import psycopg2

# PostgreSQL Database Connection
DB_PARAMS = {
    "dbname": "mydatabase",
    "user": "myuser",
    "password": "mypassword",
    "host": "localhost",
    "port": "5433"  # Ensure this matches your running PostgreSQL container
}

conn = psycopg2.connect(**DB_PARAMS)
cur = conn.cursor()

cur.execute("CREATE EXTENSION IF NOT EXISTS vector;")
conn.commit()

ImportError: dlopen(/Users/brandonma/Library/Python/3.9/lib/python/site-packages/psycopg2/_psycopg.cpython-39-darwin.so, 0x0002): Library not loaded: @rpath/libpq.5.dylib
  Referenced from: <04F18D1D-1202-3FC6-A406-82EC49D6BBF8> /Users/brandonma/Library/Python/3.9/lib/python/site-packages/psycopg2/_psycopg.cpython-39-darwin.so
  Reason: no LC_RPATH's found

# meta table creation

In [3]:
def create_metadata_table():
    """Create the metadata table for SQLRAG if it doesn't exist"""
    try:
        # Try to check for vector extension
        cur.execute("SELECT 1 FROM pg_extension WHERE extname = 'vector';")
        vector_installed = cur.fetchone() is not None
        
        # If vector extension isn't available, use JSONB for embeddings instead
        if not vector_installed:
            print("⚠️ pgvector extension not found. Will use JSONB for embeddings instead.")
            create_table_sql = """
            CREATE TABLE IF NOT EXISTS mimic_table_metadata (
                id SERIAL PRIMARY KEY,
                table_name VARCHAR(100) NOT NULL UNIQUE,
                description TEXT,
                table_purpose TEXT,
                columns_info JSONB,
                primary_keys TEXT[],
                foreign_keys JSONB,
                important_considerations TEXT,
                common_joins TEXT[],
                example_questions TEXT[],
                synonyms_and_terms TEXT[],
                embedding JSONB
            );
            """
        else:
            print("✅ pgvector extension is available.")
            create_table_sql = """
            CREATE TABLE IF NOT EXISTS mimic_table_metadata (
                id SERIAL PRIMARY KEY,
                table_name VARCHAR(100) NOT NULL UNIQUE,
                description TEXT,
                table_purpose TEXT,
                columns_info JSONB,
                primary_keys TEXT[],
                foreign_keys JSONB,
                important_considerations TEXT,
                common_joins TEXT[],
                example_questions TEXT[],
                synonyms_and_terms TEXT[],
                embedding VECTOR(1536)
            );
            """
    except Exception as e:
        # If there's an error checking for pgvector, default to JSONB
        print(f"⚠️ Could not check for pgvector extension: {e}")
        print("⚠️ Will use JSONB for embeddings instead.")
        create_table_sql = """
        CREATE TABLE IF NOT EXISTS mimic_table_metadata (
            id SERIAL PRIMARY KEY,
            table_name VARCHAR(100) NOT NULL UNIQUE,
            description TEXT,
            table_purpose TEXT,
            columns_info JSONB,
            primary_keys TEXT[],
            foreign_keys JSONB,
            important_considerations TEXT,
            common_joins TEXT[],
            example_questions TEXT[],
            synonyms_and_terms TEXT[],
            embedding JSONB
        );
        """
    
    # Create the table
    cur.execute(create_table_sql)
    conn.commit()
    print("✅ Table 'mimic_table_metadata' is ready.")

# Run the function
create_metadata_table()

✅ pgvector extension is available.
✅ Table 'mimic_table_metadata' is ready.


# admission

In [8]:
def get_column_statistics(table_name):
    """Get statistics for each column in the table"""
    # First get all columns
    cur.execute(f"""
        SELECT column_name, data_type, character_maximum_length
        FROM information_schema.columns
        WHERE table_name = '{table_name}';
    """)
    columns = cur.fetchall()
    
    # Initialize the result dictionary
    columns_info = {}
    
    for col in columns:
        column_name = col[0]
        data_type = col[1]
        max_length = col[2]
        
        # Create basic column info structure
        column_info = {
            "data_type": data_type,
            "description": get_column_description(table_name, column_name),
            "constraints": get_column_constraints(table_name, column_name)
        }
        
        # Add additional statistics based on data type
        if data_type in ('integer', 'numeric', 'double precision', 'smallint'):
            # For numeric columns
            try:
                cur.execute(f"""
                    SELECT 
                        MIN({column_name}) as min_value,
                        MAX({column_name}) as max_value,
                        AVG({column_name}) as avg_value,
                        PERCENTILE_CONT(0.25) WITHIN GROUP(ORDER BY {column_name}) as q1,
                        PERCENTILE_CONT(0.5) WITHIN GROUP(ORDER BY {column_name}) as median,
                        PERCENTILE_CONT(0.75) WITHIN GROUP(ORDER BY {column_name}) as q3
                    FROM {table_name}
                    WHERE {column_name} IS NOT NULL;
                """)
                stats = cur.fetchone()
                if stats[0] is not None:  # Check if we got valid stats
                    column_info["value_range"] = {"min": stats[0], "max": stats[1]}
                    column_info["statistical_summary"] = {
                        "mean": float(stats[2]) if stats[2] else None,
                        "median": float(stats[4]) if stats[4] else None,
                        "q1": float(stats[3]) if stats[3] else None, 
                        "q3": float(stats[5]) if stats[5] else None
                    }
            except Exception as e:
                print(f"Error getting numeric stats for {column_name}: {e}")
        
        elif data_type in ('character varying', 'varchar', 'text', 'char'):
            # For categorical columns, get value distributions
            # Skip long text fields
            if max_length is None or max_length > 100:
                # This might be a long text field, so skip distribution analysis
                pass
            else:
                try:
                    cur.execute(f"""
                        SELECT 
                            {column_name},
                            COUNT(*) as count,
                            ROUND(100.0 * COUNT(*) / SUM(COUNT(*)) OVER(), 2) as percentage
                        FROM {table_name}
                        WHERE {column_name} IS NOT NULL
                        GROUP BY {column_name}
                        ORDER BY count DESC
                        LIMIT 15;
                    """)
                    value_counts = cur.fetchall()
                    
                    if value_counts:
                        # Get unique values
                        categorical_values = [str(row[0]) for row in value_counts]
                        column_info["categorical_values"] = categorical_values
                        
                        # Get value distribution
                        distribution = {str(row[0]): float(row[2]) for row in value_counts}
                        column_info["value_distribution"] = distribution
                except Exception as e:
                    print(f"Error getting categorical stats for {column_name}: {e}")
        
        elif data_type in ('timestamp', 'date', 'time'):
            # For datetime columns
            try:
                cur.execute(f"""
                    SELECT 
                        MIN({column_name}) as min_date,
                        MAX({column_name}) as max_date
                    FROM {table_name}
                    WHERE {column_name} IS NOT NULL;
                """)
                date_stats = cur.fetchone()
                if date_stats[0]:
                    column_info["date_range"] = {
                        "min": date_stats[0].isoformat() if date_stats[0] else None,
                        "max": date_stats[1].isoformat() if date_stats[1] else None
                    }
            except Exception as e:
                print(f"Error getting date stats for {column_name}: {e}")
        
        # Add the column info to our result
        columns_info[column_name] = column_info
    
    return columns_info

def get_column_description(table_name, column_name):
    """Get description for a specific column based on MIMIC-IV documentation"""
    # This is a placeholder function - in a real implementation you would
    # either have these descriptions in a file or database table
    
    # Descriptions for admissions table columns
    descriptions = {
        "subject_id": "Unique identifier for the patient",
        "hadm_id": "Unique identifier for the hospital admission",
        "admittime": "Date and time the patient was admitted to the hospital",
        "dischtime": "Date and time the patient was discharged from the hospital",
        "deathtime": "Time of in-hospital death, if applicable",
        "admission_type": "Classification of the urgency of admission (e.g., ELECTIVE, URGENT)",
        "admit_provider_id": "Anonymous identifier for the provider who admitted the patient",
        "admission_location": "Location of the patient prior to arriving at the hospital",
        "discharge_location": "Disposition of the patient after discharge from hospital",
        "insurance": "Insurance information for the given hospitalization",
        "language": "Patient's primary language",
        "marital_status": "Patient's marital status",
        "race": "Patient's race",
        "edregtime": "Date and time at which the patient was registered in the emergency department",
        "edouttime": "Date and time at which the patient was discharged from the emergency department",
        "hospital_expire_flag": "Binary flag indicating whether the patient died within the given hospitalization"
    }
    
    return descriptions.get(column_name, "")

def get_column_constraints(table_name, column_name):
    """Get constraints for a specific column"""
    constraints = []
    
    # Check for NOT NULL constraint
    cur.execute(f"""
        SELECT is_nullable
        FROM information_schema.columns
        WHERE table_name = '{table_name}' AND column_name = '{column_name}';
    """)
    is_nullable = cur.fetchone()[0]
    if is_nullable == 'NO':
        constraints.append("NOT NULL")
    
    # Check if column is part of primary key
    cur.execute(f"""
        SELECT a.attname
        FROM pg_index i
        JOIN pg_attribute a ON a.attrelid = i.indrelid AND a.attnum = ANY(i.indkey)
        WHERE i.indrelid = '{table_name}'::regclass AND i.indisprimary AND a.attname = '{column_name}';
    """)
    if cur.fetchone():
        constraints.append("PRIMARY KEY")
    
    # Check if column is part of a foreign key - safer approach
    cur.execute(f"""
        SELECT
            conname AS constraint_name,
            pg_get_constraintdef(oid) AS constraint_def
        FROM
            pg_constraint
        WHERE
            conrelid = '{table_name}'::regclass AND contype = 'f';
    """)
    
    for constraint in cur.fetchall():
        constraint_def = constraint[1]
        # Look for the column name in the constraint definition
        # Example: FOREIGN KEY (subject_id) REFERENCES patients(subject_id)
        if f"({column_name})" in constraint_def or f", {column_name}," in constraint_def:
            constraints.append("FOREIGN KEY")
            break
    
    return " ".join(constraints)

def get_primary_keys(table_name):
    """Get primary key columns for the table"""
    cur.execute(f"""
        SELECT a.attname
        FROM pg_index i
        JOIN pg_attribute a ON a.attrelid = i.indrelid AND a.attnum = ANY(i.indkey)
        WHERE i.indrelid = '{table_name}'::regclass AND i.indisprimary;
    """)
    return [row[0] for row in cur.fetchall()]

def get_foreign_keys(table_name):
    """Get foreign key relationships for the table"""
    cur.execute(f"""
        SELECT
            conname AS constraint_name,
            pg_catalog.pg_get_constraintdef(r.oid, true) AS constraint_definition
        FROM
            pg_catalog.pg_constraint r
        WHERE
            r.conrelid = '{table_name}'::regclass AND r.contype = 'f';
    """)
    
    foreign_keys = {}
    for row in cur.fetchall():
        # Parse constraint definition to extract referenced table and column
        constraint_def = row[1]
        # Example format: FOREIGN KEY (subject_id) REFERENCES patients(subject_id)
        fk_parts = constraint_def.split("REFERENCES ")
        if len(fk_parts) > 1:
            # Extract local column
            local_col = fk_parts[0].split("(")[1].split(")")[0].strip()
            
            # Extract referenced table and column
            ref_table_col = fk_parts[1].strip()
            ref_table = ref_table_col.split("(")[0].strip()
            ref_col = ref_table_col.split("(")[1].split(")")[0].strip()
            
            foreign_keys[local_col] = {"table": ref_table, "column": ref_col}
    
    return foreign_keys

def create_admission_metadata():
    """Create metadata for the admissions table"""
    table_name = "admissions"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Table description and purpose
    description = "Detailed information about hospital stays."
    table_purpose = ("The admissions table gives information regarding a patient's admission to the hospital. "
                    "Since each unique hospital visit for a patient is assigned a unique hadm_id, "
                    "the admissions table can be considered as a definition table for hadm_id.")
    
    # Important considerations about the data
    important_considerations = ("The data is sourced from the admission, discharge and transfer database from the hospital "
                              "(often referred to as 'ADT' data). Organ donor accounts are sometimes created for patients "
                              "who died in the hospital. These are distinct hospital admissions with very short, sometimes "
                              "negative lengths of stay.")
    
    # Common table joins
    common_joins = [
        "JOIN patients ON admissions.subject_id = patients.subject_id",
        "LEFT JOIN icustays ON admissions.hadm_id = icustays.hadm_id"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "How many patients were admitted as urgent cases?",
        "What is the average length of stay for elective admissions?",
        "How many patients died during their hospital stay?",
        "What are the most common admission locations?",
        "What percentage of patients have Medicare insurance?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "hospital stay", 
        "patient admission", 
        "inpatient", 
        "hospitalization", 
        "ADT data", 
        "admission discharge transfer", 
        "hospital visit"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_admission_metadata()

✅ Metadata for 'admissions' table has been created.


In [10]:
# Test query to verify metadata was created
cur.execute("SELECT table_name, description FROM mimic_table_metadata;")
metadata_tables = cur.fetchall()
print("\n📊 Metadata Tables:")
for table in metadata_tables:
    print(f"- {table[0]}: {table[1]}")

# Sample query to test retrieval of column information
cur.execute("""
    SELECT columns_info->'admission_type' 
    FROM mimic_table_metadata 
    WHERE table_name='admissions';
""")
admission_type_info = cur.fetchone()[0]
print("\n🔍 Sample Column Info (admission_type):")
print(json.dumps(admission_type_info, indent=2))  # Remove json.loads()


📊 Metadata Tables:
- admissions: Detailed information about hospital stays.

🔍 Sample Column Info (admission_type):
{
  "data_type": "character varying",
  "constraints": "NOT NULL",
  "description": "Classification of the urgency of admission (e.g., ELECTIVE, URGENT)",
  "categorical_values": [
    "EW EMER.",
    "EU OBSERVATION",
    "OBSERVATION ADMIT",
    "URGENT",
    "SURGICAL SAME DAY ADMISSION",
    "DIRECT EMER.",
    "DIRECT OBSERVATION",
    "ELECTIVE",
    "AMBULATORY OBSERVATION"
  ],
  "value_distribution": {
    "URGENT": 10.36,
    "ELECTIVE": 2.45,
    "EW EMER.": 34.65,
    "DIRECT EMER.": 4.53,
    "EU OBSERVATION": 21.98,
    "OBSERVATION ADMIT": 12.21,
    "DIRECT OBSERVATION": 4.34,
    "AMBULATORY OBSERVATION": 1.54,
    "SURGICAL SAME DAY ADMISSION": 7.94
  }
}


In [11]:
# Get all column info for the admissions table
cur.execute("""
    SELECT columns_info
    FROM mimic_table_metadata 
    WHERE table_name='admissions';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info:")
print(json.dumps(all_columns_info, indent=2))


📋 All Columns Info:
{
  "race": {
    "data_type": "character varying",
    "constraints": "",
    "description": "Patient's race",
    "categorical_values": [
      "WHITE",
      "BLACK/AFRICAN AMERICAN",
      "OTHER",
      "UNKNOWN",
      "HISPANIC/LATINO - PUERTO RICAN",
      "WHITE - OTHER EUROPEAN",
      "HISPANIC OR LATINO",
      "ASIAN",
      "ASIAN - CHINESE",
      "WHITE - RUSSIAN",
      "BLACK/CAPE VERDEAN",
      "HISPANIC/LATINO - DOMINICAN",
      "BLACK/CARIBBEAN ISLAND",
      "BLACK/AFRICAN",
      "PATIENT DECLINED TO ANSWER"
    ],
    "value_distribution": {
      "ASIAN": 1.43,
      "OTHER": 3.5,
      "WHITE": 63.29,
      "UNKNOWN": 2.47,
      "BLACK/AFRICAN": 0.59,
      "ASIAN - CHINESE": 1.3,
      "WHITE - RUSSIAN": 1.17,
      "BLACK/CAPE VERDEAN": 1.1,
      "HISPANIC OR LATINO": 1.8,
      "BLACK/AFRICAN AMERICAN": 13.9,
      "BLACK/CARIBBEAN ISLAND": 0.63,
      "WHITE - OTHER EUROPEAN": 1.84,
      "PATIENT DECLINED TO ANSWER": 0.41,
      "

# omr

In [12]:
def create_omr_metadata():
    """Create metadata for the omr table"""
    table_name = "omr"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Table description and purpose
    description = "The Online Medical Record (OMR) table contains miscellaneous information from the EHR."
    table_purpose = ("The Online Medical Record (OMR) table stores miscellaneous information documented in "
                    "the electronic health record. It is a useful source of outpatient measurements such as "
                    "blood pressure, weight, height, and body mass index.")
    
    # Important considerations about the data
    important_considerations = ("Each row provides detail regarding a single observation in the EHR. "
                               "The seq_num field helps distinguish multiple measurements of the same type "
                               "recorded on the same day.")
    
    # Common table joins
    common_joins = [
        "JOIN patients ON omr.subject_id = patients.subject_id"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What's the average BMI for patients in the dataset?",
        "What's the distribution of blood pressure readings?",
        "What's the weight trend for a specific patient over time?",
        "How many patients have height measurements recorded?",
        "What's the most frequently recorded vital sign?",
        "What's the average weight of patients?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "outpatient measurements", 
        "vital signs", 
        "patient metrics",
        "BMI", 
        "blood pressure", 
        "height", 
        "weight", 
        "patient observations",
        "clinical observations",
        "outpatient vitals"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# You'll want to update the get_column_description function to include OMR column descriptions
def get_column_description(table_name, column_name):
    """Get description for a specific column based on MIMIC-IV documentation"""
    descriptions = {
        # Existing admissions table descriptions...
        
        # OMR table descriptions
        "omr.subject_id": "Unique identifier which specifies an individual patient",
        "omr.chartdate": "The date on which the observation was recorded",
        "omr.seq_num": "An monotonically increasing integer which uniquely distinguishes results of the same type recorded on the same day",
        "omr.result_name": "Human interpretable description of the observation (e.g., 'Blood Pressure', 'BMI', 'Weight')",
        "omr.result_value": "The value associated with the given OMR observation (e.g., '120/80', '25.5', '150')"
    }
    
    # Try with table prefix
    full_key = f"{table_name}.{column_name}"
    if full_key in descriptions:
        return descriptions[full_key]
    
    # Try without table prefix
    return descriptions.get(column_name, "")

# Run the function to create OMR metadata
create_omr_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info->'result_name' 
    FROM mimic_table_metadata 
    WHERE table_name='omr';
""")
result_name_info = cur.fetchone()[0]
print("\n🔍 Sample Column Info (result_name):")
print(json.dumps(result_name_info, indent=2))

✅ Metadata for 'omr' table has been created.

🔍 Sample Column Info (result_name):
{
  "data_type": "character varying",
  "constraints": "NOT NULL PRIMARY KEY",
  "description": "Human interpretable description of the observation (e.g., 'Blood Pressure', 'BMI', 'Weight')",
  "categorical_values": [
    "Blood Pressure",
    "Weight (Lbs)",
    "BMI (kg/m2)",
    "Height (Inches)",
    "Blood Pressure Sitting",
    "Blood Pressure Lying",
    "Blood Pressure Standing (1 min)",
    "Blood Pressure Standing (3 mins)",
    "BMI",
    "Blood Pressure Standing",
    "Weight",
    "eGFR",
    "Height"
  ],
  "value_distribution": {
    "BMI": 0.01,
    "eGFR": 0.0,
    "Height": 0.0,
    "Weight": 0.01,
    "BMI (kg/m2)": 25.81,
    "Weight (Lbs)": 29.34,
    "Blood Pressure": 33.69,
    "Height (Inches)": 10.98,
    "Blood Pressure Lying": 0.04,
    "Blood Pressure Sitting": 0.05,
    "Blood Pressure Standing": 0.01,
    "Blood Pressure Standing (1 min)": 0.04,
    "Blood Pressure Standing (

# d_hcpcs

In [14]:
def create_d_hcpcs_metadata():
    """Create metadata for the d_hcpcs table"""
    table_name = "d_hcpcs"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions since they're missing
    columns_info["code"]["description"] = "A five character code which uniquely represents the event"
    columns_info["category"]["description"] = "Broad classification of the code"
    columns_info["long_description"]["description"] = "Detailed textual description of the code"
    columns_info["short_description"]["description"] = "Brief textual description of the code"
    
    # Table description and purpose
    description = "Dimension table for hcpcsevents; provides a description of CPT codes."
    table_purpose = ("The d_hcpcs table is used to acquire human readable definitions for the codes used in the "
                    "hcpcsevents table. The concepts primarily correspond to hospital billing, and are mostly CPT codes.")
    
    # Important considerations about the data
    important_considerations = ("This is a reference table that helps interpret the codes in the hcpcsevents table. "
                               "Some code definitions may be missing due to licensing restrictions.")
    
    # Common table joins
    common_joins = [
        "JOIN hcpcsevents ON d_hcpcs.code = hcpcsevents.code"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What is the description for CPT code 12345?",
        "What are all the CPT codes in category 1?",
        "How many different CPT codes are in the database?",
        "Which CPT code is used for a specific procedure?",
        "What are the categories of CPT codes available?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "CPT codes", 
        "HCPCS codes",
        "procedure codes",
        "billing codes",
        "hospital billing",
        "medical coding",
        "healthcare common procedure coding system"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_d_hcpcs_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info->'code' 
    FROM mimic_table_metadata 
    WHERE table_name='d_hcpcs';
""")
code_info = cur.fetchone()[0]
print("\n🔍 Sample Column Info (code):")
print(json.dumps(code_info, indent=2))

✅ Metadata for 'd_hcpcs' table has been created.

🔍 Sample Column Info (code):
{
  "data_type": "character",
  "constraints": "NOT NULL PRIMARY KEY",
  "description": "A five character code which uniquely represents the event"
}


In [15]:
# 获取 d_hcpcs 表的所有列信息
cur.execute("""
    SELECT columns_info
    FROM mimic_table_metadata 
    WHERE table_name='d_hcpcs';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for d_hcpcs:")
print(json.dumps(all_columns_info, indent=2))


📋 All Columns Info for d_hcpcs:
{
  "code": {
    "data_type": "character",
    "constraints": "NOT NULL PRIMARY KEY",
    "description": "A five character code which uniquely represents the event"
  },
  "category": {
    "data_type": "smallint",
    "constraints": "",
    "description": "Broad classification of the code",
    "value_range": {
      "max": 3,
      "min": 1
    },
    "statistical_summary": {
      "q1": 1.0,
      "q3": 1.0,
      "mean": 1.0293481956384478,
      "median": 1.0
    }
  },
  "long_description": {
    "data_type": "text",
    "constraints": "",
    "description": "Detailed textual description of the code"
  },
  "short_description": {
    "data_type": "character varying",
    "constraints": "",
    "description": "Brief textual description of the code"
  }
}


# d_icd_diagnoses

In [16]:
def create_d_icd_diagnoses_metadata():
    """Create metadata for the d_icd_diagnoses table"""
    table_name = "d_icd_diagnoses"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'icd_code' in columns_info:
        columns_info['icd_code']['description'] = "International Coding Definitions (ICD) code for diagnosis"
    
    if 'icd_version' in columns_info:
        columns_info['icd_version']['description'] = "Version of the ICD coding system (9 or 10)"
    
    if 'long_title' in columns_info:
        columns_info['long_title']['description'] = "The meaning of the ICD code (e.g., 'Cholera due to vibrio cholerae')"
    
    # Table description and purpose
    description = "Dimension table for diagnoses_icd; provides a description of ICD-9/ICD-10 billed diagnoses."
    table_purpose = ("This table defines International Classification of Diseases (ICD) Version 9 and 10 codes for diagnoses. "
                     "These codes are assigned at the end of the patient's stay and are used by the hospital to bill for care provided.")
    
    # Important considerations about the data
    important_considerations = ("ICD-9 and ICD-10 codes have distinct formats: ICD-9 codes are 5 character long strings which are "
                                "entirely numeric (with the exception of codes prefixed with 'E' or 'V'). ICD-10 codes are 3-7 "
                                "characters long and always prefixed by a letter followed by numeric values. Both versions often "
                                "include a decimal in presentation, but the decimal is not stored in the database.")
    
    # Common table joins
    common_joins = [
        "JOIN diagnoses_icd ON d_icd_diagnoses.icd_code = diagnoses_icd.icd_code AND d_icd_diagnoses.icd_version = diagnoses_icd.icd_version"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What is the description for ICD code '0010'?",
        "How many different ICD-10 diagnoses are in the database?",
        "What are all the diabetes-related diagnoses?",
        "What is the meaning of ICD-9 code 'V30.00'?",
        "Which ICD codes are related to heart failure?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "ICD codes", 
        "diagnosis codes",
        "diagnostic codes",
        "medical coding",
        "ICD-9",
        "ICD-10",
        "billing codes",
        "International Classification of Diseases"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_d_icd_diagnoses_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='d_icd_diagnoses';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for d_icd_diagnoses:")
print(json.dumps(all_columns_info, indent=2))

✅ Metadata for 'd_icd_diagnoses' table has been created.

📋 All Columns Info for d_icd_diagnoses:
{
  "icd_code": {
    "data_type": "character",
    "constraints": "NOT NULL PRIMARY KEY",
    "description": "International Coding Definitions (ICD) code for diagnosis"
  },
  "long_title": {
    "data_type": "character varying",
    "constraints": "",
    "description": "The meaning of the ICD code (e.g., 'Cholera due to vibrio cholerae')"
  },
  "icd_version": {
    "data_type": "integer",
    "constraints": "NOT NULL PRIMARY KEY",
    "description": "Version of the ICD coding system (9 or 10)",
    "value_range": {
      "max": 10,
      "min": 9
    },
    "statistical_summary": {
      "q1": 10.0,
      "q3": 10.0,
      "mean": 9.866399453427466,
      "median": 10.0
    }
  }
}


# d_labitems

In [17]:
def create_d_labitems_metadata():
    """Create metadata for the d_labitems table"""
    table_name = "d_labitems"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'itemid' in columns_info:
        columns_info['itemid']['description'] = "A unique identifier for a laboratory concept"
    
    if 'label' in columns_info:
        columns_info['label']['description'] = "Describes the concept which is represented by the itemid"
    
    if 'fluid' in columns_info:
        columns_info['fluid']['description'] = "Describes the substance on which the measurement was made (e.g., 'BLOOD')"
    
    if 'category' in columns_info:
        columns_info['category']['description'] = "Provides higher level information as to the type of measurement (e.g., 'ABG')"
    
    # Table description and purpose
    description = "Dimension table for labevents provides a description of all lab items."
    table_purpose = ("d_labitems contains definitions for all itemid associated with lab measurements in the MIMIC database. "
                     "All data in labevents link to the d_labitems table. Each unique (fluid, category, label) tuple in the "
                     "hospital database was assigned an itemid in this table, and the use of this itemid facilitates efficient "
                     "storage and querying of the data.")
    
    # Important considerations about the data
    important_considerations = ("This table used to contain a column called loinc_code, which stored standardized identifiers "
                                "for laboratory measurements. To support ongoing improvement of these labels, the assignment "
                                "of LOINC codes is now done in the MIMIC Code Repository.")
    
    # Common table joins
    common_joins = [
        "JOIN labevents ON d_labitems.itemid = labevents.itemid"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What are all the blood chemistry tests available?",
        "What laboratory measurements are performed on urine samples?",
        "How many different categories of lab tests are there?",
        "Which itemid corresponds to a specific lab test like glucose?",
        "What are all the tests in the ABG category?",
        "What fluid types are available for creatinine measurement?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "lab tests", 
        "laboratory measurements",
        "lab items",
        "blood tests",
        "clinical tests",
        "diagnostic tests",
        "lab codes",
        "lab dictionary"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_d_labitems_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='d_labitems';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for d_labitems:")
print(json.dumps(all_columns_info, indent=2))

✅ Metadata for 'd_labitems' table has been created.

📋 All Columns Info for d_labitems:
{
  "fluid": {
    "data_type": "character varying",
    "constraints": "",
    "description": "Describes the substance on which the measurement was made (e.g., 'BLOOD')",
    "categorical_values": [
      "Blood",
      "Urine",
      "Other Body Fluid",
      "Cerebrospinal Fluid",
      "Joint Fluid",
      "Pleural",
      "Ascites",
      "Bone Marrow",
      "Stool",
      "Q",
      "Fluid",
      "I"
    ],
    "value_distribution": {
      "I": 0.06,
      "Q": 0.55,
      "Blood": 49.75,
      "Fluid": 0.31,
      "Stool": 2.65,
      "Urine": 11.96,
      "Ascites": 4.19,
      "Pleural": 4.25,
      "Bone Marrow": 3.88,
      "Joint Fluid": 4.44,
      "Other Body Fluid": 11.84,
      "Cerebrospinal Fluid": 6.1
    }
  },
  "label": {
    "data_type": "character varying",
    "constraints": "",
    "description": "Describes the concept which is represented by the itemid",
    "categorica

# d_icd_procedures

In [18]:
def create_d_icd_procedures_metadata():
    """Create metadata for the d_icd_procedures table"""
    table_name = "d_icd_procedures"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'icd_code' in columns_info:
        columns_info['icd_code']['description'] = "International Coding Definitions (ICD) code for procedures"
    
    if 'icd_version' in columns_info:
        columns_info['icd_version']['description'] = "Version of the ICD coding system (9 or 10)"
    
    if 'long_title' in columns_info:
        columns_info['long_title']['description'] = "The meaning of the ICD procedure code"
    
    # Table description and purpose
    description = "Dimension table for procedures_icd; provides a description of ICD-9/ICD-10 billed procedures."
    table_purpose = ("This table defines International Classification of Diseases (ICD) codes for procedures. "
                     "These codes are assigned at the end of the patient's stay and are used by the hospital to bill for care provided. "
                     "They can further be used to identify if certain procedures have been performed (e.g. surgery).")
    
    # Important considerations about the data
    important_considerations = ("ICD-9 and ICD-10 codes have distinct formats. Both versions are often presented with a decimal, "
                                "but the decimal is not stored in the database (i.e., the icd_code of '0010' is equivalent to '001.0'). "
                                "In general, ICD-10 codes are more detailed, though code mappings exist which convert ICD-9 codes to ICD-10 codes.")
    
    # Common table joins
    common_joins = [
        "JOIN procedures_icd ON d_icd_procedures.icd_code = procedures_icd.icd_code AND d_icd_procedures.icd_version = procedures_icd.icd_version"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What is the description for procedure code '0010'?",
        "How many different ICD-10 procedures are in the database?",
        "What are all the cardiac surgery procedures?",
        "What is the meaning of ICD-9 procedure code '39.61'?",
        "Which ICD codes are related to ventilation procedures?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "ICD procedure codes", 
        "procedure codes",
        "surgical codes",
        "medical procedure coding",
        "ICD-9 procedures",
        "ICD-10 procedures",
        "billing procedure codes",
        "International Classification of Diseases procedures"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_d_icd_procedures_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='d_icd_procedures';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for d_icd_procedures:")
print(json.dumps(all_columns_info, indent=2))

✅ Metadata for 'd_icd_procedures' table has been created.

📋 All Columns Info for d_icd_procedures:
{
  "icd_code": {
    "data_type": "character",
    "constraints": "NOT NULL PRIMARY KEY",
    "description": "International Coding Definitions (ICD) code for procedures"
  },
  "long_title": {
    "data_type": "character varying",
    "constraints": "",
    "description": "The meaning of the ICD procedure code"
  },
  "icd_version": {
    "data_type": "integer",
    "constraints": "NOT NULL PRIMARY KEY",
    "description": "Version of the ICD coding system (9 or 10)",
    "value_range": {
      "max": 10,
      "min": 9
    },
    "statistical_summary": {
      "q1": 10.0,
      "q3": 10.0,
      "mean": 9.954396706428797,
      "median": 10.0
    }
  }
}


# patients

In [19]:
def create_patients_metadata():
    """Create metadata for the patients table"""
    table_name = "patients"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'subject_id' in columns_info:
        columns_info['subject_id']['description'] = "Unique identifier which specifies an individual patient"
    
    if 'gender' in columns_info:
        columns_info['gender']['description'] = "Genotypical sex of the patient"
    
    if 'anchor_age' in columns_info:
        columns_info['anchor_age']['description'] = "Patient's age in the anchor_year (patients over 89 have anchor_age of 91)"
    
    if 'anchor_year' in columns_info:
        columns_info['anchor_year']['description'] = "Shifted year for the patient (for de-identification)"
    
    if 'anchor_year_group' in columns_info:
        columns_info['anchor_year_group']['description'] = "Range of years during which the patient's anchor_year occurred"
    
    if 'dod' in columns_info:
        columns_info['dod']['description'] = "De-identified date of death for the patient, if applicable"
    
    # Table description and purpose
    description = "Patients' gender, age, and date of death if information exists."
    table_purpose = ("Information that is consistent for the lifetime of a patient is stored in this table. "
                     "This is the primary demographic table for patients in MIMIC-IV.")
    
    # Important considerations about the data
    important_considerations = ("Due to de-identification, out of hospital mortality is only available "
                                "up to one year post-hospital discharge. All patient deaths occurring more than "
                                "one year after hospital discharge are censored. Patients with age over 89 in "
                                "the anchor_year have their anchor_age set to 91, regardless of actual age.")
    
    # Common table joins
    common_joins = [
        "JOIN admissions ON patients.subject_id = admissions.subject_id",
        "JOIN omr ON patients.subject_id = omr.subject_id",
        "JOIN labevents ON patients.subject_id = labevents.subject_id"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What is the gender distribution of patients in the database?",
        "How many patients died during their hospital stay?",
        "What is the average age of patients?",
        "How does mortality rate vary by gender?",
        "What percentage of patients are over 65 years old?",
        "How many patients died within one year after discharge?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "demographics", 
        "patient information",
        "patient demographics",
        "mortality data",
        "patient age",
        "patient gender",
        "patient death",
        "patient characteristics"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_patients_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='patients';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for patients:")
print(json.dumps(all_columns_info, indent=2))

✅ Metadata for 'patients' table has been created.

📋 All Columns Info for patients:
{
  "dod": {
    "data_type": "timestamp without time zone",
    "constraints": "",
    "description": "De-identified date of death for the patient, if applicable"
  },
  "gender": {
    "data_type": "character varying",
    "constraints": "NOT NULL",
    "description": "Genotypical sex of the patient",
    "categorical_values": [
      "F",
      "M"
    ],
    "value_distribution": {
      "F": 52.9,
      "M": 47.1
    }
  },
  "anchor_age": {
    "data_type": "integer",
    "constraints": "NOT NULL",
    "description": "Patient's age in the anchor_year (patients over 89 have anchor_age of 91)",
    "value_range": {
      "max": 91,
      "min": 18
    },
    "statistical_summary": {
      "q1": 29.0,
      "q3": 65.0,
      "mean": 48.53988829276105,
      "median": 48.0
    }
  },
  "subject_id": {
    "data_type": "integer",
    "constraints": "NOT NULL PRIMARY KEY",
    "description": "Unique ide

# diagnoses_icd

In [None]:
def create_diagnoses_icd_metadata():
    """Create metadata for the diagnoses_icd table"""
    table_name = "diagnoses_icd"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'subject_id' in columns_info:
        columns_info['subject_id']['description'] = "Unique identifier which specifies an individual patient"
    
    if 'hadm_id' in columns_info:
        columns_info['hadm_id']['description'] = "Integer identifier which is unique for each patient hospitalization"
    
    if 'seq_num' in columns_info:
        columns_info['seq_num']['description'] = "Priority assigned to the diagnoses, representing the ranking of importance"
    
    if 'icd_code' in columns_info:
        columns_info['icd_code']['description'] = "International Coding Definitions (ICD) code for the diagnosis"
    
    if 'icd_version' in columns_info:
        columns_info['icd_version']['description'] = "Version of the ICD coding system (9 or 10)"
    
    # Table description and purpose
    description = "Billed ICD-9/ICD-10 diagnoses for hospitalizations."
    table_purpose = ("During routine hospital care, patients are billed by the hospital for diagnoses associated with "
                     "their hospital stay. This table contains a record of all diagnoses a patient was billed for during "
                     "their hospital stay using the ICD-9 and ICD-10 ontologies. Diagnoses are billed on hospital discharge, "
                     "and are determined by trained persons who read signed clinical notes.")
    
    # Important considerations about the data
    important_considerations = ("The priority (seq_num) can be interpreted as a ranking of which diagnoses are 'important', "
                                "but many caveats to this broad statement exist. For example, patients who are diagnosed with "
                                "sepsis must have sepsis as their 2nd billed condition. ICD codes are often presented with a "
                                "decimal in standard notation, but the decimal is not stored in the database.")
    
    # Common table joins
    common_joins = [
        "JOIN d_icd_diagnoses ON diagnoses_icd.icd_code = d_icd_diagnoses.icd_code AND diagnoses_icd.icd_version = d_icd_diagnoses.icd_version",
        "JOIN admissions ON diagnoses_icd.hadm_id = admissions.hadm_id",
        "JOIN patients ON diagnoses_icd.subject_id = patients.subject_id"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What are the most common diagnoses?",
        "How many patients were diagnosed with diabetes?",
        "What is the average number of diagnoses per admission?",
        "What percentage of patients have heart-related diagnoses?",
        "Which diagnoses frequently occur together?",
        "What are the top 10 primary diagnoses (seq_num = 1)?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "diagnoses", 
        "patient diagnoses",
        "ICD diagnoses",
        "medical conditions",
        "disease codes",
        "ICD-9 diagnoses",
        "ICD-10 diagnoses",
        "diagnostic codes",
        "billing diagnoses"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_diagnoses_icd_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='diagnoses_icd';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for diagnoses_icd:")
print(json.dumps(all_columns_info, indent=2))

# drgcodes

In [1]:
def create_drgcodes_metadata():
    """Create metadata for the drgcodes table"""
    table_name = "drgcodes"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'subject_id' in columns_info:
        columns_info['subject_id']['description'] = "Unique identifier which specifies an individual patient"
    
    if 'hadm_id' in columns_info:
        columns_info['hadm_id']['description'] = "Integer identifier which is unique for each patient hospitalization"
    
    if 'drg_type' in columns_info:
        columns_info['drg_type']['description'] = "The specific DRG ontology used for the code"
    
    if 'drg_code' in columns_info:
        columns_info['drg_code']['description'] = "The DRG code"
    
    if 'description' in columns_info:
        columns_info['description']['description'] = "A description for the given DRG code"
    
    if 'drg_severity' in columns_info:
        columns_info['drg_severity']['description'] = "Qualifier for the patient severity of illness"
    
    if 'drg_mortality' in columns_info:
        columns_info['drg_mortality']['description'] = "Qualifier for the patient likelihood of mortality"
    
    # Table description and purpose
    description = "Billed diagnosis related group (DRG) codes for hospitalizations."
    table_purpose = ("Diagnosis related groups (DRGs) are used by the hospital to obtain reimbursement for a patient's "
                     "hospital stay. The codes correspond to the primary reason for a patient's stay at the hospital.")
    
    # Important considerations about the data
    important_considerations = ("DRG codes are primarily used for billing and reimbursement purposes. Some DRG ontologies "
                                "further qualify the patient severity of illness and likelihood of mortality through the "
                                "drg_severity and drg_mortality fields.")
    
    # Common table joins
    common_joins = [
        "JOIN admissions ON drgcodes.hadm_id = admissions.hadm_id",
        "JOIN patients ON drgcodes.subject_id = patients.subject_id"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What are the most common DRG codes?",
        "What is the average severity across different DRG types?",
        "Which DRG codes have the highest mortality risk?",
        "What percentage of patients have high-severity DRG codes?",
        "What is the distribution of DRG types?",
        "Are certain DRG codes more common in specific demographic groups?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "DRG codes", 
        "diagnosis related groups",
        "billing codes",
        "reimbursement codes",
        "hospital billing",
        "hospital payment",
        "primary diagnosis codes",
        "inpatient prospective payment system"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_drgcodes_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='drgcodes';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for drgcodes:")
print(json.dumps(all_columns_info, indent=2))

NameError: name 'get_column_statistics' is not defined

# emar

In [None]:
def create_emar_metadata():
    """Create metadata for the emar table"""
    table_name = "emar"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'subject_id' in columns_info:
        columns_info['subject_id']['description'] = "Unique identifier which specifies an individual patient"
    
    if 'hadm_id' in columns_info:
        columns_info['hadm_id']['description'] = "Integer identifier which is unique for each patient hospitalization"
    
    if 'emar_id' in columns_info:
        columns_info['emar_id']['description'] = "Unique identifier for each order made in eMAR"
    
    if 'emar_seq' in columns_info:
        columns_info['emar_seq']['description'] = "Consecutive integer which numbers eMAR orders chronologically"
    
    if 'poe_id' in columns_info:
        columns_info['poe_id']['description'] = "Identifier which links administrations in emar to orders in poe and prescriptions"
    
    if 'pharmacy_id' in columns_info:
        columns_info['pharmacy_id']['description'] = "Identifier which links administrations in emar to pharmacy information"
    
    if 'enter_provider_id' in columns_info:
        columns_info['enter_provider_id']['description'] = "Anonymous identifier for the provider who entered the information"
    
    if 'charttime' in columns_info:
        columns_info['charttime']['description'] = "Time at which the medication was administered"
    
    if 'medication' in columns_info:
        columns_info['medication']['description'] = "Name of the medication which was administered"
    
    if 'event_txt' in columns_info:
        columns_info['event_txt']['description'] = "Information about the administration (e.g., 'Administered', 'Applied')"
    
    if 'scheduletime' in columns_info:
        columns_info['scheduletime']['description'] = "Time at which the administration was scheduled (if present)"
    
    if 'storetime' in columns_info:
        columns_info['storetime']['description'] = "Time at which the administration was documented in the eMAR table"
    
    # Table description and purpose
    description = "The Electronic Medicine Administration Record (eMAR); barcode scanning of medications at the time of administration."
    table_purpose = ("The EMAR table is used to record administrations of a given medicine to an individual patient. "
                     "Records in this table are populated by bedside nursing staff scanning barcodes associated with "
                     "the medicine and the patient.")
    
    # Important considerations about the data
    important_considerations = ("The eMAR system was implemented during 2011-2013. As a result, eMAR data is not available "
                                "for all patients. The emar_id is composed of subject_id and emar_seq in the pattern: "
                                "'subject_id-emar_seq'.")
    
    # Common table joins
    common_joins = [
        "JOIN emar_detail ON emar.emar_id = emar_detail.emar_id",
        "JOIN pharmacy ON emar.pharmacy_id = pharmacy.pharmacy_id",
        "JOIN prescriptions ON emar.pharmacy_id = prescriptions.pharmacy_id",
        "JOIN poe ON emar.poe_id = poe.poe_id",
        "JOIN patients ON emar.subject_id = patients.subject_id",
        "JOIN admissions ON emar.hadm_id = admissions.hadm_id"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What are the most commonly administered medications?",
        "What percentage of scheduled medications were not given?",
        "What's the average time difference between scheduled and actual medication administration?",
        "Which medications have the highest rate of being marked as 'Not Given'?",
        "How many medications does a typical patient receive during their hospital stay?",
        "Who are the providers that administer the most medications?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "medication administration", 
        "medicine administration",
        "drug administration",
        "eMAR",
        "electronic MAR",
        "medication record",
        "medication barcode scanning",
        "bedside medication"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_emar_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='emar';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for emar:")
print(json.dumps(all_columns_info, indent=2))

# emar_detail

In [None]:
def create_emar_detail_metadata():
    """Create metadata for the emar_detail table"""
    table_name = "emar_detail"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'subject_id' in columns_info:
        columns_info['subject_id']['description'] = "Unique identifier which specifies an individual patient"
    
    if 'emar_id' in columns_info:
        columns_info['emar_id']['description'] = "Unique identifier for each order made in eMAR"
    
    if 'emar_seq' in columns_info:
        columns_info['emar_seq']['description'] = "Consecutive integer which numbers eMAR orders chronologically"
    
    if 'parent_field_ordinal' in columns_info:
        columns_info['parent_field_ordinal']['description'] = "Delineates multiple administrations for the same eMAR event (NULL for main record, '1.1', '1.2', etc. for individual doses)"
    
    if 'administration_type' in columns_info:
        columns_info['administration_type']['description'] = "Type of administration (IV Bolus, IV Infusion, Transdermal Patch, etc.)"
    
    if 'pharmacy_id' in columns_info:
        columns_info['pharmacy_id']['description'] = "Identifier linking to pharmacy information in the pharmacy table"
    
    if 'barcode_type' in columns_info:
        columns_info['barcode_type']['description'] = "Type of barcode scanned during administration"
    
    if 'reason_for_no_barcode' in columns_info:
        columns_info['reason_for_no_barcode']['description'] = "Reason why barcode was not scanned, if applicable"
    
    if 'complete_dose_not_given' in columns_info:
        columns_info['complete_dose_not_given']['description'] = "Indicator if the complete dose was not given"
    
    if 'dose_due' in columns_info:
        columns_info['dose_due']['description'] = "Amount of medication that was due to be administered"
    
    if 'dose_due_unit' in columns_info:
        columns_info['dose_due_unit']['description'] = "Unit of measurement for dose_due"
    
    if 'dose_given' in columns_info:
        columns_info['dose_given']['description'] = "Amount of medication that was actually administered"
    
    if 'dose_given_unit' in columns_info:
        columns_info['dose_given_unit']['description'] = "Unit of measurement for dose_given"
    
    if 'will_remainder_of_dose_be_given' in columns_info:
        columns_info['will_remainder_of_dose_be_given']['description'] = "Indicator if remainder of dose will be given later"
    
    if 'product_amount_given' in columns_info:
        columns_info['product_amount_given']['description'] = "Amount of product given"
    
    if 'product_unit' in columns_info:
        columns_info['product_unit']['description'] = "Unit of measurement for product"
    
    if 'product_code' in columns_info:
        columns_info['product_code']['description'] = "Code identifying the specific product"
    
    if 'product_description' in columns_info:
        columns_info['product_description']['description'] = "Description of the product administered"
    
    if 'route' in columns_info:
        columns_info['route']['description'] = "Route of administration (IV, PO, etc.)"
    
    if 'infusion_rate' in columns_info:
        columns_info['infusion_rate']['description'] = "Rate at which the medication is infused"
    
    if 'infusion_rate_unit' in columns_info:
        columns_info['infusion_rate_unit']['description'] = "Unit of measurement for infusion rate"
    
    if 'site' in columns_info:
        columns_info['site']['description'] = "Anatomical site where medication was administered"
    
    # Table description and purpose
    description = "Supplementary information for electronic administrations recorded in emar."
    table_purpose = ("The emar_detail table contains information for each medicine administration made in the EMAR table. "
                     "Information includes the associated pharmacy order, the dose due, the dose given, and many other "
                     "parameters associated with the medical administration.")
    
    # Important considerations about the data
    important_considerations = ("The eMAR system was implemented during 2011-2013. As a result, eMAR data is not available "
                                "for all patients. Multiple rows in emar_detail may correspond to a single row in emar, "
                                "especially when multiple formulary doses are administered to achieve the desired total dose. "
                                "The parent_field_ordinal field is used to distinguish between main records (NULL) and "
                                "individual administrations ('1.1', '1.2', etc.).")
    
    # Common table joins
    common_joins = [
        "JOIN emar ON emar_detail.emar_id = emar.emar_id",
        "JOIN pharmacy ON emar_detail.pharmacy_id = pharmacy.pharmacy_id",
        "JOIN patients ON emar_detail.subject_id = patients.subject_id"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What's the difference between prescribed doses and administered doses?",
        "Which administration routes are most common for specific medications?",
        "What percentage of medication administrations are completed?",
        "How often are medications administered without barcode scanning?",
        "What are the typical infusion rates for specific IV medications?",
        "Which anatomical sites are most commonly used for specific medications?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "medication administration details", 
        "drug administration details",
        "eMAR details",
        "medication dosing",
        "drug dosing",
        "medication administration parameters",
        "medication delivery",
        "infusion details"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_emar_detail_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='emar_detail';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for emar_detail:")
print(json.dumps(all_columns_info, indent=2))

# hcpcsevents

In [None]:
def create_hcpcsevents_metadata():
    """Create metadata for the hcpcsevents table"""
    table_name = "hcpcsevents"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'subject_id' in columns_info:
        columns_info['subject_id']['description'] = "Unique identifier which specifies an individual patient"
    
    if 'hadm_id' in columns_info:
        columns_info['hadm_id']['description'] = "Integer identifier which is unique for each patient hospitalization"
    
    if 'chartdate' in columns_info:
        columns_info['chartdate']['description'] = "The date associated with the coded event"
    
    if 'hcpcs_cd' in columns_info:
        columns_info['hcpcs_cd']['description'] = "A five character code which uniquely represents the event; links to d_hcpcs table"
    
    if 'seq_num' in columns_info:
        columns_info['seq_num']['description'] = "An assigned order to HCPCS codes for an individual hospitalization"
    
    if 'short_description' in columns_info:
        columns_info['short_description']['description'] = "A short textual description of the HCPCS code"
    
    # Table description and purpose
    description = "Billed events occurring during the hospitalization. Includes CPT codes."
    table_purpose = ("The hcpcsevents table contains billable events and procedures that occurred during "
                     "a patient's hospitalization, primarily using CPT (Current Procedural Terminology) codes "
                     "which are a subset of HCPCS codes.")
    
    # Important considerations about the data
    important_considerations = ("The seq_num field indicates an order for HCPCS codes, which sometimes conveys "
                                "priority or importance, but this interpretation is not guaranteed across all codes. "
                                "For complete code descriptions, link to the d_hcpcs table using the hcpcs_cd field.")
    
    # Common table joins
    common_joins = [
        "JOIN d_hcpcs ON hcpcsevents.hcpcs_cd = d_hcpcs.code",
        "JOIN admissions ON hcpcsevents.hadm_id = admissions.hadm_id",
        "JOIN patients ON hcpcsevents.subject_id = patients.subject_id"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What are the most common procedures performed during hospitalizations?",
        "How many patients underwent specific procedures during their stay?",
        "What procedures are most frequently performed on patients with specific diagnoses?",
        "What is the distribution of procedures across different demographic groups?",
        "What procedures typically occur early in a hospitalization versus later?",
        "Which procedures are most commonly performed together?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "procedures", 
        "CPT codes",
        "HCPCS codes",
        "billable procedures",
        "medical procedures",
        "patient procedures",
        "procedural codes",
        "hospital procedures",
        "billable events"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_hcpcsevents_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='hcpcsevents';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for hcpcsevents:")
print(json.dumps(all_columns_info, indent=2))

# labevents

In [None]:
def create_labevents_metadata():
    """Create metadata for the labevents table"""
    table_name = "labevents"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'labevent_id' in columns_info:
        columns_info['labevent_id']['description'] = "Unique identifier for each row in the table"
    
    if 'subject_id' in columns_info:
        columns_info['subject_id']['description'] = "Unique identifier which specifies an individual patient"
    
    if 'hadm_id' in columns_info:
        columns_info['hadm_id']['description'] = "Integer identifier which is unique for each patient hospitalization"
    
    if 'specimen_id' in columns_info:
        columns_info['specimen_id']['description'] = "Uniquely denotes the specimen from which the lab measurement was made"
    
    if 'itemid' in columns_info:
        columns_info['itemid']['description'] = "Identifier which uniquely denotes laboratory concepts; links to d_labitems table"
    
    if 'order_provider_id' in columns_info:
        columns_info['order_provider_id']['description'] = "Anonymous identifier for the provider who ordered the laboratory measurement"
    
    if 'charttime' in columns_info:
        columns_info['charttime']['description'] = "Time at which the laboratory measurement was charted (usually when specimen was acquired)"
    
    if 'storetime' in columns_info:
        columns_info['storetime']['description'] = "Time at which the measurement was made available in the laboratory system"
    
    if 'value' in columns_info:
        columns_info['value']['description'] = "The result of the laboratory measurement as text"
    
    if 'valuenum' in columns_info:
        columns_info['valuenum']['description'] = "The result of the laboratory measurement as a numeric value, if applicable"
    
    if 'valueuom' in columns_info:
        columns_info['valueuom']['description'] = "The unit of measurement for the laboratory concept"
    
    if 'ref_range_lower' in columns_info:
        columns_info['ref_range_lower']['description'] = "Lower reference range indicating the normal range for the laboratory measurement"
    
    if 'ref_range_upper' in columns_info:
        columns_info['ref_range_upper']['description'] = "Upper reference range indicating the normal range for the laboratory measurement"
    
    if 'flag' in columns_info:
        columns_info['flag']['description'] = "Brief string mainly used to indicate if the laboratory measurement is abnormal"
    
    if 'priority' in columns_info:
        columns_info['priority']['description'] = "Priority of the laboratory measurement: either routine or stat (urgent)"
    
    if 'comments' in columns_info:
        columns_info['comments']['description'] = "Deidentified free-text comments associated with the laboratory measurement"
    
    # Table description and purpose
    description = "Laboratory measurements sourced from patient derived specimens."
    table_purpose = ("The labevents table stores the results of all laboratory measurements made for a single patient. "
                     "These include hematology measurements, blood gases, chemistry panels, and less common tests such as genetic assays.")
    
    # Important considerations about the data
    important_considerations = ("hadm_id is assigned to labs close to the hospital stay using the transfers table. However, "
                               "this does not always perfectly capture labs proximal to the hospital stay. Some lab "
                               "observations may not have an hadm_id if they fall outside of a row in transfers or "
                               "do not have an associated ED stay identifier.")
    
    # Common table joins
    common_joins = [
        "JOIN d_labitems ON labevents.itemid = d_labitems.itemid",
        "JOIN patients ON labevents.subject_id = patients.subject_id",
        "JOIN admissions ON labevents.hadm_id = admissions.hadm_id"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What are the normal ranges for specific laboratory tests?",
        "How many lab tests were ordered as stat (urgent) versus routine?",
        "Which lab values were abnormal (flagged) for a specific patient?",
        "What is the average time between when a lab is drawn (charttime) and when results are available (storetime)?",
        "Which lab tests are most commonly ordered for specific conditions?",
        "What is the trend of a specific lab value (e.g., hemoglobin) over a patient's hospital stay?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "lab tests", 
        "laboratory results",
        "lab values",
        "blood tests",
        "lab measurements",
        "laboratory data",
        "diagnostic tests",
        "clinical laboratory tests",
        "medical tests"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_labevents_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='labevents';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for labevents:")
print(json.dumps(all_columns_info, indent=2))

# microbiologyevents

In [None]:
def create_microbiologyevents_metadata():
    """Create metadata for the microbiologyevents table"""
    table_name = "microbiologyevents"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'microevent_id' in columns_info:
        columns_info['microevent_id']['description'] = "A unique integer denoting the row"
    
    if 'subject_id' in columns_info:
        columns_info['subject_id']['description'] = "Unique identifier which specifies an individual patient"
    
    if 'hadm_id' in columns_info:
        columns_info['hadm_id']['description'] = "Integer identifier which is unique for each patient hospitalization"
    
    if 'micro_specimen_id' in columns_info:
        columns_info['micro_specimen_id']['description'] = "Uniquely denotes the specimen from which the microbiology measurement was made"
    
    if 'order_provider_id' in columns_info:
        columns_info['order_provider_id']['description'] = "Anonymous identifier for the provider who ordered the microbiology test"
    
    if 'chartdate' in columns_info:
        columns_info['chartdate']['description'] = "Date of when the specimen was collected"
    
    if 'charttime' in columns_info:
        columns_info['charttime']['description'] = "Date and time of when the specimen was collected (may be NULL if time unknown)"
    
    if 'spec_itemid' in columns_info:
        columns_info['spec_itemid']['description'] = "Identifier for the specimen type"
    
    if 'spec_type_desc' in columns_info:
        columns_info['spec_type_desc']['description'] = "Description of the specimen tested (e.g., BLOOD CULTURE, URINE, SPUTUM)"
    
    if 'test_seq' in columns_info:
        columns_info['test_seq']['description'] = "Sequence number to differentiate multiple tests on the same specimen"
    
    if 'storedate' in columns_info:
        columns_info['storedate']['description'] = "Date when the microbiology result was available"
    
    if 'storetime' in columns_info:
        columns_info['storetime']['description'] = "Date and time when the microbiology result was available"
    
    if 'test_itemid' in columns_info:
        columns_info['test_itemid']['description'] = "Identifier for the test performed on the specimen"
    
    if 'test_name' in columns_info:
        columns_info['test_name']['description'] = "Name of the test performed on the specimen"
    
    if 'org_itemid' in columns_info:
        columns_info['org_itemid']['description'] = "Identifier for the organism that grew, if any"
    
    if 'org_name' in columns_info:
        columns_info['org_name']['description'] = "Name of the organism that grew (NULL indicates negative culture)"
    
    if 'isolate_num' in columns_info:
        columns_info['isolate_num']['description'] = "Isolated colony number for antibiotic testing (starts at 1)"
    
    if 'quantity' in columns_info:
        columns_info['quantity']['description'] = "Quantity or amount of the organism found"
    
    if 'ab_itemid' in columns_info:
        columns_info['ab_itemid']['description'] = "Identifier for antibiotic tested against the organism"
    
    if 'ab_name' in columns_info:
        columns_info['ab_name']['description'] = "Name of antibiotic tested against the organism for sensitivity"
    
    if 'dilution_text' in columns_info:
        columns_info['dilution_text']['description'] = "Text representation of the dilution for antibiotic sensitivity testing"
    
    if 'dilution_comparison' in columns_info:
        columns_info['dilution_comparison']['description'] = "Comparison operator for the dilution value (e.g., <=, =, >=)"
    
    if 'dilution_value' in columns_info:
        columns_info['dilution_value']['description'] = "Numeric value of the dilution for antibiotic sensitivity testing"
    
    if 'interpretation' in columns_info:
        columns_info['interpretation']['description'] = "Interpretation of antibiotic sensitivity (S=sensitive, R=resistant, I=intermediate, P=pending)"
    
    if 'comments' in columns_info:
        columns_info['comments']['description'] = "Deidentified free-text comments associated with the microbiology measurement"
    
    # Table description and purpose
    description = "Microbiology cultures."
    table_purpose = ("Microbiology tests are a common procedure to check for infectious growth and to assess which "
                     "antibiotic treatments are most effective. This table contains the results of bacterial cultures "
                     "and antibiotic sensitivity tests.")
    
    # Important considerations about the data
    important_considerations = ("The data is hierarchical: each specimen can have multiple organisms, and each organism "
                               "can have multiple antibiotics tested against it. Typically, negative values are indicated "
                               "by a NULL value in the org_name column, but itemid 90856 has a value of 'NEGATIVE'. "
                               "hadm_id is assigned using the administrative transfer table but captures only ~96% of "
                               "observations that could be linked to a hospital admission.")
    
    # Common table joins
    common_joins = [
        "JOIN patients ON microbiologyevents.subject_id = patients.subject_id",
        "JOIN admissions ON microbiologyevents.hadm_id = admissions.hadm_id"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What are the most common organisms found in blood cultures?",
        "Which antibiotics are most effective against specific bacteria?",
        "What percentage of cultures are positive versus negative?",
        "What is the antibiotic resistance pattern for a specific organism?",
        "How long does it typically take from specimen collection to result availability?",
        "Which specimens are most likely to yield positive cultures?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "micro cultures", 
        "bacterial cultures",
        "microbiology tests",
        "culture and sensitivity",
        "antibiotic sensitivity",
        "organism testing",
        "infectious disease testing",
        "C&S testing"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_microbiologyevents_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='microbiologyevents';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for microbiologyevents:")
print(json.dumps(all_columns_info, indent=2))

# pharmecy

In [None]:
def create_pharmacy_metadata():
    """Create metadata for the pharmacy table"""
    table_name = "pharmacy"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'subject_id' in columns_info:
        columns_info['subject_id']['description'] = "Unique identifier which specifies an individual patient"
    
    if 'hadm_id' in columns_info:
        columns_info['hadm_id']['description'] = "Integer identifier which is unique for each patient hospitalization"
    
    if 'pharmacy_id' in columns_info:
        columns_info['pharmacy_id']['description'] = "A unique identifier for each pharmacy entry"
    
    if 'poe_id' in columns_info:
        columns_info['poe_id']['description'] = "Foreign key which links to the provider order entry in prescriptions table"
    
    if 'starttime' in columns_info:
        columns_info['starttime']['description'] = "The start time for the given prescribed medication"
    
    if 'stoptime' in columns_info:
        columns_info['stoptime']['description'] = "The stop time for the given prescribed medication"
    
    if 'medication' in columns_info:
        columns_info['medication']['description'] = "The name of the medication provided"
    
    if 'proc_type' in columns_info:
        columns_info['proc_type']['description'] = "The type of order (e.g., 'IV Piggyback', 'Non-formulary', 'Unit Dose')"
    
    if 'status' in columns_info:
        columns_info['status']['description'] = "Whether the prescription is active, inactive, or discontinued"
    
    if 'entertime' in columns_info:
        columns_info['entertime']['description'] = "Date and time when the prescription was entered into the pharmacy system"
    
    if 'verifiedtime' in columns_info:
        columns_info['verifiedtime']['description'] = "Date and time when the prescription was verified by a physician"
    
    if 'route' in columns_info:
        columns_info['route']['description'] = "The intended route of administration for the prescription"
    
    if 'frequency' in columns_info:
        columns_info['frequency']['description'] = "The frequency at which the medication should be administered (e.g., 'Q6H' = every 6 hours)"
    
    if 'disp_sched' in columns_info:
        columns_info['disp_sched']['description'] = "The hours of the day at which the medication should be administered"
    
    if 'infusion_type' in columns_info:
        columns_info['infusion_type']['description'] = "A coded letter describing the type of infusion ('B', 'C', 'N', 'N1', 'O', or 'R')"
    
    if 'sliding_scale' in columns_info:
        columns_info['sliding_scale']['description'] = "Indicates whether the medication should be given on a sliding scale ('Y' or 'N')"
    
    if 'lockout_interval' in columns_info:
        columns_info['lockout_interval']['description'] = "Time the patient must wait until providing themselves with another dose"
    
    if 'basal_rate' in columns_info:
        columns_info['basal_rate']['description'] = "The rate at which the medication is given over 24 hours"
    
    if 'one_hr_max' in columns_info:
        columns_info['one_hr_max']['description'] = "The maximum dose that may be given in a single hour"
    
    if 'doses_per_24_hrs' in columns_info:
        columns_info['doses_per_24_hrs']['description'] = "The number of expected doses per 24 hours"
    
    if 'duration' in columns_info:
        columns_info['duration']['description'] = "The numeric duration of the given dose"
    
    if 'duration_interval' in columns_info:
        columns_info['duration_interval']['description'] = "The unit of measurement for the duration (e.g., 'Doses', 'Weeks')"
    
    if 'expiration_value' in columns_info:
        columns_info['expiration_value']['description'] = "Numeric value for time until drug expiry"
    
    if 'expiration_unit' in columns_info:
        columns_info['expiration_unit']['description'] = "Unit of measurement for expiration time (e.g., 'days', 'hours')"
    
    if 'expirationdate' in columns_info:
        columns_info['expirationdate']['description'] = "The deidentified date of expiry for the medication"
    
    if 'dispensation' in columns_info:
        columns_info['dispensation']['description'] = "The source of dispensation for the medication"
    
    if 'fill_quantity' in columns_info:
        columns_info['fill_quantity']['description'] = "What proportion of the formulary to fill"
    
    # Table description and purpose
    description = "Formulary, dosing, and other information for prescribed medications."
    table_purpose = ("The pharmacy table provides detailed information regarding filled medications which were prescribed "
                     "to the patient. Pharmacy information includes the dose of the drug, the number of formulary doses, "
                     "the frequency of dosing, the medication route, and the duration of the prescription.")
    
    # Important considerations about the data
    important_considerations = ("The pharmacy table connects to prescriptions and emar tables via pharmacy_id, allowing "
                               "tracking from medication order to dispensing to administration. The doses_per_24_hrs column "
                               "can be misleading for continuously infused medications as they are usually only 'dosed' "
                               "once per day, despite continuous administration.")
    
    # Common table joins
    common_joins = [
        "JOIN poe ON pharmacy.poe_id = poe.poe_id",
        "JOIN prescriptions ON pharmacy.pharmacy_id = prescriptions.pharmacy_id",
        "JOIN emar ON pharmacy.pharmacy_id = emar.pharmacy_id",
        "JOIN patients ON pharmacy.subject_id = patients.subject_id",
        "JOIN admissions ON pharmacy.hadm_id = admissions.hadm_id"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What are the most commonly prescribed medications?",
        "What is the average duration of antibiotic prescriptions?",
        "How often are medications prescribed with a sliding scale?",
        "What routes of administration are most common for pain medications?",
        "What is the typical frequency for administering specific medications?",
        "How long does it take from entering a prescription to verification?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "medication prescriptions", 
        "drug orders",
        "medication dispensing",
        "pharmacy records",
        "medication orders",
        "drug dispensation",
        "pharmacy dispensing",
        "pharmaceutical orders"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_pharmacy_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='pharmacy';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for pharmacy:")
print(json.dumps(all_columns_info, indent=2))

# poe

In [None]:
def create_poe_metadata():
    """Create metadata for the poe table"""
    table_name = "poe"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'poe_id' in columns_info:
        columns_info['poe_id']['description'] = "Unique identifier for the given order (format: subject_id-poe_seq)"
    
    if 'poe_seq' in columns_info:
        columns_info['poe_seq']['description'] = "Monotonically increasing integer which chronologically sorts the POE orders"
    
    if 'subject_id' in columns_info:
        columns_info['subject_id']['description'] = "Unique identifier which specifies an individual patient"
    
    if 'hadm_id' in columns_info:
        columns_info['hadm_id']['description'] = "Integer identifier which is unique for each patient hospitalization"
    
    if 'ordertime' in columns_info:
        columns_info['ordertime']['description'] = "Date and time at which the provider order was made"
    
    if 'order_type' in columns_info:
        columns_info['order_type']['description'] = "Type of provider order (e.g., 'Medications', 'Lab', 'Nutrition')"
    
    if 'order_subtype' in columns_info:
        columns_info['order_subtype']['description'] = "Further detail on the type of order made by the provider"
    
    if 'transaction_type' in columns_info:
        columns_info['transaction_type']['description'] = "Action performed by provider (e.g., 'New', 'Change', 'D/C')"
    
    if 'discontinue_of_poe_id' in columns_info:
        columns_info['discontinue_of_poe_id']['description'] = "If this order discontinues a previous order, links to that previous order"
    
    if 'discontinued_by_poe_id' in columns_info:
        columns_info['discontinued_by_poe_id']['description'] = "If this order was later discontinued, links to the order that discontinued it"
    
    if 'order_provider_id' in columns_info:
        columns_info['order_provider_id']['description'] = "Anonymous identifier for the provider who made the order"
    
    if 'order_status' in columns_info:
        columns_info['order_status']['description'] = "Whether the order is active ('Active') or has been inactivated ('Inactive')"
    
    # Table description and purpose
    description = "Orders made by providers relating to patient care."
    table_purpose = ("Provider order entry (POE) is the general interface through which care providers at the hospital "
                     "enter orders. Most treatments and procedures must be ordered via POE. This table contains the "
                     "basic information about each order entered by healthcare providers.")
    
    # Important considerations about the data
    important_considerations = ("The poe_id is composed of subject_id and poe_seq in the format 'subject_id-poe_seq'. "
                               "The table tracks order relationships, allowing identification of orders that discontinue "
                               "previous orders or are discontinued by future orders. Order types and subtypes provide "
                               "categorization of the clinical actions ordered.")
    
    # Common table joins
    common_joins = [
        "JOIN poe_detail ON poe.poe_id = poe_detail.poe_id",
        "JOIN patients ON poe.subject_id = patients.subject_id",
        "JOIN admissions ON poe.hadm_id = admissions.hadm_id",
        "JOIN prescriptions ON poe.poe_id = prescriptions.poe_id"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What types of orders are most frequently placed?",
        "Which providers place the most orders?",
        "What percentage of orders are discontinued?",
        "What is the distribution of orders across different order types?",
        "How many medication orders are placed per patient admission on average?",
        "What is the pattern of order entry over the course of a day?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "provider orders", 
        "medical orders",
        "clinical orders",
        "order entry",
        "healthcare orders",
        "provider instructions",
        "care orders",
        "treatment orders"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_poe_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='poe';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for poe:")
print(json.dumps(all_columns_info, indent=2))

# poe_detail

In [None]:
def create_poe_detail_metadata():
    """Create metadata for the poe_detail table"""
    table_name = "poe_detail"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'poe_id' in columns_info:
        columns_info['poe_id']['description'] = "Unique identifier for the given order (format: subject_id-poe_seq)"
    
    if 'poe_seq' in columns_info:
        columns_info['poe_seq']['description'] = "Monotonically increasing integer which chronologically sorts the POE orders"
    
    if 'subject_id' in columns_info:
        columns_info['subject_id']['description'] = "Unique identifier which specifies an individual patient"
    
    if 'field_name' in columns_info:
        columns_info['field_name']['description'] = "Name of the attribute being recorded for the POE order (e.g., 'Admit to', 'Indication')"
    
    if 'field_value' in columns_info:
        columns_info['field_value']['description'] = "Value associated with the field_name for the given POE order"
    
    # Table description and purpose
    description = "Supplementary information for orders made by providers in the hospital."
    table_purpose = ("The poe_detail table provides further information on Provider Order Entry (POE) orders. "
                     "The table uses an Entity-Attribute-Value (EAV) model: the entity is poe_id, the attribute "
                     "is field_name, and the value is field_value. This allows for flexible description of orders "
                     "when the attributes vary widely across different order types.")
    
    # Important considerations about the data
    important_considerations = ("This table uses an Entity-Attribute-Value model, so each POE order may have multiple "
                               "rows in this table, each describing a different aspect of the order. Common field_name "
                               "values include 'Admit to', 'Indication', 'Code status', 'Transfer to', and 'Discharge When'. "
                               "The poe_id links directly to the main poe table for the core order information.")
    
    # Common table joins
    common_joins = [
        "JOIN poe ON poe_detail.poe_id = poe.poe_id AND poe_detail.poe_seq = poe.poe_seq",
        "JOIN patients ON poe_detail.subject_id = patients.subject_id"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What are the most common indications for antibiotic orders?",
        "What types of units are patients most commonly admitted to?",
        "What percentage of patients have 'Do Not Resuscitate' code status?",
        "What are the typical discharge instructions for patients?",
        "How many consults are marked as urgent versus routine?",
        "What types of tubes and drains are most commonly ordered?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "order details", 
        "provider order details",
        "clinical order parameters",
        "order specifications",
        "order attributes",
        "POE details",
        "order metadata",
        "clinical instruction details"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_poe_detail_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='poe_detail';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for poe_detail:")
print(json.dumps(all_columns_info, indent=2))

# prescriptions

In [None]:
def create_prescriptions_metadata():
    """Create metadata for the prescriptions table"""
    table_name = "prescriptions"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'subject_id' in columns_info:
        columns_info['subject_id']['description'] = "Unique identifier which specifies an individual patient"
    
    if 'hadm_id' in columns_info:
        columns_info['hadm_id']['description'] = "Integer identifier which is unique for each patient hospitalization"
    
    if 'pharmacy_id' in columns_info:
        columns_info['pharmacy_id']['description'] = "Identifier which links to pharmacy information in the pharmacy table"
    
    if 'poe_id' in columns_info:
        columns_info['poe_id']['description'] = "Identifier which links to the provider order entry (POE) in the poe table"
    
    if 'poe_seq' in columns_info:
        columns_info['poe_seq']['description'] = "Sequential number used with poe_id to link to the poe table"
    
    if 'order_provider_id' in columns_info:
        columns_info['order_provider_id']['description'] = "Anonymous identifier for the provider who initiated the order"
    
    if 'starttime' in columns_info:
        columns_info['starttime']['description'] = "The prescribed start time for the medication"
    
    if 'stoptime' in columns_info:
        columns_info['stoptime']['description'] = "The prescribed stop time for the medication"
    
    if 'drug_type' in columns_info:
        columns_info['drug_type']['description'] = "Component of the prescription which the drug occupies (MAIN, BASE, or ADDITIVE)"
    
    if 'drug' in columns_info:
        columns_info['drug']['description'] = "Free-text description of the medication administered"
    
    if 'formulary_drug_cd' in columns_info:
        columns_info['formulary_drug_cd']['description'] = "Hospital specific ontology used to order drugs from the formulary"
    
    if 'gsn' in columns_info:
        columns_info['gsn']['description'] = "Generic Sequence Number, a coded identifier used for medications"
    
    if 'ndc' in columns_info:
        columns_info['ndc']['description'] = "National Drug Code, a coded identifier which uniquely identifies medications"
    
    if 'prod_strength' in columns_info:
        columns_info['prod_strength']['description'] = "Free-text description of the composition of the prescribed medication"
    
    if 'form_rx' in columns_info:
        columns_info['form_rx']['description'] = "Container in which the formulary dose is delivered (e.g., TABLET, VIAL)"
    
    if 'dose_val_rx' in columns_info:
        columns_info['dose_val_rx']['description'] = "Prescribed dose for the patient intended to be administered over the given time period"
    
    if 'dose_unit_rx' in columns_info:
        columns_info['dose_unit_rx']['description'] = "Unit of measurement for the dose"
    
    if 'form_val_disp' in columns_info:
        columns_info['form_val_disp']['description'] = "Amount of medication contained in a single formulary dose"
    
    if 'form_unit_disp' in columns_info:
        columns_info['form_unit_disp']['description'] = "Unit of measurement used for the formulary dosage"
    
    if 'doses_per_24_hrs' in columns_info:
        columns_info['doses_per_24_hrs']['description'] = "Number of doses per 24 hours for which the medication is to be given"
    
    if 'route' in columns_info:
        columns_info['route']['description'] = "Route of administration for the medication"
    
    # Table description and purpose
    description = "Prescribed medications."
    table_purpose = ("The prescriptions table provides information about prescribed medications. "
                     "Information includes the name of the drug, coded identifiers including the "
                     "Generic Sequence Number (GSN) and National Drug Code (NDC), the product strength, "
                     "the formulary dose, and the route of administration.")
    
    # Important considerations about the data
    important_considerations = ("This table represents the medication orders that have been processed by pharmacy. "
                               "It links to the poe table which contains the original provider orders, and to the "
                               "pharmacy and emar tables which contain information about medication preparation "
                               "and administration. The doses_per_24_hrs field can be used to identify medication "
                               "frequency (1=daily, 2=BID/twice daily, etc.).")
    
    # Common table joins
    common_joins = [
        "JOIN pharmacy ON prescriptions.pharmacy_id = pharmacy.pharmacy_id",
        "JOIN emar ON prescriptions.pharmacy_id = emar.pharmacy_id",
        "JOIN poe ON prescriptions.poe_id = poe.poe_id AND prescriptions.poe_seq = poe.poe_seq",
        "JOIN patients ON prescriptions.subject_id = patients.subject_id",
        "JOIN admissions ON prescriptions.hadm_id = admissions.hadm_id"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What are the most commonly prescribed medications?",
        "What dosing regimens are typically used for specific medications?",
        "How long are patients typically prescribed antibiotics?",
        "What routes of administration are most common for pain medications?",
        "How do medication prescriptions vary by admission type?",
        "What medications are most frequently prescribed together?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "medication orders", 
        "drug prescriptions",
        "prescribed drugs",
        "medication regimens",
        "drug orders",
        "pharmacy orders",
        "medication prescriptions",
        "drug therapy"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_prescriptions_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='prescriptions';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for prescriptions:")
print(json.dumps(all_columns_info, indent=2))

# procedures_icd

In [None]:
def create_procedures_icd_metadata():
    """Create metadata for the procedures_icd table"""
    table_name = "procedures_icd"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'subject_id' in columns_info:
        columns_info['subject_id']['description'] = "Unique identifier which specifies an individual patient"
    
    if 'hadm_id' in columns_info:
        columns_info['hadm_id']['description'] = "Integer identifier which is unique for each patient hospitalization"
    
    if 'seq_num' in columns_info:
        columns_info['seq_num']['description'] = "An assigned priority for procedures which occurred within the hospital stay"
    
    if 'chartdate' in columns_info:
        columns_info['chartdate']['description'] = "The date of the associated procedures (does not strictly correlate with seq_num)"
    
    if 'icd_code' in columns_info:
        columns_info['icd_code']['description'] = "International Coding Definitions (ICD) code for the procedure"
    
    if 'icd_version' in columns_info:
        columns_info['icd_version']['description'] = "Version of the ICD coding system (9 or 10)"
    
    # Table description and purpose
    description = "Billed procedures for patients during their hospital stay."
    table_purpose = ("During routine hospital care, patients are billed by the hospital for procedures "
                     "they undergo. This table contains a record of all procedures a patient was billed "
                     "for during their hospital stay using the ICD-9 and ICD-10 ontologies.")
    
    # Important considerations about the data
    important_considerations = ("Procedures during the hospital stay can be billed (1) by the hospital or "
                               "(2) by the provider. This table contains only procedures billed by the hospital. "
                               "Both ICD-9 and ICD-10 codes are often presented with a decimal, but the decimal "
                               "is not stored in the database (i.e., the icd_code of '0010' is equivalent to '001.0').")
    
    # Common table joins
    common_joins = [
        "JOIN d_icd_procedures ON procedures_icd.icd_code = d_icd_procedures.icd_code AND procedures_icd.icd_version = d_icd_procedures.icd_version",
        "JOIN patients ON procedures_icd.subject_id = patients.subject_id",
        "JOIN admissions ON procedures_icd.hadm_id = admissions.hadm_id"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What are the most common procedures performed in the hospital?",
        "How many patients underwent a specific procedure (e.g., ventilation) during their stay?",
        "What procedures are commonly performed together?",
        "What is the distribution of procedure types across different demographics?",
        "How have procedure patterns changed over time with the transition from ICD-9 to ICD-10?",
        "What procedures are most associated with specific diagnoses?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "hospital procedures", 
        "surgical procedures",
        "medical procedures",
        "hospital billable procedures",
        "ICD procedure codes",
        "procedure billing",
        "procedure claims",
        "hospital interventions"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_procedures_icd_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='procedures_icd';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for procedures_icd:")
print(json.dumps(all_columns_info, indent=2))

# services

In [None]:
def create_services_metadata():
    """Create metadata for the services table"""
    table_name = "services"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'subject_id' in columns_info:
        columns_info['subject_id']['description'] = "Unique identifier which specifies an individual patient"
    
    if 'hadm_id' in columns_info:
        columns_info['hadm_id']['description'] = "Integer identifier which is unique for each patient hospitalization"
    
    if 'transfertime' in columns_info:
        columns_info['transfertime']['description'] = "Time at which the patient moved from the prev_service to the curr_service"
    
    if 'prev_service' in columns_info:
        columns_info['prev_service']['description'] = "Previous hospital service that the patient was under"
    
    if 'curr_service' in columns_info:
        columns_info['curr_service']['description'] = "Current hospital service that the patient is under"
    
    # Table description and purpose
    description = "The hospital service(s) which cared for the patient during their hospitalization."
    table_purpose = ("The services table describes the service that a patient was admitted under. "
                     "While a patient can be physically located at a given ICU type (say MICU), they are not "
                     "necessarily being cared for by the team which staffs the MICU. The services table "
                     "should be used if interested in identifying the type of service a patient is receiving "
                     "in the hospital.")
    
    # Important considerations about the data
    important_considerations = ("Each service is listed in the table as an abbreviation - this is exactly how "
                               "the data is stored in the hospital database. Services include CMED (Cardiac Medical), "
                               "CSURG (Cardiac Surgery), DENT (Dental), ENT (Ear, nose, and throat), MED (Medical), "
                               "NSURG (Neurologic Surgical), PSYCH (Psychiatric), SURG (Surgical), and many others. "
                               "For identifying specific types of patients (e.g., surgical patients), using the services "
                               "table is recommended rather than physical location.")
    
    # Common table joins
    common_joins = [
        "JOIN patients ON services.subject_id = patients.subject_id",
        "JOIN admissions ON services.hadm_id = admissions.hadm_id",
        "JOIN transfers ON services.hadm_id = transfers.hadm_id"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What percentage of patients received care from the cardiac surgery service?",
        "How often do patients transfer between services during a hospital stay?",
        "What is the most common service that patients are initially admitted to?",
        "What is the average length of stay for patients under the psychiatric service?",
        "Which services have the highest rates of mortality?",
        "How do outcomes differ between medical and surgical services?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "hospital services", 
        "clinical services",
        "care teams",
        "treatment services",
        "medical teams",
        "clinical departments",
        "specialty care",
        "service transfers"
    ]
    
    # Service abbreviations with descriptions for curr_service and prev_service
    service_descriptions = {
        "CMED": "Cardiac Medical - for non-surgical cardiac related admissions",
        "CSURG": "Cardiac Surgery - for surgical cardiac admissions",
        "DENT": "Dental - for dental/jaw related admissions",
        "ENT": "Ear, nose, and throat - conditions primarily affecting these areas",
        "EYE": "Eye diseases - including subspecialty services in glaucoma, cataract surgery, cornea and external diseases, and neuro-ophthalmology",
        "GU": "Genitourinary - reproductive organs/urinary system",
        "GYN": "Gynecological - female reproductive systems and breasts",
        "MED": "Medical - general service for internal medicine",
        "NB": "Newborn - infants born at the hospital",
        "NBB": "Newborn baby - infants born at the hospital",
        "NMED": "Neurologic Medical - non-surgical, relating to the brain",
        "NSURG": "Neurologic Surgical - surgical, relating to the brain",
        "OBS": "Obstetrics - concerned with childbirth and the care of women giving birth",
        "ORTHO": "Orthopaedic - surgical, relating to the musculoskeletal system",
        "OMED": "Oncologic Medical - non-surgical, relating to cancer",
        "PSURG": "Plastic - restoration/reconstruction of the human body (including cosmetic or aesthetic)",
        "PSYCH": "Psychiatric - mental disorders relating to mood, behaviour, cognition, or perceptions",
        "SURG": "Surgical - general surgical service not classified elsewhere",
        "TRAUM": "Trauma - injury or damage caused by physical harm from an external source",
        "TSURG": "Thoracic Surgical - surgery on the thorax, located between the neck and the abdomen",
        "VSURG": "Vascular Surgical - surgery relating to the circulatory system"
    }
    
    # Add service descriptions to curr_service and prev_service columns if they exist
    if 'curr_service' in columns_info and 'categorical_values' in columns_info['curr_service']:
        service_values = columns_info['curr_service']['categorical_values']
        service_descriptions_list = [f"{code}: {service_descriptions.get(code, 'Unknown service')}" for code in service_values]
        columns_info['curr_service']['service_descriptions'] = service_descriptions_list
    
    if 'prev_service' in columns_info and 'categorical_values' in columns_info['prev_service']:
        service_values = columns_info['prev_service']['categorical_values']
        service_descriptions_list = [f"{code}: {service_descriptions.get(code, 'Unknown service')}" for code in service_values]
        columns_info['prev_service']['service_descriptions'] = service_descriptions_list
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_services_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='services';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for services:")
print(json.dumps(all_columns_info, indent=2))

# transfer

In [None]:
def create_transfers_metadata():
    """Create metadata for the transfers table"""
    table_name = "transfers"
    
    # Get column statistics
    columns_info = get_column_statistics(table_name)
    
    # Get primary and foreign keys
    primary_keys = get_primary_keys(table_name)
    foreign_keys = get_foreign_keys(table_name)
    
    # Manually update column descriptions
    if 'subject_id' in columns_info:
        columns_info['subject_id']['description'] = "Unique identifier which specifies an individual patient"
    
    if 'hadm_id' in columns_info:
        columns_info['hadm_id']['description'] = "Integer identifier which is unique for each patient hospitalization"
    
    if 'transfer_id' in columns_info:
        columns_info['transfer_id']['description'] = "Unique identifier for a patient physical location"
    
    if 'eventtype' in columns_info:
        columns_info['eventtype']['description'] = "Type of transfer event ('ed', 'admit', 'transfer', or 'discharge')"
    
    if 'careunit' in columns_info:
        columns_info['careunit']['description'] = "Type of unit or ward in which the patient is physically located"
    
    if 'intime' in columns_info:
        columns_info['intime']['description'] = "Date and time the patient was transferred into the current care unit"
    
    if 'outtime' in columns_info:
        columns_info['outtime']['description'] = "Date and time the patient was transferred out of the current physical location"
    
    # Table description and purpose
    description = "Detailed information about patients' unit transfers."
    table_purpose = ("The transfers table tracks patients' physical locations throughout their hospital stay, "
                     "including admissions, transfers between units, and discharges. It provides a complete "
                     "timeline of where a patient was located during their hospitalization.")
    
    # Important considerations about the data
    important_considerations = ("The icustays table is derived from this table. For ICU stays, three contiguous ICU "
                               "transfers will have three separate transfer_id values for each distinct physical location, "
                               "but a single stay_id in the icustays table (equal to the transfer_id of the first physical "
                               "location). This table tracks all movements, not just those related to ICU stays.")
    
    # Common table joins
    common_joins = [
        "JOIN patients ON transfers.subject_id = patients.subject_id",
        "JOIN admissions ON transfers.hadm_id = admissions.hadm_id",
        "JOIN icustays ON transfers.transfer_id = icustays.stay_id"
    ]
    
    # Example questions that this table can answer
    example_questions = [
        "What is the average length of stay in each care unit?",
        "How many times do patients typically transfer between units during a hospital stay?",
        "What percentage of patients visit the emergency department before admission?",
        "What is the typical flow of patients through different units?",
        "Which units have the highest occupancy rates?",
        "How long do patients typically wait in the emergency department before being admitted?"
    ]
    
    # Synonyms and alternative terms for this table
    synonyms_and_terms = [
        "patient movements", 
        "unit transfers",
        "bed transfers",
        "patient locations",
        "care unit assignments",
        "patient flow",
        "ward transfers",
        "hospital movements"
    ]
    
    # Insert the metadata into the table
    insert_sql = """
    INSERT INTO mimic_table_metadata (
        table_name, 
        description, 
        table_purpose,
        columns_info,
        primary_keys, 
        foreign_keys,
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    ON CONFLICT (table_name) DO UPDATE SET
        description = EXCLUDED.description,
        table_purpose = EXCLUDED.table_purpose,
        columns_info = EXCLUDED.columns_info,
        primary_keys = EXCLUDED.primary_keys,
        foreign_keys = EXCLUDED.foreign_keys,
        important_considerations = EXCLUDED.important_considerations,
        common_joins = EXCLUDED.common_joins,
        example_questions = EXCLUDED.example_questions,
        synonyms_and_terms = EXCLUDED.synonyms_and_terms;
    """
    
    values = (
        table_name,
        description,
        table_purpose,
        json.dumps(columns_info),
        primary_keys,
        json.dumps(foreign_keys),
        important_considerations,
        common_joins,
        example_questions,
        synonyms_and_terms
    )
    
    cur.execute(insert_sql, values)
    conn.commit()
    print(f"✅ Metadata for '{table_name}' table has been created.")

# Run the function
create_transfers_metadata()

# Verify the metadata was created
cur.execute("""
    SELECT columns_info 
    FROM mimic_table_metadata 
    WHERE table_name='transfers';
""")
all_columns_info = cur.fetchone()[0]
print("\n📋 All Columns Info for transfers:")
print(json.dumps(all_columns_info, indent=2))