# In [0]:
import yaml
import os

# Unity Catalog Configuration
# Declared first because the Vector Search index name below is derived from it.
UC_CATALOG = "theodore_kop_personal"
UC_SCHEMA = "bcp"
RAG_APP_NAME = "bcp_confluence_documents_rag_poc"

# Vector Search Configuration
VECTOR_SEARCH_ENDPOINT = "bcp_confluence_store"
# Three-part index name (<catalog>.<schema>.<index>). Built from UC_CATALOG and
# UC_SCHEMA instead of repeating them, so changing the catalog or schema cannot
# leave the index name pointing at the old location.
VECTOR_SEARCH_INDEX = f"{UC_CATALOG}.{UC_SCHEMA}.bcp_chunked_docs_managed_index"

# Data Pipeline Configuration
# Assembled one section at a time; the sections are inserted in the same order
# as before, so an order-preserving dump produces the same layout.
data_pipeline_config = {}

# Vector Search index configuration
data_pipeline_config["vectorsearch_config"] = {"pipeline_type": "CONTINUOUS"}

# Embedding model to use, along with the tokenizer settings that go with it
data_pipeline_config["embedding_config"] = {
    "embedding_endpoint_name": "databricks-bge-large-en",
    "embedding_tokenizer": {
        "tokenizer_model_name": "BAAI/bge-large-en-v1.5",
        "tokenizer_source": "hugging_face",
    },
}

# Parsing and chunking configuration
data_pipeline_config["pipeline_config"] = {
    "file_format": "pdf",
    "parser": {
        "name": "pypdf",
        "config": {},
    },
    "chunker": {
        "name": "langchain_recursive_char",
        # Chunk size and overlap, both expressed in tokens
        "config": {"chunk_size_tokens": 1024, "chunk_overlap_tokens": 512},
    },
}

# Destination Tables Configuration
# Every entry is a backtick-quoted, three-part name built from UC_CATALOG and
# UC_SCHEMA; only the final table segment differs between entries.
destination_tables_config = {
    config_key: f"`{UC_CATALOG}`.`{UC_SCHEMA}`.`{table}`"
    for config_key, table in (
        ("raw_files_table_name", "raw_files"),
        ("parsed_docs_table_name", "parsed_docs"),
        ("chunked_docs_table_name", "chunked_docs"),
        ("vectorsearch_index_table_name", "bcp_chunked_docs_managed_index"),
    )
}
# The index name is the index table name with the backticks stripped.
destination_tables_config.update(
    vectorsearch_index_name=destination_tables_config[
        "vectorsearch_index_table_name"
    ].replace("`", "")
)

# Chain configuration (serialized to rag_chain_config.yaml by create_rag_chain_config)
rag_chain_config = {
    "databricks_resources": {
        "vector_search_endpoint_name": VECTOR_SEARCH_ENDPOINT,
        "llm_endpoint_name": "databricks-claude-3-7-sonnet",
    },
    "retriever_config": {
        "vector_search_index": VECTOR_SEARCH_INDEX,
        "schema": {
            # The column name in the retriever's response that holds the unique key.
            # If using Databricks vector search with delta sync, this should be the
            # column of the delta table that acts as the primary key.
            "primary_key": "chunk_id",
            # The column name in the retriever's response that contains the returned chunk.
            "chunk_text": "chunked_text",
            # The column name in the retriever's response that refers to the original document.
            "document_uri": "path",
        },
        # The template of the chunk returned by the retriever - used to format the
        # retrieved information for presentation to the LLM to help in answering
        # the user's question. The placeholders are the keys of "schema" above.
        "chunk_template": "Passage: {chunk_text}\nSource: {document_uri}\n",
        "parameters": {
            # Number of search results that the retriever returns
            "k": 8,
            # Type of search to run
            # Semantic search: `ann`
            # Hybrid search (keyword + semantic search): `hybrid`
            "query_type": "hybrid",
        },
        # Tag for the data pipeline, allowing you to easily compare the POC results vs. future data pipeline configurations you try.
        "data_pipeline_tag": "bcp_doc_poc",
    },
    "llm_config": {
        # Define a template for the LLM prompt. This is how the RAG chain combines
        # the user's question and the retrieved context.
        #
        # Written as adjacent string literals rather than an indented triple-quoted
        # block so that no source indentation leaks into the prompt (the previous
        # form put eight spaces in front of "Context:"). The fallback instruction
        # is phrased in indirect speech: the earlier wording had a typo ("now" for
        # "know") and wrapped the "prompt the user" instruction inside the quoted
        # reply, which invites the model to repeat it verbatim.
        "llm_system_prompt_template": (
            "You are an insightful and helpful assistant for BCP that only answers "
            "questions related to BCP internal documentation. Use the following "
            "pieces of retrieved context to answer the question. Some pieces of "
            "context may be irrelevant, in which case you should not use them to "
            "form the answer. Answer honestly and if you do not know the answer or "
            "if the answer is not contained in the documentation provided as "
            "context, limit yourself to answering that you could not find the "
            "answer in the documentation and prompt the user to provide more "
            "details.\n\n"
            "Context: {context}"
        ),
        "llm_parameters": {"temperature": 0, "max_tokens": 2000},
    },
    # Sample request payload: a single user message.
    "input_example": {
        "messages": [
            {
                "role": "user",
                "content": "Qué es un EDV?",
            },
        ]
    },
}

def create_rag_chain_config(output_path="rag_chain_config.yaml"):
    """Write the module-level ``rag_chain_config`` dict to a YAML file.

    Args:
        output_path: File to create or overwrite. Defaults to
            ``rag_chain_config.yaml`` in the current working directory,
            which is the location that was previously hard-coded.

    Returns:
        None. A confirmation line is printed once the file has been written.
    """
    # Explicit encoding: without it the file is written in the platform's
    # locale encoding, which varies between machines.
    with open(output_path, "w", encoding="utf-8") as f:
        # sort_keys=False keeps the keys in declaration order;
        # default_flow_style=False emits block-style YAML.
        yaml.dump(rag_chain_config, f, default_flow_style=False, sort_keys=False)

    print(f"Created {output_path} successfully!")

def create_data_pipeline_config(output_path="data_pipeline_config.yaml"):
    """Write the module-level ``data_pipeline_config`` dict to a YAML file.

    Args:
        output_path: File to create or overwrite. Defaults to
            ``data_pipeline_config.yaml`` in the current working directory,
            which is the location that was previously hard-coded.

    Returns:
        None. A confirmation line is printed once the file has been written.
    """
    # Explicit encoding: without it the file is written in the platform's
    # locale encoding, which varies between machines.
    with open(output_path, "w", encoding="utf-8") as f:
        # sort_keys=False keeps the keys in declaration order;
        # default_flow_style=False emits block-style YAML.
        yaml.dump(data_pipeline_config, f, default_flow_style=False, sort_keys=False)

    print(f"Created {output_path} successfully!")

def create_destination_tables_config(output_path="destination_tables_config.yaml"):
    """Write the module-level ``destination_tables_config`` dict to a YAML file.

    Args:
        output_path: File to create or overwrite. Defaults to
            ``destination_tables_config.yaml`` in the current working
            directory, which is the location that was previously hard-coded.

    Returns:
        None. A confirmation line is printed once the file has been written.
    """
    # Explicit encoding: without it the file is written in the platform's
    # locale encoding, which varies between machines.
    with open(output_path, "w", encoding="utf-8") as f:
        # sort_keys=False keeps the keys in declaration order;
        # default_flow_style=False emits block-style YAML.
        yaml.dump(destination_tables_config, f, default_flow_style=False, sort_keys=False)

    print(f"Created {output_path} successfully!")

if __name__ == "__main__":
    # Generate every YAML file, in the same order as before:
    # chain config, then data pipeline config, then destination tables.
    for write_config in (
        create_rag_chain_config,
        create_data_pipeline_config,
        create_destination_tables_config,
    ):
        write_config()