# Ingestion

Prepare source documents (PDF and Markdown) and load them into the Supabase vector store.

In [None]:
import json
import sys
import os


from langchain_community.vectorstores import SupabaseVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_core.documents import Document
from supabase import create_client, Client
from dotenv import load_dotenv
from loguru import logger

sys.path.append("/Users/ricardo.mesquita/Documents/Trainings/code4all/road_pal")  # TODO: resolve relative import properly
from notebooks.ingestion._utils import ingest_via_pdf, ingest_markdown_questions

## Configurations

In [None]:
# Logging configuration: clear the sinks registered so far, then send records
# to stdout using a single colorized line layout (timestamp | level | message).
# The sink's minimum level is SUCCESS.
logger.remove()
logger.add(
    sys.stdout,
    level="SUCCESS",
    format="<level>{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {message}</level>",
    colorize=True,
)

In [None]:
# Read variables from a .env file, then pick up the three credentials
# this notebook relies on. Any variable that is not set comes back as None.
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY")
supabase_url = os.getenv("SUPABASE_URL")
supabase_api_key = os.getenv("SUPABASE_API_KEY")

In [None]:
# Stop early when any of the required credentials is missing or empty.
if not all([api_key, supabase_url, supabase_api_key]):
    # The names in this message must match the environment variables that are
    # actually read (SUPABASE_API_KEY, not SUPABASE_KEY), otherwise following
    # the message does not fix the problem.
    logger.critical("Error: Please set SUPABASE_URL, SUPABASE_API_KEY, and OPENAI_API_KEY environment variables.")
    logger.critical("You can do it by creating a .env file or setting them directly in your environment.")
    exit(1)

## 1. Document Loading and Cleaning

### Via PDF

In [None]:
# Location of the PDF to ingest.
# NOTE(review): absolute path into one user's home directory; adjust per machine.
pdf_file_path = "/Users/ricardo.mesquita/Documents/Trainings/code4all/road_pal/data/codigo_estrada.pdf"

# Hand-written metadata describing the PDF; passed to the ingestion helper as-is.
manual_metadata = dict(
    source="Código da Estrada, Decreto-Lei n.º 114/94, de 3 de maio",
    date="2020-12-09",
    description="Republicação do Código da Estrada, com disposições gerais, regras de trânsito, veículos, habilitação legal para conduzir, responsabilidade e procedimentos de fiscalização.",
    language="Portuguese",
    title="Código da Estrada",
)

# Run the PDF ingestion helper on the file.
ingested_data = ingest_via_pdf(pdf_file_path, manual_metadata)

# Wrap every ingested entry (read via its 'content' and 'metadata' keys)
# in a Langchain Document, the format the vector store step consumes.
langchain_documents = [
    Document(
        page_content=entry['content'],
        metadata=entry['metadata'],
    )
    for entry in ingested_data
]

### Via Markdown

In [None]:
# Markdown file with the sample questions.
# NOTE(review): absolute path into one user's home directory; adjust per machine.
md_file_path = "/Users/ricardo.mesquita/Documents/Trainings/code4all/road_pal/data/bom_condutor.md"

# Hand-written metadata describing the Markdown source.
markdown_metadata = dict(
    source="Bom Condutor",
    date="2025-06-21",
    description="Questões de exemplo sobre o Código da Estrada, incluindo perguntas, opções e respostas corretas.",
    language="Portuguese",
    title="Testes de Código da Estrada",
)

# Rebinds `langchain_documents`, so running this cell replaces whatever an
# earlier cell stored under that name.
# Presumably the helper returns Langchain Document objects, since the result
# is later passed on for database population - TODO confirm.
langchain_documents = ingest_markdown_questions(md_file_path, markdown_metadata)

### Check Documents for DB Population

In [None]:
# Show the first prepared document as a quick check before populating the database.
langchain_documents[0]

## 3. Embeddings

In [None]:
# OpenAI embeddings model; it turns text chunks into the numerical vectors
# that get stored in the vector store.
embeddings_model = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=api_key,
)

## 4. Populate Supabase Vector Store

In [None]:
# Initialize Supabase client (using service key for server-side operations)
# NOTE(review): the key comes from SUPABASE_API_KEY; whether it is actually a
# service-role key cannot be told from this notebook - confirm.
supabase_client: Client = create_client(supabase_url, supabase_api_key)
logger.success("Successfully initialized Supabase client.")

def populate_supabase_with_langchain(documents: list[Document]) -> "SupabaseVectorStore | None":
    """
    Populates a Supabase vector store using Langchain's SupabaseVectorStore.

    Uses the module-level ``embeddings_model`` and ``supabase_client`` and writes
    to the ``documents`` table.

    Args:
        documents (list[Document]): A list of Langchain Document objects.

    Returns:
        The SupabaseVectorStore built from ``documents``, or None when the
        operation failed (the error is logged together with its traceback).
    """
    logger.info(f"Attempting to add {len(documents)} documents to Supabase via Langchain...")
    try:
        # Initialize SupabaseVectorStore
        # This will handle embedding the documents and upserting them.
        # The 'table_name' should match your table in Supabase (e.g., 'documents').
        # The 'query_name' is the name of the RPC function for similarity search (usually 'match_documents').
        # TODO: Improve insert authentication on supabase table
        vectorstore = SupabaseVectorStore.from_documents(
            documents,
            embeddings_model,
            client=supabase_client,
            table_name="documents",  # Your table name
            query_name="match_documents",  # Your RPC function name for similarity search
        )
    except Exception as e:
        # Deliberate best-effort boundary: callers get None instead of an
        # exception. logger.exception keeps the traceback, which a plain
        # error message with str(e) would drop.
        logger.exception(f"Error populating Supabase with Langchain: {e}")
        return None

    logger.success("Documents successfully added to Supabase using Langchain's SupabaseVectorStore.")
    return vectorstore

# Populate the vector store; `vectorstore` is None if population failed
vectorstore = populate_supabase_with_langchain(langchain_documents)

In [None]:
def query_supabase_vector_store(
    vectorstore: "SupabaseVectorStore | None",
    query_text: str,
    k: int = 5,
) -> list[Document]:
    """
    Performs a similarity search on the Supabase vector store.

    Args:
        vectorstore (SupabaseVectorStore | None): The initialized SupabaseVectorStore
            instance, or None if initialization failed.
        query_text (str): The query string to search for.
        k (int): Number of similar documents to retrieve. Defaults to 5.

    Returns:
        list: A list of relevant documents; empty when ``vectorstore`` is None.
    """
    # Guard clause: nothing to query when the store could not be created.
    if vectorstore is None:
        logger.error("Vector store not initialized! Cannot perform query ...")
        return []

    # NOTE(review): everything below is logged at INFO; a sink whose minimum
    # level is higher (e.g. SUCCESS) will not show the search results.
    logger.info(f"\nSearching for documents similar to: '{query_text}'")
    docs = vectorstore.similarity_search(query_text, k=k)
    logger.info("Search results:")
    for doc in docs:
        logger.info(f"  Content: {doc.page_content[:200]}...")  # Print first 200 chars
        logger.info(f"  Metadata: {json.dumps(doc.metadata, indent=2, ensure_ascii=False)}")
        logger.info("-" * 30)
    return docs

# Test queries (in Portuguese): rules for driving in roundabouts, the maximum
# motorway speed for light vehicles, and the consequences of drunk driving.
query_supabase_vector_store(vectorstore, "Quais são as regras para conduzir em rotundas?")
query_supabase_vector_store(vectorstore, "Qual a velocidade máxima em autoestrada para ligeiros?")
query_supabase_vector_store(vectorstore, "O que acontece se eu for apanhado a conduzir embriagado?")