In [168]:
import dotenv
import os
import json
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.document_loaders.csv_loader import CSVLoader
from llama_index import PromptTemplate
from llama_index.llms import OpenAI
from llama_index.types import BaseOutputParser
from llama_index.program import OpenAIPydanticProgram
from rich import print
from dataclasses import fields
from pydantic import BaseModel
from typing import List

In [41]:
dotenv.load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

**Step 1: ETL**
- Read raw CSV data
- Parse raw data into Documents

In [40]:
loader = CSVLoader(file_path='./events.csv')
event_docs = loader.load()


**Step 2: Init + Build Vector DB**
- Init embedding model
- Load data into vector DB

In [42]:
persist_directory = 'datastore'
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
vectordb = Chroma.from_documents(documents=event_docs, embedding=embedding, persist_directory=persist_directory)

In [43]:
vectordb.persist()

**Step 3: Query Router**
- Query cector DB for context list
- Create custom prompt using context list
- Force LLM output to JSON format

In [71]:
llm = OpenAI(model="gpt-3.5-turbo")

In [54]:
choices = [
    "Useful for questions related to live events (broadway shows, sports games, etc.)",
    "All other questions",
]

def get_choice_str(choices):
    choices_str = "\n\n".join([f"{idx+1}. {c}" for idx, c in enumerate(choices)])
    return choices_str

choices_str = get_choice_str(choices)

In [187]:
router_prompt0 = PromptTemplate(
    "Some choices are given below. It is provided in a numbered "
    "list (1 to {num_choices}), "
    "where each item in the list corresponds to an event.\n"
    "---------------------\n"
    "{context_list}"
    "\n---------------------\n"
    "Using only the choices above and not prior knowledge, return the top events"
    "(no more than {max_outputs}, but only select what is needed) that "
    "are most match the location date and description constrains to the question: '{query_str}'\n"
)

In [188]:
def get_formatted_prompt(query_str, choices, choices_str):
    fmt_prompt = router_prompt0.format(
        num_choices=len(choices),
        max_outputs=5,
        context_list=choices_str,
        query_str=query_str,
    )
    return fmt_prompt

In [189]:
class Event(BaseModel):
    id: int
    name: str
    time: str

In [190]:
print(json.dumps(Event.schema(), indent=2))

/var/folders/z6/529jfvqn3ng4d479hkxx755r0000gn/T/ipykernel_50238/3606971842.py:1: PydanticDeprecatedSince20: The `schema` method is deprecated; use `model_json_schema` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.4/migration/
  print(json.dumps(Event.schema(), indent=2))


In [193]:
FORMAT_STR = """The output should be formatted as a JSON instance that conforms to
the JSON schema below.

Here is the output schema:
{
  "type": "array",
  "items": {
    "type": "object",
    "properties": {
      "Id": {
        "type": "integer"
      },
      "Name": {
        "type": "string"
      },
      "Time": {
        "type": "string"
      }
    },
    "required": [
      "Id",
      "Name",
      "Time",
    ],
    "additionalProperties": false
  }
}
"""

In [194]:
def _escape_curly_braces(input_string: str) -> str:
    # Replace '{' with '{{' and '}' with '}}' to escape curly braces
    escaped_string = input_string.replace("{", "{{").replace("}", "}}")
    return escaped_string

In [195]:
def _marshal_output_to_json(output: str) -> str:
    output = output.strip()
    left = output.find("[")
    right = output.find("]")
    output = output[left : right + 1]
    return output

In [196]:
class RouterOutputParser(BaseOutputParser):
    def parse(self, output: str) -> List[Event]:
        json_output = _marshal_output_to_json(output)
        json_dicts = json.loads(json_output)
        events = [Event.from_dict(json_dict) for json_dict in json_dicts]
        return events

    def format(self, prompt_template: str) -> str:
        return prompt_template + "\n\n" + _escape_curly_braces(FORMAT_STR)

In [197]:
output_parser = RouterOutputParser()

In [198]:
def route_query(query_str: str, choices: List[str], output_parser: RouterOutputParser):
    
    choices_str = get_choice_str([doc.page_content for doc in choices])

    fmt_base_prompt = router_prompt0.format(
        num_choices=len(choices),
        max_outputs=len(choices),
        context_list=choices_str,
        query_str=query_str,
    )
    fmt_json_prompt = output_parser.format(fmt_base_prompt)

    raw_output = llm.complete(fmt_json_prompt)
    parsed = output_parser.parse(str(raw_output))

    return parsed

**Step 4: Prompt LLM**
- Take in and embed prompt
- Query Vector DB
- Pass into query router
- Call LLM

In [199]:
query_str = "Find me a sports game in San Francisco in 2025"
docs = vectordb.similarity_search(query_str, k=20)

In [200]:
# parse docs into string
choices_str = get_choice_str([doc.page_content for doc in docs])
fmt_prompt = get_formatted_prompt(query_str=query_str, choices=docs, choices_str=choices_str)
print(fmt_prompt)

In [205]:
fmt_base_prompt = router_prompt0.format(
        num_choices=len(docs),
        max_outputs=5,
        context_list=choices_str,
        query_str=query_str,
    )
print(fmt_base_prompt)

In [206]:
# prompt LLM
response = llm.complete(fmt_base_prompt)

In [207]:
print(str(response))

**Step 5: Guardrails**
- Validate LLM outputs