#Chat with School Beacon
* This POC uses llamaParse to parse the school beacon pdfs. The pdfs are in tabular format with lot of redundant data init.
* It uses gpt-40-mini LLM.
* The POC uses below llama_index features:
  1. VectorStore
  2. Workflow
  3. ReAct Agent
  4. llamaParse

In [None]:
from dotenv import load_dotenv

env_path = f'../../.env'
load_dotenv(env_path)

In [47]:
import nest_asyncio; nest_asyncio.apply()

In [48]:
from llama_index.core.workflow import (
    Workflow,
    Event,
    step,
    Context,
    StartEvent,
    StopEvent
)

In [49]:
class HandleQueryEvent(Event):
    query : str

In [55]:
class BeaconAnalyzerWorkflow(Workflow):

    def load_vectorstore(self, ctx : Context):
        """Loades vector store into memory"""
        print(f"\n--------DEBUG-----load_vectorstore -----------------\n")
        
        from llama_index.core import StorageContext, load_index_from_storage
        persist_dir = ctx.data['persist_dir']

        self.query_index = None
        try:
            storage_context = StorageContext.from_defaults(
                persist_dir = persist_dir
            )
            self.query_index = load_index_from_storage(storage_context=storage_context)
            self.index_loaded = True
        except:
            self.index_loaded = False
        
        print(f"index_loaded? : {self.index_loaded}")

    def read_beacons(self, ctx : Context):
        from llama_index.core import SimpleDirectoryReader
        from llama_parse import LlamaParse
        import os

        print(f"\n--------DEBUG-----read_beacons -----------------\n")

        beacon_dir_path = ctx.data['beacon_dir_path']

        if not os.path.exists(beacon_dir_path):
            print(f"\n beacon dir path '{beacon_dir_path}' doesnt exist, give appropriate path.")
            return None
        
        parser = LlamaParse(result_type = 'markdown')

        file_extractor = {".pdf" : parser}
        
        print(beacon_dir_path)
        
        documents = SimpleDirectoryReader(input_dir = beacon_dir_path, file_extractor = file_extractor).load_data()

        print(f"No of Documents populated: {len(documents)}")
        return documents

    def populate_vectorstore(self, ctx : Context, documents):
        print(f"\n--------DEBUG-----populate_vectorstore -----------------\n")

        from llama_index.core import VectorStoreIndex
        persist_dir = ctx.data['persist_dir']
        self.query_index = VectorStoreIndex.from_documents(documents)
        self.query_index.storage_context.persist(persist_dir)

    def build_query_engine_tool(self, ctx : Context):

        from llama_index.llms.openai import OpenAI
        from llama_index.embeddings.openai import OpenAIEmbedding
        from llama_index.core import Settings

        similarity_top_k = ctx.data['similarity_top_k']

        self.query_engine = self.query_index.as_query_engine(similarity_top_k = similarity_top_k)
        
        from llama_index.core.tools import QueryEngineTool, ToolMetadata
        self.query_engine_tool = QueryEngineTool(query_engine = self.query_engine,
                                    metadata = ToolMetadata(
                                        name = "BeaconInfoDB",
                                        description=("Provides information about what teacher is going to teach every week.")
                                    ))
        
    def build_react_agent(self, ctx : Context):
        print(f"\n--------DEBUG-----build_react_agent -----------------\n")

        llm = ctx.data['llm']
        tools = ctx.data['tools']
        tools = tools + [self.query_engine_tool]

        from llama_index.core.agent import ReActAgent
        from llama_index.llms.openai import OpenAI

        self.agent = ReActAgent.from_tools(tools = tools,
                        llm = llm,
                        verbose = True)

    @step(pass_context=True)
    async def Setup(self, ctx : Context, ev : StartEvent) -> HandleQueryEvent:
        # 1. Build the Context object
        if (hasattr(ev, "llm")):
            ctx.data['llm'] = ev.llm
        
        if (hasattr(ev, "tools")):
            ctx.data['tools'] = ev.tools

        if (hasattr(ev, "beacon_dir_path")):
            ctx.data['beacon_dir_path'] = ev.beacon_dir_path
        else:
            ctx.data['beacon_dir_path'] = "./Data/"

        if (hasattr(ev, "persist_dir")):
            ctx.data['persist_dir'] = ev.persist_dir
        else:
            ctx.data['persist_dir'] = "./Storage/BeaconInfoDB"
        
        if (hasattr(ev, "similarity_top_k")):
            ctx.data['similarity_top_k'] = ev.similarity_top_k
        else:
            ctx.data['similarity_top_k'] = 20

        # 2. Load the vectorstore from disk
        self.load_vectorstore(ctx)
        if not self.index_loaded:
            self.read_beacons(ctx)
            self.populate_vectorstore(ctx)

        self.build_query_engine_tool(ctx)

        # 4. Check if there any new beacon available in mail and if so, download it . (ENHANCEMENT)
        # 5. Check if there any new beacon available on disk : Invokes CheckIfAnyNewBeaconEvent  (ENHANCEMENT)

        # 6. Build a ReActAgent to answer queries
        self.build_react_agent(ctx)
        
        # 6. Invokes HandleQueryEvent to handle user queries
        query = ev.query
        ctx.send_event(HandleQueryEvent(query = query))
        
    
    @step(pass_context=True)
    async def answer_query(self, ctx : Context, ev:HandleQueryEvent) -> StopEvent:
        """Answer user queries using ReActAgent"""

        print(f"\n-----------------DEBUG: answer_query-------------\n")

        if (hasattr(ev, 'query')):
            query = ev.query

        
        print (f"DEBUG: query: {query}")
        
        response = self.agent.chat(message = str(query))
        print(f"\n-----------------DEBUG:-------------\n {response}")

        ctx.send_event(StopEvent(result = str(response)))

In [None]:
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import Settings

embed_model = OpenAIEmbedding(model="text-embedding-3-small")
llm = OpenAI(model="gpt-4o-mini")

Settings.llm = llm
Settings.embed_model = embed_model


beacon_workflow = BeaconAnalyzerWorkflow(timeout = 200, verbose= True)
result = await beacon_workflow.run(llm = llm, 
                             tools = [],
                             query = "What will be taught in Grade 8 for this week. Give a subject wise summary in markdown format.")

import pprint
pprint.pprint(f"\n----------------------------FINAL OUTPUT -------------\n{result}")


Running step Setup

--------DEBUG-----load_vectorstore -----------------

index_loaded? : True

--------DEBUG-----build_react_agent -----------------

Step Setup produced no event
Running step answer_query

-----------------DEBUG: answer_query-------------

DEBUG: query: What will be taught in Grade 8 for this week. Give a subject wise summary in markdown format.
> Running step 0f78179e-f6c4-4a0b-af62-40d6a83cd074. Step input: What will be taught in Grade 8 for this week. Give a subject wise summary in markdown format.
[1;3;38;5;200mThought: The current language of the user is: English. I need to use a tool to help me answer the question.
Action: BeaconInfoDB
Action Input: {'input': 'Grade 8'}
[0m[1;3;34mObservation: The information for Grade 8 includes various subjects and their respective topics, learning objectives, activities, and assessments. Key subjects covered are French, ICT, English, Mathematics, Creative Arts and Sports, and Social Studies, among others. Specific lessons 