In [6]:
from src.db.database import session_scope
from src.db import models

# Collect every distinct sector label stored on the firms table.
with session_scope() as session:
    sector_rows = session.query(models.Firms.sector).distinct().all()
    # Each row is a one-element tuple; unpack it to the bare sector string.
    res = [sector for (sector,) in sector_rows]

res

['Electric Utilities',
 'Communications Equipment',
 'Life Sciences Tools & Services',
 'Health Care REITs',
 'Electronic Equipment & Instruments',
 'Application Software',
 'Asset Management & Custody Banks',
 'IT Consulting & Other Services',
 'Casinos & Gaming',
 'Health Care Distributors',
 'Air Freight & Logistics',
 'Automobile Manufacturers',
 'Diversified Banks',
 'Specialty Chemicals',
 'Publishing',
 'Integrated Telecommunication Services',
 'Packaged Foods & Meats',
 'Construction Machinery & Heavy Transportation Equipment',
 'Consumer Finance',
 'Passenger Airlines',
 'Financial Exchanges & Data',
 'Health Care Equipment',
 'Oil & Gas Exploration & Production',
 'Restaurants',
 'Life & Health Insurance',
 'Industrial Machinery & Supplies & Components',
 'Insurance Brokers',
 'Homebuilding',
 'Heavy Electrical Equipment',
 'Biotechnology',
 'Rail Transportation',
 'Hotel & Resort REITs',
 'Multi-Utilities',
 'Oil & Gas Refining & Marketing',
 'Tobacco',
 'Systems Software',


# Llama-Index Text-To-SQL Retrieval Agent
### Thoughts:
- Too inconsistent in its performance
- Easily makes up facts in the absence of results
- Isn't really able to grasp the full context of the data structure and meaning
- The underlying functionality is difficult to modify, particularly the prompt template used for the text-to-SQL step that runs before response synthesis.

In [7]:
import os
from dotenv import load_dotenv
from IPython.display import Markdown, display
import pandas as pd

from llama_index.core import SQLDatabase
from llama_index.llms.openai import OpenAI
from llama_index.core.indices.struct_store.sql_query import (
    SQLTableRetrieverQueryEngine,
)
from llama_index.core.objects import (
    SQLTableNodeMapping,
    ObjectIndex,
    SQLTableSchema,
)
from llama_index.core import VectorStoreIndex, PromptTemplate

from src.db.database import engine
from src.db import models


# Load OPENAI_API_KEY (and any other settings) from a local .env file.
load_dotenv()


OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")


llm = OpenAI(temperature=0.1, model="gpt-4o-mini", api_key=OPENAI_API_KEY)

sql_database = SQLDatabase(engine)

table_node_mapping = SQLTableNodeMapping(sql_database)

# One schema object per ORM model exposed by `src.db.models`; each model is
# expected to carry a `__context_str__` describing the table for the retriever.
table_schema_objs = [
    SQLTableSchema(table_name=table.__tablename__, context_str=table.__context_str__)
    for table in models.__dict__.values()
    if hasattr(table, '__tablename__')
]

obj_index = ObjectIndex.from_objects(
    table_schema_objs,
    table_node_mapping,
    VectorStoreIndex,
)

# NOTE: the backslash continuation keeps the next line's indentation inside the
# prompt text; the literal is left untouched so the prompt stays byte-identical.
response_synthesis_prompt_str = (
    "Given an input question, synthesize a response from the query results. \
    You must ensure your response is completely factual.\n"
    "<query>{query_str}</query>\n"
    "<sql>{sql_query}</sql>\n"
    "<sql response>SQL Response: {context_str}</sql response>\n"
    "Response: "
)
response_synthesis_prompt = PromptTemplate(
    response_synthesis_prompt_str,
)

# Pass the configured `llm` explicitly. Without it the engine silently falls
# back to the library-wide default model and the settings above are ignored.
# NOTE(review): similarity_top_k=1 retrieves a single table schema per question;
# questions that need joins may benefit from a larger value - confirm.
query_engine = SQLTableRetrieverQueryEngine(
    sql_database,
    obj_index.as_retriever(similarity_top_k=1),
    llm=llm,
    response_synthesis_prompt=response_synthesis_prompt,
)

# query = "What are the fields in the meetings table and what do they represent contextually?"
# query = "Using just your provided system messaging and without using SQL, \
#     What are the fields in the meetings table and what do they represent contextually?"
# query = "What is the name of the firm that has the most meetings and how many meets do they have?"
# query = "Can you show me the first 5 rows of meetings?"
query = "Fetch the first 5 meetings and their content which have a firm attended that are in the IT Consulting & Other Services sector."
response = query_engine.query(query)

# Show the generated SQL, the synthesized answer, and (when present) the raw rows.
print("SQL Query:")
print("```\n" + response.metadata["sql_query"] + "\n```")
print("Response:")
display(Markdown(f"<b>{response}</b>"))
if "result" in response.metadata:
    display(pd.DataFrame(response.metadata["result"], columns=response.metadata["col_keys"]))

SQL Query:
```
SELECT m.meeting_id, m.title, m.content
FROM meetings m
JOIN firms f ON m.firm_attended_id = f.firm_id
WHERE f.sector = 'IT Consulting & Other Services'
ORDER BY m.date
LIMIT 5;
```
Response:


<b>The first 5 meetings attended by firms in the IT Consulting & Other Services sector are as follows:
1. Meeting with Accenture: Discussed potential synergies, tech innovations, data analytics, regulatory changes, and improved communication channels.
2. Call with Cognizant: Discussed collaboration opportunities, growth in digital services, key sectors for investment, and regulatory challenges.
3. Call with Accenture: Discussed Verizon's market performance, investment in 5G technology, competition, customer retention strategies, and potential for M&A.
4. Call with Cognizant: Discussed Incyte's pipeline developments, GE HealthCare's market position, Entergy's regulatory challenges, and Gilead Sciences' acquisitions.
5. Email with Cognizant: Discussion on potential collaborations with IQVIA, Yum! Brands, Host Hotels & Resorts, and Cummins.</b>

Unnamed: 0,meeting_id,title,content
0,b1df9bfc-febd-44d2-9cf4-ae5e905953d5,Meeting with Accenture,- Discussed potential synergies between our fi...
1,93ee4cf5-ca97-48ce-ac42-e9b01e83c605,Call with Cognizant,- Discussed potential collaboration opportunit...
2,6539b388-0b71-4ed4-b792-a3088d7496c6,Call with Accenture,- Discussed Verizon's recent market performanc...
3,18d357db-b908-4360-8054-3c16bccbba8f,Call with Cognizant,- Discussed Incyte's recent pipeline developme...
4,8cb7004e-07f8-4180-b1cd-acf71a3a6a2b,Email with Cognizant,Subject: Discussion on Potential Collaboration...


# Custom Simplified Implementation
- Much slower than the LlamaIndex query engine
- Has chain-of-thought reasoning with verbose output
- Still has issues constructing queries
- Need to consider how the information is presented back to the user in a memory-friendly way
    - Could the retrieval return just the beam_ids?
        - These could then be added to the user's 'meetings in-focus' view?
    - Could return the results as markdown (BIG CONTEXT ISSUE)

In [1]:
import os
from dotenv import load_dotenv

import pandas as pd
from llama_index.llms.openai import OpenAI

from src.db.database import session_scope
from src.db import models
from src.rag.sql_retriever import SQLAgent

# Load OPENAI_API_KEY (and any other settings) from a local .env file.
load_dotenv()


OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

llm = OpenAI(temperature=0.1, model="gpt-4o-mini", api_key=OPENAI_API_KEY)

# The agent is given the path to the ORM models file; verbose=True prints its
# chain of thought and the generated SQL.
agent = SQLAgent(llm, "src/db/models.py", verbose=True)

# query = "I need all meetings between 2022-01-01 and 2023-01-01 where the firms that attended are in the Energy sector."
query = "Return the beam_ids of all meetings between 2022-01-01 and 2023-01-01 where the firms that attended are in the IT Consulting & Other Services sector."

with session_scope() as session:
    response = agent.complete(session, query)

# The agent's result is not guaranteed to be a DataFrame: other code in this
# notebook treats a plain-string result as possible, so only tabular results
# are rendered through to_markdown.
response_md = response if isinstance(response, str) else response.to_markdown(index=True)

print(response_md)

CHAIN OF THOUGHTS:
Thoughts: I need to retrieve the beam_ids of meetings that took place between January 1, 2022, and January 1, 2023. I also need to filter these meetings based on the sector of the firms that attended, specifically looking for firms in the 'IT Consulting & Other Services' sector.
Outcome: I will need to join the meetings table with the firms table to filter by sector and select the beam_id from the meetings table. 

SQL QUERY:
```
SELECT meetings.beam_id 
FROM meetings 
JOIN firms ON meetings.firm_attended_id = firms.firm_id 
WHERE meetings.date BETWEEN '2022-01-01' AND '2023-01-01' 
AND firms.sector ILIKE 'IT Consulting & Other Services' 
ORDER BY meetings.date ASC;
```
|    | meetings.beam_id                     |
|---:|:-------------------------------------|
|  0 | 27bbf96b-84f2-4cd2-bfb6-f369ca319c3e |
|  1 | 6d75098d-f807-48cc-980a-aecfdbdb038f |
|  2 | ebed6d2a-46a5-4d35-ab5f-b63de1b237f0 |
|  3 | a3fec944-3754-4521-b939-80781d54170c |
|  4 | 732c8d16-4a72-4b58-

# Some additional tests

In [2]:
# Pick the employee who attended the most meetings in the sample date window;
# that id is used as the "current user" in the cells that follow.
with session_scope() as session:
    meetings_in_window = (
        session.query(models.Meetings)
        .filter(models.Meetings.date >= "2025-01-15", models.Meetings.date <= "2025-02-15")
        .all()
    )
    # Flatten attendee ids while the session is still open, rather than touching
    # each `employees` relationship (presumably lazy-loaded) and reading it only
    # after the session has closed.
    res = [
        str(employee.employee_id)
        for meeting in meetings_in_window
        for employee in meeting.employees
    ]

if not res:
    # Fail with a clear message instead of an IndexError on an empty Series.
    raise ValueError("No meeting attendees found between 2025-01-15 and 2025-02-15.")

# Most frequent attendee id first.
user_id = pd.Series(res).value_counts().sort_values(ascending=False).index[0]

In [3]:
# Meetings inside the sample date window that `user_id` attended.
date_window = (
    models.Meetings.date >= "2025-01-15",
    models.Meetings.date <= "2025-02-15",
)
attended_by_user = models.Meetings.employees.any(models.Employees.employee_id == user_id)

with session_scope() as session:
    res = session.query(models.Meetings).filter(*date_window, attended_by_user).all()

len(res)

2

In [4]:
# Ask the SQL agent for the selected employee's meetings over the last month.
query = "Give me a summary of meetings I have attended in the last month (employee_id: {}, current date: {}).".format(user_id, "2025-02-15")

with session_scope() as session:
    response = agent.complete(session, query)

# The agent's result is not guaranteed to be a DataFrame: other code in this
# notebook treats a plain-string result as possible, so only tabular results
# are rendered through to_markdown.
response_md = response if isinstance(response, str) else response.to_markdown(index=True)
print(response_md)

CHAIN OF THOUGHTS:
Thoughts: I need to retrieve meetings attended by a specific employee within the last month. The employee_id is provided, and I need to filter meetings based on the date, which should be within the last month from the current date (2025-02-15).
Outcome: I will join the meetings table with the employee_meetings association table to filter by the employee_id and the date. 

Thoughts: The date filter should check if the meeting date is greater than or equal to 2025-01-15 (one month before the current date).
Outcome: I will add a filter for the meeting date to ensure it falls within the last month. 

Thoughts: I need to select the relevant fields from the meetings table, including the beam_id, title, content, date, created_at, and firm_attended_id.
Outcome: I will specify the fields to be selected in the SQL query. 

SQL QUERY:
```
SELECT meetings.beam_id, meetings.title, meetings.content, meetings.date, meetings.created_at, meetings.firm_attended_id 
FROM meetings 
JOIN

# Expand into Query -> Answer
- The SQL agent does not construct working queries when the SQL requires association-table joins
    - Thought: Provide example queries with use-case descriptions.

In [8]:
from typing import Optional
from textwrap import dedent
from pydantic import BaseModel
from tenacity import retry, stop_after_attempt
from llama_index.core import PromptTemplate

# Prompt for the final answer-writing step. Placeholders: {query} (the user's
# question) and {data} (the retrieved rows). It asks for a markdown answer in the
# structured output format, with cited meetings wrapped as <ref>beam_id</ref>.
# NOTE: trailing backslashes join physical lines, so the indentation of each
# continuation line stays inside the prompt text even after dedent(); "\n"
# escapes add a newline on top of the literal line break.
prompt_template = PromptTemplate(
    dedent(
        """\
        You are a ChatBot built by Harvery's & Co, an investment bank. \
        Harvery's & Co specialise in investment banking, mergers and acquisitions, and asset management. \
        Your task is to answer employee's queries relating to company meeting notes held in a database. \
        Given a query by a user, a SQL AI Agent will try to find the relevant data in the database to answer the query. \
        Your task is to:\n
            1. read the user's query and the returned data from the database.\n
            2. analyse the retrieved data and how it might relate to the user's query.\n
            3. write a response back to the user to answer their query.\n\n

        ## IMPORTANT\n
        - **Your output must use the structured output format provided.**\n
        - **If the retrieved data does not answer the user's query, you must tell the user this and ask for more context to help you answer their query.**\n
        - **You must not make information up that does not exist in the database.**\n
        - **When referencing or citing meeting data from the database, you MUST provide the meetings.beam_id of the meeting encased in xml tags <ref>beam_id</ref>**\n
        - **You must stylise your response in markdown to make it easier to read by humans.**\n\n

        # User Query:\n
        <query>{query}</query>\n\n

        # Retrieved Data:\n
        <data>{data}</data>\n\n
        """
    )
)

# Prompt for the validation step: asks the model to decide whether the retrieved
# data answers the user's question. Placeholders: {query} and {data}.
validator_prompt_template = PromptTemplate(
    dedent(
        """\
        You are a ChatBot built by Harvery's & Co, an investment bank. \
        Harvery's & Co specialise in investment banking, mergers and acquisitions, and asset management. \
        Your task is to answer employee's queries relating to company meeting notes held in a database. \
        Given a query by a user, a SQL AI Agent will try to find the relevant data in the database to answer the query. \
        Your task is to determine if the retrieved data from the database answers the user's query.\n\n

        ## IMPORTANT\n
        - **Your output must use the structured output format provided.**\n

        # User Query:\n
        <query>{query}</query>\n\n

        # Retrieved Data:\n
        <data>{data}</data>\n\n
        """
    )
)

# Prompt for the query-writing step: turns the user's question into a
# natural-language instruction for the SQL agent (no SQL). Placeholders: {query}
# and {error}; {error} is appended directly after the query block and is an
# empty string on the first attempt (see SQLQnA.complete).
query_writer_template = PromptTemplate(
    dedent(
        """\
        You are a ChatBot built by Harvery's & Co, an investment bank. \
        Harvery's & Co specialise in investment banking, mergers and acquisitions, and asset management. \
        Your task is to answer employee's queries relating to company meeting notes held in a database. \
        You must accomplish this task by cooperating with a SQL AI Agent that can retrieve data from the database. \
        Given a query by a user, you must instruct the SQL AI Agent using ONLY natural language to retrieve the relevant data from the database.\n\n
        
        ## IMPORTANT
        - **It is important that you provide clear and concise instructions to the SQL AI Agent, including any dates, ids, or personal details the \
        user has mentioned that is relevant to their query.**\n
        - **It is always helpful to provide context on why you need the data, this will help the SQL AI Agent retrieve the correct data fields.**\n
        - **You must not write SQL queries yourself, ONLY provide natural language instructions to the SQL AI Agent.**\n

        # User Query:\n
        <query>{query}</query>\n\n{error}
        """
    )
)

# One reasoning step in a structured LLM output; Response and BoolResponse both
# carry a list of these in their `steps` field.
class Step(BaseModel):
    """
    Use this class to think about the user's query and the retrieved data and how it might relate to the user's query.

    Attributes:
    - thought: (str) - Your thoughts on how the retrieved data might relate to the user's query.
    - conclusion: (str) - Your conclusion on how the retrieved data relates to the user's query.
    """
    thought: str
    conclusion: str


# Structured-output schema for LLM answers. SQLQnA.complete uses it twice: for
# the query-writer step (where `response` holds the instruction sent to the SQL
# agent) and for the final answer returned to the caller.
class Response(BaseModel):
    """
    Use this class to structure your response back to the user.

    Attributes:
    - steps: list[Step] - A list of Step objects.
    - response: (str) - Your response back to the user.
    - beam_ids: Optional[list[str]] - An exhaustive list of the beam_ids of the meetings that were used to answer the user's query.
    """
    steps: list[Step]
    response: str
    # Optional; defaults to an empty list when no ids are supplied.
    beam_ids: Optional[list[str]] = []


# Structured-output schema for the validation step in SQLQnA.complete, which
# proceeds to the final answer only when `response` is exactly True.
class BoolResponse(BaseModel):
    """
    Use this class to decide whether the retrieved data answers the user's query or not.
    Be sure to analyse the data carefully in regard to the query, thinking step by step on relevance.
    
    Attributes:
    - steps: list[Step] - A list of Step objects recording your step-by-step analysis.
    - response: (bool) - True if the retrieved data answers the user's query, False otherwise.
    """
    steps: list[Step]
    response: bool


class SQLQnA:
    """
    Question-answering pipeline layered on top of a SQL agent.

    `complete` runs three LLM stages:
      1. write a natural-language instruction for the SQL agent from the user's query,
      2. validate that the data the agent retrieved answers the query (retrying the
         first stage with feedback when it does not, or when a stage raises),
      3. write the final structured answer from the query and the retrieved data.

    Parameters:
    - llm: LLM used for all three stages.
    - agent: SQL agent whose `complete(session, query)` retrieves the data.
    - prompt_template: prompt for the final answer (placeholders: query, data).
    - query_writer_template: prompt for stage 1 (placeholders: query, error).
    - validator_prompt_template: prompt for stage 2 (placeholders: query, data).
    - output_format: structured-output class for stages 1 and 3.
    - bool_response_format: structured-output class for stage 2.
    - verbose: print the generated instruction and the retrieved data.
    - _query_write_max_tokens: token cap for stages 1 and 2.
    - _response_max_tokens: token cap for stage 3.
    - _max_query_attempts: how many retrieve-and-validate rounds to try.
    """

    def __init__(
            self,
            llm: OpenAI,
            agent: SQLAgent,
            prompt_template: PromptTemplate,
            query_writer_template: PromptTemplate,
            validator_prompt_template: PromptTemplate,
            output_format: type[Response] = Response,
            bool_response_format: type[BoolResponse] = BoolResponse,
            verbose: bool = False,
            _query_write_max_tokens: int = 250,
            _response_max_tokens: int = 4000,
            _max_query_attempts: int = 2
        ):
        self.agent = agent
        self.output_format = output_format
        self.bool_response_format = bool_response_format
        self.llm = llm
        self.prompt_template = prompt_template
        self.query_writer_template = query_writer_template
        self.validator_prompt_template = validator_prompt_template
        self._query_write_max_tokens = _query_write_max_tokens
        self._response_max_tokens = _response_max_tokens
        self._max_query_attempts = _max_query_attempts
        self._verbose = verbose

    def _query_db(self, query: str) -> pd.DataFrame:
        """
        Run the SQL agent on `query` inside a fresh database session.

        Returns whatever the agent returns. Callers should tolerate a plain
        string as well as a DataFrame, as `_get_response_md` does.
        """
        with session_scope() as session:
            return self.agent.complete(session, query)

    def _get_response_md(self, query: str) -> str:
        """Return the agent's result as text: strings pass through, DataFrames become markdown tables."""
        response = self._query_db(query)
        if isinstance(response, str):
            return response
        return response.to_markdown(index=True)

    @retry(stop=stop_after_attempt(3))
    def _invoke_llm(self, output_format: type[BaseModel], prompt_template: PromptTemplate, max_tokens: int, **kwargs) -> BaseModel:
        """
        Format `prompt_template` with `kwargs` and return the LLM's structured output.

        The call is attempted up to three times; `output_format` is the pydantic
        class the response is parsed into.
        """
        structured_llm = self.llm.as_structured_llm(output_format)
        prompt = prompt_template.format(**kwargs)
        return structured_llm.complete(prompt, max_tokens=max_tokens).raw

    def complete(self, query: str) -> Response:
        """
        Answer `query` from the database.

        Raises:
        - ValueError: every attempt retrieved data that the validator rejected.
        - Exception: whatever the last attempt raised, when that attempt failed
          with an error rather than a rejection.
        """
        error = ""
        for attempt in range(self._max_query_attempts):
            # Reset per attempt so the error handler can tell "no instruction was
            # produced" apart from "an instruction was produced but a later stage failed".
            ai_query = None
            try:
                ai_query = self._invoke_llm(
                    self.output_format, self.query_writer_template, self._query_write_max_tokens, query=query, error=error
                )
                if self._verbose:
                    print("AI QUERY:")
                    print(ai_query.response)
                data = self._get_response_md(ai_query.response)
                check = self._invoke_llm(
                    self.bool_response_format, self.validator_prompt_template, self._query_write_max_tokens, query=query, data=data
                )
                if check.response is True:
                    break
                # Rejected by the validator: feed the failed instruction and its
                # data back to the query writer for the next attempt.
                error = """\n\n**The retrieved data does not answer the user's query. \
                        Please try to provide more context (if available) or re-phrase the query to the SQL AI Agent.**\n
                        **Your Last Query**: "{ai_query}"\n
                        **Retrieved Data**: {data}
                    """.format(ai_query=ai_query.response, data=data)
            except Exception as e:
                if attempt + 1 >= self._max_query_attempts:
                    # Out of attempts: surface the original error with its traceback.
                    raise
                # The failure may have happened before any instruction existed.
                last_query = ai_query.response if ai_query is not None else "<no query was generated>"
                error = """\n\n**Your request on the last attempt failed. \
                    Please try to provide more context (if available) or re-phrase the query to the SQL AI Agent.**\n
                    **Your Last Query**: "{ai_query}"\n
                    **Error Message**: {e}
                """.format(ai_query=last_query, e=e)
        else:
            # The loop ended without a validated result.
            raise ValueError("The retrieved data does not answer the user's query.")

        if self._verbose:
            print("RETURNED DATA:")
            print(data)
        return self._invoke_llm(
            self.output_format, self.prompt_template, self._response_max_tokens, query=query, data=data
        )
    

# Wraps the user's question with the context the pipeline needs (who is asking
# and what "today" is).
query_template = PromptTemplate(
    dedent(
        """
        **User Query:**\n
        {query}\n\n

        **Key Information:**\n
        - Employee ID: {employee_id}\n
        - Current Date: {current_date}\n
        """
    )
)

# Alternative questions tried with this pipeline:
# "Give me a summary of meetings I have attended in the last month."
# "Which company have we had the most meeting with?"
# "Write a report on the last five meetings we have had with Marathon Petroleum."
user_question = "How many meetings have we had with Marathon Petroleum?"
query = query_template.format(
    query=user_question,
    employee_id=user_id,
    current_date="2025-02-15",
)

# The SQL agent and the QnA layer each get their own identically configured LLM client.
llm_settings = dict(temperature=0.1, model="gpt-4o-mini", api_key=OPENAI_API_KEY)

agent = SQLAgent(OpenAI(**llm_settings), "src/db/models.py", verbose=True)

qna_agent = SQLQnA(
    llm=OpenAI(**llm_settings),
    agent=agent,
    prompt_template=prompt_template,
    query_writer_template=query_writer_template,
    validator_prompt_template=validator_prompt_template,
    output_format=Response,
    verbose=True,
)

response = qna_agent.complete(query)
print("RESPONSE:", response.response, response.beam_ids, sep="\n")

AI QUERY:
Please retrieve the total count of meetings that have been held with Marathon Petroleum up to the current date, February 15, 2025.
CHAIN OF THOUGHTS:
Thoughts: I need to count the total number of meetings held with Marathon Petroleum. This involves joining the meetings table with the firms table to filter by the firm's name.
Outcome: I will use a COUNT function to get the total number of meetings and filter the results where the firm's name is 'Marathon Petroleum'. 

Thoughts: I need to ensure that I only count meetings that have occurred up to February 15, 2025. This means I will add a date filter to the query.
Outcome: I will add a WHERE clause to filter meetings by date, ensuring they are less than or equal to February 15, 2025. 

SQL QUERY:
```
SELECT COUNT(meetings.meeting_id) AS total_meetings 
FROM meetings 
JOIN firms ON meetings.firm_attended_id = firms.firm_id 
WHERE firms.name ILIKE '%Marathon Petroleum%' 
AND meetings.date <= '2025-02-15';
```
RETURNED DATA:
|    