In [6]:
from src.db.database import session_scope
from src.db import models

# List every distinct value of Firms.sector.
with session_scope() as session:
    sector_rows = session.query(models.Firms.sector).distinct().all()
    # Each result row holds the single selected column at position 0.
    res = [sector_row[0] for sector_row in sector_rows]

res

['Electric Utilities',
 'Communications Equipment',
 'Life Sciences Tools & Services',
 'Health Care REITs',
 'Electronic Equipment & Instruments',
 'Application Software',
 'Asset Management & Custody Banks',
 'IT Consulting & Other Services',
 'Casinos & Gaming',
 'Health Care Distributors',
 'Air Freight & Logistics',
 'Automobile Manufacturers',
 'Diversified Banks',
 'Specialty Chemicals',
 'Publishing',
 'Integrated Telecommunication Services',
 'Packaged Foods & Meats',
 'Construction Machinery & Heavy Transportation Equipment',
 'Consumer Finance',
 'Passenger Airlines',
 'Financial Exchanges & Data',
 'Health Care Equipment',
 'Oil & Gas Exploration & Production',
 'Restaurants',
 'Life & Health Insurance',
 'Industrial Machinery & Supplies & Components',
 'Insurance Brokers',
 'Homebuilding',
 'Heavy Electrical Equipment',
 'Biotechnology',
 'Rail Transportation',
 'Hotel & Resort REITs',
 'Multi-Utilities',
 'Oil & Gas Refining & Marketing',
 'Tobacco',
 'Systems Software',


# Llama-Index Text-To-SQL Retrieval Agent
### Thoughts:
- Too inconsistent in its performance
- Easily makes up facts in the absence of results
- Isn't really able to grasp the full context of the data structure and meaning
- Underlying functionality difficult to modify, particularly the prompt template for the text-to-sql process prior to response synthesis.

In [7]:
import os
from dotenv import load_dotenv
from IPython.display import Markdown, display
import pandas as pd

from llama_index.core import SQLDatabase
from llama_index.llms.openai import OpenAI
from llama_index.core.indices.struct_store.sql_query import (
    SQLTableRetrieverQueryEngine,
)
from llama_index.core.objects import (
    SQLTableNodeMapping,
    ObjectIndex,
    SQLTableSchema,
)
from llama_index.core import VectorStoreIndex, PromptTemplate

from src.db.database import engine
from src.db import models


# Load variables from a .env file (if present) before reading the API key.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

llm = OpenAI(temperature=0.1, model="gpt-4o-mini", api_key=OPENAI_API_KEY)

sql_database = SQLDatabase(engine)
table_node_mapping = SQLTableNodeMapping(sql_database)

# Build one schema object per attribute of `models` that exposes
# `__tablename__`; the same object must also provide `__context_str__`,
# which is forwarded as the table's natural-language description.
table_schema_objs = [
    SQLTableSchema(table_name=table.__tablename__, context_str=table.__context_str__)
    for table in vars(models).values()
    if hasattr(table, "__tablename__")
]

# Index the table schemas so the relevant one can be retrieved per question.
obj_index = ObjectIndex.from_objects(
    table_schema_objs,
    table_node_mapping,
    VectorStoreIndex,
)

# Adjacent string literals are used instead of a backslash continuation:
# a continuation inside a literal embeds the next line's indentation into
# the prompt text sent to the model.
response_synthesis_prompt_str = (
    "Given an input question, synthesize a response from the query results. "
    "You must ensure your response is completely factual.\n"
    "<query>{query_str}</query>\n"
    "<sql>{sql_query}</sql>\n"
    "<sql response>SQL Response: {context_str}</sql response>\n"
    "Response: "
)
response_synthesis_prompt = PromptTemplate(
    response_synthesis_prompt_str,
)

# Pass the configured LLM explicitly; without `llm=` the engine does not use
# the `llm` object created above.
# NOTE(review): similarity_top_k=1 retrieves a single table schema per
# question - confirm this is sufficient for questions that need joins.
query_engine = SQLTableRetrieverQueryEngine(
    sql_database,
    obj_index.as_retriever(similarity_top_k=1),
    llm=llm,
    response_synthesis_prompt=response_synthesis_prompt,
)

# Alternative questions tried during exploration:
# query = "What are the fields in the meetings table and what do they represent contextually?"
# query = "Using just your provided system messaging and without using SQL, \
#     What are the fields in the meetings table and what do they represent contextually?"
# query = "What is the name of the firm that has the most meetings and how many meets do they have?"
# query = "Can you show me the first 5 rows of meetings?"
query = "Fetch the first 5 meetings and their content which have a firm attended that are in the IT Consulting & Other Services sector."
response = query_engine.query(query)

print("SQL Query:")
print("```\n" + response.metadata["sql_query"] + "\n```")
print("Response:")
display(Markdown(f"<b>{response}</b>"))
# Show the raw rows as a table when the engine attached them to the metadata.
if "result" in response.metadata:
    display(pd.DataFrame(response.metadata["result"], columns=response.metadata["col_keys"]))

SQL Query:
```
SELECT m.meeting_id, m.title, m.content
FROM meetings m
JOIN firms f ON m.firm_attended_id = f.firm_id
WHERE f.sector = 'IT Consulting & Other Services'
ORDER BY m.date
LIMIT 5;
```
Response:


<b>The first 5 meetings attended by firms in the IT Consulting & Other Services sector are as follows:
1. Meeting with Accenture: Discussed potential synergies, tech innovations, data analytics, regulatory changes, and improved communication channels.
2. Call with Cognizant: Discussed collaboration opportunities, growth in digital services, key sectors for investment, and regulatory challenges.
3. Call with Accenture: Discussed Verizon's market performance, investment in 5G technology, competition, customer retention strategies, and potential for M&A.
4. Call with Cognizant: Discussed Incyte's pipeline developments, GE HealthCare's market position, Entergy's regulatory challenges, and Gilead Sciences' acquisitions.
5. Email with Cognizant: Discussion on potential collaborations with IQVIA, Yum! Brands, Host Hotels & Resorts, and Cummins.</b>

Unnamed: 0,meeting_id,title,content
0,b1df9bfc-febd-44d2-9cf4-ae5e905953d5,Meeting with Accenture,- Discussed potential synergies between our fi...
1,93ee4cf5-ca97-48ce-ac42-e9b01e83c605,Call with Cognizant,- Discussed potential collaboration opportunit...
2,6539b388-0b71-4ed4-b792-a3088d7496c6,Call with Accenture,- Discussed Verizon's recent market performanc...
3,18d357db-b908-4360-8054-3c16bccbba8f,Call with Cognizant,- Discussed Incyte's recent pipeline developme...
4,8cb7004e-07f8-4180-b1cd-acf71a3a6a2b,Email with Cognizant,Subject: Discussion on Potential Collaboration...


# Custom Simplified Implementation
- Much slower
- Has chain of thought reasoning with verbosity
- Still has issues constructing queries
- Need to consider how the information is presented back to the User in a memory-friendly way
    - Can return just beam_ids as part of the retrieval?
        - This can be added to the user's 'meetings in-focus' view?
    - Can return as markdown (BIG CONTEXT ISSUE)

In [1]:
import os
from dotenv import load_dotenv

import pandas as pd
from llama_index.llms.openai import OpenAI

from src.db.database import session_scope
from src.db import models
from src.rag.sql_retriever import MeetingsSQLAgent

# Read .env (if present) so the OpenAI key can be picked up from the environment.
load_dotenv()

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

llm = OpenAI(
    temperature=0.1,
    model="gpt-4o-mini",
    api_key=OPENAI_API_KEY,
)

# The agent receives the LLM and the path to the ORM models source file.
agent = MeetingsSQLAgent(llm, "src/db/models.py", verbose=True)

query = (
    "Return all meetings between 2022-01-01 and 2023-01-01 "
    "where the firms that attended are in the IT Consulting & Other Services sector."
)

with session_scope() as session:
    response = agent.complete(session, query)

# Render the agent's result as a markdown table (index column included).
response_md = response.to_markdown(index=True)

print(response_md)

CHAIN OF THOUGHTS:
Thoughts: I need to retrieve meetings that occurred between the specified date range. Additionally, I need to filter these meetings based on the sector of the firms that attended. This requires a join between the meetings table and the firms table to access the sector information.
Outcome: Join the meetings table with the firms table using the firm_attended_id and firm_id, and filter based on the date range and sector. 

SQL QUERY:
```
SELECT meetings.meeting_id 
FROM meetings 
JOIN firms ON meetings.firm_attended_id = firms.firm_id 
WHERE meetings.date >= '2022-01-01' 
AND meetings.date <= '2023-01-01' 
AND firms.sector ILIKE '%IT Consulting & Other Services%';
```
|    | meeting_id                           | date of interaction   | beam_id                              | title                  | content                                                                                                                                                                     

# Some additional tests

In [2]:
from sqlalchemy import func

# Find the most recent meeting date stored in Meetings.date.
with session_scope() as session:
    latest_date_query = session.query(func.max(models.Meetings.date))
    res = latest_date_query.one_or_none()

# The aggregate comes back as a one-column row; keep it as a YYYY-MM-DD string.
latest_meeting_date = res[0]
current_date = latest_meeting_date.strftime("%Y-%m-%d")

In [3]:
from datetime import datetime, timedelta

# Date window: from 30 days before `current_date` up to `current_date`.
current_date_dt = datetime.strptime(current_date, "%Y-%m-%d")
minus_30_days = current_date_dt - timedelta(days=30)

with session_scope() as session:
    recent_meetings = (
        session.query(models.Meetings)
        .filter(
            models.Meetings.date >= minus_30_days.strftime("%Y-%m-%d"),
            models.Meetings.date <= current_date,
        )
        .all()
    )
    # Collect attendee IDs while the session is still open, so the
    # `employees` relationship is read from attached objects rather than
    # relying on what remains loaded after the session block exits.
    # One entry per (meeting, attendee) pair, in meeting order.
    res = [
        str(employee.employee_id)
        for meeting in recent_meetings
        for employee in meeting.employees
    ]

if not res:
    # Without this guard the failure would be a bare IndexError on `.index[0]`.
    raise ValueError(
        "No meeting attendees found between "
        f"{minus_30_days.strftime('%Y-%m-%d')} and {current_date}."
    )

# Pick the employee ID that appears most often in the window.
user_id = pd.Series(res).value_counts().sort_values(ascending=False).index[0]

In [4]:
# Fetch the name of the employee whose ID is `user_id`.
with session_scope() as session:
    name_query = session.query(models.Employees.name).filter(
        models.Employees.employee_id == user_id
    )
    res = name_query.first()

# Only one column was selected, so the name is the row's first element.
user_name = res[0]

In [5]:
# Meetings inside the 30-day window that `user_id` attended.
with session_scope() as session:
    window_start = minus_30_days.strftime("%Y-%m-%d")
    attended_by_user = models.Meetings.employees.any(
        models.Employees.employee_id == user_id
    )
    # Successive .filter() calls are combined with AND.
    res = (
        session.query(models.Meetings)
        .filter(models.Meetings.date >= window_start)
        .filter(models.Meetings.date <= current_date)
        .filter(attended_by_user)
        .all()
    )

len(res)

4

In [6]:
# Put the employee ID and reference date directly into the question text.
query = (
    "Give me a summary of meetings I have attended in the last month "
    f"(employee_id: {user_id}, current date: {current_date})."
)

with session_scope() as session:
    response = agent.complete(session, query)

# Render the agent's result as a markdown table (index column included).
response_md = response.to_markdown(index=True)
print(response_md)

CHAIN OF THOUGHTS:
Thoughts: I need to retrieve the meeting IDs for meetings attended by a specific employee within the last month. The employee's ID is provided, and I will filter the meetings based on the date range from 2025-11-28 to 2025-12-28.
Outcome: I will join the meetings table with the employee_meetings association table to filter by the employee ID and the date range. 

SQL QUERY:
```
SELECT meetings.meeting_id 
FROM meetings 
JOIN employee_meetings ON meetings.meeting_id = employee_meetings.meeting_id 
JOIN employees ON employees.employee_id = employee_meetings.employee_id 
WHERE employees.employee_id = 'e3264b13-b6eb-4fd8-a235-0f248da809c8' 
AND meetings.date >= '2025-11-28' 
AND meetings.date <= '2025-12-28';
```
|    | meeting_id                           | date of interaction   | beam_id                              | title                            | content                                                                                                               

# Expand into Query -> Answer
- Originally I tried chaining a full text-to-SQL agent into a responder agent; however, the text-to-SQL agent was highly unstable when required to write queries that used association tables.
- The approach has therefore changed from a completely text-to-SQL process to a more structured workflow:
    1. User sends prompt to the agent
    2. Agent re-writes the prompt as a natural language instruction to the text-to-sql agent. The text-to-sql agent only writes the sql to find the meeting_ids of the meetings relevant to the instructions. Programmatic querying then completes the full output by joining the relevant tables.
    3. The returned table is converted to markdown and passed to the original agent for generating a response to the user.
- This is more stable than before; however, there are still issues when a query asks for a list of meetings and also asks which of them the user did not attend - this returns only the meetings they did (or did not) attend, never both.

In [7]:
from textwrap import dedent
from llama_index.core import PromptTemplate

from src.rag.sql_responder import MeetingsSQLQnA


# Prompt layout: the user's question followed by the context values the
# agents are given (employee ID, user name, current date).
query_template = PromptTemplate(
    dedent(
        """
        **User Query:**\n
        {query}\n\n

        **Key Information:**\n
        - User's Employee ID: {employee_id}\n
        - User's Name: {user_name}\n
        - Current Date: {current_date}\n
        """
    )
)

# Other questions tried with this workflow:
# "Give me a summary of meetings I have attended in the last month."
# "Write a report on the last five meetings we have had with Marathon Petroleum."
# "Write a report on the last five months of meetings we have had with Marathon Petroleum."
# "What were all the meetings in the last 2 months and which ones did I not attend?"
# "Which meetings have been tagged as interesting?"  # NOTE: BOGUS QUERY
template_fields = {
    "query": "Write a report on the last five months of meetings we have had with Marathon Petroleum or where they were discussed.",
    "employee_id": user_id,
    "user_name": user_name,
    "current_date": current_date,
}
query = query_template.format(**template_fields)

# Both agents get their own OpenAI client built from the same settings.
llm_settings = {
    "temperature": 0.1,
    "model": "gpt-4o-mini",
    "api_key": OPENAI_API_KEY,
}

agent = MeetingsSQLAgent(
    OpenAI(**llm_settings),
    "src/db/models.py",
    verbose=True,
)

qna_agent = MeetingsSQLQnA(
    llm=OpenAI(**llm_settings),
    agent=agent,
    verbose=True,
)

response = qna_agent.complete(query)
print("RESPONSE:")
print(response.response)

AI QUERY:
Retrieve all meetings from the last five months that involved Marathon Petroleum or where Marathon Petroleum was discussed.
CHAIN OF THOUGHTS:
Thoughts: To retrieve meetings from the last five months, I need to filter the meetings based on the date. I will use the current date and subtract five months from it to get the start date for the filter.
Outcome: Add a date filter for meetings.date to be greater than or equal to the calculated date from five months ago. 

Thoughts: I need to check if Marathon Petroleum was either a firm that attended the meeting or a firm that was discussed during the meeting. This requires joining the meetings table with both the firms table (for attendance) and the meetings_firms table (for discussion).
Outcome: Add joins to the firms table for attendance and the meetings_firms table for discussion. 

Thoughts: I will use the ILIKE operator to perform a case-insensitive search for 'Marathon Petroleum' in the firms' names.
Outcome: Add filters using