## **Mongo Client**

In [None]:
import os
from urllib.parse import quote_plus

from dotenv import load_dotenv
from pymongo import MongoClient

# Load MONGO_USER / MONGO_PASS / DATABASE_NAME from a local .env file, if present.
load_dotenv()

user = os.getenv("MONGO_USER")
password = os.getenv("MONGO_PASS")
DB_NAME = os.getenv("DATABASE_NAME")

# Fail fast with a readable message: without this check a missing variable is
# silently interpolated into the URI as the literal text "None".
_missing_env = [
    env_name
    for env_name, env_value in (
        ("MONGO_USER", user),
        ("MONGO_PASS", password),
        ("DATABASE_NAME", DB_NAME),
    )
    if not env_value
]
if _missing_env:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_env)}"
    )

# Credentials embedded in a MongoDB URI must be percent-encoded, otherwise
# characters such as '@', ':' or '/' break URI parsing.
# NOTE(review): this expects the raw (un-encoded) credentials in the
# environment; values that were already percent-encoded by hand would be
# encoded twice — confirm against the deployed .env.
uri = (
    f"mongodb+srv://{quote_plus(user)}:{quote_plus(password)}"
    "@sylvr-financial-cluster.jz9cn66.mongodb.net/?retryWrites=true&w=majority"
)
client = MongoClient(uri)
db = client[DB_NAME]

def get_collection(collection_name):
    """Return the collection called *collection_name* from the module-level ``db``."""
    collection_handle = db[collection_name]
    return collection_handle

## **Query Builder**

In [None]:
import os
import json
from dotenv import load_dotenv
from langchain.output_parsers import StructuredOutputParser, ResponseSchema
from langchain.prompts import PromptTemplate
from langchain_huggingface import HuggingFaceEndpoint
import re
from langchain_openai import ChatOpenAI

load_dotenv()

# The structured output expected from the LLM: one ResponseSchema per field,
# built from (name, description) pairs.
response_schemas = [
    ResponseSchema(name=field_name, description=field_description)
    for field_name, field_description in (
        (
            "collection",
            "MongoDB collection to query (strictly choose only one from: stocks, ETFs, customers, accounts, transactions)",
        ),
        (
            "query",
            "MongoDB query as a Python dict, without extra explanation or formatting",
        ),
    )
]

# The parser both produces the format instructions injected into the prompt
# and parses the model's reply.
parser = StructuredOutputParser.from_response_schemas(response_schemas)
format_instructions = parser.get_format_instructions()

# Prompt that turns a natural-language question into a
# {"collection": ..., "query": ...} JSON object.
# `user_input` is supplied per call; `format_instructions` is pre-filled from
# the structured output parser.
# NOTE(review): the rules below forbid markdown and triple backticks, while the
# injected format instructions may ask for a fenced ```json snippet — confirm
# which form the model should follow.
prompt_template = PromptTemplate(
    template="""
You are a MongoDB query generator. Given a natural language question, output only the MongoDB query in pure JSON format.

The response must be a single JSON object with exactly two keys:
- "collection": the MongoDB collection to query (strictly choose only one from: stocks, ETFs, customers, accounts, transactions)
- "query": the MongoDB filter as a Python dictionary (no MongoDB-specific types like ISODate)

Rules:
- No explanation or extra output
- No markdown formatting
- No labels like "Answer:"
- No triple backticks
- Do not answer multiple questions
- Output only the pure JSON result for the current question
- Dates must be formatted as ISO 8601 strings. Do not use MongoDB types like ISODate().

{format_instructions}

Question: {user_input}
""",
    input_variables=["user_input"],
    partial_variables={"format_instructions": format_instructions},
)

# Alternative backend, kept for reference (disabled):
# llm = HuggingFaceEndpoint(
#     repo_id="HuggingFaceH4/zephyr-7b-beta",
#     huggingfacehub_api_token=os.environ["HUGGINGFACE_API_KEY"],
#     max_new_tokens=105,
#     temperature=0.01,
# )

# Active backend: GPT-4 with temperature 0.
llm = ChatOpenAI(
    temperature=0,
    model="gpt-4",
)


def clean_llm_response(text: str) -> str:
    """Reduce a raw LLM reply to the JSON object it contains.

    Steps, in order:

    1. Remove markdown code fences (with or without a ``json`` tag).
    2. Remove ``Answer:`` labels and whole ``Question: ...`` lines that start
       a line.
    3. Rewrite ``ISODate("...")`` to the plain quoted string.
    4. Keep only the first JSON object. If the text at the first ``{`` is not
       valid JSON (for example a Python-style dict with single quotes), fall
       back to the span from the first ``{`` to the last ``}``.

    Args:
        text: Raw model output.

    Returns:
        The cleaned text; unchanged apart from steps 1-3 when it contains no
        braces.
    """
    # Only the fence markers are removed: a bare "json" is left alone so that
    # values which happen to contain that word are not corrupted.
    text = re.sub(r'```(?:json)?', '', text, flags=re.IGNORECASE)

    # Labels are matched at the start of a line only, so JSON string values
    # that contain "Answer:" or "Question:" survive.
    text = re.sub(
        r'^[ \t]*(?:Answer:|Question:.*)',
        '',
        text,
        flags=re.IGNORECASE | re.MULTILINE,
    ).strip()

    # Replace ISODate("...") with just the string inside
    text = re.sub(r'ISODate\(\s*"([^"]+)"\s*\)', r'"\1"', text)

    start_idx = text.find('{')
    if start_idx < 0:
        return text

    # raw_decode stops at the end of the first complete JSON value, which
    # drops any trailing chatter or additional objects.
    try:
        _, end_idx = json.JSONDecoder().raw_decode(text, start_idx)
    except ValueError:
        # Not strict JSON: keep the widest brace-delimited span instead.
        end_idx = text.rfind('}') + 1

    if end_idx > start_idx:
        text = text[start_idx:end_idx]

    return text


def build_query_from_natural_language(user_input: str):
    """Ask the LLM to translate *user_input* into a MongoDB collection and filter.

    The prompt is rendered from ``prompt_template``, sent to ``llm``, and the
    reply is cleaned with ``clean_llm_response`` before being parsed by
    ``parser``. The raw reply is printed for debugging.

    Args:
        user_input: Natural-language question.

    Returns:
        A ``(collection, query)`` tuple, where ``collection`` is the stripped,
        lower-cased collection name and ``query`` is the parsed "query" value.

    Raises:
        ValueError: If the reply cannot be cleaned or parsed, or lacks the
            expected keys. The original exception is chained as the cause.

    NOTE(review): the collection name is lower-cased, so "ETFs" becomes
    "etfs"; MongoDB collection names are case-sensitive — confirm this matches
    the actual collection name.
    """
    prompt = prompt_template.format(user_input=user_input)
    response = llm.invoke(prompt).content

    print("=== RAW LLM RESPONSE ===")
    print(response)

    # Bound before the try so the error message below can always be built,
    # even when the cleaning step itself is what raised.
    cleaned = response
    try:
        cleaned = clean_llm_response(response)
        parsed = parser.parse(cleaned)
        collection = parsed["collection"].strip().lower()
        query = parsed["query"]
    except Exception as e:
        raise ValueError(
            f"JSON parsing failed: {e}\nCleaned response:\n{str(cleaned)[:300]}..."
        ) from e

    return collection, query


# Manual smoke test: build a query for a sample question and print the result.
if __name__ == "__main__":
    user_question = "Show me all transactions over $1000 in the last 7 days"
    collection, query = build_query_from_natural_language(user_question)
    print(f"Collection: {collection}")
    print(f"MongoDB Query: {query}")


## **Query Executor**

In [None]:
from pymongo import MongoClient
from config.config import settings  # assuming settings has Mongo URI
from bson import json_util
# Re-binds the module-level `client` and `db` names that were already assigned
# earlier in this file.
# NOTE(review): `settings.mongo_db` is used both as the MongoClient connection
# argument and as the database name on the next line. One of the two is
# presumably meant to be a different setting (a URI vs. a database name) —
# confirm against config.config.
client = MongoClient(settings.mongo_db)
db = client[settings.mongo_db]

def execute_query(collection_name: str, query: dict, limit: int = 20):
    """Run *query* against *collection_name* and return JSON-safe documents.

    Args:
        collection_name: Name of the collection in the module-level ``db``.
        query: MongoDB filter passed to ``find``.
        limit: Maximum number of documents to fetch.

    Returns:
        A list of plain dicts. BSON-specific values are rendered in MongoDB
        Extended JSON form (for example ``{"$oid": "..."}``) so the result can
        be passed to ``json.dumps``.
    """
    collection = db[collection_name]
    results = collection.find(query).limit(limit)
    # Decode with the standard json module: json_util.loads would turn the
    # extended-JSON markers straight back into BSON objects, leaving the
    # documents non-serialisable for downstream json.dumps calls.
    return [json.loads(json_util.dumps(doc)) for doc in results]

## **Summarizer Function**
- Converts the query results into a natural-language summary

In [None]:
from langchain_openai import ChatOpenAI

# Re-binds the shared `llm` name: GPT-4 with temperature 0.
llm = ChatOpenAI(temperature=0, model="gpt-4")

def summarize_results(results: list, user_query: str) -> str:
    """Summarize MongoDB query results using the LLM.

    Only the first five results are included in the prompt.

    Args:
        results: Documents returned by the query.
        user_query: The user's original natural-language question.

    Returns:
        The LLM's summary with surrounding whitespace stripped.
    """
    context = f"The user asked: '{user_query}'.\n\n"
    context += "Here are the top results from the database:\n"
    # default=str is deliberate: documents may carry values the json module
    # cannot encode (e.g. ObjectId, datetime); their string form is enough
    # for a prompt and avoids a TypeError.
    context += json.dumps(results[:5], indent=2, default=str)

    prompt = (
        f"{context}\n\n"
        "Based on these results, give a concise, user-friendly summary of the key findings in 2–3 sentences."
    )

    response = llm.invoke(prompt)
    return response.content.strip()

## **Langchain Agent**

In [None]:
from langchain.agents import Tool, initialize_agent
from langchain_openai import ChatOpenAI
from langchain_community.tools import tool

# LLM that drives the agent: GPT-4 with temperature 0 (re-binds `llm`).
llm = ChatOpenAI(temperature=0, model="gpt-4")

@tool
def mongo_query_tool(input_text: str) -> str:
    """Query the financial MongoDB collections using natural language."""
    # Translate the question, run it, and hand back a short textual preview.
    target_collection, mongo_filter = build_query_from_natural_language(input_text)
    documents = execute_query(target_collection, mongo_filter)
    preview = documents[:10]  # cap the preview at ten documents
    return str(preview)

# The agent has a single tool: the natural-language MongoDB query tool.
tools = [mongo_query_tool]

# Zero-shot ReAct agent with verbose tracing enabled.
agent = initialize_agent(
    llm=llm,
    tools=tools,
    verbose=True,
    agent="zero-shot-react-description",
)

def run_natural_language_query(user_input: str):
    """Run the full pipeline: build the query, execute it, summarize the results.

    Prints the chosen collection and query, then returns a dict with the keys
    "collection", "query", "results" and "summary".
    """
    target, mongo_filter = build_query_from_natural_language(user_input)
    print(f"Running on collection: {target}")
    print(f"Query: {mongo_filter}")

    documents = execute_query(target, mongo_filter)
    return {
        "collection": target,
        "query": mongo_filter,
        "results": documents,
        "summary": summarize_results(documents, user_input),
    }

## **Querying Pipeline Test**

In [None]:
# Placeholder question — replace "str" with a real natural-language query
# before running this cell.
natural_query = "str"
# Runs the full pipeline; the returned dict is the value of this final expression.
run_natural_language_query(natural_query)