In [2]:
from langchain_community.document_loaders.pdf import PyPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import re
import os
import pypdf
from groq import Groq

  from pydantic.v1.fields import FieldInfo as FieldInfoV1
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Location of the annual-report PDF. The default is the original local path; set the
# HCL_REPORT_PDF_PATH environment variable to run the notebook on another machine
# without editing this cell.
PDF_PATH = os.getenv(
    "HCL_REPORT_PDF_PATH",
    r"C:\Users\KIIT\OneDrive\Desktop\Agentic_Enterprise\data\HCL_Financial_Report_2024-25\Annual-Report-2024-25.pdf",
)
# The Groq API key is read from the environment so it is never stored in the notebook.
# os.getenv returns None when the variable is unset.
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
# Groq chat model used by every LLM call in this notebook.
GROQ_MODEL = "llama-3.3-70b-versatile"

In [4]:
# Read the report from PDF_PATH; the loader's result is kept untouched in raw_docs
# so later cells can derive cleaned copies from it.
loader = PyPDFLoader(PDF_PATH)
raw_docs = loader.load()
print(f"Raw pages: {len(raw_docs)}")

Raw pages: 423


In [5]:
# Drop near-empty pages and pages whose text begins with "page", then re-wrap the
# survivors with a trimmed body and a fixed source label.
# NOTE(review): the loader's "page" metadata looks 0-based -- confirm before the
# number is shown to users as a page reference.
clean_docs = []

for doc in raw_docs:
    text = doc.page_content.strip()

    is_too_short = len(text) < 50
    if is_too_short or text.lower().startswith("page"):
        continue

    page_meta = {
        "page": doc.metadata.get("page"),
        "source": "HCL Annual Report 2024-25",
    }
    clean_docs.append(Document(page_content=text, metadata=page_meta))

print(f"After cleaning: {len(clean_docs)}")

After cleaning: 418


In [6]:
# Break the cleaned pages into overlapping chunks. Separators are tried in the
# listed order: paragraph break, line break, sentence end, then single space.
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " "],
    chunk_overlap=150,
    chunk_size=800,
)

chunks = splitter.split_documents(clean_docs)
print(f"Final chunks: {len(chunks)}")

Final chunks: 2071


In [7]:
# Embed every chunk and load the vectors into an inner-product FAISS index.
# The embeddings are L2-normalised, so inner product equals cosine similarity.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Parallel lists: position i in both lists corresponds to vector i in the index.
texts = [chunk.page_content for chunk in chunks]
metadatas = [chunk.metadata for chunk in chunks]

embeddings = embedder.encode(
    texts,
    normalize_embeddings=True,
    convert_to_numpy=True,
)

embedding_dim = embeddings.shape[1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(embeddings)

print(f"FAISS index size: {index.ntotal}")

FAISS index size: 2071


In [8]:
def retrieve(query, k=5, min_score=0.4):
    """Return the indexed chunks most similar to ``query``.

    The query is embedded with the module-level ``embedder`` (normalised) and
    searched against the module-level FAISS ``index``.

    Args:
        query: Natural-language question to search for.
        k: Maximum number of neighbours requested from the index.
        min_score: Minimum similarity score a hit needs to be kept.

    Returns:
        A list of dicts with keys ``"text"``, ``"page"`` and ``"score"``, in the
        order returned by the index. The list is empty when no hit reaches
        ``min_score``.
    """
    q_emb = embedder.encode([query], normalize_embeddings=True)
    scores, idx = index.search(q_emb, k)

    results = []
    for score, i in zip(scores[0], idx[0]):
        # FAISS pads the result with label -1 when fewer than k vectors are
        # available. A negative label must be skipped explicitly, otherwise it
        # would wrap around and pick the last entry of `texts`.
        if i < 0:
            continue
        if score >= min_score:
            results.append({
                "text": texts[i],
                "page": metadatas[i]["page"],
                "score": float(score)
            })

    return results

In [9]:
def build_context(retrieved):
    """Format retrieved hits into one prompt-ready context string.

    Each hit becomes a block consisting of a ``[Page N]`` header line followed by
    the hit's text; blocks are separated by a blank line. An empty input yields
    an empty string.
    """
    return "\n\n".join(
        "[Page {page}]\n{text}".format(page=hit["page"], text=hit["text"])
        for hit in retrieved
    )

In [10]:
# Shared Groq client used by groq_answer, write_email_tool and route_intent.
client = Groq(api_key=GROQ_API_KEY)

def groq_answer(query, context):
    """Ask the Groq chat model to answer ``query`` using only ``context``.

    The system prompt tells the model to act as a strict extraction system and to
    reply with a fixed "cannot find" sentence when the context lacks the answer.
    Temperature is 0.

    Args:
        query: The user's question.
        context: Text assembled from the retrieved chunks.

    Returns:
        The model's reply with surrounding whitespace removed. An empty string is
        returned when the API response carries no text content.
    """
    completion = client.chat.completions.create(
        model=GROQ_MODEL,
        temperature=0,
        messages=[
            {
                "role": "system",
                "content": (
                    "You are a strict factual extraction system.\n"
                    "- Use ONLY the provided context.\n"
                    "- Do NOT infer, estimate, or paraphrase numbers.\n"
                    "- Quote exact sentences for numeric answers.\n"
                    "- If the answer is missing, reply EXACTLY:\n"
                    "I cannot find this information in the provided document."
                )
            },
            {
                "role": "user",
                "content": f"Context:\n{context}\n\nQuestion:\n{query}"
            }
        ]
    )
    # `content` is optional in the chat-completion schema; guard against None so
    # the caller gets a string instead of an AttributeError from `.strip()`.
    content = completion.choices[0].message.content
    return (content or "").strip()

In [11]:
def validate_citations(answer, retrieved):
    """Check that every page cited in ``answer`` belongs to a retrieved chunk.

    Citations are recognised in both ``(Page N)`` and ``[Page N]`` form. The
    bracket form is the header that ``build_context`` puts in front of each
    chunk, so the model may echo it back; it must be validated as well.

    Args:
        answer: Text produced by the model.
        retrieved: Hits as returned by ``retrieve`` (dicts with a ``"page"`` key).

    Returns:
        True when all cited page numbers appear among the retrieved pages. An
        answer without any citation is also accepted. False otherwise.
    """
    cited_pages = {
        int(page) for page in re.findall(r"[(\[]Page (\d+)[)\]]", answer)
    }
    allowed_pages = {r["page"] for r in retrieved}
    return cited_pages.issubset(allowed_pages)

In [12]:
def rag(query):
    """Answer ``query`` from the indexed document via retrieve -> generate -> validate.

    Returns a fixed message when nothing is retrieved, the model's reply when it
    is a "cannot find" refusal or when its citations validate, and a rejection
    message otherwise.
    """
    passages = retrieve(query)
    if not passages:
        return "The document does not contain this information."

    answer = groq_answer(query, build_context(passages))

    # A refusal is passed through without citation checking.
    refused = "cannot find this information" in answer.lower()
    if refused or validate_citations(answer, passages):
        return answer

    return "Answer rejected due to invalid citation."

In [13]:
# End-to-end check of the RAG pipeline with a loosely worded revenue question.
print(rag("what is the revenue in 2025"))

The sentence with the exact revenue for 2025 is: 
"Revenue from operations in the year ended 31 March 2025 increased by 6.5% to ₹117,055 crores from ₹109,913 crores  
in the year ended 31 March 2024." 

So, the revenue in 2025 is ₹117,055 crores.


In [14]:
def write_email_tool(recipient, subject, key_points):
    """Draft a business email with the Groq chat model.

    Args:
        recipient: Addressee, inserted into the prompt as given.
        subject: Subject line, inserted into the prompt as given.
        key_points: Points the email should cover, inserted into the prompt as
            given.

    Returns:
        The drafted email text with surrounding whitespace removed. An empty
        string is returned when the API response carries no text content.
    """
    completion = client.chat.completions.create(
        model=GROQ_MODEL,
        temperature=0.2,
        messages=[
            {
                "role": "system",
                "content": (
                    "Write a concise, professional business email.\n"
                    "No fabricated facts.\n"
                    "No unnecessary fluff."
                )
            },
            {
                "role": "user",
                "content": (
                    f"Recipient: {recipient}\n"
                    f"Subject: {subject}\n"
                    f"Key Points:\n{key_points}"
                )
            }
        ]
    )
    # `content` is optional in the chat-completion schema; guard against None so
    # the caller gets a string instead of an AttributeError from `.strip()`.
    content = completion.choices[0].message.content
    return (content or "").strip()

In [15]:
def schedule_meeting_tool(participants, date, time, agenda):
    """Build a draft meeting record without contacting any calendar service.

    The arguments are copied into the result unchanged. The record is marked
    ``DRAFT_ONLY`` and carries a note that nothing was actually scheduled.
    """
    draft = {"status": "DRAFT_ONLY"}
    draft.update(
        participants=participants,
        date=date,
        time=time,
        agenda=agenda,
    )
    draft["note"] = "Meeting NOT scheduled. External calendar integration required."
    return draft

In [16]:
def route_intent(query):
    """Classify ``query`` into one of the agent's intent labels.

    The Groq chat model is asked to return exactly one of ``DOC_QA``,
    ``WRITE_EMAIL``, ``SCHEDULE_MEETING`` or ``UNKNOWN``. The reply is normalised
    before it is returned: surrounding whitespace and wrapping punctuation are
    removed, and the text is upper-cased. This keeps small formatting deviations
    such as ``doc_qa`` or ``DOC_QA.`` from breaking the exact-match dispatch in
    ``agent``.

    Args:
        query: The user's request.

    Returns:
        One of the four labels. Anything that does not normalise to a known
        label, including an empty or missing reply, yields ``"UNKNOWN"``.
    """
    valid_labels = ("DOC_QA", "WRITE_EMAIL", "SCHEDULE_MEETING", "UNKNOWN")

    completion = client.chat.completions.create(
        model=GROQ_MODEL,
        temperature=0,
        messages=[
            {
                "role": "system",
                "content": (
                    "Classify the intent strictly as ONE label:\n"
                    "DOC_QA\nWRITE_EMAIL\nSCHEDULE_MEETING\nUNKNOWN\n"
                    "Return ONLY the label."
                )
            },
            {
                "role": "user",
                "content": query
            }
        ]
    )
    # `content` is optional in the chat-completion schema.
    raw = completion.choices[0].message.content or ""
    # Strip whitespace plus the quote/markdown/punctuation characters a model
    # commonly wraps a single-word answer in.
    label = raw.strip(" \t\r\n\"'`*.:").upper()
    return label if label in valid_labels else "UNKNOWN"

In [None]:
def agent(query):
    """Route ``query`` by intent and return the agent's reply.

    Document questions are answered through ``rag``. Email and meeting requests
    get a fixed message listing the fields the user must supply. Any other
    intent gets a refusal.
    """
    intent = route_intent(query)

    # Only document questions trigger real work; every other reply is static.
    if intent == "DOC_QA":
        return rag(query)

    canned_replies = {
        "WRITE_EMAIL": (
            "Email drafting requires:\n"
            "- recipient\n"
            "- subject\n"
            "- key points\n"
            "Provide these explicitly."
        ),
        "SCHEDULE_MEETING": (
            "Meeting scheduling requires:\n"
            "- participants\n"
            "- date\n"
            "- time\n"
            "- agenda\n"
            "Provide these explicitly."
        ),
    }
    return canned_replies.get(intent, "I cannot perform this task.")

In [18]:
# Routing check: an email-drafting request.
print(route_intent("Write an email to HR asking for leave tomorrow"))

WRITE_EMAIL


In [19]:
# Routing check: a meeting-scheduling request.
print(route_intent("Schedule a meeting with finance team next Monday"))

SCHEDULE_MEETING


In [20]:
# Routing check: a request outside the supported intents.
print(route_intent("Tell me a joke"))

UNKNOWN


In [21]:
# Routing check: a question about the document.
print(route_intent("What was HCL's total revenue in FY 2024-25?"))

DOC_QA


In [22]:
# Direct call of the meeting tool; it only returns a draft record.
meeting_draft = schedule_meeting_tool(
    agenda="Budget discussion",
    time="11:00 AM",
    date="2026-01-20",
    participants=["finance@hcl.com"],
)
print(meeting_draft)

{'status': 'DRAFT_ONLY', 'participants': ['finance@hcl.com'], 'date': '2026-01-20', 'time': '11:00 AM', 'agenda': 'Budget discussion', 'note': 'Meeting NOT scheduled. External calendar integration required.'}


In [23]:
# Direct call of the email tool with explicit recipient, subject and key points.
leave_email = write_email_tool(
    key_points="Request leave for 12 Jan due to personal reasons",
    subject="Leave Request",
    recipient="hr@hcl.com",
)
print(leave_email)

Subject: Leave Request

Dear HR Team,

I am writing to request a leave of absence on 12 January due to personal reasons. I would appreciate it if you could approve my request.

Thank you for your time and consideration.

Sincerely,
[Your Name]


In [24]:
# End-to-end check of the RAG pipeline with a precisely worded revenue question.
print(rag("What was the total revenue reported in FY 2024-25?"))

According to the provided context, "HCLTech reported a consolidated revenue of $13.8 billion in FY25, an increase of 4.7% from the preceding year in constant currency (CC)."


In [25]:
def run_agent(query: str):
    """Public entry point: pass ``query`` to ``agent`` and return its reply unchanged."""
    reply = agent(query)
    return reply