In [1]:
!pip -q install pymupdf sentence-transformers faiss-cpu ddgs mistralai==0.4.2

In [2]:
import os
import re
from pathlib import Path

In [3]:
import fitz
import faiss
from sentence_transformers import SentenceTransformer
from ddgs import DDGS
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

In [4]:
#Data Loading
#Embedding and LLM model

In [5]:
PDF_PATH = "/content/Data.pdf"
assert Path(PDF_PATH).exists()

EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
LLM_MODEL = "mistral-large-latest"

In [6]:
from google.colab import userdata
MISTRAL_API_KEY=userdata.get("MISTRAL_API_KEY")
mistral = MistralClient(api_key=MISTRAL_API_KEY)

In [7]:
#Parameters

In [8]:
TOP_K = 5
RETRIEVAL_THRESHOLD = 0.45
EVIDENCE_THRESHOLD = 0.40
WEB_MAX_RESULTS = 5

CHUNK_SIZE = 500
OVERLAP = 100
SUMMARY_SAMPLE_RATE = 8

In [9]:
#Text Extraction

In [10]:
def extract_text_pymupdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = []
    for i in range(len(doc)):
        text.append(f"PAGE_{i+1}")
        text.append(doc[i].get_text("text"))
    doc.close()
    return "\n".join(text)

In [11]:
#Cleaning Text

In [12]:
def clean_text(text):
    out = []
    for line in text.splitlines():
        line = line.strip()
        line = re.sub(r"^\(\d+\)\s*", "", line)

        if not line:
            continue

        if line.startswith("PAGE_"):
            out.append(line)
            continue

        if line.isdigit():
            continue

        if len(line) == 1 and line.isalpha() and line.isupper():
            continue

        out.append(re.sub(r"\s+", " ", line))

    return "\n".join(out)


In [13]:
#Chunking

In [14]:
def chunk_text(text, chunk_size=1000, overlap=100):
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        chunk = " ".join(words[start : start + chunk_size])
        chunks.append(chunk)
        start += chunk_size - overlap
    return chunks


raw_text = extract_text_pymupdf(PDF_PATH)
cleaned_text = clean_text(raw_text)
chunks = chunk_text(cleaned_text)

print(cleaned_text[:500])

PAGE_1
Account closure (depositor account)
The closure of beneficiary and pool accounts by the investor and the clearing member or at the
discretion of the participant, if the client has defaulted in its obligations towards the participant.
Accounts Payable
A current liability showing the amounts due to others within a period of one year when such liability
resulted from the purchase or manufacturing of inventory.
Accounts Receivable
Any money due to a business for merchandise or securities that


In [15]:
#Glossary

In [16]:
def parse_glossary_entries(text):
    lines = [l.strip() for l in text.splitlines() if l.strip()]
    entries = []
    term = None
    definition = []
    pages = set()
    current_page = None

    def flush():

        nonlocal term, definition, pages
        if term and definition:
            entries.append({
                "term": term,
                "definition": " ".join(definition),
                "pages": sorted(list(pages))
            })
        term, definition, pages = None, [], set()

    for line in lines:
        if line.startswith("PAGE_"):
            try:
                current_page = int(line.replace("PAGE_", ""))
            except:
                current_page = None
            continue

        is_term = len(line) <= 80 and not line.endswith(".") and not line.isdigit()

        if is_term:
            flush()
            term = line
            if current_page is not None:
                pages.add(current_page)
        else:
            if term:
                definition.append(line)
                if current_page is not None:
                    pages.add(current_page)

    flush()
    return entries


In [17]:
#Embeddings
#FAISS index

In [18]:
entries = parse_glossary_entries(cleaned_text)
entries_text = [f"{e['term']}: {e['definition']}" for e in entries]

embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)
entry_embeddings = embedder.encode(
    entries_text,
    convert_to_numpy=True,
    normalize_embeddings=True,
    show_progress_bar=True
).astype("float32")

dim = entry_embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(entry_embeddings)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading weights:   0%|          | 0/103 [00:00<?, ?it/s]

BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Batches:   0%|          | 0/22 [00:00<?, ?it/s]

In [19]:
def is_summary_request(query):
    q = query.lower().strip()
    triggers = [
        "summary", "summarize", "summarise",
        "pdf summary", "give pdf summary", "summary of the pdf",
        "summarize the pdf", "summarize this pdf", "summarize this document"
    ]
    return any(t in q for t in triggers)


In [20]:
def is_direct_definition_query(q):
    ql = q.strip().lower()
    return ql.startswith("what is") or ql.startswith("define") or ql.startswith("meaning of")

In [21]:
def is_indirect_query(q):
    ql = q.lower()
    triggers = [
        "compare", "difference", "vs", "versus",
        "based on", "which term", "what term", "term applies",
        "scenario", "suppose", "if a", "explain", "why",
        "and also", "both", "advantages", "disadvantages"
    ]
    return any(t in ql for t in triggers)


In [22]:
def extract_key_terms(query):
    stop = {"what","is","the","a","an","of","for","to","in","and","based","definition","definitions","pdf"}
    # Find all words using regex
    words = re.findall(r"[a-zA-Z]+", query.lower())
    # Return only the meaningful words
    return [w for w in words if w not in stop and len(w) > 2]


In [23]:
def retrieve_pdf(query, top_k=3):

    q_emb = embedder.encode([query], convert_to_numpy=True, normalize_embeddings=True).astype("float32")

    scores, ids = index.search(q_emb, top_k)

    hits = []
    for s, i in zip(scores[0], ids[0]):
        if i == -1:
            continue

        e = entries[int(i)]
        hits.append({
            "score": float(s),
            "entry": f"{e['term']}: {e['definition']}",
            "pages": e.get("pages", []),
            "id": int(i)
        })
    return hits


In [24]:
def filter_strict_keyword(hits, query):
    key_terms = extract_key_terms(query)

    if not key_terms:
        return hits

    out = []
    for h in hits:
        if all(t in h["entry"].lower() for t in key_terms):
            out.append(h)

    return out


In [25]:
def web_search_ddg(query, max_results=3):
    results = []
    with DDGS() as ddgs:
        for r in ddgs.text(query, max_results=max_results):
            results.append({
                "title": r.get("title", "").strip(),
                "body": r.get("body", "").strip(),
                "href": r.get("href", "").strip()
            })
    return results


In [26]:
def llm_synthesize_from_pdf(query, evidence_entries):

    evidence_block = "\n".join(f"- {e}" for e in evidence_entries)
    msgs = [
        ChatMessage(
            role="system",
            content="You must answer using ONLY the provided PDF evidence. "
                    "If the evidence is insufficient, reply exactly: "
                    "Not available in the provided PDF evidence. "
                    "Output 3 to 7 concise bullet points only."
                    "Do NOT use Markdown or formatting characters (*, **, _, #)."
        ),
        ChatMessage(role="user", content=f"QUESTION:\n{query}\n\nPDF EVIDENCE:\n{evidence_block}")
    ]

    resp = mistral.chat(model=LLM_MODEL, messages=msgs, temperature=0.0)
    return resp.choices[0].message.content.strip()


In [44]:
def llm_from_web(query, web_hits):
    snippets = "\n".join(f"- {h['body']}" for h in web_hits if h.get("body"))
    if not snippets.strip():
        return "No information could be found for the query."
    msgs = [
        ChatMessage(
            role="system",
            content="You are a factual extractor.\n"
                    "Rules:\n"
                    "- Use ONLY the provided web snippets\n"
                    "- Extract ONLY facts that directly answer the question\n"
                    "- Do NOT add explanations, context, or background\n"
                    "- Output 1 to 5 short bullet points (no sentences longer than 20 words)\n"
                    "- No introduction, no conclusion, no headings\n"
                    "- If snippets are insufficient, output exactly:\n"
                    "No information could be found for the query."
                    "Do NOT use Markdown or formatting characters (*, **, _, #).\n"

        ),
        ChatMessage(role="user", content=f"QUESTION:\n{query}\n\nWEB SNIPPETS:\n{snippets}")
    ]
    resp = mistral.chat(model=LLM_MODEL, messages=msgs, temperature=0.0)
    return resp.choices[0].message.content.strip()

In [28]:
def get_page_text(cleaned_text, page_no):
    start_marker = f"PAGE_{page_no}"
    end_marker = f"PAGE_{page_no+1}"
    start = cleaned_text.find(start_marker)
    if start == -1:
        return ""
    start += len(start_marker)
    end = cleaned_text.find(end_marker, start)
    if end == -1:
        end = len(cleaned_text)
    return cleaned_text[start:end].strip()

In [29]:
def is_page_summary_request(query):
    m = re.search(r"(?:page\s*(\d+)\s*summary)|(?:summary\s*page\s*(\d+))", query.lower())
    if not m:
        return None
    return int(m.group(1) or m.group(2))

In [30]:
def summarize_pdf_fast(sample_rate=SUMMARY_SAMPLE_RATE, page_no=None):
    if page_no is not None:
        page_text = get_page_text(cleaned_text, page_no)
        if not page_text:
            return f"No text found for page {page_no}."
        resp = mistral.chat(
            model=LLM_MODEL,
            messages=[
                ChatMessage(
                    role="system",
                    content=(
                        "Summarize the page as a whole (high-level), not a list of definitions.\n"
                        "Rules:\n"
                        "- Output exactly 5 bullet points\n"
                        "- Each bullet must be <= 18 words\n"
                        "- Focus on the overall topic + the most important ideas\n"
                        "- Summarize by grouping related ideas into high-level themes; do NOT list or define individual terms.\n"
                        "- Do NOT use Markdown or formatting characters (*, **, _, #)."
                    )
                ),
                ChatMessage(role="user", content=page_text)
            ],
            temperature=0.0
        )
        return resp.choices[0].message.content.strip()

    sampled_chunks = chunks[::sample_rate]
    partials = []

    for ch in sampled_chunks:
        resp = mistral.chat(
            model=LLM_MODEL,
            messages=[
                ChatMessage(
                    role="system",
                    content=(
                        "Extract high-level themes from this text.\n"
                        "Rules:\n"
                        "- Output 3 bullet points maximum\n"
                        "- Group related ideas; do NOT list or define individual terms\n"
                        "- Do NOT use Markdown or formatting characters (*, **, _, #)\n"
                        "- Use ONLY the provided text"
                    )
                ),
                ChatMessage(role="user", content=ch)
            ],
            temperature=0.0
        )
        partials.append(resp.choices[0].message.content.strip())

    final_resp = mistral.chat(
        model=LLM_MODEL,
        messages=[
            ChatMessage(
                role="system",
                content=(
                    "Output exactly 10-15 bullet points.\n"
                    "Rules:\n"
                    "- No headings, no tables\n"
                    "- Group related ideas into high-level themes; do NOT list or define individual terms\n"
                    "- Do NOT use Markdown or formatting characters (*, **, _, #)\n"
                    "- No repetition"
                )
            ),
            ChatMessage(role="user", content="\n".join(partials))
        ],
        temperature=0.0
    )
    return final_resp.choices[0].message.content.strip()


In [31]:
def web_fallback(query):
    try:
        web_hits = web_search_ddg(query, max_results=WEB_MAX_RESULTS)
    except Exception:
        return "No information could be found for the query."

    if not web_hits:
        return "No information could be found for the query."

    return llm_from_web(query, web_hits)

In [46]:
#Agent

In [32]:
def agent(query):

    page_no = is_page_summary_request(query)
    if page_no is not None:
        return summarize_pdf_fast(page_no=page_no)

    if is_summary_request(query):
        return summarize_pdf_fast()



    hits = retrieve_pdf(query, top_k=TOP_K)

    if is_direct_definition_query(query):
        strict_hits = filter_strict_keyword(hits, query)
        strict_hits = [h for h in strict_hits if h["score"] >= RETRIEVAL_THRESHOLD]

        if strict_hits:
            return "\n".join(f"- {h['entry']}" for h in strict_hits)

        return web_fallback(query)


    if is_indirect_query(query):
        evidence_hits = [h for h in hits if h["score"] >= EVIDENCE_THRESHOLD]
        evidence_entries = [h["entry"] for h in evidence_hits[:TOP_K]]

        if evidence_entries:
            ans = llm_synthesize_from_pdf(query, evidence_entries)


            if "not available in the provided pdf evidence" in ans.lower():
              return web_fallback(query)
            return ans

        return web_fallback(query)

    best_hits = [h for h in hits if h["score"] >= RETRIEVAL_THRESHOLD]
    if best_hits:
        return "\n".join(f"- {h['entry']}" for h in best_hits[:TOP_K])

    return web_fallback(query)


In [53]:
print(agent("What is Accounts Receivable?"))

- Accounts Receivable: Any money due to a business for merchandise or securities that it has sold or for services it has rendered. This is a key determinant in analyzing a company’s liquidity.


In [51]:
print(agent("comapre Bull and Bear market."))

- Bear Market: A weak or falling market characterized by the dominance of sellers.
- Bull Market: A rising market with abundance of buyers and relatively few sellers.
- Bull: A market player who believes prices will rise and would, therefore, purchase a financial instrument with a view to selling it at a higher price. Opposite of a bear.
- Bear Hug: A variety of takeover strategy that seeks to hurry target company managements to recommend acceptance of a tender offer in a short period of time.
- Bear: A pessimist market operator who expects the market price of shares to decline. The term also refers to the one who has sold shares which he does not possess, in the hope of buying them back at a lower price, when the market price of the shares come down in the near future.


In [52]:
print(agent("Who is the current CEO of Apple?"))

- Tim Cook is the current CEO of Apple.
- Cook has been Apple's CEO since 2011.
- He was appointed CEO in August 2011.


In [34]:
print(agent("page 2 summary "))

- Financial market tools and metrics assess security performance, risk, and market trends.
- Stock exchanges regulate listings, trading permissions, and investor communications for transparency.
- Technical indicators like advance/decline lines gauge overall market strength beyond major indices.
- Investment professionals provide guidance, execute trades, and facilitate cross-border securities access.
- Options and securities like ADRs enable flexible trading and global investment opportunities.


In [35]:
print(agent("Give Acid Test Ratio definition and also the latest RBI repo rate."))

- Acid-test ratio (quick ratio) measures a firm’s ability to cover short-term liabilities using liquid assets.
- It excludes inventory and other less liquid assets from current assets.
- A ratio of 1.0 or more indicates ability to pay short-term obligations.
- Current RBI repo rate is 5.25%.


In [38]:
print(agent("Define Accounts Receivable from the PDF and compare it with Accounts Payable. Also explain why both are important for liquidity management."))

- Accounts receivable (AR) are funds a company expects to receive for delivered goods or services, listed as a current asset.
- Accounts payable (AP) are funds a company expects to pay in the future, listed as a current liability.
- AR represents money owed to the company; AP represents money the company owes.
- Both AR and AP impact cash flow and working capital, affecting liquidity management.
- Efficient management of AR and AP ensures timely cash inflows and outflows, maintaining financial stability.


In [37]:
print(agent("Page 29 summary"))

- Financial instruments and securities have assigned values and structured trading mechanisms.
- Investment funds can be grouped, interconnected, or specialized under shared management.
- Market orders and accounting methods dictate trade execution and financial reporting practices.
- Economic downturns severely impact asset prices, institutions, and overall financial stability.
- Protective measures and obligations help manage risk and long-term commitments in finance.


In [39]:
print(agent("what is asset stripper?"))

- Asset Stripper: A person who buys a company in order to make profit by peeling off its assets bit by bit, and then selling them. These assets may be separate subsidiaries or plant and equipment or property. This process invariably involves the stripping of another sort of asset (the employees) of a number of jobs. This has been largely responsible for giving asset strippers a bad name. The asset stripper relies on there being a difference in the price of the business as a whole (as valued by a stock market, for example) and the sum of the amounts that can be raised from its parts sold separately. Such a possibility arises most commonly when a company is making losses or a much smaller profit than seems to be justified by its size.


In [47]:
print(agent("Explain finance"))

- Finance involves managing money, including borrowing, lending, and investing.
- Leverage is using borrowed money to finance an investment.
- Stock lending is when a security owner lends it to a third party for a set time, often for short position coverage or arbitrage.
- The money market deals with short-term non-equity debt instruments like treasury bills and commercial paper.
- Institutionalization refers to the growing influence of institutional investors in financial markets over individual investors.
- Derivatives are securities derived from underlying assets like debt instruments, shares, or loans.


In [49]:
print(agent("query xyzabc"))

No information could be found for the query.


In [50]:
print(agent(
    "Define Accounts Receivable from the PDF, compare it with Accounts Payable, "
    "and explain why both matter for liquidity management."
))

- Accounts receivable (AR) are funds a company expects to receive from customers for goods or services already delivered.
- AR is listed as a current asset on the balance sheet, representing future income.
- Accounts payable (AP) are funds a company owes to suppliers or vendors for goods or services received.
- AP is recorded as a liability on the balance sheet, representing cash outflow.
- AR and AP matter for liquidity management as they impact cash inflows (AR) and outflows (AP).


In [42]:
print(agent("who is wpl 2026 winners?"))

- WPL 2026 winner: Royal Challengers Bengaluru
- Runners-up: Delhi Capitals
- Orange Cap winner: Smriti Mandhana
- Purple Cap winner: Sophie Devine
- Prize money for winners (RCB): ₹6 crore


In [45]:
print(agent("who is ipl 2026 winners"))

No information could be found for the query.
