### Encoding and loading pdfs

In [1]:
import os
import re
from typing import List, Dict, Tuple

import fitz  # PyMuPDF for PDF extraction
import google.generativeai as genai
import numpy as np
import faiss  # Vector search
import requests
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
from dotenv import load_dotenv

# =========================
# Setup
# =========================
load_dotenv()
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
SERPAPI_API_KEY = os.getenv("SERPAPI_API_KEY")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
if not GEMINI_API_KEY:
    raise RuntimeError("GEMINI_API_KEY not found in .env")

genai.configure(api_key=GEMINI_API_KEY)
model = genai.GenerativeModel("gemini-2.5-flash")

# Embeddings model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# =========================
# PDF ingestion + chunking
# =========================
def extract_text_from_pdf(pdf_path: str) -> str:
    doc = fitz.open(pdf_path)
    text = []
    for page in doc:
        text.append(page.get_text())
    return "\n".join(text)

def clean_text(s: str) -> str:
    s = s.replace("\u00a0", " ")
    s = re.sub(r"[ \t]+", " ", s)
    s = re.sub(r"\n{3,}", "\n\n", s)
    return s.strip()

def chunk_text(text: str, chunk_size: int = 1100, overlap: int = 150) -> List[str]:
    chunks, i = [], 0
    while i < len(text):
        chunks.append(text[i:i+chunk_size])
        i += chunk_size - overlap
    return chunks

pdf_folder = "./pdf_files/"
if not os.path.isdir(pdf_folder):
    raise FileNotFoundError(f"Folder '{pdf_folder}' not found. Create it and add PDFs.")

documents, doc_sources = [], []
pdf_chunks, pdf_chunk_meta = [], []

pdf_files = [f for f in os.listdir(pdf_folder) if f.lower().endswith(".pdf")]
if not pdf_files:
    raise RuntimeError("No PDFs found in ./pdf_files/. Please add at least one PDF.")

print(f"Loading {len(pdf_files)} PDFs from {pdf_folder} ...")
for file in tqdm(pdf_files):
    file_path = os.path.join(pdf_folder, file)
    raw = extract_text_from_pdf(file_path)
    raw = clean_text(raw)
    documents.append(raw)
    doc_sources.append(file)

    chunks = chunk_text(raw, chunk_size=1100, overlap=150)
    for ci, c in enumerate(chunks):
        pdf_chunks.append(c)
        pdf_chunk_meta.append({"source": file, "chunk_id": ci})

# Build FAISS over PDF chunks
print("Embedding PDF chunks ...")
pdf_embeddings = embedder.encode(pdf_chunks, convert_to_numpy=True, show_progress_bar=True)
dimension = pdf_embeddings.shape[1]
index_pdf = faiss.IndexFlatL2(dimension)
index_pdf.add(pdf_embeddings.astype(np.float32))

print(f"PDF index built with {len(pdf_chunks)} chunks. Embedding dim={dimension}")

# =========================
# SerpAPI Web Search (no extra SDK)
# =========================
def web_search_serpapi(query: str, k: int = 8, hl: str = "en", gl: str = "es", last_year_only: bool = True) -> List[Dict]:
    """
    Uses SerpAPI (Google) to fetch organic results.
    Returns list of dicts: {title, url, content}
    """
    if not SERPAPI_API_KEY:
        return []

    params = {
        "engine": "google",
        "q": query,
        "api_key": SERPAPI_API_KEY,
        "num": k,            # up to 10 typical
        "hl": hl,            # language UI
        "gl": gl,            # country bias
        "safe": "active",
        "filter": "1",       # remove similar
    }
    # Limit to recent results (last year) if needed
    # tbs options: qdr:d (day), w (week), m (month), y (year)
    if last_year_only:
        params["tbs"] = "qdr:y"

    try:
        r = requests.get("https://serpapi.com/search.json", params=params, timeout=30)
        r.raise_for_status()
        data = r.json()
        org = data.get("organic_results", []) or []
        cleaned = []
        for item in org:
            title = item.get("title", "")
            link = item.get("link", "") or item.get("url", "")
            snippet = item.get("snippet", "") or item.get("content", "")
            if not link:
                continue
            cleaned.append({
                "title": title,
                "url": link,
                "content": clean_text(snippet)[:2000]
            })
        return cleaned
    except Exception as e:
        print(f"[web] SerpAPI error: {e}")
        return []

def embed_texts(texts: List[str]) -> np.ndarray:
    return embedder.encode(texts, convert_to_numpy=True)

# =========================
# Hybrid retrieval (PDF + Web)
# =========================
def retrieve_hybrid(query: str, top_k_pdf: int = 5, top_k_web: int = 4) -> Tuple[List[Dict], List[Dict]]:
    # PDF retrieval via FAISS
    q_emb = embed_texts([query]).astype(np.float32)
    distances, indices = index_pdf.search(q_emb, top_k_pdf)
    pdf_hits = []
    for rank, idx in enumerate(indices[0]):
        score = -float(distances[0][rank])  # higher is better
        pdf_hits.append({
            "text": pdf_chunks[idx],
            "score": score,
            "source": pdf_chunk_meta[idx]["source"],
            "chunk_id": pdf_chunk_meta[idx]["chunk_id"]
        })

    # Web retrieval via SerpAPI + re-rank with embeddings (cosine)
    web_results = web_search_serpapi(query, k=8) if top_k_web > 0 else []
    web_hits = []
    if web_results:
        web_texts = [w["content"] for w in web_results if w.get("content")]
        if web_texts:
            w_embs = embed_texts(web_texts).astype(np.float32)
            qn = q_emb / (np.linalg.norm(q_emb, axis=1, keepdims=True) + 1e-12)
            wn = w_embs / (np.linalg.norm(w_embs, axis=1, keepdims=True) + 1e-12)
            sims = (wn @ qn[0])
            top_idx = np.argsort(-sims)[:top_k_web]
            for i in top_idx:
                w = web_results[i]
                web_hits.append({
                    "text": w["content"],
                    "score": float(sims[i]),
                    "url": w["url"],
                    "title": w.get("title", "")
                })

    return pdf_hits, web_hits

def make_context_blocks(pdf_hits: List[Dict], web_hits: List[Dict], max_chars: int = 2400) -> Tuple[str, List[str]]:
    combined = []
    for i in range(max(len(pdf_hits), len(web_hits))):
        if i < len(pdf_hits): combined.append(("pdf", pdf_hits[i]))
        if i < len(web_hits): combined.append(("web", web_hits[i]))

    context_parts, citations, used = [], [], 0
    for typ, item in combined:
        snippet = item["text"].strip()
        if not snippet:
            continue
        header = f"[PDF:{item['source']}#chunk{item['chunk_id']}]" if typ == "pdf" else f"[WEB:{item.get('title','')}]"
        block = f"{header}\n{snippet}\n"
        if used + len(block) > max_chars:
            break
        context_parts.append(block)
        used += len(block)
        if typ == "web" and item.get("url"):
            citations.append(item["url"])

    return "\n\n---\n\n".join(context_parts), citations

# =========================
# Generation
# =========================
SYSTEM_INSTRUCTIONS = (
    "You are a credit risk and regulation assistant. "
    "Use the provided context (PDF and web snippets) when relevant, "
    "and you may also use your own knowledge. "
    "When you use web snippets, include a 'Sources:' section with URLs."
)

def query_hybrid(prompt: str) -> str:
    pdf_hits, web_hits = retrieve_hybrid(prompt, top_k_pdf=5, top_k_web=4)
    context, urls = make_context_blocks(pdf_hits, web_hits, max_chars=2400)

    final_prompt = (
        f"{SYSTEM_INSTRUCTIONS}\n\n"
        f"User question: {prompt}\n\n"
        f"Context below. Prefer it for factual grounding.\n"
        f"{'-'*40}\n{context}\n{'-'*40}\n\n"
        "If you used any web evidence, add a short 'Sources:' list of URLs at the end."
    )

    response = model.generate_content(final_prompt)
    text = response.text or ""
    if urls:
        uniq_urls = list(dict.fromkeys(urls))
        text += "\n\nSources:\n" + "\n".join(uniq_urls)
    return text


Loading 8 PDFs from ./pdf_files/ ...


100%|██████████| 8/8 [00:00<00:00, 10.83it/s]


Embedding PDF chunks ...


Batches: 100%|██████████| 18/18 [00:16<00:00,  1.12it/s]

PDF index built with 545 chunks. Embedding dim=384





### Evaluation Module

In [4]:
import time
import re
from typing import List, Dict, Optional
import numpy as np
import pandas as pd
from rouge_score import rouge_scorer

# Optional: rouge scorer if available
try:
    _rouge_scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
except Exception:
    _rouge_scorer = None

# NOTE: This cell relies on existing names from the notebook:
# - query_hybrid(prompt) -> str
# - embedder -> SentenceTransformer instance
# If you want to evaluate already-collected responses, set call_model=False
# and provide 'response' in each dataset entry.

def _extract_urls(text: str) -> List[str]:
    return re.findall(r"https?://\S+", text)

def _has_sources_section(text: str) -> bool:
    return bool(re.search(r"\bSources:\b", text, flags=re.IGNORECASE)) or bool(_extract_urls(text))

def _cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    na = np.linalg.norm(a) + 1e-12
    nb = np.linalg.norm(b) + 1e-12
    return float((a @ b.T) / (na * nb))

def evaluate_single(prompt: str,
                    reference: Optional[str] = None,
                    call_model: bool = True) -> Dict:
    """
    Run the genAI model (unless call_model is False) and compute automatic metrics.
    Returns a dict with response, timing, cosine similarity (embedding), rouge-L (if available),
    source detection and simple length stats.
    """
    start = time.time()
    response = query_hybrid(prompt) if call_model else ""
    latency = time.time() - start

    # embeddings-based similarity (requires a reference)
    cos_sim = None
    if reference:
        resp_emb = embedder.encode([response], convert_to_numpy=True)
        ref_emb = embedder.encode([reference], convert_to_numpy=True)
        cos_sim = _cosine_similarity(resp_emb[0], ref_emb[0])

    # rouge-l (optional)
    rouge_l_f = None
    if reference and _rouge_scorer is not None:
        scr = _rouge_scorer.score(reference, response)
        rouge_l_f = scr["rougeL"].fmeasure

    urls = _extract_urls(response)
    has_sources = _has_sources_section(response)

    return {
        "prompt": prompt,
        "response": response,
        "reference": reference,
        "latency_s": latency,
        "len_response_chars": len(response),
        "num_urls": len(urls),
        "has_sources_section": has_sources,
        "cosine_sim": cos_sim,
        "rougeL_f": rouge_l_f,
        "urls": urls,
    }

def evaluate_dataset(dataset: List[Dict],
                     call_model: bool = True,
                     show_progress: bool = True) -> pd.DataFrame:
    """
    dataset: list of {"prompt": str, "reference": Optional[str], "response": Optional[str]}
    If call_model=True, 'response' is ignored and query_hybrid is invoked for each prompt.
    If call_model=False, 'response' must be present in each entry.
    Returns a pandas DataFrame with metrics per item and prints a short summary.
    """
    results = []
    for i, item in enumerate(dataset):
        if show_progress:
            print(f"Evaluating {i+1}/{len(dataset)}", end="\r")
        prompt = item.get("prompt")
        reference = item.get("reference")
        if not call_model and "response" in item:
            # Use provided response
            resp = item["response"]
            # We create a tiny wrapper to avoid calling model
            def _dummy_query(_p): return resp
            global query_hybrid  # temporarily swap
            _orig = query_hybrid
            query_hybrid = _dummy_query
            try:
                r = evaluate_single(prompt, reference, call_model=True)
            finally:
                query_hybrid = _orig
        else:
            r = evaluate_single(prompt, reference, call_model=call_model)
        results.append(r)
    df = pd.DataFrame(results)
    # summary
    avg_cos = df["cosine_sim"].dropna().mean() if "cosine_sim" in df else None
    avg_rouge = df["rougeL_f"].dropna().mean() if "rougeL_f" in df else None
    pct_with_sources = 100.0 * df["has_sources_section"].mean() if not df.empty else 0.0
    print("\n--- Evaluation summary ---")
    print(f"Items: {len(df)}")
    if avg_cos is not None:
        print(f"Avg embedding cosine similarity (resp vs ref): {avg_cos:.4f}")
    if avg_rouge is not None:
        print(f"Avg ROUGE-L F1: {avg_rouge:.4f}")
    print(f"% responses including a 'Sources' section or URLs: {pct_with_sources:.1f}%")
    print(f"Avg latency (s): {df['latency_s'].mean():.2f}")
    return df

In [6]:
q1 = "Where does Capitalflow operate and what does it do?"
print("Q:", q1)
print(query_hybrid(q1))
print("\n" + "="*80 + "\n")
q2 = "Summarize PD calculation approaches relevant to Capitalflow's portfolio. Include regulatory references if possible."
print("Q:", q2)
print(query_hybrid(q2))

Q: Where does Capitalflow operate and what does it do?
Capitalflow is based in **Dublin, Ireland**.

It operates as a **Diversified lending Non-Bank Financial Institution (NBFI)**.

Sources:
*   [https://www.crunchbase.com/organization/capitalflow](https://www.crunchbase.com/organization/capitalflow)

Sources:
https://tracxn.com/d/companies/capitalflow/__eYxFS6jERHrPNkjB4s9uenQZsLlB529jQzaWC2Eag7E


Q: Summarize PD calculation approaches relevant to Capitalflow's portfolio. Include regulatory references if possible.
Capitalflow employs internal-data-driven approaches for Probability of Default (PD) calculation, primarily within the framework of its IFRS 9 Expected Credit Loss (ECL) models.

**Capitalflow's PD Calculation Approaches:**

1.  **Internal Data Utilisation:** Capitalflow leverages its extensive internal data, specifically from Q3 2016 to Q4 2024, for model building and recalibration of PD.
2.  **Portfolio Segmentation:** The models are built with clear segmentation by portfo

In [None]:
# Example usage:
# Provide references (gold answers) when available.
dataset = [
    {"prompt": q1, "reference": "CapitalFlow Group (CFG) is an Irsish subsidiary of bunq; it operates in Ireland and Northern Ireland offering CRE, Asset Finance and Invoice Discounting."}
    #{"prompt": q2, "reference": "<short gold summary of PD calculation approaches>"},
]
df_metrics = evaluate_dataset(dataset, call_model=True)
display(df_metrics)

Evaluating 1/1
--- Evaluation summary ---
Items: 1
Avg embedding cosine similarity (resp vs ref): 0.5787
Avg ROUGE-L F1: 0.1000
% responses including a 'Sources' section or URLs: 100.0%
Avg latency (s): 3.65


Unnamed: 0,prompt,response,reference,latency_s,len_response_chars,num_urls,has_sources_section,cosine_sim,rougeL_f,urls
0,Where does Capitalflow operate and what does i...,"Capitalflow operates from **Dublin, Ireland**....",CapitalFlow Group (CFG) is an Irish subsidiary...,3.645576,348,2,True,0.578698,0.1,[https://www.crunchbase.com/organization/capit...
