In [None]:
import os
import io
import pdfplumber
import fitz
import pytesseract
import pandas as pd
from PIL import Image

from sentence_transformers import SentenceTransformer
import chromadb
from rank_bm25 import BM25Okapi

from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# ============================================================
# -------------- DOCUMENT INGESTION FUNCTIONS ----------------
# ============================================================

def extract_pdf_text(path):
    """Return the text of every page of the PDF at *path*, joined by newlines.

    A page for which ``extract_text()`` yields nothing contributes an empty
    string. If anything goes wrong, a ``[WARN]`` line is printed and an
    empty string is returned instead of raising.
    """
    page_texts = []
    try:
        with pdfplumber.open(path) as document:
            for page in document.pages:
                extracted = page.extract_text()
                page_texts.append(extracted if extracted else "")
    except Exception as e:
        print(f"[WARN] Failed to extract text from PDF {path}: {e}")
        return ""
    return "\n".join(page_texts)

def ocr_image_bytes(image_bytes):
    """Run OCR on an encoded image held in memory and return the text.

    Args:
        image_bytes: Raw bytes of an encoded image (as read from a file or
            extracted from a PDF).

    Returns:
        The string produced by ``pytesseract.image_to_string``, or ``""``
        when the image cannot be decoded or OCR fails.
    """
    try:
        image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        return pytesseract.image_to_string(image)
    except Exception as e:
        # Best-effort: one bad image must not abort ingestion, but the failure
        # is reported (a bare ``except:`` would also hide KeyboardInterrupt).
        print(f"[WARN] OCR failed: {e}")
        return ""

def extract_pdf_images(path):
    """OCR every image embedded in the PDF at *path*.

    Args:
        path: Filesystem path of the PDF.

    Returns:
        A list with one OCR string per image reference found on each page
        (an entry is ``""`` when OCR of that image failed). If the document
        cannot be opened or read, a ``[WARN]`` line is printed and whatever
        was collected so far is returned.
    """
    ocr_list = []
    try:
        # The context manager closes the document even if extraction fails
        # part-way through.
        with fitz.open(path) as pdf:
            for page in pdf:
                for img in page.get_images(full=True):
                    xref = img[0]  # first tuple item is the image xref
                    base = pdf.extract_image(xref)
                    ocr_list.append(ocr_image_bytes(base["image"]))
    except Exception as e:
        print(f"[WARN] Image extraction failed: {e}")
    return ocr_list

def extract_tables_csv_or_excel(path):
    """Load a CSV or Excel file and return it as a one-element markdown list.

    Args:
        path: Filesystem path. Files ending in ``.xlsx``, ``.xlsm`` or
            ``.xls`` (case-insensitive) are read with ``pandas.read_excel``;
            everything else is read with ``pandas.read_csv``.

    Returns:
        ``[markdown_table]`` on success, ``[]`` when the file cannot be read
        or rendered (a ``[WARN]`` line is printed in that case).
    """
    excel_suffixes = (".xlsx", ".xlsm", ".xls")
    try:
        if path.lower().endswith(excel_suffixes):
            df = pd.read_excel(path)
        else:
            df = pd.read_csv(path)
        return [df.to_markdown(index=False)]
    except Exception as e:
        # Best-effort ingestion: report the failure instead of hiding it.
        print(f"[WARN] Failed to read table from {path}: {e}")
        return []

def ingest_pdf(path):
    """Build the ingestion record for one PDF.

    The record has four keys: ``source`` (file name), ``text`` (page text),
    ``tables`` (markdown tables) and ``images_ocr`` (OCR text per image).
    """
    record = {"source": os.path.basename(path)}
    record["text"] = extract_pdf_text(path)
    # NOTE(review): this hands the PDF path to the CSV/Excel reader, which
    # parses with pandas.read_csv / read_excel. For a real PDF that call
    # presumably fails and yields [] - confirm whether PDF table extraction
    # was intended here.
    record["tables"] = extract_tables_csv_or_excel(path)
    record["images_ocr"] = extract_pdf_images(path)
    return record

def ingest_image(path):
    """Build the ingestion record for one standalone image file.

    Args:
        path: Filesystem path of the image.

    Returns:
        A record with ``source`` set to the file name, empty ``text`` and
        ``tables``, and ``images_ocr`` holding a single OCR string (``""``
        when the file cannot be read or OCR fails).
    """
    try:
        # ``with`` guarantees the handle is closed; the original leaked it.
        with open(path, "rb") as fh:
            ocr = ocr_image_bytes(fh.read())
    except Exception as e:
        print(f"[WARN] Failed to read image {path}: {e}")
        ocr = ""
    return {
        "source": os.path.basename(path),
        "text": "",
        "tables": [],
        "images_ocr": [ocr],
    }

def ingest_table(path):
    """Build the ingestion record for one CSV/Excel file.

    Only ``tables`` is populated; ``text`` is empty and ``images_ocr`` is an
    empty list.
    """
    file_name = os.path.basename(path)
    markdown_tables = extract_tables_csv_or_excel(path)
    return {
        "source": file_name,
        "text": "",
        "tables": markdown_tables,
        "images_ocr": [],
    }

def ingest_document_set(pdf_paths=None, image_paths=None, table_paths=None):
    """Ingest a mixed set of files into a flat list of document records.

    Args:
        pdf_paths: Iterable of PDF paths, or ``None`` for none.
        image_paths: Iterable of image paths, or ``None`` for none.
        table_paths: Iterable of CSV/Excel paths, or ``None`` for none.

    Returns:
        One record per input path, ordered PDFs first, then images, then
        tables.
    """
    # ``None`` defaults replace the shared mutable ``[]`` defaults.
    pdf_paths = () if pdf_paths is None else pdf_paths
    image_paths = () if image_paths is None else image_paths
    table_paths = () if table_paths is None else table_paths

    docs = [ingest_pdf(p) for p in pdf_paths]
    docs.extend(ingest_image(p) for p in image_paths)
    docs.extend(ingest_table(p) for p in table_paths)
    return docs

# ============================================================
# ------------------------ CHUNKING --------------------------
# ============================================================

def chunk_text(text, source, size=600, overlap=100):
    """Split *text* into overlapping word-window chunks.

    Args:
        text: Text to split; words are whitespace-separated.
        source: Source name stored on each chunk and used in its ``chunk_id``.
        size: Maximum number of words per chunk (must be positive).
        overlap: Number of words shared by consecutive chunks
            (``0 <= overlap < size``).

    Returns:
        A list of chunk dicts with keys ``chunk_id``, ``content``, ``type``
        (always ``"text"``) and ``source``. Empty text yields ``[]``.

    Raises:
        ValueError: If ``size`` or ``overlap`` would make the window not
            advance (this previously looped forever).
    """
    if not text:
        return []
    if size <= 0:
        raise ValueError("size must be a positive number of words")
    if not 0 <= overlap < size:
        raise ValueError("overlap must satisfy 0 <= overlap < size")

    words = text.split()
    step = size - overlap
    chunks = []
    for cid, start in enumerate(range(0, len(words), step)):
        chunks.append({
            "chunk_id": f"{source}_text_{cid}",
            "content": " ".join(words[start:start + size]),
            "type": "text",
            "source": source,
        })
        # Once a window reaches the last word, any later window would only
        # repeat overlap words already contained in this chunk.
        if start + size >= len(words):
            break
    return chunks

def chunk_table(md_table, source, rows_per_chunk=8):
    """Split a markdown table into row groups that each repeat the header.

    The first two lines of *md_table* are treated as the header and are
    prepended to every group of up to *rows_per_chunk* body lines. The
    ``chunk_id`` suffix is the body-row offset at which the group starts.
    A table with no body lines (or an empty string) yields ``[]``.
    """
    if not md_table:
        return []

    lines = md_table.split("\n")
    head_lines = lines[:2]
    data_lines = lines[2:]

    return [
        {
            "chunk_id": f"{source}_table_{offset}",
            "content": "\n".join(
                head_lines + data_lines[offset:offset + rows_per_chunk]
            ),
            "type": "table",
            "source": source,
        }
        for offset in range(0, len(data_lines), rows_per_chunk)
    ]

def chunk_image_text(text, source, index=0):
    """Wrap the OCR text of one image as a single chunk.

    Args:
        text: OCR output for the image.
        source: Source name stored on the chunk and used in its ``chunk_id``.
        index: Position of the image within its source, used as the
            ``chunk_id`` suffix so several images from one source can get
            distinct IDs. Defaults to ``0``.

    Returns:
        A one-element list holding the chunk dict, or ``[]`` when *text* is
        empty or contains only whitespace.
    """
    # Whitespace-only OCR output carries no retrievable content.
    if not text or not text.strip():
        return []
    return [{
        "chunk_id": f"{source}_img_{index}",
        "content": text,
        "type": "image",
        "source": source,
    }]

# ============================================================
# ---------------------- INDEXING -----------------------------
# ============================================================

# Shared sentence-embedding model: used to embed chunks when the dense index
# is built and to embed queries at retrieval time.
embedder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
# Chroma client through which every dense collection below is fetched,
# created and deleted.
# NOTE(review): presumably an in-memory (non-persistent) client with these
# default arguments - confirm against the installed chromadb version.
chroma_client = chromadb.Client()

def initialize_dense_index(name="enterprise_dense"):
    """Return the Chroma collection called *name*, creating it if missing.

    Args:
        name: Collection name.

    Returns:
        The existing or newly created collection.
    """
    # One explicit call replaces "get, and on any exception create", which
    # used a bare ``except:`` and so masked unrelated errors.
    return chroma_client.get_or_create_collection(name)

def build_dense_index(collection, chunks):
    """Rebuild the dense collection from scratch with embeddings of *chunks*.

    The collection named like *collection* is deleted and recreated so that
    re-running the build does not accumulate duplicate entries.

    Args:
        collection: Existing collection; only its ``name`` is used.
        chunks: Chunk dicts with ``chunk_id``, ``content``, ``source`` and
            ``type`` keys.

    Returns:
        The freshly created collection (empty when *chunks* is empty).
    """
    name = collection.name

    # clear to avoid duplicates
    try:
        chroma_client.delete_collection(name)
    except Exception as e:
        # Deletion is best-effort, but the reason is reported rather than
        # silently dropped.
        print(f"[WARN] Could not delete collection {name}: {e}")
    collection = chroma_client.create_collection(name)

    # Nothing to embed; skip encode/add, which are not meant for empty input.
    if not chunks:
        return collection

    docs = [c["content"] for c in chunks]
    ids = [c["chunk_id"] for c in chunks]
    metas = [{"source": c["source"], "type": c["type"]} for c in chunks]

    embeds = embedder.encode(docs, convert_to_numpy=True)

    collection.add(
        embeddings=embeds.tolist(),
        ids=ids,
        documents=docs,
        metadatas=metas,
    )

    return collection

def initialize_bm25(chunks):
    """Build a BM25Okapi index over the whitespace-tokenized chunk contents.

    Documents are indexed in the same order as *chunks*.
    """
    tokenized_docs = []
    for chunk in chunks:
        tokenized_docs.append(chunk["content"].split())
    return BM25Okapi(tokenized_docs)

# ============================================================
# -------------------- HYBRID RETRIEVAL -----------------------
# ============================================================

def hybrid_retrieve(query, chunks, dense_db, bm25, k=5):
    """Rank *chunks* for *query* by fusing dense and BM25 scores.

    Args:
        query: Query string.
        chunks: Chunk dicts in the same order used to build *bm25*.
        dense_db: Chroma collection holding the chunk embeddings.
        bm25: BM25 index built over *chunks*.
        k: Number of results to return.

    Returns:
        Up to *k* dicts with ``chunk_id``, ``content``, ``source``, ``type``
        and ``fused_score``, best first. Returns ``[]`` when there are no
        chunks or ``k`` is not positive.
    """
    if not chunks or k <= 0:
        return []

    # Ask for a distance to every indexed chunk, but never for more results
    # than the collection holds (and never for zero).
    n_results = min(len(chunks), dense_db.count())
    dmap = {}
    if n_results > 0:
        q_emb = embedder.encode([query], convert_to_numpy=True)[0]
        dense_res = dense_db.query(
            query_embeddings=[q_emb.tolist()],
            n_results=n_results,
        )
        dmap = dict(zip(dense_res["ids"][0], dense_res["distances"][0]))

    sparse_scores = bm25.get_scores(query.split())

    results = []
    for i, ch in enumerate(chunks):
        cid = ch["chunk_id"]

        # distance -> similarity; a chunk missing from the dense results gets
        # a large distance so its dense contribution is close to zero.
        dsim = 1 / (1 + dmap.get(cid, 999))

        # Squash the unbounded BM25 score into [0, 1); non-positive scores
        # contribute nothing.
        bm = sparse_scores[i]
        bsim = bm / (1 + bm) if bm > 0 else 0

        results.append({
            "chunk_id": cid,
            "content": ch["content"],
            "source": ch["source"],
            "type": ch["type"],
            "fused_score": dsim + bsim,
        })

    return sorted(results, key=lambda x: x["fused_score"], reverse=True)[:k]

# ============================================================
# ----------------------- AGENT ------------------------------
# ============================================================

# Hugging Face model used by run_agent to generate answers.
model_id = "microsoft/phi-1_5"   
# Tokenizer and causal-LM weights are both loaded from the same model id.
tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map="auto" leaves device placement to the library.
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto")
# Text-generation pipeline wrapping the model/tokenizer pair; run_agent calls
# it as llm(prompt, ...) and reads [0]["generated_text"].
llm = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Prompt template rendered with str.format(evidence=..., query=...).
# The braces of the literal JSON example are doubled so that str.format
# emits them verbatim instead of parsing them as a replacement field
# (which raised KeyError).
rag_prompt = """
You are an enterprise document intelligence agent.

Given the retrieved evidence chunks below, you must:

1. Use ONLY the retrieved chunks.
2. Cite chunks using their metadata.
3. Explain WHY each chunk was retrieved (keyword match or semantic relevance).
4. Provide a final answer.
5. Provide JSON output:

{{
  "key_findings": [],
  "numerical_data": [],
  "risk_flags": [],
  "insights": []
}}

Retrieved Evidence:
{evidence}

Question:
{query}

If evidence is insufficient, answer: "INSUFFICIENT EVIDENCE".
"""

def run_agent(query, chunks, dense_db, bm25, k=5, max_new_tokens=256):
    """Answer *query* with the LLM, grounded on the top-*k* retrieved chunks.

    Args:
        query: User question.
        chunks: All chunk dicts (same order used to build *bm25*).
        dense_db: Dense collection passed through to ``hybrid_retrieve``.
        bm25: BM25 index passed through to ``hybrid_retrieve``.
        k: Number of evidence chunks to place in the prompt.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The pipeline's ``generated_text`` for the rendered prompt.
    """
    retrieved = hybrid_retrieve(query, chunks, dense_db, bm25, k)

    # Each evidence entry is a metadata header line followed by the content.
    ev = "".join(
        f"[{r['chunk_id']} | {r['source']} | score={r['fused_score']:.4f}]\n"
        f"{r['content']}\n\n"
        for r in retrieved
    )

    prompt = rag_prompt.format(evidence=ev, query=query)
    # Bound only the generated tokens: ``max_length`` also counts the prompt,
    # which can use up the whole budget with long evidence, and it conflicts
    # with ``max_new_tokens`` when both are set.
    output = llm(
        prompt,
        max_new_tokens=max_new_tokens,
        do_sample=False,
    )[0]["generated_text"]
    return output

# ============================================================
# ---------------------- DEMO --------------------------------
# ============================================================

# Ingest the demo document (hard-coded Colab path).
docs = ingest_document_set(
    pdf_paths=[r'/content/Data Scientist Case Study.pdf'],
    image_paths=[],
    table_paths=[]
)

print(f"Total documents ingested: {len(docs)}")
print(docs[0] if docs else "No documents found.")

# Build chunks
all_chunks = []
for doc in docs:
    all_chunks.extend(chunk_text(doc["text"], doc["source"]))
    for table in doc["tables"]:
        all_chunks.extend(chunk_table(table, doc["source"]))
    for img_idx, img_text in enumerate(doc["images_ocr"]):
        for img_chunk in chunk_image_text(img_text, doc["source"]):
            # Give every image of a document its own ID; otherwise they would
            # all share the same "_img_0" ID and the dense index would
            # reject the duplicates.
            img_chunk["chunk_id"] = f"{doc['source']}_img_{img_idx}"
            all_chunks.append(img_chunk)

print(f"Total chunks created: {len(all_chunks)}")

if not all_chunks:
    # Neither index can be built from an empty chunk set.
    print("No chunks were produced; skipping indexing and queries.")
else:
    # Build indexes
    dense_db = initialize_dense_index("enterprise_dense")
    dense_db = build_dense_index(dense_db, all_chunks)
    bm25 = initialize_bm25(all_chunks)

    print("Hybrid indexes ready.")

    # Queries
    queries = [
        "Summarize key product specifications.",
        "List numerical data in the report.",
        "Identify risks highlighted in the documents.",
        "Provide overall insights."
    ]

    for query in queries:
        print("="*60)
        print(f"Query: {query}\n")
        output = run_agent(
            query=query,
            chunks=all_chunks,
            dense_db=dense_db,
            bm25=bm25,
            k=5
        )
        print(output)
        print("="*60, "\n")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Device set to use cuda:0


Total documents ingested: 1
{'source': 'Data Scientist Case Study.pdf', 'text': 'Case Study: Multimodal RAG + Enterprise\nDocument Analysis\nTheme\nMultimodal RAG + Enterprise document analysis\nDuration\n3 days\nContext\nYour platform must analyze unstructured and structured documents (PDFs, tables, screenshots,\nlogs, emails, invoices), combine them semantically, and generate dependable summaries,\ninsights, and workflows. The solution must run inside a secure Kubernetes cluster, though\nKubernetes itself is not tested here. A Notebook (e.g., Google Colab) is recommended for\ndemonstration.\nTask\n● Design and partially implement a proof-of-concept RAG pipeline that achieves the\nfollowing:\n● Ingests a small set of PDFs (3–5 provided by us: a report, a product specification, a\ntable-heavy document).\n● Extracts text, tables, and images (optional but highly valued).\n● Builds a hybrid RAG index / Knowledge Base.\n● Implements retrieval methods that search indexes, explain why chunks

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Both `max_new_tokens` (=256) and `max_length`(=1200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)
Both `max_new_tokens` (=256) and `max_length`(=1200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_gen


You are an enterprise document intelligence agent.

Retrieved Evidence:
[Data Scientist Case Study.pdf_text_0 | Data Scientist Case Study.pdf | score=0.4230]
Case Study: Multimodal RAG + Enterprise Document Analysis Theme Multimodal RAG + Enterprise document analysis Duration 3 days Context Your platform must analyze unstructured and structured documents (PDFs, tables, screenshots, logs, emails, invoices), combine them semantically, and generate dependable summaries, insights, and workflows. The solution must run inside a secure Kubernetes cluster, though Kubernetes itself is not tested here. A Notebook (e.g., Google Colab) is recommended for demonstration. Task ● Design and partially implement a proof-of-concept RAG pipeline that achieves the following: ● Ingests a small set of PDFs (3–5 provided by us: a report, a product specification, a table-heavy document). ● Extracts text, tables, and images (optional but highly valued). ● Builds a hybrid RAG index / Knowledge Base. ● Implement

Both `max_new_tokens` (=256) and `max_length`(=1200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



You are an enterprise document intelligence agent.

Retrieved Evidence:
[Data Scientist Case Study.pdf_text_0 | Data Scientist Case Study.pdf | score=0.3707]
Case Study: Multimodal RAG + Enterprise Document Analysis Theme Multimodal RAG + Enterprise document analysis Duration 3 days Context Your platform must analyze unstructured and structured documents (PDFs, tables, screenshots, logs, emails, invoices), combine them semantically, and generate dependable summaries, insights, and workflows. The solution must run inside a secure Kubernetes cluster, though Kubernetes itself is not tested here. A Notebook (e.g., Google Colab) is recommended for demonstration. Task ● Design and partially implement a proof-of-concept RAG pipeline that achieves the following: ● Ingests a small set of PDFs (3–5 provided by us: a report, a product specification, a table-heavy document). ● Extracts text, tables, and images (optional but highly valued). ● Builds a hybrid RAG index / Knowledge Base. ● Implement

Both `max_new_tokens` (=256) and `max_length`(=1200) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



You are an enterprise document intelligence agent.

Retrieved Evidence:
[Data Scientist Case Study.pdf_text_0 | Data Scientist Case Study.pdf | score=0.3958]
Case Study: Multimodal RAG + Enterprise Document Analysis Theme Multimodal RAG + Enterprise document analysis Duration 3 days Context Your platform must analyze unstructured and structured documents (PDFs, tables, screenshots, logs, emails, invoices), combine them semantically, and generate dependable summaries, insights, and workflows. The solution must run inside a secure Kubernetes cluster, though Kubernetes itself is not tested here. A Notebook (e.g., Google Colab) is recommended for demonstration. Task ● Design and partially implement a proof-of-concept RAG pipeline that achieves the following: ● Ingests a small set of PDFs (3–5 provided by us: a report, a product specification, a table-heavy document). ● Extracts text, tables, and images (optional but highly valued). ● Builds a hybrid RAG index / Knowledge Base. ● Implement