In [1]:
# 1. INSTALL AND IMPORT LIBRARIES

# Install PyMuPDF for high-performance PDF text extraction
!pip install PyMuPDF

# Install OpenAI SDK for OpenRouter API interaction
!pip install openai

# Install python-dotenv to read API keys from a .env file (optional in Colab)
!pip install python-dotenv

Collecting PyMuPDF
  Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.7-cp310-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m54.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.26.7


In [2]:


import os
import textwrap
from getpass import getpass

import fitz # PyMuPDF is imported as 'fitz'
from google.colab import files
from openai import OpenAI

# --- Configuration ---
# The OpenRouter key is a secret and must never be typed into the notebook:
# anything written in a cell is saved with the file and leaks to everyone the
# notebook is shared with. The key is read from the OPENROUTER_API_KEY
# environment variable; when that is not set you are prompted for it and the
# input is neither echoed nor stored in the notebook.
# SECURITY: a real key used to be hardcoded in this cell -- revoke that key.

# If you upload a .env file, you can uncomment these lines to load it into
# the environment before the lookup below:
# from dotenv import load_dotenv
# load_dotenv()

OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY", "").strip()
if not OPENROUTER_API_KEY:
    OPENROUTER_API_KEY = getpass("Enter your OpenRouter API key: ").strip()

# Check for API Key
if not OPENROUTER_API_KEY:
    raise ValueError(
        "No OpenRouter API key provided. Set the OPENROUTER_API_KEY environment "
        "variable or enter the key when prompted."
    )

print("Setup complete. Libraries installed and API key loaded.")

Setup complete. Libraries installed and API key loaded.


In [3]:
# 2. UPLOAD PDF AND EXTRACT TEXT

def upload_and_extract_pdf_text():
    """Upload a PDF through the Colab file picker and return its plain text.

    Only the first uploaded file is read. The uploaded copy is removed from
    the working directory afterwards, whether or not extraction succeeded.

    Returns:
        The text of all pages joined by newlines, with surrounding whitespace
        stripped, or None if nothing was uploaded or the PDF could not be
        processed.
    """
    print("Please upload your PDF file (e.g., 'my_insurance_policy.pdf'):")
    uploaded = files.upload()

    if not uploaded:
        print("No file uploaded. Exiting.")
        return None

    # Get the filename of the first uploaded file
    pdf_filename = next(iter(uploaded))

    try:
        print(f"Reading text from {pdf_filename}...")
        doc = fitz.open(pdf_filename)
        try:
            # "text" selects plain-text extraction; pages are collected and
            # joined once instead of growing a string page by page.
            page_texts = [page.get_text("text") for page in doc]
        finally:
            # Release the document handle even if a page fails to extract.
            doc.close()

        return "\n".join(page_texts).strip()

    except Exception as e:
        print(f"An error occurred during PDF processing: {e}")
        return None

    finally:
        # Clean up the file from the Colab environment after reading. A
        # failed cleanup must not discard text that was extracted fine, so
        # it is only reported.
        if os.path.exists(pdf_filename):
            try:
                os.remove(pdf_filename)
            except OSError as cleanup_error:
                print(f"Warning: could not remove {pdf_filename}: {cleanup_error}")

# Run the upload/extraction step and report the outcome
document_text = upload_and_extract_pdf_text()

if not document_text:
    print("❌ Failed to extract document text. Cannot proceed.")
else:
    # Report the size of the extracted text, then show its first 1000
    # characters wrapped to 80 columns as a quick sanity check.
    print(f"\n✅ Text Extraction Successful! Total characters: {len(document_text)}")
    print("\n--- Preview of Extracted Text ---")
    print(textwrap.fill(document_text[:1000].strip(), width=80) + "...")

Please upload your PDF file (e.g., 'my_insurance_policy.pdf'):


Saving commercial-insurance-policy.pdf to commercial-insurance-policy.pdf
Reading text from commercial-insurance-policy.pdf...

✅ Text Extraction Successful! Total characters: 234374

--- Preview of Extracted Text ---
COMMERCIAL INSURANCE  Policy Document   COMMERCIAL INSURANCE  |  Policy Document
2
Contents  Page  Introduction  3  – Claims enquiries  3  – Policy information
Helpline services  Information services  General defnitions  Insuring clause
General exclusions  General conditions  Claims conditions  Security conditions
General memorandum  Sections of the policy  (each section is operative only if
shown as insured in the schedule)  Section 1 – Property damage  Section 2 –
Property damage plus  Section 3 – Equipment breakdown  Section 4 – Business
interruption  Section 5 – Liabilities  Section 6 – Legal expenses  Section 7 –
Money with assault extension  Section 8 – Goods in transit  Section 9 – Personal
accident  Section 10 – Terrorism  General information  74  – Complaint hand

In [4]:
import re

def parse_triplets_from_text(triplet_text: str):
    """
    Parses (Entity A, RELATION, Entity B) lines into structured tuples.

    One triplet is expected per line. The first two commas separate head,
    relation and tail, so a tail may itself contain commas ("$50,000") or
    parentheses ("Legal expenses (Section 6)"). A leading list marker
    ("- ", "* ", "1. ", "2) ") and trailing punctuation after the closing
    parenthesis are tolerated.

    Lines that are not triplets, and triplets with an empty head, relation
    or tail, are skipped. Empty or None input yields an empty list.

    Returns:
        A list of (head, relation, tail) string tuples in input order.
    """
    # Optional list decoration the model may put in front of a triplet.
    list_marker = re.compile(r"^(?:[-*\u2022]|\d+[.)])\s+")
    # Preferred form: the triplet spans the whole line, so the tail runs up to
    # the LAST closing parenthesis and may contain parentheses of its own.
    full_line = re.compile(r"^\(\s*(.+?)\s*,\s*(.+?)\s*,\s*(.+?)\s*\)[\s.,;]*$")
    # Fallback: a triplet followed by extra text on the same line; the tail
    # ends at the first closing parenthesis.
    leading = re.compile(r"\(\s*(.*?)\s*,\s*(.*?)\s*,\s*(.*?)\s*\)")

    triplets = []
    for raw_line in (triplet_text or "").splitlines():
        line = list_marker.sub("", raw_line.strip())
        match = full_line.match(line) or leading.match(line)
        if not match:
            continue

        head, relation, tail = (part.strip() for part in match.groups())
        # An empty component would become a blank node or unlabeled edge.
        if head and relation and tail:
            triplets.append((head, relation, tail))

    return triplets


In [5]:
 def setup_openrouter_client(api_key):
     """Build an OpenAI SDK client that points at the OpenRouter endpoint.

     Args:
         api_key: API key handed to the client for authentication.

     Returns:
         The configured ``OpenAI`` client instance.
     """
     # The stock OpenAI client is reused; only the base URL is overridden.
     return OpenAI(base_url="https://openrouter.ai/api/v1", api_key=api_key)

In [6]:
def generate_knowledge_graph_data(document_text, model="tngtech/deepseek-r1t2-chimera:free"):
    """Ask an OpenRouter-hosted LLM for knowledge-graph triplets of a document.

    The whole document is sent in a single chat request, and the reply is
    parsed with ``parse_triplets_from_text``.

    Args:
        document_text: Plain text of the document to analyse.
        model: OpenRouter model identifier used for the request.

    Returns:
        A ``(triplets, raw_triplet_text)`` tuple: the parsed
        (head, relation, tail) tuples and the stripped raw model reply.
        ``(None, None)`` is returned when the document is empty, the model
        returns no content, or the request fails; the failure is printed,
        not raised.
    """
    if not document_text:
        print("Error: Document text is empty.")
        return None, None

    try:
        client = setup_openrouter_client(OPENROUTER_API_KEY)

        system_prompt = textwrap.dedent("""
            You are an expert NLP and knowledge graph generation system.
            Extract key entities and their relationships.

            Output ONLY a list of (Entity A, Relationship, Entity B) triplets.
            One triplet per line. No explanations.

            Example:
            (Policy, COVERS, Fire)
            (Policy, ISSUED_BY, ABC Insurance)
        """)

        user_prompt = (
            "Analyze the following insurance document text and generate the knowledge graph triplets:\n\n"
            "---\n\n"
            f"{document_text}"
        )

        print("\n3. Sending request to OpenRouter for knowledge graph data...")

        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            # Zero temperature asks for the most deterministic output.
            temperature=0.0
        )

        # A reply can come back without choices or with `content` set to None;
        # report that explicitly instead of failing on `.strip()`.
        choices = getattr(completion, "choices", None)
        content = choices[0].message.content if choices else None
        if content is None:
            print("❌ Error during Knowledge Graph generation: the model returned no content.")
            return None, None

        # Raw LLM output (string)
        raw_triplet_text = content.strip()

        # Parse into structured triplets
        triplets = parse_triplets_from_text(raw_triplet_text)

        if triplets:
            print("\n✅ Knowledge Graph Triplet Generation Successful!")
        else:
            # A reply with nothing parseable is not a success; the raw text is
            # still returned so it can be inspected.
            print("\n⚠️ The model replied, but no valid triplets could be parsed from its output.")
        print(f"Extracted {len(triplets)} triplets")

        # Optional: preview first few
        for t in triplets[:5]:
            print(t)

        # Return both the structured triplets and the raw text
        return triplets, raw_triplet_text

    except Exception as e:
        # Best-effort by design: report the failure and let the caller check
        # for None instead of aborting the notebook run here.
        print(f"❌ Error during Knowledge Graph generation: {e}")
        return None, None


In [7]:
# Run the extraction on the uploaded document. `triplets` receives the parsed
# (head, relation, tail) tuples and `kg_raw_text` the model's raw reply; both
# are None when the document text is empty or the request fails.
triplets, kg_raw_text = generate_knowledge_graph_data(document_text)



3. Sending request to OpenRouter for knowledge graph data...

✅ Knowledge Graph Triplet Generation Successful!
Extracted 41 triplets
('Policy', 'ISSUED_BY', 'The Baptist Insurance Company')
('Policy', 'COVERS', 'Property Damage')
('Property Damage Section', 'INCLUDES', 'Fire')
('Property Damage Section', 'INCLUDES', 'Lightning')
('Property Damage Section', 'INCLUDES', 'Explosion')


In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from google.colab import userdata
import networkx as nx

In [10]:
from huggingface_hub import notebook_login

# Open the interactive Hugging Face login widget in the notebook.
# NOTE(review): presumably needed so the model download in the next cell is
# authenticated -- confirm against the model's access requirements.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [11]:
# Checkpoint used to route and answer questions over the knowledge graph
model_id = "google/gemma-2b-it"


tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    # `torch_dtype` is deprecated in the installed transformers release (it
    # emits "`torch_dtype` is deprecated! Use `dtype` instead!"), so the
    # replacement keyword is used.
    dtype=torch.bfloat16,
)

tokenizer_config.json:   0%|          | 0.00/34.2k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

In [12]:
# --- 2. Build the Knowledge Graph ---
# Undirected graph: every triplet becomes one edge between head and tail whose
# `label` attribute stores the relation, so lookups work from either endpoint.
# NOTE(review): an undirected edge does not record which endpoint was the head,
# and a later triplet for the same pair of nodes replaces the earlier label.
G = nx.Graph()
G.add_edges_from(
    (head, tail, {"label": relation}) for head, relation, tail in triplets
)

In [19]:
!pip install -q thefuzz

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━[0m [32m1.8/3.2 MB[0m [31m53.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m56.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [41]:
from thefuzz import process
import torch

def get_global_summary(max_hubs=10, max_facts_per_hub=3):
    """Build a general document overview from the best-connected graph nodes.

    The `max_hubs` nodes with the highest degree in the module-level graph
    ``G`` are visited in descending degree order, and up to
    `max_facts_per_hub` of each hub's edges are rendered as
    "- <node> <relation> <neighbor>" lines. Underscores in the relation label
    become spaces; an edge without a label is rendered as "includes".

    Each edge is reported only once, from the first hub that reaches it.
    Without this, an edge between two hubs would be listed a second time with
    the wording reversed (e.g. "Fire COVERS Policy").

    Args:
        max_hubs: Number of highest-degree nodes to describe.
        max_facts_per_hub: Maximum number of facts emitted per hub.

    Returns:
        The fact lines joined by newlines (an empty string for an empty graph).
    """
    hubs = sorted(G.degree, key=lambda item: item[1], reverse=True)[:max_hubs]

    seen_edges = set()
    summary_facts = []
    for node, _degree in hubs:
        facts_for_node = 0
        for neighbor in G.neighbors(node):
            if facts_for_node >= max_facts_per_hub:
                break

            # Edges are identified by their unordered endpoints so the same
            # edge is not repeated from its other end.
            edge_key = frozenset((node, neighbor))
            if edge_key in seen_edges:
                continue
            seen_edges.add(edge_key)

            rel = G[node][neighbor].get('label', 'includes').replace("_", " ")
            summary_facts.append(f"- {node} {rel} {neighbor}")
            facts_for_node += 1

    return "\n".join(summary_facts)

# Questions containing any of these words always go to the detailed fact search.
_DETAIL_KEYWORDS = ("list", "limit", "specific", "amount", "deductible")


def _generate_reply(prompt, max_new_tokens):
    """Run greedy generation for `prompt` and return only the newly generated text."""
    # Follow the model's own device instead of assuming "cuda": the model is
    # loaded with device_map="auto".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    # do_sample=False makes decoding deterministic; a temperature has no
    # effect unless sampling is enabled, so none is passed.
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)

    # The returned sequence starts with the prompt tokens; cut them off so the
    # decoded text holds the model's reply and nothing from the prompt.
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()


def _classify_intent(question):
    """Return the upper-cased router verdict for `question` ("[DETAIL]" on keyword override)."""
    lowered = question.lower()

    # Simple Keyword Override for safety (no model call needed)
    if any(kw in lowered for kw in _DETAIL_KEYWORDS):
        return "[DETAIL]"

    # Few-shot examples show the model the difference between broad and
    # specific questions.
    router_prompt = f"""<start_of_turn>user
    Classify the user's question as either [SUMMARY] or [DETAIL].

    EXAMPLES:
    "What does this document cover?" -> [SUMMARY]
    "Give me an overview of the policy." -> [SUMMARY]
    "What are the property damages covered?" -> [DETAIL]
    "What is the limit for theft?" -> [DETAIL]
    "Who is the insured party?" -> [DETAIL]
    "List the exclusions." -> [DETAIL]

    Question: "{question}"
    Answer with only the tag: [SUMMARY] or [DETAIL].<end_of_turn>
    <start_of_turn>model
    """
    return _generate_reply(router_prompt, max_new_tokens=5).upper()


def _collect_local_facts(question):
    """Collect one- and two-hop facts around graph nodes that fuzzily match words of `question`."""
    entities_in_graph = list(G.nodes())

    # dicts are used as insertion-ordered sets so the prompt is built in the
    # same order on every run (iteration order of a set of strings is not).
    matched_entities = {}
    for word in question.split():
        if len(word) > 3:
            # Find nodes in the graph that match the user's words
            for match in process.extract(word, entities_in_graph, limit=2):
                if match[1] > 75:
                    matched_entities[match[0]] = None

    facts = {}
    for entity in matched_entities:
        # Step 1: immediate facts about the matched entity
        for neighbor in G.neighbors(entity):
            rel1 = G[entity][neighbor].get('label', 'is related to').replace("_", " ")
            facts[f"- {entity} {rel1} {neighbor}"] = None

            # Step 2: the second hop, e.g. Fire -> Limit -> $50,000
            for second_neighbor in G.neighbors(neighbor):
                rel2 = G[neighbor][second_neighbor].get('label', 'has').replace("_", " ")
                facts[f"- {neighbor} {rel2} {second_neighbor}"] = None

    return "\n".join(facts)


def ask_policy(question):
    """Answer a question about the policy from knowledge-graph facts.

    The question is first classified as a broad [SUMMARY] or a specific
    [DETAIL] request. Summary questions are answered from the global hub
    overview; every other question (including an unrecognised router reply)
    is answered from facts within two hops of graph nodes that fuzzily match
    the question's words.

    Args:
        question: The user's question as plain text.

    Returns:
        The model's answer text, without the prompt.
    """
    # --- 1. Intent routing ---
    # Only the router's own reply is inspected. The router prompt itself
    # contains the "[SUMMARY]" tag, so checking the prompt together with the
    # reply would select the summary branch for every question.
    intent = _classify_intent(question)

    # --- 2. Context Gathering based on Intent ---
    if "SUMMARY" in intent and "DETAIL" not in intent:
        print("--- Routing to: Global Summary ---")
        context_text = get_global_summary()
        system_role = "Provide a high-level summary of the insurance policy based on these core facts."
    else:
        print("--- Routing to: Local Fact Search (Refined 2-Hop) ---")
        context_text = _collect_local_facts(question) or "No specific matches found."
        system_role = "Answer the question using ONLY the specific facts provided. Be concise and use bullet points."

    # --- 3. Final Answer Generation ---
    final_prompt = f"""<start_of_turn>user
    You are an insurance specialist. {system_role}

    POLICY DATA:
    {context_text}

    USER QUESTION: {question}
    <end_of_turn>
    <start_of_turn>model
    """

    return _generate_reply(final_prompt, max_new_tokens=256)

In [50]:
# --- 4. Usage ---
# Ask a free-form question; the routing decision is printed by ask_policy
# before the answer is shown.
user_query = "im prone to fire damage should i buy this insurance?"
print("Response:", ask_policy(user_query))

--- Routing to: Global Summary ---
Response: Yes, the policy covers property damage caused by fire, lightning, and explosion. Therefore, if you are prone to these types of fires, you should consider purchasing this insurance.
