In [2]:
!pip install transformers sentence-transformers accelerate faiss-gpu pymupdf --quiet


In [1]:
import fitz  # PyMuPDF
import re
import json
import os
from typing import List, Dict
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from sentence_transformers import SentenceTransformer
import torch
import faiss
import numpy as np


In [2]:
!pip install accelerate



In [3]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Disabled alternative: load Mistral-7B for enrichment (kept for reference, not executed).
# mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2")
# mistral_tokenizer.pad_token = mistral_tokenizer.eos_token
# mistral_model = AutoModelForCausalLM.from_pretrained(
#     "mistralai/Mistral-7B-Instruct-v0.2",
#     torch_dtype=torch.float16,
#     device_map="auto"
# )


# Load Zephyr-7B with full chat capabilities
pipe = pipeline(
    "text-generation",
    model="HuggingFaceH4/zephyr-7b-beta",
    # bfloat16 on GPU, float32 on CPU — driven by the single `device` check above.
    torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
    device_map="auto"
)


# Load embedding model
embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")


Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

Device set to use cuda:0


In [4]:
def extract_text_from_pdf(pdf_path: str) -> str:
    """Return the plain text of every page of a PDF, in page order.

    Args:
        pdf_path: Path of the PDF file, handed to ``fitz.open``.

    Returns:
        Each page's ``get_text("text")`` output followed by a newline, all
        concatenated, with leading and trailing whitespace stripped.
    """
    # The context manager closes the document even when a page fails to extract.
    with fitz.open(pdf_path) as doc:
        page_texts = [page.get_text("text") + "\n" for page in doc]
    return "".join(page_texts).strip()


In [5]:
import re
import json

def extract_with_regex(text: str) -> dict:
    """Pull ``policy_type``, ``coverage`` and ``content`` out of ``label: value`` text.

    Matching is case-insensitive. A value runs until the first of:
      * another of the three known labels (on the same line or a later one),
      * a newline followed by any ``word:`` label,
      * the end of the string.

    The same-line terminator matters because text that has had its newlines
    collapsed to spaces would otherwise put everything after the first label
    into that label's value.

    Args:
        text: Free text to search.

    Returns:
        A dict with keys ``policy_type``, ``coverage`` and ``content``; a field
        that is not found is an empty string.
    """
    field_names = ("policy_type", "coverage", "content")
    any_label = "|".join(re.escape(name) for name in field_names)

    def extract_field(field_name: str) -> str:
        # `\b` keeps e.g. "discontent:" from being read as the "content" label.
        pattern = (
            rf"\b{re.escape(field_name)}\s*:\s*(.*?)"
            rf"(?=\s+(?:{any_label})\s*:|\n\w+\s*:|\Z)"
        )
        match = re.search(pattern, text, re.IGNORECASE | re.DOTALL)
        return match.group(1).strip() if match else ""

    return {name: extract_field(name) for name in field_names}
def _parse_json_object(raw: str):
    """Return the JSON object found in ``raw`` as a dict, or ``None`` if there is none.

    Tries the whole string first, then the span from the first ``{`` to the last
    ``}`` so that replies wrapped in markdown fences or surrounding prose still
    parse. JSON values that are not objects (lists, strings, numbers) are rejected.
    """
    candidates = [raw]
    start, end = raw.find("{"), raw.rfind("}")
    if 0 <= start < end:
        candidates.append(raw[start:end + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return None


def enrich_chunk_with_zephyr(section_text: str, section_title: str, source: str) -> dict:
    """Ask the text-generation pipeline for structured metadata about one section.

    The model is prompted to answer with a JSON object. If a JSON object can be
    recovered from the reply it is used as the metadata; otherwise
    ``extract_with_regex`` is run on ``section_text`` as a fallback, and if that
    finds neither a policy type nor a coverage the raw section text is kept as is.

    Args:
        section_text: Text of the section to enrich.
        section_title: Title of the section; used when the model gives none.
        source: Identifier of the originating document, stored under ``"source"``.

    Returns:
        ``{"text": ..., "metadata": ...}`` where ``metadata`` always has the keys
        ``source``, ``section_title``, ``content``, ``policy_type``, ``coverage``
        and ``error`` (``None`` when the model reply parsed), and ``text`` is the
        section title line followed by the content.

    Raises:
        RuntimeError: If anything fails while prompting or post-processing; the
            original exception is attached as the cause.
    """
    messages = [
        {
            "role": "system",
            "content": (
                "You are an AI assistant helping extract structured information from insurance policy documents. "
                "Your job is to return a valid JSON object with the following fields:\n\n"
                "- section_title: The title of the section (same as input)\n"
                "- content: A cleaned, complete, and meaningful paragraph in natural language summarizing the key information from the section. "
                "This should be plain text — not a dictionary or nested structure. Think like a human explaining this section in full sentences.\n"
                "- policy_type: Extract only if clearly mentioned (e.g., Health, Life, or product name like my:health Suraksha)\n"
                "- coverage: Only if benefits, conditions, or limits are clearly described\n\n"
                "If any field is not present, leave it as an empty string. "
                "Return ONLY a valid JSON object. No extra markdown, explanation, or formatting."
            )
        },
        {
            "role": "user",
            "content": f"Section Title: {section_title}\nContent:\n{section_text}"
        }
    ]

    try:
        prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        outputs = pipe(prompt, max_new_tokens=512, do_sample=False)
        generated_text = outputs[0]["generated_text"]

        # The generated text echoes the prompt; keep only what follows the last assistant tag.
        json_part = generated_text.split("<|assistant|>")[-1].strip()
        print(json_part)

        metadata = _parse_json_object(json_part)
        if metadata is not None:
            metadata["source"] = source
            # Missing or blank title/content fall back to the inputs so the chunk text is never empty.
            if not metadata.get("section_title"):
                metadata["section_title"] = section_title
            if not metadata.get("content"):
                metadata["content"] = section_text
            # Keep the schema identical to the fallback branches below.
            metadata.setdefault("policy_type", "")
            metadata.setdefault("coverage", "")
            metadata["error"] = None
        else:
            print("⚠️ JSON parsing failed, attempting regex fallback...")
            # NOTE(review): the fallback scans the original section text, not the model reply — confirm intent.
            regex_data = extract_with_regex(section_text)
            print(regex_data)
            if regex_data["policy_type"] or regex_data["coverage"]:
                metadata = {
                    "source": source,
                    "section_title": section_title,
                    # The regex may find a policy type/coverage but no content label.
                    "content": regex_data["content"] or section_text,
                    "policy_type": regex_data["policy_type"],
                    "coverage": regex_data["coverage"],
                    "error": "LLM parse failed - regex fallback used"
                }
            else:
                print("❌ Both LLM JSON and regex extraction failed.")
                metadata = {
                    "source": source,
                    "section_title": section_title,
                    "content": section_text,
                    "policy_type": "",
                    "coverage": "",
                    "error": "LLM parse failed & regex fallback both failed"
                }

    except Exception as e:
        raise RuntimeError(f"enrich_chunk_with_zephyr failed for section '{section_title}': {str(e)}") from e

    return {
        "text": f"Section Title: {metadata['section_title']}\n{metadata['content']}",
        "metadata": metadata
    }

In [None]:
# Ad-hoc check: run the enrichment on one hard-coded policy paragraph
# (title "Paragraph 7", source "hdfc") and display the returned dict.
enrich_chunk_with_zephyr("""my: health Suraksha General Conditions Proposer • Minimum Entry Age - 18 Years • Maximum Enty Age - Lifetime Entry Adult Dependent • Minimum Entry Age - 18 Years • Maximum Entry Age - Lifetime Entry Child/Children • Minimum Entry Age - 91 Days • Maximum Entry Age - 25 Years 1. Entry Age: my:health Suraksha, and Unlimited Retore (Add on) 2. Type of Policy:  The base policy can be issued on individual, multi-individual and family ﬂoater basis  In case of Family Floater policies ﬂoater discount of 50% will be applied on all the members except the oldest member 3. Coverage for dependents  Individual Sum Insured Option: • Proposer • Dependent children • Grandmother • Grandson • Daughter-in-law • Sister • Sister-in-law • Niece • Spouse • Dependant parents/in-laws • Grandfather • Granddaughter • Son-in-law • Brother • Nephew • Brother-in-law Floater sum insured option: Self, spouse, dependent children* and dependent parents/parents in law can be covered under ﬂoater option *Dependent children: A child is considered a dependent for insurance purposes until his 25th birthday provided he is ﬁnancially dependent, on the proposer. 4. Policy period: This policy can be issued for 1 year/ 2 years/ 3 years.""","Paragraph 7","hdfc")

In [6]:
def chunk_text_by_paragraphs(text: str, min_length: int = 100) -> List[Dict]:
    """Split text on blank lines and keep the paragraphs that are long enough.

    Each paragraph has its newlines, tabs and carriage returns turned into
    spaces and runs of whitespace collapsed to a single space.

    Args:
        text: Text to split; paragraphs are separated by two or more newlines.
        min_length: Minimum length of the cleaned paragraph for it to be kept.

    Returns:
        A list of ``{"section_title": "Paragraph N", "content": ...}`` dicts,
        where N is the 1-based position of the paragraph in the raw split
        (so numbering has gaps where short paragraphs were dropped).
    """
    def _normalise(block: str) -> str:
        # Flatten line structure first, then squeeze repeated whitespace.
        flattened = re.sub(r'[\n\t\r]+', ' ', block)
        return re.sub(r'\s{2,}', ' ', flattened).strip()

    numbered = enumerate(map(_normalise, re.split(r'\n{2,}', text)), start=1)
    return [
        {"section_title": f"Paragraph {position}", "content": body}
        for position, body in numbered
        if len(body) >= min_length
    ]


In [None]:
# Input PDFs are read from `pdf_dir`; one enriched JSON per PDF is written to `output_json_dir`.
pdf_dir = "./policy_pdfs"
output_json_dir = "./enriched_jsons"
os.makedirs(output_json_dir, exist_ok=True)

# Sorted so the processing order is reproducible (os.listdir order is arbitrary);
# the extension check is case-insensitive so files named "X.PDF" are not skipped.
pdf_files = sorted(f for f in os.listdir(pdf_dir) if f.lower().endswith(".pdf"))
print(f"📄 Found {len(pdf_files)} PDFs.")

all_enriched = []

for pdf_file in pdf_files:
    full_path = os.path.join(pdf_dir, pdf_file)
    print(f"🧾 Processing {pdf_file}...")

    # Extract and chunk
    text = extract_text_from_pdf(full_path)
    chunks = chunk_text_by_paragraphs(text)

    enriched = []
    for chunk in chunks:
        enriched_chunk = enrich_chunk_with_zephyr(
            section_text=chunk["content"],
            section_title=chunk["section_title"],
            source=pdf_file
        )
        enriched.append(enriched_chunk)

    # Save per file. splitext strips only the final extension, whereas
    # str.replace(".pdf", ...) would rewrite every ".pdf" inside the name.
    pdf_stem = os.path.splitext(pdf_file)[0]
    json_path = os.path.join(output_json_dir, f"{pdf_stem}_enriched.json")
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(enriched, f, indent=2, ensure_ascii=False)

    all_enriched.extend(enriched)
    print(f"✅ Extracted and enriched {len(enriched)} chunks → {json_path}")

# Build FAISS index from all enriched chunks
# (disabled: no `build_faiss_index` is defined in the visible cells)
#build_faiss_index(all_enriched)


In [7]:
import os
import json
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# ✅ Load model
model = SentenceTransformer("BAAI/bge-base-en-v1.5")
print("🚀 Model loaded!")

# ✅ Directory paths
json_folder = "./enriched_jsons"

# ✅ Load and embed texts
# `all_embeddings[i]` and `all_metadata[i]` describe the same chunk; keep them aligned.
all_embeddings = []
all_metadata = []

# Sorted so the row order is reproducible (os.listdir order is arbitrary).
json_files = sorted(f for f in os.listdir(json_folder) if f.endswith("_enriched.json"))
print(f"📁 Found {len(json_files)} JSON file(s)")

for file in tqdm(json_files, desc="Embedding JSONs"):
    path = os.path.join(json_folder, file)

    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not data:
        # Nothing to embed for a file with no chunks.
        continue

    texts = [entry["text"] for entry in data]

    file_metadata = []
    for entry in data:
        metadata = entry["metadata"]
        # ✅ Remove "error" field if it exists
        metadata.pop("error", None)
        file_metadata.append(metadata)

    # Encode the whole file in one call instead of one call per chunk.
    file_embeddings = model.encode(texts, show_progress_bar=False)

    # Extend both lists only after encoding succeeded so they cannot drift apart.
    all_embeddings.extend(file_embeddings)
    all_metadata.extend(file_metadata)

print(f"\n✅ Generated {len(all_embeddings)} embeddings with cleaned metadata (no 'error' field).")


🚀 Model loaded!
📁 Found 1 JSON file(s)


Embedding JSONs: 100%|██████████| 1/1 [00:00<00:00,  1.78it/s]


✅ Generated 11 embeddings with cleaned metadata (no 'error' field).





In [8]:
# ✅ Install FAISS
!pip install faiss-gpu --quiet  # Use faiss-gpu if you're on a GPU runtime

# ✅ Imports
import faiss
import numpy as np
import pickle

# ✅ Convert embeddings list to NumPy array
embedding_dim = len(all_embeddings[0])
embedding_matrix = np.array(all_embeddings).astype("float32")

# ✅ Create FAISS index
index = faiss.IndexFlatL2(embedding_dim)  # L2 distance (Euclidean)
index.add(embedding_matrix)
print(f"📦 FAISS index created and {index.ntotal} vectors added.")

# ✅ Save the index and metadata
faiss.write_index(index, "faiss_index_bge_base.index")

# Save metadata using pickle
with open("faiss_metadata.pkl", "wb") as f:
    pickle.dump(all_metadata, f)

print("✅ FAISS index and metadata saved successfully.")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


📦 FAISS index created and 11 vectors added.
✅ FAISS index and metadata saved successfully.
