<a href="https://colab.research.google.com/github/SudarshanReddy41/initial-agent-service/blob/master/RCD_Construction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests, os, csv, logging, datetime, time, shutil
from google.colab import drive

# -------------------
# CONFIGURATION
# -------------------
# NOTE(security): the literal key below is committed in plain text. Supply the
# key through the REGULATIONS_GOV_API_KEY environment variable instead; the
# literal is kept only as a backward-compatible fallback and should be rotated
# and removed from the notebook.
API_KEY = os.environ.get(
    "REGULATIONS_GOV_API_KEY",
    "X3gXLEGe4J8qUbkaFpsl2BNUT9R2PmBSJgJ4WX51",
)
API_ENDPOINT = "https://api.regulations.gov/v4/documents"
# Header sent with every request made through `requests` in this cell.
HEADERS = {"X-Api-Key": API_KEY}
# Default output path for save_metadata_to_csv (relative to the working directory).
METADATA_CSV = "construction_metadata.csv"

# Mount Google Drive
drive.mount('/content/drive')

# Define your Google Drive folder path
GDRIVE_FOLDER_PATH = '/content/drive/MyDrive/construction_documents'
# Create the target folder up front so later file moves do not fail on a missing directory.
os.makedirs(GDRIVE_FOLDER_PATH, exist_ok=True)

# Setup logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger()

# -------------------
# Search Documents
# -------------------
def search_documents(term, page_size=100, max_pages=10,
                     start_date=None, end_date=None, timeout=30):
    """Search the documents endpoint for Rule / Proposed Rule entries matching *term*.

    Pages are requested one after another, starting at page 1, until
    ``max_pages`` pages have been fetched, a page comes back with no ``data``
    entries, or a request fails. Whatever was collected before a failure is
    still returned, so one bad page does not discard earlier results.

    Args:
        term: Free-text search term sent as ``filter[searchTerm]``.
        page_size: Number of results requested per page.
        max_pages: Upper bound on the number of pages requested.
        start_date: ``datetime.date`` lower bound for the posted date
            (``[ge]`` filter). Defaults to 2024-01-01.
        end_date: ``datetime.date`` upper bound for the posted date
            (``[le]`` filter). Defaults to 2025-04-04.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list with the raw ``data`` entries of every page fetched.
    """
    if start_date is None:
        start_date = datetime.date(2024, 1, 1)
    if end_date is None:
        end_date = datetime.date(2025, 4, 4)

    all_docs = []
    for page in range(1, max_pages + 1):
        params = {
            'filter[searchTerm]': term,
            'filter[documentType]': 'Rule,Proposed Rule',
            'filter[postedDate][ge]': start_date.isoformat(),
            'filter[postedDate][le]': end_date.isoformat(),
            'page[size]': page_size,
            'page[number]': page
        }
        logger.info(f"Fetching page {page}...")
        try:
            # A timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(
                API_ENDPOINT, headers=HEADERS, params=params, timeout=timeout
            )
        except requests.RequestException as exc:
            logger.error(f"Request for page {page} failed: {exc}")
            break

        if response.status_code != 200:
            logger.error(f"API error {response.status_code}")
            break

        try:
            data = response.json().get("data", [])
        except ValueError as exc:
            # A 200 response whose body is not valid JSON.
            logger.error(f"Could not decode page {page}: {exc}")
            break

        if not data:
            # An empty page means there is nothing further to fetch.
            break

        all_docs.extend(data)
    return all_docs

# -------------------
# Download PDF and upload to Google Drive folder
# -------------------
def download_and_upload_pdf(doc_id, folder_path, retries=3, delay=5):
    """Download the PDF for *doc_id* and move it into *folder_path*.

    The file is first written to the working directory as ``<doc_id>.pdf``
    and then moved to its destination, so a failed download never leaves a
    truncated file in the destination folder.

    Args:
        doc_id: Document identifier used to build the download URL and the
            file name.
        folder_path: Directory that receives the finished PDF.
        retries: Maximum number of download attempts.
        delay: Seconds to wait between attempts.

    Returns:
        ``("Uploaded to: <path>", 200)`` on success; ``(None, status)`` when
        the last response had a non-200 status; ``(None, None)`` when the last
        attempt raised or no attempt was made.
    """
    url = f"https://downloads.regulations.gov/{doc_id}/content.pdf"
    local_path = f"{doc_id}.pdf"
    drive_path = os.path.join(folder_path, f"{doc_id}.pdf")
    # Statuses worth another attempt; any other non-200 is treated as permanent.
    transient_statuses = {429, 500, 502, 503, 504}
    last_status = None

    for attempt in range(retries):
        try:
            response = requests.get(url, headers=HEADERS, timeout=15)
            last_status = response.status_code
            if last_status == 200:
                with open(local_path, 'wb') as f:
                    f.write(response.content)

                shutil.move(local_path, drive_path)
                return f"Uploaded to: {drive_path}", 200

            logger.warning(f"Failed to download {doc_id}: HTTP {last_status}")
            if last_status not in transient_statuses:
                return None, last_status
        except (requests.RequestException, OSError) as e:
            logger.warning(f"Attempt {attempt+1} failed: {e}")
            # The failure may have happened after a 200 (e.g. while moving the
            # file), so do not report that status as the outcome.
            last_status = None
            # Best-effort cleanup of a partially written temporary file.
            try:
                os.remove(local_path)
            except OSError:
                pass

        # Waiting after the final attempt would only delay the caller.
        if attempt < retries - 1:
            time.sleep(delay)
    return None, last_status

# -------------------
# Save Metadata to CSV
# -------------------
def save_metadata_to_csv(records, path=METADATA_CSV):
    """Append download metadata rows to the CSV file at *path*.

    The header row is written only when the file does not exist before the
    call; otherwise the rows are appended below the existing content.

    Args:
        records: Iterable of dicts keyed by the column names listed below.
        path: Destination CSV file.
    """
    columns = ["doc_id", "document_type", "title", "posted_date", "gdrive_url", "download_status"]
    # Decided before opening, because opening in append mode creates the file.
    is_new_file = not os.path.exists(path)

    with open(path, 'a', newline='', encoding='utf-8') as handle:
        out = csv.DictWriter(handle, fieldnames=columns)
        if is_new_file:
            out.writeheader()
        for record in records:
            out.writerow(record)

# -------------------
# Main Pipeline
# -------------------
def run_pipeline(term='construction safety compliance', max_pages=10):
    """Search for documents, download each PDF to Drive, and log the metadata.

    Args:
        term: Search term handed to ``search_documents``.
        max_pages: Maximum number of result pages to fetch.
    """
    def _to_row(document):
        # Build one CSV row; this triggers the download for the document.
        document_id = document.get('id')
        attributes = document.get('attributes', {})
        location, http_status = download_and_upload_pdf(document_id, GDRIVE_FOLDER_PATH)
        return {
            "doc_id": document_id,
            "document_type": attributes.get('documentType', ''),
            "title": attributes.get('title', ''),
            "posted_date": attributes.get('postedDate', ''),
            # Falsy results (None on failure) are stored as empty cells.
            "gdrive_url": location if location else '',
            "download_status": http_status if http_status else '',
        }

    found = search_documents(term, page_size=100, max_pages=max_pages)
    rows = [_to_row(document) for document in found]

    save_metadata_to_csv(rows)
    logger.info(f"Saved metadata for {len(rows)} documents.")

# -------------------
# Run
# -------------------
# Executes as soon as the cell runs: searches, downloads each PDF into the
# Drive folder, then appends the metadata rows to the CSV.
run_pipeline(term='construction safety compliance', max_pages=10)


In [None]:
# Packages used by the retrieval cell further down: PyMuPDF (imported as
# `fitz`), sentence-transformers and faiss-cpu.
!pip install -q pymupdf sentence-transformers faiss-cpu
# huggingface_hub together with its `hf_xet` extra.
!pip install huggingface_hub[hf_xet]
# NOTE(review): nothing visible in this notebook imports `tools` — confirm
# this install is actually needed.
!pip install tools

In [20]:
# Upgrade PyMuPDF to the newest available release (the recorded output below
# shows 1.25.5 being installed).
!pip install --upgrade pymupdf


Collecting pymupdf
  Using cached pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Using cached pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
Installing collected packages: pymupdf
Successfully installed pymupdf-1.25.5


In [None]:
# Smoke test: confirm PyMuPDF can be imported under its `fitz` name.
import fitz  # from PyMuPDF
doc = fitz.open  # attribute lookup only; nothing is opened, but this fails if `open` is missing
print("✅ fitz.open is accessible")

In [None]:

import os
import csv
import fitz  # PyMuPDF
import faiss
import numpy as np
import textwrap
import subprocess
from sentence_transformers import SentenceTransformer
from google.colab import drive
import numpy as np

# ------------------------
# Mount Google Drive
# ------------------------
# Exposes Drive under /content/drive so the PDF folder below can be read.
drive.mount('/content/drive')

# ------------------------
# Configuration
# ------------------------
# Folder scanned for PDF files by extract_text_chunks_from_folder.
PDF_FOLDER = '/content/drive/MyDrive/construction_documents'
# Default CSV path for the chunk-level save_metadata_to_csv defined in this cell.
# NOTE: this rebinds the METADATA_CSV name that the download cell set to
# "construction_metadata.csv".
METADATA_CSV = '/content/drive/MyDrive/construction_metadata_log.csv'
# Model name passed to SentenceTransformer in embed_chunks.
EMBED_MODEL_NAME = "intfloat/multilingual-e5-small"
# Question used by main() for both retrieval and the LLM prompt.
USER_QUERY = "What are the cybersecurity compliance requirements in these documents?"

# ------------------------
# 1. PDF Chunking
# ------------------------
def extract_text_chunks_from_folder(folder_path, chunk_size=500):
    """Split the text of every PDF in *folder_path* into chunks.

    Files are processed in sorted name order so chunk order (and therefore
    the row positions used later for retrieval) is stable between runs.
    Each page's text is wrapped with ``textwrap.wrap``, which normalises
    whitespace and yields pieces of at most ``chunk_size`` characters.

    Args:
        folder_path: Directory containing the PDF files.
        chunk_size: Maximum number of characters per chunk.

    Returns:
        A list of dicts with the keys ``doc_id`` (the PDF file name),
        ``chunk_id`` (``<file>_p<page>_c<index>``) and ``text``.
    """
    all_chunks = []
    # Match the extension case-insensitively so "REPORT.PDF" is not skipped.
    pdf_files = sorted(
        name for name in os.listdir(folder_path) if name.lower().endswith(".pdf")
    )

    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)
        # The context manager closes the document once its pages are read,
        # instead of keeping every PDF open for the whole run.
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc):
                text = page.get_text()
                for i, chunk in enumerate(textwrap.wrap(text, chunk_size)):
                    all_chunks.append({
                        "doc_id": pdf_file,
                        "chunk_id": f"{pdf_file}_p{page_num}_c{i}",
                        "text": chunk
                    })
    return all_chunks

# ------------------------
# 2. Embedding
# ------------------------
def embed_chunks(chunks, model_name=EMBED_MODEL_NAME, batch_size=32):
    """Embed the ``text`` of every chunk with a SentenceTransformer model.

    Args:
        chunks: List of chunk dicts, each with a ``text`` key.
        model_name: Model name passed to ``SentenceTransformer``.
        batch_size: Number of texts encoded per batch.

    Returns:
        A tuple ``(embeddings, chunks, model)`` where ``embeddings`` is a 2-D
        float32 array with one row per chunk (zero rows when ``chunks`` is
        empty), ``chunks`` is the input list unchanged, and ``model`` is the
        loaded model so the caller can embed queries with it.
    """
    model = SentenceTransformer(model_name)
    texts = [c["text"] for c in chunks]

    if not texts:
        # Keep the result 2-D so downstream code can still read shape[1].
        dim = model.get_sentence_embedding_dimension()
        return np.empty((0, dim), dtype=np.float32), chunks, model

    # NOTE(review): the model card for the default E5 model recommends
    # "passage: " / "query: " input prefixes — confirm whether to add them.
    # encode() batches internally, so one call gives a single progress bar
    # instead of one per batch.
    embeddings = model.encode(
        texts,
        batch_size=batch_size,
        convert_to_numpy=True,
        show_progress_bar=True,
    )
    return np.asarray(embeddings, dtype=np.float32), chunks, model


# ------------------------
# 3. FAISS Indexing
# ------------------------
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index over *embeddings*.

    Args:
        embeddings: 2-D array-like with one row per vector.

    Returns:
        A ``faiss.IndexFlatL2`` containing every row of ``embeddings``.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional.
    """
    # Convert up front: the vectors are handed to FAISS as contiguous float32.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array, got shape {vectors.shape}"
        )

    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

# ------------------------
# 4. Retrieval
# ------------------------
def retrieve_relevant_chunks(query, index, chunks, embed_model, k=5):
    """Return the chunks whose embeddings are nearest to *query*.

    Args:
        query: Query string to embed.
        index: Index object exposing ``search(vectors, k)``.
        chunks: Chunk list in the same order as the indexed vectors.
        embed_model: Model exposing ``encode(list_of_texts)``.
        k: Number of neighbours requested from the index.

    Returns:
        A tuple ``(top_chunks, retrieved_ids)``: the matching chunks in the
        order returned by the index, and the set of their integer positions
        in ``chunks``.
    """
    query_vec = np.asarray(embed_model.encode([query]), dtype=np.float32)
    _distances, neighbours = index.search(query_vec, k)
    # FAISS pads the result with -1 when fewer than k vectors are available;
    # without this filter chunks[-1] would be returned as a bogus match.
    hit_ids = [int(i) for i in neighbours[0] if i >= 0]
    return [chunks[i] for i in hit_ids], set(hit_ids)

# ------------------------
# 5. Query LLaMA 4 (via Ollama CLI)
# ------------------------
def query_llama4(context_chunks, user_query, model='llama4'):
    """Ask a local Ollama model to answer *user_query* from the given chunks.

    The chunks are rendered as ``[chunk_id] text`` paragraphs inside the
    prompt, which is piped to ``ollama run <model>`` on standard input.

    Args:
        context_chunks: Chunk dicts with ``chunk_id`` and ``text`` keys.
        user_query: Question to put to the model.
        model: Name of the Ollama model to run.

    Returns:
        The model's standard output decoded as text.

    Raises:
        RuntimeError: If the ``ollama`` process exits with a non-zero status.
    """
    context_text = "\n\n".join([f"[{c['chunk_id']}] {c['text']}" for c in context_chunks])
    prompt = f"""You are a compliance analyst. Given the following regulatory document sections, answer the question below with precision.

### Context:
{context_text}

### Question:
{user_query}

### Answer:"""

    result = subprocess.run(
        ['ollama', 'run', model],
        input=prompt.encode('utf-8'),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if result.returncode != 0:
        # Surface the failure instead of returning an empty "answer".
        details = result.stderr.decode('utf-8', errors='replace').strip()
        raise RuntimeError(
            f"ollama exited with status {result.returncode}: {details}"
        )
    # Tolerate stray bytes in the output rather than failing on decode.
    return result.stdout.decode('utf-8', errors='replace')

# ------------------------
# 6. Metadata Logging
# ------------------------
def save_metadata_to_csv(chunks, retrieved_ids, csv_path=METADATA_CSV):
    """Append one CSV row per chunk, flagging the chunks that were retrieved.

    The header row is written only when the file does not exist before the
    call.

    Args:
        chunks: Chunk dicts with ``doc_id``, ``chunk_id`` and ``text`` keys.
        retrieved_ids: Collection of positions in ``chunks`` that were
            returned by the retriever.
        csv_path: Destination CSV file.
    """
    columns = ["doc_id", "chunk_id", "text_length", "retrieved_for_answer"]
    # Decided before opening, because opening in append mode creates the file.
    needs_header = not os.path.exists(csv_path)

    # Lazily built rows; a chunk counts as retrieved when its position is listed.
    rows = (
        {
            "doc_id": entry["doc_id"],
            "chunk_id": entry["chunk_id"],
            "text_length": len(entry["text"]),
            "retrieved_for_answer": "YES" if position in retrieved_ids else "NO",
        }
        for position, entry in enumerate(chunks)
    )

    with open(csv_path, "a", newline="", encoding="utf-8") as out_file:
        out = csv.DictWriter(out_file, fieldnames=columns)
        if needs_header:
            out.writeheader()
        out.writerows(rows)

# ------------------------
# 7. Main Pipeline
# ------------------------
def main():
    """Run the retrieval pipeline end to end and print the model's answer.

    Steps: chunk the PDFs, embed the chunks, index them, retrieve the chunks
    closest to ``USER_QUERY``, log chunk metadata, then query the LLM.

    Raises:
        FileNotFoundError: If ``PDF_FOLDER`` does not exist.
    """
    folder_missing = not os.path.exists(PDF_FOLDER)
    if folder_missing:
        raise FileNotFoundError(f"Folder not found: {PDF_FOLDER}")

    print("Extracting and chunking PDFs...")
    raw_chunks = extract_text_chunks_from_folder(PDF_FOLDER)

    print("Embedding chunks...")
    vectors, indexed_chunks, encoder = embed_chunks(raw_chunks)

    print("Creating FAISS index...")
    vector_index = create_faiss_index(vectors)

    print("Retrieving relevant chunks for query...")
    best_chunks, hit_positions = retrieve_relevant_chunks(
        USER_QUERY, vector_index, indexed_chunks, encoder
    )

    print("Saving metadata...")
    save_metadata_to_csv(indexed_chunks, hit_positions)

    print("Querying LLaMA 4...")
    answer = query_llama4(best_chunks, USER_QUERY)

    print("\n--- LLaMA 4 Response ---\n")
    print(answer)

# ------------------------
# Run
# ------------------------
# Run the pipeline only when this code is executed directly (as a script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()
