<a href="https://colab.research.google.com/github/SudarshanReddy41/initial-agent-service/blob/master/RC_Constrcution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import json
import os
import time
from urllib.parse import urlparse, urlunparse

# Regulations.gov API configuration.
# The API key is read from the REGULATIONS_GOV_API_KEY environment variable so
# the secret is not committed with the notebook. "DEMO_KEY" is the shared,
# heavily rate-limited api.data.gov key and is only a fallback for quick trials.
API_KEY = os.environ.get("REGULATIONS_GOV_API_KEY", "DEMO_KEY")
API_ENDPOINT = "https://api.regulations.gov/v4/documents"
# Base URL for document downloads; must end with "/" because the document id
# is appended to it directly.
file_url = "https://downloads.regulations.gov/"
HEADERS = {"X-Api-Key": API_KEY,
           "Accept": "application/json"}
# Local directory that receives one sub-directory per downloaded document.
DOWNLOAD_DIR = "rule_documents"
# The v4 API rejects page sizes above 250, so request the largest allowed page.
PAGE_SIZE = 250


def download_pdf(document_id, file_url):
    """Download a document's ``content.pdf`` into ``DOWNLOAD_DIR/<document_id>/``.

    Args:
        document_id: Document id; used in the download URL and as the name of
            the per-document directory.
        file_url: Base download URL. The id and ``/content.pdf`` are appended
            to it directly, so it must end with ``/``.

    Returns:
        None. Network and file errors are reported on stdout, not raised.
    """
    print("📥 Starting PDF download...")
    pdf_url = file_url + document_id + "/content.pdf"
    # Create directory for the document
    document_dir = os.path.join(DOWNLOAD_DIR, document_id)
    os.makedirs(document_dir, exist_ok=True)
    print(f"📁 Document directory: {document_dir}")

    # Use the filename from the URL
    filename = os.path.basename(pdf_url)
    save_path = os.path.join(document_dir, filename)
    # Stream into a temporary sibling file first so a failed or interrupted
    # download never leaves a truncated PDF under the final name.
    partial_path = save_path + ".part"

    print(f"➡️  Downloading: {pdf_url}")
    print(f"📁 Saving to: {save_path}")

    try:
        # The context manager releases the streamed connection; the timeout
        # keeps a stalled server from hanging the whole run.
        with requests.get(pdf_url, stream=True, timeout=30) as response:
            response.raise_for_status()
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
        os.replace(partial_path, save_path)
        print(f"✅ Successfully downloaded: {filename}")
    except (requests.RequestException, OSError) as e:
        print(f"❌ Failed to download {document_id}: {e}")
        if os.path.exists(partial_path):
            os.remove(partial_path)



def search_documents(search_term):
    """Search the documents endpoint and download the PDF of every rule hit.

    Pages through the results for ``search_term`` and calls ``download_pdf``
    for each document whose ``documentType`` contains the substring "Rule"
    (so "Proposed Rule" matches as well). Progress and errors are printed.
    Paging stops at the first failed request, non-200 response or
    unparseable body.

    Args:
        search_term: Text sent as the ``filter[searchTerm]`` query parameter.

    Returns:
        None.
    """
    # The v4 API rejects page sizes above 250, so cap whatever PAGE_SIZE holds.
    page_size = min(PAGE_SIZE, 250)
    page = 1
    total_pages = 1  # Initialize to enter the loop

    while page <= total_pages:
        params = {
            'filter[searchTerm]': search_term,
            'page[number]': page,
            'page[size]': page_size
        }

        print(f"\n📄 Fetching page {page}...")
        try:
            response = requests.get(API_ENDPOINT, params=params,
                                    headers=HEADERS, timeout=30)
        except requests.RequestException as e:
            print(f"❌ Request failed: {e}")
            break
        print(f"➡️  Response : {response}")
        if response.status_code != 200:
            print(f"❌ API error: {response.status_code}")
            break

        try:
            data = response.json()
        except ValueError as e:
            print(f"❌ Failed to parse JSON: {e}")
            print(f"Raw response:\n{response.text}")
            break
        documents = data.get("data", [])

        # "meta" may be absent or null; fall back to a single page.
        meta = data.get("meta") or {}
        total_pages = meta.get("totalPages", 1)

        print(f"🔍 Found {len(documents)} documents on page {page} / {total_pages}")

        for doc in documents:
            doc_attrs = doc.get("attributes") or {}
            document_id = doc.get("id")
            # A missing documentType must not crash the substring test below.
            document_type = doc_attrs.get("documentType") or ""
            title = doc_attrs.get("title")
            if not document_id:
                # Without an id there is no download URL to build.
                continue
            if "Rule" in document_type:
                print(f"Rule: {title}")
                download_pdf(document_id, file_url)

        page += 1
        time.sleep(0.5)  # Be kind to the API server!

# === Run the function with your search term
# Executes immediately when the cell runs: searches the API for this term and
# downloads the PDF of every matching rule-type document.
search_documents("construction safety compliance")


In [16]:
import requests, os, csv, logging, datetime, time, shutil
from google.colab import drive

# -------------------
# CONFIGURATION
# -------------------
# The API key is read from the REGULATIONS_GOV_API_KEY environment variable so
# the secret is not committed with the notebook. "DEMO_KEY" is the shared,
# heavily rate-limited api.data.gov key and is only a fallback for quick trials.
API_KEY = os.environ.get("REGULATIONS_GOV_API_KEY", "DEMO_KEY")
API_ENDPOINT = "https://api.regulations.gov/v4/documents"
HEADERS = {"X-Api-Key": API_KEY}
# CSV file (relative to the working directory) that collects per-document metadata.
METADATA_CSV = "insurance_metadata.csv"

# Mount Google Drive
# Makes the user's Drive available under /content/drive for the rest of the cell.
drive.mount('/content/drive')

# Define your Google Drive folder path
# Downloaded PDFs are moved into this folder; it is created if it does not exist.
GDRIVE_FOLDER_PATH = '/content/drive/MyDrive/insurance_documents'
os.makedirs(GDRIVE_FOLDER_PATH, exist_ok=True)

# Setup logging
# NOTE(review): basicConfig does nothing when the root logger already has
# handlers; if INFO messages do not show up in the notebook, confirm whether
# force=True is needed here.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
# Root logger, shared by every function in this cell.
logger = logging.getLogger()

# -------------------
# Search Documents
# -------------------
def search_documents(term, page_size=100, max_pages=10):
    """Collect rule and proposed-rule documents posted from 2024-01-01 to 2025-01-01.

    Requests up to ``max_pages`` pages from the documents endpoint and stops
    early on a failed request, a non-200 response, an unparseable body, an
    empty page, or once the last page reported by the API has been read.
    Documents gathered before a failure are still returned.

    Args:
        term: Text sent as the ``filter[searchTerm]`` query parameter.
        page_size: Number of documents requested per page.
        max_pages: Upper bound on the number of pages requested.

    Returns:
        list: The raw document objects from each page's ``data`` array.
    """
    all_docs = []
    start_date = datetime.date(2024, 1, 1)
    end_date = datetime.date(2025, 1, 1)

    for page in range(1, max_pages + 1):
        params = {
            'filter[searchTerm]': term,
            'filter[documentType]': 'Rule,Proposed Rule',
            'filter[postedDate][ge]': start_date.isoformat(),
            'filter[postedDate][le]': end_date.isoformat(),
            'page[size]': page_size,
            'page[number]': page
        }
        logger.info(f"Fetching page {page}...")
        try:
            # The timeout keeps a stalled connection from hanging the pipeline.
            response = requests.get(API_ENDPOINT, headers=HEADERS,
                                    params=params, timeout=30)
        except requests.RequestException as e:
            logger.error(f"Request for page {page} failed: {e}")
            break

        if response.status_code != 200:
            logger.error(f"API error {response.status_code}")
            break

        try:
            payload = response.json()
        except ValueError as e:
            logger.error(f"Could not parse page {page} as JSON: {e}")
            break

        data = payload.get("data", [])
        if not data:
            break

        all_docs.extend(data)

        # Stop once the reported last page is reached instead of spending a
        # request on a page that is known to be empty.
        total_pages = (payload.get("meta") or {}).get("totalPages")
        if total_pages is not None and page >= total_pages:
            break
    return all_docs

# -------------------
# Download PDF and upload to Google Drive folder
# -------------------
def download_and_upload_pdf(doc_id, folder_path, retries=3, delay=5):
    """Download a document's PDF and move it into ``folder_path``.

    The file is first written to ``<doc_id>.pdf`` in the working directory and
    then moved, so an incomplete download never appears in the target folder.
    Network errors, file errors and transient HTTP statuses (429 and 5xx) are
    retried; any other non-200 status is returned immediately.

    Args:
        doc_id: Document id used to build the download URL and the file name.
        folder_path: Destination directory for the PDF.
        retries: Maximum number of attempts.
        delay: Seconds to wait between attempts.

    Returns:
        tuple: ``("Uploaded to: <path>", 200)`` on success, otherwise
        ``(None, status)``, where ``status`` is the last HTTP status received
        or ``None`` if no response was ever received.
    """
    url = f"https://downloads.regulations.gov/{doc_id}/content.pdf"
    local_path = f"{doc_id}.pdf"
    drive_path = os.path.join(folder_path, f"{doc_id}.pdf")
    last_status = None

    for attempt in range(retries):
        try:
            response = requests.get(url, headers=HEADERS, timeout=15)
            last_status = response.status_code
            if last_status == 200:
                with open(local_path, 'wb') as f:
                    f.write(response.content)

                shutil.move(local_path, drive_path)
                return f"Uploaded to: {drive_path}", 200
            if last_status != 429 and last_status < 500:
                # Permanent failure (e.g. 404): retrying cannot help.
                logger.warning(f"Failed to download {doc_id}: HTTP {last_status}")
                return None, last_status
            logger.warning(f"Attempt {attempt+1} failed: HTTP {last_status}")
        except (requests.RequestException, OSError) as e:
            logger.warning(f"Attempt {attempt+1} failed: {e}")
        # Wait only when another attempt will follow.
        if attempt < retries - 1:
            time.sleep(delay)
    return None, last_status

# -------------------
# Save Metadata to CSV
# -------------------
def save_metadata_to_csv(records, path=METADATA_CSV):
    """Append metadata rows to a CSV file, writing the header when needed.

    Args:
        records: Iterable of dicts keyed by ``doc_id``, ``document_type``,
            ``title``, ``posted_date``, ``gdrive_url`` and ``download_status``.
        path: CSV file to append to; created if it does not exist.

    Returns:
        None.
    """
    headers = ["doc_id", "document_type", "title", "posted_date", "gdrive_url", "download_status"]
    # A missing file and an existing but empty file both still need the header
    # row; checking existence alone leaves an empty file headerless forever.
    write_header = not os.path.exists(path) or os.path.getsize(path) == 0

    with open(path, 'a', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        if write_header:
            writer.writeheader()
        writer.writerows(records)

# -------------------
# Main Pipeline
# -------------------
def run_pipeline(term='insurance', max_pages=10):
    """Search, download and record metadata for every matching document.

    Fetches documents for ``term`` (100 per page, at most ``max_pages``
    pages), tries to download each PDF into ``GDRIVE_FOLDER_PATH``, then
    appends one metadata row per document to the CSV and logs the row count.

    Args:
        term: Search term passed to ``search_documents``.
        max_pages: Page limit passed to ``search_documents``.
    """
    rows = []

    for entry in search_documents(term, page_size=100, max_pages=max_pages):
        identifier = entry.get('id')
        attributes = entry.get('attributes', {})
        upload_note, http_status = download_and_upload_pdf(identifier, GDRIVE_FOLDER_PATH)

        # Falsy download results (None) are stored as empty strings.
        row = dict(
            doc_id=identifier,
            document_type=attributes.get('documentType', ''),
            title=attributes.get('title', ''),
            posted_date=attributes.get('postedDate', ''),
            gdrive_url=upload_note if upload_note else '',
            download_status=http_status if http_status else '',
        )
        rows.append(row)

    save_metadata_to_csv(rows)
    logger.info(f"Saved metadata for {len(rows)} documents.")

# -------------------
# Run
# -------------------
# Executes immediately when the cell runs: searches for "insurance" documents
# (up to 10 pages), downloads their PDFs and appends their metadata to the CSV.
run_pipeline(term='insurance', max_pages=10)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).




In [None]:
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from google.colab import auth
import os

# Step 1: Authenticate with Google
auth.authenticate_user()

# Step 2: Setup GoogleAuth for Colab
gauth = GoogleAuth()

# Load previously saved OAuth credentials, if any, from credentials.json
# NOTE(review): confirm how LoadCredentialsFile behaves when the file does not
# exist yet (first run).
gauth.LoadCredentialsFile("credentials.json")

if gauth.credentials is None:
    # First-time login, use interactive browser
    # NOTE(review): LocalWebserverAuth presumably needs a browser on the machine
    # running the kernel; confirm this works in a hosted Colab runtime.
    gauth.LocalWebserverAuth()
elif gauth.access_token_expired:
    # Refresh expired credentials
    gauth.Refresh()
else:
    # Authorize in-memory if credentials already exist
    gauth.Authorize()

# Save credentials for future use
gauth.SaveCredentialsFile("credentials.json")

# Step 3: Create Google Drive object
# NOTE: an earlier cell imports `drive` from google.colab; this assignment
# rebinds that name, so re-running that cell's drive.mount(...) afterwards
# would call .mount on this GoogleDrive object instead.
drive = GoogleDrive(gauth)
