In [1]:
# Imports and project path setup

from pathlib import Path


# Resolve the project root: the parent folder when the kernel was started
# inside "Notebook", otherwise the working directory itself.
cwd = Path.cwd()
if cwd.name == "Notebook":
    PROJECT_ROOT = cwd.parent
else:
    PROJECT_ROOT = cwd

# The root folder is expected to be called "PTAB_Project3"; a mismatch only
# prints a notice and the notebook keeps going.
if not PROJECT_ROOT.name == "PTAB_Project3":
    print(f"⚠️ Note: expected project root to be 'PTAB_Project3' but found '{PROJECT_ROOT.name}'. Proceeding anyway.")

# Key locations that are used throughout the project.
DATA_DIR = PROJECT_ROOT / "data"
RAW_DIR = DATA_DIR / "raw"
PROCESSED_DIR = DATA_DIR / "processed"
INDEX_DIR = PROJECT_ROOT / "index"
MODELS_DIR = PROJECT_ROOT / "models"

# Create every project directory that does not exist yet.
for p in (DATA_DIR, RAW_DIR, PROCESSED_DIR, INDEX_DIR, MODELS_DIR):
    p.mkdir(parents=True, exist_ok=True)

# Echo the resolved locations so a wrong working directory is easy to spot.
print("Project root:    ", PROJECT_ROOT)
print("Notebook path:   ", cwd)
print("Data (raw):      ", RAW_DIR)
print("Data (processed):", PROCESSED_DIR)
print("Index:           ", INDEX_DIR)
print("Models:          ", MODELS_DIR)

Project root:     /Users/rellu/Documents/PTAB_Project/AI-Builders-Bootcamp-5/PTAB_Project3
Notebook path:    /Users/rellu/Documents/PTAB_Project/AI-Builders-Bootcamp-5/PTAB_Project3/Notebook
Data (raw):       /Users/rellu/Documents/PTAB_Project/AI-Builders-Bootcamp-5/PTAB_Project3/data/raw
Data (processed): /Users/rellu/Documents/PTAB_Project/AI-Builders-Bootcamp-5/PTAB_Project3/data/processed
Index:            /Users/rellu/Documents/PTAB_Project/AI-Builders-Bootcamp-5/PTAB_Project3/index
Models:           /Users/rellu/Documents/PTAB_Project/AI-Builders-Bootcamp-5/PTAB_Project3/models


In [2]:
# PTAB API setup + last-week date range

from datetime import datetime, timedelta, timezone
from pathlib import Path
import requests

# PTAB API base (v3 paths live under this host); endpoint paths are appended to it.
PTAB_BASE = "https://developer.uspto.gov/ptab-api"

# Folder for the PDFs the API returns: data/raw_pdfs (created here if it is missing).
RAW_PDFS_DIR = DATA_DIR / "raw_pdfs"
RAW_PDFS_DIR.mkdir(parents=True, exist_ok=True)

from datetime import datetime, timedelta, timezone

def last_week_dates_yyyy_mm_dd(today=None, days=7):
    """
    Return (from_date, to_date) as 'YYYY-MM-DD' strings for a look-back window.

    Args:
        today: Anchor day of the window. May be a ``date`` or a ``datetime``
            (only its calendar date is used). Defaults to the current UTC date.
        days: How many days before ``today`` the window starts (default 7).

    Returns:
        Tuple ``(from_date, to_date)`` where ``to_date`` is ``today`` and
        ``from_date`` is ``today - days``. If a consumer treats both bounds as
        inclusive, the window therefore spans ``days + 1`` calendar dates.

    Raises:
        ValueError: if ``days`` is negative (from_date would be after to_date).
    """
    if days < 0:
        raise ValueError("days must be non-negative")

    # Use timezone-aware UTC date
    if today is None:
        today = datetime.now(timezone.utc).date()
    elif isinstance(today, datetime):
        # datetime is a subclass of date; without this, isoformat() would
        # include the time part and break the 'YYYY-MM-DD' contract.
        today = today.date()

    to_date = today
    from_date = today - timedelta(days=days)
    return from_date.isoformat(), to_date.isoformat()

# Compute the query window once, when this cell runs; the fetch cell below reuses
# FROM_DATE / TO_DATE, so re-run this cell to refresh the window.
FROM_DATE, TO_DATE = last_week_dates_yyyy_mm_dd()
print(f"Querying PTAB for decisions from {FROM_DATE} to {TO_DATE}")

Querying PTAB for decisions from 2025-08-25 to 2025-09-01


In [3]:
# USPTO PTAB API Call: fetch PTAB decisions within [FROM_DATE, TO_DATE], with pagination

import time
import json
from typing import List, Dict, Any
import requests

def fetch_decisions(from_date: str, to_date: str, page_size: int = 100, max_pages: int = 50) -> List[Dict[str, Any]]:
    """
    Call the USPTO PTAB API /decisions endpoint to list decisions in a date range.

    Pages through the endpoint with 'page' and 'size' query parameters and
    returns a flat list of the 'decision' records (dicts as provided by the API).

    Args:
        from_date: Lower bound, sent as 'decisionFromDate'.
        to_date: Upper bound, sent as 'decisionToDate'.
        page_size: Number of rows requested per page ('size').
        max_pages: Hard cap on the number of pages requested.

    Returns:
        All rows collected before the loop stopped. On a request error, an
        undecodable body, or an unexpected response shape, the rows gathered
        so far are returned (possibly an empty list); nothing is raised.
    """
    all_rows: List[Dict[str, Any]] = []
    url = f"{PTAB_BASE}/decisions"  # identical for every page, so built once
    list_keys = ("decisions", "results", "items", "content")

    for page in range(max_pages):
        params = {
            "decisionFromDate": from_date,  # inclusive
            "decisionToDate": to_date,      # inclusive
            "size": page_size,
            "page": page,
        }
        try:
            resp = requests.get(url, params=params, timeout=30)
            resp.raise_for_status()
            # Decoding sits inside the try so a non-JSON body cannot crash the cell.
            data = resp.json()
        except requests.RequestException as e:
            print(f"❌ Request error on page {page}: {e}")
            break
        except ValueError as e:
            print(f"❌ Invalid JSON on page {page}: {e}")
            break

        # Defensive parsing. The bare-list case is checked first: calling
        # .get() on a list would raise AttributeError.
        if isinstance(data, list):
            rows = data
        elif isinstance(data, dict):
            rows = next((data[key] for key in list_keys if isinstance(data.get(key), list)), [])
        else:
            rows = []

        if not rows:
            # either no results or unexpected shape; show keys for debugging
            described = list(data.keys()) if isinstance(data, dict) else f"<{type(data).__name__}>"
            print(f"⚠️ No rows found on page {page}. Top-level keys: {described}")
            break

        all_rows.extend(rows)
        print(f"Fetched page {page}: {len(rows)} rows (total so far: {len(all_rows)})")

        # stop if this page wasn't full (likely last page)
        # NOTE(review): if the server caps 'size' below page_size, every page looks
        # "short" and paging stops after the first one — confirm against the API docs.
        if len(rows) < page_size:
            break

        time.sleep(0.2)  # tiny pause to be polite
    else:
        # Loop ran out of pages without hitting a stop condition.
        if all_rows:
            print(f"⚠️ Stopped after max_pages={max_pages}; more results may exist.")

    return all_rows

# ---- run the fetch for your last-week window ----
decisions_last_week = fetch_decisions(FROM_DATE, TO_DATE, page_size=100)
print(f"\n✅ Total decisions fetched: {len(decisions_last_week)}")

# Fields worth previewing; a field is shown only when the record carries it.
preview_keys = [
    "proceedingNumber",
    "decisionDate",
    "documentIdentifier",
    "documentId",
    "partyName",
    "applicationNumber",
    "technologyCenter",
]

print("\nSample rows (up to 3):")
for row in decisions_last_week[:3]:
    sample = {k: row[k] for k in preview_keys if k in row}
    print(json.dumps(sample, indent=2))

Fetched page 0: 25 rows (total so far: 25)

✅ Total decisions fetched: 25

Sample rows (up to 3):
{
  "proceedingNumber": "2024001413",
  "decisionDate": "08-25-2025",
  "documentIdentifier": "3c30a4ca-c489-4853-87d7-2b698c11348b"
}
{
  "proceedingNumber": "2024003328",
  "decisionDate": "08-25-2025",
  "documentIdentifier": "6e1d62ef-5a33-4f3f-a136-f490d83f0f25"
}
{
  "proceedingNumber": "2024003699",
  "decisionDate": "08-25-2025",
  "documentIdentifier": "09abf05b-3b0e-484d-8817-0fd2ee3dc740"
}


In [4]:
# Download decision PDFs to data/raw_pdfs folder

import os
import re
from typing import Optional

def sanitize(s: str) -> str:
    """
    Make a value safe for filenames: keep alphanumerics, dash, underscore.

    Every run of other characters becomes a single underscore, and leading /
    trailing underscores are stripped. Non-string values (for example a number
    decoded from JSON) are converted with str() first; None yields "".
    """
    if s is None:
        return ""
    return re.sub(r"[^A-Za-z0-9_\-]+", "_", str(s)).strip("_")

def make_pdf_filename(proceeding: str, decision_date: str, doc_id: str) -> str:
    """
    Build the on-disk name for a decision PDF, e.g.
      2024001413_2025-08-25_3c30a4ca-c489-4853-87d7-2b698c11348b.pdf

    A date shaped like MM-DD-YYYY is reordered to YYYY-MM-DD; any other date
    string is used unchanged. Each of the three parts is passed through
    sanitize() and the parts are joined with underscores.
    """
    date_part = decision_date
    if re.match(r"^\d{2}-\d{2}-\d{4}$", date_part):  # MM-DD-YYYY
        month, day, year = date_part.split("-")
        date_part = "-".join((year, month, day))

    parts = (proceeding, date_part, doc_id)
    return "_".join(sanitize(part) for part in parts) + ".pdf"

def download_pdf(document_identifier: str, out_path: Path, timeout: int = 60) -> bool:
    """
    Download a single PDF by documentIdentifier into out_path.

    The body is streamed into a sibling "<name>.part" file and only renamed to
    out_path once the whole download has succeeded, so an interrupted or failed
    download never leaves a partial file under the final name.

    Args:
        document_identifier: Identifier placed in the /documents/<id>/download URL.
        out_path: Destination file (a Path or a path string).
        timeout: Timeout in seconds passed to requests.get.

    Returns:
        True if the file was saved, False if the request or the local write
        failed (the error is printed, not raised).
    """
    url = f"{PTAB_BASE}/documents/{document_identifier}/download"
    out_path = Path(out_path)
    tmp_path = out_path.with_name(out_path.name + ".part")
    try:
        with requests.get(url, stream=True, timeout=timeout) as r:
            r.raise_for_status()
            # simple content-type check (sometimes it's octet-stream)
            ctype = (r.headers.get("Content-Type") or "").lower()
            if "pdf" not in ctype and "octet-stream" not in ctype:
                print(f"⚠️ Unexpected content-type for {document_identifier}: {ctype}")
            # write to disk in chunks
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
        # Same-directory rename: the final name appears only when complete.
        tmp_path.replace(out_path)
        return True
    except (requests.RequestException, OSError) as e:
        print(f"❌ Download failed for {document_identifier}: {e}")
        return False
    finally:
        # Best-effort cleanup of a leftover partial file (also runs on interrupt).
        try:
            tmp_path.unlink(missing_ok=True)
        except OSError:
            pass

# Smallest size (in bytes) a downloaded file must exceed to count as a real PDF.
# Used for both the "already present" check and the post-download check so the
# two agree on what a usable file is.
MIN_PDF_BYTES = 1024

saved, skipped, failed = 0, 0, 0

for row in decisions_last_week:
    doc_id = row.get("documentIdentifier")
    proceeding = row.get("proceedingNumber") or "unknown"
    decision_date = row.get("decisionDate") or "unknown-date"
    if not doc_id:
        # if this happens, we could later add a "list documents by proceeding" step
        # (rows without an identifier are not included in any of the counters)
        print(f"⚠️ No documentIdentifier for proceeding {proceeding}; skipping.")
        continue

    fname = make_pdf_filename(proceeding, decision_date, doc_id)
    out_path = RAW_PDFS_DIR / fname

    # A tiny leftover file is not treated as "already present"; it is fetched again.
    if out_path.exists() and out_path.stat().st_size > MIN_PDF_BYTES:
        skipped += 1
        continue

    ok = download_pdf(doc_id, out_path)
    if ok and out_path.exists() and out_path.stat().st_size > MIN_PDF_BYTES:
        saved += 1
    else:
        # remove zero-byte or tiny files if any; report (rather than hide) a
        # cleanup problem, but keep going with the remaining rows
        try:
            out_path.unlink(missing_ok=True)
        except OSError as e:
            print(f"⚠️ Could not remove incomplete file {out_path.name}: {e}")
        failed += 1

print(f"\n📄 PDFs saved: {saved}, skipped (already present): {skipped}, failed: {failed}")
print(f"Files are in: {RAW_PDFS_DIR}")


📄 PDFs saved: 0, skipped (already present): 25, failed: 0
Files are in: /Users/rellu/Documents/PTAB_Project/AI-Builders-Bootcamp-5/PTAB_Project3/data/raw_pdfs


In [5]:
# Convert raw PDFs to raw .txt files

from pypdf import PdfReader

# Folder holding the downloaded PDFs (data/raw_pdfs); assigned and created in this
# cell so the conversion below does not depend on an earlier cell having done it.
RAW_PDFS_DIR = DATA_DIR / "raw_pdfs"
RAW_PDFS_DIR.mkdir(parents=True, exist_ok=True)

def extract_pdf_text(pdf_path: Path, max_pages: int | None = None) -> str:
    """
    Extract text from a PDF using pypdf.
    - max_pages: limit pages to speed things up (or None = all pages).
    """
    reader = PdfReader(str(pdf_path))
    pages = reader.pages
    n = len(pages) if max_pages is None else min(len(pages), max_pages)
    chunks = []
    for i in range(n):
        t = pages[i].extract_text() or ""
        if t.strip():
            chunks.append(t)
    return "\n\n".join(chunks)

pdf_files = sorted(RAW_PDFS_DIR.glob("*.pdf"))
print(f"Found {len(pdf_files)} PDFs in {RAW_PDFS_DIR}")

for pdf_path in pdf_files:
    # Use the filename (without extension) as the document id and title
    doc_id = pdf_path.stem
    title = doc_id.replace("_", " ").replace("-", " ").strip()

    # Extract text (you can raise max_pages or set to None if you want full docs).
    # A file that cannot be parsed (damaged download, non-PDF content saved under
    # a .pdf name) is reported and skipped so it does not abort the whole batch.
    try:
        raw_text = extract_pdf_text(pdf_path, max_pages=20)
    except Exception as e:
        print(f"⚠️ Skipping (could not read PDF): {pdf_path.name}: {e}")
        continue

    if not raw_text.strip():
        print(f"⚠️ Skipping (no text extracted): {pdf_path.name}")
        continue

    # Save to RAW_DIR as a .txt file (first line = title, then body)
    out_txt = RAW_DIR / f"{doc_id}.txt"
    out_txt.write_text(f"{title}\n\n{raw_text}", encoding="utf-8")
    print(f"📝 Wrote raw text: {out_txt.name}")

print("\n✅ Done converting PDFs.")

Found 26 PDFs in /Users/rellu/Documents/PTAB_Project/AI-Builders-Bootcamp-5/PTAB_Project3/data/raw_pdfs
📝 Wrote raw text: 2024000857_2025-08-26_75d26863-bf26-41d9-92c5-17605cea3c51.txt
📝 Wrote raw text: 2024001050_2025-08-26_96b1de0c-7637-4aeb-b290-ea76af8a4087.txt
📝 Wrote raw text: 2024001413_2025-08-25_3c30a4ca-c489-4853-87d7-2b698c11348b.txt
📝 Wrote raw text: 2024002039_2025-08-25_8cf23631-8e2c-4060-a7c4-534b97087a2d.txt
📝 Wrote raw text: 2024002880_2025-08-26_6583dc03-0b34-46bd-a958-93345eb5e15c.txt
📝 Wrote raw text: 2024003101_2025-08-26_0d0f2c69-9dc8-4643-a019-914b4792e551.txt
📝 Wrote raw text: 2024003328_2025-08-25_6e1d62ef-5a33-4f3f-a136-f490d83f0f25.txt
📝 Wrote raw text: 2024003687_2025-08-25_4d318724-04f0-46ec-acb8-3a93d92886bf.txt
📝 Wrote raw text: 2024003699_2025-08-25_09abf05b-3b0e-484d-8817-0fd2ee3dc740.txt
📝 Wrote raw text: 2024003717_2025-08-26_60b52718-384b-44de-abeb-c7e7a9816963.txt
📝 Wrote raw text: 2024003965_2025-08-27_d8cc0133-42e0-4bb3-bd78-6badfcc759d0.txt
📝 Wro

In [6]:
# Define functions that will be used for ingesting pdfs

from typing import List, Tuple
import re

def clean_text(text: str) -> str:
    """
    Light cleanup for legal PDFs/text:
    - normalize Windows ('\\r\\n') and old Mac ('\\r') line endings to '\\n'
    - strip spaces/tabs that sit directly before or after a newline
    - fix hyphenated line breaks like 'compu-\\ntation' -> 'computation'
    - collapse 3+ newlines to 2 (keeps paragraphs but removes huge gaps)
    - collapse runs of spaces/tabs to a single space
    - strip leading/trailing whitespace

    Returns "" for empty or None input.
    """
    if not text:
        return ""

    # normalize newlines
    t = text.replace("\r\n", "\n").replace("\r", "\n")

    # Strip extra space around newlines FIRST: lines holding only blanks would
    # otherwise hide newline runs from the collapse step, and a trailing blank
    # after a hyphen would hide the hyphenated break.
    t = re.sub(r"[ \t]*\n[ \t]*", "\n", t)

    # fix common hyphenation across line breaks: word-\nword -> wordword
    # (this also joins genuinely hyphenated compounds that were split at the hyphen)
    t = re.sub(r"(\w)-\n(\w)", r"\1\2", t)

    # collapse 3+ newlines -> 2 newlines (paragraph spacing)
    t = re.sub(r"\n{3,}", "\n\n", t)

    # collapse runs of spaces/tabs
    t = re.sub(r"[ \t]{2,}", " ", t)

    return t.strip()


def list_raw_text_files(raw_dir=RAW_DIR) -> List[Path]:
    """
    List the .txt files directly inside raw_dir (data/raw by default), sorted.
    Handy for checking which inputs are waiting before processing starts.
    """
    txt_paths = list(raw_dir.glob("*.txt"))
    txt_paths.sort()
    return txt_paths


def save_processed_text(doc_id: str, title: str, body: str, processed_dir=PROCESSED_DIR) -> Path:
    """
    Write a cleaned document to processed_dir (data/processed by default) as
    UTF-8 text named "<doc_id>.txt", creating the folder if needed.

    File layout:
      - stripped title on the first line
      - one blank line
      - stripped body, followed by a final newline

    Returns the Path of the file that was written.
    """
    processed_dir.mkdir(parents=True, exist_ok=True)

    destination = processed_dir / f"{doc_id}.txt"
    document = "\n\n".join((title.strip(), body.strip())) + "\n"
    destination.write_text(document, encoding="utf-8")
    return destination


def read_txt(path: Path) -> Tuple[str, str]:
    """
    Load a .txt file and return it as (title, body).

    The first line (stripped) is the title and everything after it (stripped)
    is the body. When the first line is blank, or the file has no lines at
    all, the filename stem serves as the title. Undecodable bytes are dropped.
    """
    lines = path.read_text(encoding="utf-8", errors="ignore").splitlines()
    if not lines:
        return path.stem, ""

    first_line, *remaining = lines
    return first_line.strip() or path.stem, "\n".join(remaining).strip()

In [7]:
# Process all raw text files and save the cleaned versions to data/processed

raw_files = list_raw_text_files()
print(f"Found {len(raw_files)} raw text files.")

for path in raw_files:
    # the filename stem doubles as the document id
    doc_id = path.stem

    # split the raw file into its title line and body, then tidy the body
    title, body = read_txt(path)
    cleaned_body = clean_text(body)

    # write the cleaned document to the processed folder
    out_path = save_processed_text(doc_id, title, cleaned_body)
    print(f"Processed and saved: {out_path.name}")

print("\n✅ All raw files processed into data/processed/")

Found 27 raw text files.
Processed and saved: 2024000857_2025-08-26_75d26863-bf26-41d9-92c5-17605cea3c51.txt
Processed and saved: 2024001050_2025-08-26_96b1de0c-7637-4aeb-b290-ea76af8a4087.txt
Processed and saved: 2024001413_2025-08-25_3c30a4ca-c489-4853-87d7-2b698c11348b.txt
Processed and saved: 2024002039_2025-08-25_8cf23631-8e2c-4060-a7c4-534b97087a2d.txt
Processed and saved: 2024002880_2025-08-26_6583dc03-0b34-46bd-a958-93345eb5e15c.txt
Processed and saved: 2024003101_2025-08-26_0d0f2c69-9dc8-4643-a019-914b4792e551.txt
Processed and saved: 2024003328_2025-08-25_6e1d62ef-5a33-4f3f-a136-f490d83f0f25.txt
Processed and saved: 2024003687_2025-08-25_4d318724-04f0-46ec-acb8-3a93d92886bf.txt
Processed and saved: 2024003699_2025-08-25_09abf05b-3b0e-484d-8817-0fd2ee3dc740.txt
Processed and saved: 2024003717_2025-08-26_60b52718-384b-44de-abeb-c7e7a9816963.txt
Processed and saved: 2024003965_2025-08-27_d8cc0133-42e0-4bb3-bd78-6badfcc759d0.txt
Processed and saved: 2024003977_2025-08-27_94f3e88e