In [5]:
import pandas as pd, requests, os
from urllib.parse import urlsplit

# Download every PDF linked anywhere in the exported spreadsheet into PDF_DIR.
# Export your Google Sheet as CSV first.
SPREADSHEET_CSV = "data/supplementals/supplementals.csv"
PDF_DIR = "docs"
os.makedirs(PDF_DIR, exist_ok=True)

df = pd.read_csv(SPREADSHEET_CSV)
# Flatten all non-empty cells into one list of candidate links.
# dict.fromkeys de-duplicates while keeping first-seen order, so a link that
# appears in several cells is fetched only once.
links = list(dict.fromkeys(df.stack().dropna().astype(str).str.strip()))

for url in links:
    try:
        # Look at the URL *path* only, so "report.pdf?download=1" or
        # "report.pdf#page=2" is still recognised as a PDF link.
        url_path = urlsplit(url).path
    except ValueError:
        continue  # cell text that cannot be parsed as a URL at all
    if not url_path.lower().endswith(".pdf"):
        continue
    try:
        fname = url_path.rsplit("/", 1)[-1] or "download.pdf"
        path = os.path.join(PDF_DIR, fname)
        r = requests.get(url, timeout=20, verify=True)
        r.raise_for_status()
        # Some hosts answer 200 with an HTML landing/login page. Saving that as
        # *.pdf would only fail later when the file is parsed, so reject it here.
        if b"%PDF-" not in r.content[:1024]:
            raise ValueError("response body is not a PDF")
        with open(path, "wb") as f:
            f.write(r.content)
        print("✓ Downloaded:", fname)
    except Exception as e:
        # Best-effort crawl: report the failure and move on to the next link.
        print("✗ Skipped:", url, "->", e)


✓ Downloaded: 2023-02484.pdf
✓ Downloaded: FoodCodeRuleRevision-SignificantAnalysis.pdf
✗ Skipped: https://templates.upmetrics.co/wp-content/uploads/2022/07/food-delivery-business-plan-example.pdf -> HTTPSConnectionPool(host='templates.upmetrics.co', port=443): Max retries exceeded with url: /wp-content/uploads/2022/07/food-delivery-business-plan-example.pdf (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1028)')))
✗ Skipped: https://jungleworks.com/download/Guide-UberEats.pdf -> 403 Client Error: Forbidden for url: https://jungleworks.com/download/Guide-UberEats.pdf
✓ Downloaded: JFDR55.2_4_Peng.pdf
✓ Downloaded: 882.pdf
✗ Skipped: https://www.emerald.com/bpmj/article-pdf/29/8/210/1735135/bpmj-04-2023-0308.pdf -> 403 Client Error: Forbidden for url: https://www.emerald.com/bpmj/article-pdf/29/8/210/1735135/bpmj-04-2023-0308.pdf
✓ Downloaded: id611.pdf
✗ Skipped: https://www.emer

In [6]:
# rag.ipynb

# === 0. Setup ===
%pip install sentence-transformers faiss-cpu PyPDF2 openai tiktoken

import os, re, hashlib, pickle
from pathlib import Path
import numpy as np
import faiss
from PyPDF2 import PdfReader
from sentence_transformers import SentenceTransformer
import tiktoken

PDF_DIR = "docs"        # put your PDFs here
PAGE_FILE = "page.file"
TOPK = 6
CHUNK_TOKENS = 150      # target tokens per chunk

model = SentenceTransformer("all-MiniLM-L6-v2")
tokenizer = tiktoken.get_encoding("cl100k_base")


Collecting sentence-transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp313-cp313-macosx_14_0_arm64.whl.metadata (5.1 kB)
Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Collecting openai
  Downloading openai-1.105.0-py3-none-any.whl.metadata (29 kB)
Collecting tiktoken
  Downloading tiktoken-0.11.0-cp313-cp313-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading transformers-4.56.0-py3-none-any.whl.metadata (40 kB)
Collecting tqdm (from sentence-transformers)
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting torch>=1.11.0 (from sentence-transformers)
  Downloading torch-2.8.0-cp313-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting scikit-learn (from sentence-transformers)
  Downloading scikit_learn-1.7.1-cp313-cp313-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scip

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# === 1. Helpers ===

def file_sig(path: Path):
    """Return a cheap change-detection fingerprint for the file at `path`.

    The fingerprint is the MD5 hex digest of the file's modification time in
    nanoseconds followed directly by its size in bytes (both as decimal text).
    File contents are never read.
    """
    fields = (path.stat().st_mtime_ns, path.stat().st_size)
    payload = "".join(str(field) for field in fields)
    return hashlib.md5(payload.encode()).hexdigest()

def load_texts(pdf_path: Path):
    """Extract the text of every non-blank page of a PDF.

    Returns a list of ``(text, meta)`` tuples in page order, where ``meta`` is
    ``{"doc": <file name>, "page": <1-based page number>}``. Pages whose
    extracted text is missing or whitespace-only are left out.
    """
    pages = PdfReader(str(pdf_path)).pages
    # Extract lazily, exactly once per page; a None result is treated as "".
    extracted = (
        (number, page.extract_text() or "")
        for number, page in enumerate(pages, start=1)
    )
    return [
        (body, {"doc": pdf_path.name, "page": number})
        for number, body in extracted
        if body.strip()
    ]

def chunk_text_tokens(text, meta, max_tokens=CHUNK_TOKENS):
    """Split `text` into whitespace-joined chunks of at most ~`max_tokens` tokens.

    Words are never split: each word's token count is measured on its own with
    the module-level `tokenizer`, and a chunk is closed as soon as adding the
    next word would exceed `max_tokens`. A single word that is itself longer
    than `max_tokens` becomes a chunk of its own.

    Args:
        text: Source text (split on any whitespace).
        meta: Dict copied into every chunk's metadata.
        max_tokens: Token budget per chunk.

    Returns:
        ``(chunks, metas)`` – two parallel lists. Each entry of ``metas`` is
        ``meta`` plus ``"chunk"``, the 1-based position of the chunk within
        this `text`.
    """
    words = text.split()
    chunks, metas, cur, cur_len = [], [], [], 0
    for w in words:
        # disallowed_special=() makes the tokenizer treat text that happens to
        # spell a special token (e.g. "<|endoftext|>") as ordinary text instead
        # of raising ValueError.
        tlen = len(tokenizer.encode(w, disallowed_special=()))
        # Only flush a non-empty buffer; otherwise an over-long first word
        # would emit an empty "" chunk ahead of itself.
        if cur and cur_len + tlen > max_tokens:
            chunks.append(" ".join(cur))
            metas.append({**meta, "chunk": len(chunks)})
            cur, cur_len = [], 0
        cur.append(w)
        cur_len += tlen
    if cur:
        chunks.append(" ".join(cur))
        metas.append({**meta, "chunk": len(chunks)})
    return chunks, metas


In [8]:
# === 2. Build pagefile ===

def build_pagefile(pdf_dir=PDF_DIR, path=PAGE_FILE):
    """Chunk and embed every PDF in `pdf_dir`, index the vectors, and pickle it all.

    The pickle written to `path` is a dict with keys ``"ix"`` (the faiss index),
    ``"X"`` (float32 embedding matrix), ``"chunks"``, ``"metas"`` and
    ``"manifest"`` (``{pdf path: file_sig}`` for the files that were indexed).

    Args:
        pdf_dir: Directory searched (non-recursively) for ``*.pdf`` files.
        path: Destination of the pickle file.

    Returns:
        ``(ix, all_chunks, all_metas)``.

    Raises:
        ValueError: If no text could be extracted from any PDF in `pdf_dir`.
    """
    # Glob once and sort: the index order is then reproducible across runs, and
    # the manifest describes exactly the files that were indexed.
    pdf_paths = sorted(Path(pdf_dir).glob("*.pdf"))
    all_chunks, all_metas = [], []
    for pdf in pdf_paths:
        for t, meta in load_texts(pdf):
            cs, ms = chunk_text_tokens(t, meta)
            all_chunks.extend(cs)
            all_metas.extend(ms)
    if not all_chunks:
        # Without this guard the failure surfaces as an opaque IndexError on
        # X.shape[1], because encoding an empty list yields no 2-D matrix.
        raise ValueError(f"No extractable text found in any PDF under {str(pdf_dir)!r}")
    X = np.asarray(model.encode(all_chunks, convert_to_numpy=True), "float32")
    ix = faiss.IndexFlatL2(X.shape[1])
    ix.add(X)
    manifest = {str(p): file_sig(p) for p in pdf_paths}
    with open(path, "wb") as f:
        pickle.dump({"ix": ix, "X": X, "chunks": all_chunks,
                     "metas": all_metas, "manifest": manifest}, f)
    return ix, all_chunks, all_metas

def load_pagefile(path=PAGE_FILE):
    """Read a pickled page file and return ``(ix, chunks, metas)`` from it.

    Unpickling can execute arbitrary code, so only load page files that this
    notebook (or another trusted source) produced.
    """
    with open(path, "rb") as handle:
        stored = pickle.load(handle)
    return tuple(stored[key] for key in ("ix", "chunks", "metas"))


In [9]:
# === 3. Query ===

def query_rag(query, ix, chunks, metas, k=TOPK):
    """Return up to `k` nearest chunks to `query` from the index.

    Args:
        query: Natural-language query; embedded with the module-level `model`.
        ix: Index to search; must expose ``search(vectors, k) -> (D, I)``.
        chunks: Chunk texts, positionally aligned with the index.
        metas: Chunk metadata dicts (with ``"doc"`` and ``"page"``), aligned
            with `chunks`.
        k: Maximum number of hits.

    Returns:
        List of ``(rank, distance, doc, page, snippet)`` tuples ordered as the
        index returned them; ``rank`` starts at 1 and ``snippet`` is the first
        120 characters of the chunk with newlines replaced by spaces. Fewer
        than `k` tuples are returned when the index holds fewer than `k`
        vectors.
    """
    qvec = np.asarray(model.encode([query], convert_to_numpy=True), "float32")
    D, I = ix.search(qvec, k)
    results = []
    for d, idx in zip(D[0], I[0]):
        # faiss pads missing neighbours with label -1; indexing metas[-1]
        # would silently report the *last* chunk as a hit.
        if idx < 0:
            continue
        m = metas[idx]
        snip = chunks[idx][:120].replace("\n", " ")
        results.append((len(results) + 1, d, m["doc"], m["page"], snip))
    return results

def show_results(results):
    """Print a header and one formatted line per result tuple.

    Each result is indexed positionally as
    ``(rank, distance, doc, page, snippet)``.
    """
    row_template = "{0:>2}. L2^2={1:.3f}  {2} p{3}  '{4}'"
    print("# Semantic Page Table")
    for row in results:
        print(row_template.format(row[0], row[1], row[2], row[3], row[4]))


In [10]:
# === 4. Export Bullet Notes ===

def export_notes(results, outfile="bullet_notes.md"):
    """Write query results to `outfile` as a Markdown bullet list.

    Args:
        results: Iterable of ``(rank, distance, doc, page, snippet)`` tuples.
        outfile: Destination path; missing parent directories are created.

    The file is always written as UTF-8, because it contains "…" and whatever
    characters the PDF snippets carry, which a platform-default encoding may
    not be able to represent.
    """
    Path(outfile).parent.mkdir(parents=True, exist_ok=True)
    with open(outfile, "w", encoding="utf-8") as f:
        f.write("# Bullet Notes (Auto-Generated)\n\n")
        for rank, d, doc, page, snip in results:
            f.write(f"- **{doc} p{page}**: {snip.strip()} …\n")
    print(f"Saved {outfile}")


In [12]:
# === Run RAG on all PDFs ===
# The names ix, chunks and metas assigned here are read as globals by the
# summary/export functions defined in later cells.

# 1. Build (or rebuild) the index from everything in docs/
#    (always re-embeds every PDF and overwrites PAGE_FILE).
ix, chunks, metas = build_pagefile(PDF_DIR, PAGE_FILE)

# 2. Ask a query
query = "What are the allergen labeling requirements?"
results = query_rag(query, ix, chunks, metas, k=6)

# 3. Show results
show_results(results)

# 4. Export notes (optional, to data/supplementals/)
export_notes(results, outfile="data/supplementals/notes.md")


# Semantic Page Table
 1. L2^2=1.129  FoodCodeRuleRevision-SignificantAnalysis.pdf p42  'food. Benefits: The proposed change has the benefit of potentially reducing risk of foodborne illness for populations th'
 2. L2^2=1.242  11782-case-studies-food-loss-and-waste-in-north-america-en.pdf p50  'Kingdom. OECD Foo d, Agriculture and Fisheries Papers 76. Pingree, C. 2016. Introducing commonsense bill to standardize '
 3. L2^2=1.265  FoodCodeRuleRevision-SignificantAnalysis.pdf p4  '(Title, Number or Code Year Only); (3) Incorporate legislation passed by state legislature; (4) Incorpora te RCW or rule'
 4. L2^2=1.304  FoodCodeRuleRevision-SignificantAnalysis.pdf p42  'specific cost estimates . One respondent indicated they repackage a lo t of food and will have to arrange for another vo'
 5. L2^2=1.315  2023-02484.pdf p12  'burden of the proposed collection of information, including the validity of the methodology and assumptions used; (c) wa'
 6. L2^2=1.316  JFDR55.2_4_Peng.pdf p17  'of i

In [15]:
def summarize_all_pdfs(pdf_dir=PDF_DIR, k=8):
    """Write one ``data/supplementals/<stem>_notes.md`` file per PDF in `pdf_dir`.

    Uses the notebook-level ``ix``, ``chunks`` and ``metas`` globals, so the
    index must have been built or loaded first.

    Args:
        pdf_dir: Directory searched (non-recursively) for ``*.pdf`` files.
        k: Maximum number of bullets written per PDF.

    Raises:
        RuntimeError: If the global index holds no chunks.
    """
    if not chunks:
        raise RuntimeError("Index is empty; run build_pagefile() before summarizing.")
    for pdf in sorted(Path(pdf_dir).glob("*.pdf")):
        base = pdf.stem
        # The index spans *all* PDFs, so a plain top-k search would fill this
        # file's notes with chunks from other documents. Rank every chunk,
        # then keep only the hits that belong to this PDF.
        hits = query_rag(f"Summarize the key compliance and tax rules from {base}",
                         ix, chunks, metas, k=len(chunks))
        own = [hit for hit in hits if hit[2] == pdf.name][:k]
        # Re-number so ranks are 1..k within this document.
        results = [(rank, *hit[1:]) for rank, hit in enumerate(own, 1)]
        outfile = f"data/supplementals/{base}_notes.md"
        Path(outfile).parent.mkdir(parents=True, exist_ok=True)
        export_notes(results, outfile)
        print("✓ Saved notes for", base)

In [16]:
# Write one data/supplementals/<stem>_notes.md per PDF in PDF_DIR.
# Requires the ix / chunks / metas globals from the index-building cell.
summarize_all_pdfs(PDF_DIR)

Saved data/supplementals/11782-case-studies-food-loss-and-waste-in-north-america-en_notes.md
✓ Saved notes for 11782-case-studies-food-loss-and-waste-in-north-america-en
Saved data/supplementals/Vol.13%20No.5.37_notes.md
✓ Saved notes for Vol.13%20No.5.37
Saved data/supplementals/paper_notes.md
✓ Saved notes for paper
Saved data/supplementals/861943-1255479_notes.md
✓ Saved notes for 861943-1255479
Saved data/supplementals/IJRPR28409_notes.md
✓ Saved notes for IJRPR28409
Saved data/supplementals/882_notes.md
✓ Saved notes for 882
Saved data/supplementals/JHTC_Vol7Issue1_Khan_case_notes.md
✓ Saved notes for JHTC_Vol7Issue1_Khan_case
Saved data/supplementals/2023-02484_notes.md
✓ Saved notes for 2023-02484
Saved data/supplementals/644_notes.md
✓ Saved notes for 644
Saved data/supplementals/ba6c61f24ca22c12ba2d400f8e6ba711c010_notes.md
✓ Saved notes for ba6c61f24ca22c12ba2d400f8e6ba711c010
Saved data/supplementals/1201-1602245215_notes.md
✓ Saved notes for 1201-1602245215
Saved data/suppl

In [17]:
from pathlib import Path

def export_all_notes(pdf_dir=PDF_DIR, outfile="supplementals/all_notes.md", k=8):
    """Write one combined Markdown file with a bullet section per PDF.

    Uses the notebook-level ``ix``, ``chunks`` and ``metas`` globals, so the
    index must have been built or loaded first.

    Args:
        pdf_dir: Directory searched (non-recursively) for ``*.pdf`` files.
        outfile: Destination path; its parent directory is created if missing.
        k: Maximum number of bullets written per PDF.

    Raises:
        RuntimeError: If the global index holds no chunks.
    """
    if not chunks:
        raise RuntimeError("Index is empty; run build_pagefile() before exporting.")
    # Create the directory the caller actually asked for, not a fixed one.
    Path(outfile).parent.mkdir(exist_ok=True, parents=True)
    # UTF-8 explicitly: the notes contain "…" and arbitrary PDF text.
    with open(outfile, "w", encoding="utf-8") as f:
        f.write("# Combined Bullet Notes from All PDFs\n\n")
        for pdf in sorted(Path(pdf_dir).glob("*.pdf")):
            base = pdf.stem
            # The index spans all PDFs; rank every chunk and keep only the
            # hits from this document so each section describes its own file.
            hits = query_rag(f"Summarize the key compliance and tax rules from {base}",
                             ix, chunks, metas, k=len(chunks))
            own = [hit for hit in hits if hit[2] == pdf.name][:k]
            f.write(f"## {base}\n\n")
            for _rank, _dist, doc, page, snip in own:
                f.write(f"- **{doc} p{page}**: {snip.strip()} …\n")
            f.write("\n")
    print("✓ Saved", outfile)


In [18]:
# Rebuild the index from scratch. This always re-embeds every PDF and
# overwrites PAGE_FILE; an existing page file is not reused here.
ix, chunks, metas = build_pagefile(PDF_DIR, PAGE_FILE)

# Export combined notes
export_all_notes(PDF_DIR, outfile="supplementals/all_notes.md")


✓ Saved supplementals/all_notes.md
