<a href="https://colab.research.google.com/github/RegNLP/ReguSum/blob/main/ReguSum_Data_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# ===========================================
# ReguSum: End-to-end build + stats (with stats/ folder and optional Drive save)
# ===========================================

# -----------------------------
# Config
# -----------------------------
# Google Drive folder ID; interpolated into the gdown folder URL in Step 1.
FOLDER_ID = "16fcIZvW8xR6-tfRK5X_o7zLcpze_Rm0d"
BASE_DIR  = "/content"                              # base working dir
DATA_DIR  = f"{BASE_DIR}/data"
# Step 1 downloads the DocumentsInfo CSVs here.
DOCINFO_DIR = f"{DATA_DIR}/DocumentsInfo"
# Step 2 saves each downloaded page here as <DocumentID>.htm.
HTML_DIR    = f"{DATA_DIR}/DocumentsHTML"
# Step 3 writes the parsed dataset (a JSON list of document records) here.
DATASET_PATH = f"{DATA_DIR}/regusum_dataset.json"

STATS_DIR = f"{BASE_DIR}/stats"                     # <-- stats folder
# NOTE(review): no cell visible in this notebook reads SAVE_TO_DRIVE or DRIVE_DIR;
# confirm whether the Drive-copy step still exists before relying on them.
SAVE_TO_DRIVE = False                              # <-- set True to copy outputs to Drive
DRIVE_DIR = "/content/drive/MyDrive/ReguSum-GithubRepo"        # where to copy if SAVE_TO_DRIVE=True

# -----------------------------
# Step 0: deps & dirs
# -----------------------------
!pip -q install gdown pandas beautifulsoup4 lxml tqdm requests nltk matplotlib numpy

import os, re, glob, json, time
from pathlib import Path
from urllib.parse import urlparse

import pandas as pd
import numpy as np
import requests
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
from tqdm import tqdm
import nltk
nltk.download("punkt", quiet=True)
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize, word_tokenize

# Make dirs
# parents=True also creates DATA_DIR; exist_ok=True makes re-running this cell safe.
Path(DOCINFO_DIR).mkdir(parents=True, exist_ok=True)
Path(HTML_DIR).mkdir(parents=True, exist_ok=True)
Path(STATS_DIR).mkdir(parents=True, exist_ok=True)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [2]:
# -----------------------------
# Step 1: Download DocumentsInfo CSVs
# -----------------------------
print("Downloading 'DocumentsInfo' from Google Drive...")
!gdown --folder "https://drive.google.com/drive/folders/{FOLDER_ID}" -O {DOCINFO_DIR} --remaining-ok

# Collect the downloaded CSVs in a deterministic (sorted) order; `csv_paths` is
# consumed by the Step 2 cell below.
csv_paths = sorted(glob.glob(os.path.join(DOCINFO_DIR, "*.csv")))
print(f"\nFound {len(csv_paths)} CSV file(s) in {DOCINFO_DIR}:\n")
# Show at most the first ten file names to keep the cell output short.
for p in csv_paths[:10]:
    print(" -", os.path.basename(p))
if len(csv_paths) > 10:
    print(f" ... and {len(csv_paths)-10} more.")

Downloading 'DocumentsInfo' from Google Drive...
Retrieving folder contents
Processing file 1J_bRWMhNyO-7aD7pdszS9JAn5FGvoCKS m6g-oqrn-vuxf.csv
Processing file 1Uoyv4-pA0lmp5dUAOfdDzmgA5khG-OuY m6i-1gtx-mmls.csv
Processing file 1u2nDMbmHOyoyPVlAYQ1nU3BHJm4te5og m6i-1jxq-2dx1.csv
Processing file 1m9xg6by-y_RBUalmzZphhzTkRkL6t9Ex m6i-1lz5-v9ud.csv
Processing file 1Fa3hmoBIGyBjStgQT5RElABTE_yX-TBx m6i-1pux-9azw.csv
Processing file 1m3Q83tIY4JRWGnE40GyZNLfdNiKZ3y3Q m6i-1sj5-t1tn.csv
Processing file 1NTAy9d4UtrDl7LY4uGorYUO7cVpLEJoY m6i-1tht-4u4x.csv
Processing file 1A6oPF-TB6XOtZ00iedx_vUkrUeRXfNfp m6i-1un0-8mu9.csv
Processing file 1eNyn287c3CGqzDPvwHtQsX9OOYBXI7Tp m6i-1vl3-nybo.csv
Processing file 1Rn71Xl3b7zUyZzCWkkH2co4aDHqov3ZX m6i-1wpl-dukq.csv
Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1J_bRWMhNyO-7aD7pdszS9JAn5FGvoCKS
To: /content/data/DocumentsInfo/m6g-oqrn-vuxf.csv
100

In [3]:
# -----------------------------
# Step 2: Parse CSVs and download only .htm files (saved as <DocumentID>.htm)
# -----------------------------
# Separator between URLs inside one CSV cell: any run of commas, whitespace or semicolons.
SPLIT_RE = re.compile(r'[,\s;]+', re.IGNORECASE)

def extract_htm_urls(cell_value: str):
    """Return the .htm/.html URLs found in a single CSV cell.

    The cell is split on commas, whitespace and semicolons. Each piece has
    trailing ``) . , ; ' "`` characters removed and is kept only when it ends in
    ``.htm`` / ``.html`` (case-insensitive), optionally followed by a ``?query``
    or ``#fragment``. Missing values (anything ``pd.isna`` accepts) yield ``[]``.
    Input order is preserved and duplicates are not removed.
    """
    if pd.isna(cell_value):
        return []

    htm_urls = []
    for piece in SPLIT_RE.split(str(cell_value).strip()):
        if not piece.strip():
            continue  # empty fragments produced by leading/trailing separators
        candidate = piece.strip().rstrip(').,;\'"')
        if re.search(r'\.(htm|html)(\?|#|$)', candidate, re.IGNORECASE):
            htm_urls.append(candidate)
    return htm_urls

def pick_content_column(df: pd.DataFrame):
    """Return the name of the column that lists content-file URLs, or None.

    An exact ``"Content Files"`` column wins. Otherwise the first column whose
    lower-cased name contains both ``"content"`` and ``"file"`` is used.
    """
    if "Content Files" in df.columns:
        return "Content Files"
    return next(
        (
            name
            for name in df.columns
            if "content" in name.lower() and "file" in name.lower()
        ),
        None,
    )

# One shared requests session for all downloads in this cell; fetch_htm() issues
# its GET requests through it. A browser-like User-Agent header is set on every request.
SESSION = requests.Session()
SESSION.headers.update({
    "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0 Safari/537.36"
})

def fetch_htm(url: str, dest_dir: Path) -> Path | None:
    try:
        m = re.search(r'/([A-Z]+-\d{4}-\d+-\d+)/content', url)
        if not m:
            print(f"[warn] Could not parse DocumentID from {url}")
            return None
        doc_id = m.group(1)
        out_path = Path(dest_dir) / f"{doc_id}.htm"
        if out_path.exists() and out_path.stat().st_size > 0:
            return out_path
        r = SESSION.get(url, timeout=30, allow_redirects=True)
        if r.status_code != 200:
            print(f"[warn] {r.status_code} for {url}")
            return None
        out_path.write_bytes(r.content)
        return out_path
    except Exception as e:
        print(f"[warn] Download failed {url}: {e}")
        return None

# Gather every .htm/.html URL listed in the content-files column of each CSV.
all_htm_urls = []
for csv_path in csv_paths:
    # dtype=str with keep_default_na=False reads every cell as a plain string,
    # so empty cells arrive as "" instead of NaN.
    df = pd.read_csv(csv_path, dtype=str, keep_default_na=False)
    col = pick_content_column(df)
    if not col:
        print(f"[skip] No 'Content Files' column in {os.path.basename(csv_path)}")
        continue
    for cell in df[col]:
        all_htm_urls.extend(extract_htm_urls(cell))

# Order-preserving de-duplication: set.add() returns None, so the `or` expression
# is falsy the first time a URL is seen (and records it as a side effect).
seen = set()
unique_htm_urls = [u for u in all_htm_urls if not (u in seen or seen.add(u))]
print(f"Discovered {len(unique_htm_urls)} unique .htm URLs.")

# Download each URL; fetch_htm() returns None on failure, and those are skipped.
saved_paths = []
for url in tqdm(unique_htm_urls, desc="Downloading .htm files"):
    p = fetch_htm(url, Path(HTML_DIR))
    if p:
        saved_paths.append(p)

print(f"\nSaved {len(saved_paths)} HTM files to: {Path(HTML_DIR).resolve()}")
# Print up to five saved file names as a quick sanity check.
if saved_paths[:5]:
    print("Examples:")
    for p in saved_paths[:5]:
        print(" -", Path(p).name)

# -----------------------------
# Step 3: Parse .htm -> regusum_dataset.json (skip empty/missing gold summary)
# -----------------------------
# Sections whose text is shorter than this many characters are folded into a neighbour.
MIN_SECTION_LENGTH = 100

def segment_content(main_content):
    """Split a document body into a list of ``{"header", "text"}`` sections.

    Stage 1 (split): the text is cut at every newline that is followed by a
    Roman-numeral heading ``I. `` .. ``X. `` or by ``SUMMARY:``, ``DATES:`` or
    ``FOR FURTHER INFORMATION CONTACT:``. For each chunk the first line becomes
    the header and the remainder the text. A chunk whose first line is 200+
    characters long, or that has no text after its first line, is appended to
    the previous section (or becomes an "Introduction" section if it is the
    first). If no split point exists the whole input is returned as a single
    "Main Content" section.

    Stage 2 (merge): walking from the last section to the first, any section
    with fewer than MIN_SECTION_LENGTH characters of text (other than the last
    one) is folded, header included, into the end of the nearest earlier
    section that is kept. Short sections at the very start, with nothing before
    them, are prepended to the first kept section instead.

    Returns ``[]`` for empty input.
    """
    if not main_content:
        return []

    boundary = r'\n(?=I\. |II\. |III\. |IV\. |V\. |VI\. |VII\. |VIII\. |IX\. |X\. |SUMMARY:|DATES:|FOR FURTHER INFORMATION CONTACT:)'
    chunks = []
    for raw in re.split(boundary, main_content):
        if raw and raw.strip():
            chunks.append(raw.strip())

    # --- Stage 1: turn chunks into header/text sections ---
    sections = []
    if len(chunks) <= 1:
        sections.append({"header": "Main Content", "text": main_content})
    else:
        for chunk in chunks:
            first_line, _, remainder = chunk.partition('\n')
            header = first_line.strip()
            text = remainder.strip()
            if len(header) < 200 and text:
                sections.append({"header": header, "text": text})
            elif sections:
                # Not a usable heading: treat the chunk as a continuation.
                sections[-1]['text'] += '\n\n' + chunk
            else:
                sections.append({"header": "Introduction", "text": chunk})

    # --- Stage 2: fold short sections into neighbours (built back-to-front) ---
    kept = []
    pending = ""
    for sec in sections[::-1]:
        too_short = len(sec['text']) < MIN_SECTION_LENGTH
        if too_short and kept:
            pending = sec['header'] + '\n\n' + sec['text'] + '\n\n' + pending
            continue
        if pending:
            sec['text'] += '\n\n' + pending
        kept.append(sec)
        pending = ""

    # Leftover short sections from the very start go in front of the earliest
    # kept section (kept is in reverse order, so that is kept[-1]).
    if pending and kept:
        kept[-1]['text'] = pending + kept[-1]['text']

    kept.reverse()
    return kept

def extract_text_from_html(file_path):
    """Read an HTML file and return ``(text, soup)``.

    The text comes from the first ``<pre>`` element when one exists, otherwise
    from the whole document; text nodes are joined with a blank line. The file
    is decoded as UTF-8 with undecodable bytes dropped. On any error a message
    is printed and ``(None, None)`` is returned.
    """
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as handle:
            soup = BeautifulSoup(handle, "html.parser")
        pre_block = soup.find("pre")
        if pre_block:
            text = pre_block.get_text("\n\n")
        else:
            text = soup.get_text("\n\n")
        return text, soup
    except Exception as e:
        print(f"Error reading {os.path.basename(file_path)}: {e}")
        return None, None

def extract_title(soup):
    """Return the stripped ``<title>`` string, or "Title Not Found" when unavailable."""
    if not soup:
        return "Title Not Found"
    title_tag = soup.title
    if not title_tag or not title_tag.string:
        return "Title Not Found"
    return title_tag.string.strip()

def parse_regulation_file(file_path):
    """Parse one saved .htm file into a dataset record, or return None.

    The document ID is the file name without its extension. A record is built
    only when a non-empty summary can be extracted; otherwise (or on any
    exception, which is printed) None is returned.

    Record keys, in order: "ID", "gold_summary", "Agency ID", "Title",
    "Topics", "original_content", "Sections".
    """
    doc_id = os.path.splitext(os.path.basename(file_path))[0]
    try:
        full_text, soup = extract_text_from_html(file_path)
        if not full_text:
            return None

        # Summary = text after "SUMMARY:" up to whichever of the two markers
        # comes first.
        # NOTE(review): any other labelled part sitting between "SUMMARY:" and
        # that first marker ends up inside the summary - confirm this is intended.
        summary_hit = re.search(
            r"SUMMARY:(.*?)(?=SUPPLEMENTARY INFORMATION:|FOR FURTHER INFORMATION CONTACT:)",
            full_text, re.DOTALL | re.IGNORECASE
        )
        gold_summary = summary_hit.group(1).strip() if summary_hit else ""
        if not gold_summary:
            return None

        # Body = everything after the first "SUPPLEMENTARY INFORMATION:" marker;
        # without that marker the whole text is used.
        supp_hit = re.search(r"SUPPLEMENTARY INFORMATION:", full_text, re.IGNORECASE)
        if supp_hit:
            main_content = full_text[supp_hit.end():].strip()
        else:
            main_content = full_text

        sections = segment_content(main_content)
        if not sections:
            sections = [{"header": "Main Content", "text": main_content}]

        title = extract_title(soup)

        # Agency = leading hyphen-separated component of the document ID.
        if '-' in doc_id:
            agency_id = doc_id.split('-')[0]
        else:
            agency_id = "UNKNOWN"

        topics_hit = re.search(
            r"List of Subjects in.*?PART \d+(.*?)(?:Correction of Publication|Accordingly|PART)",
            full_text, re.DOTALL | re.IGNORECASE
        )
        if topics_hit:
            # Whitespace runs (newlines included) collapse to single spaces first,
            # so the trailing replace() finds no newline left to convert.
            topics = re.sub(r'\s+', ' ', topics_hit.group(1).strip()).replace('\n', ',')
        else:
            topics = "Topics Not Found"

        record = {
            "ID": doc_id,
            "gold_summary": gold_summary,
            "Agency ID": agency_id,
            "Title": title,
            "Topics": topics,
            "original_content": main_content,
            "Sections": sections
        }
        return record
    except Exception as e:
        print(f"Could not process file {doc_id}. Error: {e}")
        return None

# Build dataset
# Parse every .htm/.html file in HTML_DIR (sorted by name for a stable record order).
html_files = sorted([f for f in os.listdir(HTML_DIR) if f.lower().endswith((".htm", ".html"))])
print(f"Found {len(html_files)} HTM/HTML files to process.")
# `skipped` counts files for which parse_regulation_file() returned None
# (or a record without a gold summary).
records, skipped = [], 0
for fn in html_files:
    rec = parse_regulation_file(os.path.join(HTML_DIR, fn))
    if rec and rec.get("gold_summary"):
        records.append(rec)
    else:
        skipped += 1

# ensure_ascii=False keeps non-ASCII characters readable in the JSON file.
with open(DATASET_PATH, "w", encoding="utf-8") as f:
    json.dump(records, f, indent=2, ensure_ascii=False)
print(f"[ok] Processed {len(records)} documents (skipped={skipped}).")
print(f"[ok] Dataset saved -> {DATASET_PATH}")



Discovered 379 unique .htm URLs.


Downloading .htm files:  15%|█▌        | 58/379 [00:02<00:09, 35.57it/s]

[warn] Could not parse DocumentID from https://downloads.regulations.gov/IRS_FRDOC_0001-2179/content.htm
[warn] Could not parse DocumentID from https://downloads.regulations.gov/IRS_FRDOC_0001-2208/content.htm
[warn] Could not parse DocumentID from https://downloads.regulations.gov/IRS_FRDOC_0001-2209/content.htm
[warn] Could not parse DocumentID from https://downloads.regulations.gov/IRS_FRDOC_0001-2211/content.htm
[warn] Could not parse DocumentID from https://downloads.regulations.gov/IRS_FRDOC_0001-2248/content.htm
[warn] Could not parse DocumentID from https://downloads.regulations.gov/IRS_FRDOC_0001-2267/content.htm


Downloading .htm files:  36%|███▋      | 138/379 [00:05<00:08, 27.03it/s]

[warn] Could not parse DocumentID from https://downloads.regulations.gov/IRS_FRDOC_0001-1901/content.htm
[warn] Could not parse DocumentID from https://downloads.regulations.gov/IRS_FRDOC_0001-1903/content.htm


Downloading .htm files:  43%|████▎     | 162/379 [00:06<00:06, 32.12it/s]

[warn] Could not parse DocumentID from https://downloads.regulations.gov/IRS_FRDOC_0001-2092/content.htm
[warn] Could not parse DocumentID from https://downloads.regulations.gov/IRS_FRDOC_0001-2097/content.htm
[warn] Could not parse DocumentID from https://downloads.regulations.gov/IRS_FRDOC_0001-2112/content.htm


Downloading .htm files:  62%|██████▏   | 234/379 [00:09<00:04, 34.60it/s]

[warn] Could not parse DocumentID from https://downloads.regulations.gov/IRS_FRDOC_0001-2035/content.htm
[warn] Could not parse DocumentID from https://downloads.regulations.gov/IRS_FRDOC_0001-2037/content.htm
[warn] Could not parse DocumentID from https://downloads.regulations.gov/IRS_FRDOC_0001-2038/content.htm
[warn] Could not parse DocumentID from https://downloads.regulations.gov/IRS_FRDOC_0001-2039/content.htm
[warn] Could not parse DocumentID from https://downloads.regulations.gov/IRS_FRDOC_0001-2040/content.htm
[warn] Could not parse DocumentID from https://downloads.regulations.gov/IRS_FRDOC_0001-2041/content.htm
[warn] Could not parse DocumentID from https://downloads.regulations.gov/IRS_FRDOC_0001-2048/content.htm


Downloading .htm files: 100%|██████████| 379/379 [00:17<00:00, 21.49it/s]



Saved 361 HTM files to: /content/data/DocumentsHTML
Examples:
 - IRS-2014-0030-0004.htm
 - IRS-2016-0007-0010.htm
 - IRS-2016-0010-0035.htm
 - IRS-2016-0044-0011.htm
 - IRS-2018-0027-0010.htm
Found 361 HTM/HTML files to process.
[ok] Processed 345 documents (skipped=16).
[ok] Dataset saved -> /content/data/regusum_dataset.json


In [4]:
# ===== Step 4: Inspect the built dataset (IDs, Titles, Section headers) =====
import json, os
from pathlib import Path
import pandas as pd

# --- Options ---
USE_DRIVE_DATASET = False  # set True to use your Drive path below
# Left empty here: fill this in before enabling USE_DRIVE_DATASET, otherwise
# DATASET_PATH below resolves to "".
DRIVE_DATASET_PATH = ""

# Default to the dataset built in Step 3:
LOCAL_DATASET_PATH = "/content/data/regusum_dataset.json"

# Where to write a compact overview CSV (optional)
OVERVIEW_CSV_PATH = "/content/data/regusum_dataset_overview.csv"

# --- Mount Drive only if needed ---
if USE_DRIVE_DATASET:
    from google.colab import drive
    drive.mount('/content/drive')

# --- Resolve path ---
# Rebinds the notebook-level DATASET_PATH; cells that run after this one see this value.
DATASET_PATH = DRIVE_DATASET_PATH if USE_DRIVE_DATASET else LOCAL_DATASET_PATH
print(f"Using dataset: {DATASET_PATH}")

def inspect_all_documents(dataset_path: str, write_overview_csv: bool = True):
    """
    Print ID, title and section headers for every document in a dataset file.

    After the per-document listing, a summary block reports the number of
    documents, how many have more than one section, and the mean number of
    sections per document (rounded to 2 decimals). When `write_overview_csv`
    is true, one row per document (ID, Title, NumSections, SectionHeaders
    joined with " | ") is written to the module-level OVERVIEW_CSV_PATH.

    A missing file, an empty dataset, or any exception is reported with a
    printed message; the function always returns None.
    """
    try:
        if not os.path.exists(dataset_path):
            print(f"Error: File not found at '{dataset_path}'")
            return

        with open(dataset_path, "r", encoding="utf-8") as handle:
            dataset = json.load(handle)

        if not dataset:
            print("The dataset is empty. Nothing to display.")
            return

        print("--- Document IDs and Section Headers ---")

        overview_rows = []     # one dict per document, for the optional CSV
        section_counts = []    # number of sections per document, in order

        for document in dataset:
            doc_id = document.get("ID", "ID Not Found")
            title = document.get("Title", "Title Not Found")

            # Anything that is not a list is treated as "no sections".
            sections = document.get("Sections", [])
            if not isinstance(sections, list):
                sections = []
            section_counts.append(len(sections))

            print(f"\nID: {doc_id}")
            print(f"Title: {title}")
            print(f"Sections Found: {len(sections)}")

            headers = []
            for section in sections:
                header = section.get("header", "Header Not Found")
                headers.append(header)
                print(f"  - Header: {header}")
            if not sections:
                print("  - No sections found for this document.")

            overview_rows.append({
                "ID": doc_id,
                "Title": title,
                "NumSections": len(sections),
                "SectionHeaders": " | ".join(headers)
            })

        # --- Final Summary ---
        total_docs = len(section_counts)
        multi_section_docs = sum(1 for count in section_counts if count > 1)

        print("\n" + "="*40)
        print("--- Dataset Inspection Summary ---")
        print(f"Total Documents Processed: {total_docs}")
        print(f"Documents with Multiple Sections: {multi_section_docs}")
        if total_docs > 0:
            mean_sections = round(sum(section_counts) / total_docs, 2)
            print(f"Average Sections per Document: {mean_sections}")
        print("="*40)

        # Optional CSV
        if write_overview_csv:
            overview = pd.DataFrame(overview_rows)
            Path(os.path.dirname(OVERVIEW_CSV_PATH)).mkdir(parents=True, exist_ok=True)
            overview.to_csv(OVERVIEW_CSV_PATH, index=False)
            print(f"\n[ok] Wrote overview CSV -> {OVERVIEW_CSV_PATH}")

    except Exception as e:
        print(f"An error occurred: {e}")

# Run the inspection
# Prints one entry per document in the dataset and writes the overview CSV
# to OVERVIEW_CSV_PATH.
inspect_all_documents(DATASET_PATH, write_overview_csv=True)


Using dataset: /content/data/regusum_dataset.json
--- Document IDs and Section Headers ---

ID: IRS-2008-0041-0003
Title: Federal Register, Volume 85 Issue 113 (Thursday, June 11, 2020)
Sections Found: 1
  - Header: Main Content

ID: IRS-2008-0053-0009
Title: Federal Register, Volume 88 Issue 146 (Tuesday, August 1, 2023)
Sections Found: 1
  - Header: Main Content

ID: IRS-2008-0092-0007
Title: Federal Register, Volume 85 Issue 162 (Thursday, August 20, 2020)
Sections Found: 1
  - Header: Main Content

ID: IRS-2011-0050-0006
Title: Federal Register, Volume 85 Issue 46 (Monday, March 9, 2020)
Sections Found: 1
  - Header: Main Content

ID: IRS-2014-0001-0014
Title: Federal Register, Volume 85 Issue 88 (Wednesday, May 6, 2020)
Sections Found: 1
  - Header: Main Content

ID: IRS-2014-0030-0004
Title: Federal Register, Volume 89 Issue 231 (Monday, December 2, 2024)
Sections Found: 14
  - Header: Authority
  - Header: I. Overlapping Economic Risk of Loss
  - Header: II. Tiered Partnerships


In [8]:
# === ReguSum overall stats + LDS metrics (paper-faithful; no agency grouping) ===
import os, json, math, string
from pathlib import Path
from collections import Counter, defaultdict
from itertools import combinations

import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

# -----------------------------
# Paths (use existing globals if present)
# -----------------------------
# globals().get(...) keeps a value already set by an earlier cell and falls back
# to the literal default only when this cell runs in a kernel that lacks it.
DATASET_PATH = globals().get("DATASET_PATH", "/content/data/regusum_dataset.json")
STATS_DIR    = globals().get("STATS_DIR",    "/content/stats")
Path(STATS_DIR).mkdir(parents=True, exist_ok=True)

# -----------------------------
# Load dataset
# -----------------------------
# `dataset` is iterated below as a sequence of per-document dicts.
with open(DATASET_PATH, "r", encoding="utf-8") as f:
    dataset = json.load(f)

# -----------------------------
# Tokenization (consistent across all metrics)
# - Lowercase
# - Keep tokens that contain at least one alphanumeric char
# - Sentence tokenizer: NLTK's punkt
# -----------------------------
# STOPWORDS and PUNCT are used as filters when selecting salient unigrams
# (build_summary_idf / topk_tfidf_unigrams); norm_tokenize itself does not
# remove stopwords.
STOPWORDS = set(stopwords.words('english'))
# Single punctuation characters from string.punctuation.
PUNCT     = set(string.punctuation)

def norm_tokenize(text: str):
    """Lower-case and word-tokenize `text`, keeping only tokens with an alphanumeric character.

    Returns [] for empty or missing text.
    """
    if not text:
        return []
    kept = []
    for token in word_tokenize(text.lower()):
        # Drops pure-punctuation tokens such as "," or "--".
        if any(ch.isalnum() for ch in token):
            kept.append(token)
    return kept

def sent_split(text: str):
    """Split `text` into sentences with NLTK's sent_tokenize; [] for empty or missing text."""
    if not text:
        return []
    return sent_tokenize(text)

# -----------------------------
# Precompute tokens/sentences for each doc
# -----------------------------
# One row per document: the token lists are stored whole (the metrics below need
# them), while sentences are reduced to counts. The raw texts are kept too;
# redundancy() re-splits the summary text itself.
rows = []
for doc in dataset:
    doc_text = doc.get("original_content", "")
    sum_text = doc.get("gold_summary", "")
    doc_toks = norm_tokenize(doc_text)
    sum_toks = norm_tokenize(sum_text)
    doc_sents = sent_split(doc_text)
    sum_sents = sent_split(sum_text)

    rows.append({
        "ID": doc.get("ID",""),
        "DocTokens": doc_toks,
        "SumTokens": sum_toks,
        "DocSents": len(doc_sents),
        "SumSents": len(sum_sents),
        "NumSections": len(doc.get("Sections", [])),
        "DocText": doc_text,
        "SumText": sum_text,
    })
# Rebinds the notebook-level name `df` to this per-document frame.
df = pd.DataFrame(rows)

# Overall (Total) size/length stats (macro over docs)
# Each value is an unweighted mean over documents; token counts are the lengths
# of the stored token lists.
total_stats = {
    "n_docs": int(df.shape[0]),
    "avg_doc_tokens": float(df["DocTokens"].map(len).mean()),
    "avg_doc_sents":  float(df["DocSents"].mean()),
    "avg_sum_tokens": float(df["SumTokens"].map(len).mean()),
    "avg_sum_sents":  float(df["SumSents"].mean()),
    "avg_sections":   float(df["NumSections"].mean()),
}

# -----------------------------
# Coverage & Density (NEWSROOM / Grusky et al., 2018)
# Greedy, contiguous, maximal shared fragments from S that appear in D.
# -----------------------------
def build_ngram_sets(tokens, max_n=30):
    """Return ``{n: set of n-gram tuples}`` for every n in 1..max_n.

    Every order from 1 to max_n is present as a key; orders longer than the
    token sequence map to an empty set.
    """
    total = len(tokens)
    by_order = {}
    for order in range(1, max_n + 1):
        if order <= total:
            by_order[order] = {
                tuple(tokens[start:start + order])
                for start in range(total - order + 1)
            }
        else:
            by_order[order] = set()
    return by_order

def greedy_fragments(doc_tokens, sum_tokens, max_n=30):
    """Greedily match summary spans against the document; return fragment lengths.

    Scanning the summary left to right, at each position the longest n-gram
    (n <= max_n) that also occurs somewhere in the document is taken as a
    fragment and the scan jumps past it; if not even a unigram matches, the
    scan advances by one token. Because n is capped at max_n, a shared span
    longer than max_n is reported as several consecutive fragments.

    Returns [] when either token list is empty.
    """
    if not (doc_tokens and sum_tokens):
        return []

    doc_ngrams = build_ngram_sets(doc_tokens, max_n=max_n)
    total = len(sum_tokens)
    lengths = []   # only lengths are needed by the coverage/density metrics
    cursor = 0
    while cursor < total:
        longest_possible = min(max_n, total - cursor)
        # Longest first: the first order whose n-gram is in the document wins.
        matched = next(
            (
                n
                for n in range(longest_possible, 0, -1)
                if tuple(sum_tokens[cursor:cursor + n]) in doc_ngrams[n]
            ),
            0,
        )
        if matched:
            lengths.append(matched)
        cursor += matched or 1
    return lengths

def coverage_and_density(doc_tokens, sum_tokens, max_n=30):
    """Return ``(coverage, density)`` of a summary with respect to its document.

    With fragment lengths f from greedy_fragments() and |S| summary tokens:
    coverage = sum(f) / |S| and density = sum(f^2) / |S|.
    An empty summary gives (nan, nan); no shared fragment gives (0.0, 0.0).
    """
    if not sum_tokens:
        nan = float('nan')
        return nan, float('nan')

    frag_lengths = greedy_fragments(doc_tokens, sum_tokens, max_n=max_n)
    if not frag_lengths:
        return 0.0, 0.0

    summary_len = len(sum_tokens)
    coverage = sum(frag_lengths) / summary_len
    density = sum(length * length for length in frag_lengths) / summary_len
    return coverage, density

# -----------------------------
# Compression (paper: per-doc ratio, then macro-average)
# -----------------------------
def comp_tokens(doc_tok_len, sum_tok_len):
    """Token-level compression ratio (document tokens / summary tokens); nan if the summary has none."""
    if sum_tok_len > 0:
        return doc_tok_len / sum_tok_len
    return float('nan')

def comp_sents(doc_sent_len, sum_sent_len):
    """Sentence-level compression ratio (document sentences / summary sentences); nan if the summary has none."""
    if sum_sent_len > 0:
        return doc_sent_len / sum_sent_len
    return float('nan')

# -----------------------------
# Redundancy (avg ROUGE-L F1 over all distinct pairs of summary sentences)
# ROUGE-L via classic LCS over tokens.
# -----------------------------
def lcs_len(a, b):
    """Length of the longest common subsequence of sequences `a` and `b`.

    Standard dynamic programme kept to two rows: `previous` holds the LCS
    lengths for the prefix of `a` processed so far against every prefix of `b`.
    """
    previous = [0] * (len(b) + 1)
    for item_a in a:
        current = [0]
        for col, item_b in enumerate(b, start=1):
            if item_a == item_b:
                current.append(previous[col - 1] + 1)
            else:
                current.append(max(previous[col], current[col - 1]))
        previous = current
    return previous[len(b)]

def rouge_l_f1(s1, s2):
    """ROUGE-L F1 between two strings, computed on norm_tokenize() tokens.

    The two ratios are LCS / len(tokens of s1) and LCS / len(tokens of s2);
    their harmonic mean is returned. Returns 0.0 when either string has no
    tokens or when there is no common subsequence.
    """
    tokens_a, tokens_b = norm_tokenize(s1), norm_tokenize(s2)
    if not (tokens_a and tokens_b):
        return 0.0

    overlap = lcs_len(tokens_a, tokens_b)
    precision = overlap / len(tokens_a)
    recall = overlap / len(tokens_b)
    if (precision + recall) > 0:
        return (2 * precision * recall) / (precision + recall)
    return 0.0

def redundancy(summary_text: str):
    """Mean ROUGE-L F1 over all distinct pairs of sentences in the summary.

    Returns 0.0 when the summary has fewer than two non-blank sentences.
    """
    sentences = [sent for sent in sent_split(summary_text) if sent.strip()]
    if len(sentences) < 2:
        return 0.0
    # combinations() yields each unordered pair once, in (i < j) order.
    pair_scores = [rouge_l_f1(first, second) for first, second in combinations(sentences, 2)]
    if not pair_scores:
        return 0.0
    return float(np.mean(pair_scores))

# -----------------------------
# Uniformity (normalized entropy over deciles of salient unigrams)
# - Salient unigrams = top-20 TF-IDF terms from the summary (stopwords removed)
# - Count ALL occurrences of those unigrams across the document
# - Bin positions into 10 equal-length deciles; compute H / log2(10)
# -----------------------------
def build_summary_idf(all_sum_token_lists):
    """Build a smoothed IDF table over a collection of tokenized summaries.

    Document frequency counts each term once per summary, ignoring stopwords,
    punctuation tokens and all-digit tokens. The weight is
    ``log((N + 1) / (df + 1)) + 1`` with N the number of summaries.

    Returns ``(idf_map, N)``.
    """
    n_summaries = len(all_sum_token_lists)
    doc_freq = Counter()
    for tokens in all_sum_token_lists:
        vocabulary = set(
            tok for tok in tokens
            if tok not in STOPWORDS and tok not in PUNCT and not tok.isdigit()
        )
        doc_freq.update(vocabulary)

    # idf with +1 smooth
    idf = {}
    for word, freq in doc_freq.items():
        idf[word] = math.log((n_summaries + 1) / (freq + 1)) + 1.0
    return idf, n_summaries

def topk_tfidf_unigrams(summary_tokens, idf_map, k=20):
    """Return up to `k` summary unigrams ranked by tf * idf, highest first.

    Stopwords, punctuation tokens and all-digit tokens are excluded. Terms
    missing from `idf_map` get an idf of 1.0. Ties keep first-occurrence order.
    Returns [] when nothing survives the filter.
    """
    content_tokens = [
        tok for tok in summary_tokens
        if tok not in STOPWORDS and tok not in PUNCT and not tok.isdigit()
    ]
    if not content_tokens:
        return []

    term_freq = Counter(content_tokens)
    weights = {}
    for word in term_freq:
        weights[word] = term_freq[word] * idf_map.get(word, 1.0)

    ranked = sorted(weights, key=lambda word: weights[word], reverse=True)
    return ranked[:k]

def uniformity(doc_tokens, summary_tokens, idf_map, top_k=20):
    """Normalized entropy of where the summary's salient unigrams occur in the document.

    The salient unigrams are the top-`top_k` terms from topk_tfidf_unigrams().
    Every occurrence of one of them in the document is assigned to one of ten
    position bins, bin = min(9, int(pos / max(1, len - 1) * 10)), and the base-2
    entropy of the bin distribution is divided by log2(10).

    Returns nan when the document is empty, when no salient unigram exists, or
    when none of them occurs in the document.
    """
    if not doc_tokens:
        return float('nan')
    salient = topk_tfidf_unigrams(summary_tokens, idf_map, k=top_k)
    if not salient:
        return float('nan')

    # positions of ALL occurrences of the salient unigrams
    salient_lookup = frozenset(salient)
    hits = [idx for idx, tok in enumerate(doc_tokens) if tok in salient_lookup]
    if not hits:
        return float('nan')

    span = max(1, len(doc_tokens) - 1)
    bins = [0] * 10
    for pos in hits:
        # The final token would map to bin 10; min() clamps it into the last bin.
        bins[min(9, int((pos / span) * 10))] += 1

    total = sum(bins)
    if total == 0:
        return float('nan')
    probs = [count / total for count in bins if count > 0]
    H = -sum(p * math.log(p, 2) for p in probs)
    return H / math.log(10, 2)

# -----------------------------
# Compute per-document metrics
# -----------------------------
# IDF map built once across all summaries
idf_map, N_summaries = build_summary_idf(df["SumTokens"].tolist())

MAX_N = 30  # closer to "longest fragment" ideal; set 15–20 for speed if needed

# One metrics row per document, in the same order as `df`.
per_doc = []
for _, r in df.iterrows():
    doc_toks = r["DocTokens"]; sum_toks = r["SumTokens"]
    # Coverage & Density
    cov, den = coverage_and_density(doc_toks, sum_toks, max_n=MAX_N)
    # Compression
    ctok = comp_tokens(len(doc_toks), len(sum_toks))
    csent = comp_sents(r["DocSents"], r["SumSents"])
    # Redundancy
    # (computed from the raw summary text; redundancy() does its own sentence split)
    red = redundancy(r["SumText"])
    # Uniformity
    uni = uniformity(doc_toks, sum_toks, idf_map, top_k=20)

    per_doc.append({
        "ID": r["ID"],
        "CompTokens": ctok,
        "CompSents":  csent,
        "Coverage":   cov,
        "Density":    den,
        "Redundancy": red,
        "Uniformity": uni,
    })

# Persist the unrounded per-document values.
metrics_df = pd.DataFrame(per_doc)
metrics_df.to_csv(f"{STATS_DIR}/regusum_per_doc_metrics.csv", index=False)

# -----------------------------
# Aggregate (macro over documents) and save
# -----------------------------
# Unweighted mean of each per-document metric. pandas' mean() skips NaN by
# default, so documents for which a metric is nan do not contribute to it.
total_lds = {
    "CompTokens": float(metrics_df["CompTokens"].mean()),
    "CompSents":  float(metrics_df["CompSents"].mean()),
    "Coverage":   float(metrics_df["Coverage"].mean()),
    "Density":    float(metrics_df["Density"].mean()),
    "Redundancy": float(metrics_df["Redundancy"].mean()),
    "Uniformity": float(metrics_df["Uniformity"].mean()),
}

# Single-row frame combining the size/length stats with the metrics above.
total_out = {**total_stats, **total_lds}
total_df = pd.DataFrame([total_out])

# Rounded display copy
# total_df keeps full precision; `disp` is what gets saved and printed.
disp = total_df.copy()
# Every avg_* column is shown to one decimal. avg_sections is included so it is
# no longer the only average left at full float precision (e.g. 7.10725).
for c in ["avg_doc_tokens", "avg_doc_sents", "avg_sum_tokens", "avg_sum_sents", "avg_sections"]:
    disp[c] = disp[c].round(1)
# Ratio/score metrics are shown to three decimals.
for c in ["CompTokens","CompSents","Coverage","Density","Redundancy","Uniformity"]:
    disp[c] = disp[c].round(3)

csv_total = f"{STATS_DIR}/regusum_totals_with_lds_metrics.csv"
disp.to_csv(csv_total, index=False)
print("[ok] Saved overall metrics ->", csv_total)

# Optional quick peek:
print(disp.to_markdown(index=False))


[ok] Saved overall metrics -> /content/stats/regusum_totals_with_lds_metrics.csv
|   n_docs |   avg_doc_tokens |   avg_doc_sents |   avg_sum_tokens |   avg_sum_sents |   avg_sections |   CompTokens |   CompSents |   Coverage |   Density |   Redundancy |   Uniformity |
|---------:|-----------------:|----------------:|-----------------:|----------------:|---------------:|-------------:|------------:|-----------:|----------:|-------------:|-------------:|
|      345 |          31679.7 |          1116.6 |              129 |             5.9 |        7.10725 |      212.473 |       183.1 |       0.85 |     4.404 |        0.106 |        0.883 |


In [None]:
# Disabled helper: the whole cell body is wrapped in a triple-quoted string, so
# none of it executes. Remove the surrounding quotes to zip /content/data and
# trigger a browser download via google.colab.files.
'''import os
from google.colab import files

# Replace with the actual path to your folder
folder_path = "/content/data"
zip_path = "/tmp/data.zip"

# Zip the folder
if os.path.exists(folder_path):
    !zip -r "$zip_path" "$folder_path"
    print(f"Zipped {folder_path} to {zip_path}")

    # Download the zipped file
    try:
        files.download(zip_path)
        print(f"Downloaded {zip_path}")
    except Exception as e:
        print(f"An error occurred during download: {e}")
else:
    print(f"Error: Folder not found at {folder_path}")'''