# Smart Document-Level Split

**Goal:** Split 30 PDFs into 20 train / 5 val / 5 test while preserving class balance.  
**Constraint:** No model training — split only.

---
## Step 1 — Load Data & Compute Per-PDF Profiles

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Load the label table and derive the owning PDF for every image row.
df = pd.read_csv("mini_labels.csv")
# Drop the trailing "_page_<n>.jpg" so all pages of one PDF share a prefix.
page_suffix = r"_page_\d+\.jpg$"
df["pdf_prefix"] = df["image_name"].str.replace(page_suffix, "", regex=True)

classes = sorted(df["label"].unique())
all_pdfs = sorted(df["pdf_prefix"].unique())
n_pdfs = len(all_pdfs)

# Dataset-wide class proportions, ordered like `classes`.
label_share = df["label"].value_counts(normalize=True)
global_dist = label_share.reindex(classes).values

# PDF x class count table: one column per class (zero-filled), plus a row total.
ct = (
    df.groupby(["pdf_prefix", "label"])
    .size()
    .unstack(fill_value=0)
    .reindex(columns=classes, fill_value=0)
)
ct["total"] = ct.sum(axis=1)

n_images = len(df)
print(f"Loaded {n_images} images | {n_pdfs} PDFs | {len(classes)} classes")
print(f"Target split: 20 train / 5 val / 5 test")
print(f"Target image ratio: ~{n_images*20//30} / ~{n_images*5//30} / ~{n_images*5//30}")

Loaded 1179 images | 30 PDFs | 5 classes
Target split: 20 train / 5 val / 5 test
Target image ratio: ~786 / ~196 / ~196


---
## Step 2 — Greedy Balanced Split Algorithm

In [2]:
def compute_split_deviation(pdf_list, ct_df, classes, global_dist):
    """Largest per-class gap between a split's class proportions and the global ones.

    The split's proportions come from summing the `classes` columns of `ct_df`
    over the rows in `pdf_list`. Returns infinity when `pdf_list` is empty or
    the selected rows sum to zero.
    """
    undefined = float("inf")
    if not pdf_list:
        return undefined
    per_class = ct_df.loc[pdf_list, classes].sum()
    n_pages = per_class.sum()
    if n_pages == 0:
        return undefined
    observed = per_class.to_numpy() / n_pages
    return np.abs(observed - global_dist).max()


def compute_split_cost(pdf_list, ct_df, classes, global_dist):
    """Squared-error distance between a split's class proportions and the global ones.

    Lower is better. Returns infinity when `pdf_list` is empty or the selected
    rows of `ct_df` sum to zero over `classes`.
    """
    undefined = float("inf")
    if not pdf_list:
        return undefined
    per_class = ct_df.loc[pdf_list, classes].sum()
    n_pages = per_class.sum()
    if n_pages == 0:
        return undefined
    residual = per_class.to_numpy() / n_pages - global_dist
    return np.sum(residual * residual)


def greedy_split(all_pdfs, ct_df, classes, global_dist,
                 n_train=20, n_val=5, n_test=5, seed=42):
    """
    Assign whole PDFs to train/val/test with a greedy, capacity-limited heuristic.

    Procedure:
    1. Order the PDFs by their ``total`` column, largest first.
    2. For each PDF, consider every split that still has room.
    3. Score each candidate by ``compute_split_cost`` of that one split with
       the PDF added (val/test scores are doubled) and place the PDF in the
       lowest-scoring split. Exact ties go to whichever candidate comes first
       after a seeded shuffle.

    Parameters
    ----------
    all_pdfs : list
        PDF identifiers; each must be a row label of ``ct_df``.
    ct_df : pandas.DataFrame
        Indexed by PDF, with one count column per class and a ``total`` column.
    classes : list
        Names of the per-class count columns in ``ct_df``.
    global_dist : numpy.ndarray
        Target class proportions, ordered like ``classes``.
    n_train, n_val, n_test : int
        Maximum number of PDFs allowed in each split.
    seed : int
        Seed for the tie-breaking shuffle.

    Returns
    -------
    dict
        Maps "train" / "val" / "test" to lists of PDF identifiers, in the
        order they were assigned.

    Raises
    ------
    ValueError
        If the three capacities together cannot hold every PDF.
    """
    rng = np.random.RandomState(seed)
    pdf_order = ct_df.loc[all_pdfs, "total"].sort_values(ascending=False).index.tolist()

    splits = {"train": [], "val": [], "test": []}
    caps = {"train": n_train, "val": n_val, "test": n_test}

    # Fail early with a clear message: without this check the loop below would
    # eventually run out of candidate splits and index `splits` with None.
    capacity = sum(caps.values())
    if len(pdf_order) > capacity:
        raise ValueError(
            f"Cannot place {len(pdf_order)} PDFs into splits with total capacity "
            f"{capacity} (n_train={n_train}, n_val={n_val}, n_test={n_test})."
        )

    for pdf in pdf_order:
        candidates = [s for s in ["train", "val", "test"] if len(splits[s]) < caps[s]]
        rng.shuffle(candidates)  # break ties randomly

        # Default to the first shuffled candidate so that a PDF is still placed
        # when every candidate cost is non-finite (inf/NaN never beats inf).
        best_split = candidates[0]
        best_cost = float("inf")

        for s in candidates:
            trial = splits[s] + [pdf]
            cost = compute_split_cost(trial, ct_df, classes, global_dist)
            # Original intent: "weight val/test higher — their balance matters
            # more with fewer PDFs".
            # NOTE(review): since the lowest score wins, doubling makes val/test
            # *less* likely to receive a given PDF — confirm this is intended.
            if s in ("val", "test"):
                cost *= 2.0
            if cost < best_cost:
                best_cost = cost
                best_split = s

        splits[best_split].append(pdf)

    return splits

print("Split algorithm defined.")

Split algorithm defined.


---
## Step 3 — Run Split & Select Best Seed

In [3]:
# Re-run the greedy assignment under 200 seeds (the seed only drives the
# tie-breaking shuffle) and keep the first seed whose three splits have the
# lowest summed, unweighted cost.
best_seed = None
best_splits = None
best_total_cost = float("inf")

for seed in range(200):
    candidate = greedy_split(all_pdfs, ct, classes, global_dist, seed=seed)
    per_split_costs = [
        compute_split_cost(candidate[s], ct, classes, global_dist)
        for s in ("train", "val", "test")
    ]
    cost = sum(per_split_costs)
    if not cost < best_total_cost:
        continue
    best_seed, best_splits, best_total_cost = seed, candidate, cost

splits = best_splits
print(f"Best seed: {best_seed} (total cost: {best_total_cost:.6f})")
print(f"\nTrain PDFs ({len(splits['train'])}): {sorted(splits['train'])}")
print(f"Val PDFs   ({len(splits['val'])}):   {sorted(splits['val'])}")
print(f"Test PDFs  ({len(splits['test'])}):  {sorted(splits['test'])}")

Best seed: 2 (total cost: 0.001172)

Train PDFs (20): ['FS1', 'FS10', 'FS11', 'FS12', 'FS15', 'FS17', 'FS18', 'FS19', 'FS22', 'FS3', 'FS4', 'FS7', 'FS9', 'RSF1', 'RSF2', 'RSF3', 'RSF4', 'RSF5', 'RSF6', 'RSF7']
Val PDFs   (5):   ['FS13', 'FS16', 'FS20', 'FS21', 'FS5']
Test PDFs  (5):  ['FS14', 'FS2', 'FS6', 'FS8', 'RSF8']


---
## Step 4 — Split Statistics

In [4]:
def split_stats(split_name, pdf_list, df, ct_df, classes, global_dist):
    """Print a per-class breakdown for one split and return it as a dict.

    Rows of `df` whose `pdf_prefix` is in `pdf_list` make up the split.
    Percentages and deviations are on a 0-100 scale; a deviation is the split
    percentage minus the global percentage. `ct_df` is accepted but not used.
    """
    members = df[df["pdf_prefix"].isin(pdf_list)]
    n = len(members)
    n_docs = len(pdf_list)
    counts = members["label"].value_counts().reindex(classes, fill_value=0)
    pcts = counts / n * 100
    devs = pcts.values - global_dist * 100

    banner = "=" * 60
    print(f"\n{banner}")
    print(f"  {split_name.upper()} — {n_docs} PDFs, {n} images ({n/len(df)*100:.1f}%)")
    print(banner)
    print(f"  PDFs: {sorted(pdf_list)}")
    print(f"  {'Class':<35} {'Count':>6} {'Split%':>8} {'Global%':>8} {'Dev':>8}")
    print("  " + " ".join("-" * width for width in (35, 6, 8, 8, 8)))
    for idx, cls_name in enumerate(classes):
        print(f"  {cls_name:<35} {counts[cls_name]:>6} {pcts[cls_name]:>7.1f}% "
              f"{global_dist[idx]*100:>7.1f}% {devs[idx]:>+7.1f}%")
    max_dev = np.max(np.abs(devs))
    print(f"\n  Max absolute deviation: {max_dev:.2f}%")

    summary = {
        "split": split_name,
        "n_pdfs": n_docs,
        "n_images": n,
        "pct_images": round(n / len(df) * 100, 1),
        "class_pcts": pcts.to_dict(),
        "deviations": {cls_name: dev for cls_name, dev in zip(classes, devs)},
        "max_dev": round(max_dev, 2),
    }
    return summary

# Print the breakdown for each split and keep the returned summaries by name.
stats = {
    name: split_stats(name, splits[name], df, ct, classes, global_dist)
    for name in ["train", "val", "test"]
}


  TRAIN — 20 PDFs, 740 images (62.8%)
  PDFs: ['FS1', 'FS10', 'FS11', 'FS12', 'FS15', 'FS17', 'FS18', 'FS19', 'FS22', 'FS3', 'FS4', 'FS7', 'FS9', 'RSF1', 'RSF2', 'RSF3', 'RSF4', 'RSF5', 'RSF6', 'RSF7']
  Class                                Count   Split%  Global%      Dev
  ----------------------------------- ------ -------- -------- --------
  Financial Sheets                        83    11.2%    10.6%    +0.6%
  Independent Auditor's Report            70     9.5%     9.4%    +0.0%
  Notes (Tabular)                        338    45.7%    46.4%    -0.7%
  Notes (Text)                           205    27.7%    28.0%    -0.3%
  Other Pages                             44     5.9%     5.6%    +0.3%

  Max absolute deviation: 0.72%

  VAL — 5 PDFs, 219 images (18.6%)
  PDFs: ['FS13', 'FS16', 'FS20', 'FS21', 'FS5']
  Class                                Count   Split%  Global%      Dev
  ----------------------------------- ------ -------- -------- --------
  Financial Sheets              

---
## Step 5 — Comparison Table

In [5]:
# One table row per split: size columns first, then "<class>" / "<class> dev"
# pairs in class order, then the split's maximum deviation.
rows = []
for name in ["train", "val", "test"]:
    info = stats[name]
    row = {
        "Split": name,
        "PDFs": info["n_pdfs"],
        "Images": info["n_images"],
        "Images %": info["pct_images"],
    }
    for c in classes:
        row.update({
            c: f"{info['class_pcts'][c]:.1f}%",
            f"{c} dev": f"{info['deviations'][c]:+.1f}%",
        })
    row["Max Dev"] = f"{info['max_dev']:.2f}%"
    rows.append(row)

# Reference row with the dataset-wide distribution; deviations do not apply.
g_row = {"Split": "GLOBAL", "PDFs": n_pdfs, "Images": len(df), "Images %": 100.0}
for c, share in zip(classes, global_dist):
    g_row[c] = f"{share*100:.1f}%"
    g_row[f"{c} dev"] = "—"
g_row["Max Dev"] = "—"
rows.append(g_row)

comp = pd.DataFrame(rows).set_index("Split")
with pd.option_context("display.max_columns", None, "display.width", 220):
    display(comp)

Unnamed: 0_level_0,PDFs,Images,Images %,Financial Sheets,Financial Sheets dev,Independent Auditor's Report,Independent Auditor's Report dev,Notes (Tabular),Notes (Tabular) dev,Notes (Text),Notes (Text) dev,Other Pages,Other Pages dev,Max Dev
Split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
train,20,740,62.8,11.2%,+0.6%,9.5%,+0.0%,45.7%,-0.7%,27.7%,-0.3%,5.9%,+0.3%,0.72%
val,5,219,18.6,9.6%,-1.0%,9.1%,-0.3%,48.9%,+2.5%,27.9%,-0.1%,4.6%,-1.0%,2.46%
test,5,220,18.7,9.5%,-1.1%,9.5%,+0.1%,46.4%,-0.0%,29.1%,+1.1%,5.5%,-0.1%,1.10%
GLOBAL,30,1179,100.0,10.6%,—,9.4%,—,46.4%,—,28.0%,—,5.6%,—,—


---
## Step 6 — Save Split CSVs

In [6]:
# Write one CSV per split, ordered by PDF and then by image name.
export_cols = ["image_name", "label", "pdf_prefix"]
for name in ["train", "val", "test"]:
    row_mask = df["pdf_prefix"].isin(splits[name])
    sub = (
        df.loc[row_mask, export_cols]
        .sort_values(["pdf_prefix", "image_name"])
        .reset_index(drop=True)
    )
    path = f"{name}_doc_split.csv"
    sub.to_csv(path, index=False)
    print(f"Saved {path} — {len(sub)} rows")

Saved train_doc_split.csv — 740 rows
Saved val_doc_split.csv — 219 rows
Saved test_doc_split.csv — 220 rows


---
## Step 7 — Generate Split Report

In [7]:
# Build the markdown report line by line from `stats`, `splits`, `ct`,
# `classes` and `global_dist`, then write it to disk in one call.
split_names = ["train", "val", "test"]

lines = []
lines.append("# Document-Level Split Report")
lines.append("")
lines.append("---")
lines.append("")

# --- Overview ---
lines.append("## 1. Split Overview")
lines.append("")
lines.append(f"- **Total images:** {len(df)}")
lines.append(f"- **Total PDFs:** {n_pdfs}")
# Report the PDF counts actually produced rather than a hard-coded "20 / 5 / 5".
pdf_ratio = " / ".join(str(len(splits[s])) for s in split_names)
lines.append(f"- **Split ratio (PDFs):** {pdf_ratio}")
lines.append(f"- **Algorithm:** Greedy heuristic — assign largest PDFs first, minimizing class-distribution deviation")
lines.append(f"- **Best seed:** {best_seed}")
lines.append("")

# --- PDFs per split ---
lines.append("## 2. PDFs per Split")
lines.append("")
for name in split_names:
    s = stats[name]
    lines.append(f"### {name.capitalize()} — {s['n_pdfs']} PDFs, {s['n_images']} images ({s['pct_images']}%)")
    lines.append("")
    for pdf in sorted(splits[name]):
        pages = int(ct.loc[pdf, 'total'])
        lines.append(f"- {pdf} ({pages} pages)")
    lines.append("")

# --- Image counts ---
lines.append("## 3. Image Counts")
lines.append("")
lines.append("| Split | PDFs | Images | % of Total |")
lines.append("|-------|-----:|-------:|-----------:|")
for name in split_names:
    s = stats[name]
    lines.append(f"| {name.capitalize()} | {s['n_pdfs']} | {s['n_images']} | {s['pct_images']}% |")
lines.append(f"| **Total** | **{n_pdfs}** | **{len(df)}** | **100.0%** |")
lines.append("")

# --- Class distribution per split ---
lines.append("## 4. Class Distribution per Split")
lines.append("")
header = "| Split | " + " | ".join(classes) + " |"
sep = "| --- | " + " | ".join(["---:"]*len(classes)) + " |"
lines.append(header)
lines.append(sep)
for name in split_names:
    vals = " | ".join(f"{stats[name]['class_pcts'][c]:.1f}%" for c in classes)
    lines.append(f"| {name.capitalize()} | {vals} |")
global_vals = " | ".join(f"{global_dist[i]*100:.1f}%" for i in range(len(classes)))
lines.append(f"| **Global** | {global_vals} |")
lines.append("")

# --- Deviation ---
lines.append("## 5. Deviation from Global Distribution")
lines.append("")
header = "| Split | " + " | ".join(classes) + " | Max Dev |"
sep = "| --- | " + " | ".join(["---:"]*len(classes)) + " | ---: |"
lines.append(header)
lines.append(sep)
for name in split_names:
    vals = " | ".join(f"{stats[name]['deviations'][c]:+.1f}%" for c in classes)
    lines.append(f"| {name.capitalize()} | {vals} | {stats[name]['max_dev']:.2f}% |")
lines.append("")

# --- Recommendation ---
lines.append("## 6. Recommendation")
lines.append("")
all_devs = [stats[s]["max_dev"] for s in split_names]
worst_dev = max(all_devs)
worst_split = split_names[all_devs.index(worst_dev)]

# Verdict thresholds are in percentage points of max absolute deviation.
if worst_dev < 3.0:
    verdict = "EXCELLENT"
    detail = "All splits have class distributions within 3 percentage points of the global distribution. This split is well-balanced and ready for use."
elif worst_dev < 5.0:
    verdict = "ACCEPTABLE"
    detail = f"Maximum deviation is {worst_dev:.1f}% in the {worst_split} split. This is within acceptable tolerance for a document-level split with only 30 PDFs."
else:
    verdict = "MARGINAL"
    detail = f"Maximum deviation is {worst_dev:.1f}% in the {worst_split} split. Consider manual adjustment of borderline PDFs."

lines.append(f"**Verdict: {verdict}**")
lines.append("")
lines.append(detail)
lines.append("")
lines.append("### Key observations")
lines.append("")
lines.append(f"- Worst-case maximum deviation: **{worst_dev:.2f}%** ({worst_split} split)")

# Verify class coverage instead of asserting it: a class counts as missing
# from a split when its share there is not strictly positive (NaN included).
missing_by_split = {
    s: [c for c in classes if not stats[s]["class_pcts"][c] > 0]
    for s in split_names
}
if not any(missing_by_split.values()):
    lines.append(f"- All {len(classes)} classes are represented in every split.")
else:
    for s in split_names:
        if missing_by_split[s]:
            absent = ", ".join(str(c) for c in missing_by_split[s])
            lines.append(f"- **Warning:** {s} split has no images for: {absent}")

# check skew spread
# NOTE(review): this PDF id is hard-coded and not derived anywhere in this
# notebook — presumably from an earlier analysis; confirm it is still current.
skewed_pdf = "FS19"
for s in split_names:
    if skewed_pdf in splits[s]:
        lines.append(f"- Skewed PDF ({skewed_pdf}) placed in **{s}** split.")
        break

# check outlier spread
# NOTE(review): hard-coded list, same caveat as `skewed_pdf` above.
outlier_pdfs = ["FS19", "RSF8", "RSF3", "RSF4", "RSF7", "FS11", "FS3",
                "FS12", "FS4", "FS17", "RSF2", "FS22", "RSF1"]
for s in split_names:
    in_split = [p for p in outlier_pdfs if p in splits[s]]
    lines.append(f"- Outlier PDFs in {s}: {len(in_split)} — {sorted(in_split)}")

lines.append("")
lines.append("### Files generated")
lines.append("")
lines.append("- `train_doc_split.csv`")
lines.append("- `val_doc_split.csv`")
lines.append("- `test_doc_split.csv`")
lines.append("")
lines.append("---")
lines.append("")
lines.append("*Report generated automatically by DOCUMENT_LEVEL_SPLIT.ipynb — no model training performed.*")

report_path = Path("DOCUMENT_LEVEL_SPLIT_REPORT.md")
report_path.write_text("\n".join(lines), encoding="utf-8")
print(f"Report written to {report_path.resolve()}")

Report written to /Users/ta/Desktop/Sukuk-AI-Assessment/DOCUMENT_LEVEL_SPLIT_REPORT.md
