# Document-Level Audit

**Purpose:** Audit the labeled dataset before performing a document-level train/val/test split.  
**Scope:** Read-only analysis — the input data is not modified and no models are trained; the only file written is the audit report (`DOCUMENT_LEVEL_AUDIT_REPORT.md`).

---
## Section 1 — Load Data

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Read the label file; each row is one labeled image.
labels_csv = "mini_labels.csv"
df = pd.read_csv(labels_csv)

# Quick orientation: row count, schema, and the label vocabulary.
label_values = df["label"]
print(f"Total labeled images: {len(df)}")
print(f"Columns: {list(df.columns)}")
print(f"Classes: {sorted(label_values.unique())}")
print(f"Number of classes: {label_values.nunique()}")

# Preview of the first rows (rendered as the cell output).
df.head()

Total labeled images: 1179
Columns: ['image_name', 'label']
Classes: ['Financial Sheets', "Independent Auditor's Report", 'Notes (Tabular)', 'Notes (Text)', 'Other Pages']
Number of classes: 5


Unnamed: 0,image_name,label
0,FS1_page_1.jpg,Other Pages
1,FS1_page_2.jpg,Other Pages
2,FS1_page_3.jpg,Independent Auditor's Report
3,FS1_page_4.jpg,Independent Auditor's Report
4,FS1_page_5.jpg,Independent Auditor's Report


In [2]:
# Image names are expected to look like "<pdf_prefix>_page_<N>.jpg"; stripping that
# suffix leaves the identifier of the source document the page came from.
PAGE_SUFFIX_PATTERN = r"_page_\d+\.jpg$"
df["pdf_prefix"] = df["image_name"].str.replace(PAGE_SUFFIX_PATTERN, "", regex=True)

# A name that does not end in the suffix is left unchanged by the replace above, so it
# would be counted as its own one-page "PDF". Report such rows instead of letting them
# silently inflate the document count used by the rest of the audit.
has_page_suffix = df["image_name"].str.contains(PAGE_SUFFIX_PATTERN, regex=True, na=False)
unmatched_names = df.loc[~has_page_suffix, "image_name"]
if len(unmatched_names) > 0:
    print(f"WARNING — {len(unmatched_names)} image name(s) do not match '<prefix>_page_<N>.jpg':")
    # Show only a sample so a badly named dataset does not flood the output.
    for name in unmatched_names.head(10):
        print(f"  {name}")
    print()

# Lexicographic order (so "FS10" sorts before "FS2").
unique_pdfs = sorted(df["pdf_prefix"].unique())
print(f"Number of unique PDFs: {len(unique_pdfs)}")
print("\nSorted PDF prefixes:")
for i, p in enumerate(unique_pdfs, 1):
    print(f"  {i:>2}. {p}")

Number of unique PDFs: 30

Sorted PDF prefixes:
   1. FS1
   2. FS10
   3. FS11
   4. FS12
   5. FS13
   6. FS14
   7. FS15
   8. FS16
   9. FS17
  10. FS18
  11. FS19
  12. FS2
  13. FS20
  14. FS21
  15. FS22
  16. FS3
  17. FS4
  18. FS5
  19. FS6
  20. FS7
  21. FS8
  22. FS9
  23. RSF1
  24. RSF2
  25. RSF3
  26. RSF4
  27. RSF5
  28. RSF6
  29. RSF7
  30. RSF8


---
## Section 2 — PDF Summary Table

In [3]:
# Alphabetically sorted label vocabulary; it fixes the column order of the tables below.
classes = sorted(df["label"].unique())

# Page count per (PDF, class); combinations that never occur are filled with 0.
ct = df.groupby(["pdf_prefix", "label"]).size().unstack(fill_value=0)
absent_classes = [name for name in classes if name not in ct.columns]
for name in absent_classes:
    ct[name] = 0
ct = ct[classes]

# Row total across the class columns.
ct["total_pages"] = ct.sum(axis=1)

# Share of each class within a PDF, as a percentage rounded to one decimal.
class_share = ct[classes].div(ct["total_pages"], axis=0)
pct = (class_share * 100).round(1)
pct.columns = [f"{name} %" for name in classes]

# Counts and percentages side by side, largest document first.
summary = pd.concat([ct, pct], axis=1).sort_values("total_pages", ascending=False)

print("PDF Summary Table (sorted by total pages descending):\n")
display_options = ("display.max_rows", None, "display.max_columns", None, "display.width", 200)
with pd.option_context(*display_options):
    display(summary)

PDF Summary Table (sorted by total pages descending):



Unnamed: 0_level_0,Financial Sheets,Independent Auditor's Report,Notes (Tabular),Notes (Text),Other Pages,total_pages,Financial Sheets %,Independent Auditor's Report %,Notes (Tabular) %,Notes (Text) %,Other Pages %
pdf_prefix,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
FS12,4,4,37,17,2,64,6.2,6.2,57.8,26.6,3.1
FS22,4,5,31,19,1,60,6.7,8.3,51.7,31.7,1.7
FS16,5,4,28,18,2,57,8.8,7.0,49.1,31.6,3.5
FS19,4,2,35,12,2,55,7.3,3.6,63.6,21.8,3.6
FS6,5,5,25,15,2,52,9.6,9.6,48.1,28.8,3.8
FS8,4,4,20,16,1,45,8.9,8.9,44.4,35.6,2.2
FS21,4,5,18,15,2,44,9.1,11.4,40.9,34.1,4.5
RSF8,4,4,17,12,5,42,9.5,9.5,40.5,28.6,11.9
FS2,4,4,20,11,2,41,9.8,9.8,48.8,26.8,4.9
FS14,4,4,20,10,2,40,10.0,10.0,50.0,25.0,5.0


---
## Section 3 — Distribution Consistency Check

In [4]:
# Per-PDF class proportions: class counts divided by that PDF's page total.
proportions = ct[classes].div(ct["total_pages"], axis=0)

# Pooled class shares over all rows, aligned to the class order (absent classes -> 0).
label_shares = df["label"].value_counts(normalize=True)
global_dist = label_shares.reindex(classes).fillna(0)

# Location and spread of each class's proportion across PDFs.
mean_dist, std_dist = proportions.mean(), proportions.std()

# Same three series, expressed in percent with one decimal.
dist_columns = {
    "Global %": global_dist,
    "Mean across PDFs %": mean_dist,
    "Std across PDFs %": std_dist,
}
dist_table = pd.DataFrame(
    {title: (series * 100).round(1) for title, series in dist_columns.items()}
)
print("Class distribution consistency:\n")
display(dist_table)

Class distribution consistency:



Unnamed: 0_level_0,Global %,Mean across PDFs %,Std across PDFs %
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Financial Sheets,10.6,11.1,2.4
Independent Auditor's Report,9.4,9.7,2.2
Notes (Tabular),46.4,45.4,7.8
Notes (Text),28.0,27.8,5.6
Other Pages,5.6,5.9,2.5


In [5]:
DEVIATION_THRESHOLD = 1.5  # flag PDFs where any class deviates > 1.5 std from global

# Absolute gap between each PDF's class proportion and the global proportion, measured
# in units of that class's across-PDF standard deviation.
gap_from_global = proportions - global_dist
deviations = (gap_from_global / std_dist).abs()

# A PDF is scored by its single worst class.
flagged = deviations.max(axis=1)
outlier_pdfs = flagged[flagged > DEVIATION_THRESHOLD].sort_values(ascending=False)

print(f"PDFs deviating > {DEVIATION_THRESHOLD} std from global distribution ({len(outlier_pdfs)} found):\n")
if outlier_pdfs.empty:
    print("  None — all PDFs are reasonably balanced.")
else:
    for pdf, score in outlier_pdfs.items():
        class_scores = deviations.loc[pdf]
        worst_class = class_scores.idxmax()
        pdf_pct = (proportions.loc[pdf, worst_class] * 100).round(1)
        global_pct = (global_dist[worst_class] * 100).round(1)
        print(f"  {pdf:<12} max z-score={score:.2f}  (worst class: '{worst_class}' — PDF {pdf_pct}% vs global {global_pct}%)")

PDFs deviating > 1.5 std from global distribution (13 found):

  FS19         max z-score=2.61  (worst class: 'Independent Auditor's Report' — PDF 3.6% vs global 9.4%)
  RSF8         max z-score=2.57  (worst class: 'Other Pages' — PDF 11.9% vs global 5.6%)
  RSF3         max z-score=2.51  (worst class: 'Other Pages' — PDF 11.8% vs global 5.6%)
  RSF4         max z-score=2.38  (worst class: 'Other Pages' — PDF 11.4% vs global 5.6%)
  RSF7         max z-score=2.06  (worst class: 'Notes (Text)' — PDF 39.5% vs global 28.0%)
  FS11         max z-score=2.03  (worst class: 'Financial Sheets' — PDF 15.4% vs global 10.6%)
  FS3          max z-score=1.93  (worst class: 'Financial Sheets' — PDF 15.2% vs global 10.6%)
  FS12         max z-score=1.84  (worst class: 'Financial Sheets' — PDF 6.2% vs global 10.6%)
  FS4          max z-score=1.83  (worst class: 'Notes (Tabular)' — PDF 32.1% vs global 46.4%)
  FS17         max z-score=1.76  (worst class: 'Notes (Text)' — PDF 18.2% vs global 28.0%)
  RSF

---
## Section 4 — Minimum / Maximum Pages Per PDF

In [6]:
# Pages per PDF, smallest document first.
page_counts = ct["total_pages"].sort_values()

smallest_pdf = page_counts.idxmin()
largest_pdf = page_counts.idxmax()

print("Page-count statistics per PDF:")
print(f"  Mean:   {page_counts.mean():.1f}")
print(f"  Median: {page_counts.median():.1f}")
print(f"  Std:    {page_counts.std():.1f}")
print(f"  Min:    {page_counts.min()} ({smallest_pdf})")
print(f"  Max:    {page_counts.max()} ({largest_pdf})")
print()

# Full ranking, one line per PDF.
print("All PDFs ranked by page count (ascending):")
ranking_lines = [f"  {pdf:<12} {count:>3} pages" for pdf, count in page_counts.items()]
print("\n".join(ranking_lines))

Page-count statistics per PDF:
  Mean:   39.3
  Median: 36.5
  Std:    9.6
  Min:    26 (FS11)
  Max:    64 (FS12)

All PDFs ranked by page count (ascending):
  FS11          26 pages
  FS4           28 pages
  FS7           30 pages
  RSF1          31 pages
  RSF6          32 pages
  FS1           33 pages
  RSF2          33 pages
  FS3           33 pages
  FS9           33 pages
  FS17          33 pages
  RSF3          34 pages
  FS15          34 pages
  RSF5          35 pages
  RSF4          35 pages
  FS18          36 pages
  FS10          37 pages
  RSF7          38 pages
  FS5           39 pages
  FS13          39 pages
  FS20          40 pages
  FS14          40 pages
  FS2           41 pages
  RSF8          42 pages
  FS21          44 pages
  FS8           45 pages
  FS6           52 pages
  FS19          55 pages
  FS16          57 pages
  FS22          60 pages
  FS12          64 pages


---
## Section 5 — Feasibility Check for Document-Level Split

In [7]:
# Document-level split targets: 20/30 of the PDFs for train, 5/30 for validation,
# and the remainder for test, so the three counts always add up to n_pdfs.
n_pdfs = len(unique_pdfs)
target_train = round(n_pdfs * 20 / 30)
target_val = round(n_pdfs * 5 / 30)
target_test = n_pdfs - (target_train + target_val)

print(
    f"Target split for {n_pdfs} PDFs:",
    f"  Train: {target_train} PDFs",
    f"  Val:   {target_val} PDFs",
    f"  Test:  {target_test} PDFs",
    "",
    sep="\n",
)

# Check class coverage: would val/test have all 5 classes?
from itertools import combinations

def check_class_coverage(pdf_list, proportions_df, classes):
    """Find the classes that are absent from a group of PDFs.

    Selects the rows of ``proportions_df`` labelled by ``pdf_list``, sums each
    column, and returns the set of names in ``classes`` whose column sum is 0.
    A name in ``classes`` that is not a column of ``proportions_df`` is also
    reported as absent.
    """
    column_totals = proportions_df.loc[pdf_list].sum()
    uncovered = set()
    for name in classes:
        if column_totals.get(name, 0) == 0:
            uncovered.add(name)
    return uncovered

# For every class, count the PDFs that contain at least one page of it.
class_presence = (ct[classes] > 0).sum()
print("PDFs containing each class:")
for c in classes:
    print(f"  {c:<35} {class_presence[c]:>2} / {n_pdfs} PDFs")

print()
# A class found in fewer PDFs than the combined val+test allocation is treated as rare.
holdout_pdfs = target_val + target_test
rare_classes = class_presence[class_presence < holdout_pdfs]
if rare_classes.empty:
    print("All classes appear in enough PDFs to populate val+test splits.")
else:
    print("WARNING — Classes that appear in fewer PDFs than val+test size:")
    for c, count in rare_classes.items():
        print(f"  '{c}' only in {count} PDFs — risk of missing class in val/test")

Target split for 30 PDFs:
  Train: 20 PDFs
  Val:   5 PDFs
  Test:  5 PDFs

PDFs containing each class:
  Financial Sheets                    30 / 30 PDFs
  Independent Auditor's Report        30 / 30 PDFs
  Notes (Tabular)                     30 / 30 PDFs
  Notes (Text)                        30 / 30 PDFs
  Other Pages                         30 / 30 PDFs

All classes appear in enough PDFs to populate val+test splits.


In [8]:
# Skew analysis: list every PDF whose largest class share is above the threshold.
SKEW_THRESHOLD = 0.60

print(f"PDFs where a single class exceeds {SKEW_THRESHOLD*100:.0f}% of pages:\n")
skewed = []
for pdf in unique_pdfs:
    shares = proportions.loc[pdf]
    max_class, max_pct = shares.idxmax(), shares.max()
    if not max_pct > SKEW_THRESHOLD:
        continue
    # Record (pdf, dominant class, share) and echo it immediately.
    skewed.append((pdf, max_class, max_pct))
    print(f"  {pdf:<12} {max_class:<35} {max_pct*100:.1f}%")

if skewed:
    print(f"\n  Total skewed PDFs: {len(skewed)} / {n_pdfs}")
else:
    print("  None.")

PDFs where a single class exceeds 60% of pages:

  FS19         Notes (Tabular)                     63.6%

  Total skewed PDFs: 1 / 30


In [9]:
# Small-PDF risk: PDFs whose page count is at most the number of classes.
SMALL_THRESHOLD = len(classes)

small_pdfs = page_counts[page_counts <= SMALL_THRESHOLD]
print(f"Small PDFs (≤ {SMALL_THRESHOLD} pages) that could distort splits:\n")
if small_pdfs.empty:
    print("  None — all PDFs have a reasonable number of pages.")
else:
    for pdf, count in small_pdfs.items():
        # Classes with at least one page in this PDF.
        class_counts = ct.loc[pdf, classes]
        present = [c for c in classes if class_counts[c] > 0]
        print(f"  {pdf:<12} {count} pages — classes present: {present}")

Small PDFs (≤ 5 pages) that could distort splits:

  None — all PDFs have a reasonable number of pages.


---
## Section 6 — Generate Audit Report

In [10]:
# Assemble the Markdown audit report line by line from the results computed in the
# earlier sections, then write it to DOCUMENT_LEVEL_AUDIT_REPORT.md.
lines = []
lines.append("# Document-Level Audit Report")
lines.append("")
lines.append("---")
lines.append("")

# --- Dataset Summary ---
lines.append("## 1. Dataset Summary")
lines.append("")
lines.append(f"- **Total labeled images:** {len(df)}")
lines.append(f"- **Number of unique PDFs:** {n_pdfs}")
lines.append(f"- **Number of classes:** {len(classes)}")
lines.append(f"- **Classes:** {', '.join(classes)}")
lines.append("")
lines.append("### Global class distribution")
lines.append("")
lines.append("| Class | Count | Percentage |")
lines.append("|-------|------:|-----------:|")
for c in classes:
    cnt = int((df["label"] == c).sum())
    # Named class_pct rather than pct so the Section 2 percentage table `pct`
    # is not overwritten with a float.
    class_pct = cnt / len(df) * 100
    lines.append(f"| {c} | {cnt} | {class_pct:.1f}% |")
lines.append("")

# --- List of all PDFs ---
lines.append("## 2. List of All PDFs")
lines.append("")
for i, pdf in enumerate(unique_pdfs, 1):
    lines.append(f"{i}. {pdf}")
lines.append("")

# --- PDF Summary Table ---
lines.append("## 3. PDF Summary Table")
lines.append("")
header_cols = ["PDF"] + classes + ["Total"] + [f"{c} %" for c in classes]
lines.append("| " + " | ".join(header_cols) + " |")
# First column left-aligned, every numeric column right-aligned.
sep = "| --- " + "| ---: " * (len(header_cols) - 1) + "|"
lines.append(sep)
for pdf in summary.index:
    row_vals = [pdf]
    for c in classes:
        row_vals.append(str(int(summary.loc[pdf, c])))
    row_vals.append(str(int(summary.loc[pdf, "total_pages"])))
    for c in classes:
        row_vals.append(f"{summary.loc[pdf, f'{c} %']:.1f}%")
    lines.append("| " + " | ".join(row_vals) + " |")
lines.append("")

# --- Distribution Consistency ---
lines.append("## 4. Distribution Consistency Analysis")
lines.append("")
lines.append("### Mean and Std of class proportions across PDFs")
lines.append("")
lines.append("| Class | Global % | Mean across PDFs % | Std across PDFs % |")
lines.append("|-------|--------:|---------:|---------:|")
for c in classes:
    lines.append(f"| {c} | {global_dist[c]*100:.1f}% | {mean_dist[c]*100:.1f}% | {std_dist[c]*100:.1f}% |")
lines.append("")

lines.append(f"### PDFs deviating > {DEVIATION_THRESHOLD} std from global distribution")
lines.append("")
if len(outlier_pdfs) > 0:
    lines.append("| PDF | Max Z-Score | Worst Class | PDF % | Global % |")
    lines.append("|-----|--------:|------------|------:|--------:|")
    for pdf, score in outlier_pdfs.items():
        worst_class = deviations.loc[pdf].idxmax()
        pdf_pct = proportions.loc[pdf, worst_class] * 100
        glob_pct = global_dist[worst_class] * 100
        lines.append(f"| {pdf} | {score:.2f} | {worst_class} | {pdf_pct:.1f}% | {glob_pct:.1f}% |")
else:
    lines.append("None — all PDFs are reasonably balanced.")
lines.append("")

# --- Page count stats ---
lines.append("## 5. Page Count Statistics")
lines.append("")
lines.append(f"- **Mean pages per PDF:** {page_counts.mean():.1f}")
lines.append(f"- **Median pages per PDF:** {page_counts.median():.1f}")
lines.append(f"- **Std of page counts:** {page_counts.std():.1f}")
lines.append(f"- **Smallest PDF:** {page_counts.idxmin()} ({page_counts.min()} pages)")
lines.append(f"- **Largest PDF:** {page_counts.idxmax()} ({page_counts.max()} pages)")
lines.append("")

# --- Feasibility & Recommendations ---
lines.append("## 6. Recommendation for Split Strategy")
lines.append("")
lines.append(f"Target split ratio: **20 / 5 / 5** (train / val / test) from {n_pdfs} PDFs.")
lines.append("")
lines.append(f"Equivalent target: **{target_train} / {target_val} / {target_test}** PDFs.")
lines.append("")

lines.append("### Class coverage")
lines.append("")
lines.append("| Class | Present in N PDFs |")
lines.append("|-------|------------------:|")
for c in classes:
    lines.append(f"| {c} | {class_presence[c]} / {n_pdfs} |")
lines.append("")

lines.append("### Skewed PDFs")
lines.append("")
if skewed:
    lines.append(f"{len(skewed)} PDF(s) have a single class exceeding {SKEW_THRESHOLD*100:.0f}% of their pages:")
    lines.append("")
    # `share` (not `pct`) for the same reason as class_pct above.
    for pdf, cls, share in skewed:
        lines.append(f"- **{pdf}** — {cls} at {share*100:.1f}%")
else:
    lines.append("No PDFs are heavily skewed toward a single class.")
lines.append("")

lines.append("### Small PDFs")
lines.append("")
if len(small_pdfs) > 0:
    lines.append(f"{len(small_pdfs)} PDF(s) have ≤ {SMALL_THRESHOLD} pages and could distort class balance in small splits:")
    lines.append("")
    for pdf, count in small_pdfs.items():
        lines.append(f"- **{pdf}** — {count} pages")
else:
    lines.append("No PDFs are too small to distort splits.")
lines.append("")

# --- Risks ---
# One bullet per finding that fired above; a single "no major risks" bullet otherwise.
lines.append("## 7. Potential Risks")
lines.append("")
risks = []
if len(outlier_pdfs) > 0:
    risks.append(f"**Distribution outliers:** {len(outlier_pdfs)} PDF(s) deviate significantly from the global class distribution. Placing these in val/test could bias evaluation metrics.")
if skewed:
    risks.append(f"**Skewed PDFs:** {len(skewed)} PDF(s) are dominated by a single class. If several end up in the same split, that split will be imbalanced.")
if len(small_pdfs) > 0:
    risks.append(f"**Small PDFs:** {len(small_pdfs)} PDF(s) have very few pages. They contribute little data but take up a PDF slot in the split.")
if len(rare_classes) > 0:
    for c, count in rare_classes.items():
        risks.append(f"**Rare class coverage:** '{c}' only appears in {count} PDFs, which is fewer than the val+test allocation. There is a risk that val or test will have zero samples of this class.")
if not risks:
    risks.append("No major risks identified. The dataset is well-distributed for a document-level split.")
for r in risks:
    lines.append(f"- {r}")
lines.append("")
lines.append("---")
lines.append("")
lines.append("*Report generated automatically by DOCUMENT_LEVEL_AUDIT.ipynb — audit only, no modifications made.*")

# Write the report next to the notebook (path is relative to the working directory).
report_path = Path("DOCUMENT_LEVEL_AUDIT_REPORT.md")
report_path.write_text("\n".join(lines), encoding="utf-8")
print(f"Report written to {report_path.resolve()}")
print(f"Total lines: {len(lines)}")

Report written to /Users/ta/Desktop/Sukuk-AI-Assessment/DOCUMENT_LEVEL_AUDIT_REPORT.md
Total lines: 161
