In [29]:
# Environment check: report the installed PyTorch build and whether CUDA is usable.
import torch
print(torch.__version__)  # CUDA builds show a +cu118 or +cu121 suffix; a "+cpu" suffix means a CPU-only build
print("CUDA available:", torch.cuda.is_available())  # expected True on a working CUDA setup; False means everything runs on CPU

2.8.0+cpu
CUDA available: False


In [30]:
from pathlib import Path

In [31]:
from typing import List, Dict, Any, Tuple

In [32]:
import sys
print(sys.executable)


C:\Users\user\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe


In [33]:
import os
from pathlib import Path
import torch
from transformers import pipeline
import matplotlib.pyplot as plt

In [34]:
# OCR
import pytesseract
from pytesseract import Output

In [35]:
# Imaging & PDF
from PIL import Image, ImageDraw
import fitz  # PyMuPDF

In [36]:
# NER
from transformers import pipeline

In [37]:
# CSV log
import csv
from datetime import datetime

In [38]:
# Model id used by load_ner_pipeline() when the caller does not name one.
DEFAULT_NER_MODEL = "dslim/bert-base-NER"  # good general English NER
# Lower-case file suffixes that process_file() handles as single-page images
# (it lower-cases the input suffix before the membership test).
SUPPORTED_IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp"}

In [39]:
from dataclasses import dataclass
from typing import Tuple
@dataclass
class RedactionEvent:
    """One NER entity that passed the confidence threshold and was blacked out."""
    page_index: int  # 0-based page number; process_image() writes 0 and the PDF loop overwrites it
    text: str  # entity text, sliced out of the OCR line by the entity's start/end offsets
    label: str  # NER label taken from the entity's "entity_group" (or "entity") field
    score: float  # NER confidence score of the entity
    bbox: Tuple[int, int, int, int]  # (left, top, right, bottom) on page image coords
    line_index: int  # index of the OCR line (as numbered by words_to_lines) containing the entity

In [40]:
@dataclass
class DocumentSummary:
    """Per-document result record built by process_file() and written to *_summary.json."""
    input_path: str  # str() of the processed input path
    total_pages: int  # number of rendered PDF pages, or 1 for an image
    total_entities: int  # process_file() sets this to the number of redaction events
    redacted_entities: int  # process_file() sets this to the same value as total_entities
    sensitivity_score: float  # result of compute_sensitivity_score() over the events
    sensitivity_label: str  # result of label_sensitivity() for the score
    threshold: float  # NER confidence threshold used for the run
    model_name: str  # NER model id used for the run
    generated_at: str  # UTC timestamp in ISO-8601 form with a trailing "Z"

In [41]:
def load_ner_pipeline(model_name: str = DEFAULT_NER_MODEL):
    """
    Build the Hugging Face token-classification pipeline used for entity detection.

    The pipeline is created with aggregation_strategy="simple"; downstream code reads
    the 'entity_group', 'score', 'start' and 'end' fields of each returned entity.
    """
    ner_options = {
        "model": model_name,
        "aggregation_strategy": "simple",
    }
    return pipeline("token-classification", **ner_options)

In [42]:
def pdf_to_images(pdf_path: Path, dpi: int = 200) -> List[Image.Image]:
    """
    Rasterise every page of a PDF with PyMuPDF and return the pages as RGB PIL images.

    The zoom factor is dpi/72 on both axes; pages are rendered without an alpha channel.
    """
    zoom = dpi / 72
    scale = fitz.Matrix(zoom, zoom)  # same transform for every page
    with fitz.open(pdf_path) as doc:
        pixmaps = (page.get_pixmap(matrix=scale, alpha=False) for page in doc)
        # The generator is consumed here, while the document is still open.
        return [
            Image.frombytes("RGB", [pm.width, pm.height], pm.samples)
            for pm in pixmaps
        ]

def load_image_any(path: Path) -> Image.Image:
    """
    Load an image file and return it as an RGB PIL image.

    The file is opened in a context manager so the underlying file handle is closed
    as soon as the pixels have been converted, instead of staying open until the
    lazily-loaded image object is garbage collected. The returned image is the
    converted copy and stays usable after the file is closed.
    """
    with Image.open(path) as source:
        return source.convert("RGB")

def ocr_image_words(img: Image.Image, lang: str = "eng") -> List[Dict[str, Any]]:
    """
    OCR an image and return one dictionary per non-blank recognised word.

    Uses pytesseract.image_to_data to get word-level boxes. Each dictionary holds:
      - text, conf, left, top, width, height, line_num, block_num, par_num

    Entries whose text is None or whitespace-only are dropped. A confidence value
    that cannot be parsed as a float is stored as -1.0; `line_num`, `block_num` and
    `par_num` fall back to 0 when the OCR output lacks those columns.
    """
    data = pytesseract.image_to_data(img, lang=lang, output_type=Output.DICT)
    texts = data["text"]
    zeros = [0] * len(texts)  # stand-in for any missing optional column
    line_nums = data.get("line_num", zeros)
    block_nums = data.get("block_num", zeros)
    par_nums = data.get("par_num", zeros)

    words = []
    for i, text in enumerate(texts):
        if text is None or text.strip() == "":
            continue
        try:
            conf = float(data["conf"][i])
        except (ValueError, TypeError):
            # Only unparsable confidence values are tolerated; any other failure
            # (e.g. a missing "conf" column) is a real error and must surface.
            conf = -1.0
        words.append({
            "text": text,
            "conf": conf,
            "left": int(data["left"][i]),
            "top": int(data["top"][i]),
            "width": int(data["width"][i]),
            "height": int(data["height"][i]),
            "line_num": int(line_nums[i]),
            "block_num": int(block_nums[i]),
            "par_num": int(par_nums[i]),
        })
    return words

In [43]:
def words_to_lines(words: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Assemble OCR words into text lines.

    Words are grouped by (block_num, par_num, line_num) so that equal line numbers in
    different blocks do not collide, and ordered left-to-right inside each group.
    Returns a list of lines: { line_index, text, spans }
    spans: list of {text, bbox, start_char, end_char, conf}, where the character
    offsets index into the line text (words joined by single spaces).
    """
    grouped: Dict[Tuple[int, int, int], List[Dict[str, Any]]] = {}
    for word in words:
        line_key = (word["block_num"], word["par_num"], word["line_num"])
        grouped.setdefault(line_key, []).append(word)

    lines = []
    for line_index, line_key in enumerate(sorted(grouped)):
        ordered = sorted(grouped[line_key], key=lambda w: w["left"])
        spans = []
        offset = 0
        for position, word in enumerate(ordered):
            token = str(word["text"])
            if position:
                offset += 1  # account for the space that joins this word to the previous one
            left, top = word["left"], word["top"]
            spans.append({
                "text": token,
                "bbox": (left, top, left + word["width"], top + word["height"]),
                "start_char": offset,
                "end_char": offset + len(token),
                "conf": word["conf"],
            })
            offset += len(token)
        lines.append({
            "line_index": line_index,
            "text": " ".join(span["text"] for span in spans),
            "spans": spans,
        })
    return lines

In [44]:
def align_entity_to_bboxes(entity: Dict[str, Any], line: Dict[str, Any]) -> List[Tuple[int,int,int,int]]:
    """
    Map an entity's character range onto the word boxes of its line.

    `entity` carries 'start'/'end' character offsets into the line text; every span in
    line["spans"] whose [start_char, end_char) range overlaps that interval contributes
    its bbox, in span order. Spans that merely touch the interval are excluded.
    """
    ent_start, ent_end = entity["start"], entity["end"]
    return [
        span["bbox"]
        for span in line["spans"]
        if span["end_char"] > ent_start and span["start_char"] < ent_end
    ]

In [45]:
def union_bboxes(bboxes: List[Tuple[int,int,int,int]]) -> Tuple[int,int,int,int]:
    """
    Return the smallest (left, top, right, bottom) box that encloses every input box.

    Raises ValueError when `bboxes` is empty.
    """
    lefts, tops, rights, bottoms = zip(*bboxes)
    return (min(lefts), min(tops), max(rights), max(bottoms))

def redact_boxes(img: Image.Image, boxes: List[Tuple[int,int,int,int]], margin: int = 2) -> Image.Image:
    """
    Paint a filled black rectangle over each box, grown by `margin` pixels on every side.

    The image is modified in place and also returned.
    """
    canvas = ImageDraw.Draw(img)
    for left, top, right, bottom in boxes:
        padded = [left - margin, top - margin, right + margin, bottom + margin]
        canvas.rectangle(padded, fill="black")
    return img

def compute_sensitivity_score(events: List[RedactionEvent]) -> float:
    """
    Collapse entity-level confidence scores into one document score in [0, 1].

    The result is the weighted mean of the event scores, where each event's weight
    comes from its label (tunable table below; unknown labels get 0.7). An empty
    event list scores 0.0.
    """
    if not events:
        return 0.0

    label_weights = {
        "PER": 1.0, "PERSON": 1.0,
        "ORG": 0.8, "ORGANIZATION": 0.8,
        "LOC": 0.6, "LOCATION": 0.6,
        "GPE": 0.6,
        "DATE": 0.4, "TIME": 0.4,
        "NORP": 0.5, "MISC": 0.5,
        "EMAIL": 0.9, "EMAIL_ADDRESS": 0.9,
        "PHONE_NUMBER": 0.9, "CARDINAL": 0.5,
        "MONEY": 0.8, "CREDIT_CARD": 1.0, "ACCOUNT_NUMBER": 1.0,
        # add other domain-specific tags as needed
    }
    fallback_weight = 0.7  # mid-high weight for labels missing from the table

    weighted_sum = 0.0
    weight_sum = 0.0
    for event in events:
        weight = label_weights.get(event.label, fallback_weight)
        weighted_sum += weight * event.score
        weight_sum += weight

    mean_score = weighted_sum / max(weight_sum, 1e-6)  # guard against a zero denominator
    # Clamp into [0, 1]; no further normalisation is applied.
    return min(1.0, max(0.0, mean_score))

def label_sensitivity(score: float) -> str:
    """
    Translate a sensitivity score into a label.

    Bands (inclusive lower bounds): >= 0.85 HIGH, >= 0.6 MEDIUM, >= 0.3 LOW,
    anything lower PUBLIC.
    """
    bands = (
        (0.85, "HIGH"),
        (0.6, "MEDIUM"),
        (0.3, "LOW"),
    )
    for lower_bound, name in bands:
        if score >= lower_bound:
            return name
    return "PUBLIC"

In [46]:
def process_image(
    img: Image.Image,
    ner_pipe,
    threshold: float = 0.85,
    ocr_lang: str = "eng",
) -> Tuple[Image.Image, List[RedactionEvent]]:
    """
    Run OCR and NER over one page image and black out confident entities.

    Each OCR line is passed to `ner_pipe` on its own. An entity is kept when its score
    is at least `threshold` and it overlaps at least one OCR word; the union of the
    overlapping word boxes is what gets redacted. Returns a redacted copy of `img`
    (the input is not modified) together with one RedactionEvent per kept entity.
    """
    ocr_lines = words_to_lines(ocr_image_words(img, lang=ocr_lang))
    events: List[RedactionEvent] = []

    for line in ocr_lines:
        line_text = line["text"]
        if not line_text.strip():
            continue
        # ner_pipe yields e.g. [{'entity_group': 'PER', 'score': 0.99, 'word': 'John', 'start': 0, 'end': 4}, ...]
        for ent in ner_pipe(line_text):
            confidence = float(ent.get("score", 0.0))
            if confidence < threshold:
                continue
            word_boxes = align_entity_to_bboxes(ent, line)
            if not word_boxes:
                continue
            events.append(RedactionEvent(
                page_index=0,  # to be overwritten by caller for PDFs
                text=line_text[ent["start"]:ent["end"]],
                label=str(ent.get("entity_group", ent.get("entity", "ENTITY"))),
                score=confidence,
                bbox=union_bboxes(word_boxes),
                line_index=line["line_index"],
            ))

    # Paint on a copy so the caller's image stays untouched.
    redacted = img.copy()
    redact_boxes(redacted, [event.bbox for event in events])
    return redacted, events

In [47]:
def save_pdf_from_images(images: List[Image.Image], out_path: Path, resolution: float = 200.0):
    """
    Write a list of page images to a single PDF, one image per page.

    Args:
        images: page images in output order; each is converted to RGB before saving.
        out_path: destination PDF path.
        resolution: resolution value passed to the PDF writer. Defaults to 200.0,
            the value that was previously hard-coded.

    Raises:
        ValueError: if `images` is empty (previously this surfaced as a bare IndexError).
    """
    if not images:
        raise ValueError("save_pdf_from_images() needs at least one page image")

    first_page, *other_pages = [im.convert("RGB") for im in images]
    if other_pages:
        first_page.save(
            out_path, "PDF", resolution=resolution,
            save_all=True, append_images=other_pages,
        )
    else:
        first_page.save(out_path, "PDF", resolution=resolution)

In [48]:
def write_logs(events: List[RedactionEvent], summary: DocumentSummary, out_dir: Path, base_name: str):
    """
    Write the redaction log and the document summary into `out_dir`.

    Three files are produced, all UTF-8:
      - {base_name}_redaction_log.json : list of events as dictionaries
      - {base_name}_redaction_log.csv  : one row per event, bbox split into four columns
      - {base_name}_summary.json       : the summary as a dictionary
    """
    # Imported locally: the notebook only imports `json` and `asdict` in later cells,
    # so relying on module-level names makes this function fail with NameError
    # depending on cell execution order.
    import json
    from dataclasses import asdict

    # JSON
    json_path = out_dir / f"{base_name}_redaction_log.json"
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump([asdict(e) for e in events], f, indent=2)

    # CSV (newline="" lets the csv module control line endings)
    csv_path = out_dir / f"{base_name}_redaction_log.csv"
    with open(csv_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["page_index", "line_index", "label", "score", "text", "bbox_left", "bbox_top", "bbox_right", "bbox_bottom"])
        for e in events:
            l, t, r, b = e.bbox
            writer.writerow([e.page_index, e.line_index, e.label, f"{e.score:.4f}", e.text, l, t, r, b])

    # Summary
    summary_path = out_dir / f"{base_name}_summary.json"
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(asdict(summary), f, indent=2)

In [49]:
def process_file(input_path: Path, output_dir: Path, threshold: float, model_name: str, ocr_lang: str):
    """
    Redact one PDF or image file and write the redacted output plus logs.

    Args:
        input_path: file to process; ".pdf" or a suffix in SUPPORTED_IMAGE_EXTS.
        output_dir: directory for the redacted file and the logs (created if missing).
        threshold: minimum NER score for an entity to be redacted.
        model_name: NER model id passed to load_ner_pipeline().
        ocr_lang: Tesseract language code passed to the OCR step.

    Returns:
        DocumentSummary describing the run.

    Raises:
        ValueError: for an unsupported file suffix. The check now happens before the
            output directory is created and before the NER model is loaded, so a bad
            input fails immediately instead of after an expensive model load.
    """
    # Local import: the notebook only imports the `datetime` class itself.
    from datetime import timezone

    ext = input_path.suffix.lower()
    is_pdf = ext == ".pdf"
    if not is_pdf and ext not in SUPPORTED_IMAGE_EXTS:
        raise ValueError(f"Unsupported file type: {ext}")

    output_dir.mkdir(parents=True, exist_ok=True)
    ner_pipe = load_ner_pipeline(model_name=model_name)

    redaction_events: List[RedactionEvent] = []
    redacted_name = input_path.stem + "_REDACTED"

    if is_pdf:
        page_images = pdf_to_images(input_path, dpi=200)
        total_pages = len(page_images)
        redacted_images: List[Image.Image] = []
        for page_index, page_img in enumerate(page_images):
            red_img, events = process_image(page_img, ner_pipe, threshold=threshold, ocr_lang=ocr_lang)
            # process_image() always reports page 0; record the real page here
            for event in events:
                event.page_index = page_index
            redacted_images.append(red_img)
            redaction_events.extend(events)

        # Save redacted PDF
        save_pdf_from_images(redacted_images, output_dir / f"{redacted_name}.pdf")
    else:
        total_pages = 1
        img = load_image_any(input_path)
        red_img, events = process_image(img, ner_pipe, threshold=threshold, ocr_lang=ocr_lang)
        for event in events:
            event.page_index = 0
        redaction_events.extend(events)
        # Save redacted image (same format as input)
        red_img.save(output_dir / f"{redacted_name}{ext}")

    # Compute sensitivity
    sens_score = compute_sensitivity_score(redaction_events)
    sens_label = label_sensitivity(sens_score)

    # utcnow() is deprecated since Python 3.12; an aware UTC timestamp rendered with
    # a "Z" suffix produces the same string format as before.
    generated_at = datetime.now(timezone.utc).isoformat().replace("+00:00", "Z")

    # NOTE: only entities at or above the threshold are returned by process_image(),
    # so total_entities and redacted_entities are always equal here.
    summary = DocumentSummary(
        input_path=str(input_path),
        total_pages=total_pages,
        total_entities=len(redaction_events),
        redacted_entities=len(redaction_events),
        sensitivity_score=sens_score,
        sensitivity_label=sens_label,
        threshold=threshold,
        model_name=model_name,
        generated_at=generated_at,
    )
    # Logs are named after the input stem (without the _REDACTED suffix)
    write_logs(redaction_events, summary, output_dir, input_path.stem)

    return summary

In [50]:
import torch
import sys
import json
from pathlib import Path
from transformers import pipeline


def main(input_file, output_dir, threshold=0.85, model_name="dslim/bert-base-NER", ocr_lang="eng"):
    """
    Redact a single file and print the resulting summary as JSON.

    Args:
        input_file: path (str or Path) of the PDF/image to redact.
        output_dir: directory for the redacted file and logs (created if missing).
        threshold: minimum NER score for an entity to be redacted.
        model_name: NER model id.
        ocr_lang: Tesseract language code.

    Exits with status 1 (SystemExit) when the input file does not exist. Errors
    raised while processing are printed to stderr and the function returns None.
    """
    # Local import: `asdict` is only imported in a later notebook cell. Without this,
    # the NameError was caught by the handler below and reported as a processing
    # error even though the redaction itself had already succeeded.
    from dataclasses import asdict

    print("🚀 Running AI Shredder++")
    print(f"Input file     : {input_file}")
    print(f"Output folder  : {output_dir}")
    print(f"Confidence thr : {threshold}")

    # Convert strings → Path objects
    input_path = Path(input_file)
    output_path = Path(output_dir)

    if not input_path.exists():
        print(f"❌ Input file not found: {input_path}", file=sys.stderr)
        sys.exit(1)

    # Ensure output directory exists
    output_path.mkdir(parents=True, exist_ok=True)

    try:
        summary = process_file(
            input_path,
            output_path,
            threshold=threshold,
            model_name=model_name,
            ocr_lang=ocr_lang
        )
    except Exception as e:
        print(f"❌ Error: {e}", file=sys.stderr)
        return
    else:
        # Reporting sits outside the try body so a reporting bug is not mistaken
        # for a processing failure.
        print(json.dumps(asdict(summary), indent=2))
        print(f"\n✅ Redaction complete. Files saved to: {output_path}")


In [51]:
from dataclasses import asdict

In [52]:
import os
from pathlib import Path

# ---- Modified main() ----
def main(input_file, output_dir, threshold=0.9):
    """
    Placeholder single-file entry point that reports a simulated entity count.

    WARNING: this definition replaces the earlier `main` in the notebook namespace and
    does NOT run OCR, NER or redaction. It prints the run banner and always returns 3,
    so any statistics built on its return value are simulated, not measured.

    TODO: replace the simulated list below with the real detection/redaction pipeline
    and save the redacted image into `output_dir`.
    """
    banner_lines = (
        "🚀 Running AI Shredder++",
        f"Input file     : {input_file}",
        f"Output folder  : {output_dir}",
        f"Confidence thr : {threshold}",
    )
    for banner_line in banner_lines:
        print(banner_line)

    # Simulated detection result: pretend three entities were found.
    # (Remove once the actual pipeline is wired in.)
    simulated_entities = ("Name", "Date", "AccountNumber")

    return len(simulated_entities)   # integer entity count


# ---- Batch Processing with Stats ----
input_folder = Path("budget/budget")
output_folder = Path("budget/redacted")
output_folder.mkdir(parents=True, exist_ok=True)

exts = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp"}

# Running totals for the summary report
total_images = 0
total_entities = 0
redacted_images = 0

for img_path in input_folder.iterdir():
    if img_path.suffix.lower() not in exts:
        continue  # skip anything that is not a supported image

    total_images += 1
    print(f"\nRedacting: {img_path}")

    # main() may return None; count that as zero entities
    entities_found = main(
        input_file=str(img_path),
        output_dir=str(output_folder),
        threshold=0.9,
    ) or 0

    if entities_found > 0:
        redacted_images += 1
        total_entities += entities_found

# ---- SUMMARY ----
report_lines = [
    "\n📊 SUMMARY REPORT",
    f"Total Images Processed     : {total_images}",
    f"Images with Redactions     : {redacted_images}",
    f"Total Entities Redacted    : {total_entities}",
]
if total_images > 0:
    # Ratios are only meaningful when at least one image was seen
    report_lines.append(f"Avg Entities per Image     : {total_entities / total_images:.2f}")
    report_lines.append(f"Redaction Coverage (%)     : {100 * redacted_images / total_images:.2f}%")
for report_line in report_lines:
    print(report_line)



Redacting: budget\budget\0000009955.tif
🚀 Running AI Shredder++
Input file     : budget\budget\0000009955.tif
Output folder  : budget\redacted
Confidence thr : 0.9

Redacting: budget\budget\0000009994.tif
🚀 Running AI Shredder++
Input file     : budget\budget\0000009994.tif
Output folder  : budget\redacted
Confidence thr : 0.9

Redacting: budget\budget\0000011675.tif
🚀 Running AI Shredder++
Input file     : budget\budget\0000011675.tif
Output folder  : budget\redacted
Confidence thr : 0.9

Redacting: budget\budget\0000011677.tif
🚀 Running AI Shredder++
Input file     : budget\budget\0000011677.tif
Output folder  : budget\redacted
Confidence thr : 0.9

Redacting: budget\budget\0000016462.tif
🚀 Running AI Shredder++
Input file     : budget\budget\0000016462.tif
Output folder  : budget\redacted
Confidence thr : 0.9

Redacting: budget\budget\0000016526.tif
🚀 Running AI Shredder++
Input file     : budget\budget\0000016526.tif
Output folder  : budget\redacted
Confidence thr : 0.9

Redacting

In [53]:
pip install PyPDF2 fpdf pdfplumber


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 25.2
[notice] To update, run: C:\Users\user\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [54]:
import os
import shutil
import pytesseract

# Locate the Tesseract executable without tying the notebook to one machine:
#   1. the TESSERACT_CMD environment variable, if set;
#   2. a `tesseract` executable found on PATH;
#   3. the default Windows install location used originally.
pytesseract.pytesseract.tesseract_cmd = (
    os.environ.get("TESSERACT_CMD")
    or shutil.which("tesseract")
    or r"C:\Program Files\Tesseract-OCR\tesseract.exe"
)

In [57]:
import os
from pathlib import Path
from PIL import Image, ImageDraw
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
from dataclasses import dataclass, asdict
import pytesseract
from pytesseract import Output
import fitz # PyMuPDF
from fpdf import FPDF
from PyPDF2 import PdfReader, PdfWriter
import csv
from datetime import datetime
from collections import defaultdict
from typing import List, Dict, Any, Tuple

# --- Data Classes from original notebook ---
@dataclass
class RedactionEvent:
    """One NER entity that passed the confidence threshold (re-declared from an earlier cell)."""
    page_index: int  # 0-based page number; always 0 for events built in this cell
    text: str  # entity text, sliced out of the OCR line by the entity's start/end offsets
    label: str  # NER label taken from the entity's "entity_group" (or "entity") field
    score: float  # NER confidence score of the entity
    bbox: Tuple[int, int, int, int]  # (left, top, right, bottom) union of the entity's word boxes
    line_index: int  # index of the OCR line (as numbered by words_to_lines) containing the entity

@dataclass
class DocumentSummary:
    """Per-document result record (re-declared from an earlier cell; not constructed in this cell)."""
    input_path: str  # path of the processed input, as a string
    total_pages: int  # number of pages processed
    total_entities: int  # number of entities counted for the document
    redacted_entities: int  # number of entities that were redacted
    sensitivity_score: float  # document sensitivity score
    sensitivity_label: str  # label derived from the score
    threshold: float  # NER confidence threshold used for the run
    model_name: str  # NER model id used for the run
    generated_at: str  # timestamp string recording when the summary was produced

# --- Helper functions from original notebook (functional versions) ---
def load_ner_pipeline(model_name: str = "dslim/bert-base-NER"):
    """
    Create the token-classification pipeline for `model_name`.

    The pipeline is built with aggregation_strategy="simple", matching the loader
    defined earlier in the notebook.
    """
    task = "token-classification"
    ner = pipeline(task, model=model_name, aggregation_strategy="simple")
    return ner

def ocr_image_words(img: Image.Image, lang: str = "eng") -> List[Dict[str, Any]]:
    """
    Run Tesseract word-level OCR and return one dictionary per non-blank word with:
      - text, conf, left, top, width, height, line_num, block_num, par_num

    An unparsable confidence becomes -1.0; missing line/block/paragraph columns
    default to 0.
    """
    data = pytesseract.image_to_data(img, lang=lang, output_type=Output.DICT)
    texts = data["text"]
    placeholder = [0] * len(texts)

    def optional_column(name):
        # Optional columns fall back to an all-zero column of the same length.
        return data.get(name, placeholder)

    words = []
    for i, raw_text in enumerate(texts):
        if raw_text is None or raw_text.strip() == "":
            continue
        try:
            confidence = float(data["conf"][i])
        except (ValueError, TypeError):
            confidence = -1.0
        words.append(dict(
            text=raw_text,
            conf=confidence,
            left=int(data["left"][i]),
            top=int(data["top"][i]),
            width=int(data["width"][i]),
            height=int(data["height"][i]),
            line_num=int(optional_column("line_num")[i]),
            block_num=int(optional_column("block_num")[i]),
            par_num=int(optional_column("par_num")[i]),
        ))
    return words

def words_to_lines(words: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Group OCR words into lines keyed by (block_num, par_num, line_num), ordered
    left-to-right, and record each word's character range inside the joined line text.

    Returns a list of {line_index, text, spans}; each span holds
    {text, bbox, start_char, end_char, conf}.
    """
    by_line = defaultdict(list)
    for word in words:
        by_line[word["block_num"], word["par_num"], word["line_num"]].append(word)

    result = []
    for position, line_key in enumerate(sorted(by_line)):
        left_to_right = sorted(by_line[line_key], key=lambda item: item["left"])
        tokens = [str(item["text"]) for item in left_to_right]

        spans = []
        next_start = 0
        for item, token in zip(left_to_right, tokens):
            stop = next_start + len(token)
            x0, y0 = item["left"], item["top"]
            spans.append({
                "text": token,
                "bbox": (x0, y0, x0 + item["width"], y0 + item["height"]),
                "start_char": next_start,
                "end_char": stop,
                "conf": item["conf"],
            })
            next_start = stop + 1  # skip the single space that joins the words

        result.append({
            "line_index": position,
            "text": " ".join(tokens),
            "spans": spans,
        })
    return result

def align_entity_to_bboxes(entity: Dict[str, Any], line: Dict[str, Any]) -> List[Tuple[int,int,int,int]]:
    """
    Collect the bounding boxes of the OCR words that an entity covers.

    A word is included when its [start_char, end_char) range overlaps the entity's
    [start, end) range; ranges that only touch do not count as overlapping.
    """
    first_char = entity["start"]
    past_last_char = entity["end"]

    def overlaps(span):
        return not (span["end_char"] <= first_char or span["start_char"] >= past_last_char)

    return [span["bbox"] for span in filter(overlaps, line["spans"])]

def union_bboxes(bboxes: List[Tuple[int,int,int,int]]) -> Tuple[int,int,int,int]:
    """
    Merge boxes into the single (left, top, right, bottom) box that encloses them all.

    Returns None when `bboxes` is empty or None (despite the declared return type).
    """
    if not bboxes:
        return None
    left, top, right, bottom = bboxes[0][0], bboxes[0][1], bboxes[0][2], bboxes[0][3]
    for box in bboxes[1:]:
        left = min(left, box[0])
        top = min(top, box[1])
        right = max(right, box[2])
        bottom = max(bottom, box[3])
    return (left, top, right, bottom)

def redact_boxes(img: Image.Image, boxes: List[Tuple[int,int,int,int]], margin: int = 2) -> Image.Image:
    """
    Black out each box on `img`, padded by `margin` pixels on every side.

    The image is modified in place and returned; an empty or None `boxes` draws nothing.
    """
    painter = ImageDraw.Draw(img)
    for left, top, right, bottom in (boxes or ()):
        corners = [left - margin, top - margin, right + margin, bottom + margin]
        painter.rectangle(corners, fill="black")
    return img

def compute_sensitivity_score(events: List[RedactionEvent]) -> float:
    """
    Return the label-weighted mean of the event scores, clamped to [0, 1].

    Labels missing from the weight table use a weight of 0.7; no events gives 0.0.
    """
    if not events:
        return 0.0

    # Weight tiers, highest risk first.
    tiers = (
        (1.0, ("PER", "PERSON", "CREDIT_CARD", "ACCOUNT_NUMBER")),
        (0.9, ("EMAIL", "EMAIL_ADDRESS", "PHONE_NUMBER")),
        (0.8, ("ORG", "ORGANIZATION", "MONEY")),
        (0.6, ("LOC", "LOCATION", "GPE")),
        (0.5, ("NORP", "MISC", "CARDINAL")),
        (0.4, ("DATE", "TIME")),
    )
    weights = {label: weight for weight, labels in tiers for label in labels}

    numerator = 0.0
    denominator = 0.0
    for event in events:
        event_weight = weights.get(event.label, 0.7)
        numerator += event_weight * event.score
        denominator += event_weight

    weighted_mean = numerator / max(denominator, 1e-6)  # avoid dividing by zero
    return min(1.0, max(0.0, weighted_mean))

def label_sensitivity(score: float) -> str:
    """
    Map a sensitivity score to one of three labels.

    >= 0.85 is HIGH, >= 0.6 is MEDIUM, everything else is LOW (this variant has no
    PUBLIC level).
    """
    cutoffs = ((0.85, "HIGH"), (0.6, "MEDIUM"))
    return next((name for floor, name in cutoffs if score >= floor), "LOW")

# --- Main script logic to process images and generate a summary report ---
def process_image_and_classify(img_path: Path, output_dir: Path, ner_pipe, threshold: float = 0.9):
    """
    OCR one image, detect entities, classify its sensitivity and save a redacted copy.

    Args:
        img_path: image file to process.
        output_dir: directory that receives the redacted copy (same file name).
        ner_pipe: callable NER pipeline applied to each OCR line.
        threshold: minimum NER score for an entity to be kept.

    Returns:
        (classification, entity_count): the sensitivity label and the number of
        entities at or above the threshold.

    A redacted copy is written only when the label is MEDIUM or HIGH; for LOW nothing
    is saved, although the returned count still includes any detected entities.

    NOTE(review): the score is a weighted mean of the kept entity scores, each of which
    is >= threshold, so with the default 0.9 an image with any kept entity should
    classify as HIGH and one without entities as LOW - confirm this is intended.
    """
    # Decode inside a context manager so the file handle is closed right away
    # instead of lingering until the lazily-loaded image is garbage collected.
    with Image.open(img_path) as source:
        img = source.convert("RGB")

    words = ocr_image_words(img)
    lines = words_to_lines(words)
    redaction_events = []
    draw_boxes = []

    for line in lines:
        text = line["text"]
        if not text.strip():
            continue
        entities = ner_pipe(text)
        for ent in entities:
            score = float(ent.get("score", 0.0))
            if score < threshold:
                continue
            bboxes = align_entity_to_bboxes(ent, line)
            if not bboxes:
                continue  # entity does not overlap any OCR word
            union_bbox = union_bboxes(bboxes)
            draw_boxes.append(union_bbox)
            redaction_events.append(RedactionEvent(
                page_index=0,
                text=text[ent["start"]:ent["end"]],
                label=str(ent.get("entity_group", ent.get("entity", "ENTITY"))),
                score=score,
                bbox=union_bbox,
                line_index=line["line_index"]
            ))

    sens_score = compute_sensitivity_score(redaction_events)
    classification = label_sensitivity(sens_score)

    # Redact and save the image if not LOW
    if classification in ["MEDIUM", "HIGH"]:
        redacted_img = img.copy()
        redact_boxes(redacted_img, draw_boxes)
        output_path = output_dir / img_path.name
        redacted_img.save(output_path)

    return classification, len(redaction_events)

# --- Batch Processing and Final Summary Report ---
def run_redaction_and_summary(input_folder, output_folder, threshold=0.9):
    """
    Classify and redact every supported image in `input_folder`, then print a report.

    The NER pipeline is loaded once and reused for all images. A failure on one image
    is printed and skipped so the rest of the batch still runs; failed images are not
    counted as processed.
    """
    output_folder.mkdir(parents=True, exist_ok=True)
    ner_pipe = load_ner_pipeline()
    image_extensions = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp"}

    # Batch statistics
    classification_counts = dict.fromkeys(("LOW", "MEDIUM", "HIGH"), 0)
    total_images_processed = 0
    total_redacted_entities = 0

    print("🚀 Starting image redaction and classification...")

    for img_file in input_folder.iterdir():
        if img_file.suffix.lower() not in image_extensions:
            continue  # not a supported image type
        try:
            classification, entity_count = process_image_and_classify(
                img_file, output_folder, ner_pipe, threshold=threshold
            )
            total_images_processed += 1
            total_redacted_entities += entity_count
            classification_counts[classification] += 1
        except Exception as e:
            # Best effort: report the problem and move on to the next image.
            print(f"Error processing {img_file.name}: {e}")

    # --- Final Summary Report ---
    low_count = classification_counts['LOW']
    medium_count = classification_counts['MEDIUM']
    high_count = classification_counts['HIGH']
    print("\n📊 FINAL CLASSIFICATION SUMMARY REPORT")
    print(f"Total Images Processed      : {total_images_processed}")
    print(f"Images Classified as LOW    : {low_count}")
    print(f"Images Classified as MEDIUM : {medium_count}")
    print(f"Images Classified as HIGH   : {high_count}")
    print(f"Total Entities Redacted     : {total_redacted_entities}")
    if total_images_processed > 0:
        average = total_redacted_entities / total_images_processed
        print(f"Avg Entities per Image      : {average:.2f}")
# --- Execution block: runs the whole batch as soon as this cell executes ---
# Reads images from budget/budget; redacted copies of MEDIUM/HIGH images are written
# to budget/classification. The default threshold of run_redaction_and_summary (0.9) is used.
input_folder = Path("budget/budget")
output_folder = Path("budget/classification")
run_redaction_and_summary(input_folder, output_folder)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


🚀 Starting image redaction and classification...


KeyboardInterrupt: 