In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from IPython.display import display
import ipywidgets as widgets
import os

# Review input (raw OCR lines) and destination of the reviewed copy.
INPUT_CSV = "../outputs/ocr_lines.csv"
OUTPUT_CSV = "../outputs/ocr_lines_reviewed.csv"

df = pd.read_csv(INPUT_CSV)

# Decision columns: create them when absent, and fill gaps when present.
for col, default in {
    "keep": True,
    "excluded": False,
    "note": ""
}.items():
    if col not in df.columns:
        df[col] = default
    else:
        # read_csv turns blank cells into NaN. Left as-is, bool(NaN) is True
        # (a blank "excluded" would show up as checked) and a NaN note is not
        # a valid value for a text widget, so replace NaN with the default.
        df[col] = df[col].map(lambda v, d=default: d if pd.isna(v) else v)

df.head()

Unnamed: 0,text,file_name,file_id,gcs_path,page,line_id,bbox_norm,script,source,keep,excluded,note
0,PROLETARIUSZE WSZYSTKICH KRAJOW ŁACZCIE SIE,test.jpg,03e15d229e2352f17c7c2c7c5770e7a9ab0d3c98d035f5...,gs://ocr-2026/photos/test.jpg,1,0,"359.0,173.0,865.0,217.0",latin,gcv_ocr_line,True,False,


### KOMÓRKA 2 — funkcje pomocnicze bbox + rysowanie

In [2]:
import matplotlib.pyplot as plt
from PIL import Image

def parse_bbox(bbox_str):
    """
    Split a comma-separated 'x1,y1,x2,y2' string into four floats.

    Returns the tuple (x1, y1, x2, y2). The values are only converted, never
    scaled or range-checked.
    NOTE(review): the source column is named bbox_norm, but nothing here
    verifies that the values are normalized to [0, 1] - confirm with the
    producer of the CSV.

    Raises ValueError when the string does not hold exactly four numeric fields.
    """
    left, top, right, bottom = (float(field) for field in bbox_str.split(","))
    return left, top, right, bottom


def draw_bboxes(image, rows):
    """
    Draw the OCR line bounding boxes from ``rows`` on top of ``image``.

    Parameters
    ----------
    image : object with a ``.size`` of (width, height) that ``imshow`` accepts
        (a PIL image in this notebook).
    rows : DataFrame with a ``bbox_norm`` column of 'x1,y1,x2,y2' strings.
        Rows whose bbox is missing are skipped.

    A box whose four coordinates all lie in [0, 1] is treated as normalized
    and scaled by the image size; any other box is drawn as pixel coordinates.
    The sample data shown in this notebook contains pixel-like values
    (e.g. '359.0,173.0,865.0,217.0'), which unconditional scaling would push
    far outside the image.
    """
    fig, ax = plt.subplots(figsize=(12, 7))
    ax.imshow(image)
    ax.axis("off")

    w, h = image.size

    for bbox_str in rows["bbox_norm"].dropna():
        x1, y1, x2, y2 = parse_bbox(bbox_str)

        # Heuristic: only a normalized box can have every coordinate in [0, 1].
        if min(x1, y1, x2, y2) >= 0.0 and max(x1, y1, x2, y2) <= 1.0:
            x1, x2 = x1 * w, x2 * w
            y1, y2 = y1 * h, y2 * h

        rect = plt.Rectangle(
            (x1, y1),
            x2 - x1,
            y2 - y1,
            linewidth=2,
            edgecolor="red",
            facecolor="none"
        )
        ax.add_patch(rect)

    plt.show()

### KOMÓRKA 3 — wybór obrazu (batch-aware)

In [3]:
import ipywidgets as widgets

# Distinct image names in a stable order for the dropdown. Missing names are
# dropped first: NaN cannot be ordered against str, so sorted() would raise
# TypeError, and such rows could not be matched to an image anyway.
files = sorted(df["file_name"].dropna().unique())

file_selector = widgets.Dropdown(
    options=files,
    description="Obraz:"
)

display(file_selector)

Dropdown(description='Obraz:', options=('test.jpg',), value='test.jpg')

### KOMÓRKA 4 — funkcja rysująca obraz + bboxy (ponowna definicja `draw_bboxes`)

In [4]:
def draw_bboxes(image, rows):
    """
    Draw the OCR line bounding boxes from ``rows`` on top of ``image``.

    NOTE(review): this redefines ``draw_bboxes`` from the helper cell and, being
    later in the notebook, is the definition that is actually in effect; the
    two should be consolidated into one.

    Parameters
    ----------
    image : object with a ``.size`` of (width, height) that ``imshow`` accepts
        (a PIL image in this notebook).
    rows : DataFrame with a ``bbox_norm`` column of 'x1,y1,x2,y2' strings.
        Rows whose bbox is missing are skipped.

    A box whose four coordinates all lie in [0, 1] is treated as normalized
    and scaled by the image size; any other box is drawn as pixel coordinates.
    The sample data shown in this notebook contains pixel-like values
    (e.g. '359.0,173.0,865.0,217.0'), which unconditional scaling would push
    far outside the image.
    """
    fig, ax = plt.subplots(figsize=(12, 7))
    ax.imshow(image)
    ax.axis("off")

    w, h = image.size

    for _, r in rows.iterrows():
        if pd.isna(r["bbox_norm"]):
            continue

        # Reuse the shared parser instead of repeating the split/float logic.
        x1, y1, x2, y2 = parse_bbox(r["bbox_norm"])

        # Heuristic: only a normalized box can have every coordinate in [0, 1].
        is_normalized = min(x1, y1, x2, y2) >= 0.0 and max(x1, y1, x2, y2) <= 1.0
        sx, sy = (w, h) if is_normalized else (1, 1)

        rect = plt.Rectangle(
            (x1 * sx, y1 * sy),
            (x2 - x1) * sx,
            (y2 - y1) * sy,
            linewidth=2,
            edgecolor="red",
            facecolor="none"
        )
        ax.add_patch(rect)

    plt.show()

### KOMÓRKA 5 — tabela decyzyjna (keep / excluded = watermark / note)

In [5]:
# One (row index, keep checkbox, excluded checkbox, note field) tuple per OCR
# line currently shown in the table; the save cell reads the widgets back.
decision_rows = []


def _flag_value(value, default):
    """Return ``value`` as a bool, or ``default`` when the cell is missing (NaN)."""
    return default if pd.isna(value) else bool(value)


def _write_back_on_change(widget, row_idx, column):
    """Copy every change of ``widget.value`` into ``df.loc[row_idx, column]``."""
    def _on_change(change):
        df.loc[row_idx, column] = change["new"]

    widget.observe(_on_change, names="value")


def decision_table(file_name):
    """
    Render one editable row (text, keep, excluded, note) per OCR line of
    ``file_name`` and register the widgets in ``decision_rows``.

    Widget edits are written to ``df`` as they happen. ``decision_rows`` is
    cleared on every call, so without this write-back any unsaved decision
    would be lost as soon as another image is picked in the dropdown.
    """
    import html  # local import: only needed to escape OCR text for HTML output

    subset = df[df["file_name"] == file_name]
    decision_rows.clear()

    header = widgets.HBox([
        widgets.HTML("<b>text</b>"),
        widgets.HTML("<b>keep</b>"),
        widgets.HTML("<b>excluded<br>(watermark)</b>"),
        widgets.HTML("<b>note</b>")
    ])
    display(header)

    for idx, r in subset.iterrows():
        # Blank CSV cells arrive as NaN: bool(NaN) is True and a NaN note is
        # rejected by the Text widget, so fall back to the column defaults.
        keep = widgets.Checkbox(value=_flag_value(r.get("keep", True), True))
        excluded = widgets.Checkbox(value=_flag_value(r.get("excluded", False), False))
        raw_note = r.get("note", "")
        note = widgets.Text(
            value="" if pd.isna(raw_note) else str(raw_note),
            placeholder="uwaga / interpretacja"
        )

        for widget, column in ((keep, "keep"), (excluded, "excluded"), (note, "note")):
            _write_back_on_change(widget, idx, column)

        decision_rows.append((idx, keep, excluded, note))

        # OCR text is arbitrary; escape it so '<', '>' and '&' are shown
        # literally instead of being interpreted as markup.
        safe_text = html.escape(str(r["text"]))
        display(widgets.HBox([
            widgets.HTML(f"<div style='width:520px'>{safe_text}</div>"),
            keep,
            excluded,
            note
        ]))

widgets.interact(decision_table, file_name=file_selector);

interactive(children=(Dropdown(description='Obraz:', options=('test.jpg',), value='test.jpg'), Output()), _dom…

### KOMÓRKA 5.5 — wzorce znaków wodnych (GLOBALNE; zastosuj przed zapisem w KOMÓRCE 6)

In [7]:
watermark_input = widgets.Text(
    value="",
    placeholder="np. © Brama Grodzka",
    description="Watermark:"
)

apply_btn = widgets.Button(description="Oznacz jako watermark")

# Messages printed from a button callback are not reliably shown under the
# cell, so they are routed into a dedicated Output widget.
watermark_log = widgets.Output()

def apply_watermark(_):
    """
    Mark every row of ``df`` whose text contains the entered pattern as an
    excluded watermark (``excluded=True``, ``note="watermark (auto)"``).

    The pattern is matched as a literal, case-insensitive substring across all
    files. Rows currently shown in the decision table get their widgets updated
    too; otherwise the save cell would copy the stale widget values back over
    the marks set here.
    """
    pattern = watermark_input.value.strip()

    watermark_log.clear_output()
    with watermark_log:
        if not pattern:
            print("Brak wzorca.")
            return

        # regex=False: typical watermark strings contain regex metacharacters
        # ("(c)", ".", "+"); as a regex "(c)" would match every "c" and an
        # unbalanced "(" would raise.
        mask = df["text"].astype(str).str.contains(
            pattern, case=False, na=False, regex=False
        )
        df.loc[mask, "excluded"] = True
        df.loc[mask, "note"] = "watermark (auto)"

        # Keep the on-screen widgets in step with df for the displayed rows.
        for idx, _keep_w, excl_w, note_w in decision_rows:
            if mask.get(idx, False):
                excl_w.value = True
                note_w.value = "watermark (auto)"

        print(f"Oznaczono {mask.sum()} wierszy jako watermark.")

apply_btn.on_click(apply_watermark)

display(widgets.VBox([watermark_input, apply_btn, watermark_log]))

VBox(children=(Text(value='', description='Watermark:', placeholder='np. © Brama Grodzka'), Button(description…

### KOMÓRKA 6 — zapis decyzji do CSV

In [6]:
# Copy the current widget state of the displayed rows into df before saving.
for idx, keep_w, excl_w, note_w in decision_rows:
    df.loc[idx, "keep"] = keep_w.value
    df.loc[idx, "excluded"] = excl_w.value
    df.loc[idx, "note"] = note_w.value

# OUTPUT_CSV comes from the configuration at the top of the notebook; it is
# deliberately not redefined here so the path has a single source of truth.
out_dir = os.path.dirname(OUTPUT_CSV)
if out_dir:
    # Create the target directory if it is missing instead of failing in to_csv.
    os.makedirs(out_dir, exist_ok=True)

df.to_csv(OUTPUT_CSV, index=False, encoding="utf-8")

print(f"[DONE] Zapisano: {OUTPUT_CSV}")

[DONE] Zapisano: ../outputs/ocr_lines_reviewed.csv
