### 1. Importy + ścieżki

In [45]:
### 7. Importy i konfiguracja ścieżek

from pathlib import Path
import pandas as pd
import re
from IPython.display import display
from ipywidgets import widgets, VBox, HBox, Output
from PIL import Image

# Anchor every path at the notebook's working directory and make sure the
# CSV output directory exists before any later cell reads from or writes to it.
PROJECT_ROOT = Path(".").resolve()
OUTPUT_CSV_DIR = PROJECT_ROOT.joinpath("outputs", "csv")

OUTPUT_CSV_DIR.mkdir(exist_ok=True, parents=True)

# Helper for splitting label strings such as "person; tie"
def clean_split(value: str):
    """Split a delimited label string into a list of non-empty, trimmed parts.

    Parentheses are treated as separators, together with ``;`` and ``,``.
    Any non-string input (e.g. ``None`` or NaN) yields an empty list.
    """
    if isinstance(value, str):
        normalized = value.replace("(", ";").replace(")", ";")
        trimmed = (chunk.strip() for chunk in re.split(r"[;,]", normalized))
        return [chunk for chunk in trimmed if chunk]
    return []

### 2. CLIP → df_clip

In [46]:
### 7.0 CLIP – wczytanie wyników z CSV

clip_csv = OUTPUT_CSV_DIR / "clip_scene_subjects.csv"

if clip_csv.exists():
    df_raw = pd.read_csv(clip_csv)
    print("[07] CLIP kolumny:", list(df_raw.columns))

    # Project the CLIP result columns onto the shared subject schema
    # (image_id, subject_label_pl, subject_label_en, source).
    df_clip = (
        df_raw[["file_path", "scene_label", "scene_prompt_en"]]
        .rename(columns={
            "file_path": "image_id",
            "scene_label": "subject_label_pl",
            "scene_prompt_en": "subject_label_en",
        })
        .assign(source="clip")
    )

    print("[07] df_clip shape:", df_clip.shape)
else:
    # No CLIP results yet: keep an empty frame with the shared schema.
    print("[07] Brak pliku:", clip_csv)
    df_clip = pd.DataFrame(
        columns=["image_id", "subject_label_pl", "subject_label_en", "source"]
    )

df_clip.head()

[07] CLIP kolumny: ['file_path', 'scene_label', 'scene_score', 'scene_prompt_en', 'timestamp']
[07] df_clip shape: (74, 4)


Unnamed: 0,image_id,subject_label_pl,subject_label_en,source
0,/Users/olga/MetaLogic/inputs/0004.jpg,wnętrze sklepu,shop interior,clip
1,/Users/olga/MetaLogic/inputs/0006.jpg,tłum na ulicy,crowd of people on the street,clip
2,/Users/olga/MetaLogic/inputs/0009.jpg,napisy lub hasła na murach,graffiti or political slogans on walls,clip
3,/Users/olga/MetaLogic/inputs/0022.jpg,demonstracja uliczna,street protest,clip
4,/Users/olga/MetaLogic/inputs/0034.jpg,wózki dziecięce lub place zabaw,strollers or playgrounds,clip


### 3. YOLO → df_yolo

In [47]:
### 7.1 YOLO – wczytanie detekcji jako subjecty

yolo_csv = OUTPUT_CSV_DIR / "yolo_subject_suggestions.csv"

# EN -> PL dictionary for YOLO class names.
# Classes missing from this mapping keep their English name as the Polish label
# (load_yolo_subjects uses CLASS_PL.get(en, en)).
CLASS_PL = {
    "person": "osoba",
    "car": "samochód",
    "bicycle": "rower",
    "motorcycle": "motocykl",
    "bus": "autobus",
    "truck": "ciężarówka",
    "airplane": "samolot",
    "train": "pociąg",
    "bench": "ławka",
    "bottle": "butelka",
    "cup": "kubek",
    "bowl": "miska",
    "chair": "krzesło",
}

def load_yolo_subjects(csv_path):
    """Load YOLO detections from *csv_path* as one subject row per detected label.

    The CSV must contain a ``file_path`` column and either ``subject_en`` or
    ``labels`` (checked in that order). Each label cell is split with
    ``clean_split`` and every resulting English label becomes one output row;
    the Polish label comes from ``CLASS_PL`` and falls back to the English one.

    Args:
        csv_path: ``pathlib.Path`` of the YOLO suggestions CSV.

    Returns:
        DataFrame with columns ``image_id``, ``subject_label_pl``,
        ``subject_label_en``, ``source`` (always ``"yolo"``). The frame is
        empty, but keeps these columns, when the file is missing or no label
        was found.

    Raises:
        ValueError: if ``file_path`` or both label columns are missing.
    """
    columns = ["image_id", "subject_label_pl", "subject_label_en", "source"]

    if not csv_path.exists():
        print("[07] YOLO: brak pliku CSV")
        return pd.DataFrame(columns=columns)

    df = pd.read_csv(csv_path)
    print("[07] YOLO kolumny:", list(df.columns))

    # image_id
    if "file_path" not in df.columns:
        raise ValueError("YOLO CSV nie ma kolumny file_path")

    # YOLO labels
    if "subject_en" in df.columns:
        label_column = "subject_en"
    elif "labels" in df.columns:
        label_column = "labels"
    else:
        raise ValueError("YOLO CSV musi mieć kolumnę subject_en lub labels")

    rows = []
    for iid, en_str in zip(df["file_path"], df[label_column]):
        # Empty CSV cells are read as NaN; str(NaN) would otherwise produce
        # a bogus subject called "nan".
        if pd.isna(en_str):
            continue
        for en in clean_split(str(en_str)):
            rows.append({
                "image_id": iid,
                "subject_label_pl": CLASS_PL.get(en, en),  # fallback: English name
                "subject_label_en": en,
                "source": "yolo",
            })

    # Passing the columns explicitly keeps the schema even when no row was built.
    return pd.DataFrame(rows, columns=columns)

# Load the YOLO detections into the shared subject schema and preview the first rows.
df_yolo = load_yolo_subjects(yolo_csv)
df_yolo.head()

[07] YOLO kolumny: ['file_path', 'subject_en', 'subject_pl']


Unnamed: 0,image_id,subject_label_pl,subject_label_en,source
0,/Users/olga/MetaLogic/inputs/0004.jpg,butelka,bottle,yolo
1,/Users/olga/MetaLogic/inputs/0006.jpg,osoba,person,yolo
2,/Users/olga/MetaLogic/inputs/0006.jpg,tie,tie,yolo
3,/Users/olga/MetaLogic/inputs/0009.jpg,osoba,person,yolo
4,/Users/olga/MetaLogic/inputs/0022.jpg,osoba,person,yolo


### 4. OCR → df_ocr (wersja prosta)

In [48]:
### 7.2 OCR – pozyskanie słów jako subjectów (wersja prosta)

ocr_csv = OUTPUT_CSV_DIR / "ocr_results.csv"

# Use the OCR results when present; otherwise an empty frame with the two
# columns the extraction loop reads.
df_ocr_raw = (
    pd.read_csv(ocr_csv)
    if ocr_csv.exists()
    else pd.DataFrame(columns=["file_path", "full_text"])
)

def extract_words(text):
    """Return all runs of at least three Latin/Polish letters found in *text*.

    Non-string input (e.g. NaN from an empty CSV cell) yields an empty list.
    """
    if isinstance(text, str):
        # words of >= 3 letters
        return [
            match.group(0)
            for match in re.finditer(r"[A-Za-zĄąĆćĘęŁłŃńÓóŚśŻżŹź]{3,}", text)
        ]
    return []

# Turn every OCR word into a candidate subject row. OCR text is not translated,
# so the same lower-cased word is used for both the PL and the EN label.
rows = []
for _, row in df_ocr_raw.iterrows():
    iid = row["file_path"]
    text = row.get("full_text", "")
    for w in extract_words(text):
        word = w.lower()
        rows.append({
            "image_id": iid,
            "subject_label_pl": word,
            "subject_label_en": word,
            "source": "ocr",
        })

# Explicit columns keep the shared schema even when OCR produced no words,
# matching the empty CLIP/YOLO fallbacks.
df_ocr = pd.DataFrame(
    rows,
    columns=["image_id", "subject_label_pl", "subject_label_en", "source"],
).drop_duplicates()
df_ocr.head()

Unnamed: 0,image_id,subject_label_pl,subject_label_en,source
0,inputs/0004.jpg,arcey,arcey,ocr
1,inputs/0004.jpg,cwikławikła,cwikławikła,ocr
2,inputs/0004.jpg,pard,pard,ocr
3,inputs/0004.jpg,cwikł,cwikł,ocr
4,inputs/0004.jpg,cwik,cwik,ocr


### 5. Scalanie → df_subjects_all

In [49]:
### 7.3 Scalanie subjectów CLIP + YOLO + OCR

SUBJECT_COLUMNS = ["image_id", "subject_label_pl", "subject_label_en", "source"]


def _absolute_image_id(image_id):
    """Return *image_id* as an absolute path string anchored at PROJECT_ROOT.

    Absolute ids and non-string values (e.g. NaN) are returned unchanged.
    """
    if not isinstance(image_id, str):
        return image_id
    path = Path(image_id)
    return image_id if path.is_absolute() else str(PROJECT_ROOT / path)


# Collect whichever source frames were created by the earlier cells.
frames = [
    globals()[name]
    for name in ("df_clip", "df_yolo", "df_ocr")
    if name in globals()
]

if frames:
    df_subjects_all = pd.concat(frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; fall back to an empty frame instead.
    df_subjects_all = pd.DataFrame(columns=SUBJECT_COLUMNS)

# OCR stores ids relative to the project root ("inputs/0004.jpg") while CLIP and
# YOLO store absolute paths. Without normalisation the same picture shows up as
# two different images and its OCR candidates are separated from the others.
# load_thumbnail resolves relative ids against PROJECT_ROOT in the same way.
if "image_id" in df_subjects_all.columns:
    df_subjects_all["image_id"] = df_subjects_all["image_id"].map(_absolute_image_id)

df_subjects_all = df_subjects_all.drop_duplicates()

print("[07] df_subjects_all:", df_subjects_all.shape)
df_subjects_all.head(20)

[07] df_subjects_all: (695, 4)


Unnamed: 0,image_id,subject_label_pl,subject_label_en,source
0,/Users/olga/MetaLogic/inputs/0004.jpg,wnętrze sklepu,shop interior,clip
1,/Users/olga/MetaLogic/inputs/0006.jpg,tłum na ulicy,crowd of people on the street,clip
2,/Users/olga/MetaLogic/inputs/0009.jpg,napisy lub hasła na murach,graffiti or political slogans on walls,clip
3,/Users/olga/MetaLogic/inputs/0022.jpg,demonstracja uliczna,street protest,clip
4,/Users/olga/MetaLogic/inputs/0034.jpg,wózki dziecięce lub place zabaw,strollers or playgrounds,clip
5,/Users/olga/MetaLogic/inputs/0043.jpg,autobus lub tramwaj,bus or tram exterior,clip
6,/Users/olga/MetaLogic/inputs/0044.jpg,szyldy sklepowe lub reklamy uliczne,shop signs or street advertisements,clip
7,/Users/olga/MetaLogic/inputs/0074.jpg,rowery lub motocykle,bicycles or motorcycles,clip
8,/Users/olga/MetaLogic/inputs/0075.jpg,demonstracja uliczna,street protest,clip
9,/Users/olga/MetaLogic/inputs/0077.jpg,tłum na ulicy,crowd of people on the street,clip


### 6. Interfejs (miniatura + tabela + checkboxy + zapis)

In [50]:
### 7.4 Interaktywny edytor subject (CLIP + YOLO + OCR)

# where the reviewer's choices are persisted
SAVE_PATH = OUTPUT_CSV_DIR / "subject_human_selected.csv"
SAVE_PATH.parent.mkdir(exist_ok=True, parents=True)

# initial data: earlier selections if they were saved, otherwise an empty table
df_sel = (
    pd.read_csv(SAVE_PATH)
    if SAVE_PATH.exists()
    else pd.DataFrame(
        columns=["image_id", "subject_label_pl", "subject_label_en", "accepted"]
    )
)

# mutable holder so the widget callbacks can swap in the updated selection table
state = {"df_sel": df_sel}

# list of images that have at least one candidate subject
image_ids = sorted(df_subjects_all["image_id"].unique())

# thumbnail previews (taken from YOLO output when available)
YOLO_PREVIEW_DIR = PROJECT_ROOT.joinpath("outputs", "previews", "yolo")

def load_thumbnail(image_id: str):
    """Return a preview image (at most 900x900, RGB) for *image_id*, or None.

    Relative ids are resolved against ``PROJECT_ROOT``. The YOLO preview
    ``thumb_<stem>.jpg`` in ``YOLO_PREVIEW_DIR`` is tried first, then the
    original image. A candidate that exists but cannot be decoded is reported
    and skipped, so a broken preview never prevents the editor from rendering.
    """
    img_path = Path(image_id)
    if not img_path.is_absolute():
        img_path = PROJECT_ROOT / img_path

    thumb = YOLO_PREVIEW_DIR / f"thumb_{img_path.stem}.jpg"
    for candidate in (thumb, img_path):
        if not candidate.exists():
            continue
        try:
            # The context manager closes the underlying file handle; convert()
            # returns an independent in-memory copy that stays valid afterwards.
            with Image.open(candidate) as source_img:
                img = source_img.convert("RGB")
        except Exception as exc:
            # Best-effort preview: decoders can raise several unrelated exception
            # types, but KeyboardInterrupt/SystemExit must still propagate.
            print(f"[07] Nie udało się wczytać podglądu {candidate}: {exc}")
            continue
        img.thumbnail((900, 900))
        return img
    return None

# UI widgets: the image selector and the output area that render_image_subjects draws into
dropdown = widgets.Dropdown(options=image_ids, description="Obraz:")
out = Output()

def render_image_subjects(image_id):
    """Render the review panel for one image into the shared ``out`` widget.

    The panel shows the thumbnail next to one checkbox per candidate subject
    from ``df_subjects_all``. Checkboxes are pre-ticked for subjects already
    accepted in ``state["df_sel"]``. The "Zapisz" button replaces all stored
    rows for this image with the current checkbox values, writes the table to
    ``SAVE_PATH`` and updates ``state["df_sel"]``.
    """
    out.clear_output()
    with out:
        rows = df_subjects_all[df_subjects_all["image_id"] == image_id]
        if rows.empty:
            print("Brak kandydatów dla:", image_id)
            return

        # thumbnail
        img = load_thumbnail(image_id)
        img_out = Output()
        with img_out:
            if img is not None:
                display(img)
            else:
                print("(brak podglądu)")

        # Rows previously accepted for this image; filtered once instead of
        # once per candidate. `== True` is an element-wise pandas comparison.
        df_prev = state["df_sel"]
        accepted_prev = df_prev[
            (df_prev["image_id"] == image_id) & (df_prev["accepted"] == True)
        ]

        # list of checkboxes
        checkbox_widgets = []
        for _, row in rows.iterrows():
            pl = str(row["subject_label_pl"])
            en = str(row["subject_label_en"])
            src = str(row["source"])

            accepted_before = (
                (accepted_prev["subject_label_pl"] == pl)
                & (accepted_prev["subject_label_en"] == en)
            ).any()

            cb = widgets.Checkbox(
                value=bool(accepted_before),
                description=f"{pl} ({en}) [{src}]",
                indent=False,
            )
            checkbox_widgets.append((row, cb))

        save_btn = widgets.Button(description="Zapisz")

        def save_clicked(_):
            df_old = state["df_sel"]

            # drop the stale entries for this image
            df_kept = df_old[df_old["image_id"] != image_id]

            # build all new entries at once instead of one pd.concat per row
            df_fresh = pd.DataFrame(
                [
                    {
                        "image_id": image_id,
                        "subject_label_pl": row["subject_label_pl"],
                        "subject_label_en": row["subject_label_en"],
                        "accepted": cb.value,
                    }
                    for row, cb in checkbox_widgets
                ],
                columns=["image_id", "subject_label_pl", "subject_label_en", "accepted"],
            )
            df_new = pd.concat([df_kept, df_fresh], ignore_index=True)

            df_new.to_csv(SAVE_PATH, index=False)
            state["df_sel"] = df_new
            print("Zapisano:", SAVE_PATH)

        save_btn.on_click(save_clicked)

        controls = VBox([w for (_, w) in checkbox_widgets] + [save_btn])
        display(HBox([img_out, controls]))

def on_select(change):
    """Dropdown observer: re-render the panel for the newly selected image."""
    selected_image = change["new"]
    render_image_subjects(selected_image)


dropdown.observe(on_select, names="value")

display(dropdown, out)

# Show the first image right away so the panel is not empty on load.
if image_ids:
    render_image_subjects(image_ids[0])

Dropdown(description='Obraz:', options=('/Users/olga/MetaLogic/inputs/0004.jpg', '/Users/olga/MetaLogic/inputs…

Output()

### 7. Eksport Dublin Core

In [51]:
### 7.5 Eksport Dublin Core – zbiór zaakceptowanych subjectów

selected_csv = SAVE_PATH

if not selected_csv.exists():
    print("Brak pliku z zaakceptowanymi etykietami:", selected_csv)
else:
    df_sel = pd.read_csv(selected_csv)
    # `== True` is an element-wise pandas comparison, not a Python identity test.
    df_accepted = df_sel[df_sel["accepted"] == True]

    # Dublin Core view: image path as identifier, Polish label as subject.
    # The saved table has no `source` column, so the same label proposed by two
    # sources appears twice there; export each identifier/subject pair once.
    dc = (
        df_accepted[["image_id", "subject_label_pl"]]
        .rename(columns={"image_id": "identifier", "subject_label_pl": "subject"})
        .drop_duplicates()
    )

    out_dc = OUTPUT_CSV_DIR / "dublin_core_export.csv"
    dc.to_csv(out_dc, index=False)
    print("Zapisano:", out_dc)

    # A bare `dc.head()` inside a branch is not the cell's last top-level
    # expression, so the notebook never rendered it; display it explicitly.
    display(dc.head())

Zapisano: /Users/olga/MetaLogic/outputs/csv/dublin_core_export.csv


### 8. KOMÓRKA — Diagnostyka 7.A–7.C (opcjonalnie)

In [52]:
### 7.A Diagnostyka: katalogi źródłowe

def parent_folder(p):
    """Return the name of the directory that directly contains path *p*."""
    containing_dir = Path(p).parent
    return containing_dir.name

def get_path_series(df):
    """Return the first path-like column of *df*, or None when there is none.

    Columns are tried in the order ``image_id``, ``file_path``, ``src_path``.
    """
    candidates = ("image_id", "file_path", "src_path")
    found = next((name for name in candidates if name in df.columns), None)
    if found is None:
        return None
    return df[found]

# List the distinct parent folders referenced by each source (skipped when empty).
if not df_clip.empty:
    clip_par = sorted(set(map(parent_folder, get_path_series(df_clip))))
    print("CLIP folders:", clip_par)

if not df_yolo.empty:
    yolo_par = sorted(set(map(parent_folder, get_path_series(df_yolo))))
    print("YOLO folders:", yolo_par)

CLIP folders: ['inputs']
YOLO folders: ['inputs']


In [53]:
### 7.B Diagnostyka: zgodność list plików

def norm_name(p):
    """Return only the file name (final path component) of *p*."""
    file_name = Path(p).name
    return file_name

# File-name sets per source; an empty frame contributes an empty set.
clip_names = set() if df_clip.empty else {norm_name(p) for p in get_path_series(df_clip)}
yolo_names = set() if df_yolo.empty else {norm_name(p) for p in get_path_series(df_yolo)}
ocr_names = set() if df_ocr.empty else {norm_name(p) for p in get_path_series(df_ocr)}

print("Wspólne CLIP & YOLO:", len(clip_names.intersection(yolo_names)))
print("Tylko CLIP:", len(clip_names.difference(yolo_names)))
print("Tylko YOLO:", len(yolo_names.difference(clip_names)))
print("Tylko OCR:",  len(ocr_names.difference(clip_names, yolo_names)))

Wspólne CLIP & YOLO: 74
Tylko CLIP: 0
Tylko YOLO: 0
Tylko OCR: 0


In [54]:
### 7.C Podgląd brakujących / dodatkowych plików

# Sort before slicing: set iteration order is arbitrary, so an unsorted slice
# would show a different sample of file names on every kernel run.
print("Przykłady tylko w CLIP:", sorted(clip_names - yolo_names)[:10])
print("Przykłady tylko w YOLO:", sorted(yolo_names - clip_names)[:10])
print("Przykłady tylko w OCR:",  sorted(ocr_names - clip_names - yolo_names)[:10])

Przykłady tylko w CLIP: []
Przykłady tylko w YOLO: []
Przykłady tylko w OCR: []
