In [1]:
# INIT ‚Äî CLIP zero-shot + predict
# Uruchom tƒô kom√≥rkƒô po restarcie kernela. Potem od razu UI.

import torch, open_clip
from PIL import Image

# 1) UrzƒÖdzenie
DEVICE = (
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)

# 2) Model FP16
MODEL_NAME = "ViT-B-32"
PRETRAINED = "openai"  # lub "laion2b_s34b_b79k"
model, _, preprocess = open_clip.create_model_and_transforms(MODEL_NAME, pretrained=PRETRAINED)
model = model.eval().to(DEVICE)
model.to(dtype=torch.float16)  # sp√≥jno≈õƒá FP16

# 3) Klasy i prompty (najpro≈õciej: tu lokalnie)
LABELS = ["przed 1945", "PRL 1945‚Äì1989", "po 1990"]
TEXT_PROMPTS = [
    "photograph in Poland before 1945; interwar clothing; prewar architecture; cobblestones; horse carriages; sepia or black-and-white",
    "photo from the Polish People‚Äôs Republic 1945‚Äì1989; prefab blocks; neon signs; RUCH kiosk; Fiat 126p, Polonez, ≈ªuk; 70s‚Äì80s clothing; socialist posters",
    "photograph in Poland after 1990; modern ads; PVC banners; smartphones; cars after 2005; malls; glass offices; renovated tenements",
]

# 4) Teksty ‚Üí wektory
tokenizer = open_clip.get_tokenizer(MODEL_NAME)
with torch.no_grad():
    text_tokens = tokenizer(TEXT_PROMPTS).to(DEVICE)
    text_features = model.encode_text(text_tokens)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)

# 5) Predykcja dla obrazu PIL
def predict(pil_img: Image.Image):
    pil = pil_img.convert("RGB")
    image = preprocess(pil).unsqueeze(0).to(DEVICE)

    # dopasuj dtype wej≈õcia do wag modelu (FP16)
    target_dtype = next(model.parameters()).dtype
    image = image.to(dtype=target_dtype)

    with torch.no_grad():
        image_features = model.encode_image(image)
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        logits = (100.0 * image_features @ text_features.T).squeeze(0)
        probs = logits.softmax(dim=-1)

    scores = {LABELS[i]: float(probs[i].item()) for i in range(len(LABELS))}
    label = LABELS[int(torch.argmax(probs).item())]
    return scores, label

print(f"INIT OK ‚Üí device={DEVICE}, model={MODEL_NAME}/{PRETRAINED}, classes={LABELS}")



INIT OK ‚Üí device=mps, model=ViT-B-32/openai, classes=['przed 1945', 'PRL 1945‚Äì1989', 'po 1990']


### 1Ô∏è‚É£ Importy i inicjalizacja

In [2]:
# 1Ô∏è‚É£ Importy i inicjalizacja
"""
Zero-shot klasyfikacja obraz√≥w (np. PRL / non-PRL) przy u≈ºyciu modelu CLIP.
U≈ºywa FP16 dla sp√≥jno≈õci z trenowaniem i oszczƒôdno≈õci pamiƒôci.
"""

import torch
import open_clip
import pandas as pd
from pathlib import Path
from PIL import Image
from datetime import datetime, timezone
import time, json

# ≈öcie≈ºki
DIR_STAGING = Path("data/staging")
DIR_OUT = Path("outputs")
DIR_OUT.mkdir(parents=True, exist_ok=True)

# Detekcja urzƒÖdzenia
device = (
    "mps"
    if torch.backends.mps.is_available()
    else "cuda"
    if torch.cuda.is_available()
    else "cpu"
)
print("Device:", device)

# Rejestracja runu
run_id = datetime.now(timezone.utc).isoformat(timespec="seconds")
print("Run ID:", run_id)

Device: mps
Run ID: 2025-11-12T13:24:09+00:00


### 2Ô∏è‚É£ ≈Åadowanie modelu CLIP (FP16)

In [3]:
# 2Ô∏è‚É£ ≈Åadowanie modelu CLIP (FP16)
"""
≈Åaduje model CLIP (ViT-B/32) z biblioteki open_clip_torch.
Ustawia FP16 i tryb ewaluacji.
"""

MODEL_NAME = "ViT-B-32"
PRETRAINED = "openai"

model, _, preprocess = open_clip.create_model_and_transforms(
    MODEL_NAME, pretrained=PRETRAINED
)
tokenizer = open_clip.get_tokenizer(MODEL_NAME)

model = model.eval().to(device)
model.to(dtype=torch.float16)

print("Za≈Çadowano model:", MODEL_NAME)



Za≈Çadowano model: ViT-B-32


### 3Ô∏è‚É£ Definicja etykiet i pe≈Çnych prompt√≥w opisowych

In [4]:
# 3Ô∏è‚É£ Definicja etykiet i pe≈Çnych prompt√≥w opisowych
"""
Trzy klasy epok: przed 1945, PRL (1945‚Äì1989), po 1990.
Ka≈ºda ma bogaty prompt tekstowy opisujƒÖcy kontekst wizualny sceny.
"""

LABELS = ["do 1944", "PRL 1945‚Äì1989", "po 1990"]

TEXT_PROMPTS = [
    # A) Do 1944
    "photograph taken in Poland up to 1944; interwar or older clothing; coats, hats, uniforms; cobblestone streets; prewar tenement houses; horse carriages or very old cars; hand-painted shop signs; art deco typography; sepia or black and white style",

    # B) PRL 1945‚Äì1989
    "photo from the Polish People's Republic (1945‚Äì1989), PRL; prefab panel blocks, RUCH kiosk, neon signs, 'Spo≈Çem' or 'Pewex' stores; queues and everyday street scenes; 1970s‚Äì1980s clothing with shaggy hairstyles, thick-rimmed glasses, polyester suits; Fiat 126p, Polonez, ≈ªuk or Nysa vans; community theater or amateur performances; socialist-era typography and posters",

    # C) Po 1990
    "photograph in Poland after 1990; modern ads and global brand logos; PVC banners, colorful shop signs; street trade and open markets; modern cars after 2005; sportswear with visible logos; smartphones, glass office buildings, shopping malls, renovated tenement houses",
]

print(f"Zbudowano {len(LABELS)} klasy: {', '.join(LABELS)}")

with torch.no_grad(), torch.cuda.amp.autocast(dtype=torch.float16):
    text_tokens = tokenizer(TEXT_PROMPTS).to(device)
    text_features = model.encode_text(text_tokens)
    text_features /= text_features.norm(dim=-1, keepdim=True)

Zbudowano 3 klasy: do 1944, PRL 1945‚Äì1989, po 1990


  with torch.no_grad(), torch.cuda.amp.autocast(dtype=torch.float16):


### 4Ô∏è‚É£ Wczytanie obraz√≥w i generowanie embedding√≥w

In [5]:
from contextlib import nullcontext

device = torch.device("mps" if torch.backends.mps.is_available() else
                      "cuda" if torch.cuda.is_available() else "cpu")

# model ju≈º utworzony wy≈ºej: `model, _, preprocess = open_clip.create_model_and_transforms(...)`
model.to(device).eval()

# FP16 na MPS/CUDA, FP32 na CPU
if device.type in ("mps", "cuda"):
    model.half()
    IMG_DTYPE = torch.float16
    amp_ctx = nullcontext()           # NIE u≈ºywamy autocast na MPS
else:
    model.float()
    IMG_DTYPE = torch.float32
    amp_ctx = nullcontext()

# tekst te≈º w tym samym dtype
with torch.no_grad():
    text_tokens = tokenizer(TEXT_PROMPTS).to(device)
    text_features = model.encode_text(text_tokens)
    text_features = text_features / text_features.norm(dim=-1, keepdim=True)
    text_features = text_features.to(IMG_DTYPE)  # sp√≥jno≈õƒá dtype

# --- pƒôtla po plikach ---
records = []
files = sorted([p for p in DIR_STAGING.rglob("*")
                if p.suffix.lower() in [".jpg",".jpeg",".png",".tif",".tiff"]])

print(f"Znaleziono {len(files)} obraz√≥w w {DIR_STAGING}")

for i, path in enumerate(files):
    try:
        image = preprocess(Image.open(path).convert("RGB")).unsqueeze(0)
        image = image.to(device=device, dtype=IMG_DTYPE)

        with torch.no_grad():         # bez autocast na MPS
            image_features = model.encode_image(image)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)[0].tolist()

            best_label = LABELS[probs.index(max(probs))]    
            records.append({
            "filename": path.name,
            "label_pred": best_label,
            "p_prl": probs[1],
            "p_post1989": probs[2],
            "p_pre1945": probs[0],
        })

        if (i + 1) % 10 == 0:
            print(f"Przetworzono {i+1}/{len(files)}")

    except Exception as e:
        print(f"B≈ÇƒÖd w pliku {path.name}: {e}")

Znaleziono 100 obraz√≥w w data/staging
Przetworzono 10/100
Przetworzono 20/100
Przetworzono 30/100
Przetworzono 40/100
Przetworzono 50/100
Przetworzono 60/100
Przetworzono 70/100
Przetworzono 80/100
Przetworzono 90/100
Przetworzono 100/100


### 5Ô∏è‚É£ Zapis wynik√≥w i logu

In [6]:
# 5Ô∏è‚É£ Zapis wynik√≥w i logu
"""
Zapisuje wyniki zero-shot do outputs/ oraz loguje etap w logs/runlog.jsonl.
"""

df = pd.DataFrame(records)
out_csv = DIR_OUT / "clip_zero_shot_results.csv"
df.to_csv(out_csv, index=False)
print(f"üìÑ Zapisano wyniki: {out_csv} ({len(df)} rekord√≥w)")

# rejestracja runu
log_entry = {
    "run_id": run_id,
    "stage": "clip_zero_shot",
    "n_files": len(df),
    "model": MODEL_NAME,
    "precision": "fp16",
    "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
}
Path("logs").mkdir(exist_ok=True)
with open("logs/runlog.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(log_entry, ensure_ascii=False) + "\n")

üìÑ Zapisano wyniki: outputs/clip_zero_shot_results.csv (100 rekord√≥w)


### 6Ô∏è‚É£ PodglƒÖd pr√≥bki wynik√≥w

In [7]:
# 6Ô∏è‚É£ PodglƒÖd pr√≥bki wynik√≥w
"""
PodglƒÖd kilku pierwszych wynik√≥w klasyfikacji.
"""

df.sample(5, random_state=42)

Unnamed: 0,filename,label_pred,p_prl,p_post1989,p_pre1945
83,0196.jpg,do 1944,0.067383,0.161621,0.770996
53,0022.jpg,PRL 1945‚Äì1989,0.742188,0.199707,0.058136
70,0136.jpg,PRL 1945‚Äì1989,0.825195,0.118835,0.056152
45,0256.jpg,do 1944,0.228638,0.130249,0.641113
44,0253.jpg,PRL 1945‚Äì1989,0.560547,0.14856,0.290771
