In [None]:
from typing import Optional

import numpy as np
import torch
from diffusers import StableDiffusion3Img2ImgPipeline
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

In [None]:
# Log in to the Hugging Face Hub before the model downloads in the cells below.
# NOTE(review): presumably required because the Stable Diffusion 3.5 checkpoint
# is access-gated and the login prompts for a token — confirm.
from huggingface_hub import interpreter_login

interpreter_login()

In [None]:
# 1b) Load CLIP for entailment testing
# `device` is read by this cell and by later cells, so it is chosen here before
# first use: CUDA when torch can see a GPU, otherwise CPU. It is kept as a plain
# string because later code compares it against "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"

# Model weights are moved to `device`; the processor only prepares inputs and
# has no weights to move.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [None]:
# Stable Diffusion img2img pipeline
# Half precision is requested only on CUDA; every other device gets float32.
sd_pipe = StableDiffusion3Img2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-medium",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32
)
if device == "cuda":
    # Keep sub-models on the CPU and move each to the GPU only while it runs,
    # which lowers peak GPU memory.
    sd_pipe.enable_model_cpu_offload()
else:
    # Model CPU offload needs an accelerator to offload *from*; without one,
    # just place the whole pipeline on the selected device.
    sd_pipe = sd_pipe.to(device)


In [None]:
def clip_entailment_score(image: Image.Image, sentence: str) -> float:
    """
    Cosine similarity between the CLIP embedding of `image` and of `sentence`.

    The image and the sentence are preprocessed by two separate
    `clip_processor` calls, embedded by `clip_model` without tracking
    gradients, L2-normalised, and compared with a dot product.

    Returns the similarity as a plain Python float.
    """
    # Preprocess each modality on its own and move the tensors to `device`.
    pixel_batch = clip_processor(images=[image], return_tensors="pt").to(device)
    token_batch = clip_processor(text=[sentence], return_tensors="pt", padding=True).to(device)

    with torch.no_grad():
        img_vec = clip_model.get_image_features(pixel_values=pixel_batch.pixel_values)
        txt_vec = clip_model.get_text_features(
            input_ids=token_batch.input_ids,
            attention_mask=token_batch.attention_mask,
        )

    # Scale both embeddings to unit length so their dot product is the cosine.
    img_unit = img_vec / img_vec.norm(p=2, dim=-1, keepdim=True)
    txt_unit = txt_vec / txt_vec.norm(p=2, dim=-1, keepdim=True)

    # One image and one sentence were passed in, so the sum leaves a single
    # value that .item() can unwrap.
    return (img_unit * txt_unit).sum(dim=-1).item()

def entails(image: Image.Image, sentence: str, threshold: float = 0.33) -> bool:
    """
    Threshold the CLIP similarity of `image` and `sentence`.

    Returns True when `clip_entailment_score(image, sentence)` is at least
    `threshold`, and False otherwise.
    """
    return clip_entailment_score(image, sentence) >= threshold


In [None]:
def edit_away(
    image: Image.Image,
    caption: str,
    avoid_sentence: str,
    strength: float = 0.75,
    guidance_scale: float = 7.5,
    num_inference_steps: int = 50,
    seed: Optional[int] = None,
) -> Image.Image:
    """
    Edit `image` to preserve `caption` but move it away from `avoid_sentence`.
    Uses `avoid_sentence` as negative prompt.

    Args:
        image: Source image handed to the img2img pipeline.
        caption: Prompt the edited image should keep matching.
        avoid_sentence: Passed as the negative prompt.
        strength: Forwarded to the pipeline's `strength` argument.
        guidance_scale: Forwarded to the pipeline's `guidance_scale` argument.
        num_inference_steps: Forwarded to the pipeline's
            `num_inference_steps` argument.
        seed: Optional seed for the sampling noise. When given, repeated calls
            with the same inputs use the same random stream; when None
            (the default) the pipeline draws its own noise, as before.

    Returns:
        The first image produced by the pipeline.
    """
    # A CPU generator keeps the seeded noise independent of where the pipeline
    # is currently placed.
    generator = None
    if seed is not None:
        generator = torch.Generator(device="cpu").manual_seed(seed)

    result = sd_pipe(
        prompt=caption,
        negative_prompt=avoid_sentence,
        image=image,
        strength=strength,
        guidance_scale=guidance_scale,
        num_inference_steps=num_inference_steps,
        generator=generator,
    )
    return result.images[0]

In [None]:
# Load the dataset
import json

# JSON Lines: one JSON object per line. Blank lines (for example a trailing
# newline at the end of the file) are skipped rather than handed to json.loads,
# and the encoding is pinned so parsing does not depend on the platform default.
with open("./data.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

In [None]:
import requests
from io import BytesIO

# Image id for record 1: the part of `captionID` before the first "#".
caption_id: str = data[1]['captionID'].split("#")[0]

# Despite its name, `request` holds the HTTP response. The timeout stops a
# stalled server from hanging the kernel, and raise_for_status() turns an HTTP
# error into an explicit exception instead of letting PIL fail on an error page.
request = requests.get("https://hazeveld.org/snli-ve/images/" + caption_id, timeout=30)
request.raise_for_status()

img = Image.open(BytesIO(request.content)).convert("RGB")

In [None]:
# Entailment decisions for the first 80 records (fewer if the dataset is
# shorter), keyed by image id. The mapping has its own name so the builtin
# `dict` is not shadowed for the rest of the session.
# Each value is [sentence2 entailed?, sentence1 entailed?] at threshold 0.23.
entailment_results = {}
for record in data[:80]:
    caption_id: str = record['captionID'].split("#")[0]
    # Bounded wait and explicit HTTP error instead of a hang or an opaque
    # image-decoding failure.
    request = requests.get("https://hazeveld.org/snli-ve/images/" + caption_id, timeout=30)
    request.raise_for_status()
    image = Image.open(BytesIO(request.content)).convert("RGB")
    sentence2_entailed = entails(image, record['sentence2'], threshold=0.23)
    sentence1_entailed = entails(image, record['sentence1'], threshold=0.23)
    # NOTE(review): records that share a caption_id overwrite each other here,
    # so only the last record per image is kept — confirm this is intended.
    entailment_results[caption_id] = [sentence2_entailed, sentence1_entailed]

In [None]:
# Check record 1's sentence2 against record 1's own, unedited image (`img`).
# When the notebook runs top to bottom, `image` at this point is the last image
# downloaded by the loop cell above, which belongs to a different record.
orig_score = entails(img, data[1]['sentence2'], threshold=0.20)

In [None]:
# Edit record 1's image: sentence1 is the prompt to preserve, sentence2 is the
# sentence to move away from. All other edit_away parameters use their defaults.
image = edit_away(
    image=img,
    avoid_sentence=data[1]['sentence2'],
    caption=data[1]['sentence1'],
)