# Stable Diffusion Counterfactual Image generation
Uses Hugging Face libraries and the open-source Stable Diffusion 3.5 model (large or medium).

In [1]:
# Interactive login helper from the Hugging Face Hub client.
from huggingface_hub import interpreter_login
# Prompts on stdin for an access token (the prompt is visible in the cell output).
# NOTE(review): presumably required because the model repo loaded later is gated — confirm.
interpreter_login()



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|



Enter your token (input will not be visible):  ········
Add token as git credential? (Y/n)  n


## Loading the model
Load the model using Hugging Face's `diffusers` library.

In [2]:
import torch
from diffusers import StableDiffusion3Img2ImgPipeline

# Build the SD 3.5-large image-to-image pipeline with bfloat16 weights,
# then move the whole pipeline onto the GPU.
pipe = StableDiffusion3Img2ImgPipeline.from_pretrained(
    "stabilityai/stable-diffusion-3.5-large",
    torch_dtype=torch.bfloat16,
)
pipe = pipe.to("cuda")

Loading pipeline components...:   0%|          | 0/9 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers


## Loading the dataset
Load the generated dataset, which contains only sentence pairs labelled `neutral`.

In [3]:
# Load the dataset: the file holds one JSON object per line (JSON Lines).
import json

# An explicit UTF-8 encoding keeps decoding independent of the platform default.
# Whitespace-only lines (e.g. a trailing newline at EOF) are skipped so they do
# not make json.loads raise.
with open("./data.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

# Display the first record to inspect the available fields.
data[0]

{'annotator_labels': ['neutral'],
 'captionID': '3416050480.jpg#4',
 'gold_label': 'neutral',
 'pairID': '3416050480.jpg#4r1n',
 'sentence1': 'A person on a horse jumps over a broken down airplane.',
 'sentence1_binary_parse': '( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( broken ( down airplane ) ) ) ) ) . ) )',
 'sentence1_parse': '(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) (NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ broken) (JJ down) (NN airplane)))) (. .)))',
 'sentence2': 'A person is training his horse for a competition.',
 'sentence2_binary_parse': '( ( A person ) ( ( is ( ( training ( his horse ) ) ( for ( a competition ) ) ) ) . ) )',
 'sentence2_parse': '(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (VP (VBG training) (NP (PRP$ his) (NN horse)) (PP (IN for) (NP (DT a) (NN competition))))) (. .)))'}

#### Test-drive the first entry

In [4]:
from dataset import get_url
import requests
import torch
from PIL import Image
from io import BytesIO

# captionID looks like "<image file>#<caption index>"; keep only the image file name.
caption_id: str = data[0]['captionID'].split("#")[0]

# NOTE(review): presumably local=False makes get_url return a remote URL — confirm in dataset.get_url.
url: str = get_url(caption_id, local=False)

# Bound the wait with a timeout and fail loudly on an HTTP error status, so an
# error page is never handed to PIL as if it were image bytes.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Decode the downloaded bytes and force three-channel RGB.
image = Image.open(BytesIO(response.content)).convert("RGB")

# The second sentence of the pair is used as the text prompt for generation.
prompt = data[0]['sentence2']

In [6]:
from pathlib import Path

# Fixed seed so that re-running this cell regenerates the same image.
SEED = 0
generator = torch.Generator(device="cpu").manual_seed(SEED)

# Image-to-image generation: the source image is edited towards the prompt.
genImage = pipe(
    prompt=prompt,
    image=image,
    height=288,
    width=496,
    num_inference_steps=30,
    guidance_scale=7.5,
    generator=generator,
).images[0]

# Create the target directory first; saving fails if it does not exist.
output_path = Path("output/class_x") / f"{data[0]['captionID']}.png"
output_path.parent.mkdir(parents=True, exist_ok=True)
genImage.save(output_path)

  0%|          | 0/18 [00:00<?, ?it/s]

#### Evaluating test image

##### CLIP score

In [7]:
import torch
import clip
from PIL import Image

# Load the CLIP model on the GPU when available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Preprocess the generated image into a batch of one. It is stored under its
# own name so the source PIL `image` is not overwritten with a tensor; that
# would break the generation cell if it were re-run after this one.
image_input = preprocess(genImage).unsqueeze(0).to(device)

# Tokenize the caption the image was generated from.
text = clip.tokenize(prompt).to(device)

# Encode both modalities without tracking gradients.
with torch.no_grad():
    image_features = model.encode_image(image_input)
    text_features = model.encode_text(text)

# L2-normalize so the dot product below is a cosine similarity.
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)

# Compute cosine similarity (i.e., CLIP score)
clip_score = (image_features @ text_features.T).item()
print(f"CLIP score: {clip_score:.4f}")


CLIP score: 0.2690


##### Fréchet inception distance

In [10]:
from fidFolder import compute_fid_between_folders

# Folder names passed to the FID helper: first argument, then second.
# NOTE(review): the generated image was saved under output/class_x/ — confirm
# that compute_fid_between_folders walks subfolders.
folder1, folder2 = "original", "output"

compute_fid_between_folders(folder1, folder2)



Downloading: "https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth" to /root/.cache/torch/hub/checkpoints/inception_v3_google-0cc3c7bd.pth


100%|██████████| 104M/104M [00:00<00:00, 221MB/s] 
Processing original:   2%|▏         | 499/31783 [00:25<27:01, 19.30it/s]


KeyboardInterrupt: 