In [5]:
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
!unzip annotations_trainval2017.zip

--2025-08-07 16:28:33--  http://images.cocodataset.org/annotations/annotations_trainval2017.zip
Resolving images.cocodataset.org (images.cocodataset.org)... 3.5.29.225, 3.5.30.39, 3.5.21.80, ...
Connecting to images.cocodataset.org (images.cocodataset.org)|3.5.29.225|:80... connected.
HTTP request sent, awaiting response... 503 Slow Down
2025-08-07 16:28:33 ERROR 503: Slow Down.

unzip:  cannot find or open annotations_trainval2017.zip, annotations_trainval2017.zip.zip or annotations_trainval2017.zip.ZIP.


# Get normal datasets

In [4]:
from pycocotools.coco import COCO
import pandas as pd
import json, os, shutil, glob

# ---------------- config ----------------
DATA_DIR = '../data'
ANNO_DIR = f'{DATA_DIR}/annotations_trainval2017/annotations'

# Use VAL split instead of TRAIN
SPLIT = 'val2017'  # change to 'train2017' if you ever want to switch back

instances_json = f'{ANNO_DIR}/instances_{SPLIT}.json'
captions_json  = f'{ANNO_DIR}/captions_{SPLIT}.json'
IMG_DIR        = f'{DATA_DIR}/{SPLIT}'

SAVE_DIR = DATA_DIR
os.makedirs(SAVE_DIR, exist_ok=True)

# Sanity check: make sure images are there
if not os.path.isdir(IMG_DIR):
    raise FileNotFoundError(
        f"Couldn't find image dir: {IMG_DIR}\n"
        f"Download with:\n"
        f"  wget -c http://images.cocodataset.org/zips/{SPLIT}.zip\n"
        f"  unzip {SPLIT}.zip"
    )
# Optional light check (doesn't scan all files)
some_jpgs = glob.glob(os.path.join(IMG_DIR, "*.jpg"))
if len(some_jpgs) == 0:
    raise FileNotFoundError(f"No .jpg files found in {IMG_DIR}. Did the unzip complete?")

# -------------- load annotations --------------
coco = COCO(instances_json)
cap  = COCO(captions_json)

# category IDs are identical across COCO 2017 splits
cat_id   = coco.getCatIds(catNms=['cat'])[0]      # 17
dog_id   = coco.getCatIds(catNms=['dog'])[0]      # 18
human_id = coco.getCatIds(catNms=['person'])[0]   # 1

# Precompute membership sets
imgs_with_cat   = set(coco.getImgIds(catIds=[cat_id]))
imgs_with_dog   = set(coco.getImgIds(catIds=[dog_id]))
imgs_with_human = set(coco.getImgIds(catIds=[human_id]))

# Map: image_id -> [captions...]
caps_by_img = {}
for ann in cap.dataset['annotations']:
    caps_by_img.setdefault(ann['image_id'], []).append(ann['caption'])

# Build master table (one row per image) — sort for determinism
records = []
for img in coco.loadImgs(sorted(coco.getImgIds())):
    img_id = img['id']
    records.append({
        'image_id': img_id,
        'file_name': img['file_name'],
        'captions': json.dumps(caps_by_img.get(img_id, []), ensure_ascii=False),
        'has_cat':   int(img_id in imgs_with_cat),
        'has_dog':   int(img_id in imgs_with_dog),
        'has_human': int(img_id in imgs_with_human),
    })
df = pd.DataFrame(records).sort_values('image_id').reset_index(drop=True)

# --------- helper: balance to ~1:1 by downsampling majority ----------
SEED = 42
def balanced_1to1(df_bin, seed=SEED):
    pos = df_bin[df_bin['label'] == 1]
    neg = df_bin[df_bin['label'] == 0]
    if len(pos) == 0 or len(neg) == 0:
        return df_bin.copy()
    n = min(len(pos), len(neg))
    out = pd.concat([
        pos.sample(n, random_state=seed),
        neg.sample(n, random_state=seed)
    ], axis=0).sample(frac=1.0, random_state=seed).reset_index(drop=True)
    return out

# --------- build three binary datasets from the VAL DF (overlap allowed) ---------
def make_binary(df, label_col):
    raw = df[['image_id', 'file_name', 'captions', label_col]].rename(columns={label_col: 'label'})
    bal = balanced_1to1(raw)
    return raw, bal

cat_raw,   cat_df   = make_binary(df, 'has_cat')
dog_raw,   dog_df   = make_binary(df, 'has_dog')
human_raw, human_df = make_binary(df, 'has_human')

# --------- quick stats ----------
def stats(name, d):
    pos = int(d.label.sum()); neg = len(d) - pos
    print(f"{name}: {len(d)} rows | pos={pos}, neg={neg}")

stats("VAL cat_df (balanced)",   cat_df)
stats("VAL dog_df (balanced)",   dog_df)
stats("VAL human_df (balanced)", human_df)

# --------- helpers to save images + manifest (frozen selection) ----------
def _ensure_dir(path):
    os.makedirs(path, exist_ok=True)

def _copy_images_for_df(df_split, dest_dir, img_root):
    _ensure_dir(dest_dir)
    missing = []
    copied = 0
    # unique file names per split to avoid redundant copies
    for fn in sorted(df_split['file_name'].unique()):
        src = os.path.join(img_root, fn)
        dst = os.path.join(dest_dir, fn)
        if not os.path.exists(src):
            missing.append(fn)
            continue
        os.makedirs(os.path.dirname(dst), exist_ok=True)
        if not os.path.exists(dst):
            shutil.copy2(src, dst)
            copied += 1
    print(f"Saved {copied} images to {dest_dir}. Missing: {len(missing)}")
    if missing:
        with open(os.path.join(dest_dir, '_missing_images.txt'), 'w', encoding='utf-8') as f:
            f.write("\n".join(missing))

def _write_manifest(df_split, manifest_path, seed):
    out = {
        "seed": seed,
        "split": SPLIT,
        "count": int(len(df_split)),
        "image_ids": df_split['image_id'].tolist(),
        "file_names": df_split['file_name'].tolist(),
    }
    with open(manifest_path, 'w', encoding='utf-8') as f:
        json.dump(out, f, ensure_ascii=False, indent=2)

def save_csv_and_images(df_split, csv_path, img_root):
    df_split.to_csv(csv_path, index=False)
    stem = os.path.splitext(os.path.basename(csv_path))[0]
    images_dir = os.path.join(os.path.dirname(csv_path), f"{stem}_images")
    manifest_path = os.path.join(os.path.dirname(csv_path), f"{stem}_manifest.json")
    _copy_images_for_df(df_split, images_dir, img_root)
    _write_manifest(df_split, manifest_path, SEED)

# --------- save (balanced) + images + manifest ----------
save_csv_and_images(cat_df,   f'{SAVE_DIR}/coco_{SPLIT}_cat_binary_with_captions_balanced.csv',   IMG_DIR)
save_csv_and_images(dog_df,   f'{SAVE_DIR}/coco_{SPLIT}_dog_binary_with_captions_balanced.csv',   IMG_DIR)
save_csv_and_images(human_df, f'{SAVE_DIR}/coco_{SPLIT}_human_binary_with_captions_balanced.csv', IMG_DIR)

# --------- (optional) also save the full, unbalanced versions + images ----------
save_csv_and_images(cat_raw,   f'{SAVE_DIR}/coco_{SPLIT}_cat_binary_with_captions_all.csv',   IMG_DIR)
save_csv_and_images(dog_raw,   f'{SAVE_DIR}/coco_{SPLIT}_dog_binary_with_captions_all.csv',   IMG_DIR)
save_csv_and_images(human_raw, f'{SAVE_DIR}/coco_{SPLIT}_human_binary_with_captions_all.csv', IMG_DIR)


loading annotations into memory...
Done (t=0.55s)
creating index...
index created!
loading annotations into memory...
Done (t=0.08s)
creating index...
index created!
VAL cat_df (balanced): 368 rows | pos=184, neg=184
VAL dog_df (balanced): 354 rows | pos=177, neg=177
VAL human_df (balanced): 4614 rows | pos=2307, neg=2307
Saved 368 images to ../data/coco_val2017_cat_binary_with_captions_balanced_images. Missing: 0
Saved 354 images to ../data/coco_val2017_dog_binary_with_captions_balanced_images. Missing: 0
Saved 4614 images to ../data/coco_val2017_human_binary_with_captions_balanced_images. Missing: 0
Saved 5000 images to ../data/coco_val2017_cat_binary_with_captions_all_images. Missing: 0
Saved 5000 images to ../data/coco_val2017_dog_binary_with_captions_all_images. Missing: 0
Saved 5000 images to ../data/coco_val2017_human_binary_with_captions_all_images. Missing: 0


In [3]:
human_df

Unnamed: 0,image_id,file_name,captions,label
0,92953,000000092953.jpg,"[""A woman rides a horse while others look on"",...",1
1,533739,000000533739.jpg,"[""Heavy traffic in a city with a \""Citi Bank\""...",1
2,72850,000000072850.jpg,"[""a close up of a baseball player with a ball ...",1
3,421103,000000421103.jpg,"[""A horse and donkeys standing up in a field o...",0
4,453481,000000453481.jpg,"[""A couple of people with some bikes on a stre...",1
...,...,...,...,...
108339,571287,000000571287.jpg,"[""A large cooked cut pizza on a table."", ""A cl...",0
108340,266503,000000266503.jpg,"[""A sign that has a camel on it."", ""Various st...",0
108341,464498,000000464498.jpg,"[""a few batches of various baked goods like do...",0
108342,394627,000000394627.jpg,"[""A small boy holding a yellow baseball bat, ""...",1


# Create datasets for modality probe 

In [None]:
# Let's read the data randomly
# balance between text and image data
# combine into one dataset f"{SAVE_DIR}/coco_imgs_text_balanced.csv'"

In [2]:
from pycocotools.coco import COCO
import pandas as pd, os, random

ROWS_TO_TEST = 200

# --- paths ---
ANNO_DIR = '../data/annotations_trainval2017/annotations'
IMG_DIR  = '../data/val2017'
SAVE_DIR = '../data'; os.makedirs(SAVE_DIR, exist_ok=True)
SEED = 1337; rng = random.Random(SEED)

# --- load COCO VAL annotations ---
coco = COCO(f'{ANNO_DIR}/instances_val2017.json')
cap  = COCO(f'{ANNO_DIR}/captions_val2017.json')

# map: image_id -> [captions]
caps_by_img = {}
for a in cap.dataset['annotations']:
    caps_by_img.setdefault(a['image_id'], []).append(a['caption'])

# sample only VAL images
imgs = coco.loadImgs(coco.getImgIds())

# --- pairwise rows: each file_name gets BOTH modalities ---
rows = []
for img in imgs:
    fid = img["file_name"]
    cap_text = rng.choice(caps_by_img.get(img['id'], [""]))
    rows.append({
        "modality": "text",
        "input": cap_text,
        "label": 0,
        "file_name": fid
    })
    rows.append({
        "modality": "image",
        "input": f"{IMG_DIR}/{fid}",
        "label": 1,
        "file_name": fid
    })

# shuffle + subsample
out = pd.DataFrame(rows).sample(
    frac=ROWS_TO_TEST/len(rows), random_state=SEED
).reset_index(drop=True)

# save
out.to_csv(f"{SAVE_DIR}/coco_imgs_text_balanced_val.csv", index=False)
print(f"Saved {len(out)} rows -> {SAVE_DIR}/coco_imgs_text_balanced_val.csv")


loading annotations into memory...
Done (t=0.54s)
creating index...
index created!
loading annotations into memory...
Done (t=0.02s)
creating index...
index created!
Saved 200 rows -> ../data/coco_imgs_text_balanced_val.csv


In [3]:
out

Unnamed: 0,modality,input,label,file_name
0,image,../data/val2017/000000040083.jpg,1,000000040083.jpg
1,image,../data/val2017/000000074733.jpg,1,000000074733.jpg
2,image,../data/val2017/000000523175.jpg,1,000000523175.jpg
3,image,../data/val2017/000000039484.jpg,1,000000039484.jpg
4,text,"A room with an old sofa, coffee table, lamp an...",0,000000458410.jpg
...,...,...,...,...
195,image,../data/val2017/000000437110.jpg,1,000000437110.jpg
196,text,A little girl holding a baseball bat on grass.,0,000000270474.jpg
197,image,../data/val2017/000000087875.jpg,1,000000087875.jpg
198,image,../data/val2017/000000090155.jpg,1,000000090155.jpg


In [4]:
# move the imgs
import shutil

# after building `out`
img_rows = out[out['modality'] == 'image']
dest = f"{SAVE_DIR}/coco_imgs_text_cls_imgs"
os.makedirs(dest, exist_ok=True)

for fn in img_rows['file_name']:
    src = f"{IMG_DIR}/{fn}"
    dst = f"{dest}/{fn}"
    shutil.copy(src, dst)


In [5]:
out.label.value_counts()

label
1    109
0     91
Name: count, dtype: int64

# Create Data for Brightness Probes

In [6]:
from pycocotools.coco import COCO
from PIL import Image, ImageEnhance
import os, random, pandas as pd

# --- config ---
SEED = 1337
random.seed(SEED)

ANNO_DIR = "../data/annotations_trainval2017/annotations"
IMG_DIR  = "../data/val2017"               # COCO val2017 images
OUT_IMG_DIR = "../data/brightness_pairs"   # where we save modified images
OUT_CSV = "../data/brightness_dataset.csv" # output csv
N_GROUPS = 200                             # how many base images to use (each yields 2 variants)

BRIGHT_FACTORS = [(0.5, "dark", 0), (1.5, "bright", 1)]  # (factor, suffix, label)

os.makedirs(OUT_IMG_DIR, exist_ok=True)

# --- load COCO val annotations + captions ---
coco = COCO(f"{ANNO_DIR}/instances_val2017.json")
cap  = COCO(f"{ANNO_DIR}/captions_val2017.json")

# map image_id -> [captions]
caps_by_img = {}
for a in cap.dataset["annotations"]:
    caps_by_img.setdefault(a["image_id"], []).append(a["caption"])

# only keep images that have at least 1 caption
img_ids = [iid for iid in coco.getImgIds() if caps_by_img.get(iid)]
random.shuffle(img_ids)
img_ids = img_ids[:N_GROUPS]  # choose N base images

rows = []
for img in coco.loadImgs(img_ids):
    base_id = os.path.splitext(img["file_name"])[0]  # e.g. 000000123456
    src_path = os.path.join(IMG_DIR, img["file_name"])
    if not os.path.isfile(src_path):
        continue  # skip if missing locally

    # pick one caption to “pair” with this semantic object
    caption = random.choice(caps_by_img.get(img["id"], [""]))

    # open source once
    with Image.open(src_path).convert("RGB") as im:
        enhancer = ImageEnhance.Brightness(im)

        # create both variants for the SAME base image
        for factor, suffix, label in BRIGHT_FACTORS:
            out_name = f"{base_id}_{suffix}.jpg"
            out_path = os.path.join(OUT_IMG_DIR, out_name)

            im_mod = enhancer.enhance(factor)  # apply brightness
            im_mod.save(out_path, quality=95)

            rows.append({
                "base_id": base_id,              # group id (pair control)
                "orig_file_name": img["file_name"],
                "variant_file_name": out_name,   # saved filename
                "variant_path": out_path,        # absolute/relative path to saved variant
                "variant": suffix,               # "dark" / "bright"
                "label": label,                  # 0 = dark, 1 = bright
                "caption": caption               # paired text for semantic control
            })

# write CSV
df = pd.DataFrame(rows)
df.to_csv(OUT_CSV, index=False)
print(f"Saved {len(df)} rows ({len(df)//2} groups) -> {OUT_CSV}")
print(f"Images written to: {OUT_IMG_DIR}")


loading annotations into memory...
Done (t=0.49s)
creating index...
index created!
loading annotations into memory...
Done (t=0.02s)
creating index...
index created!
Saved 400 rows (200 groups) -> ../data/brightness_dataset.csv
Images written to: ../data/brightness_pairs


In [7]:
df

Unnamed: 0,base_id,orig_file_name,variant_file_name,variant_path,variant,label,caption
0,000000364884,000000364884.jpg,000000364884_dark.jpg,../data/brightness_pairs/000000364884_dark.jpg,dark,0,A person in a snow sporting event is going ra...
1,000000364884,000000364884.jpg,000000364884_bright.jpg,../data/brightness_pairs/000000364884_bright.jpg,bright,1,A person in a snow sporting event is going ra...
2,000000140840,000000140840.jpg,000000140840_dark.jpg,../data/brightness_pairs/000000140840_dark.jpg,dark,0,Various kites near the ground in a field.
3,000000140840,000000140840.jpg,000000140840_bright.jpg,../data/brightness_pairs/000000140840_bright.jpg,bright,1,Various kites near the ground in a field.
4,000000353096,000000353096.jpg,000000353096_dark.jpg,../data/brightness_pairs/000000353096_dark.jpg,dark,0,A computer with an image of lighting on the sc...
...,...,...,...,...,...,...,...
395,000000546976,000000546976.jpg,000000546976_bright.jpg,../data/brightness_pairs/000000546976_bright.jpg,bright,1,A man riding on the back of a motorcycle.
396,000000262895,000000262895.jpg,000000262895_dark.jpg,../data/brightness_pairs/000000262895_dark.jpg,dark,0,A fairly curmudgeonly looking old gentleman gr...
397,000000262895,000000262895.jpg,000000262895_bright.jpg,../data/brightness_pairs/000000262895_bright.jpg,bright,1,A fairly curmudgeonly looking old gentleman gr...
398,000000474881,000000474881.jpg,000000474881_dark.jpg,../data/brightness_pairs/000000474881_dark.jpg,dark,0,The elk have horns and are eating grass.
