# Baseline for CLIP AI Detection Adversarial Attack

## Setup

In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
!pip install transformers torch --quiet
!pip install torchattacks --quiet

In [3]:
import os
import ast
import csv

import numpy as np
import pandas as pd

from PIL import Image

import torch
from transformers import CLIPProcessor, CLIPModel

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

from torchattacks import PGD
import torchvision

from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader


In [4]:
DIFFUSION_MODELS = ["openjourney", "titan", "dalle", "real", "openjourney_v4", "stable_diff"]
ROOT = '/content/drive/MyDrive/IDL Image Generation'

## Process Images with CLIP

In [5]:
!ls data

ls: cannot access 'data': No such file or directory


In [6]:
def collect_images(generators):
  generated_images = {}
  for directory in generators:
    print(directory, len(os.listdir(os.path.join(ROOT, "data", directory))))
    for filepath in os.listdir(os.path.join(ROOT, "data", directory)):
        full_path = os.path.join(ROOT, "data", directory, filepath)
        id_idx = filepath.rfind('_') + 1
        id = filepath[id_idx:-4]
        label = 1 if directory == "real" else 0
        generated_images[full_path] = {
            "generator": directory,
            "label": label, # 0 = fake, 1 = real
            "id": id,
        }
  return generated_images


In [7]:
images = collect_images(DIFFUSION_MODELS)

openjourney 3376
titan 2058
dalle 435
real 3633
openjourney_v4 3465
stable_diff 3345


In [8]:
print("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

model.to(device)

cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e

In [9]:
def run_clip(image_path):
  with Image.open(image_path) as img:
    img = img.convert("RGB")
    if img.size != (512, 512):
      print(f"Resizing {image_path} to 512x512")
      img = img.resize((512, 512))
      img.save(image_path)
  # inputs = processor(images=img, return_tensors="pt", padding=True, truncation=True)
  inputs = processor(images=img, return_tensors="pt")
  inputs = inputs.to(device)
  with torch.no_grad():
    # image_features = model.get_image_features(**inputs)
    outputs = model.vision_model(inputs.pixel_values)
    image_features = outputs.last_hidden_state[:, 0, :]
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    return image_features.squeeze().cpu().numpy()

In [10]:
def get_processed_filepaths(csv_path: str):
    processed_filepaths = set()
    if not os.path.isfile(csv_path):
        return set()
    try:
        df = pd.read_csv(csv_path)
        df = df[df["Generator"].isin(DIFFUSION_MODELS)]
        df = df.drop_duplicates(subset=["Generator", "Id"])
        processed_filepaths = set(df["Filepath"].values)
    except Exception as e:
        print(e)

    return processed_filepaths

In [11]:
def process_images_and_log(image_dict: dict, csv_fname: str) -> None:
    processed_filepaths = get_processed_filepaths(os.path.join(ROOT, csv_fname))
    print(f"{len(processed_filepaths)} files have already been processed")
    for path, info in tqdm(image_dict.items(), desc="Embedding Images"):
      if path in processed_filepaths:
        # print(f"{info['generator']}_{info['id']} has already been processed")
        continue

      encoding = run_clip(path)

      data = [path, info["id"], info["generator"], str(encoding), info["label"]]

      csv_path = os.path.join(ROOT, csv_fname)
      file_exists = os.path.isfile(csv_path)
      with open(csv_path, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if not file_exists:
            writer.writerow(["Filepath", "Id", "Generator", "Features", "Label"])
        writer.writerow(data)
      # print(f"{info['generator']}_{info['id']} encoded and logged to csv")
    print(f"All input images encoded!")

In [12]:
# process_images_and_log(images, "CLIP_embeddings.csv")
processed_filepaths = get_processed_filepaths(os.path.join(ROOT, "CLIP_embeddings.csv"))
print(f"{len(processed_filepaths)} files have already been processed")

16313 files have already been processed


## Train SVM for AI Image Detection

In [13]:
class CLIPSVMDiscriminator:
    def __init__(self, model_name="openai/clip-vit-base-patch32", device=None):
        self.device = device if device is not None else torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print("Running on:", self.device)
        self.model = CLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()
        self.svm = SVC(kernel="linear", C=1.0, probability=True)
        self.svm_trained = False

    def run_clip(self, imgs):
        inputs = self.processor(images=imgs, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model.vision_model(inputs.pixel_values)
            image_features = outputs.last_hidden_state[:, 0, :]
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            return image_features.squeeze().cpu().numpy()

    def train_svm(self, X_train, y_train):
        self.svm.fit(X_train, y_train)
        self.svm_trained = True
        train_accuracy = self.svm.score(X_train, y_train)
        print(f"Training accuracy for discriminator: {train_accuracy:.4f}")
        return self.svm

    def predict_from_embeddings(self, embeddings):
        preds = self.svm.predict(embeddings)
        probs = self.svm.predict_proba(embeddings)[:, 1]
        return preds, probs

    def evaluate(self, X_test, y_test):
        model = self.svm
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        auc = roc_auc_score(y_test, y_pred_proba)
        ap_per_class = []
        for class_label in np.unique(y_test):
            y_test_binary = (y_test == class_label).astype(int)
            ap = average_precision_score(y_test_binary, y_pred_proba)
            ap_per_class.append(ap)
        map_score = np.mean(ap_per_class)
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(f"AUC: {auc:.4f}")
        print(f"mAP: {map_score:.4f}")
        return {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "auc": auc,
            "map": map_score
        }

In [14]:
def string_to_np(feature_str):
    embedding_list = np.fromstring(feature_str[1:-1], sep=' ')
    assert len(embedding_list) == 768
    return np.array(embedding_list)


df = pd.read_csv(os.path.join(ROOT, "CLIP_embeddings.csv"), usecols=range(5))
# Filter df for rows where Generator is in DIFFUSION MODELS
df = df[df["Generator"].isin(DIFFUSION_MODELS)]
df = df.drop_duplicates(subset=["Filepath", "Id", "Generator", "Label"])
df["Features"] = df["Features"].apply(string_to_np).to_numpy()
X = np.stack(df["Features"])
y = df["Label"].to_numpy()

# df["Generator"].value_counts()
df.head()

Unnamed: 0,Filepath,Id,Generator,Features,Label
0,/content/drive/MyDrive/IDL Image Generation/da...,93489,openjourney,"[-0.0115892114, 0.0602126196, -0.0306698009, -...",0
1,/content/drive/MyDrive/IDL Image Generation/da...,93500,openjourney,"[-0.0148345735, 0.0203769933, -0.00583226699, ...",0
2,/content/drive/MyDrive/IDL Image Generation/da...,93502,openjourney,"[-0.0165445693, 0.00986933149, -0.0408154577, ...",0
3,/content/drive/MyDrive/IDL Image Generation/da...,93511,openjourney,"[-0.0157231297, 0.0540607572, -0.00418719556, ...",0
4,/content/drive/MyDrive/IDL Image Generation/da...,93523,openjourney,"[0.0038340704, -0.000587593589, 0.0121148545, ...",0


In [15]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [16]:
print(X.shape, y.shape)

(16313, 768) (16313,)


In [17]:
model = CLIPSVMDiscriminator()


model.train_svm(X_train, y_train)
print("Testing Metrics:")
metrics = model.evaluate(X_test, y_test)

Running on: cuda
Training accuracy for discriminator: 0.9976
Testing Metrics:
Accuracy: 0.9979
Precision: 0.9979
Recall: 0.9979
F1 Score: 0.9979
AUC: 0.9998
mAP: 0.7854


In [18]:
torch.save(model, os.path.join(ROOT, "CLIP_discriminator.pt"))

## Evaluate method on Unseen Generators

In [19]:
# for generator in ["openjourney", "titan", "dalle", "openjourney_v4"]:
#   if generator == "real":
#     continue

#   test_df = df[df['Generator'] == generator]
#   train_df = df[df['Generator'] != generator]

#   # Reset the index for both partitions
#   test_df = test_df.reset_index(drop=True)
#   train_df = train_df.reset_index(drop=True)

#   X_test = np.stack(test_df["Features"].apply(string_to_np).to_numpy())
#   y_test = test_df["Label"].to_numpy()

#   X_train = np.stack(train_df["Features"].apply(string_to_np).to_numpy())
#   y_train = train_df["Label"].to_numpy()

#   print(f"Results for no-{generator} model")
#   svm = train(X_train, y_train)
#   print(f"Results on {generator}:")
#   metrics = evaluate(svm, X_test, y_test)
#   print("******************************************")



## Dataloaders

In [20]:
class ArtEmbeddingDataset(Dataset):
    """
    A dataset class for loading art image embeddings and labels from a CSV file.

    Expected CSV columns:
      - "Filepath": location of the original image.
      - "Features": string representation of the image embedding vector.
      - "Label": the label indicating whether the image is real or AI–generated.
    """
    def __init__(self, csv_file, transform=None, ai_only=False):
        df = pd.read_csv(os.path.join(ROOT, "CLIP_embeddings.csv"), usecols=range(5))
        df = df[df["Generator"].isin(DIFFUSION_MODELS)]
        df = df.drop_duplicates(subset=["Filepath", "Id", "Generator", "Label"])
        df["Features"] = df["Features"].apply(string_to_np).to_numpy()

        self.data = df
        self.transform = transform

        for g in DIFFUSION_MODELS:
          os.makedirs(os.path.join("data", "adversarial_images", g), exist_ok=True)

        # Filter for AI-generated only if requested (Label=0)
        if ai_only:
            self.data = self.data[self.data["Label"] == 0]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        filepath = row["Filepath"]
        features = row["Features"]
        label = row["Label"]

        sample = {"filepath": filepath, "features": features, "label": label}

        if self.transform is not None:
            sample = self.transform(sample)

        return sample

In [21]:
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
def collate_for_pgd(batch):
    filepaths = [item["filepath"] for item in batch if os.path.exists(item["filepath"])]

    # read in images from filepaths using torchvision
    images = [torchvision.io.read_image(filepath) for filepath in filepaths]
    images = torch.stack(images)
    images = processor(images=images, return_tensors="pt")["pixel_values"]

    outpaths = [os.path.join("data",
                             "adversarial_images",
                             filepath.split(os.sep)[-2],
                             f"adversarial_{os.path.basename(filepath)}.png")
                for filepath in filepaths]

    return images, outpaths


In [25]:
dataset = ArtEmbeddingDataset(os.path.join(ROOT, "CLIP_embeddings.csv"), ai_only=True)
dataloader = DataLoader(dataset, batch_size=32, collate_fn=collate_for_pgd, shuffle=False)

## Prepare PGD Attack on Method

In [26]:
from torchattacks import PGD

class CLIPPGDAttack(PGD):
    def __init__(self, model, eps=8/255, alpha=2/255, steps=10, random_start=True):
        super().__init__(model, eps, alpha, steps, random_start)

    def get_logits(self, inputs):
        if self._normalization_applied is False:
            inputs = self.normalize(inputs)

        # Get image features from the vision model
        vision_outputs = self.model.vision_model(inputs)
        image_features = vision_outputs.last_hidden_state[:, 0, :]

        return image_features

In [None]:
discriminator = model.model
processor = model.processor

attack = CLIPPGDAttack(discriminator, eps=8/255, alpha=2/255, steps=10, random_start=True)


for images, outpaths in tqdm(dataloader, desc="Adversarial Attacks"):

  if all(os.path.exists(outpath) for outpath in outpaths):
    continue

  images = images.to(discriminator.device)
  target_label = torch.tensor([1]*len(outpaths)).to(discriminator.device)

  # # Generate adversarial example
  adversarial_image = attack(images, target_label)

  for i in range(adversarial_image.shape[0]):
    # Get the image and filepath for the current index
    image = adversarial_image[i]
    filepath = outpaths[i]

    # Save the image using torchvision.utils.save_image
    torchvision.utils.save_image(image, filepath)



Adversarial Attacks:   7%|▋         | 29/397 [01:17<18:16,  2.98s/it]

## Find Descriminatory Prediction on Adversarial Images

In [None]:
model = torch.load(os.path.join(ROOT, "CLIP_discriminator.pt"), weights_only=False)


In [None]:

def collate_for_svm(batch):
    filepaths = [item["filepath"] for item in batch if os.path.exists(item["filepath"])]
    outpaths = [os.path.join("data",
                             "adversarial_images",
                             filepath.split(os.sep)[-2],
                             f"adversarial_{os.path.basename(filepath)}.png")
                for filepath in filepaths]

    # read in images from filepaths using torchvision
    images = [torchvision.io.read_image(filepath) for filepath in outpaths]
    images = torch.stack(images)
    embeddings = model.run_clip(images)
    return embeddings, outpaths

In [None]:
svm_dataloader = DataLoader(dataset, batch_size=32, collate_fn=collate_for_svm, shuffle=False)

In [None]:
records = []


for embeddings, paths in tqdm(svm_dataloader, "Saving Adversarial Embeddings"):
  for i, emb in enumerate(embeddings):
    records.append({"filepath": paths[i], "embedding": emb})

df = pd.DataFrame(records)
df.to_csv(os.path.join(ROOT, "CLIP_adversarial_embeddings.csv"), index=False)



In [None]:
labels = [0] * len(df)

df_adv = pd.read_csv(os.path.join(ROOT, "CLIP_adversarial_embeddings.csv"))
df_adv["embedding"] = df_adv["embedding"].apply(string_to_np)
embeddings = df_adv["embedding"].to_numpy()
embeddings = np.vstack(embeddings)
print(embeddings.shape)

model.evaluate(embeddings, labels)