# Baseline for CLIP AI Detection Adversarial Attack

## Setup

In [74]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [75]:
%cd '/content/drive/MyDrive/IDL Image Generation'

/content/drive/.shortcut-targets-by-id/1SUnyLWY7LvpxPNxyFvip9Ae4S39ePRqJ/IDL Image Generation


In [59]:
!pip install transformers torch --quiet
!pip install torchattacks --quiet

[31mERROR: Operation cancelled by user[0m[31m
[0m

In [107]:
import os
import ast
import csv

import numpy as np
import pandas as pd

from PIL import Image

import torch
from transformers import CLIPProcessor, CLIPModel

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

from torchattacks import PGD
import torchvision

from tqdm import tqdm


In [108]:
DIFFUSION_MODELS = ["openjourney", "titan", "dalle", "real", "openjourney_v4", "stable_diff"]
ROOT = '/content/drive/MyDrive/IDL Image Generation'

## Process Images with CLIP

In [109]:
!ls data

adversarial_images  openjourney     real	 titan
dalle		    openjourney_v4  stable_diff


In [110]:
def collect_images(generators):
  generated_images = {}
  for directory in generators:
    print(directory, len(os.listdir(os.path.join(ROOT, "data", directory))))
    for filepath in os.listdir(os.path.join(ROOT, "data", directory)):
        full_path = os.path.join(ROOT, "data", directory, filepath)
        id_idx = filepath.rfind('_') + 1
        id = filepath[id_idx:-4]
        label = 1 if directory == "real" else 0
        generated_images[full_path] = {
            "generator": directory,
            "label": label, # 0 = fake, 1 = real
            "id": id,
        }
  return generated_images


In [111]:
images = collect_images(DIFFUSION_MODELS)

openjourney 3377
titan 2058
dalle 435
real 3633
openjourney_v4 3465
stable_diff 3346


In [65]:
print("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

model.to(device)

cuda


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e

In [84]:
def run_clip(image_path):
  with Image.open(image_path) as img:
    img = img.convert("RGB")
    if img.size != (512, 512):
      print(f"Resizing {image_path} to 512x512")
      img = img.resize((512, 512))
      img.save(image_path)
  # inputs = processor(images=img, return_tensors="pt", padding=True, truncation=True)
  inputs = processor(images=img, return_tensors="pt")
  inputs = inputs.to(device)
  with torch.no_grad():
    # image_features = model.get_image_features(**inputs)
    outputs = model.vision_model(inputs.pixel_values)
    image_features = outputs.last_hidden_state[:, 0, :]
    image_features = image_features / image_features.norm(dim=-1, keepdim=True)
    return image_features.squeeze().cpu().numpy()

In [100]:
def get_processed_filepaths(csv_path: str):
    processed_filepaths = set()
    if not os.path.isfile(csv_path):
        return set()
    try:
        df = pd.read_csv(csv_path)
        df = df[df["Generator"].isin(DIFFUSION_MODELS)]
        df = df.drop_duplicates(subset=["Generator", "Id"])
        processed_filepaths = set(df["Filepath"].values)
    except Exception as e:
        print(e)

    return processed_filepaths

In [113]:
def process_images_and_log(image_dict: dict, csv_fname: str) -> None:
    processed_filepaths = get_processed_filepaths(os.path.join(ROOT, csv_fname))
    print(f"{len(processed_filepaths)} files have already been processed")
    for path, info in tqdm(image_dict.items(), desc="Embedding Images"):
      if path in processed_filepaths and not (path.endswith(".png") or path.endswith(".jpg")):
        # print(f"{info['generator']}_{info['id']} has already been processed")
        continue

      encoding = run_clip(path)

      data = [path, info["id"], info["generator"], str(encoding), info["label"]]

      csv_path = os.path.join(ROOT, csv_fname)
      file_exists = os.path.isfile(csv_path)
      with open(csv_path, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if not file_exists:
            writer.writerow(["Filepath", "Id", "Generator", "Features", "Label"])
        writer.writerow(data)
      # print(f"{info['generator']}_{info['id']} encoded and logged to csv")
    print(f"All input images encoded!")

In [None]:
process_images_and_log(images, "CLIP_embeddings.csv")

Generator
openjourney    249
Name: count, dtype: int64
249
249 files have already been processed


Embedding Images:  21%|██▏       | 3477/16314 [03:39<3:25:58,  1.04it/s]

## Train SVM for AI Image Detection

In [13]:
class CLIPSVMDiscriminator:
    def __init__(self, model_name="openai/clip-vit-base-patch32", device=None):
        self.device = device if device is not None else torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print("Running on:", self.device)
        self.model = CLIPModel.from_pretrained(model_name)
        self.processor = CLIPProcessor.from_pretrained(model_name)
        self.model.to(self.device)
        self.model.eval()
        self.svm = SVC(kernel="linear", C=1.0, probability=True)
        self.svm_trained = False

    def run_clip(self, image_path):
        inputs = self.processor(images=img, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model.vision_model(inputs.pixel_values)
            image_features = outputs.last_hidden_state[:, 0, :]
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
            return image_features.squeeze().cpu().numpy()

    def train_svm(self, X_train, y_train):
        self.svm.fit(X_train, y_train)
        self.svm_trained = True
        train_accuracy = self.svm.score(X_train, y_train)
        print(f"Training accuracy for discriminator: {train_accuracy:.4f}")
        return self.svm

    def predict_from_embeddings(self, embeddings):
        preds = self.svm.predict(embeddings)
        probs = self.svm.predict_proba(embeddings)[:, 1]
        return preds, probs

    def evaluate(self, X_test, y_test):
        model = self.svm
        y_pred = model.predict(X_test)
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')
        auc = roc_auc_score(y_test, y_pred_proba)
        ap_per_class = []
        for class_label in np.unique(y_test):
            y_test_binary = (y_test == class_label).astype(int)
            ap = average_precision_score(y_test_binary, y_pred_proba)
            ap_per_class.append(ap)
        map_score = np.mean(ap_per_class)
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(f"AUC: {auc:.4f}")
        print(f"mAP: {map_score:.4f}")
        return {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "auc": auc,
            "map": map_score
        }

In [16]:
def string_to_np(feature_str):
    embedding_list = np.fromstring(feature_str[1:-1], sep=' ')
    assert len(embedding_list) == 768
    return np.array(embedding_list)


df = pd.read_csv(os.path.join(ROOT, "CLIP_embeddings.csv"), usecols=range(5))
# Filter df for rows where Generator is in DIFFUSION MODELS
df = df[df["Generator"].isin(DIFFUSION_MODELS)]
df["Features"] = df["Features"].apply(string_to_np).to_numpy()
X = np.stack(df["Features"])
y = df["Label"].to_numpy()

df["Generator"].value_counts()

In [93]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

Unnamed: 0_level_0,count
Generator,Unnamed: 1_level_1
stablediff,6810
openjourney,6754
real,3633
openjourney_v4,3465
titan,2084
stable_diff,999
dalle,435


In [17]:
print(X.shape, y.shape)

(16312, 768) (16312,)


In [21]:
model = CLIPSVMDiscriminator()


model.train_svm(X_train, y_train)
print("Testing Metrics:")
metrics = model.evaluate(X_test, y_test)

Running on: cuda
Training accuracy for discriminator: 0.9981
Testing Metrics:
Accuracy: 0.9975
Precision: 0.9975
Recall: 0.9975
F1 Score: 0.9975
AUC: 0.9998
mAP: 0.7850


## Evaluate method on Unseen Generators

In [None]:
for generator in ["openjourney", "titan", "dalle", "openjourney_v4"]:
  if generator == "real":
    continue

  test_df = df[df['Generator'] == generator]
  train_df = df[df['Generator'] != generator]

  # Reset the index for both partitions
  test_df = test_df.reset_index(drop=True)
  train_df = train_df.reset_index(drop=True)

  X_test = np.stack(test_df["Features"].apply(string_to_np).to_numpy())
  y_test = test_df["Label"].to_numpy()

  X_train = np.stack(train_df["Features"].apply(string_to_np).to_numpy())
  y_train = train_df["Label"].to_numpy()

  print(f"Results for no-{generator} model")
  svm = train(X_train, y_train)
  print(f"Results on {generator}:")
  metrics = evaluate(svm, X_test, y_test)
  print("******************************************")



Results for no-openjourney_v4 model
Training accuracy: 0.9978
Results on openjourney_v4:
Accuracy: 0.9968
Precision: 1.0000
Recall: 0.9968
F1 Score: 0.9984
AUC: nan
mAP: 1.0000
******************************************


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Prepare PGD Attack on Method

In [22]:
from torchattacks import PGD

class CLIPPGDAttack(PGD):
    def __init__(self, model, eps=8/255, alpha=2/255, steps=10, random_start=True):
        super().__init__(model, eps, alpha, steps, random_start)

    def get_logits(self, inputs):
        if self._normalization_applied is False:
            inputs = self.normalize(inputs)

        # Get image features from the vision model
        vision_outputs = self.model.vision_model(inputs)
        image_features = vision_outputs.last_hidden_state[:, 0, :]

        return image_features

In [20]:
discriminator = model.model
processor = model.processor

attack = CLIPPGDAttack(discriminator, eps=0.3, alpha=2/255, steps=10, random_start=True)


for image_path in tqdm(images, desc="Adversarial Attacks"):
  input_filename = os.path.basename(image_path)
  outpath = os.path.join("data", "adversarial_images", images[image_path]["generator"], f"adversarial_{input_filename}.png")
  if os.path.exists(outpath):
    continue
  os.makedirs(os.path.join("data","adversarial_images", images[image_path]["generator"]), exist_ok=True)


  # Load the image to torch
  image = torchvision.io.read_image(image_path)
  image = processor(images=image, return_tensors="pt")["pixel_values"]

  target_label = torch.tensor([0]).to(model.device)

  # Generate adversarial example
  adversarial_image = attack(image, target_label)

  # Save the adversarial image


  # print(f"Saving adversarial image to {outpath}")
  torchvision.utils.save_image(adversarial_image.squeeze(0), outpath)


Adversarial Attacks:   0%|          | 12/9502 [01:09<15:14:17,  5.78s/it]


KeyboardInterrupt: 