## Do Images from Memorized Prompts Carry Greater Memorization Risk on Average?

In [11]:
import os
import csv
import argparse
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
from dataset import SiameseNetworkDataset
from model import SiameseNetwork

from sklearn import metrics
import Utils

In [None]:
# Instantiate the Siamese network (configured with network="ResNet-50",
# 3 input channels, 128 features) and place it on the GPU.
model = SiameseNetwork(network="ResNet-50", in_channels=3, n_features=128).to('cuda')

# Loading ckpt
CKPT_PATH = "trained_models/best_network.pth"
# Alternative checkpoint kept for reference:
# CKPT_PATH = "/raid/s2198939/diffusion_memorization/PatientVerification/checkpoints/ResNet-50_epoch1_data_handling_RPN.pth"

# map_location="cpu" makes deserialization independent of the device the
# checkpoint was saved from (e.g. a GPU index that does not exist on this
# machine); load_state_dict then copies the tensors into the model's own
# parameters, which already live on the GPU.
model.load_state_dict(torch.load(CKPT_PATH, map_location="cpu"))

# Evaluation mode; as the last expression of the cell this also displays the
# model in the notebook.
model.eval()

In [6]:
# Dataset locations: IMG_DIR is the dataset root, CSV_DIR the folder holding
# the prepared split spreadsheets.
IMG_DIR = "/raid/s2198939/MIMIC_Dataset/physionet.org/files/mimic-cxr-jpg/2.0.0"
CSV_DIR = "/raid/s2198939/MIMIC_Dataset/physionet.org/files/mimic-cxr-jpg/2.0.0/Prepared_CSV2"

# NOTE: despite the *_csv names, the split files are Excel workbooks (.xlsx).
train_csv, test_csv, val_csv = (
    os.path.join(CSV_DIR, workbook)
    for workbook in ("FINAL_TRAIN.xlsx", "FINAL_TEST.xlsx", "FINAL_VAL.xlsx")
)

In [None]:
# Load the train / test / validation split workbooks into DataFrames.
df_train = pd.read_excel(train_csv)
df_test = pd.read_excel(test_csv)
df_val = pd.read_excel(val_csv)

# Row count of each split followed by the overall total. The total is summed
# from the splits instead of reading len(df_combined): df_combined is only
# created in a later cell, so referencing it here raises NameError when the
# notebook is run top to bottom.
len(df_train), len(df_test), len(df_val), len(df_train) + len(df_test) + len(df_val)

(110340, 14065, 13487, 137892)

In [13]:
# Create paths: join every entry of each split's "path" column onto IMG_DIR.
for _split_df in (df_train, df_test, df_val):
    _split_df["path"] = _split_df["path"].apply(
        lambda rel_path: os.path.join(IMG_DIR, rel_path)
    )

# Stack the three splits (train, test, val order) into one frame; the
# original per-split index labels are kept as-is.
df_combined = pd.concat([df_train, df_test, df_val])

In [8]:
# CSV files listing the memorized and non-memorized prompts; the later cells
# read the "path" column of the memorized table.
mem_prompts_csv = "/raid/s2198939/diffusion_memorization/det_outputs_radedit/memorized_prompts_with_paths.csv"
non_mem_prompts_csv = "/raid/s2198939/diffusion_memorization/det_outputs_radedit/non_memorized_prompts_with_paths.csv"

df_mem_prompts = pd.read_csv(mem_prompts_csv)
df_non_mem_prompts = pd.read_csv(non_mem_prompts_csv)

# Display how many rows each table has.
(len(df_mem_prompts), len(df_non_mem_prompts))

(474, 141)

In [4]:
# Preprocessing applied to every image before it is fed to the model:
# resize to 224x224, then convert the PIL image to a tensor.
_preprocess_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
]
transform = transforms.Compose(_preprocess_steps)

In [9]:
def get_reidentification_score(model, img1, img2, transforms, device=None):
    """Return the sigmoid of the model's output for one image pair.

    Args:
        model: Callable taking two batched tensors and returning a
            single-element tensor (``.item()`` is called on the result).
        img1: First image, in whatever form ``transforms`` accepts.
        img2: Second image, in whatever form ``transforms`` accepts.
        transforms: Callable that turns an image into a tensor. NOTE: inside
            this function the parameter shadows the ``transforms`` module
            imported from torchvision.
        device: Device the inputs are moved to. When ``None`` (default) the
            device of the model's first parameter is used, so inputs always
            match the model; if the model exposes no parameters the previous
            hard-coded ``'cuda'`` is kept.

    Returns:
        The score as a Python float.
    """
    if device is None:
        parameters = getattr(model, "parameters", None)
        first_param = next(iter(parameters()), None) if callable(parameters) else None
        device = first_param.device if first_param is not None else 'cuda'

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        # unsqueeze(0) adds the leading batch dimension of size 1.
        batch1 = transforms(img1).unsqueeze(0).to(device)
        batch2 = transforms(img2).unsqueeze(0).to(device)
        outputs = model(batch1, batch2)
        score = torch.sigmoid(outputs)

    return score.item()

In [16]:
# Compare each image in memorized_prompts with all images in df_combined
#
# NOTE: the full comparison costs len(mem_paths) * len(all_images_paths)
# image loads and forward passes. The limits below reproduce the previous
# behaviour of this cell (a single pair, formerly enforced by two bare
# `break` statements); set a limit to None to process every image.
MAX_MEM_IMAGES = 1
MAX_REF_IMAGES = 1

mem_paths = list(df_mem_prompts["path"].values)
all_images_paths = list(df_combined["path"].values)

reid_scores = []
for mem_path in mem_paths[:MAX_MEM_IMAGES]:
    # convert() returns an independent in-memory copy, so the underlying
    # file can be closed as soon as the `with` block ends.
    with Image.open(mem_path) as mem_file:
        img1 = mem_file.convert('RGB')
    for img_path in all_images_paths[:MAX_REF_IMAGES]:
        with Image.open(img_path) as ref_file:
            img2 = ref_file.convert('RGB')
        reid_score = get_reidentification_score(model, img1, img2, transform)
        reid_scores.append(reid_score)