# Hidden Characters

Hi! Welcome to the official Colab demo for our project "Diffusion Illusions: Hiding Images in Plain Sight". [https://ryanndagreat.github.io/Diffusion-Illusions/](https://ryanndagreat.github.io/Diffusion-Illusions/)

This project was inspired by our paper "Peekaboo: Text to Image Diffusion Models are Zero-Shot Segmentors". The Peekaboo project website: [https://ryanndagreat.github.io/peekaboo/](https://ryanndagreat.github.io/peekaboo/)

Instructions:

0. Go to the Runtime menu, and make sure this notebook is using GPU!
1. Run the top 2 code cells (one cleans colab's junk and downloads the source code, while the other installs python packages)
2. Click 'Runtime', then 'Restart Runtime'. You need to do this the first time you open this notebook to avoid weird random errors from the pip installations.
3. Run code cells to load stable diffusion. The first time you run it it will take a few minutes to download; subsequent times won't take long at all though.
4. Run all the cells below that, and customize prompt_a, prompt_b, prompt_c, prompt_d, and prompt_z!
5. Take the resulting images (image_a, image_b, image_c, and image_d), print them out, overlay them, and shine a backlight through them as shown on the Diffusion Illusions website (link above!)

I may also create a YouTube tutorial if there's interest. Let me know if this would be helpful!

This notebook was written by Ryan Burgert. Feel free to reach out to me at rburgert@cs.stonybrook.edu if you have any questions! 

In [None]:
import numpy as np
import rp
import torch
import torch.nn as nn
import source.stable_diffusion as sd
from easydict import EasyDict
from source.learnable_textures import LearnableImageFourier
from source.stable_diffusion_labels import NegativeLabel
from itertools import chain
import time

In [None]:
#ONLY GOOD PROMPTS HERE
#Load the bundled example prompts and list the names that can be used in `title` below
example_prompts = rp.load_yaml_file('source/example_prompts.yaml')
print('Available example prompts:', ', '.join(example_prompts))

#`title` holds five space-separated example-prompt names: a, b, c, d and then the hidden z.
#It is also used as the run name when saving the timelapse.
#Other combinations that were tried (swap one in to use it):
#    'miku froggo lipstick kitten_in_box darth_vader'
#    'miku miku miku miku picard'
#    'froggo froggo froggo froggo porche'
title = 'pencil_cow pencil_penguin pencil_dog_head pencil_giraffe_head pencil_cat_head'

#These prompts are all strings - you can replace them with whatever you want! By default it lets you choose from example prompts
#Prompts a,b,c,d are the normal looking images
#Prompt z is the hidden image you get when you overlay them all on top of each other
prompt_a, prompt_b, prompt_c, prompt_d, prompt_z = rp.gather(example_prompts, title.split())

negative_prompt = ''

#Manual override of the third prompt (replaces whatever `title` selected for prompt_c)
prompt_c = "an intricate detailed hb pencil sketch of a puppy dog bichon head"

#Restyle every prompt: swap the pencil-sketch phrase for the oil-painting phrase.
#Prompts that do not contain SK are left unchanged by str.replace.
SK = 'hb pencil sketch'
CO = 'photorealistic color oil painting'
prompt_a, prompt_b, prompt_c, prompt_d, prompt_z = (
    prompt.replace(SK, CO)
    for prompt in (prompt_a, prompt_b, prompt_c, prompt_d, prompt_z)
)

#Report the final configuration
print()
print('Negative prompt:',repr(negative_prompt))
print()
print('Chosen prompts:')
print('    prompt_a =', repr(prompt_a))
print('    prompt_b =', repr(prompt_b))
print('    prompt_c =', repr(prompt_c))
print('    prompt_d =', repr(prompt_d))
print('    prompt_z =', repr(prompt_z))

# Load Stable Diffusion

In [None]:
#Only construct the model when `s` is not already defined, so that
#re-running this cell in a live kernel reuses the existing instance.
try:
    s
except NameError:
    model_name = "CompVis/stable-diffusion-v1-4"
    gpu = 'cuda:0'
    s = sd.StableDiffusion(gpu, model_name)

device = s.device

In [None]:
#One NegativeLabel per prompt, each paired with the shared negative prompt.
#Construction order is a, b, c, d, z.
label_a, label_b, label_c, label_d, label_z = (
    NegativeLabel(prompt, negative_prompt)
    for prompt in (prompt_a, prompt_b, prompt_c, prompt_d, prompt_z)
)

In [None]:
#Image Parametrization and Initialization (this section takes vram)

#Select Learnable Image Size (this has big VRAM implications!):
#Note: We use implicit neural representations for better image quality
#They're previously used in our paper "TRITON: Neural Neural Textures make Sim2Real Consistent" (see tritonpaper.github.io)
# ... and that representation is based on Fourier Feature Networks (see bmild.github.io/fourfeat)
#
#Alternative presets (to use one, change both the factory below and SIZE):
#    256: LearnableImageFourier(height=256, width=256, hidden_dim=256, num_features=128)
#    512: LearnableImageFourier(height=512,width=512,num_features=256,hidden_dim=256,scale=20)
SIZE = 384

def learnable_image_maker():
    """Build a fresh 384x384 LearnableImageFourier and move it to the model's device."""
    fourier_image = LearnableImageFourier(
        height=384,
        width=384,
        num_features=256,
        hidden_dim=256,
        scale=15,
    )
    return fourier_image.to(s.device)

#Four independently initialized learnable images, created in the order a, b, c, d
image_a, image_b, image_c, image_d = (learnable_image_maker() for _ in range(4))

In [None]:
CLEAN_MODE = True # If it's False, we augment the images by randomly simulating how good a random printer might be when making the overlays...

def simulate_overlay(a,b,c,d):
    """Simulate stacking four printed layers and shining a backlight through them.

    The layers are multiplied together elementwise, scaled by a brightness
    factor, clamped to [0, 99] and squashed with tanh.

    When CLEAN_MODE is true the simulation is deterministic (exponent 1,
    brightness 3, no black-level change). Otherwise the exponent, brightness
    and black level are drawn with rp.random_float, and each layer is
    independently blended toward the black level by a random amount.

    Args:
        a, b, c, d: The four layer images. They are combined with elementwise
            arithmetic, so they must be mutually broadcastable
            (presumably same-shaped image tensors - TODO confirm).

    Returns:
        The simulated overlay, the result of `.clamp(0,99).tanh()`.
    """
    if CLEAN_MODE:
        exp=1
        brightness=3
    else:
        exp=rp.random_float(.5,1)
        brightness=rp.random_float(1,5)
        black=rp.random_float(0,.5)
        #Blend every layer toward the shared black level, each by its own random amount.
        #(This branch used to reference undefined `bottom`/`top` variables and crashed.)
        a,b,c,d=[rp.blend(layer,black,rp.random_float()) for layer in (a,b,c,d)]
    return (a**exp * b**exp *c**exp * d**exp * brightness).clamp(0,99).tanh()

#Zero-argument renderers. Each one looks up its image module at call time,
#so re-creating image_a..image_d is picked up without redefining these.
def learnable_image_a():
    return image_a()

def learnable_image_b():
    return image_b()

def learnable_image_c():
    return image_c()

def learnable_image_d():
    return image_d()

def learnable_image_z():
    #The hidden image is never stored: it is always derived from the four layers
    return simulate_overlay(image_a(), image_b(), image_c(), image_d())

#A single optimizer over the parameters of all four layers
params = chain.from_iterable(
    module.parameters() for module in (image_a, image_b, image_c, image_d)
)
optim = torch.optim.SGD(params, lr=1e-4)

In [None]:
#Parallel lists: labels[i] is the text target for learnable_images[i]
labels = [
    label_a,
    label_b,
    label_c,
    label_d,
    label_z,
]
learnable_images = [
    learnable_image_a,
    learnable_image_b,
    learnable_image_c,
    learnable_image_d,
    learnable_image_z,
]

#The weight coefficients for each prompt. For example, if we have [1,1,1,1,5], then the hidden prompt (prompt_z) will be prioritized
weights = [1, 1, 1, 1, 3]

#Normalize so the weights sum to the number of prompts (i.e. their mean is 1)
weights = rp.as_numpy_array(weights)
weights = weights / weights.sum() * len(weights)

In [None]:
#Timelapse frames collected by the training loop.
#Re-running this cell discards any frames gathered so far.
ims = []

In [None]:
def get_display_image():
    """Render the current state of every image as one tiled preview.

    The panels are every entry of `learnable_images` except the last,
    followed by a fresh render of `learnable_image_z`. They are tiled with
    `length=len(learnable_images)` and no border.

    Returns:
        Whatever `rp.tiled_images` returns for the panels
        (presumably a single numpy image - TODO confirm).
    """
    panels = []
    for render in learnable_images[:-1]:
        panels.append(rp.as_numpy_image(render()))
    panels.append(rp.as_numpy_image(learnable_image_z()))

    return rp.tiled_images(
        panels,
        length=len(learnable_images),
        border_thickness=0,
    )

In [None]:
from IPython.display import clear_output

NUM_ITER=10000

#Set the minimum and maximum noise timesteps for the dream loss (aka score distillation loss)
s.max_step=MAX_STEP=990
s.min_step=MIN_STEP=10 

display_eta=rp.eta(NUM_ITER, title='Status: ')

#How often (in iterations) the preview is shown in the notebook
DISPLAY_INTERVAL = 200

#How often (in iterations) a preview frame is appended to `ims` for the timelapse.
#1 records every iteration. Every frame stays in RAM until saved, so raise this
#(for example to DISPLAY_INTERVAL) if a long run exhausts memory.
TIMELAPSE_INTERVAL = 1

print('Every %i iterations we display an image in the form [image_a, image_b, image_c, image_d, image_z] where'%DISPLAY_INTERVAL)
print('    image_z = image_a * image_b * image_c * image_d')
print()
print('Interrupt the kernel at any time to return the currently displayed image')
print('You can run this cell again to resume training later on')
print()
print('Please expect this to take hours to get good images (especially on the slower Colab GPU\'s! The longer you wait the better they\'ll be')

try:
    for iter_num in range(NUM_ITER):
        display_eta(iter_num) #Print the remaining time

        #Train against one randomly chosen (label, image, weight) triple per iteration.
        #The return value of train_step is not needed here, so it is not kept.
        for label,learnable_image,weight in rp.random_batch(list(zip(labels,learnable_images,weights)), batch_size=1):
            s.train_step(
                label.embedding,
                learnable_image()[None],

                #PRESETS (uncomment one):
                noise_coef=.1*weight,guidance_scale=60,#10
                # noise_coef=0,image_coef=-.01,guidance_scale=50,
                # noise_coef=0,image_coef=-.005,guidance_scale=50,
                # noise_coef=.1,image_coef=-.010,guidance_scale=50,
                # noise_coef=.1,image_coef=-.005,guidance_scale=50,
                # noise_coef=.1*weight, image_coef=-.005*weight, guidance_scale=50,
            )

        should_record = not iter_num%TIMELAPSE_INTERVAL
        should_display = not iter_num%DISPLAY_INTERVAL

        if should_record or should_display:
            #The preview is for viewing only, so render it without building an autograd graph
            with torch.no_grad():
                im = get_display_image()

            if should_record:
                ims.append(im)

            if should_display:
                if iter_num and not iter_num%(DISPLAY_INTERVAL*50):
                    #Wipe the slate every 50 displays so they don't get cut off
                    clear_output()
                rp.display_image(im)

        optim.step()
        optim.zero_grad()
except KeyboardInterrupt:
    print()
    print('Interrupted early at iteration %i'%iter_num)
    #Drop any gradients left over from the interrupted iteration so that
    #resuming (re-running this cell) does not apply them on its first step
    optim.zero_grad()
    with torch.no_grad():
        im = get_display_image()
    ims.append(im)
    rp.display_image(im)

In [None]:
#Show the current render of each layer, then the simulated overlay, with a caption above each
for caption, render in (
    ('Image A', learnable_image_a),
    ('Image B', learnable_image_b),
    ('Image C', learnable_image_c),
    ('Image D', learnable_image_d),
    ('Image Z', learnable_image_z),
):
    print(caption)
    rp.display_image(rp.as_numpy_image(render()))

In [None]:
def save_run(name):
    """Save the collected timelapse frames (`ims`) as an mp4 and as numbered PNGs.

    Output goes under "untracked/hidden_character_runs/<name>". If that path
    already exists, the current Unix time is appended to the folder name so an
    earlier run is not overwritten. The video is written next to the folder as
    "<folder>.mp4" and the frames inside it as ims_0000.png, ims_0001.png, ...

    Args:
        name: Run name, used as the final path component.
    """
    folder = "untracked/hidden_character_runs/%s" % name
    if rp.path_exists(folder):
        folder = folder + '_%i' % time.time()
    rp.make_directory(folder)

    frame_names = ['ims_%04i.png' % index for index, _ in enumerate(ims)]

    print()
    rp.save_video_mp4(ims, folder + '.mp4', video_bitrate='high')

    #Write the frames with the run folder as the working directory,
    #so the bare file names above land inside it
    with rp.SetCurrentDirectoryTemporarily(folder):
        rp.save_images(ims, frame_names, show_progress=True)

    print('Saved timelapse to folder:',repr(folder))

save_run(title) #You can give it a good custom name if you want!

In [None]:
import torch.nn.functional as F


#Settings for the refinement ("re-denoising") pass below.
#
#Other presets that were tried (copy one set of values over the active one to use it):
#  Medium Light: GUIDANCE=32 ; TIMESTEPS=range(100, 75, -1)   ; EMA_ALPHA=.2 ; ORIG_ALPHA=.05  ; NEG='blurry unfocused low quality bokeh, depth of field'
#  Medium Light: GUIDANCE=32 ; TIMESTEPS=range(100, 75, -1)   ; EMA_ALPHA=.2 ; ORIG_ALPHA=.015 ; NEG='blurry unfocused low quality bokeh, depth of field'
#  Medium Harsh: GUIDANCE=32 ; TIMESTEPS=range(300, 75, -5)   ; EMA_ALPHA=.2 ; ORIG_ALPHA=.055 ; NEG='blurry unfocused low quality bokeh, depth of field'
#  Medium Harsh: GUIDANCE=32 ; TIMESTEPS=range(300, 75, -5)   ; EMA_ALPHA=.2 ; ORIG_ALPHA=.025 ; NEG='blurry unfocused low quality bokeh, depth of field'
#  Medium Harsh: GUIDANCE=16 ; TIMESTEPS=range(300, 200, -1)  ; EMA_ALPHA=.2 ; ORIG_ALPHA=.025 ; NEG='blurry unfocused low quality bokeh, depth of field'
#  Aggressive:   GUIDANCE=32 ; TIMESTEPS=range(500, 10, -10)  ; EMA_ALPHA=.2 ; ORIG_ALPHA=.01  ; NEG='blurry unfocused low quality bokeh, depth of field'
#  Complete:     GUIDANCE=4  ; TIMESTEPS=range(999, 500, -10) ; EMA_ALPHA=1  ; ORIG_ALPHA=0    ; NEG='oversaturated, blurry unfocused low quality bokeh, depth of field, unrealistic, abstract, deep fried'
#  Complete:     GUIDANCE=7  ; TIMESTEPS=range(999, 10, -2)   ; EMA_ALPHA=.1 ; ORIG_ALPHA=0    ; NEG=''
#
#Active preset ("Complete"):
GUIDANCE = 32                  #Guidance scale used as the default for the denoising step
TIMESTEPS = range(999, 10, -2) #Timestep schedule driving the refinement loop
EMA_ALPHA = .05                #Blend weight toward each newly denoised latent
ORIG_ALPHA = 0                 #Blend weight pulling the latent back toward the original
NEG = ''                       #Negative prompt used during refinement

#If COMPLETE is True, generate the images totally from scratch using the default method
COMPLETE = False

#Show a timelapse of each diffusion process. Can take a while to load into the notebook.
SHOW_ANIMS = True

@rp.monkey_patch(sd.StableDiffusion)
def redenoise_latent(self,                   text_embeddings:torch.Tensor,
                   latent:torch.Tensor,
                   guidance_scale:float=GUIDANCE,
                   t:int=None,):
    """Noise a latent to timestep `t`, then predict its denoised version in one step.

    Attached to sd.StableDiffusion through rp.monkey_patch. The whole
    prediction runs under torch.no_grad().

    Args:
        text_embeddings: Embeddings passed to self.predict_noise. The
            prediction is split in two and treated as (unconditional, text),
            so this presumably holds both halves - TODO confirm.
        latent: A single latent without a batch axis (one is added here).
        guidance_scale: Classifier-free guidance strength. The default is the
            value GUIDANCE had when this function was defined.
        t: Noise timestep. When None, one is drawn uniformly from
            [self.min_step, self.max_step]. At exactly 999 the input latent
            is ignored and replaced with pure noise.

    Returns:
        The predicted clean latent, with the batch axis removed again.

    Raises:
        AssertionError: if t is outside [0, self.num_train_timesteps).
    """
    if t is None:
        t = torch.randint(self.min_step, self.max_step + 1, [1], dtype=torch.long, device=self.device)

    assert 0<=t<self.num_train_timesteps, 'invalid timestep t=%i'%t

    batched_latent = latent[None]

    # No gradients are needed anywhere in this prediction
    with torch.no_grad():
        # The noise is always sampled, even when it ends up replacing the latent outright
        noise = torch.randn_like(batched_latent)

        if t==999:
            #Eh sometimes I want to have complete noise
            noisy_latent = noise+0
        else:
            #The add_noise function is identical for PNDM, DDIM, and DDPM schedulers in the diffusers library
            noisy_latent = self.add_noise(batched_latent, noise, t)
        #TODO: Expand this add_noise function, and put it in this class. That way we don't need the scheduler...and we can also add an inverse function, which is what I need for previews...that subtracts noise...
        #Also, create a dream-loss-based image gen example notebook...

        # Duplicate the input so one pass yields both noise predictions
        doubled_input = torch.cat([noisy_latent] * 2)
        predicted = self.predict_noise(doubled_input, text_embeddings, t)

        # Classifier-free guidance: push the unconditional prediction toward the text one
        uncond_noise, text_noise = predicted.chunk(2)
        guided_noise = uncond_noise + guidance_scale * (text_noise - uncond_noise)

        denoised = self.remove_noise(noisy_latent, guided_noise, t)

    return denoised[0]


    
def denoise_l(latent,label,T):
    """Run s.redenoise_latent on `latent` for `label` at integer timestep `T`.

    Args:
        latent: Latent forwarded unchanged to s.redenoise_latent.
        label: Object whose `.embedding` is used as the text embeddings.
        T: Timestep, wrapped in a torch int tensor before the call.

    Returns:
        Whatever s.redenoise_latent returns.
    """
    embedding = label.embedding
    timestep = torch.tensor(T, dtype=torch.int)
    return s.redenoise_latent(latent=latent, text_embeddings=embedding, t=timestep)
    
def get_ii_seqo(w=learnable_image_z, lw=label_z):
    """Refine one learned image by repeatedly re-denoising its latent.

    The image is resized to 512x512, encoded with s.encode_img, and then
    passed through denoise_l once per entry of TIMESTEPS. After each step the
    running latent is blended toward the new prediction (EMA_ALPHA) and back
    toward the original encoding (ORIG_ALPHA). Intermediate results are
    decoded and collected as animation frames.

    If COMPLETE is true, none of that happens: the label's own sample image is
    displayed and returned instead.

    Args:
        w: Zero-argument callable returning the image tensor to refine.
            The default is bound when this function is defined.
        lw: Label for the image. Its `.name` is used to build a new
            NegativeLabel with NEG as the negative prompt; when COMPLETE is
            true its `.get_sample_image()` is used instead.

    Returns:
        A tuple (ii, seqo): the final decoded image, and the list of
        intermediate frames (each passed through rp.cv_resize_image with .5).
        When COMPLETE is true this is (out, [out]).

    NOTE(review): if TIMESTEPS is empty the loop never runs and `ii` is
    unbound at the final display, which would raise a NameError.
    """
    seqo=[]
    
    if COMPLETE:
        # Shortcut: skip refinement and use the label's own sample image
        out=lw.get_sample_image()
        rp.display_image(out)
        return out,[out]
    
    with torch.no_grad():
        # w,lw=learnable_image_x,label_x
        # w,lw=learnable_image_y,label_y
        # w,lw=learnable_image_z,label_z

        # Rebuild the label so that the refinement negative prompt (NEG) is used
        lw=NegativeLabel(lw.name,NEG)

        # Render the image, then resize it to 512x512 before encoding
        w=w()
        i=w
        i = F.interpolate(i[None], (512, 512), mode='bilinear', align_corners=False)[0]
        l=s.encode_img(i) # running latent, updated every step
        ol=l              # original latent, kept as an anchor for the ORIG_ALPHA blend

        # Show the starting point
        rp.display_image(rp.as_numpy_image(i))

        # for T in [10]*100:
        # for T in [100,100,100,100,100]*10:
        # for T in [100,100,100,100,100]:
        rp.tic() # start the timer that throttles the intermediate previews below
        # for T in list(range(999, 0, -10)):
        # for T in list(range(500, 0, -10)):
        # for T in list(range(999, 0, -50)):
        # for T in list(range(100, 0, -1)):
        did_999=False
        for T in list(TIMESTEPS):
            # Timestep 999 is handled at most once; later occurrences are skipped
            if T==999:
                if did_999: continue
                did_999=True
                
            # Every other entry is replaced by a random non-999 timestep from TIMESTEPS,
            # so the schedule sets the number of steps and the pool of values, not their order
            if T!=999:
                T=rp.random_element(set(TIMESTEPS)-{999})
            # torch.manual_seed(298)

            dl=denoise_l(l, lw, T)
            # l=rp.blend(l,dl,1)#Stick to the previous, EMA, make it smooth! Less variance
            # l=rp.blend(l,dl,.2)#Stick to the previous, EMA, make it smooth! Less variance
            # l=rp.blend(l,dl,.01)#Stick to the previous, EMA, make it smooth! Less variance
            if T!=999:
                l=rp.blend(l,dl,EMA_ALPHA)#Stick to the previous, EMA, make it smooth! Less variance

                # l=rp.blend(l,ol,.05)#Stick to the original
                l=rp.blend(l,ol,ORIG_ALPHA)#Stick to the original
            else:
                # At T==999 the prediction replaces the running latent outright (no blending)
                l=dl
                # l=dl

            from IPython.display import clear_output
            # clear_output()

            # Decode the current latent, log the timestep, and store a frame for the slideshow
            ii=rp.as_numpy_image(s.decode_latent(l))
            print(T)
            seqo.append(rp.cv_resize_image(ii,.5))
            # Throttle previews using the tic/toc timer (presumably seconds - TODO confirm)
            if rp.toc()>10:
                rp.display_image(ii)
                rp.tic()
        # Always show the final result
        rp.display_image(ii)
        return ii,seqo

In [None]:
#Refine image A; optionally show the intermediate frames as a slideshow
AI, seqo = get_ii_seqo(w=learnable_image_a, lw=label_a)
if SHOW_ANIMS:
    rp.display_image_slideshow(seqo)

In [None]:
#Refine image B; optionally show the intermediate frames as a slideshow
BI, seqo = get_ii_seqo(w=learnable_image_b, lw=label_b)
if SHOW_ANIMS:
    rp.display_image_slideshow(seqo)

In [None]:
#Refine image C; optionally show the intermediate frames as a slideshow
CI, seqo = get_ii_seqo(w=learnable_image_c, lw=label_c)
if SHOW_ANIMS:
    rp.display_image_slideshow(seqo)

In [None]:
#Refine image D; optionally show the intermediate frames as a slideshow
DI, seqo = get_ii_seqo(w=learnable_image_d, lw=label_d)
if SHOW_ANIMS:
    rp.display_image_slideshow(seqo)

In [None]:
#Refine the hidden overlay image Z; optionally show the intermediate frames as a slideshow
ZI, seqo = get_ii_seqo(w=learnable_image_z, lw=label_z)
if SHOW_ANIMS:
    rp.display_image_slideshow(seqo)

In [None]:
#################################

#Show the five refined results (A, B, C, D, Z) tiled into a single image
rp.display_image(
    rp.tiled_images([AI, BI, CI, DI, ZI])
)


##########

In [None]:
#######################

#Resize each refined result to the learnable-image resolution (SIZE x SIZE) and move the
#stack to the model's device. The order A, B, C, D, Z must match the order in which the
#distillation loop below indexes down_images.
down_images = rp.as_torch_images(
    rp.as_numpy_array(
        [
            rp.cv_resize_image(rp.as_numpy_image(refined), (SIZE, SIZE))
            for refined in [AI, BI, CI, DI, ZI]
        ]
    )
).to(s.device)

###########

In [None]:
######################

#Scratch list; nothing in the visible notebook reads or appends to it
shlump = []

############

In [None]:
#####################

from source.msssim import msssim

#Fit the learnable images to the refined targets in down_images.
#Each iteration picks one image in round-robin order (a, b, c, d, z) and takes one
#optimizer step on a weighted MSE / MS-SSIM loss against its target.

#Per-image weights, in the order a, b, c, d, z. If one of the images looks sus, add more weight!
#A weight of 0 skips that image entirely.
#Other settings that were tried:
#    [1,1,1,1,5]
#    [0,0,0,0,1]   #Fix Z and only go to it
#    [1,1,1,.3,1]
#    [1,1,1,.3,5]
#    [1,1,1,1,10]
WEIGHTS=[1,1,1,1,2]
WEIGHTS=rp.as_numpy_array(WEIGHTS)
WEIGHTS=WEIGHTS/WEIGHTS.sum()

#Loss mix. Other settings that were tried: MSSSIM_COEF of 0, .5 or 1 (with MSE_COEF = 1)
MSSSIM_COEF = .2 ; MSE_COEF = 1

#The renderers being fitted; index i is fitted against down_images[i]
fit_images=[
    learnable_image_a,
    learnable_image_b,
    learnable_image_c,
    learnable_image_d,
    learnable_image_z,
]

#Fail early with a clear message instead of part-way through the loop
if len(WEIGHTS)!=len(fit_images):
    raise ValueError('WEIGHTS must have one entry per image: expected %i, got %i'%(len(fit_images),len(WEIGHTS)))
if not (MSE_COEF or MSSSIM_COEF):
    raise ValueError('At least one of MSE_COEF and MSSSIM_COEF must be nonzero, otherwise there is no loss to optimize')

for step in range(10000):
    if not step%500:
        #Periodic preview, using the same tiled layout as the training loop
        with torch.no_grad():
            shlumper=get_display_image()
        rp.display_image(shlumper)

    index=step%len(fit_images)

    if not WEIGHTS[index]:continue

    limage=fit_images[index]()
    dimage=down_images[index]
    loss=0

    #MS-SSIM is subtracted, so a larger msssim value lowers the loss
    if MSE_COEF   : loss=loss+  ((limage-dimage)**2).mean()       * MSE_COEF
    if MSSSIM_COEF: loss=loss-  msssim(limage[None],dimage[None]) * MSSSIM_COEF

    #NOTE(review): the factor 4 looks like an image count left over from a four-image
    #version of this notebook (there are five images here). It is kept unchanged so the
    #effective step size stays the same - confirm whether len(fit_images) was intended.
    total_loss=loss*4*WEIGHTS[index]

    #Large constant scale on the loss (the optimizer's learning rate is tiny)
    total_loss=total_loss*10000

    total_loss.backward()
    optim.step()
    optim.zero_grad()

    if not step%51:
        print(total_loss)