## MANUAL COUNTER-VIDEO

In [1]:
import os
os.environ['PYTORCH_ENABLE_MPS_FALLBACK']='1' #For if using cotracker + mac + MPS

import rp
import einops
import numpy as np
import sys
import torch
from icecream import ic

# Put the repository root (two directories above the current working directory) and its
# gaussblobs folder on sys.path so the project-local import below resolves.
# NOTE(review): rp.printed presumably prints its argument and returns it - confirm.
das_root = rp.printed(rp.get_absolute_path('../..'))
sys.path.append(das_root)
sys.path.append(rp.printed(rp.get_absolute_path(f'{das_root}/source/gaussblobs')))
from source.gaussblobs.render_tracks import draw_blobs_videos


# Pick the torch device that is handed to the point tracker further down.
if rp.currently_running_mac(): 
    device = torch.device('cpu') #Don't use MPS - we're using TapNext because it's a lot better
else:
    device = rp.select_torch_device(reserve=True, prefer_used=True)
    
ic(das_root, device);

ic| das_root: '/Users/ryan/CleanCode/Projects/Google2025/DiffusionAsShader'
    device: device(type='cpu')


/Users/ryan/CleanCode/Projects/Google2025/DiffusionAsShader
/Users/ryan/CleanCode/Projects/Google2025/DiffusionAsShader/source/gaussblobs


In [2]:
# Add the location of the 'rp.git.tapnet' module to sys.path, then echo the path that was added.
# NOTE(review): presumably required by rp.run_tapnext, which is called in a later cell - confirm.
sys.path.append(rp.get_module_path_from_name('rp.git.tapnet'))
print(sys.path[-1])

/Users/ryan/miniconda3/envs/google25/lib/python3.12/site-packages/rp/git/tapnet


In [3]:
# Source clip and the text prompt that is later handed to the diffusion pipeline.
input_video_path = "MoveTheCar.mp4"
prompt = 'A minivan with a bunch of colorful baloons is driving through a dusty desert highway with power pylons in the top right of the screen'
# Load the clip and bring it to the size used by the rest of the notebook:
# 49 frames of 480x720 (the same numbers run_pipe passes as num_frames/height/width),
# as float images in one numpy array (validated later as 'numpy: T H W RGB').
input_video = rp.load_video(input_video_path)
input_video = rp.resize_list(input_video, 49)  # NOTE(review): presumably resamples the frame list to 49 entries - confirm
input_video = rp.resize_images_to_hold(input_video, height=480, width=720)
input_video = rp.crop_images(input_video, height=480, width=720, origin='center')
input_video = rp.as_float_images(input_video)
input_video = rp.as_numpy_array(input_video)

ic(input_video_path, prompt);

load_video: path='MoveTheCar.mp4': done loading frames, creating numpy array...done.


ic| input_video_path: 'MoveTheCar.mp4'
    prompt: ('A minivan with a bunch of colorful baloons is driving through a dusty desert '
             'highway with power pylons in the top right of the screen')


In [4]:
def gridded_video(input_video):
    """
    Returns input_video as float images with a coordinate grid drawn over it:
    lines of value .5 every 20 pixels, lines of value 1 every 100 pixels,
    each frame labeled with its frame index, and a green progress bar at the top.
    """
    frames = rp.as_float_images(input_video)

    # Fine grid first, then the coarse grid, so the coarse lines win where both coincide
    for spacing, brightness in ((20, .5), (100, 1)):
        frames[:, ::spacing, :] = brightness
        frames[:, :, ::spacing] = brightness

    frame_indices = range(len(frames))
    frames = rp.labeled_images(frames, frame_indices, size=30, font='Arial')
    frames = rp.video_with_progress_bar(frames, bar_color='green', position='top', size=5)
    return frames

rp.display_video(gridded_video(input_video))

0
This browser does not support the video tag.


In [5]:
#In TXY form
# Query points handed to the tracker: one [frame index, x, y] row per point.
# The trailing comment on each row gives the point's index, the color it is drawn with
# (same order as `colors` below) and what it marks in the clip.
init_points = [
    [0, 580, 300], #0 white ::  Car
    [0, 680, 260], #1 red ::  Balloons
    [0, 650, 365], #2 green ::  License Plate
    [32, 600, 60], #3 blue ::  Road 1
    # [0, 343, 185], #3 blue ::  Road 1
    [27, 100, 200], #4 cyan ::  Left of road
    [24, 600, 200], #5 magenta ::  Right dirt
    [48, 400, 250], #6 yellow ::  
]



# One color name per query point, in the same order as init_points
colors = 'white red green blue cyan magenta yellow'.split()

# List of rows -> one numpy array (validated further down as 'numpy: N TXY')
init_points = np.stack(init_points)

def draw_points_on_video(video, points, colors=colors, visible=None):
    """
    Draws a circle (radius 15, rim 3) for every visible point on its frame of the video

    video:   sequence of frames
    points:  iterable of (t, x, y); t selects the frame the circle is drawn on
    colors:  one color per point, paired up in order
    visible: one flag per point, falsy entries are skipped (default: everything is drawn)

    Returns the frames as byte images stacked into one numpy array
    """
    flags = [True] * len(points) if visible is None else visible

    frames = list(video)
    for (t, x, y), color, shown in zip(points, colors, flags):
        if not shown:
            continue
        frames[t] = rp.cv_draw_circle(frames[t], x, y, radius=15, rim=3, color=color, copy=False)

    return np.stack(rp.as_byte_images(frames, copy=False))

# tracks, visibles = rp.run_cotracker(
# Track every query point through the clip with TapNext
# (the commented-out line above is the CoTracker alternative).
tracks, visibles = rp.run_tapnext(
    input_video, 
    device = device,
    queries = init_points,
)
# tracks   = torch.tensor(tracks  ).to(device=device)
# visibles = torch.tensor(visibles).to(device=device)

# visibles[:]=1

# Check that tracker output, query points and video agree on their dimensions,
# and pull out T and N (requested via return_dims) for use by later cells.
T, N = rp.validate_tensor_shapes(
    tracks      = 'torch: T N XY',
    visibles    = 'torch: T N',
    init_points = 'numpy:   N TXY',
    input_video = 'numpy: T H W RGB',
    TXY=3,
    XY=2,
    RGB=3,
    return_dims='T N',
)

def draw_tracks(input_video, tracks=tracks, visibles=visibles):
    """
    Draws every track point onto input_video, one circle per point per frame

    input_video: the video to draw on, indexed by the same timesteps as tracks
    tracks:      'T N XY' point positions
    visibles:    'T N' visibility flags, falsy entries are not drawn

    The defaults for tracks and visibles are whatever those globals held when this cell was run
    Returns the annotated video as produced by draw_points_on_video
    """
    track_preview_video = input_video

    # Iterate over the tracks that were actually passed in rather than the notebook-global T,
    # so tracks of a different length are still drawn frame by frame
    for t in rp.eta(range(len(tracks)), 'draw_tracks'):
        points = [[t, x, y] for x, y in tracks[t]]
        track_preview_video = draw_points_on_video(track_preview_video, points, colors, visibles[t])

    return track_preview_video

track_preview_video = draw_tracks(input_video)

rp.display_video(gridded_video(track_preview_video))

Loading tapnext model...
draw_tracks: Done! Did 49 items in 0:00:00.328769                                                      


0
This browser does not support the video tag.


In [6]:
def add_drift(tracks, *i, dx=0, dy=0, t_origin=0, do_before=True, do_after=True):
    """
    Adds a linear drift to tracks at selected indices i
    The drift is zero at t_origin; timestep t is shifted by ((t - t_origin) * dx, (t - t_origin) * dy)

    tracks:    torch tensor in 'T N XY' form, it is not modified
    i:         indices of the tracks to move
    dx, dy:    drift added per timestep
    t_origin:  the timestep at which the drift is zero
    do_before: if False, timesteps before t_origin are restored to their values in tracks
    do_after:  if False, timesteps from t_origin onward are restored to their values in tracks

    Returns a new tensor with the same shape as tracks
    """
    new_tracks = tracks.clone()

    # Take the number of timesteps from the tracks themselves instead of the notebook-global T,
    # so the function works on its own and for tracks of any length
    num_timesteps = tracks.shape[0]
    ts = torch.arange(num_timesteps).to(tracks.device, tracks.dtype)

    # Signed distance of every timestep from t_origin, scaled by the per-step drift
    steps_from_origin = ts - t_origin
    drift_x = steps_from_origin * dx
    drift_y = steps_from_origin * dy

    # Apply drift to selected track indices
    for idx in i:
        new_tracks[:, idx, 0] += drift_x
        new_tracks[:, idx, 1] += drift_y

    if not do_before:
        new_tracks[:t_origin] = tracks[:t_origin]

    if not do_after:
        new_tracks[t_origin:] = tracks[t_origin:]

    return new_tracks

def zoom_tracks(tracks, *i, t_origin=0, x_origin=None, y_origin=None, d_scale=1.03):
    """
    Scales the tracks at selected indices i about a fixed point, by a factor that compounds per frame
    Frame t is scaled about (x_origin, y_origin) by d_scale ** (t - t_origin), so frame t_origin is unchanged
    An origin coordinate left as None becomes the mean of that coordinate over all tracks at t_origin
    Returns a new tensor, tracks itself is not modified
    """
    import torch

    if x_origin is None:
        x_origin = tracks[t_origin, :, 0].mean().item()
    if y_origin is None:
        y_origin = tracks[t_origin, :, 1].mean().item()

    # Per-frame zoom factor, equal to 1 at t_origin
    num_frames = tracks.shape[0]
    frame_times = torch.arange(num_frames).to(tracks.device, tracks.dtype)
    zoom = d_scale ** (frame_times - t_origin)

    result = tracks + 0
    for idx in i:
        # Shift so the origin sits at 0, scale, then shift back - once per axis
        for axis, origin in ((0, x_origin), (1, y_origin)):
            offset_from_origin = result[:, idx, axis] - origin
            result[:, idx, axis] = offset_from_origin * zoom + origin

    return result



def horz_mirror_tracks(tracks, *i, t_origin=0, x_origin=None):
    """
    Reflects the x coordinates of the tracks at selected indices i across the vertical line x = x_origin
    If x_origin is None it becomes the mean x of all tracks at frame t_origin
    Returns a new tensor, tracks itself is not modified
    """
    import torch

    axis_x = x_origin if x_origin is not None else tracks[t_origin, :, 0].mean().item()

    mirrored = tracks + 0
    for idx in i:
        # Reflecting x across a gives 2a - x
        mirrored[:, idx, 0] = 2 * axis_x - mirrored[:, idx, 0]
    return mirrored


def horz_mirror_origins(tracks, *i, x_origin=None):
    """
    Shifts the tracks at selected indices i as a group so that their centroid is mirrored horizontally
    For each frame: the mean x of the selected tracks is reflected across the mirror axis, and every
    selected track is moved in x by the distance that centroid travelled
    The mirror axis is x_origin, or the per-frame mean x of all tracks when x_origin is None
    Returns a new tensor, tracks itself is not modified
    """
    import torch

    # One mirror-axis x per frame, shape [T]
    if x_origin is not None:
        axis_x = torch.full((tracks.shape[0],), x_origin, device=tracks.device, dtype=tracks.dtype)
    else:
        axis_x = tracks[:, :, 0].mean(dim=1)

    # Per-frame centroid x of the selected tracks, shape [T]
    chosen = torch.tensor(list(i), device=tracks.device)
    group_x = tracks[:, chosen, 0].mean(dim=1)

    # The reflected centroid is 2a - c; the shift is how far the centroid moved to get there
    reflected_group_x = 2 * axis_x - group_x
    shift_x = reflected_group_x - group_x

    shifted = tracks + 0
    for idx in i:
        shifted[:, idx, 0] += shift_x
    return shifted


def reverse_tracks(tracks, visibles, *indices):
    """
    Reverses, along the first (time) axis, the tracks at the given indices together with their visibility flags
    Returns (tracks, visibles) as new tensors, the inputs are not modified
    """
    reversed_tracks, reversed_visibles = tracks + 0, visibles + 0
    for which in indices:
        for tensor in (reversed_tracks, reversed_visibles):
            tensor[:, which] = tensor[:, which].flip(0)
    return reversed_tracks, reversed_visibles


In [7]:
def draw_arrows(video, old_tracks, new_tracks, old_visibles, new_visibles):
    """
    Draws, on every frame, an arrow (with tip_length=0) from each point's old position to its new position
    A point's arrow is flagged visible only when the product of its old and new visibility is truthy
    Arrow colors come from the notebook-global `colors`
    Returns the annotated frames as one numpy array
    """
    def annotate(frame, old_track, new_track, old_viz, new_viz):
        # Each track is 'N XY', so its transpose unpacks into all x values and all y values
        start_x, start_y = old_track.T
        end_x, end_y = new_track.T
        visible = [ov * nv for ov, nv in zip(old_viz, new_viz)]
        return rp.cv_draw_arrows(frame, start_x, start_y, end_x, end_y, color=colors, tip_length=0, visible=visible)

    per_frame = zip(video, old_tracks, new_tracks, old_visibles, new_visibles)
    return rp.as_numpy_array([annotate(*frame_args) for frame_args in per_frame])
        

In [33]:
# Work on copies so the tracker output (tracks / visibles) stays available for comparison
new_tracks = tracks + 0
new_visibles = visibles + 0
# Earlier edits, kept for reference:
# new_tracks = add_drift(new_tracks,  1, 3, dx=30, dy=0, t_origin=30, do_before=False)
# new_tracks = add_drift(new_tracks,  2, 5, 6, dx=0, dy=-5, t_origin=25)
# new_tracks = add_drift(new_tracks, 5 , dx=-7, dy=20, t_origin=19, do_before=False)
# new_tracks = add_drift(new_tracks, 2 , dx=-5, dy=0, t_origin=11, do_before=False)
# new_tracks = add_drift(new_tracks, 2 , dx=5, dy=0, t_origin=38, do_before=False, do_after=True)
# new_tracks = add_drift(new_tracks, 2 , dx=0, dy=5, t_origin=38, do_before=False, do_after=True)
# Points 0-2 (car, balloons, license plate): from frame 10 onward hold the frame-10 position...
new_tracks[10:,:3] = new_tracks[10:11,:3] #Freeze pos
# ...then drift them by (-5, -4) per frame, counted from frame 0
new_tracks = add_drift(new_tracks, 0,1,2 , dx=-5, dy=-4, t_origin=0, do_before=False, do_after=True)
# Mark points 0-2 as visible in every frame
new_visibles[:,:3]=1
# Scale points 0-2 by .985 per frame relative to frame 10 (origin defaults to the mean of all tracks at frame 10)
new_tracks = zoom_tracks(new_tracks, 0, 1, 2, d_scale = .985, t_origin = 10)

# A further horizontal drift, then mirror the centroid of points 0-2 across x=450
new_tracks = add_drift(new_tracks, 0,1,2 , dx=3.4, dy=0, t_origin=0, do_before=False, do_after=True)
new_tracks = horz_mirror_origins(new_tracks, 0, 1, 2, x_origin=450)
# Mark point 3 (road) as visible in every frame
new_visibles[:,3]=1
# new_visibles[35:45,1]=0
# new_tracks, new_visibles = reverse_tracks(new_tracks, new_visibles, 0,4)
# Preview: edited points drawn on the video, with arrows from the original to the edited positions
rp.display_video(gridded_video(draw_arrows(draw_tracks(input_video, new_tracks, new_visibles), tracks, new_tracks, visibles, new_visibles)))

draw_tracks: Done! Did 49 items in 0:00:00.388242                                                      


0
This browser does not support the video tag.


In [34]:
# Build two labeled preview videos: the original tracks ("Counterfactual Input")
# and the edited tracks with arrows from the original positions ("Target").
before_preview, after_preview=        rp.labeled_videos(
            [
                gridded_video(draw_tracks(input_video, tracks, visibles)),
                # gridded_video(draw_tracks(input_video, new_tracks, new_visibles)),
                gridded_video(draw_arrows(draw_tracks(input_video, new_tracks, new_visibles), tracks, new_tracks, visibles, new_visibles))
            ],
            ["Counterfactual Input", "Target"],
            size=30,
            font="R:Futura",
            
        )

# Show them side by side with an arrow glyph in between, captioned at the bottom
# with the input video path and the prompt.
rp.display_video(
    rp.labeled_images(
    rp.horizontally_concatenated_videos(
        before_preview,
        # Single-image "video": a bordered arrow glyph, resized by a factor of 1/3
        [rp.cv_resize_image(rp.bordered_image_solid_color((rp.pil_text_to_image('\n→',font='Arial',size=200)),color='black',thickness=30,),1/3)],
        after_preview,
        origin='center',
    ),
        f'{input_video_path}\n{prompt}',font='R:Futura',position='bottom',size=20,size_by_lines=True,text_color='light blue',
    )
)

draw_tracks: Done! Did 49 items in 0:00:00.341256                                                      
draw_tracks: Done! Did 49 items in 0:00:00.328345                                                      


0
This browser does not support the video tag.


In [35]:
def tracks_to_xyzv(tracks, visibles):
    """
    Packs 'T N XY' tracks and 'T N' visibility flags into a single 'T N XYZV' tensor
    The Z channel is filled with ones because depth is not used here
    """
    #T N XY -> T N XYZV

    #I currently don't care about depth
    z = torch.ones_like(tracks[:,:,0])

    # Concatenate x, y, z and visibility along a new last axis (the second return value of pack is not needed)
    tracks_xyzv, _ = einops.pack([tracks, z, visibles], 'T N *')

    # The keyword names below are the local variables being checked, so those locals must keep these names
    rp.validate_tensor_shapes(
        tracks      = 'torch: T N XY',
        visibles    = 'torch: T N',
        z           = 'torch: T N',
        tracks_xyzv = 'torch: T N XYZV',
        XYZV=4,
        XY=2,
    )

    return tracks_xyzv

# Render the tracks as blob videos. The same input video is passed for both roles;
# the edited tracks go in as video_tracks and the original tracker output as counter_tracks.
blobs_videos = draw_blobs_videos(
    video         = rp.as_torch_video(input_video),
    counter_video = rp.as_torch_video(input_video),
    video_tracks         = tracks_to_xyzv(new_tracks, new_visibles),
    counter_tracks = tracks_to_xyzv(tracks, visibles),
    sigma = 10,
)

video_gaussians, counter_video_gaussians = rp.destructure(blobs_videos)

#RGBA -> RGB
video_gaussians         = video_gaussians        [:,:3]
counter_video_gaussians = counter_video_gaussians[:,:3]

# Blob videos are channels-first torch tensors; the input video is a channels-last numpy array
rp.validate_tensor_shapes(
    video_gaussians         = 'torch: T 3 H W',
    counter_video_gaussians = 'torch: T 3 H W',
    input_video             = 'numpy: T H W 3',
)

#In range [0, 1]
assert 0<=counter_video_gaussians.min()<=counter_video_gaussians.max()<=1
assert 0<=video_gaussians        .min()<=video_gaussians        .max()<=1

rp.display_video(rp.tiled_videos(rp.as_numpy_videos([video_gaussians, counter_video_gaussians]),border_color='white',border_thickness=1))

0
This browser does not support the video tag.


## DIFFUSION

In [80]:
# Name the results after the input video file (without its extension)
result_title = rp.get_file_name(input_video_path, include_file_extension=False)

# Create an output folder for this run.
# NOTE(review): get_unique_copy_path presumably appends a suffix when the path already exists - confirm.
result_folder = f'untracked/gaussblob_tests/{result_title}'
result_folder = rp.get_unique_copy_path(result_folder)
rp.make_directory(result_folder)

ic(result_folder)

# Bundle the pipeline inputs, rescaled from [0, 1] to [-1, 1] (the next cell maps them back with / 2 + 0.5)
sample = rp.as_easydict(
    frames               = rp.as_torch_video(input_video) * 2 - 1, #This one doesn't matter
    counter_video_frames = rp.as_torch_video(input_video) * 2 - 1, 
    tracking_frames         = video_gaussians             * 2 - 1,
    counter_tracking_frames = counter_video_gaussians     * 2 - 1,
    prompt = prompt,
)

#SWAP
# sample.frames         , sample.counter_video_frames    = sample.counter_video_frames   , sample.frames         
# sample.tracking_frames, sample.counter_tracking_frames = sample.counter_tracking_frames, sample.tracking_frames

ic| result_folder: 'untracked/gaussblob_tests/meadow_girls_copy1'


In [81]:
# Show the four sample videos, mapped back from [-1, 1] to [0, 1]
rp.display_video(sample.frames                  / 2 + 0.5)
rp.display_video(sample.tracking_frames         / 2 + 0.5)
rp.display_video(sample.counter_tracking_frames / 2 + 0.5)
rp.display_video(sample.counter_video_frames    / 2 + 0.5)

# Same four videos as numpy images, for saving (and for the final preview cell)
frames                  = rp.as_numpy_images(sample.frames                  / 2 + 0.5)
tracking_frames         = rp.as_numpy_images(sample.tracking_frames         / 2 + 0.5)
counter_tracking_frames = rp.as_numpy_images(sample.counter_tracking_frames / 2 + 0.5)
counter_video_frames    = rp.as_numpy_images(sample.counter_video_frames    / 2 + 0.5)

# Write each video into result_folder; the returned paths become the default arguments of run_pipe
with rp.SetCurrentDirectoryTemporarily(result_folder):
    frames_path                  = rp.save_video_mp4(frames                 , "frames.mp4",                  framerate=20, video_bitrate="max", show_progress=False)
    tracking_frames_path         = rp.save_video_mp4(tracking_frames        , "tracking_frames.mp4",         framerate=20, video_bitrate="max", show_progress=False)
    counter_tracking_frames_path = rp.save_video_mp4(counter_tracking_frames, "counter_tracking_frames.mp4", framerate=20, video_bitrate="max", show_progress=False)
    counter_video_frames_path    = rp.save_video_mp4(counter_video_frames   , "counter_video_frames.mp4",    framerate=20, video_bitrate="max", show_progress=False)

prompt = sample.prompt

ic(
    prompt                      ,
    frames_path                 ,
    tracking_frames_path        ,
    counter_tracking_frames_path,
    counter_video_frames_path   ,
)

0
This browser does not support the video tag.


0
This browser does not support the video tag.


0
This browser does not support the video tag.


0
This browser does not support the video tag.


ic| prompt: 'Two ladies dancing in the green grassy meadow, with beautiful white dresses'
    frames_path: '/home/jupyter/CleanCode/Github/DaS_Trees/gauss_blobs/test_sample_gen/untracked/gaussblob_tests/meadow_girls_copy1/frames.mp4'
    tracking_frames_path: '/home/jupyter/CleanCode/Github/DaS_Trees/gauss_blobs/test_sample_gen/untracked/gaussblob_tests/meadow_girls_copy1/tracking_frames.mp4'
    counter_tracking_frames_path: '/home/jupyter/CleanCode/Github/DaS_Trees/gauss_blobs/test_sample_gen/untracked/gaussblob_tests/meadow_girls_copy1/counter_tracking_frames.mp4'
    counter_video_frames_path: '/home/jupyter/CleanCode/Github/DaS_Trees/gauss_blobs/test_sample_gen/untracked/gaussblob_tests/meadow_girls_copy1/counter_video_frames.mp4'


('Two ladies dancing in the green grassy meadow, with beautiful white dresses',
 '/home/jupyter/CleanCode/Github/DaS_Trees/gauss_blobs/test_sample_gen/untracked/gaussblob_tests/meadow_girls_copy1/frames.mp4',
 '/home/jupyter/CleanCode/Github/DaS_Trees/gauss_blobs/test_sample_gen/untracked/gaussblob_tests/meadow_girls_copy1/tracking_frames.mp4',
 '/home/jupyter/CleanCode/Github/DaS_Trees/gauss_blobs/test_sample_gen/untracked/gaussblob_tests/meadow_girls_copy1/counter_tracking_frames.mp4',
 '/home/jupyter/CleanCode/Github/DaS_Trees/gauss_blobs/test_sample_gen/untracked/gaussblob_tests/meadow_girls_copy1/counter_video_frames.mp4')

In [82]:
##########################
# IMPORTS
##########################

import sys
import os
import shlex

from functools import cached_property

import models.cogvideox_tracking as cogtrack
import rp
import torch
from icecream import ic

import numpy as np

# Make the folder that holds the `syncutil` module (imported just below) importable
sys.path += rp.get_absolute_paths(
    [
        "~/CleanCode/Management",
        # "~/CleanCode/Github/DiffusionAsShader",
        # "~/CleanCode/Datasets/Vids/Raw_Feb28",
        # "~/CleanCode/Github/CogvideX-Interpolation-Mar23:MotionPrompting",
        # "~/CleanCode/Github/CogvideX-Interpolation-Feb13:Inpainting",
    ]
)

import syncutil

# NOTE(review): this replaces the `device` chosen in the first cell and has no Mac/CPU branch - confirm that is intended
device = rp.select_torch_device(prefer_used=True, reserve=True)

##########################
# FUNCTIONS
##########################

# Folder the pipeline is loaded from, and the transformer subfolder inside it that gets swapped per checkpoint
CKPT_folder = rp.path_join(das_root,'diffusion_shader_model_CKPT')
CKPT_transformer_folder = rp.path_join(CKPT_folder, 'transformer')

def update_to_latest_checkpoint():
    """
    Rebuilds CKPT_folder so that it holds the base model files plus the transformer of the chosen checkpoint.

    Steps:
      1. Deletes CKPT_folder and recreates it with `cp -al` from the base diffusion_shader_model folder
      2. Deletes CKPT_folder's transformer subfolder
      3. Hardlinks <checkpoint_root>/transformer in its place

    Reads the notebook-globals CKPT_folder, CKPT_transformer_folder and checkpoint_root.
    Returns nothing; its whole effect is on the filesystem.
    """
    # Quote the path before splicing it into a shell command string, so a path containing spaces
    # or shell metacharacters cannot change what `rm -rf` and `cp` operate on
    quoted_CKPT_folder = shlex.quote(CKPT_folder)

    # if not rp.folder_exists(CKPT_folder):
    rp.r._run_sys_command(f'rm -rf {quoted_CKPT_folder}')
    rp.r._run_sys_command(
        f'cp -al /home/jupyter/CleanCode/Github/DiffusionAsShader/diffusion_shader_model {quoted_CKPT_folder}'
        # f'cp -al /home/jupyter/CleanCode/Huggingface/CogVideoX-5b {CKPT_folder}'
    )

    # The checkpoint used is always the one selected in the settings section
    latest_transformer_checkpoint = checkpoint_root

    rp.fansi_print(f'Using checkpoint: {latest_transformer_checkpoint}','bold green undercurl')

    # Replace the transformer copied from the base model with the checkpoint's transformer
    rp.r._run_sys_command(
        "rm",
        "-rf",
        CKPT_transformer_folder,
    )
    rp.make_hardlink(
        rp.path_join(latest_transformer_checkpoint, "transformer"),
        CKPT_transformer_folder,
        recursive=True,
    )
    

def get_maps(video_path):
    """
    Loads a video file and encodes it with the pipeline's VAE into latents.

    video_path: path to the video file; it is made absolute before loading

    Returns the sampled latents multiplied by the VAE's scaling factor,
    with dimensions ordered [B, F, C, H, W] (B is 1).
    Uses the notebook-globals `pipe` (for its VAE) and `device`.
    """
    # Only load_video is used in this function
    from diffusers.utils import load_video

    video_path=rp.get_absolute_path(video_path)

    maps = load_video(video_path)
    # Convert list of PIL Images to tensor [T, C, H, W]
    maps = torch.stack(
        [
            torch.from_numpy(np.array(frame)).permute(2, 0, 1).float() / 255.0
            for frame in maps
        ]
    )
    maps = maps.to(device=device, dtype=torch.bfloat16)

    print(f"Encoding tracking maps from {video_path}")
    maps = maps.unsqueeze(0)  # [B, T, C, H, W]
    maps = maps.permute(0, 2, 1, 3, 4)  # [B, C, T, H, W]

    maps = maps * 2 - 1 #Normalize from [0,1] to [-1, 1]

    with torch.no_grad():
        latent_dist = pipe.vae.encode(maps).latent_dist
        maps = latent_dist.sample() * pipe.vae.config.scaling_factor
        maps = maps.permute(0, 2, 1, 3, 4)  # [B, F, C, H, W]

    return maps

def load_video_first_frame(video_path):
    """Returns the first frame of the video at video_path, converted by image_form."""
    frame_stream = rp.load_video_stream(rp.get_absolute_path(video_path))
    first_frame = next(frame_stream)
    return image_form(first_frame)

def image_form(image):
    """Converts image to an RGB image and returns it as a PIL image."""
    return rp.as_pil_image(rp.as_rgb_image(image))

# NOTE(review): rp.globalize_locals presumably publishes this function's local variables as globals
# after it runs - a later cell reads `video` at top level - so the local names below should not be renamed.
@rp.globalize_locals
def run_pipe(
    prompt                    = prompt                      ,
    video_path                = frames_path                 ,
    tracking_map_path         = tracking_frames_path        ,
    counter_tracking_map_path = counter_tracking_frames_path,
    counter_video_map_path    = counter_video_frames_path   ,
):
    """
    Runs the diffusion pipeline `pipe` once and returns the generated video with a settings caption.

    prompt:                    text prompt
    video_path:                video whose first frame is passed as "image"
    tracking_map_path:         video passed as the tracking image / maps
    counter_tracking_map_path: video passed as the counter tracking image / maps
    counter_video_map_path:    video passed as the counter video image / maps

    The defaults are the values those globals held when this cell was run.
    Returns the labeled output frames as one numpy array.
    """
    ic(
        prompt,
        video_path,
        tracking_map_path,
        counter_tracking_map_path,
        counter_video_map_path,
    )

    # prompt = ''
    # fansi_print("LOOK MA NO PROMPT",'blue')

    # For each conditioning video: its first frame goes in as a PIL image, and the whole video as VAE latents
    pipeline_args = {
        "prompt"                 : prompt,
        "image"                  : load_video_first_frame(video_path),
        "tracking_image"         : load_video_first_frame(tracking_map_path),
        "counter_tracking_image" : load_video_first_frame(counter_tracking_map_path),
        "counter_video_image"    : load_video_first_frame(counter_video_map_path),
        "tracking_maps"          : get_maps(tracking_map_path),
        "counter_tracking_maps"  : get_maps(counter_tracking_map_path),
        "counter_video_maps"     : get_maps(counter_video_map_path),
        "negative_prompt"        : "The video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory. Distortion.",
        "height"              : 480,
        "width"               : 720,
        "num_frames"          : 49,
        "use_dynamic_cfg"     : True,
        "guidance_scale"      : 6, #3 if rp.random_chance() else 6,
        "num_inference_steps" : 50,
    }

    # Print only the scalar settings (the images and latents are left out of the display)
    rp.display_dict(rp.gather(pipeline_args,'prompt negative_prompt height width num_frames use_dynamic_cfg guidance_scale num_inference_steps'.split(), as_dict=True))

    # Conditioning options; the commented-out lines are alternatives that were tried.
    # NOTE(review): latent_conditioning_dropout has 13 entries, presumably one per latent frame - confirm what 0 and 1 mean.
    pipeline_args |= dict(          
        use_image_conditioning=True,
        # latent_conditioning_dropout=[0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        latent_conditioning_dropout=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], #Weird, not as good actually...
        # latent_conditioning_dropout=[1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1], #Sparse...25%

        # use_image_conditioning=False,
        # latent_conditioning_dropout=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], #Weird, not as good actually...
        # latent_conditioning_dropout=[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1],
        # latent_conditioning_dropout=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        #latent_conditioning_dropout=[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
    )

    # Easydict so the caption below can read settings as attributes
    pipeline_args = rp.as_easydict(pipeline_args)

    with torch.no_grad():
        results=pipe(**pipeline_args)
    
    # Take the first generated video and caption every frame with the settings that produced it
    video=results.frames[0]
    video=rp.as_numpy_images(video)
    video = rp.labeled_images(
        video,
        f"PROMPT={repr(prompt[:50])}\nCFG={pipeline_args.guidance_scale} DYN-CFG={pipeline_args.use_dynamic_cfg} STEPS={pipeline_args.num_inference_steps} {''.join(map(str,pipeline_args['latent_conditioning_dropout']))}",
        size=-25,
        background_color="translucent dark blue",
        size_by_lines=False,
    )

    video = rp.as_numpy_array(video)
    
    return video

##########################
# SETTINGS
##########################

# checkpoint_root = '/home/jupyter/CleanCode/Github/DiffusionAsShader/ckpts/your_ckpt_path/CounterChans2500100000__optimizer_adamw__lr-schedule_cosine_with_restarts__learning-rate_1e-4/checkpoint-4500'
# checkpoint_root = '/home/jupyter/CleanCode/Github/DiffusionAsShader/ckpts/your_ckpt_path/CounterChans_RandomSpeed_2500_10000000__optimizer_adamw__lr-schedule_cosine_with_restarts__learning-rate_1e-4/checkpoint-1100'
# checkpoint_root = '/home/jupyter/CleanCode/Github/DiffusionAsShader/ckpts/your_ckpt_path/CounterChans_RandomSpeed_2500_10000000__optimizer_adamw__lr-schedule_cosine_with_restarts__learning-rate_1e-4/checkpoint-6000'
# checkpoint_root = '/home/jupyter/CleanCode/Github/DiffusionAsShader/ckpts/your_ckpt_path/CounterChans_RandomSpeed_WithDropout_2500_10000000__optimizer_adamw__lr-schedule_cosine_with_restarts__learning-rate_1e-4/checkpoint-3000'
# checkpoint_root = '/home/jupyter/CleanCode/Github/DiffusionAsShader/ckpts/your_ckpt_path/CounterChans_RandomSpeed_WithDropout_2500_10000000__optimizer_adamw__lr-schedule_cosine_with_restarts__learning-rate_1e-4/checkpoint-9200'
# checkpoint_root = '/home/jupyter/CleanCode/Github/DiffusionAsShader/ckpts/your_ckpt_path/CounterChans_RandomSpeed_WithDropout_2500_10000000__optimizer_adamw__lr-schedule_cosine_with_restarts__learning-rate_1e-4/checkpoint-14700'
# checkpoint_root = '/home/jupyter/CleanCode/Github/DiffusionAsShader/ckpts/your_ckpt_path/CounterChans_RandomSpeed_WithDropout_2500_10000000__optimizer_adamw__lr-schedule_cosine_with_restarts__learning-rate_1e-4/checkpoint-29000'
# checkpoint_root = '/home/jupyter/CleanCode/Github/DiffusionAsShader/ckpts/your_ckpt_path/CounterChans_RandomSpeed_WithDropout_2500_10000000__optimizer_adamw__lr-schedule_cosine_with_restarts__learning-rate_1e-4/checkpoint-29000'
# checkpoint_root = '/home/jupyter/CleanCode/Github/DiffusionAsShader/ckpts/your_ckpt_path/CounterChans_RandomSpeed_WithDropout_2500_10000000__optimizer_adamw__lr-schedule_cosine_with_restarts__learning-rate_1e-4/checkpoint-29000'
# checkpoint_root = '/home/jupyter/CleanCode/Github/DiffusionAsShader/ckpts/your_ckpt_path/CounterChans_BetterAug_WithDropout_50kSamp_T2V_10000000__optimizer_adamw__lr-schedule_cosine_with_restarts__learning-rate_1e-4/checkpoint-9000'
# checkpoint_root = '/home/jupyter/CleanCode/Github/DiffusionAsShader/ckpts/your_ckpt_path/CounterChans_BetterAug_WithDropout_50kSamp_T2V_10000000__optimizer_adamw__lr-schedule_cosine_with_restarts__learning-rate_1e-4/checkpoint-9000'
# Earlier checkpoint of the same run, kept for reference like the entries above.
# It is commented out because it was assigned and then immediately overwritten by the next line.
# checkpoint_root = '/home/jupyter/CleanCode/Github/DaS_Trees/gauss_blobs/ckpts/your_ckpt_path/CounterChans_FIXED_DATASET_BetterAug_WithDropout_50kSamp_T2V_from_scratch_10000000__optimizer_adamw__lr-schedule_cosine_with_restarts__learning-rate_1e-4/checkpoint-3500'
checkpoint_root = '/home/jupyter/CleanCode/Github/DaS_Trees/gauss_blobs/ckpts/your_ckpt_path/CounterChans_FIXED_DATASET_BetterAug_WithDropout_50kSamp_T2V_from_scratch_10000000__optimizer_adamw__lr-schedule_cosine_with_restarts__learning-rate_1e-4/checkpoint-4500'

# Folder name of the checkpoint (e.g. 'checkpoint-4500'), used when naming output files
checkpoint_title = rp.get_folder_name(checkpoint_root)

USE_T2V=True
# USE_T2V=False

# These switches are passed on through environment variables.
# NOTE(review): presumably read by models.cogvideox_tracking - confirm.
if USE_T2V:
    os.environ['T2V_TRANSFORMER_CHECKPOINT'] = "/home/jupyter/CleanCode/Huggingface/CogVideoX-5b/transformer"

NO_CONTROLNET=False
if NO_CONTROLNET:
    os.environ['DISABLE_CONTROLNET'] = "True"


##########################
# SETUP
##########################
    
# NOTE(review): the value assigned here is not read anywhere else in this cell
# (update_to_latest_checkpoint uses checkpoint_root directly) - the call is presumably made for its syncing side effect.
latest_transformer_checkpoint = syncutil.sync_checkpoint_folder(checkpoint_root)

# rp.set_current_directory('/home/jupyter/CleanCode/Github/DiffusionAsShader')
# if not rp.file_exists('source/datasets/youtube/DaS/Vanilla/prompt.txt'):
#     rp.r._run_sys_command('python source/datasets/youtube/DaS/Vanilla/make_columns.py')
# if not rp.folder_exists('diffusion_shader_model_CKPT'):
#     rp.make_hardlink('diffusion_shader_model','diffusion_shader_model_CKPT',recursive=True)

# Build the pipeline only once per kernel: re-running this cell reuses the existing `pipe`,
# so changing checkpoint_root has no effect on the loaded weights until `pipe` is deleted or the kernel restarts.
if "pipe" not in vars():
    update_to_latest_checkpoint()
    pipe = cogtrack.CogVideoXImageToVideoPipelineTracking.from_pretrained(
        CKPT_folder,
    )

    # Run everything in bfloat16 on the selected device, with VAE slicing and tiling enabled
    pipe.to(dtype=torch.bfloat16)
    pipe.to(device)
    #pipe.enable_sequential_cpu_offload(device=device)
    pipe.vae.enable_slicing()
    pipe.vae.enable_tiling()
    # Inference only: put every submodule in eval mode
    pipe.transformer.eval()
    pipe.text_encoder.eval()
    pipe.vae.eval()

##########################
# MAIN
##########################

output_video = run_pipe()


This computer has 8 GPUs, but we will be choosing cuda:2 because we're already using it.


ic| prompt: 'Two ladies dancing in the green grassy meadow, with beautiful white dresses'
    video_path: '/home/jupyter/CleanCode/Github/DaS_Trees/gauss_blobs/test_sample_gen/untracked/gaussblob_tests/meadow_girls_copy1/frames.mp4'
    tracking_map_path: '/home/jupyter/CleanCode/Github/DaS_Trees/gauss_blobs/test_sample_gen/untracked/gaussblob_tests/meadow_girls_copy1/tracking_frames.mp4'
    counter_tracking_map_path: '/home/jupyter/CleanCode/Github/DaS_Trees/gauss_blobs/test_sample_gen/untracked/gaussblob_tests/meadow_girls_copy1/counter_tracking_frames.mp4'
    counter_video_map_path: '/home/jupyter/CleanCode/Github/DaS_Trees/gauss_blobs/test_sample_gen/untracked/gaussblob_tests/meadow_girls_copy1/counter_video_frames.mp4'


Encoding tracking maps from /home/jupyter/CleanCode/Github/DaS_Trees/gauss_blobs/test_sample_gen/untracked/gaussblob_tests/meadow_girls_copy1/tracking_frames.mp4
Encoding tracking maps from /home/jupyter/CleanCode/Github/DaS_Trees/gauss_blobs/test_sample_gen/untracked/gaussblob_tests/meadow_girls_copy1/counter_tracking_frames.mp4
Encoding tracking maps from /home/jupyter/CleanCode/Github/DaS_Trees/gauss_blobs/test_sample_gen/untracked/gaussblob_tests/meadow_girls_copy1/counter_video_frames.mp4
[36mguidance_scale[0m[32m --> [0m[34m6[0m
[36mheight[0m[32m --> [0m[34m480[0m
[36mnegative_prompt[0m[32m --> [0m[34mThe video is not of a high quality, it has a low resolution. Watermark present in each frame. The background is solid. Strange body and strange trajectory. Distortion.[0m
[36mnum_frames[0m[32m --> [0m[34m49[0m
[36mnum_inference_steps[0m[32m --> [0m[34m50[0m
[36mprompt[0m[32m --> [0m[34mTwo ladies dancing in the green grassy meadow, with beautiful w

  0%|          | 0/50 [00:00<?, ?it/s]

In [83]:
def overlay_tracks(frames, track_frames):
    """
    Blends track_frames over frames, using the largest channel value of each track pixel as its opacity
    Both inputs must be numpy arrays of the same 'T H W 3' shape
    Returns opacity * track_frames + (1 - opacity) * frames
    """
    rp.validate_tensor_shapes(
        frames="numpy: T H W 3",
        track_frames="numpy: T H W 3",
    )
    # keepdims keeps a trailing axis of size 1 so the opacity broadcasts over the 3 color channels
    opacity = track_frames.max(-1, keepdims=True)
    foreground = opacity * track_frames
    background = (1 - opacity) * frames
    return foreground + background


def text_symbol(x):
    """Renders the text x as a size-200 white-on-black image in the DejaVuSerif font."""
    glyph_style = dict(font="DejaVuSerif", size=200, color="white", background_color="black")
    return rp.pil_text_to_image(x, **glyph_style)

# Separator glyphs that can be placed between the videos of the preview
arrow_image = text_symbol("→")
plus_image = text_symbol("+")
approx_image = text_symbol("≈")

# Use run_pipe's return value (output_video) rather than relying on a `video` global
# appearing as a side effect of run_pipe; keep only the first three (RGB) channels
video = rp.as_numpy_array(output_video)[:, :, :, :3]

# Left: the counterfactual input with its tracks overlaid. Right: the diffusion output with the target tracks overlaid.
preview_video = rp.horizontally_concatenated_videos(
    rp.labeled_images(
        overlay_tracks(counter_video_frames, counter_tracking_frames),
        "Counterfactual Input",
        font='Arial',
        size=20,
    ),
    [arrow_image],
    rp.labeled_images(overlay_tracks(video, tracking_frames), "Diffusion Output", size=20,         font='Arial'),

    ##THESE DONT HAVE GROUND TRUTH
    # [approx_image],
    # rp.labeled_images(overlay_tracks(frames, tracking_frames), "Ground Truth"),
    
    origin="center",
)
rp.display_video(preview_video)

# Save under a name made of the checkpoint folder name and the result title
# (checkpoint_title is rp.get_folder_name(checkpoint_root), set in the settings section)
preview_video_path = 'untracked/inferblobs_outputs/'+checkpoint_title+'__'+result_title+'.mp4'
preview_video_path = rp.get_unique_copy_path(preview_video_path)
rp.make_parent_directory(preview_video_path)
rp.save_video_mp4(preview_video, preview_video_path, framerate=30, show_progress=False)

ic(preview_video_path);

0
This browser does not support the video tag.


ic| preview_video_path: 'untracked/inferblobs_outputs/checkpoint-4500__meadow_girls_copy1.mp4'
