In [1]:
# Imports and paths setup
import sys
import rp
import torch
import numpy as np
from einops import rearrange

# Repo-relative locations: everything below is derived from the value returned by
# rp.get_git_toplevel(), so no machine-specific absolute path is hard-coded here.
top_dir = rp.get_git_toplevel()
ltx_dir = rp.path_join(top_dir, 'LTX2')
nfs_models_dir = rp.path_join(ltx_dir, 'models')
# Extend the module search path ahead of the `download_models` import that follows
# (presumably that module lives in this directory -- confirm).
# Guarded so that re-running this cell does not append a duplicate entry each time.
if nfs_models_dir not in sys.path:
    sys.path.append(nfs_models_dir)

from download_models import local_download_dir, download_from_web
# Alias for the download directory imported from download_models; the model paths
# and gemma_root defined in this cell are all built from it.
models_dir = local_download_dir

# LTX Pipeline imports
from ltx_core.loader import LTXV_LORA_COMFY_RENAMING_MAP, LoraPathStrengthAndSDOps
from ltx_pipelines.ti2vid_two_stages import TI2VidTwoStagesPipeline
from ltx_pipelines.utils.media_io import encode_video
from ltx_pipelines.utils.constants import AUDIO_SAMPLE_RATE

# Model paths -- each weight file is resolved directly under models_dir.
# The unpacking order below must match the order of the filenames.
(
    checkpoint_path,
    distilled_lora_path,
    spatial_upsampler_path,
    detailer_lora_path,
) = (
    rp.path_join(models_dir, weights_filename)
    for weights_filename in (
        "ltx-2-19b-dev.safetensors",
        "ltx-2-19b-distilled-lora-resized_dynamic_fro095_avg_rank_242_bf16.safetensors",
        "ltx-2-spatial-upscaler-x2-1.0.safetensors",
        "ltx-2-19b-ic-lora-detailer.safetensors",
    )
)
# gemma_root is the download directory itself rather than a single file inside it.
gemma_root = models_dir

# Output directory
# Files produced by this notebook (e.g. the generated MP4) are written under <top_dir>/outputs.
output_dir = rp.path_join(top_dir, "outputs")
# This call is the last expression of the cell, so its return value is echoed as the
# cell output. NOTE(review): presumably a no-op when the directory already exists -- confirm in rp.
rp.make_directory(output_dir)

'/root/CleanCode/Experiments/LTX_Tests/outputs'

In [2]:
# Setup
# Flag read by show_video() to decide whether to render anything.
IN_NOTEBOOK = rp.running_in_jupyter_notebook()
# Pick the torch device for this session (the recorded run selected cuda:0).
# NOTE(review): the exact meaning of prefer_used / reserve is defined by rp -- confirm there.
# NOTE(review): neither DEVICE nor DTYPE is referenced by the cells visible in this excerpt.
DEVICE = rp.select_torch_device(prefer_used=True, reserve=True)
DTYPE = torch.bfloat16
# Presumably fetches any missing model files into the local download directory before
# the pipeline is built -- TODO confirm against download_models.download_from_web.
download_from_web()

                      ┏━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━┳━━━━━━━┳━━━━━━┳━━━━━━┳━━━━━━━━━━━┓
                      ┃ GPU ID ┃         Name          ┃      Used      ┃   Free ┃ Total ┃ Temp ┃ Util ┃ Processes ┃
                      ┡━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━╇━━━━━━━╇━━━━━━╇━━━━━━╇━━━━━━━━━━━┩
Selecting cuda:0 –––> │   0    │ NVIDIA A100-SXM4-80GB │ 694.6MB   0.8% │ 79.3GB │  80GB │ 31°C │   0% │           │
                      │   1   

In [3]:
# Helpers
def show_video(video):
    """Display ``video`` inline when running inside a Jupyter notebook.

    Does nothing when ``IN_NOTEBOOK`` is false, so calls to it are harmless when the
    same code runs outside a notebook.

    Args:
        video: Forwarded unchanged to ``rp.display_video``. This notebook calls it
            with the path of an encoded video file.
    """
    if not IN_NOTEBOOK:
        return
    rp.display_video(video)

def save_video_with_audio(video_tensor, audio_tensor, path, fps=25):
    """Write a video and its audio track to ``path`` via ``encode_video``.

    Args:
        video_tensor: Passed through as ``encode_video``'s ``video`` argument.
        audio_tensor: Passed through as ``encode_video``'s ``audio`` argument; the
            sample rate is always the module-level ``AUDIO_SAMPLE_RATE``.
        path: Destination file path for the encoded output.
        fps: Frame rate. It is converted with ``int()``, so a fractional rate is
            truncated toward zero (29.97 becomes 29).
    """
    encode_kwargs = {
        "video": video_tensor,
        "fps": int(fps),
        "audio": audio_tensor,
        "audio_sample_rate": AUDIO_SAMPLE_RATE,
        "output_path": path,
        # The whole video is handed over as a single chunk.
        "video_chunks_number": 1,
    }
    encode_video(**encode_kwargs)

In [5]:
# Create Pipeline (run once)
# Stated intent: Detailer LoRA on Stage 1, Distilled LoRA on Stage 2.
# NOTE(review): that is NOT what the code below currently does:
#   * the detailer LoRA is built with 0 as its second argument (presumably the
#     strength, going by the class name -- confirm), and
#   * the pipeline receives an empty `distilled_lora` list, so the `distilled_lora`
#     object built here is not passed to it.
# Confirm whether this is a deliberate baseline or a leftover from experimentation.

detailer_lora = LoraPathStrengthAndSDOps(detailer_lora_path, 0, LTXV_LORA_COMFY_RENAMING_MAP)
distilled_lora = LoraPathStrengthAndSDOps(distilled_lora_path, 1, LTXV_LORA_COMFY_RENAMING_MAP)

pipeline = TI2VidTwoStagesPipeline(
    checkpoint_path=checkpoint_path,
    # Disabled alternative, kept for reference:
    # distilled_lora=[distilled_lora],
    distilled_lora=[],
    spatial_upsampler_path=spatial_upsampler_path,
    gemma_root=gemma_root,
    loras=[detailer_lora],  # Detailer on Stage 1
)

In [6]:
# Generate T2V
# Two-stage: 1280x768 -> Stage1 at 640x384 -> Upscale to 1280x768
# Dimensions must be divisible by 64

drone_prompt = "Drone shot, helicopter flying fast through a narrow rocky canyon, sun-kissed day, clear turquoise water below, white foam waves, motion blur, sharp focus"
negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted, watermarks, low quality, artifacts, morphing, warping, flicker, text, logo"

height, width, num_frames, frame_rate = 768, 1280, 121, 25.0

# Fail fast: enforce the divisibility rule stated above before starting a generation
# that took over a minute in the recorded run.
assert height % 64 == 0 and width % 64 == 0, (
    f"height and width must be divisible by 64, got height={height}, width={width}"
)

# Text-to-video request: the list of conditioning images is left empty.
# The seed is fixed so that repeated runs use the same value.
ltx_video, ltx_audio = pipeline(
    prompt=drone_prompt,
    negative_prompt=negative_prompt,
    seed=42,
    height=height,
    width=width,
    num_frames=num_frames,
    frame_rate=frame_rate,
    num_inference_steps=50,
    cfg_guidance_scale=4.0,
    images=[],
)

# ltx_video is consumed as an iterable of tensors; materialise it and join the
# pieces along dim 0 into a single tensor, without autograd tracking.
with torch.inference_mode():
    video_tensor = torch.cat(list(ltx_video), dim=0)

video_path = rp.path_join(output_dir, "generated_video.mp4")
save_video_with_audio(video_tensor, ltx_audio, video_path, frame_rate)
# NumPy version of the same frames, kept in `video` for later cells.
video = rp.as_numpy_array(video_tensor)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 50/50 [01:10<00:00,  1.41s/it]
100%|██████████| 3/3 [00:09<00:00,  3.18s/it]
100%|██████████| 1/1 [00:03<00:00,  3.14s/it]


In [7]:
# Preview the encoded MP4 from disk; show_video() does nothing outside a notebook.
show_video(video_path)