In [None]:
#Imports and paths setup
import sys
import rp
import torch
import numpy as np
from einops import rearrange
import shlex
from icecream import ic

# Root of the git repository; every other path in this notebook is built from it.
top_dir = rp.get_git_toplevel()

ltx_dir = rp.path_join(top_dir, 'LTX2')
nfs_models_dir = rp.path_join(ltx_dir, 'models')

# Make nfs_models_dir importable for the `download_models` import just below.
# Only add it when it is missing, so re-running this cell does not pile up
# duplicate entries on sys.path.
if nfs_models_dir not in sys.path:
    sys.path.append(nfs_models_dir)

from download_models import local_download_dir, download_from_web
local_models_dir = local_download_dir

# All model paths defined later are built from models_dir.
models_dir = local_models_dir

# LTX Pipeline imports
from ltx_core.loader import LTXV_LORA_COMFY_RENAMING_MAP, LoraPathStrengthAndSDOps
from ltx_pipelines.ti2vid_two_stages import TI2VidTwoStagesPipeline
from ltx_pipelines.ic_lora import ICLoraPipeline
from ltx_pipelines.utils.media_io import encode_video
from ltx_pipelines.utils.constants import AUDIO_SAMPLE_RATE

# Model paths
def _model_file(filename):
    """Return the path of `filename` inside `models_dir`."""
    return rp.path_join(models_dir, filename)


# Base checkpoint, distilled LoRA, and the x2 spatial upscaler
checkpoint_path = _model_file("ltx-2-19b-dev.safetensors")
distilled_lora_path = _model_file("ltx-2-19b-distilled-lora-resized_dynamic_fro095_avg_rank_242_bf16.safetensors")
spatial_upsampler_path = _model_file("ltx-2-spatial-upscaler-x2-1.0.safetensors")

# Contains text_encoder/ with model files
gemma_root = models_dir

# IC-LoRA path (for refinement/control)
detailer_lora_path = _model_file("ltx-2-19b-ic-lora-detailer.safetensors")

# Directory that receives the generated videos; created here.
output_dir = rp.path_join(top_dir, "outputs")
rp.make_directory(output_dir)

In [None]:
# Basic post-paths/imports setup: notebook detection, device/dtype selection, model download.
IN_NOTEBOOK = rp.running_in_jupyter_notebook()
# NOTE(review): DEVICE and DTYPE are not referenced by any cell visible in this notebook —
# confirm the pipelines actually run on the selected GPU / dtype.
DEVICE = rp.select_torch_device(prefer_used=True, reserve=True) #Select the GPU
DTYPE = torch.bfloat16

download_from_web() # Make sure the base LTX models are downloaded

print(f"Running in notebook: {IN_NOTEBOOK}")
print(f"Top directory: {top_dir}")

In [None]:
# Helpers
def show_video(video):
    """Display `video` via `rp.display_video` when running inside a Jupyter notebook.

    Does nothing when `IN_NOTEBOOK` is false. Always returns None.
    """
    if not IN_NOTEBOOK:
        return
    rp.display_video(video)

In [None]:
# T2V (Text-to-Video) pipeline setup
# The distilled LoRA is given as (path, 0.6, renaming map); 0.6 sits in the slot
# that the detailer cell's own comment describes as the LoRA strength.
distilled_lora = [LoraPathStrengthAndSDOps(distilled_lora_path, 0.6, LTXV_LORA_COMFY_RENAMING_MAP)]

# Two-stage pipeline built from the base checkpoint, the distilled LoRA and the
# spatial upsampler. No extra LoRAs are passed (loras=[]).
pipeline = TI2VidTwoStagesPipeline(
    checkpoint_path=checkpoint_path,
    distilled_lora=distilled_lora,
    spatial_upsampler_path=spatial_upsampler_path,
    gemma_root=gemma_root,
    loras=[],
)

In [None]:
# Generate video from text (T2V - no image conditioning)
# NOTE: Two-stage pipeline generates Stage 1 at HALF resolution, then upscales.
#       So 1280x768 -> Stage1 at 640x384 -> Upscale to 1280x768
#       Dimensions must be divisible by 64 for two-stage pipeline.

drone_prompt = "Drone shot, helicopter flying fast through a narrow rocky canyon, sun-kissed day, clear turquoise water below, white foam waves, motion blur, sharp focus"

negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted, watermarks, low quality, artifacts, morphing, warping, flicker, text, logo"

# Generation parameters
height = 768
width = 1280
num_frames = 121
frame_rate = 25.0

# Fail fast on a resolution that violates the divisible-by-64 rule noted above,
# rather than discovering it part-way through an expensive generation.
assert height % 64 == 0 and width % 64 == 0, (
    f"height and width must be divisible by 64 for the two-stage pipeline, got {width}x{height}"
)

# Resolution: 1280x768 is 720p-ish, divisible by 64. Stage1 will be 640x384.
# For even higher quality, try 1920x1088 (1080p, divisible by 64)
ltx_video, ltx_audio = pipeline(
    prompt=drone_prompt,
    negative_prompt=negative_prompt,
    seed=42,
    height=height,
    width=width,
    num_frames=num_frames,
    frame_rate=frame_rate,
    num_inference_steps=40,
    cfg_guidance_scale=4.0,  # Raised from 3.0; 4.0 is recommended for sharper output
    images=[],  # No image conditioning (pure text-to-video)
)

# Collect the video chunks into a single tensor
with torch.inference_mode():
    video_chunks = list(ltx_video)
    if not video_chunks:
        # Without this check the indexing below fails with a bare IndexError.
        raise RuntimeError("The pipeline returned no video chunks")
    # Concatenate only when there are several chunks; a single chunk is used as-is.
    video_tensor = torch.cat(video_chunks, dim=0) if len(video_chunks) > 1 else video_chunks[0]

# Save video with audio using LTX's encoder
video_path = rp.path_join(output_dir, "generated_video.mp4")
encode_video(
    video=video_tensor,
    # int() truncates: a non-integer frame_rate (e.g. 29.97) would be encoded at a
    # lower fps than the one passed to the pipeline above.
    fps=int(frame_rate),
    audio=ltx_audio,
    audio_sample_rate=AUDIO_SAMPLE_RATE,
    output_path=video_path,
    video_chunks_number=1,
)
print(f"Saved video to: {video_path}")

# Convert to numpy for display
video = rp.as_numpy_array(video_tensor)
print(f"Output video shape: {video.shape}")  # Should be ~(121, 768, 1280, 3)

In [None]:
# Display the generated video (show_video does nothing outside a notebook)
show_video(video)

In [None]:
# === OPTIONAL: Refine with IC-LoRA Detailer ===
# Feeds the video saved at video_path by the generation cell through the IC-LoRA
# detailer pipeline, which is meant to enhance fine details/textures.

# Detailer IC-LoRA at full strength (1.0)
detailer_lora = [LoraPathStrengthAndSDOps(detailer_lora_path, 1.0, LTXV_LORA_COMFY_RENAMING_MAP)]

# Pipeline built from the same checkpoint, upsampler and gemma_root as the T2V one,
# with the detailer LoRA passed via `loras`.
detailer_pipeline = ICLoraPipeline(
    checkpoint_path=checkpoint_path,
    spatial_upsampler_path=spatial_upsampler_path,
    gemma_root=gemma_root,
    loras=detailer_lora,
)

# Same prompt, seed, resolution and frame settings as the original generation.
# The saved video is passed as conditioning, paired with 1.0
# (presumably the conditioning strength — TODO confirm).
detailed_video_gen, detailed_audio = detailer_pipeline(
    prompt=drone_prompt,
    seed=42,
    height=height,
    width=width,
    num_frames=num_frames,
    frame_rate=frame_rate,
    images=[],
    video_conditioning=[(video_path, 1.0)],
)

# Drain the returned chunks into one tensor; concatenate only if there are several
with torch.inference_mode():
    detailed_chunks = [chunk for chunk in detailed_video_gen]
    if len(detailed_chunks) > 1:
        detailed_tensor = torch.cat(detailed_chunks, dim=0)
    else:
        detailed_tensor = detailed_chunks[0]

# Write the refined video, together with its audio, next to the original output
detailed_video_path = rp.path_join(output_dir, "generated_video_detailed.mp4")
encode_video(
    video=detailed_tensor,
    fps=int(frame_rate),
    audio=detailed_audio,
    audio_sample_rate=AUDIO_SAMPLE_RATE,
    output_path=detailed_video_path,
    video_chunks_number=1,
)
print(f"Saved detailed video to: {detailed_video_path}")

# Numpy copy used by the display cell
detailed_video = rp.as_numpy_array(detailed_tensor)
print(f"Detailed video shape: {detailed_video.shape}")

In [None]:
# Show the detailed/refined video (show_video does nothing outside a notebook)
show_video(detailed_video)