LTX-Video is a DiT-based (Diffusion Transformer) video generation model.

In [None]:
!pip install -q git+https://github.com/huggingface/diffusers.git

In [2]:
import torch
from diffusers import LTXPipeline, LTXImageToVideoPipeline
from diffusers.utils import export_to_video, load_image

# text-to-video

In [None]:
# Build the text-to-video pipeline from the "Lightricks/LTX-Video" checkpoint,
# loading the weights in bfloat16.
pipe = LTXPipeline.from_pretrained(
    "Lightricks/LTX-Video",
    torch_dtype=torch.bfloat16,
)
# Move the pipeline to the GPU. This is left as the cell's final expression,
# so whatever `.to()` returns is what the notebook displays.
pipe.to("cuda")

In [4]:
# Positive prompt, written one sentence per literal; adjacent string literals
# are concatenated by Python into a single string.
prompt = (
    "A captivating display of physics-based light effects, showcasing intricate patterns of diffraction and interference. "
    "Beams of light refract through a crystal prism, forming a vibrant spectrum of colors. "
    "The scene highlights realistic caustics and soft volumetric lighting, creating an ethereal and scientific atmosphere. "
    "The setting is dark, allowing the luminous effects to stand out with high contrast. "
    "The rendering is highly detailed, resembling real-world optical phenomena with smooth transitions and sharp clarity"
)
# Negative prompt: a comma-separated list of traits to avoid.
negative_prompt = ", ".join(
    [
        "low resolution",
        "worst quality",
        "unrealistic lighting",
        "overexposed",
        "lack of detail",
        "distorted reflections",
        "inconsistent motion",
        "blurry",
    ]
)

In [5]:
# Generate a 704x480 clip of 161 frames from the text prompt using 50
# inference steps, then take the first entry of the returned frames.
# The seeded generator makes the run repeatable: without it, every execution
# starts from different random noise and produces a different video.
video = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=704,
    height=480,
    num_frames=161,
    num_inference_steps=50,
    generator=torch.Generator().manual_seed(0),
).frames[0]
# Write the frames to an MP4 at 24 fps; the returned file path is shown as
# the cell's output.
export_to_video(video, "physics-light.mp4", fps=24)

  0%|          | 0/50 [00:00<?, ?it/s]

'physics-light.mp4'

In [4]:
# Prompt for the second text-to-video sample: a rainy city road with a sudden
# lightning strike. Words are spelled correctly so the text encoder receives
# the intended vocabulary rather than misspelled tokens.
prompt = "heavy rain and thunder in city road, few cars are running on the road, suddenly there is a huge lightning happened"
# Traits to avoid, supplied to the pipeline as the negative prompt.
negative_prompt = "worst quality, unrealistic lighting and thunder, lack of detail, blurry"

In [5]:
# Generate a 704x480 clip of 161 frames from the text prompt using 50
# inference steps, then take the first entry of the returned frames.
# The seeded generator makes the run repeatable: without it, every execution
# starts from different random noise and produces a different video.
video = pipe(
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=704,
    height=480,
    num_frames=161,
    num_inference_steps=50,
    generator=torch.Generator().manual_seed(0),
).frames[0]
# Write the frames to an MP4 at 24 fps; the returned file path is shown as
# the cell's output.
export_to_video(video, "rain.mp4", fps=24)

  0%|          | 0/50 [00:00<?, ?it/s]

'rain.mp4'

# image-to-video

In [3]:
# Build the image-to-video pipeline from the same "Lightricks/LTX-Video"
# checkpoint, loading the weights in bfloat16. This rebinds `pipe`.
pipe = LTXImageToVideoPipeline.from_pretrained(
    "Lightricks/LTX-Video",
    torch_dtype=torch.bfloat16,
)
# Move the pipeline to the GPU. This is left as the cell's final expression,
# so the notebook prints the pipeline's component summary.
pipe.to("cuda")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading pipeline components...:   0%|          | 0/5 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LTXImageToVideoPipeline {
  "_class_name": "LTXImageToVideoPipeline",
  "_diffusers_version": "0.33.0.dev0",
  "_name_or_path": "Lightricks/LTX-Video",
  "scheduler": [
    "diffusers",
    "FlowMatchEulerDiscreteScheduler"
  ],
  "text_encoder": [
    "transformers",
    "T5EncoderModel"
  ],
  "tokenizer": [
    "transformers",
    "T5Tokenizer"
  ],
  "transformer": [
    "diffusers",
    "LTXVideoTransformer3DModel"
  ],
  "vae": [
    "diffusers",
    "AutoencoderKLLTXVideo"
  ]
}

In [4]:
# Fetch the conditioning image from its URL.
image = load_image("https://huggingface.co/datasets/a-r-r-o-w/tiny-meme-dataset-captioned/resolve/main/images/8.png")
# Positive prompt, written one sentence per literal; adjacent string literals
# are concatenated by Python into a single string.
prompt = (
    "A young girl stands calmly in the foreground, looking directly at the camera, as a house fire rages in the background. "
    "Flames engulf the structure, with smoke billowing into the air. "
    "Firefighters in protective gear rush to the scene, a fire truck labeled '38' visible behind them. "
    "The girl's neutral expression contrasts sharply with the chaos of the fire, creating a poignant and emotionally charged scene."
)
# Negative prompt: a comma-separated list of traits to avoid.
negative_prompt = ", ".join(
    ["worst quality", "inconsistent motion", "blurry", "jittery", "distorted"]
)


In [5]:
# Generate a 704x480 clip of 161 frames from the loaded image and the text
# prompt using 50 inference steps, then take the first entry of the returned
# frames.
# The seeded generator makes the run repeatable: without it, every execution
# starts from different random noise and produces a different video.
video = pipe(
    image=image,
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=704,
    height=480,
    num_frames=161,
    num_inference_steps=50,
    generator=torch.Generator().manual_seed(0),
).frames[0]
# Write the frames to an MP4 at 24 fps; the returned file path is shown as
# the cell's output.
export_to_video(video, "fire.mp4", fps=24)

  0%|          | 0/50 [00:00<?, ?it/s]

'fire.mp4'

In [4]:
# Load the conditioning image from a local file. The path is hard-coded, so
# the file must already exist at /content/cricket.jpg before this cell runs.
image = load_image(
    "/content/cricket.jpg"
)
# Positive prompt, written one sentence per literal; adjacent string literals
# are concatenated by Python into a single string.
prompt = (
    "Cricket players arrive at the playground with bats and wickets, standing in dynamic and natural poses. "
    "They are captured from a mid-range shot, showing well-defined body proportions and expressive gestures. "
    "The scene is lively, with realistic lighting and smooth motion, emphasizing the energy of the game. "
    "The players have naturally proportioned faces and bodies, ensuring a visually appealing and high-quality composition."
)
# Negative prompt: a comma-separated list of traits to avoid.
negative_prompt = ", ".join(
    [
        "distorted faces",
        "unnatural body proportions",
        "deformed limbs",
        "unrealistic expressions",
        "worst quality",
        "blurry",
        "jittery",
        "inconsistent motion",
        "uncanny appearance",
        "low detail",
    ]
)


In [5]:
# Generate a 704x480 clip of 161 frames from the loaded image and the text
# prompt using 50 inference steps, then take the first entry of the returned
# frames.
# The seeded generator makes the run repeatable: without it, every execution
# starts from different random noise and produces a different video.
video = pipe(
    image=image,
    prompt=prompt,
    negative_prompt=negative_prompt,
    width=704,
    height=480,
    num_frames=161,
    num_inference_steps=50,
    generator=torch.Generator().manual_seed(0),
).frames[0]
# Write the frames to an MP4 at 24 fps; the returned file path is shown as
# the cell's output.
export_to_video(video, "cricket.mp4", fps=24)

  0%|          | 0/50 [00:00<?, ?it/s]

'cricket.mp4'