In [None]:
import av
import numpy as np
import torch
import time
import gc

from pathlib import Path
from transformers import AutoTokenizer, AutoModel, VideoLlavaForConditionalGeneration, VideoLlavaProcessor
from PIL import Image

# Report which accelerator this kernel will use. Availability is queried once
# so the three lines below cannot disagree with one another.
cuda_available = torch.cuda.is_available()
print("CUDA available:", cuda_available)
print("Current device:", torch.cuda.current_device() if cuda_available else "CPU only")
# Name the *current* device (the default when no index is passed) instead of
# hard-coding index 0, so the name always matches the index printed above.
print("Device name:", torch.cuda.get_device_name() if cuda_available else "No GPU")

In [2]:
from huggingface_hub import hf_hub_download
import huggingface_hub
# Root of the local Hugging Face hub cache, wrapped in a Path so later cells
# can join sub-directories onto it.
CACHE_DIR = Path(huggingface_hub.constants.HF_HUB_CACHE)

# Echo the configured cache location so it is visible in the cell output.
print(huggingface_hub.constants.HF_HUB_CACHE)

/common/home/projectgrps/CS707/CS707G3/.cache/huggingface/hub


##### Load the Model

In [None]:
# Location of the Video-LLaVA-7B-hf snapshots under the hub cache
# (assumes the standard hub cache directory layout -- TODO confirm).
MODEL_PATH = CACHE_DIR.joinpath("models--LanguageBind--Video-LLaVA-7B-hf", "snapshots")
# Alternative checkpoint location, kept for reference:
# MODEL_PATH = "/common/public/InternVL2-8B"

In [None]:
!python -m pip install ipykernel


In [4]:
def read_video_pyav(container, indices):
    '''
    Decode selected frames from a video with the PyAV decoder.

    The stream is decoded sequentially from its start, and decoding stops as
    soon as the largest requested index has been passed.

    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`list[int]`): Frame indices to decode. Any order is accepted
            (list or NumPy array); a repeated index is decoded once.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3),
            ordered as they appear in the stream.
    Raises:
        IndexError: If `indices` is empty.
        ValueError: If none of the requested indices exist in the stream.
    '''
    # A set of plain ints gives O(1) membership per decoded frame and removes
    # any dependence on the order in which the caller listed the indices.
    wanted = {int(i) for i in indices}
    if not wanted:
        raise IndexError("indices must contain at least one frame index")
    last_index = max(wanted)

    frames = []
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > last_index:
            # Everything requested has been seen; skip decoding the remainder.
            break
        if i in wanted:
            frames.append(frame)

    if not frames:
        # np.stack would fail here with an opaque message; say what went wrong.
        raise ValueError(
            f"none of the requested frame indices {sorted(wanted)} exist in the video stream"
        )
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

In [5]:
# Single source of truth for the checkpoint so the model and its processor
# cannot drift apart.
MODEL_ID = "LanguageBind/Video-LLaVA-7B-hf"

# Half precision on GPU. On the CPU fallback use float32 instead, because
# float16 inference on CPU is slow and not supported by every operator.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_dtype = torch.float16 if device == "cuda" else torch.float32

model = VideoLlavaForConditionalGeneration.from_pretrained(
    MODEL_ID,
    dtype=model_dtype,
    # device_map="auto",
    # attn_implementation="flash_attention_2"
    # attn_implementation="sdpa"
).to(device)

processor = VideoLlavaProcessor.from_pretrained(MODEL_ID)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

##### Create a Batch of Questions

Reference: [Video-LLaVA documentation, "Mixed media mode"](https://huggingface.co/docs/transformers/model_doc/video_llava#mixed-media-mode)

In [6]:
# Candidate questions to ask about a video clip, one prompt per entry
# (presumably intended for batched prompting, per the section heading above).
questions = [
    "What is happening in this video?",
    "How many people are in the scene?",
    "What is the person doing at the beginning?"
]

In [8]:
prompt = "USER: <video>\nWhat is happening in this video? ASSISTANT:"
video_path = r"./src/data/video_clip/0102_scene_000_central_perk.mp4"
NUM_FRAMES = 8  # number of frames sampled uniformly across the clip

# The context manager closes the container (and its file handle) even when
# decoding raises, instead of leaving it open for the life of the kernel.
with av.open(video_path) as container:
    total_frames = container.streams.video[0].frames
    if total_frames <= 0:
        # Without a frame count the sampling step below would be zero.
        raise ValueError(f"{video_path}: container does not report a frame count")

    # linspace with endpoint=False gives exactly NUM_FRAMES evenly spaced
    # indices; arange with a float step can be off by one through rounding.
    indices = np.linspace(0, total_frames, num=NUM_FRAMES, endpoint=False).astype(int)
    clip = read_video_pyav(container, indices)

device = "cuda" if torch.cuda.is_available() else "cpu"

# Time preprocessing + generation + decoding; video decoding above is excluded.
start_time = time.perf_counter()
inputs = processor(text=prompt, videos=clip, return_tensors="pt")
inputs = inputs.to(device)

# Generate
generate_ids = model.generate(**inputs, max_new_tokens=256)
output_text = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
end_time = time.perf_counter()
elapsed_time = end_time - start_time
print(output_text)
print(f"Script execution time: {elapsed_time:.4f} seconds")

USER: 
What is happening in this video? ASSISTANT: In the video, a man is sitting on a couch with his arms crossed, while a woman is sitting on a chair next to him. They are both watching TV, and the man is talking to the woman.
Script execution time: 1.3337 seconds


In [None]:
# Run Python's garbage collector first so unreachable objects are destroyed
# before the CUDA cache is emptied (presumably why the calls are in this order).
# Objects that are still referenced by a notebook variable are not affected.
gc.collect()
# Ask PyTorch to release cached GPU memory that is no longer in use.
torch.cuda.empty_cache()