In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset

In [3]:
dataset = pd.read_csv('./Charades/Charades/charades_v1_train.csv')
dataset

Unnamed: 0,id,subject,scene,quality,relevance,verified,script,objects,descriptions,actions,length
0,46GP8,HR43,Kitchen,6.0,7.0,Yes,A person cooking on a stove while watching som...,food;stove;window,A person cooks food on a stove before looking ...,c092 11.90 21.20;c147 0.00 12.60,24.83
1,N11GT,0KZ7,Stairs,6.0,7.0,Yes,"One person opens up a folded blanket, then sne...",blanket;broom;floor,Person at the bottom of the staircase shakes a...,c098 8.60 14.20;c075 0.00 11.70;c127 0.00 15.2...,18.33
2,0IH69,6RE8,Bedroom,6.0,5.0,Yes,A person is seen leaving a cabinet. They then ...,book;box;cabinet;shelf,A person is standing in a bedroom. They walk o...,,30.25
3,KRF68,YA10,Laundry room,6.0,7.0,Yes,A person runs into their laundry room. They gr...,clothes;door;phone,A person runs in and shuts door. The person gr...,c018 22.60 27.80;c141 4.10 9.60;c148 10.30 25....,30.33
4,MJO7C,6RE8,Kitchen,6.0,6.0,Yes,A person runs into their pantry holding a bott...,cup;phone,A person runs in place while holding a bottle ...,c015 0.00 32.00;c107 0.00 32.00,31.38
...,...,...,...,...,...,...,...,...,...,...,...
7980,7K2CS,HJZQ,Garage,6.0,6.0,Yes,Person enters the garage while sneezing. Perso...,chair;clothes;door;food;sandwich;shirt;spoon,"A enters through a doorway, sneezes, then clos...",c065 17.60 31.00;c067 17.60 31.00;c153 0.00 5....,30.08
7981,S2A89,KL48,Bathroom,7.0,7.0,Yes,"A person takes a chair and walks it over, plac...",chair;door,A PERSON IS TAKING A CHAIR FROM ONE ROOM TO TH...,c006 4.00 10.80;c141 4.40 10.90;c151 12.80 20....,19.29
7982,01O27,18IT,Bathroom,6.0,7.0,Yes,A person enters a bathroom and closes the door...,door;floor;mirror,A person is walking towards the bathroom. A pe...,c006 5.10 11.50;c008 0.50 6.60;c124 39.00 47.0...,46.08
7983,2MJ72,6RE8,Bedroom,6.0,6.0,Yes,A person opens a window in their laundry room....,door;towel;window,A person opens a window and looks out of it. ...,c006 11.00 17.00;c037 20.70 31.00;c092 0.60 8....,30.25


In [4]:
def IDtoVideoPath(ID):
    return './Charades_v1_480/Charades_v1_480/' + ID + '.mp4'

In [5]:
dataset['video_path'] = dataset['id'].apply(IDtoVideoPath)
dataset

Unnamed: 0,id,subject,scene,quality,relevance,verified,script,objects,descriptions,actions,length,video_path
0,46GP8,HR43,Kitchen,6.0,7.0,Yes,A person cooking on a stove while watching som...,food;stove;window,A person cooks food on a stove before looking ...,c092 11.90 21.20;c147 0.00 12.60,24.83,./Charades_v1_480/Charades_v1_480/46GP8.mp4
1,N11GT,0KZ7,Stairs,6.0,7.0,Yes,"One person opens up a folded blanket, then sne...",blanket;broom;floor,Person at the bottom of the staircase shakes a...,c098 8.60 14.20;c075 0.00 11.70;c127 0.00 15.2...,18.33,./Charades_v1_480/Charades_v1_480/N11GT.mp4
2,0IH69,6RE8,Bedroom,6.0,5.0,Yes,A person is seen leaving a cabinet. They then ...,book;box;cabinet;shelf,A person is standing in a bedroom. They walk o...,,30.25,./Charades_v1_480/Charades_v1_480/0IH69.mp4
3,KRF68,YA10,Laundry room,6.0,7.0,Yes,A person runs into their laundry room. They gr...,clothes;door;phone,A person runs in and shuts door. The person gr...,c018 22.60 27.80;c141 4.10 9.60;c148 10.30 25....,30.33,./Charades_v1_480/Charades_v1_480/KRF68.mp4
4,MJO7C,6RE8,Kitchen,6.0,6.0,Yes,A person runs into their pantry holding a bott...,cup;phone,A person runs in place while holding a bottle ...,c015 0.00 32.00;c107 0.00 32.00,31.38,./Charades_v1_480/Charades_v1_480/MJO7C.mp4
...,...,...,...,...,...,...,...,...,...,...,...,...
7980,7K2CS,HJZQ,Garage,6.0,6.0,Yes,Person enters the garage while sneezing. Perso...,chair;clothes;door;food;sandwich;shirt;spoon,"A enters through a doorway, sneezes, then clos...",c065 17.60 31.00;c067 17.60 31.00;c153 0.00 5....,30.08,./Charades_v1_480/Charades_v1_480/7K2CS.mp4
7981,S2A89,KL48,Bathroom,7.0,7.0,Yes,"A person takes a chair and walks it over, plac...",chair;door,A PERSON IS TAKING A CHAIR FROM ONE ROOM TO TH...,c006 4.00 10.80;c141 4.40 10.90;c151 12.80 20....,19.29,./Charades_v1_480/Charades_v1_480/S2A89.mp4
7982,01O27,18IT,Bathroom,6.0,7.0,Yes,A person enters a bathroom and closes the door...,door;floor;mirror,A person is walking towards the bathroom. A pe...,c006 5.10 11.50;c008 0.50 6.60;c124 39.00 47.0...,46.08,./Charades_v1_480/Charades_v1_480/01O27.mp4
7983,2MJ72,6RE8,Bedroom,6.0,6.0,Yes,A person opens a window in their laundry room....,door;towel;window,A person opens a window and looks out of it. ...,c006 11.00 17.00;c037 20.70 31.00;c092 0.60 8....,30.25,./Charades_v1_480/Charades_v1_480/2MJ72.mp4


In [6]:
from IPython.display import Video, display

# Display the first video in the dataset
video_path = dataset['video_path'].iloc[0]
display(Video(video_path, embed=True))

In [7]:
%pip install av
%pip install transformers
%pip install accelerate>=0.26.0
%pip install sentencepiece

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
print(torch.cuda.is_available())  # Should return True if GPU is available

False


In [12]:
import av
from huggingface_hub import hf_hub_download
from transformers import LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor, AutoTokenizer, AutoModelForCausalLM

def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

# Load the model in half-precision
model = LlavaNextVideoForConditionalGeneration.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf", torch_dtype=torch.float16, device_map="auto")
processor = LlavaNextVideoProcessor.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")

# Load the video as an np.array, sampling uniformly 8 frames (can sample more for longer videos)
video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")
container = av.open(video_path)
total_frames = container.streams.video[0].frames
indices = np.arange(0, total_frames, total_frames / 8).astype(int)
video = read_video_pyav(container, indices)

conversation = [
    {

        "role": "user",
        "content": [
            {"type": "text", "text": "Why is this video funny?"},
            {"type": "video"},
            ],
    },
]

prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(text=prompt, videos=video, return_tensors="pt")

out = model.generate(**inputs, max_new_tokens=60)
processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)

Loading checkpoint shards: 100%|██████████| 3/3 [00:10<00:00,  3.64s/it]
Some parameters are on the meta device because they were offloaded to the cpu and disk.
Some kwargs in processor config are unused and will not have any effect: num_additional_image_tokens. 
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


RuntimeError: "slow_conv2d_cpu" not implemented for 'Half'