In [None]:
import os
# Process-wide environment settings, applied in one call before the heavier
# imports in the following cell run.
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "8",  # hard-coded GPU selection; adjust for your machine
        "PYTHONWARNINGS": "ignore",
        "TOKENIZERS_PARALLELISM": "false",
    }
)

In [None]:
import argparse
import torch

from llavavid.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llavavid.conversation import conv_templates, SeparatorStyle
from llavavid.model.builder import load_pretrained_model
from llavavid.utils import disable_torch_init
from llavavid.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria

import json
import os
import math
from tqdm import tqdm
from decord import VideoReader, cpu

from transformers import AutoConfig

import time

import numpy as np


def split_list(lst, n):
    """Split a list into at most n contiguous chunks of (roughly) equal size.

    Args:
        lst: The sequence to split.
        n: Desired number of chunks. For positive n the result has at most n
            chunks; it has fewer when the rounded-up chunk size already covers
            the whole list (e.g. 3 items with n=5 gives 3 single-item chunks).

    Returns:
        A list of slices of ``lst`` in their original order. An empty input
        yields an empty list.
    """
    if len(lst) == 0:
        # chunk_size would be 0 here and range() rejects a zero step.
        return []
    chunk_size = math.ceil(len(lst) / n)  # rounded up so every item is covered
    return [lst[i: i + chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    """Return the chunk at index k of ``split_list(lst, n)``."""
    return split_list(lst, n)[k]


def parse_args(args=None):
    """
    Build the argument parser for the video demo and parse ``args``.

    Args:
        args: Optional list of argument strings. When None, argparse reads
            ``sys.argv[1:]``.

    Returns:
        argparse.Namespace with the parsed options.
    """

    def _is_true(raw):
        # Only the text "true" (any casing) counts as True; anything else is False.
        return str(raw).lower() == 'true'

    cli = argparse.ArgumentParser()

    # Required input/output locations.
    required_options = (
        ("--video_path", "Path to the video files."),
        ("--output_dir", "Directory to save the model results JSON."),
        ("--output_name", "Name of the file for storing results JSON."),
    )
    for flag, description in required_options:
        cli.add_argument(flag, help=description, required=True)

    # Model checkpoint, conversation template and chunk selection.
    cli.add_argument("--model-path", default="facebook/opt-350m", type=str)
    cli.add_argument("--model-base", default=None, type=str)
    cli.add_argument("--conv-mode", default=None, type=str)
    cli.add_argument("--chunk-idx", default=0, type=int)

    # Multimodal resampler / spatial pooling settings.
    cli.add_argument("--mm_resampler_type", default="spatial_pool", type=str)
    cli.add_argument("--mm_spatial_pool_stride", default=4, type=int)
    cli.add_argument("--mm_spatial_pool_out_channels", default=1024, type=int)
    cli.add_argument("--mm_spatial_pool_mode", default="average", type=str)

    # Image handling settings.
    cli.add_argument("--image_aspect_ratio", default="anyres", type=str)
    cli.add_argument(
        "--image_grid_pinpoints",
        default="[(224, 448), (224, 672), (224, 896), (448, 448), (448, 224), (672, 224), (896, 224)]",
        type=str,
    )
    cli.add_argument("--mm_patch_merge_type", default="spatial_unpad", type=str)

    # Run-time switches.
    cli.add_argument("--overwrite", default=True, type=_is_true)
    cli.add_argument("--for_get_frames_num", default=4, type=int)
    cli.add_argument("--load_8bit", default=False, type=_is_true)

    return cli.parse_args(args=args)


def load_video(video_path, args):
    """
    Read ``args.for_get_frames_num`` evenly spaced frames from a video file.

    Args:
        video_path: Path passed to decord's ``VideoReader``.
        args: Object exposing ``for_get_frames_num``, the number of frames to sample.

    Returns:
        The sampled frames, converted with ``.asnumpy()``.
    """
    reader = VideoReader(video_path, ctx=cpu(0))
    last_index = len(reader) - 1
    # Evenly spaced indices from the first to the last frame, cast to int.
    # (An earlier variant, left disabled by the author, stepped through the
    # video with a stride of round(reader.get_avg_fps()) frames instead.)
    sampled_indices = np.linspace(0, last_index, args.for_get_frames_num, dtype=int).tolist()
    return reader.get_batch(sampled_indices).asnumpy()

In [None]:
"""
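Parse the demo arguments and load the pretrained model.

Defines ``args`` from a fixed argument list, then loads ``tokenizer``, ``model``,
``image_processor`` and ``context_len`` via ``load_pretrained_model``.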
"""

# Fixed argument list standing in for the command line of the original script.
args = parse_args([
    '--model-path=lmms-lab/LLaVA-NeXT-Video-7B-DPO',
    '--video_path=./out.mp4',
    '--output_dir=./work_dirs/video_demo/LLaVA-NeXT-Video-7B-DPO_vicuna_v1_frames_32_stride_2',
    '--output_name=pred',
    '--chunk-idx=-1',
    '--overwrite=True',
    '--mm_spatial_pool_stride=4',
    '--for_get_frames_num=32',
    '--conv-mode=vicuna_v1'
])

# Initialize the model
model_name = get_model_name_from_path(args.model_path)

if args.overwrite:
    # Settings pushed into the model config in place of the checkpoint's own values.
    overwrite_config = {
        "mm_resampler_type": args.mm_resampler_type,
        "mm_spatial_pool_stride": args.mm_spatial_pool_stride,
        "mm_spatial_pool_out_channels": args.mm_spatial_pool_out_channels,
        "mm_spatial_pool_mode": args.mm_spatial_pool_mode,
        "patchify_video_feature": False,
    }

    cfg_pretrained = AutoConfig.from_pretrained(args.model_path)

    # 16 when the vision tower name contains "224", otherwise 24.
    # NOTE(review): presumably the per-side patch count of the vision tower - confirm.
    patches_per_side = 16 if "224" in cfg_pretrained.mm_vision_tower else 24
    # suppose the length of text tokens is around 1000, from bo's report
    least_token_number = (
        args.for_get_frames_num * (patches_per_side // args.mm_spatial_pool_stride) ** 2 + 1000
    )

    # How many multiples of 4096 tokens are needed to hold the estimate.
    scaling_factor = math.ceil(least_token_number / 4096)

    if scaling_factor >= 2:
        pretrained_name = cfg_pretrained._name_or_path.lower()
        # Linear RoPE scaling is only requested for non-Mistral checkpoints whose name contains "7b".
        if "mistral" not in pretrained_name and "7b" in pretrained_name:
            print(float(scaling_factor))
            overwrite_config["rope_scaling"] = {"factor": float(scaling_factor), "type": "linear"}
        overwrite_config["max_sequence_length"] = 4096 * scaling_factor
        overwrite_config["tokenizer_model_max_length"] = 4096 * scaling_factor

    tokenizer, model, image_processor, context_len = load_pretrained_model(
        args.model_path, args.model_base, model_name, load_8bit=args.load_8bit, overwrite_config=overwrite_config
    )
else:
    # Honor --load_8bit on this path too; previously it was silently ignored
    # whenever --overwrite was false.
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        args.model_path, args.model_base, model_name, load_8bit=args.load_8bit
    )

In [None]:
# Create the output directory if it doesn't exist.
# exist_ok=True replaces the separate exists() check, so there is no window
# between checking and creating; it still raises if the path exists but is
# not a directory.
os.makedirs(args.output_dir, exist_ok=True)

In [15]:
# Video to analyse in this cell (set here rather than taken from args.video_path).
video_path = "./out_d20.mp4"
sample_set = dict()

# Prompts tried previously, kept for reference; the active prompt is assigned after them.
# system = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, polite answers to the user's questions and strictly follows the user's requirement."
# question = "Please provide a detailed description of the video, focusing on the main subjects, their actions, and the background scenes"
# question = "What does this video describe? A. Buiding B.Forest C.coutryside D.Moon \nAnswer with the option's letter from the given choices directly."
# question = "Please provide a brief description of the video, focusing on the main subjects, their actions, the background scenes, and their cultural features."
# question = "Please provide attributes about the video, focusing on the main subjects, their actions, the background scenes, and their cultural features. The formats should be like: ['jazz','soft', 'latin','leizure']."
# question = "Please provide a brief description of the video, focusing on the main subjects, their actions and facial expressions, and the background scenes. You should focus on the feelings and the atmosphere from the video."
# question = "Please provide a summary of the video, focusing on the feelings and atmosphere from the main subjects and the background scenes. Don't describe the actions of the subjects directly, but focus on the feelings and atmosphere from the video."
# question = "Refering to the lyrics of the video, provide a brief description focusing on the feelings and atmosphere from the main subjects and the background scenes. Lyrics: This is my fight song (hey) Take back my life song (hey) Prove I'm alright song (hey, ha) My power's turned on (hey) Starting right now, I'll be strong (hey) I'll play my fight song (hey) And I don't really care if nobody else believes (ha) 'Cause I've still got a lot of fight left in me."
# question = "Refering to the lyrics of the song, provide a brief description focusing on the feelings and atmosphere of the video. Lyrics: 'Like a small boat on the ocean. Sending big waves into motion. Like how a single word. Can make a heart open. I might only have one match. But I can make an explosion'"
# question = "Think step by step. Refering to the lyrics of the song, provide a short single sentence summary focusing on the feelings and atmosphere of the video. Lyrics: 'Like a small boat on the ocean. Sending big waves into motion. Like how a single word. Can make a heart open. I might only have one match. But I can make an explosion'"
# question = "Think step by step. Refering to the lyrics of the song, provide a short single sentence summary focusing on the feelings and atmosphere of the video. Lyrics: 'and this is crazy, but here's my number, so call me, maybe? It's hard to look right, at you baby, but here's my number, so call me, maybe? Hey, I just met you, and this is crazy, but here's my number, so call me, maybe? And all the other boys, try to chase me, but here's my number, so call me, maybe?'"
# question = "Think step by step. Refering to the lyrics of the song, provide a short single sentence summary focusing on the feelings and atmosphere of the video. Lyrics: 'Like a small boat on the ocean. Sending big waves into motion. Like how a single word. Can make a heart open. I might only have one match. But I can make an explosion. And all those things I didn't say. Were wrecking balls inside my brain. I will scream them loud tonight. Can you hear my voice this time?. This is my fight song (hey). Take back my life song (hey). Prove I'm alright song (hey, ha). My power's turned on (hey). Starting right now, I'll be strong (hey). I'll play my fight song (hey). And I don't really care if nobody else believes (ha). 'Cause I've still got a lot of fight left in me. '"
# question = "Think step by step. Provide a short single sentence summary focusing on the feelings and atmosphere of the video."
# question = "Watch the given video and respond to the following question: does the women first appear in the pub or in her room?"

# Active prompt, split across lines with implicit string concatenation.
question = (
    "If the given video was split into 30 equal duration segments, "
    "focusing on the feelings and atmosphere from the visuals of each segment, "
    "tell me which segment most possibly indicate a shift in atmosphere and why? "
    "Give me a short answer."
)


# Check that the video exists before doing any work. Without this guard a
# missing file would leave `video` undefined (NameError further down) or, in a
# live kernel, silently reuse the tensor left over from an earlier run.
if not os.path.exists(video_path):
    raise FileNotFoundError(f"Video file not found: {video_path}")

print(video_path)
# Sample frames, run them through the image processor, and move the
# half-precision pixel values to the GPU. generate() receives them as a
# one-element list.
video = load_video(video_path, args)
video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].half().cuda()
video = [video]

# Prefix the question with the image placeholder token(s) the model config asks for.
qs = question
if model.config.mm_use_im_start_end:
    qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + "\n" + qs
else:
    qs = DEFAULT_IMAGE_TOKEN + "\n" + qs

# Build the prompt from the selected conversation template: one user turn
# followed by an empty assistant turn for the model to complete.
conv = conv_templates[args.conv_mode].copy()

# To use a custom system prompt, define `system` and set `conv.system = system` here.

conv.append_message(conv.roles[0], qs)
conv.append_message(conv.roles[1], None)
prompt = conv.get_prompt()

input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).cuda()
# Mask is 1 wherever the token differs from the tokenizer's pad token id.
attention_masks = input_ids.ne(tokenizer.pad_token_id).long().cuda()

# Stop generation once the template's separator string shows up in the output.
stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
keywords = [stop_str]
stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

# NOTE: sampling is enabled (do_sample=True) and no seed is set in this cell,
# so repeated runs may produce different answers.
with torch.inference_mode():
    start_time = time.time()
    output_ids = model.generate(
        inputs=input_ids,
        images=video,
        attention_mask=attention_masks,
        modalities="video",
        do_sample=True,
        temperature=0.2,
        max_new_tokens=1024,
        use_cache=True,
        stopping_criteria=[stopping_criteria]
    )
    end_time = time.time()
    print(f"Time taken for inference: {end_time - start_time} seconds")

outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
print(f"Question: {prompt}\n")
print(f"Response: {outputs}\n")
# Drop a trailing stop string, if present, from the stored answer.
if outputs.endswith(stop_str):
    outputs = outputs[: -len(stop_str)]
outputs = outputs.strip()

./out_d20.mp4
Time taken for inference: 2.099351167678833 seconds
Question: A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>
If the given video was split into 30 equal duration segments, focusing on the feelings and atmosphere from the visuals of each segment, tell me which segment most possibly indicate a shift in atmosphere and why? Give me a short answer. ASSISTANT:

Response: Segment 15.

