In [None]:
!pip install av --quiet
!pip install -U transformers --quiet

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import json
import re
import os
import av
import gc
from tqdm.notebook import tqdm
from PIL import Image
from huggingface_hub import hf_hub_download
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

import torch
from transformers import AutoTokenizer, AutoProcessor, LlavaOnevisionForConditionalGeneration, Qwen2VLForConditionalGeneration

## Exploration

In [None]:
# Checkpoint used for the exploratory run; the weights are requested in fp16
# and placed on the available devices via device_map="auto".
checkpoint_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"

model = LlavaOnevisionForConditionalGeneration.from_pretrained(
    checkpoint_name,
    torch_dtype=torch.float16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(checkpoint_name)

In [None]:
def read_frames_from_folder(folder_path, indices):
    '''
    Reads the frames at the requested positions from a folder of image files.

    Files are ordered by sorted filename and `indices` are positions in that ordering.
    Each requested position is decoded at most once and the frames are returned in
    ascending position order; positions outside the folder's range are ignored.

    Args:
        folder_path (str): Path to the folder containing frames.
        indices (Iterable[int]): Frame positions to decode (list or 1-D integer array, in any order).
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    Raises:
        ValueError: If no frame is selected (empty `indices`, empty folder, or every
            position out of range).
    '''
    # List all files in the folder and sort them to maintain the correct order
    frame_files = sorted(os.listdir(folder_path))

    # De-duplicate and order the positions once instead of scanning `indices` for every file;
    # this also stops unsorted input from cutting the selection short.
    wanted = sorted({int(i) for i in indices})
    selected = [i for i in wanted if 0 <= i < len(frame_files)]
    if not selected:
        raise ValueError(
            f'No frames selected from {folder_path!r}: '
            f'{len(frame_files)} file(s) available, requested positions {wanted}'
        )

    frames = []
    for i in selected:
        frame_path = os.path.join(folder_path, frame_files[i])
        # The context manager closes the file handle once the pixels are copied into the array
        with Image.open(frame_path) as frame:
            frames.append(np.array(frame.convert('RGB')))

    return np.stack(frames)

In [None]:
# Frame folder of the single clip used for the exploratory run.
video_folder = '/kaggle/input/dota-100/frames_100/accident/0qfbmt4G8Rw_003068/images'
total_frames_folder = len(os.listdir(video_folder))

# Pick 8 evenly spaced positions: step through the clip with a fractional
# stride and truncate each stop to an integer frame index.
frame_stride = total_frames_folder / 8
indices_folder = np.arange(start=0, stop=total_frames_folder, step=frame_stride).astype(int)

video1 = read_frames_from_folder(video_folder, indices_folder)

In [None]:
# Instruction sent together with the clip. The indentation inside the string
# is part of the prompt text and is kept exactly as written.
report_request1 = """
                Generate a detailed report in JSON format on the traffic statistics in this video:
                {
                    vehicles: {
                        'car': <count>,
                        'truck': <count>,
                        'bike': <count>,
                        'bicycle': <count>
                    },
                    congestion_level: '<low|medium|high>',
                    accident: '<yes|no>',
                    user_query_response: '<Yes|No>, the video contains a person wearing a red-colored t-shirt?'
                }
            """

# A single user turn: the video placeholder followed by the text instruction.
conversation1 = [
    dict(
        role="user",
        content=[
            dict(type="video"),
            dict(type="text", text=report_request1),
        ],
    ),
]

In [None]:
# Render the conversation into the model's prompt string.
prompt1 = processor.apply_chat_template(conversation1, add_generation_prompt=True)

# Pack the sampled frames with the prompt, then move the batch to cuda:0 in fp16.
inputs1 = processor(
    text=prompt1,
    videos=list(video1),
    return_tensors="pt",
).to("cuda:0", torch.float16)

# Generate at most 120 new tokens and keep the first (only) decoded sequence.
out1 = model.generate(**inputs1, max_new_tokens=120)
decoded_batch1 = processor.batch_decode(
    out1,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True,
)
decoded_output = decoded_batch1[0]

In [None]:
# Preview of the cleaned text: everything after the first 'json' marker, with newlines,
# spaces and backticks dropped and single quotes turned into double quotes.
decoded_output[decoded_output.find('json') + 4:].translate(str.maketrans({'\n': None, ' ': None, "'": '"', '`': None}))

In [None]:
# Keep only what follows the first 'json' marker in the decoded output.
json_text = decoded_output[decoded_output.find('json') + 4:]
# Drop newlines, spaces and backticks and swap single for double quotes before parsing.
json_text = json_text.translate(str.maketrans({'\n': None, ' ': None, "'": '"', '`': None}))
json_data = json.loads(json_text)
print(json.dumps(json_data))

In [None]:
# Persist the parsed report, keyed by the clip's frame folder, as indented JSON.
output_file = 'video_data.json'
data_to_save = {video_folder: json_data}
with open(output_file, 'w') as f:
    f.write(json.dumps(data_to_save, indent=4))

## Looping through all the videos

In [None]:
# NOTE: this redefines read_frames_from_folder from the Exploration section; when the
# notebook runs top to bottom, this later definition is the one used by infer().
def read_frames_from_folder(folder_path, indices):
    '''
    Reads the frames at the requested positions from a folder of image files.

    Files are ordered by sorted filename and `indices` are positions in that ordering.
    Each requested position is decoded at most once and the frames are returned in
    ascending position order; positions outside the folder's range are ignored.

    Args:
        folder_path (str): Path to the folder containing frames.
        indices (Iterable[int]): Frame positions to decode (list or 1-D integer array, in any order).
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    Raises:
        ValueError: If no frame is selected (empty `indices`, empty folder, or every
            position out of range).
    '''
    # List all files in the folder and sort them to maintain the correct order
    frame_files = sorted(os.listdir(folder_path))

    # De-duplicate and order the positions once instead of scanning `indices` for every file;
    # this also stops unsorted input from cutting the selection short.
    wanted = sorted({int(i) for i in indices})
    selected = [i for i in wanted if 0 <= i < len(frame_files)]
    if not selected:
        raise ValueError(
            f'No frames selected from {folder_path!r}: '
            f'{len(frame_files)} file(s) available, requested positions {wanted}'
        )

    frames = []
    for i in selected:
        frame_path = os.path.join(folder_path, frame_files[i])
        # The context manager closes the file handle once the pixels are copied into the array
        with Image.open(frame_path) as frame:
            frames.append(np.array(frame.convert('RGB')))

    return np.stack(frames)

In [None]:
base_dir = '../data/test/'


def _parse_report(response_text):
    '''
    Extracts and parses the JSON object contained in the model's reply.

    The span from the first '{' to the last '}' is parsed as-is first, so valid JSON keeps
    the spaces inside its string values. If that fails, a looser clean-up is applied
    (newlines, spaces and backticks removed, single quotes turned into double quotes)
    and parsing is retried.

    Args:
        response_text (str): Text generated by the model.
    Returns:
        The parsed JSON value (a dict for a well-formed report).
    Raises:
        ValueError: If the text holds no '{...}' span; json.JSONDecodeError (a ValueError
            subclass) if the span cannot be parsed even after the clean-up.
    '''
    start = response_text.find('{')
    end = response_text.rfind('}')
    if start == -1 or end < start:
        raise ValueError(f'No JSON object found in model output: {response_text!r}')

    candidate = response_text[start:end + 1]
    try:
        return json.loads(candidate)
    except json.JSONDecodeError:
        # Replies that copy the single quotes of the prompt template are not valid JSON;
        # retry with the looser clean-up.
        loosened = candidate.translate(str.maketrans({'\n': None, ' ': None, "'": '"', '`': None}))
        return json.loads(loosened)


def infer(video_folder, video_name):
    '''
    Runs the model on one video (a folder of frame images) and returns its accident prediction.

    Relies on the notebook globals `model`, `processor` and `prompt` and on
    `read_frames_from_folder`. The parsed report is also written to
    ./responses/<video_name>.json.

    Args:
        video_folder (str): Class sub-folder under `base_dir`; for 'accident' the frames are
            read from the video's 'images' sub-folder.
        video_name (str): Name of the video's folder inside `video_folder`.
    Returns:
        int: 1 if the report's "accident" field is "yes" (case-insensitive), otherwise 0.
    Raises:
        ValueError: If the frame folder is empty or the reply contains no parsable JSON object.
        KeyError: If the parsed report has no "accident" field.
    '''
    video_full_path = os.path.join(base_dir, video_folder, video_name)
    if video_folder == 'accident':
        video_full_path = os.path.join(video_full_path, 'images')

    total_frames = len(os.listdir(video_full_path))
    if total_frames == 0:
        # Fail with a clear message instead of an opaque zero-step error from np.arange
        raise ValueError(f'No frames found in {video_full_path!r}')

    # Select indices (8 evenly spaced frames)
    indices = np.arange(0, total_frames, total_frames / 8).astype(int)
    video = read_frames_from_folder(video_full_path, indices)

    inputs = processor(videos=list(video), text=prompt, return_tensors="pt").to("cuda:0", torch.float16)

    out = model.generate(**inputs, max_new_tokens=120)
    # generate() returns the prompt tokens followed by the continuation. Decode only the
    # continuation so the '{...}' template inside the prompt cannot be mistaken for the answer.
    prompt_length = inputs["input_ids"].shape[1]
    response_text = processor.batch_decode(
        out[:, prompt_length:], skip_special_tokens=True, clean_up_tokenization_spaces=True
    )[0]

    json_data = _parse_report(response_text)

    output_dir = './responses'
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, f'{video_name}.json')
    with open(output_file, 'w') as f:
        json.dump(json_data, f, indent=4)

    # str() tolerates a non-string value (e.g. a JSON boolean) instead of crashing on .lower()
    if str(json_data["accident"]).strip().lower() == "yes":
        return 1
    return 0

In [None]:
def evalVideos(max_videos=20):
    '''
    Evaluates accident prediction on the 'accident' and 'non_accident' folders under `base_dir`.

    For each folder, the first `max_videos` entries (by sorted name) are passed to `infer`
    and the predictions are compared with the folder's label (accident = 1, non_accident = 0).

    Args:
        max_videos (int | None): Maximum number of videos evaluated per folder;
            None evaluates every video. Defaults to 20.
    Returns:
        tuple: (accuracy, f1, precision, recall) for the binary accident label.
    '''
    video_folders = {'accident': 1, 'non_accident': 0}
    true_labels = []
    predicted_labels = []

    for folder, actual_label in video_folders.items():
        video_folder_path = os.path.join(base_dir, folder)
        # os.listdir returns entries in arbitrary order; sort so the evaluated subset
        # is the same on every run and machine.
        videos = sorted(os.listdir(video_folder_path))[:max_videos]
        for video_name in tqdm(videos, desc=f'Processing {folder} Videos'):

            # Infer the predicted label for the video
            predicted_label = infer(folder, video_name)

            # Append the actual and predicted labels for evaluation
            true_labels.append(actual_label)
            predicted_labels.append(predicted_label)

            # Collect garbage first so the tensors it frees can be released by empty_cache()
            gc.collect()
            torch.cuda.empty_cache()

    # Calculate evaluation metrics
    accuracy = accuracy_score(true_labels, predicted_labels)
    f1 = f1_score(true_labels, predicted_labels, average='binary')
    precision = precision_score(true_labels, predicted_labels, average='binary')
    recall = recall_score(true_labels, predicted_labels, average='binary')

    # Print and return the metrics
    print(f"Accuracy: {accuracy}")
    print(f"F1 Score: {f1}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")

    return accuracy, f1, precision, recall

In [None]:
MODEL_ID = "llava-hf/llava-onevision-qwen2-7b-ov-hf"

# Release any model still bound from an earlier cell *before* loading again; otherwise the
# old and the new copy are both alive while from_pretrained runs.
model = None
gc.collect()
torch.cuda.empty_cache()

model = LlavaOnevisionForConditionalGeneration.from_pretrained(MODEL_ID, torch_dtype=torch.float16, device_map="auto")
processor = AutoProcessor.from_pretrained(MODEL_ID)

# Instruction sent with every video. The indentation inside the string is part
# of the prompt text and is kept exactly as written.
report_request = """
                Generate a detailed report in JSON format on the traffic statistics in this video:
                {
                    vehicles: {
                        'car': <count>,
                        'truck': <count>,
                        'bike': <count>,
                        'bicycle': <count>
                    },
                    accident: '<yes|no>',
                    user_query_response: '<Yes|No>, the video contains a person wearing a red-colored t-shirt?'
                }
            """

# A single user turn: the video placeholder followed by the text instruction.
conversation = [
    dict(
        role="user",
        content=[
            dict(type="video"),
            dict(type="text", text=report_request),
        ],
    ),
]

# Render the conversation into the prompt string once; infer() reads this
# module-level `prompt` for every video it processes.
prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [None]:
# Run the evaluation over both class folders. evalVideos() prints the metrics and
# returns (accuracy, f1, precision, recall), which is also shown as the cell result.
evalVideos()

Processing accident Videos:   0%|          | 0/20 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pa

Processing non_accident Videos:   0%|          | 0/20 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Accuracy: 0.65
F1 Score: 0.46153846153846156
Precision: 1.0
Recall: 0.3


(0.65, np.float64(0.46153846153846156), np.float64(1.0), np.float64(0.3))

In [None]:
!zip -r responses.zip /kaggle/working/responses

In [None]:
# Write the installed packages and their versions, as reported by pip, to requirements.txt.
!pip freeze > requirements.txt