In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import re
import os
import av
import gc
import json
from PIL import Image
from pathlib import Path
from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor
from sklearn.metrics import accuracy_score, mean_absolute_error

import torch
import datasets
from trl import SFTTrainer
from torchvision import transforms
from datasets import load_dataset, DatasetDict
from peft import LoraConfig, get_peft_model, PeftModel
from torch.utils.data import Dataset, DataLoader
from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration, BitsAndBytesConfig, set_seed, TrainingArguments, EvalPrediction


import wandb
wandb.login(key='cefd9d24e53c2aa0dc75b785b40045c6e6badced')
# wandb.init(project="accident_detection")

from huggingface_hub import login
login('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')

import warnings
warnings.filterwarnings("ignore")

seed = 42
set_seed(seed)
# torch.set_default_device("cuda")

os.environ["TOKENIZERS_PARALLELISM"] = "false"

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mshashank23088[0m ([33mshashankgsharma[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/shashank23088/.netrc


## DATA PREP

In [2]:
# Input paths
train_accident_dir = "../data/train/accident"
train_non_accident_dir = "../data/train/non_accident"
test_accident_dir = "../data/test/accident"
test_non_accident_dir = "../data/test/non_accident"

train_accident_json = "../data/accident_vehicle_count_YOLO_train.json"
train_non_accident_json = "../data/non_accident_vehicle_count_YOLO_train.json"
test_accident_json = "../data/accident_vehicle_count_YOLO_test.json"
test_non_accident_json = "../data/non_accident_vehicle_count_YOLO_test.json"

# Output paths
train_output_json = "../data/prepared_train_data.json"
test_output_json = "../data/prepared_test_data.json"

repo_name = "shashank23088/llava-onevision-qwen2-7b-traffic"
model_name = "llava-hf/llava-onevision-qwen2-7b-ov-hf"

# Load the JSON files
def load_json(file_path):
    with open(file_path, "r") as f:
        return json.load(f)

In [3]:
train_accident_data = load_json(train_accident_json)
train_non_accident_data = load_json(train_non_accident_json)
test_accident_data = load_json(test_accident_json)
test_non_accident_data = load_json(test_non_accident_json)

In [4]:
train_accident_data['1u69z-wsDIc_000181']

{'car': 6, 'truck': 1, 'bike': 0, 'bicycle': 0}

In [5]:
test_non_accident_data['201804191441002645']

{'car': 12, 'truck': 3, 'bike': 0, 'bicycle': 0}

In [6]:
# Helper function to prepare data
def prepare_data(data, folder, accident_label):
    prepared = []
    for video_id, vehicles in data.items():
        # Construct the path to frames
        video_folder = os.path.join(folder, video_id, "images")
        if not os.path.exists(video_folder):
            video_folder = os.path.join(folder, video_id)  # Fallback if no "images" subfolder
        if not os.path.exists(video_folder):  # Skip if folder doesn't exist
            print(f"Warning: Missing folder for video_id {video_id}")
            continue
        
        # Get all frames
        frames = [
            os.path.join(video_folder, frame)
            for frame in os.listdir(video_folder) if frame.endswith(".jpg")
        ]
        
        # Sort frames for consistency
        frames.sort()
        
        # Skip if no frames are found
        if not frames:
            print(f"Warning: No frames found in folder for video_id {video_id}")
            continue

        # Add congestion_level as placeholder (since it's not available in your data)
        prepared.append({
            "video_id": video_id,
            "frames": frames,
            "labels": {
                "vehicles": vehicles,
                "congestion_level": "unknown",  # Placeholder
                "accident": accident_label
            }
        })
    return prepared


In [7]:
# Prepare train data
train_accident_prepared = prepare_data(train_accident_data, train_accident_dir, "yes")
train_non_accident_prepared = prepare_data(train_non_accident_data, train_non_accident_dir, "no")
train_final_data = train_accident_prepared + train_non_accident_prepared



In [8]:
# Save train data
with open(train_output_json, "w") as f:
    json.dump(train_final_data, f, indent=4)
print(f"Prepared train data saved to {train_output_json}")

Prepared train data saved to ../data/prepared_train_data.json


In [9]:
# Prepare test data
test_accident_prepared = prepare_data(test_accident_data, test_accident_dir, "yes")
test_non_accident_prepared = prepare_data(test_non_accident_data, test_non_accident_dir, "no")
test_final_data = test_accident_prepared + test_non_accident_prepared

In [10]:
# Save test data
with open(test_output_json, "w") as f:
    json.dump(test_final_data, f, indent=4)
print(f"Prepared test data saved to {test_output_json}")

Prepared test data saved to ../data/prepared_test_data.json


## DATA LOADING AND PROCESSING

In [3]:
# Load train and test datasets
def load_data(train_json_path, test_json_path):
    with open(train_json_path, "r") as f:
        train_data = json.load(f)
    with open(test_json_path, "r") as f:
        test_data = json.load(f)
    
    # Convert to Dataset objects
    return DatasetDict({
        "train": datasets.Dataset.from_list(train_data),
        "test": datasets.Dataset.from_list(test_data)
    })

In [4]:
# Load datasets
dataset = load_data(train_output_json, test_output_json)
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['video_id', 'frames', 'labels'],
        num_rows: 197
    })
    test: Dataset({
        features: ['video_id', 'frames', 'labels'],
        num_rows: 100
    })
})


In [5]:
dataset['train']['labels']

[{'accident': 'yes',
  'congestion_level': 'unknown',
  'vehicles': {'bicycle': 0, 'bike': 0, 'car': 6, 'truck': 1}},
 {'accident': 'yes',
  'congestion_level': 'unknown',
  'vehicles': {'bicycle': 2, 'bike': 0, 'car': 1, 'truck': 1}},
 {'accident': 'yes',
  'congestion_level': 'unknown',
  'vehicles': {'bicycle': 0, 'bike': 0, 'car': 10, 'truck': 1}},
 {'accident': 'yes',
  'congestion_level': 'unknown',
  'vehicles': {'bicycle': 0, 'bike': 0, 'car': 1, 'truck': 0}},
 {'accident': 'yes',
  'congestion_level': 'unknown',
  'vehicles': {'bicycle': 0, 'bike': 0, 'car': 1, 'truck': 2}},
 {'accident': 'yes',
  'congestion_level': 'unknown',
  'vehicles': {'bicycle': 0, 'bike': 0, 'car': 6, 'truck': 1}},
 {'accident': 'yes',
  'congestion_level': 'unknown',
  'vehicles': {'bicycle': 0, 'bike': 0, 'car': 14, 'truck': 3}},
 {'accident': 'yes',
  'congestion_level': 'unknown',
  'vehicles': {'bicycle': 0, 'bike': 0, 'car': 4, 'truck': 1}},
 {'accident': 'yes',
  'congestion_level': 'unknown',


In [6]:
def preprocess_dataset(dataset, processor, fixed_frames=8, frame_interval=16, frame_resolution=(112, 112)):
    def process_example(example):
        # Select a fixed number of frames evenly from the start and end
        def select_fixed_frames(frames, fixed_frames, interval, resolution):
            num_frames = len(frames)
            half_frames = fixed_frames // 2

            # Select frames from the start and end with the given interval
            start_frames = frames[::interval][:half_frames]
            end_frames = frames[-1::-interval][:half_frames][::-1]  # Reverse end frames for proper order

            selected_frames = start_frames + end_frames

            # If fewer frames than needed, pad by repeating frames
            while len(selected_frames) < fixed_frames:
                selected_frames.append(selected_frames[-1])

            selected_frames = selected_frames[:fixed_frames]  # Ensure exact number of frames
            processed_frames = [
                np.array(Image.open(frame).convert("RGB").resize(resolution))
                for frame in selected_frames
            ]
            return np.stack(processed_frames)

        # Process frames
        frames = select_fixed_frames(
            example["frames"], fixed_frames, frame_interval, frame_resolution
        )

        # Prepare the label text
        label_text = str({
            "vehicles": example["labels"]["vehicles"],
            "congestion_level": example["labels"]["congestion_level"],
            "accident": example["labels"]["accident"]
        })

        # Include the expected response directly in the input prompt
        # input_text = (
        #     "<|im_start|>user <video>\nGenerate Traffic Report<|im_end|>"
        #     f"<|im_start|>assistant\n{label_text}"
        # )

        conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "video"},
                    {"type": "text", "text": """
                        Generate a detailed report in JSON format on the traffic statistics in this video:
                        {
                            vehicles: {
                                'car': <count>,
                                'truck': <count>,
                                'bike': <count>,
                                'bicycle': <count>
                            },
                            congestion_level: '<low|medium|high>',
                            accident: '<yes|no>'
                        }
                    """},
                ],
            },
        ]

        input_text = processor.apply_chat_template(conversation, add_generation_prompt=False)
        input_text += label_text

        # Tokenize inputs (video frames + text)
        input_encoding = processor(
            text=input_text,
            videos=frames,
            return_tensors="pt",
            padding=True
            # truncation=True,  # Dynamically truncate if needed
        )

        return {
            "input_ids": input_encoding["input_ids"][0],
            "attention_mask": input_encoding["attention_mask"][0],
        }

    # Map the dataset with the processed examples
    processed_dataset = dataset.map(
        lambda x: process_example(x),
        batched=False,
        remove_columns=dataset.column_names,
    )
    return processed_dataset


## FINETUNING WITH SFT AND QLORA

In [7]:
# Load the processor
processor = AutoProcessor.from_pretrained(model_name)

In [8]:
# # Shuffle and select a small sample
# sample_train = dataset["train"].shuffle(seed=42).select(range(10))
# sample_test = dataset["test"].shuffle(seed=42).select(range(10))

# # Preprocess the sample datasets
# sample_train_processed = preprocess_dataset(sample_train, processor)
# sample_test_processed = preprocess_dataset(sample_test, processor)

In [9]:
# sample_train_processed

In [10]:
# len(sample_train_processed['labels'][4])

In [11]:
# len(sample_train_processed['input_ids'][0])

In [12]:
# len(sample_train_processed['input_ids'][4])

In [8]:
dataset["train"] = preprocess_dataset(dataset["train"], processor)
dataset["test"] = preprocess_dataset(dataset["test"], processor)

Map:   0%|          | 0/197 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 197
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 100
    })
})

In [10]:
dataset.push_to_hub("shashank23088/processed-traffic-data")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/shashank23088/processed-traffic-data/commit/db65e7dbeb178f22b03823bf31110c2f79d6511a', commit_message='Upload dataset', commit_description='', oid='db65e7dbeb178f22b03823bf31110c2f79d6511a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/shashank23088/processed-traffic-data', endpoint='https://huggingface.co', repo_type='dataset', repo_id='shashank23088/processed-traffic-data'), pr_revision=None, pr_num=None)

## FINETUNING

In [3]:
dataset = load_dataset("shashank23088/processed-traffic-data")
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 197
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 100
    })
})

In [12]:
# # Shuffle and split the train dataset (80% train, 20% eval)
# split_dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)  # 20% for eval

# # Now you have train and eval datasets in the train_dataset object
# train_dataset = split_dataset['train']  # The training part
# eval_dataset = split_dataset['test']   # The eval part (20%)

In [14]:
# train_dataset

In [15]:
# eval_dataset

In [16]:
# Configure 4-bit quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

# Load the model with 4-bit quantization
model = LlavaOnevisionForConditionalGeneration.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    device_map="auto"
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [17]:
# Configure LoRA
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules="all-linear",
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

In [18]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [19]:
print_trainable_parameters(model)

trainable params: 51425280 || all params: 4613178912 || trainable%: 1.1147471403337585


In [20]:
model.gradient_checkpointing_enable()

In [22]:
# Define the training arguments with memory optimizations
training_args = TrainingArguments(
    output_dir="./llava_finetuned",
    per_device_train_batch_size=1,  # Reduce batch size
    gradient_accumulation_steps=16,  # Accumulate gradients over more steps
    num_train_epochs=5,
    # evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=100,
    learning_rate=2e-5,
    fp16=True,  # Use mixed precision training
    logging_steps=10,
    save_total_limit=3,
    remove_unused_columns=False,
    dataloader_num_workers=4,  # Adjust workers to prevent overload
    report_to="wandb",
    push_to_hub=True,
    hub_model_id=repo_name,
    # load_best_model_at_end=True,
    # metric_for_best_model="accident_accuracy",
    # greater_is_better=True,
    dataloader_pin_memory=True,  # Pin memory for faster data loading
    warmup_steps=100,  # Gradual warm-up of learning rate
)

# Initialize the trainer
trainer = SFTTrainer(
    model=model,
    tokenizer=processor.tokenizer,
    args=training_args,
    train_dataset=dataset['train'],  # Pass the train dataset
    # eval_dataset=eval_dataset,  # Pass the eval dataset
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# training_args = TrainingArguments(
#     output_dir="./llava_finetuned",  # Output directory for model and checkpoints
#     per_device_train_batch_size=1,  # Batch size for training
#     gradient_accumulation_steps=8,  # Accumulate gradients over 8 steps to simulate a larger batch
#     num_train_epochs=3,  # Number of epochs
#     # max_steps=200,  # Optionally, you can set max steps instead of num_train_epochs
#     evaluation_strategy="no",  # Disable evaluation
#     save_strategy="no",  # Disable model saving during training
#     logging_steps=10,  # Log every 10 steps
#     learning_rate=2e-4,  # Learning rate for training
#     fp16=True,  # Use mixed precision training to reduce memory usage
#     save_total_limit=2,  # Limit the number of saved checkpoints
#     remove_unused_columns=False,  # Ensure unused columns are removed from dataset
#     dataloader_num_workers=2,  # Number of workers for loading data
#     report_to="wandb",  # Log metrics to W&B (optional)
#     push_to_hub=True,  # Optionally, push model to Hugging Face Hub
#     hub_model_id=repo_name,  # Hugging Face model ID
# )

# # Initialize the trainer without evaluation
# trainer = SFTTrainer(
#     model=model,
#     tokenizer=processor.tokenizer,
#     args=training_args,
#     train_dataset=dataset['train'],
# )

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [23]:
torch.cuda.empty_cache() 

In [24]:
# Start training
trainer.train()



VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112244406508074, max=1.0…

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
10,11.2771
20,10.5014
30,10.498
40,10.4874
50,11.2392
60,10.4232


TrainOutput(global_step=60, training_loss=10.737729136149088, metrics={'train_runtime': 689.1833, 'train_samples_per_second': 1.429, 'train_steps_per_second': 0.087, 'total_flos': 7.014390786542995e+16, 'train_loss': 10.737729136149088, 'epoch': 4.649746192893401})

In [25]:
torch.cuda.empty_cache()

In [26]:
# Save the model and processor
trainer.save_model("./llava_finetuned_model")
processor.save_pretrained("./llava_finetuned_model")

['./llava_finetuned_model/processor_config.json']

In [27]:
# Push the model to the Hugging Face Hub
model.push_to_hub(repo_name)
processor.push_to_hub(repo_name)

README.md:   0%|          | 0.00/1.78k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.
No files have been modified since last commit. Skipping to prevent empty commit.


CommitInfo(commit_url='https://huggingface.co/shashank23088/llava-onevision-qwen2-7b-traffic/commit/3cbf5a27b44855114e55c21f5db893c9583fd44f', commit_message='Upload processor', commit_description='', oid='3cbf5a27b44855114e55c21f5db893c9583fd44f', pr_url=None, repo_url=RepoUrl('https://huggingface.co/shashank23088/llava-onevision-qwen2-7b-traffic', endpoint='https://huggingface.co', repo_type='model', repo_id='shashank23088/llava-onevision-qwen2-7b-traffic'), pr_revision=None, pr_num=None)

## Merging the adapters with pretrained model

In [None]:
# Load the base model
base_model = LlavaOnevisionForConditionalGeneration.from_pretrained(
    "llava-hf/llava-onevision-qwen2-7b-ov-hf", device_map="auto"
)

# Load the adapters directly from the Hugging Face Hub
adapter_model_path = "shashank23088/llava-onevision-qwen2-7b-traffic"
model_with_adapters = PeftModel.from_pretrained(base_model, adapter_model_path)

# Merge the adapters with the base model
merged_model = model_with_adapters.merge_and_unload()

print("Adapters successfully merged with the base model.")

# Save the fully merged model
merged_model.save_pretrained("./llava_finetuned_model_merged")

# Save the processor associated with the base model
processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-7b-ov-hf")
processor.save_pretrained("./llava_finetuned_model_merged")
print("Merged model and processor saved to ./llava_finetuned_model_merged")

# Push the merged model and processor to the Hugging Face Hub
merge_repo_name = "shashank23088/llava-traffic-finetuned-merged" 
merged_model.push_to_hub(merge_repo_name)
processor.push_to_hub(merge_repo_name)

print(f"Merged model pushed to Hugging Face Hub: https://huggingface.co/{merge_repo_name}")

## INFERENCE

In [5]:
torch.cuda.empty_cache()

In [6]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model = LlavaOnevisionForConditionalGeneration.from_pretrained(f'{repo_name}-merged', quantization_config=quantization_config, device_map='auto')
processor = AutoProcessor.from_pretrained(f'{repo_name}-merged')

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [7]:
def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

# Load the video as an np.array, sampling uniformly 8 frames (can sample more for longer videos, up to 32 frames)
# video_path = hf_hub_download(repo_id="raushan-testing-hf/videos-test", filename="sample_demo_1.mp4", repo_type="dataset")
container = av.open('../data/video.mp4')
total_frames = container.streams.video[0].frames
indices = np.arange(0, total_frames, total_frames / 8).astype(int)
video = read_video_pyav(container, indices)

In [9]:
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "video"},
            {"type": "text", "text": """
                Generate a detailed report in JSON format on the traffic statistics in this video:
                {
                    vehicles: {
                        'car': <count>,
                        'truck': <count>,
                        'bike': <count>,
                        'bicycle': <count>
                    },
                    congestion_level: '<low|medium|high>',
                    accident: '<yes|no>'
                }
            """},
        ],
    },
]

prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(videos=list(video), text=prompt, return_tensors="pt").to("cuda:0", torch.float16)

out = model.generate(**inputs, max_new_tokens=300)
result = processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)
print(result)

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


['user \n\n                Generate a detailed report in JSON format on the traffic statistics in this video:\n                {\n                    vehicles: {\n                        \'car\': <count>,\n                        \'truck\': <count>,\n                        \'bike\': <count>,\n                        \'bicycle\': <count>\n                    },\n                    congestion_level: \'<low|medium|high>\',\n                    accident: \'<yes|no>\'\n                }\n            assistant\n```json\n{\n  "vehicles": {\n    "car": 10,\n    "truck": 2,\n    "bike": 0,\n    "bicycle": 0\n  },\n  "congestion_level": "low",\n  "accident": "no"\n}\n```']


In [10]:
json.loads(''.join(''.join(result[0].split('assistant')[1].strip('\n').strip('```').strip('json').split('\n')).split(' ')))

{'vehicles': {'car': 10, 'truck': 2, 'bike': 0, 'bicycle': 0},
 'congestion_level': 'low',
 'accident': 'no'}

## Evaluation

In [7]:
torch.cuda.empty_cache()

In [8]:
base_dir = '../data/test/'
output_dir = '../data/responses_test'

In [8]:
def read_frames_from_folder(folder_path, frame_paths, step=8):
    """
    Reads every `step` frames from the given paths and stacks them into a numpy array.
    
    Args:
        folder_path (str): The base path of the folder (not used in this case).
        frame_paths (list): List of paths to the individual frames (images).
        step (int): Step size to sample frames. Default is 8.
        
    Returns:
        np.ndarray: Stacked sampled frames as a numpy array of shape (num_sampled_frames, height, width, 3).
    """
    frames = []
    for i, frame_path in enumerate(frame_paths):
        if i % step == 0:  # Sample every `step` frame
            frame = Image.open(frame_path)
            frames.append(np.array(frame.convert('RGB')))
    return np.stack(frames)

In [9]:
def infer(frame_paths, video_id, processor, model, prompt, output_dir):
    """
    Perform inference on a single video and parse the generated output using regex.
    
    Args:
        frame_paths (list): List of paths to the frames (images) for the video.
        video_id (str): Unique identifier for the video.
        processor: Processor for converting frames and text prompt to tensors.
        model: The model used for inference.
        prompt (str): The text prompt to send with the video.
        output_dir (str): Directory to save the generated JSON file.
        
    Returns:
        dict: Parsed JSON output from the model containing traffic statistics, or None if processing failed.
    """
    # Sample every 8th frame for inference
    sampled_frames = read_frames_from_folder(None, frame_paths, step=8)

    # Prepare the inputs for the model
    inputs = processor(videos=sampled_frames, text=prompt, return_tensors="pt").to("cuda:0", torch.float16)

    # Perform inference (generation)
    try:
        out = model.generate(**inputs, max_new_tokens=300)
        result = processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
        
        # Apply regex to extract the JSON part from the model output
        json_str = ''.join(''.join(result.split('assistant')[1].strip('\n').strip('```').strip('json').split('\n')).split(' '))
        json_data = json.loads(json_str.replace("'", "\""))  # Fix single quotes to double quotes
    except Exception as e:
        print(f"Error parsing result for video {video_id}. Raw output: {result}")
        torch.cuda.empty_cache()
        return None

    # Save the output to a JSON file
    output_file = os.path.join(output_dir, f'{video_id}.json')
    with open(output_file, 'w') as f:
        json.dump(json_data, f, indent=4)

    torch.cuda.empty_cache()
    return json_data

In [10]:
def evalDataset(dataset, processor, model, output_dir, prompt):
    """
    Evaluate the dataset by performing inference on each video and calculating evaluation metrics.
    
    Args:
        dataset: The dataset to evaluate (with frames and labels).
        processor: Processor for converting frames and text prompt to tensors.
        model: The model used for inference.
        output_dir (str): Directory to save the generated JSON files.
        prompt (str): The text prompt to send with the video.
        
    Returns:
        tuple: Accuracy for accident detection, MAE scores for vehicle counts.
    """
    true_labels = []
    predicted_labels = []
    
    # Initialize vehicle count data for MAE calculation
    vehicle_counts_true = {'car': [], 'truck': [], 'bike': [], 'bicycle': []}
    vehicle_counts_pred = {'car': [], 'truck': [], 'bike': [], 'bicycle': []}

    # Iterate through the dataset
    for idx, row in enumerate(tqdm(dataset, desc="Evaluating dataset")):
        video_id = row['video_id']
        frames = row['frames']  # Frames should be a list of paths
        labels = row['labels']
        
        # Extract the true labels
        true_accident = 1 if labels['accident'].lower() == 'yes' else 0
        true_vehicle_counts = labels['vehicles']
        
        # Perform inference to get the predicted data
        json_data = infer(frames, video_id, processor, model, prompt, output_dir)
        
        if json_data is None:
            continue  # Skip this video if inference failed
        
        # Get predicted values for accident and vehicles
        predicted_accident = 1 if json_data["accident"].lower() == 'yes' else 0
        predicted_vehicle_counts = json_data["vehicles"]
        
        # Store the results for evaluation
        true_labels.append(true_accident)
        predicted_labels.append(predicted_accident)

        for vehicle_type in ['car', 'truck', 'bike', 'bicycle']:
            vehicle_counts_true[vehicle_type].append(true_vehicle_counts.get(vehicle_type, 0))
            vehicle_counts_pred[vehicle_type].append(predicted_vehicle_counts.get(vehicle_type, 0))

    # Calculate accuracy for accident detection
    accuracy = accuracy_score(true_labels, predicted_labels)

    # Calculate MAE for vehicle counts
    mae_scores = {}
    for vehicle_type in ['car', 'truck', 'bike', 'bicycle']:
        mae_scores[vehicle_type] = mean_absolute_error(vehicle_counts_true[vehicle_type], vehicle_counts_pred[vehicle_type])

    # Print and return the evaluation results
    print(f"Accuracy for Accident Detection: {accuracy}")
    for vehicle_type, mae in mae_scores.items():
        print(f"MAE for {vehicle_type} count: {mae}")
    
    return accuracy, mae_scores

In [11]:
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "video"},
            {"type": "text", "text": """
                Generate a detailed report in JSON format on the traffic statistics in this video:
                {
                    vehicles: {
                        'car': <count>,
                        'truck': <count>,
                        'bike': <count>,
                        'bicycle': <count>
                    },
                    congestion_level: '<low|medium|high>',
                    accident: '<yes|no>'
                }
            """},
        ],
    },
]

prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

In [12]:
torch.cuda.empty_cache()

In [14]:
evalDataset(dataset=dataset['test'], processor=processor, model=model, output_dir=output_dir, prompt=prompt)

Evaluating dataset:   0%|          | 0/100 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Settin

Accuracy for Accident Detection: 0.8
MAE for car count: 2.68
MAE for truck count: 0.72
MAE for bike count: 0.7
MAE for bicycle count: 0.55


(0.8,
 {'car': np.float64(2.68),
  'truck': np.float64(0.72),
  'bike': np.float64(0.7),
  'bicycle': np.float64(0.55)})