In [1]:
import torch
import numpy as np
import argparse
import pickle
import cv2
from PIL import Image
import pandas as pd
import os
from collections import Counter

from clip_interrogator import Config, Interrogator
import torch.nn.functional as F
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText
from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation
from transformers import AutoProcessor, Blip2ForConditionalGeneration

In [2]:
# Load the per-video annotation dictionary.
# NOTE: pickle can execute arbitrary code while loading; only open annotation
# files that come from a trusted source.
# `with` guarantees the handle is closed even if unpickling raises.
with open("./annotations_public.pkl", 'rb') as annotation_file:
    annotations = pickle.load(annotation_file)

In [None]:
df = pd.read_csv("road_objects.csv")

# Map every video name to the list of track ids recorded for it.
# Iterating the grouped column yields (video, Series) pairs in sorted key order.
result = {
    video_name: list(track_series)
    for video_name, track_series in df.groupby('video')['track_id']
}

# Show the mapping, one video per line.
for video, track_ids in result.items():
    print(f"{video}: {track_ids}")

In [5]:
def detect_hazard(object_cor, frame):
    """Ask the BLIP-2 model whether the object inside a bounding box is a hazard.

    The box is cropped from ``frame`` (with a 20 px margin when the padded box
    still fits inside the frame), resized to 512x512 and sent to the module-level
    ``processor_hazard`` / ``model_hazard`` pair on ``device``.  The last word of
    the first answer is used as the object label; depending on that label a
    follow-up yes/no question decides whether the object is reported as a hazard.

    Args:
        object_cor: ``(x1, y1, x2, y2)`` integer pixel coordinates. Negative
            values are clamped to 0.
        frame: image array indexed as ``frame[y, x]``; it is converted with
            ``cv2.COLOR_BGR2RGB``, so BGR channel order is presumably expected.

    Returns:
        ``(is_it_hazard, caption)``. ``caption`` is ``""`` when nothing was
        confirmed. An empty crop or an empty model answer yields ``(False, "")``.
    """
    x1, y1, x2, y2 = (max(coord, 0) for coord in object_cor)
    frame_height, frame_width = frame.shape[:2]

    # Use a 20 px context margin only when the padded box stays inside the frame.
    pad = 20
    padded_fits = (y1 - pad >= 0 and y2 + pad <= frame_height and
                   x1 - pad >= 0 and x2 + pad <= frame_width)
    if padded_fits:
        cropped_object = frame[y1 - pad:y2 + pad, x1 - pad:x2 + pad]
    else:
        cropped_object = frame[y1:y2, x1:x2]

    # A degenerate box produces an empty crop, which cv2.cvtColor cannot handle.
    if cropped_object.size == 0:
        return False, ""

    cropped_image = Image.fromarray(cv2.cvtColor(cropped_object, cv2.COLOR_BGR2RGB))  # Convert to PIL Image
    cropped_image = cropped_image.resize((512, 512))

    def ask(prompt, max_new_tokens):
        """Run one prompt against the crop and return the stripped decoded text."""
        inputs = processor_hazard(cropped_image, text=prompt, return_tensors="pt").to(device, torch.float16)
        generated_ids = model_hazard.generate(**inputs, max_new_tokens=max_new_tokens)
        return processor_hazard.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

    def confirmed(question):
        """Return True when the model's answer to ``question`` contains 'yes'."""
        return "yes" in ask(question, 100).lower()

    prompt0 = "Question: Is this an animal or a car or a human or a flying-object or an floating-object on the road or an alien? Answer:"
    answer_words = ask(prompt0, 10).split()
    if not answer_words:
        # Nothing decoded: there is no label to classify.
        return False, ""
    generated_text_general = answer_words[-1]
    label = generated_text_general.lower()

    human_words = ["human", "person", "man", "woman", "men", "women", "kid"]
    # All keywords are lower-case because they are matched against the lower-cased label.
    animal_words = ["animal", "dog", "cat", "snake", "bird", "kangaroo", "moose", "deer",
                    "rabbit", "lizard", "cow", "horse", "goose", "duck", "mouse"]

    contains_human = any(word in label for word in human_words)
    contains_animal = any(word in label for word in animal_words)
    contains_flyingobject = "flying-object" in label
    contains_object = any(word in label for word in ["road", "alien"])
    # Cars are never reported as hazards by this function, so no follow-up
    # question is asked for them (its answer could not change the result).

    is_it_hazard = False
    caption = ""

    def describe():
        """Return the (colour, characteristic) completions for the current label."""
        color = ask(f" The color of the {generated_text_general} ", 20)
        traits = ask(f" The characteristic of the {generated_text_general} ", 20)
        return color, traits

    # The branches are independent; when several match, the last confirmed one
    # determines the caption. Appearance prompts run only after confirmation
    # because their output is used only in that case.
    if contains_human and confirmed("Question: Is this person crossing the street? Answer:"):
        appearance_caption = ask(" This person is wearing a", 20)
        caption = str(generated_text_general) + " The person is going to cross the road " + appearance_caption
        is_it_hazard = True

    if contains_animal and confirmed("Question: Is this animal crossing the street? Answer:"):
        appearance_color, appearance_caption = describe()
        caption = "It is a " + str(generated_text_general) + f". The {generated_text_general} is going to cross the road {appearance_color}. {appearance_caption}."
        is_it_hazard = True

    if contains_flyingobject and confirmed("Question: Is this object thrown into the air? Answer:"):
        appearance_color, appearance_caption = describe()
        caption = "It is a " + str(generated_text_general) + f". The {generated_text_general} is thrown to air {appearance_color}. {appearance_caption}."
        is_it_hazard = True

    if contains_object and confirmed("Question: Is this object on the road? Answer:"):
        appearance_color, appearance_caption = describe()
        caption = "It is an object on the " + str(generated_text_general) + f". The object is on the road {appearance_color}. {appearance_caption}."
        is_it_hazard = True

    return is_it_hazard, caption


In [6]:
def process_track_ids(
    caption_midas_ids, 
    forbidden_word_count, 
    hazard_results, 
    captioned_tracks, 
    frame_image, 
    ci, 
    x1, y1, x2, y2,
):
    """Caption the current track and record it as a hazard for the current frame.

    Reads the notebook-level globals ``track_id`` (object being processed) and
    ``hazard_tracks`` / ``hazard_captions`` (parallel per-frame result lists),
    appends to the two lists in place, and draws the box and label on
    ``frame_image`` for every track that is recorded.

    Caption source, in priority order:
      1. ``caption_midas_ids[track_id]['longest_caption']`` when available.
      2. A caption cached in ``captioned_tracks`` from an earlier frame.
      3. A fresh caption from ``detect_hazard``.
      4. A CLIP-Interrogator caption (first 10 words). It is rejected when it
         mentions "car" or "vehicle"; the rejection is counted in
         ``forbidden_word_count`` and the track is neither recorded nor drawn.

    Args:
        caption_midas_ids: mapping track id -> dict with a 'longest_caption' entry.
        forbidden_word_count: mapping track id -> number of rejected captions (mutated).
        hazard_results: mapping track id -> list of captions from ``detect_hazard`` (mutated).
        captioned_tracks: per-video caption cache (mutated).
        frame_image: frame to crop from and draw on (mutated).
        ci: object exposing ``interrogate(PIL.Image) -> str``.
        x1, y1, x2, y2: bounding-box coordinates of the track in ``frame_image``.

    Returns:
        The (mutated) global ``hazard_tracks`` and ``hazard_captions`` lists.
    """
    left, top, right, bottom = int(x1), int(y1), int(x2), int(y2)

    def record(text):
        """Append the track with ``text`` to the result lists and draw it."""
        hazard_tracks.append(track_id)
        hazard_captions.append(text)
        cv2.rectangle(frame_image, (left, top), (right, bottom), (0, 255, 0), 2)
        cv2.putText(frame_image, f"ID: {track_id} | {text}", (left, top - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)

    # 1) Caption prepared by the earlier pipeline stage.
    if track_id in caption_midas_ids:
        record(caption_midas_ids[track_id]['longest_caption'])
        return hazard_tracks, hazard_captions

    # 2) Caption already generated for this track in a previous frame.
    if track_id in captioned_tracks:
        record(captioned_tracks[track_id])
        return hazard_tracks, hazard_captions

    print("track_id3:", track_id)

    # 3) Try the BLIP-2 based detector first.
    is_hazard, caption = detect_hazard((left, top, right, bottom), frame_image)
    if len(caption) > 1:
        hazard_results.setdefault(track_id, []).append(caption)
        captioned_tracks[track_id] = caption
        record(caption)
        return hazard_tracks, hazard_captions

    # 4) Fall back to CLIP Interrogator on the raw bounding-box crop.
    forbidden_word_count.setdefault(track_id, 0)

    # Clamp to 0 so negative coordinates do not wrap around in the slice.
    chip = frame_image[max(top, 0):max(bottom, 0), max(left, 0):max(right, 0)]
    if chip.size == 0:
        # Degenerate box: nothing to caption, leave the result lists untouched.
        return hazard_tracks, hazard_captions

    hazard_chip = Image.fromarray(cv2.cvtColor(chip, cv2.COLOR_BGR2RGB)).resize((512, 512))
    full_caption = ci.interrogate(hazard_chip).replace(",", " ")
    caption = " ".join(full_caption.split()[:10])

    if "car" in full_caption.lower() or "vehicle" in full_caption.lower():
        # Rejected caption: count it and do not record or draw the track.
        # (The track is only appended after validation so the two parallel
        # result lists can never get out of step.)
        forbidden_word_count[track_id] += 1
    else:
        captioned_tracks[track_id] = caption
        record(caption)

    return hazard_tracks, hazard_captions


In [None]:
# Run on the GPU when available. torch device type strings are lower-case, so
# the CPU fallback must be spelled "cpu" ("CPU" is rejected by torch.device).
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# File paths
results_file_path = "results_md_blip_final_v6.csv"       # existing results that are read
results_file_path_out = "results_md_blip_final_v6.csv"   # where updated results are written (same file)
video_root = './COOOL_Benchmark/processed_videos/'
output_dir = "./COOOL_Benchmark/processed_videos_midas/"
os.makedirs(output_dir, exist_ok=True)

# Annotation category whose boxes are processed for every frame.
ann_type = 'challenge_object'
# CLIP Interrogator, used as the fallback captioner.
ci = Interrogator(Config(clip_model_name="ViT-B-32/openai"))

# Running count of the videos visited by the main loop.
video_num = 0

# Load existing CSV or create a new DataFrame
if os.path.exists(results_file_path):
    results_df = pd.read_csv(results_file_path)
else:
    # One row per frame: id, driver-state flag, then 23 track/name column pairs.
    columns = ["ID", "Driver_State_Changed"] + [f"Hazard_Track_{i}" for i in range(23)] + [f"Hazard_Name_{i}" for i in range(23)]
    results_df = pd.DataFrame(columns=columns)

def _load_pickle_or_empty(path):
    """Return the unpickled content of ``path``, or an empty dict when the file is absent."""
    if not os.path.exists(path):
        return {}
    # NOTE: pickle can execute arbitrary code; only load files produced by this pipeline.
    with open(path, "rb") as f:
        return pickle.load(f)


def _draw_hazard_label(image, box, track_id, text):
    """Draw the bounding box and an ``ID: <id> | <text>`` label on ``image`` in place."""
    left, top, right, bottom = (int(v) for v in box)
    cv2.rectangle(image, (left, top), (right, bottom), (0, 255, 0), 2)
    cv2.putText(image, f"ID: {track_id} | {text}", (left, top - 10),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (255, 0, 0), 2)


# Load BLIP-2 once; it is identical for every video, so reloading it inside the
# loop only costs time and GPU memory. detect_hazard reads these two globals.
processor_hazard = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
model_hazard = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16
).to(device)

# Iterate through videos
for video in sorted(annotations.keys()):
    print("video:", video)
    video_num += 1
    captioned_tracks = {}       # track id -> caption generated earlier in this video
    forbidden_word_count = {}   # track id -> number of rejected ("car"/"vehicle") captions
    if video_num <= 0:
        # Resume knob: raise the threshold to skip the first N videos.
        continue

    # Track ids of objects found on the road for this video, as strings.
    road_mask_result = result.get(video, {})
    road_mask_str = [str(val) for val in road_mask_result]

    # Captions produced by the previous pipeline stage (track id -> {'longest_caption': ...}).
    caption_midas_ids = _load_pickle_or_empty(f"hazard_results/hazard_results_{video}_v3.pkl")
    # Track ids selected by the MiDaS stage.
    midas_ids = _load_pickle_or_empty(f"unique_ids/unique_ids_{video}.pkl")

    # Ids present in both the road mask and the MiDaS selection, and their captions.
    mutual_values = [key for key in road_mask_str if key in midas_ids]
    captions = {
        value: caption_midas_ids[value]['longest_caption']
        for value in mutual_values
        if value in caption_midas_ids
    }

    video_stream = cv2.VideoCapture(os.path.join(video_root, video + '.mp4'))
    fps = int(video_stream.get(cv2.CAP_PROP_FPS))
    width = int(video_stream.get(cv2.CAP_PROP_FRAME_WIDTH))
    height = int(video_stream.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Codec for output video
    output_video_path = os.path.join(output_dir, f"{video}_midas_hazard_v7_road.mp4")
    out = cv2.VideoWriter(output_video_path, fourcc, fps, (width, height))

    frame = 0
    hazard_results = {}   # track id -> captions returned by detect_hazard

    try:
        while video_stream.isOpened():
            ret, frame_image = video_stream.read()
            if not ret:  # end of video or read error
                # End of the video must be the final annotated frame.
                assert frame == len(annotations[video].keys())
                break

            if frame == 0:
                # The first frame is neither processed nor written to the output video.
                frame += 1
                continue

            # Per-frame parallel result lists (also read by process_track_ids).
            hazard_tracks = []
            hazard_captions = []

            video_frame_id = f"{video}_{frame}"
            # Keep the driver-state flag already stored for this frame. When the
            # row does not exist yet (e.g. fresh results file) fall back to False
            # instead of crashing on an empty lookup -- TODO confirm the default.
            frame_mask = results_df['ID'] == video_frame_id
            stored_flags = results_df.loc[frame_mask, 'Driver_State_Changed'].values
            driver_state_flag = stored_flags[0] if len(stored_flags) else False

            row_data = {"ID": video_frame_id, "Driver_State_Changed": driver_state_flag}
            for slot in range(23):
                row_data[f"Hazard_Track_{slot}"] = ""
                row_data[f"Hazard_Name_{slot}"] = ""

            for ann in annotations[video][frame][ann_type]:
                x1, y1, x2, y2 = ann['bbox']
                track_id = ann['track_id']
                box = (x1, y1, x2, y2)

                # Case 1) few mutual road-mask/MiDaS objects: use their stored captions.
                # NOTE(review): when the road mask also has more than 12 ids, the
                # next case fires for the same object and records it a second time.
                if 0 < len(captions) <= 6 and track_id in captions:
                    hazard_tracks.append(track_id)
                    hazard_captions.append(captions[track_id])
                    _draw_hazard_label(frame_image, box, track_id, captions[track_id])

                # Case 6) many road-mask objects but few stored captions: accept
                # every id from the road mask or MiDaS.
                if len(road_mask_str) > 12 and len(captions) < 7:
                    combined_ids_road = set(road_mask_str) | set(midas_ids.keys())
                    print("method 6")
                    if forbidden_word_count.get(track_id, 0) >= 10:
                        # Rejected too often; skip this object entirely.
                        continue
                    if track_id in combined_ids_road:
                        if caption_midas_ids:
                            # Reuse the first stored caption for every such object.
                            caption_0 = next(iter(caption_midas_ids.values()))['longest_caption']
                            hazard_tracks.append(track_id)
                            hazard_captions.append(caption_0)
                        else:
                            hazard_tracks, hazard_captions = process_track_ids(
                                caption_midas_ids,
                                forbidden_word_count,
                                hazard_results,
                                captioned_tracks,
                                frame_image,
                                ci,
                                x1, y1, x2, y2
                            )

                # Case 5) many stored captions: record every object with the first
                # stored caption. `captions` is a subset of caption_midas_ids, so the
                # latter is guaranteed to be non-empty here.
                if len(captions) >= 7:
                    print("method 5")
                    caption_0 = next(iter(caption_midas_ids.values()))['longest_caption']
                    hazard_tracks.append(track_id)
                    hazard_captions.append(caption_0)

                # No caption for any mutual object.
                if len(captions) == 0:
                    if len(caption_midas_ids) > 0:
                        # Case 3) stored captions exist for other ids: use them directly.
                        if track_id in caption_midas_ids:
                            caption_to_display = caption_midas_ids[track_id]['longest_caption']
                            hazard_tracks.append(track_id)
                            hazard_captions.append(caption_to_display)
                            _draw_hazard_label(frame_image, box, track_id, caption_to_display)
                    else:
                        # No stored captions at all: caption mutual ids if there are
                        # any, otherwise every MiDaS id.
                        candidate_ids = mutual_values if mutual_values else midas_ids
                        if track_id in candidate_ids:
                            if forbidden_word_count.get(track_id, 0) >= 10:
                                continue
                            object_cor = int(x1), int(y1), int(x2), int(y2)
                            is_hazard, caption = detect_hazard(object_cor, frame_image)

                            if len(caption) > 1:
                                # Case 2) BLIP-2 produced a caption.
                                hazard_results.setdefault(track_id, []).append(caption)
                                hazard_tracks.append(track_id)
                                hazard_captions.append(caption)
                                _draw_hazard_label(frame_image, box, track_id, caption)
                            elif len(caption) == 0:
                                # BLIP-2 produced nothing: fall back to CLIP
                                # Interrogator on the bounding-box crop, cached per track.
                                hazard_tracks.append(track_id)
                                if track_id not in captioned_tracks:
                                    chip = frame_image[int(y1):int(y2), int(x1):int(x2)]
                                    hazard_chip = cv2.cvtColor(chip, cv2.COLOR_BGR2RGB)
                                    hazard_chip = Image.fromarray(hazard_chip)
                                    hazard_chip = hazard_chip.resize((512, 512))
                                    caption = ci.interrogate(hazard_chip)
                                    caption1 = caption.replace(",", " ")
                                    # Keep only the first 10 words.
                                    caption = " ".join(caption1.split()[:10])
                                    captioned_tracks[track_id] = caption
                                else:
                                    caption = captioned_tracks[track_id]
                                hazard_captions.append(caption)
                                _draw_hazard_label(frame_image, box, track_id, caption)

            # Fill the row: at most 23 hazards fit into the CSV schema.
            for slot in range(min(len(hazard_tracks), 23)):
                row_data[f"Hazard_Track_{slot}"] = hazard_tracks[slot]
                row_data[f"Hazard_Name_{slot}"] = hazard_captions[slot]

            # Update the existing row, or append a new one when the frame is unknown.
            if frame_mask.any():
                for key, value in row_data.items():
                    results_df.loc[frame_mask, key] = value
            else:
                results_df = pd.concat([results_df, pd.DataFrame([row_data])], ignore_index=True)

            frame += 1
            out.write(frame_image)
    finally:
        # Always release the reader and writer, even when the loop raises.
        video_stream.release()
        out.release()

    # Save updated DataFrame to CSV
    results_df.to_csv(results_file_path_out, index=False)
    print("Results file updated successfully!")


# Save updated DataFrame to CSV
results_df.to_csv(results_file_path_out, index=False)
print("Results file updated successfully!")