In [1]:
import argparse
import os
import pickle
import random
import re
from collections import Counter
from collections import defaultdict

import cv2
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from clip_interrogator import Config, Interrogator
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText
from transformers import AutoProcessor, Blip2ForConditionalGeneration

In [2]:
# Lower-case keywords matched as substrings of the (lower-cased) one-word category answer.
_HUMAN_WORDS = ("human", "person", "man", "woman", "men", "women", "kid")
_ANIMAL_WORDS = ("animal", "dog", "cat", "snake", "bird", "kangaroo", "moose", "deer",
                 "rabbit", "lizard", "cow", "horse", "goose", "duck", "mouse")
_ROAD_OBJECT_WORDS = ("road", "alien")
# Extra pixels added on every side of the box when the padded box still fits in the frame.
_CROP_MARGIN = 20


def _ask_hazard_model(image, prompt, max_new_tokens):
    """Run one text prompt on ``image`` and return the decoded, stripped answer.

    Uses the notebook-level globals ``processor_hazard``, ``model_hazard`` and
    ``device``; they are expected to be defined in another cell before this runs.
    """
    inputs = processor_hazard(image, text=prompt, return_tensors="pt").to(device, torch.float16)
    generated_ids = model_hazard.generate(**inputs, max_new_tokens=max_new_tokens)
    return processor_hazard.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()


def _answered_yes(image, question):
    """Return True when the model's answer to ``question`` contains "yes"."""
    return "yes" in _ask_hazard_model(image, question, 100).lower()


def _describe_object(image, label):
    """Return the model's (color, characteristic) completions for ``label``."""
    color = _ask_hazard_model(image, f" The color of the {label} ", 20)
    traits = _ask_hazard_model(image, f" The characteristic of the {label} ", 20)
    return color, traits


def detect_hazard(object_cor, frame):
    """Classify one detected object and decide whether it is a hazard.

    The object is cropped from ``frame``, resized to 512x512 and shown to the
    hazard model. The last word of the model's category answer selects one or
    more follow-up yes/no questions; a "yes" marks the object as a hazard and
    builds a caption from further model completions.

    Args:
        object_cor: Four values ``(x1, y1, x2, y2)`` giving the object's box in
            pixel coordinates. Negative values are clamped to 0 and values are
            truncated to ``int`` so the box can be used for slicing.
        frame: Image array with three dimensions (height, width, channels). It is
            converted with ``cv2.COLOR_BGR2RGB``, so it is presumably in OpenCV
            BGR order.

    Returns:
        ``(is_it_hazard, caption)``: a bool and a string. ``(False, "")`` is
        returned when nothing is flagged, when the crop is empty, or when the
        model gives an empty category answer.
    """
    is_it_hazard = False
    caption = ""

    x1, y1, x2, y2 = (max(0, int(value)) for value in object_cor)
    frame_height, frame_width, _ = frame.shape

    # Use the padded box only when it lies completely inside the frame.
    margin = _CROP_MARGIN
    padded_fits = (y1 - margin >= 0 and y2 + margin <= frame_height and
                   x1 - margin >= 0 and x2 + margin <= frame_width)
    if padded_fits:
        cropped_object = frame[y1 - margin:y2 + margin, x1 - margin:x2 + margin]
    else:
        cropped_object = frame[y1:y2, x1:x2]

    # A degenerate box yields an empty crop, which cv2.cvtColor cannot convert.
    if cropped_object.size == 0:
        return is_it_hazard, caption

    cropped_image = Image.fromarray(cv2.cvtColor(cropped_object, cv2.COLOR_BGR2RGB))  # Convert to PIL Image
    cropped_image = cropped_image.resize((512, 512))

    prompt0 = "Question: Is this an animal or a car or a human or a flying-object in the air or a floating-object on the road or an alien? Answer:"
    answer_words = _ask_hazard_model(cropped_image, prompt0, 10).split()
    if not answer_words:
        return is_it_hazard, caption
    # Only the last word of the answer is used as the object category.
    generated_text_general = answer_words[-1]
    category = generated_text_general.lower()

    # NOTE(review): a category containing "car" is never reported as a hazard. The
    # previous code asked the model about the opposing/wrong lane but only ever
    # re-assigned False, so that query had no effect on the result and is not issued
    # here. Confirm whether wrong-way vehicles were meant to be flagged.

    # The checks below are independent; when several match, the last "yes" wins.
    if any(word in category for word in _HUMAN_WORDS):
        if _answered_yes(cropped_image, "Question: Is this person crossing the street? Answer:"):
            # Appearance is only requested once the object is known to be a hazard.
            appearance_caption = _ask_hazard_model(cropped_image, " This person is wearing a", 20)
            caption = generated_text_general + " The person is going to cross the road " + appearance_caption
            is_it_hazard = True

    if any(word in category for word in _ANIMAL_WORDS):
        if _answered_yes(cropped_image, "Question: Is this animal crossing the street? Answer:"):
            appearance_color, appearance_caption = _describe_object(cropped_image, generated_text_general)
            caption = "It is a " + generated_text_general + f". The {generated_text_general} is going to cross the road {appearance_color}. {appearance_caption}."
            is_it_hazard = True

    # "air" is the last word of the "flying-object in the air" option of prompt0.
    if "air" in category:
        if _answered_yes(cropped_image, "Question: Is this object thrown into the air? Answer:"):
            appearance_color, appearance_caption = _describe_object(cropped_image, generated_text_general)
            caption = "It is a flying-object" + f". The flying-object is thrown to air {appearance_color}. {appearance_caption}."
            is_it_hazard = True

    if any(word in category for word in _ROAD_OBJECT_WORDS):
        if _answered_yes(cropped_image, "Question: Is this object on the road? Answer:"):
            appearance_color, appearance_caption = _describe_object(cropped_image, generated_text_general)
            caption = "It is an object on the " + generated_text_general + f". The object is on the road {appearance_color}. {appearance_caption}."
            is_it_hazard = True

    return is_it_hazard, caption

In [54]:
def create_caption(captions_list):
    
    # Step 1: Check for "is going to" in the captions
    going_to_captions = [item['caption'] for item in captions_list if "is going to" in item['caption']]
    if going_to_captions:
        # print("Found 'is going to' in captions:", going_to_captions)  # Debugging print statement
        # Step 2: Find the most repetitive first 4 words if there are multiple "is going to" captions
        first_four_words = [caption.split()[:4] for caption in going_to_captions]
        most_common = Counter([" ".join(words) for words in first_four_words]).most_common(1)[0][0]
        for caption in going_to_captions:
            if " ".join(caption.split()[:4]) == most_common:
                return caption

    # Step 3: Check for phrases like "man walking", "woman walking", "people walking"
    walking_priority = ["man walking", "woman walking", "people walking"]
    for priority in walking_priority:
        for item in captions_list:
            if priority in item['caption']:
                return item['caption']

    # Step 4: Check for other priority words
    other_priority_words = ["piece of paper", "pieces of papers", "pieces of paper", "box", "foggy city", "bicycle", "ceiling fan", "skateboard"]
    for priority_word in other_priority_words:
        for item in captions_list:
            if priority_word in item['caption']:
                return item['caption']

    # Step 5: Check for the word "object"
    for item in captions_list:
        if "object" in item['caption']:
            return item['caption']

    # Step 6: Check for "small bird" repeated more than 6 times
    small_bird_captions = [item['caption'] for item in captions_list if "small bird" in item['caption']]
    if len(small_bird_captions) > 6:
        return small_bird_captions[0]

    # Step 7: Random caption as fallback
    return random.choice(captions_list)['caption']


In [3]:
# Load the annotations; later cells iterate over its keys as video names.
# The context manager closes the file even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code - only load files from a trusted source.
with open("./annotations_public.pkl", 'rb') as annotation_file:
    annotations = pickle.load(annotation_file)

In [4]:
# Video numbers whose hazard tracks are replaced by the ones in `hazard_dict_dr`.
videos_dr = [
    10, 15, 23, 31, 43, 53, 57, 62, 66, 74,
    75, 78, 79, 83, 85, 90, 95, 104, 116, 139,
    141, 143, 153, 155, 161, 166, 175, 178, 181, 191,
]

# Videos whose hazard captions are re-selected from the pickled caption files.
wrong_cap_video = [
    4, 5, 9, 10, 11, 12, 14, 21, 22, 23,
    25, 26, 31, 34, 36, 43, 46, 54, 61, 62,
    66, 67, 71, 73, 74, 76, 79, 81, 82, 83,
    87, 89, 93, 94, 98, 99, 100, 101, 102, 104,
    105, 106, 108, 109, 111, 112, 116, 121, 122, 123,
    126, 127, 128, 129, 130, 132, 134, 135, 136, 137,
    138, 139, 141, 143, 144, 145, 146, 147, 150, 153,
    160, 161, 162, 164, 168, 169, 172, 176, 177, 178,
    181, 184, 186, 189, 191, 194, 200,
]

# Not referenced by any other cell visible in this notebook.
more_obj = [15, 25, 26, 33, 35, 36, 59, 69, 70, 137]

In [None]:
# Hazard track ids per frame from this pipeline's results CSV,
# keyed by the value of the "ID" column.
input_file_mine = "results_md_blip_final_v6_no_nan_dec18.csv"
df_mine = pd.read_csv(input_file_mine)
hazard_dict_mine = {}

# Hazard_Track_0 .. Hazard_Track_22, restricted to the columns the file really has
track_cols_mine = [
    f"Hazard_Track_{i}" for i in range(23) if f"Hazard_Track_{i}" in df_mine.columns
]

for _, row in df_mine.iterrows():
    # A single space marks an unused slot; missing (NaN) cells are skipped as well.
    hazard_dict_mine[row["ID"]] = [
        row[col]
        for col in track_cols_mine
        if not (pd.isna(row[col]) or row[col] == ' ')
    ]

In [6]:
# Hazard track ids per frame from the second ("dr") results CSV,
# keyed by the value of the "ID" column.
# NOTE(review): absolute, machine-specific path - the notebook only runs where this file exists.
input_file_dr = "/home/booster/Downloads/submission_results_Post.csv"
df_dr = pd.read_csv(input_file_dr)
hazard_dict_dr = {}

# Hazard_Track_0 .. Hazard_Track_22, restricted to the columns the file really has.
# The loop used to stop at Hazard_Track_21, silently dropping the last slot.
track_cols_dr = [
    f"Hazard_Track_{i}" for i in range(23) if f"Hazard_Track_{i}" in df_dr.columns
]

for _, row in df_dr.iterrows():
    # A single space marks an unused slot; missing (NaN) cells are skipped as well.
    hazard_dict_dr[row["ID"]] = [
        row[col]
        for col in track_cols_dr
        if not (pd.isna(row[col]) or row[col] == ' ')
    ]

In [11]:
# Report which annotated videos are listed in `videos_dr`.
matched_video_ids = []
for video_name in sorted(annotations.keys()):
    video_id = int(video_name.split('_')[-1])
    if video_id in videos_dr:
        print(video_id)
        matched_video_ids.append(video_id)

# Replace this pipeline's tracks with the "dr" tracks for every frame of a `videos_dr`
# video that exists in both dictionaries. The replacement does not depend on which
# annotated video triggered it, so one pass is enough; it runs only when at least one
# annotated video is listed in `videos_dr`.
if matched_video_ids:
    dr_video_ids = set(videos_dr)
    for frame_key, dr_tracks in hazard_dict_dr.items():
        # The key's second "_"-separated field is parsed as the video number.
        if int(frame_key.split('_')[1]) not in dr_video_ids:
            continue
        if frame_key not in hazard_dict_mine:
            continue
        # -1 entries are dropped; the remaining ids are stored as float-formatted strings.
        hazard_dict_mine[frame_key] = [
            str(float(value)) for value in dr_tracks if not (value == -1)
        ]

10
15
23
31
43
53
57
62
66
74
75
76
78
83
85
90
95
104
116
139
141
143
153
155
161
166
175
178
181
191


In [None]:
video_root = './COOOL_Benchmark/processed_videos/'
output_dir = "./COOOL_Benchmark/processed_videos_midas/"

results_file_path = "results_md_blip_final_v6_.csv"
results_file_path_out = "results_md_blip_final_v6_new.csv"

directory_path = './captions_output/'

MAX_HAZARDS = 23        # number of Hazard_Track_i / Hazard_Name_i column pairs
SKIP_FIRST_VIDEOS = 5   # the first videos (in sorted order) are not processed


def _load_track_caption(video, track_id, caption_files, captions_dir):
    """Return the caption chosen for one track, or None when none can be loaded.

    Caption files are matched by track id as ``cap_<video>_<track>.pkl`` or
    ``cap_<video>_<track>_<suffix>.pkl``. Requiring the separator keeps track 4
    from also matching the files of track 41. When several files match, the last
    one (in ``caption_files`` order) that loads successfully wins.
    """
    stem = f"cap_{video}_{int(float(track_id))}"
    selected = None
    for file_name in caption_files:
        if not (file_name == stem + ".pkl" or file_name.startswith(stem + "_")):
            continue
        try:
            # NOTE: pickle.load can execute arbitrary code - only read trusted files.
            with open(os.path.join(captions_dir, file_name), 'rb') as file:
                selected = create_caption(pickle.load(file))
        except Exception as e:
            print(f"Error reading {file_name}: {e}")
    return selected


# Load existing CSV or create a new DataFrame
if os.path.exists(results_file_path):
    results_df = pd.read_csv(results_file_path)
else:
    columns = ["ID", "Driver_State_Changed"] + [f"Hazard_Track_{i}" for i in range(MAX_HAZARDS)] + [f"Hazard_Name_{i}" for i in range(MAX_HAZARDS)]
    results_df = pd.DataFrame(columns=columns)

# List the caption pickles once (sorted so the choice among several matches is stable).
caption_files = sorted(name for name in os.listdir(directory_path) if name.endswith('.pkl'))

video_num = 0
for video in sorted(list(annotations.keys())):
    print(video)
    video_num += 1
    if video_num <= SKIP_FIRST_VIDEOS:
        continue

    # NOTE(review): this tests the 1-based position of the video in the sorted
    # annotation keys, not the number parsed from the video name - confirm that
    # the two coincide for this dataset.
    fix_captions = video_num in wrong_cap_video

    caption_obj = {}   # track id -> selected caption (None when no caption was found)
    new_rows = []      # rows for frames that are not in results_df yet
    frame = 0

    # The video is decoded only to walk its frame indices and check the frame count.
    # No output video is opened here: nothing was ever written to it, and opening a
    # writer would replace any existing file at that path with an empty one.
    video_stream = cv2.VideoCapture(os.path.join(video_root, video + '.mp4'))
    try:
        while video_stream.isOpened():
            ret, _ = video_stream.read()
            if not ret:  # False means end of video or error
                assert frame == len(annotations[video].keys())  # End of the video must be final frame
                break

            if frame == 0:
                # No row is produced for frame 0.
                frame += 1
                continue

            video_frame_id = f"{video}_{frame}"
            row_mask = results_df['ID'] == video_frame_id
            row_exists = bool(row_mask.any())

            updates = {}
            if fix_captions:
                tracks = hazard_dict_mine.get(video_frame_id, [])
                for slot, track_id in enumerate(tracks[:MAX_HAZARDS]):
                    if track_id not in caption_obj:
                        caption_obj[track_id] = _load_track_caption(
                            video, track_id, caption_files, directory_path
                        )
                        print(f"{video} track {track_id}: {caption_obj[track_id]!r}")
                    updates[f"Hazard_Track_{slot}"] = track_id
                    # Leave the name untouched when no caption could be loaded rather
                    # than writing a caption that belongs to another track.
                    if caption_obj[track_id] is not None:
                        updates[f"Hazard_Name_{slot}"] = caption_obj[track_id]

            if row_exists:
                for key, value in updates.items():
                    results_df.loc[row_mask, key] = value
            else:
                new_rows.append({"ID": video_frame_id, **updates})

            frame += 1
    finally:
        video_stream.release()

    # Append all new frames of this video at once, then checkpoint to disk.
    if new_rows:
        results_df = pd.concat([results_df, pd.DataFrame(new_rows)], ignore_index=True)
    results_df.to_csv(results_file_path_out, index=False)

In [None]:
# Example: print the contents of every caption pickle in the folder
import os
import pickle

# Folder that holds the .pkl files
directory_path = './captions_output/'

for file_name in os.listdir(directory_path):
    # Anything that is not a pickle is ignored
    if not file_name.endswith('.pkl'):
        continue
    file_path = os.path.join(directory_path, file_name)
    try:
        with open(file_path, 'rb') as file:
            data = pickle.load(file)

            print(f"Contents of {file_name} (line by line):")
            # Dicts are shown as "key: value", lists item by item, anything else as-is
            if isinstance(data, dict):
                entries = (f"{key}: {value}" for key, value in data.items())
            elif isinstance(data, list):
                entries = iter(data)
            else:
                entries = iter((data,))
            for entry in entries:
                print(entry)
            print("\n")  # blank lines between files
    except Exception as e:
        print(f"Error reading {file_name}: {e}")


"""
Contents of cap_video_0140_4_129.pkl (line by line):
{'frame': 119, 'caption': 'a close up of a truck with a large piece'}
{'frame': 120, 'caption': 'a close up of a truck with a box on'}
{'frame': 121, 'caption': 'a close up of a truck with a box on'}
{'frame': 122, 'caption': 'a close up of a truck with a green box'}
{'frame': 123, 'caption': 'arafed truck on the road with a green box on'}
{'frame': 124, 'caption': 'arafed vehicle on the road with a green square on'}
{'frame': 125, 'caption': 'a picture taken from a vehicle of a car driving'}
{'frame': 126, 'caption': 'a close up of a car driving down a highway'}
{'frame': 127, 'caption': 'a close up of a car driving down a highway'}
{'frame': 128, 'caption': 'araf truck driving down a highway with a green screen'}
{'frame': 129, 'caption': 'there is a man standing in a bathroom with a'}
{'frame': 130, 'caption': 'a close up of a car driving down a highway'}
{'frame': 131, 'caption': 'arafed view of a car driving down a highway with'}
{'frame': 132, 'caption': 'a close up of a car driving down a highway'}
{'frame': 133, 'caption': 'a close up of a car driving down a road'}
{'frame': 134, 'caption': 'a close up of a car with a broken hood'}
{'frame': 135, 'caption': 'a close up of a car driving down a highway'}
{'frame': 136, 'caption': 'a picture taken from a car shows a truck that'}
{'frame': 137, 'caption': 'a close up of a car driving down a highway'}
{'frame': 138, 'caption': 'a close up of a truck driving down a highway'}
"""