In [None]:
import pickle
import csv
import cv2
import numpy as np
from PIL import Image
from clip_interrogator import Config, Interrogator

def process_single_video(annotation_file, video_path, output_file):
    # Load the annotations from the pickle file
    with open(annotation_file, 'rb') as f:
        annotations = pickle.load(f)
    
    # Open the output CSV file
    with open(output_file, 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        
        # Write the header
        csv_writer.writerow([
            "ID", "Driver_State_Changed", 
            "Hazard_Track_0", "Hazard_Name_0", 
            "Hazard_Track_22", "Hazard_Name_22"
        ])
        
        # Setup captioning model
        ci = Interrogator(Config(clip_model_name="ViT-L-14/openai"))
        
        # Use the first video (video_0001.mp4) for processing
        video_id = 'video_0001'
        video_stream = cv2.VideoCapture(video_path)
        assert video_stream.isOpened(), f"Failed to open video stream for {video_id}"
        
        frame = 0
        previous_centroids = []
        captioned_tracks = {}
        driver_state_flag = False
        
        while video_stream.isOpened():
            ret, frame_image = video_stream.read()
            if not ret:
                break
            
            # Convert to grayscale for optical flow (if using optical flow)
            gray_frame = cv2.cvtColor(frame_image, cv2.COLOR_BGR2GRAY)
            
            # Process annotations for this frame
            challenge_objects = annotations[video_id].get(frame, {}).get('challenge_object', [])
            
            # Initialize hazard track and name for the frame
            hazard_track_0, hazard_name_0 = "-1", " "
            hazard_track_22, hazard_name_22 = "-1", " "
            
            if challenge_objects:
                # Optical flow or Kalman filter tracking here for faster tracking of objects
                
                # Example: Optical flow tracking (if implemented)
                # Process or track using the optical flow/Kalman filter method above
                
                # Driver state change detection (simplified version)
                if previous_centroids:
                    # Ensure centroids are calculated only when there are challenge objects
                    centroids = np.array([obj['bbox'][0] + obj['bbox'][2] / 2 for obj in challenge_objects])
                    
                    # Track using optical flow (or Kalman filter) here for faster tracking
                    # dists = np.linalg.norm(previous_centroids - centroids, axis=1)
                    # Median distance calculation
                
                # Hazard description (simplified version)
                if hazard_track_0 != "-1" and hazard_track_0 not in captioned_tracks:
                    hazard_chip = frame_image[int(challenge_objects[0]['bbox'][1]):int(challenge_objects[0]['bbox'][3]), 
                                              int(challenge_objects[0]['bbox'][0]):int(challenge_objects[0]['bbox'][2])]
                    hazard_chip = cv2.cvtColor(hazard_chip, cv2.COLOR_BGR2RGB)
                    hazard_caption = ci.interrogate(Image.fromarray(hazard_chip))
                    hazard_caption = hazard_caption.replace(",", " ")
                    captioned_tracks[hazard_track_0] = hazard_caption
            
            # Write the row to the CSV file
            csv_writer.writerow([
                f"{video_id}_{frame}", driver_state_flag, 
                hazard_track_0, hazard_name_0, 
                hazard_track_22, hazard_name_22
            ])
            
            frame += 1
        
        video_stream.release()
    
    print(f"Submission file for video {video_id} has been created at: {output_file}")


In [23]:
import os
import cv2
import openai
from ultralytics import YOLO
import pickle
import csv

openai.api_key = "....."  

# Paths
VIDEO_ROOT = '/kaggle/input/coool-benchmark/COOOL Benchmark'
ANNOTATION_PATH = '/kaggle/input/annotations-public-pkl/annotations_public.pkl'
OUTPUT_CSV = '/kaggle/working/submission.csv'

# Load annotations
print("Loading annotations...")
try:
    with open(ANNOTATION_PATH, 'rb') as f:
        annotations = pickle.load(f)
    print("Annotations loaded successfully.")
except Exception as e:
    print(f"Error loading annotations: {e}")
    exit()

# Load YOLO model
print("Loading YOLO model...")
try:
    model = YOLO("/kaggle/input/yolov8x/keras/default/1/yolov8x.pt")  # Adjust the path to your YOLOv8 model
    print("YOLO model loaded successfully.")
except Exception as e:
    print(f"Error loading YOLO model: {e}")
    exit()

# Prepare CSV Header
header = ["ID", "Driver_State_Changed", "Hazard_Track_0", "Hazard_Name_0"]

# Function to generate a scenario explanation using GPT
def generate_gpt_explanation(detections):
    """
    Use GPT to generate a detailed scenario explanation based on detected objects.
    """
    if not detections:
        return "No significant hazards detected."
    
    object_list = [f"a {obj['name']}" for obj in detections]
    prompt = (
        f"The following objects were detected in a traffic video frame: {', '.join(object_list)}. "
        "Describe the scene in detail and indicate any potential hazards or unusual activities. "
        "Add descriptions about lighting, weather, and the condition of the image (e.g., blurry, zoomed)."
    )
    
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4",  # Use GPT-4 for advanced responses
            messages=[
                {"role": "system", "content": "You are an assistant that describes scenes in traffic videos."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=300,  # Adjust token limit as needed
            temperature=0.7,
        )
        explanation = response['choices'][0]['message']['content'].strip()
        return explanation
    except Exception as e:
        print(f"Error generating explanation: {e}")
        return "Error generating explanation."

# Main processing loop
with open(OUTPUT_CSV, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)  # Write the header only once

    print("Processing videos...\n")

    for video_id in sorted(annotations.keys()):
        video_path = os.path.join(VIDEO_ROOT, f"{video_id}.mp4")
        
        if not os.path.exists(video_path):
            print(f"Skipping missing video: {video_path}")
            continue

        video_stream = cv2.VideoCapture(video_path)
        if not video_stream.isOpened():
            print(f"Failed to open video: {video_id}")
            continue

        print(f"Processing video: {video_id}")
        frame_count = 0
        previous_detections = []  # To track state changes

        while video_stream.isOpened():
            ret, frame_image = video_stream.read()
            if not ret:
                break

            # YOLO Detection
            results = model(frame_image)
            detected_objects = []

            # Extract object classes
            if results and results[0].boxes is not None:
                for det in results[0].boxes:
                    class_id = int(det.cls)
                    class_name = model.names[class_id]
                    detected_objects.append({"name": class_name})

            # Use GPT to generate explanation
            hazard_description = generate_gpt_explanation(detected_objects)
            driver_state_changed = "True" if detected_objects != previous_detections else "False"
            previous_detections = detected_objects

            # Construct row with hazards
            if detected_objects:
                row = [
                    f"{video_id}_{frame_count}",
                    driver_state_changed,
                    "1",  # Hazard detected
                    hazard_description
                ]
            else:
                row = [f"{video_id}_{frame_count}", driver_state_changed, "", "No significant hazards detected."]

            # Append to CSV file immediately after processing each frame
            writer.writerow(row)
            print(f"Frame {frame_count}: {row}")  # Print for verification

            frame_count += 1

        video_stream.release()
        print(f"Completed video: {video_id}\n")

print("Processing complete.")



Loading annotations...
Annotations loaded successfully.
Loading YOLO model...
YOLO model loaded successfully.
Processing videos...

Processing video: video_0001

0: 384x640 1 person, 6 cars, 1 traffic light, 274.4ms
Speed: 2.2ms preprocess, 274.4ms inference, 1.1ms postprocess per image at shape (1, 3, 384, 640)
Error generating explanation: 

You tried to access openai.ChatCompletion, but this is no longer supported in openai>=1.0.0 - see the README at https://github.com/openai/openai-python for the API.

You can run `openai migrate` to automatically upgrade your codebase to use the 1.0.0 interface. 

Alternatively, you can pin your installation to the old version, e.g. `pip install openai==0.28`

A detailed migration guide is available here: https://github.com/openai/openai-python/discussions/742

Frame 0: ['video_0001_0', 'True', '1', 'Error generating explanation.']

0: 384x640 1 person, 6 cars, 1 traffic light, 443.2ms
Speed: 2.1ms preprocess, 443.2ms inference, 1.1ms postprocess 

KeyboardInterrupt: 

In [20]:
!pip install clip_interrogator


Collecting clip_interrogator
  Downloading clip_interrogator-0.6.0-py3-none-any.whl (787 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m787.8/787.8 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting open-clip-torch
  Downloading open_clip_torch-2.29.0-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m25.7 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy, open-clip-torch, clip_interrogator
Successfully installed clip_interrogator-0.6.0 ftfy-6.3.1 open-clip-torch-2.29.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To 

In [22]:
import os
import cv2
import pickle
import csv
from clip_interrogator import Config, Interrogator
from PIL import Image
import numpy as np
from ultralytics import YOLO

# Initialize the clip_interrogator with default configuration
config = Config()
interrogator = Interrogator(config)

# Paths
VIDEO_ROOT = '/kaggle/input/coool-benchmark/COOOL Benchmark'
ANNOTATION_PATH = '/kaggle/input/annotations-public-pkl/annotations_public.pkl'
OUTPUT_CSV = '/kaggle/working/submission.csv'

# Load annotations
print("Loading annotations...")
try:
    with open(ANNOTATION_PATH, 'rb') as f:
        annotations = pickle.load(f)
    print("Annotations loaded successfully.")
except Exception as e:
    print(f"Error loading annotations: {e}")
    exit()

# Load YOLO model
print("Loading YOLO model...")
try:
    model = YOLO("/kaggle/input/yolov8x/keras/default/1/yolov8x.pt")  # Adjust the path to your YOLOv8 model
    print("YOLO model loaded successfully.")
except Exception as e:
    print(f"Error loading YOLO model: {e}")
    exit()

# Prepare CSV Header
header = ["ID", "Driver_State_Changed", "Hazard_Track_0", "Hazard_Name_0"]

# Function to generate a scenario explanation using CLIP Interrogator
def generate_clip_description(frame_image):
    """
    Use CLIP Interrogator to generate a description for the given frame image.
    Converts NumPy array (BGR) to PIL Image (RGB) before processing.
    """
    try:
        # Convert BGR (OpenCV) to RGB (PIL)
        frame_rgb = cv2.cvtColor(frame_image, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(frame_rgb)

        # Generate description for the frame image
        description = interrogator.interrogate(pil_image)
        return description
    except Exception as e:
        print(f"Error generating description with CLIP Interrogator: {e}")
        return "Error generating description."

# Main processing loop
with open(OUTPUT_CSV, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)  # Write the header only once

    print("Processing videos...\n")

    for video_id in sorted(annotations.keys()):
        video_path = os.path.join(VIDEO_ROOT, f"{video_id}.mp4")
        
        if not os.path.exists(video_path):
            print(f"Skipping missing video: {video_path}")
            continue

        video_stream = cv2.VideoCapture(video_path)
        if not video_stream.isOpened():
            print(f"Failed to open video: {video_id}")
            continue

        print(f"Processing video: {video_id}")
        frame_count = 0
        previous_detections = []  # To track state changes

        while video_stream.isOpened():
            ret, frame_image = video_stream.read()
            if not ret:
                break

            # YOLO Detection
            results = model(frame_image)
            detected_objects = []

            # Extract object classes
            if results and results[0].boxes is not None:
                for det in results[0].boxes:
                    class_id = int(det.cls)
                    class_name = model.names[class_id]
                    detected_objects.append({"name": class_name})

            # Generate description using CLIP Interrogator
            hazard_description = generate_clip_description(frame_image)
            driver_state_changed = "True" if detected_objects != previous_detections else "False"
            previous_detections = detected_objects

            # Construct row with hazards
            if detected_objects:
                row = [
                    f"{video_id}_{frame_count}",
                    driver_state_changed,
                    "1",  # Hazard detected
                    hazard_description
                ]
            else:
                row = [f"{video_id}_{frame_count}", driver_state_changed, "", "No significant hazards detected."]

            # Append to CSV file immediately after processing each frame
            writer.writerow(row)
            print(f"Frame {frame_count}: {row}")  # Print for verification

            frame_count += 1

        video_stream.release()
        print(f"Completed video: {video_id}\n")

print("Processing complete.")


Loading caption model blip-large...
Loading CLIP model ViT-L-14/openai...
Loaded CLIP model and data in 6.01 seconds.
Loading annotations...
Annotations loaded successfully.
Loading YOLO model...
YOLO model loaded successfully.
Processing videos...

Processing video: video_0001

0: 384x640 1 person, 6 cars, 1 traffic light, 305.6ms
Speed: 2.1ms preprocess, 305.6ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)


  with torch.no_grad(), torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():
100%|██████████| 55/55 [00:00<00:00, 150.44it/s]
  with torch.no_grad(), torch.cuda.amp.autocast():
  with torch.no_grad(), torch.cuda.amp.autocast():
Flavor chain:  12%|█▎        | 4/32 [04:57<34:39, 74.28s/it]


KeyboardInterrupt: 

In [3]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.51-py3-none-any.whl (901 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m901.3/901.3 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting ultralytics-thop>=2.0.0
  Downloading ultralytics_thop-2.0.13-py3-none-any.whl (26 kB)
Collecting py-cpuinfo
  Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)
Installing collected packages: py-cpuinfo, ultralytics-thop, ultralytics
Successfully installed py-cpuinfo-9.0.0 ultralytics-8.3.51 ultralytics-thop-2.0.13
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
import os
import cv2
import pickle
import csv
from ultralytics import YOLO

# Paths
VIDEO_ROOT = '/kaggle/input/coool-benchmark/COOOL Benchmark'
ANNOTATION_PATH = '/kaggle/input/annotations-public-pkl/annotations_public.pkl'
OUTPUT_CSV = '/kaggle/working/submission.csv'

# Load annotations
print("Loading annotations...")
try:
    with open(ANNOTATION_PATH, 'rb') as f:
        annotations = pickle.load(f)
    print("Annotations loaded successfully.")
except Exception as e:
    print(f"Error loading annotations: {e}")
    exit()

# Load YOLO model
print("Loading YOLO model...")
try:
    model = YOLO("/kaggle/input/yolov8x/keras/default/1/yolov8x.pt")  # Adjust the path to your YOLOv8 model
    print("YOLO model loaded successfully.")
except Exception as e:
    print(f"Error loading YOLO model: {e}")
    exit()

# Prepare CSV Header
header = ["ID", "Driver_State_Changed", "Hazard_Track_0", "Hazard_Name_0"]

# Function to generate a simple description based on object counts
def generate_simple_description(detected_objects):
    """
    Generate a simple description based on the count of detected objects (persons, cars, traffic lights).
    """
    person_count = sum(1 for obj in detected_objects if obj['name'] == 'person')
    car_count = sum(1 for obj in detected_objects if obj['name'] == 'car')
    traffic_light_count = sum(1 for obj in detected_objects if obj['name'] == 'traffic light')

    # Create a description string based on object counts
    description = f"{person_count} person, {car_count} cars, {traffic_light_count} traffic light"
    return description

# Main processing loop
with open(OUTPUT_CSV, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(header)  # Write the header only once

    print("Processing videos...\n")

    for video_id in sorted(annotations.keys()):
        video_path = os.path.join(VIDEO_ROOT, f"{video_id}.mp4")
        
        if not os.path.exists(video_path):
            print(f"Skipping missing video: {video_path}")
            continue

        video_stream = cv2.VideoCapture(video_path)
        if not video_stream.isOpened():
            print(f"Failed to open video: {video_id}")
            continue

        print(f"Processing video: {video_id}")
        frame_count = 0
        previous_detections = []  # To track state changes

        while video_stream.isOpened():
            ret, frame_image = video_stream.read()
            if not ret:
                break

            # YOLO Detection
            results = model(frame_image)
            detected_objects = []

            # Extract object classes
            if results and results[0].boxes is not None:
                for det in results[0].boxes:
                    class_id = int(det.cls)
                    class_name = model.names[class_id]
                    detected_objects.append({"name": class_name})

            # Generate description based on detected objects
            hazard_description = generate_simple_description(detected_objects)
            driver_state_changed = "True" if detected_objects != previous_detections else "False"
            previous_detections = detected_objects

            # Construct row with hazards
            if detected_objects:
                row = [
                    f"{video_id}_{frame_count}",
                    driver_state_changed,
                    "1",  # Hazard detected
                    hazard_description
                ]
            else:
                row = [
                    f"{video_id}_{frame_count}",
                    "False",  # No change in state
                    "0",  # No hazard detected
                    "0"  # No description
                ]

            # Append to CSV file immediately after processing each frame
            writer.writerow(row)
            print(f"Frame {frame_count}: {row}")  # Print for verification

            frame_count += 1

        video_stream.release()
        print(f"Completed video: {video_id}\n")

print("Processing complete.")


Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
Loading annotations...
Annotations loaded successfully.
Loading YOLO model...
YOLO model loaded successfully.
Processing videos...

Processing video: video_0001

0: 384x640 1 person, 6 cars, 1 traffic light, 322.5ms
Speed: 2.8ms preprocess, 322.5ms inference, 1.6ms postprocess per image at shape (1, 3, 384, 640)
Frame 0: ['video_0001_0', 'True', '1', '1 person, 6 cars, 1 traffic light']

0: 384x640 1 person, 6 cars, 1 traffic light, 288.7ms
Speed: 1.7ms preprocess, 288.7ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)
Frame 1: ['video_0001_1', 'False', '1', '1 person, 6 cars, 1 traffic light']

0: 384x640 2 persons, 5 cars, 1 traffic light, 320.1ms
Speed: 1.9ms 

In [6]:
!pip install clip_interrogator

Collecting clip_interrogator
  Downloading clip_interrogator-0.6.0-py3-none-any.whl (787 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m787.8/787.8 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting open-clip-torch
  Downloading open_clip_torch-2.29.0-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: ftfy, open-clip-torch, clip_interrogator
Successfully installed clip_interrogator-0.6.0 ftfy-6.3.1 open-clip-torch-2.29.0
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To 

In [None]:
import torch
import numpy as np
import pickle
import cv2
import os
from sklearn.linear_model import LinearRegression
from PIL import Image
from clip_interrogator import Config, Interrogator
from concurrent.futures import ThreadPoolExecutor

# Directly setting paths in the script (you can skip the argparse part)
VIDEO_ROOT = '/kaggle/input/coool-benchmark/COOOL Benchmark'
ANNOTATION_PATH = '/kaggle/input/annotations-public-pkl/annotations_public.pkl'

# Assertions for input files
assert os.path.exists(ANNOTATION_PATH), "Annotations file not found."
annotation_file = open(ANNOTATION_PATH, 'rb')
annotations = pickle.load(annotation_file)  # Load annotations
annotation_file.close()

# Initialize output CSV file
results_file = open("results.csv", 'w')
results_file.write("ID,Driver_State_Changed")
for i in range(23):
    results_file.write(f",Hazard_Track_{i},Hazard_Name_{i}")
results_file.write("\n")

# Setup captioning model
ci = Interrogator(Config(clip_model_name="ViT-L-14/openai"))

# Create function to handle per-frame processing
def process_frame(video, frame, video_stream, previous_centroids, median_dists, captioned_tracks, driver_state_flag, results_file):
    ret, frame_image = video_stream.read()
    if ret == False:  # End of video or error
        return False, previous_centroids, driver_state_flag  # Return False to signal end of video

    # Gather BBoxes from annotations
    bboxes, centroids, chips, track_ids = [], [], [], []
    for ann_type in ['challenge_object']:
        for i in range(len(annotations[video][frame][ann_type])):
            x1, y1, x2, y2 = annotations[video][frame][ann_type][i]['bbox']
            track_ids.append(annotations[video][frame][ann_type][i]['track_id'])
            bboxes.append([x1, y1, x2, y2])
            centroids.append([x1 + (abs(x2 - x1) / 2), y1 + (abs(y2 - y1) / 2)])
            chips.append(frame_image[int(y1):int(y2), int(x1):int(x2)])
    bboxes = np.array(bboxes)
    centroids = np.array(centroids)

    if len(bboxes) == 0 or len(previous_centroids) == 0:
        return True, centroids, driver_state_flag  # Skip this frame

    ### Driver state change detection
    dists = [np.min(np.linalg.norm(previous_centroids - centroid, axis=1)) for centroid in centroids]
    median_dist = np.median(dists)
    median_dists.append(median_dist)

    if len(median_dists) > 1:
        x = np.array(range(len(median_dists))).reshape(-1, 1)
        y = np.array(median_dists)
        speed_model = LinearRegression().fit(x, y)
        if speed_model.coef_[0] < 0:  # Slowing down detected
            driver_state_flag = True

    ### Hazard detection
    image_center = [frame_image.shape[1] / 2, frame_image.shape[0] / 2]
    potential_hazard_dists = np.linalg.norm(centroids - image_center, axis=1)
    probable_hazard = np.argmin(potential_hazard_dists)
    hazard_track = track_ids[probable_hazard]

    ### Hazard description
    if hazard_track not in captioned_tracks:
        hazard_chip = cv2.cvtColor(chips[probable_hazard], cv2.COLOR_BGR2RGB)
        hazard_chip = Image.fromarray(hazard_chip)
        hazard_caption = ci.interrogate(hazard_chip)
        hazard_caption = hazard_caption.replace(",", " ")  # Remove commas
        captioned_tracks[hazard_track] = hazard_caption
    else:
        hazard_caption = captioned_tracks[hazard_track]  # Use cached caption

    # Write result to CSV
    results_file.write(f"{video}_{frame},{driver_state_flag},{hazard_track},{hazard_caption}" + "".join([", , " for _ in range(22)]) + '\n')

    return True, centroids, driver_state_flag

# Create a ThreadPoolExecutor for concurrent processing
with ThreadPoolExecutor(max_workers=4) as executor:
    for video in sorted(list(annotations.keys())):
        video_path = os.path.join(VIDEO_ROOT, video + ".mp4")
        
        # Skip if the video file is not found
        if not os.path.exists(video_path):
            print(f"Skipping video {video}.mp4, file not found.")
            continue  # Skip to the next video if the current one doesn't exist
        
        video_stream = cv2.VideoCapture(video_path)
        assert video_stream.isOpened()

        frame = 0
        previous_centroids = []
        median_dists = []
        captioned_tracks = {}
        driver_state_flag = False

        # Process frames in parallel
        while video_stream.isOpened():
            print(f'{video}_{frame}')
            continue_processing, previous_centroids, driver_state_flag = executor.submit(
                process_frame, video, frame, video_stream, previous_centroids, median_dists, captioned_tracks, driver_state_flag, results_file).result()

            if not continue_processing:
                break

            frame += 1

results_file.close()


Loading caption model blip-large...
Loading CLIP model ViT-L-14/openai...


ViT-L-14_openai_artists.safetensors: 100%|██████████| 16.2M/16.2M [00:00<00:00, 65.7MB/s]
ViT-L-14_openai_flavors.safetensors: 100%|██████████| 155M/155M [00:00<00:00, 207MB/s] 
ViT-L-14_openai_mediums.safetensors: 100%|██████████| 146k/146k [00:00<00:00, 16.5MB/s]
ViT-L-14_openai_movements.safetensors: 100%|██████████| 307k/307k [00:00<00:00, 24.7MB/s]
ViT-L-14_openai_trendings.safetensors: 100%|██████████| 111k/111k [00:00<00:00, 18.6MB/s]
ViT-L-14_openai_negative.safetensors: 100%|██████████| 63.2k/63.2k [00:00<00:00, 14.6MB/s]


Loaded CLIP model and data in 16.58 seconds.
video_0001_0
video_0001_1


  with torch.no_grad(), torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():
100%|██████████| 55/55 [00:00<00:00, 163.24it/s]
  with torch.no_grad(), torch.cuda.amp.autocast():
  with torch.no_grad(), torch.cuda.amp.autocast():
Flavor chain:  25%|██▌       | 8/32 [08:55<26:45, 66.92s/it]
100%|██████████| 55/55 [00:00<00:00, 161.41it/s]
100%|██████████| 6/6 [00:00<00:00, 144.25it/s]
100%|██████████| 50/50 [00:00<00:00, 183.78it/s]
  with torch.no_grad(), torch.cuda.amp.autocast():


video_0001_2
video_0001_3
video_0001_4
video_0001_5
video_0001_6
video_0001_7
video_0001_8
video_0001_9
video_0001_10
video_0001_11
video_0001_12
video_0001_13
video_0001_14
video_0001_15
video_0001_16
video_0001_17
video_0001_18
video_0001_19


  with torch.no_grad(), torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():
100%|██████████| 55/55 [00:00<00:00, 182.48it/s]
  with torch.no_grad(), torch.cuda.amp.autocast():
  with torch.no_grad(), torch.cuda.amp.autocast():
Flavor chain:  44%|████▍     | 14/32 [15:00<19:17, 64.30s/it]
100%|██████████| 55/55 [00:00<00:00, 184.18it/s]
100%|██████████| 6/6 [00:00<00:00, 139.13it/s]
100%|██████████| 50/50 [00:00<00:00, 186.69it/s]
  with torch.no_grad(), torch.cuda.amp.autocast():


video_0001_20
video_0001_21
video_0001_22
video_0001_23
video_0001_24
video_0001_25
video_0001_26
video_0001_27
video_0001_28
video_0001_29
video_0001_30
video_0001_31
video_0001_32
video_0001_33
video_0001_34
video_0001_35
video_0001_36
video_0001_37
video_0001_38
video_0001_39
video_0001_40
video_0001_41
video_0001_42


  with torch.no_grad(), torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():
100%|██████████| 55/55 [00:00<00:00, 179.66it/s]
  with torch.no_grad(), torch.cuda.amp.autocast():
  with torch.no_grad(), torch.cuda.amp.autocast():
Flavor chain:  41%|████      | 13/32 [14:02<20:31, 64.83s/it]
100%|██████████| 55/55 [00:00<00:00, 178.52it/s]
100%|██████████| 6/6 [00:00<00:00, 130.91it/s]
100%|██████████| 50/50 [00:00<00:00, 184.57it/s]
  with torch.no_grad(), torch.cuda.amp.autocast():


video_0001_43
video_0001_44
video_0001_45
video_0001_46
video_0001_47
video_0001_48
video_0001_49
video_0001_50
video_0001_51
video_0001_52
video_0001_53
video_0001_54
video_0001_55
video_0001_56
video_0001_57
video_0001_58
video_0001_59
video_0001_60
video_0001_61
video_0001_62
video_0001_63
video_0001_64
video_0001_65
video_0001_66
video_0001_67
video_0001_68
video_0001_69
video_0001_70
video_0001_71
video_0001_72
video_0001_73
video_0001_74
video_0001_75
video_0001_76
video_0001_77
video_0001_78
video_0001_79
video_0001_80
video_0001_81
video_0001_82
video_0001_83
video_0001_84
video_0001_85
video_0001_86
video_0001_87
video_0001_88
video_0001_89
video_0001_90


  with torch.no_grad(), torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():
100%|██████████| 55/55 [00:00<00:00, 168.75it/s]
  with torch.no_grad(), torch.cuda.amp.autocast():
  with torch.no_grad(), torch.cuda.amp.autocast():
Flavor chain:  31%|███▏      | 10/32 [10:59<24:11, 65.97s/it]
100%|██████████| 55/55 [00:00<00:00, 170.18it/s]
100%|██████████| 6/6 [00:00<00:00, 134.36it/s]
100%|██████████| 50/50 [00:00<00:00, 182.76it/s]
  with torch.no_grad(), torch.cuda.amp.autocast():


video_0001_91
video_0001_92
video_0001_93
video_0001_94
video_0001_95
video_0001_96
video_0001_97
video_0001_98
video_0001_99
video_0001_100
video_0001_101
video_0001_102
video_0001_103
video_0001_104
video_0001_105
video_0001_106
video_0001_107
video_0001_108
video_0001_109
video_0001_110
video_0001_111
video_0001_112
video_0001_113
video_0001_114
video_0001_115
video_0001_116
video_0001_117
video_0001_118
video_0001_119
video_0001_120
video_0001_121
video_0001_122
video_0001_123
video_0001_124
video_0001_125
video_0001_126
video_0001_127
video_0001_128
video_0001_129
video_0001_130
video_0001_131
video_0001_132
video_0001_133
video_0001_134
video_0001_135
video_0001_136


  with torch.no_grad(), torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():
100%|██████████| 55/55 [00:00<00:00, 172.68it/s]
  with torch.no_grad(), torch.cuda.amp.autocast():
  with torch.no_grad(), torch.cuda.amp.autocast():
Flavor chain:  44%|████▍     | 14/32 [15:06<19:25, 64.76s/it]
100%|██████████| 55/55 [00:00<00:00, 167.51it/s]
100%|██████████| 6/6 [00:00<00:00, 129.02it/s]
100%|██████████| 50/50 [00:00<00:00, 157.24it/s]
  with torch.no_grad(), torch.cuda.amp.autocast():


video_0001_137
video_0001_138
video_0001_139
video_0001_140
video_0001_141
video_0001_142
video_0001_143
video_0001_144
video_0001_145
video_0001_146
video_0001_147
video_0001_148
video_0001_149
video_0001_150
video_0001_151
video_0001_152
video_0001_153
video_0001_154
video_0001_155
video_0001_156
video_0001_157
video_0001_158
video_0001_159
video_0001_160
video_0001_161
video_0001_162
video_0001_163
video_0001_164
video_0001_165
video_0001_166
video_0001_167
video_0001_168
video_0001_169
video_0001_170
video_0001_171
video_0001_172
video_0001_173
video_0001_174
video_0001_175
video_0001_176
video_0001_177
video_0001_178
video_0001_179
video_0001_180
video_0001_181
video_0001_182
video_0001_183
video_0001_184
video_0001_185
video_0001_186
video_0001_187
video_0001_188
video_0001_189
video_0001_190
video_0001_191
video_0001_192
video_0001_193
video_0001_194
video_0001_195
video_0001_196
video_0001_197
video_0001_198
video_0001_199
video_0001_200
video_0001_201
video_0001_202
video_0001

  with torch.no_grad(), torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():
100%|██████████| 55/55 [00:00<00:00, 166.46it/s]
  with torch.no_grad(), torch.cuda.amp.autocast():
  with torch.no_grad(), torch.cuda.amp.autocast():
Flavor chain:  28%|██▊       | 9/32 [10:12<26:04, 68.01s/it]
100%|██████████| 55/55 [00:00<00:00, 165.71it/s]
100%|██████████| 6/6 [00:00<00:00, 131.79it/s]
100%|██████████| 50/50 [00:00<00:00, 169.38it/s]
  with torch.no_grad(), torch.cuda.amp.autocast():


video_0001_274
video_0001_275
video_0001_276
video_0001_277
video_0001_278
video_0001_279
video_0001_280
video_0001_281
video_0001_282
video_0001_283
video_0001_284
video_0001_285
video_0001_286
video_0001_287
video_0001_288
video_0001_289
video_0001_290
video_0001_291
video_0001_292
video_0001_293
video_0001_294
video_0001_295
video_0001_296
video_0001_297
video_0001_298
video_0001_299
video_0001_300
video_0001_301
video_0001_302
video_0001_303
video_0001_304
video_0001_305
video_0001_306
video_0001_307
video_0001_308
video_0001_309
video_0001_310
video_0001_311
video_0001_312
video_0001_313
video_0001_314
video_0001_315
video_0001_316
video_0001_317
video_0001_318
video_0001_319
video_0001_320
video_0001_321
video_0001_322
video_0001_323
video_0001_324
video_0001_325
video_0001_326
video_0001_327
video_0001_328
video_0001_329
video_0001_330
video_0001_331
video_0001_332
video_0001_333
video_0001_334
video_0001_335
video_0001_336
video_0001_337
video_0001_338
video_0001_339
video_0001

  with torch.no_grad(), torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():
100%|██████████| 55/55 [00:00<00:00, 161.89it/s]
  with torch.no_grad(), torch.cuda.amp.autocast():
  with torch.no_grad(), torch.cuda.amp.autocast():
Flavor chain:  31%|███▏      | 10/32 [11:07<24:27, 66.71s/it]
100%|██████████| 55/55 [00:00<00:00, 164.66it/s]
100%|██████████| 6/6 [00:00<00:00, 139.25it/s]
100%|██████████| 50/50 [00:00<00:00, 178.70it/s]
  with torch.no_grad(), torch.cuda.amp.autocast():


video_0003_2
video_0003_3
video_0003_4
video_0003_5
video_0003_6
video_0003_7
video_0003_8
video_0003_9
video_0003_10
video_0003_11
video_0003_12
video_0003_13
video_0003_14
video_0003_15
video_0003_16
video_0003_17
video_0003_18
video_0003_19
video_0003_20
video_0003_21
video_0003_22
video_0003_23
video_0003_24
video_0003_25
video_0003_26
video_0003_27
video_0003_28
video_0003_29
video_0003_30
video_0003_31
video_0003_32
video_0003_33
video_0003_34
video_0003_35
video_0003_36
video_0003_37
video_0003_38
video_0003_39
video_0003_40
video_0003_41
video_0003_42
video_0003_43
video_0003_44
video_0003_45
video_0003_46
video_0003_47
video_0003_48
video_0003_49
video_0003_50
video_0003_51
video_0003_52
video_0003_53
video_0003_54
video_0003_55
video_0003_56
video_0003_57
video_0003_58
video_0003_59
video_0003_60
video_0003_61
video_0003_62
video_0003_63
video_0003_64
video_0003_65
video_0003_66
video_0003_67
video_0003_68


  with torch.no_grad(), torch.cuda.amp.autocast():
  with torch.cuda.amp.autocast():
100%|██████████| 55/55 [00:00<00:00, 177.73it/s]
  with torch.no_grad(), torch.cuda.amp.autocast():
  with torch.no_grad(), torch.cuda.amp.autocast():
Flavor chain:   6%|▋         | 2/32 [02:01<30:27, 60.91s/it]

In [2]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.51-py3-none-any.whl (901 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m901.3/901.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting ultralytics-thop>=2.0.0
  Downloading ultralytics_thop-2.0.13-py3-none-any.whl (26 kB)
Collecting py-cpuinfo
  Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)
Installing collected packages: py-cpuinfo, ultralytics-thop, ultralytics
Successfully installed py-cpuinfo-9.0.0 ultralytics-8.3.51 ultralytics-thop-2.0.13
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
import os
import cv2
import numpy as np
import torch
import pickle
from torch import nn
from ultralytics import YOLO  # For object detection
from transformers import BlipProcessor, BlipForConditionalGeneration
from concurrent.futures import ThreadPoolExecutor
from PIL import Image

# Paths
VIDEO_ROOT = '/kaggle/input/coool-benchmark/COOOL Benchmark'
ANNOTATION_PATH = '/kaggle/input/annotations-public-pkl/annotations_public.pkl'

# Assertions
assert os.path.exists(ANNOTATION_PATH), "Annotations file not found."
with open(ANNOTATION_PATH, 'rb') as f:
    annotations = pickle.load(f)  # Load annotations

# Initialize Output File
results_file = open("results.csv", 'w')
results_file.write("ID,Driver_State_Changed")
for i in range(23):
    results_file.write(f",Hazard_Track_{i},Hazard_Name_{i}")
results_file.write("\n")

# Load Models
detector = YOLO("/kaggle/input/yolov8x/keras/default/1/yolov8x.pt")  # Pre-trained YOLO model
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base").to("cpu")  # Use CPU

# Utility function for generating captions
def generate_caption(image_chip):
    """
    Generate a caption for the given image chip using BLIP model.
    """
    try:
        if image_chip is None or image_chip.size == 0:
            return "Invalid Image"
        
        pil_image = Image.fromarray(cv2.cvtColor(image_chip, cv2.COLOR_BGR2RGB))
        inputs = processor(pil_image, return_tensors="pt").to("cpu")
        with torch.no_grad():
            outputs = blip_model.generate(**inputs)
        return processor.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        print(f"Error generating caption: {e}")
        return "Error generating caption"

# Main video processing function
def process_video(video, annotations, results_file):
    video_path = os.path.join(VIDEO_ROOT, video + ".mp4")
    if not os.path.exists(video_path):
        print(f"Skipping video {video}.mp4, file not found.")
        return

    print(f"Processing video: {video}")
    video_stream = cv2.VideoCapture(video_path)
    assert video_stream.isOpened(), f"Failed to open video {video_path}"

    frame_id = 0
    previous_centroids = []
    median_dists = []
    captioned_tracks = {}
    driver_state_flag = False

    while True:
        ret, frame = video_stream.read()
        if not ret:
            break

        # Object Detection
        results = detector.predict(frame, device="cpu")  # Use CPU for YOLO
        bboxes = results[0].boxes.xyxy.cpu().numpy() if len(results) > 0 else []
        labels = results[0].boxes.cls.cpu().numpy() if len(results) > 0 else []  # Class labels
        centroids = []
        chips = []
        track_ids = []

        for idx, bbox in enumerate(bboxes):
            x1, y1, x2, y2 = bbox.astype(int)
            centroids.append([(x1 + x2) / 2, (y1 + y2) / 2])
            chips.append(frame[y1:y2, x1:x2])
            track_ids.append(len(track_ids))  # Placeholder for track IDs
            print(f"Frame {frame_id}: Detected object at {x1},{y1},{x2},{y2} with label {labels[idx]}")

        centroids = np.array(centroids)

        # Driver State Detection (LSTM)
        if len(previous_centroids) > 0:
            dists = [
                np.min(np.linalg.norm(previous_centroids - centroid, axis=1))
                for centroid in centroids
            ]
            median_dist = np.median(dists) if dists else 0
            median_dists.append(median_dist)

            if len(median_dists) >= 5:  # Use last 5 distances for LSTM input
                input_sequence = torch.tensor(median_dists[-5:], dtype=torch.float32).unsqueeze(0).unsqueeze(2).to("cpu")  # Use CPU
                with torch.no_grad():
                    slowing_prob = torch.sigmoid(lstm_model(input_sequence)).item()
                if slowing_prob > 0.5:  # Threshold for state change
                    driver_state_flag = True

        # Hazard Detection and Description
        if len(centroids) > 0:
            image_center = [frame.shape[1] / 2, frame.shape[0] / 2]
            distances_to_center = np.linalg.norm(centroids - image_center, axis=1)
            probable_hazard_idx = np.argmin(distances_to_center)

            hazard_chip = chips[probable_hazard_idx]
            hazard_caption = (
                captioned_tracks.get(track_ids[probable_hazard_idx])
                or generate_caption(hazard_chip)
            )
            captioned_tracks[track_ids[probable_hazard_idx]] = hazard_caption

            print(f"Frame {frame_id}: Hazard identified as '{hazard_caption}'")

        # Write Results to File
        results_file.write(
            f"{video}_{frame_id},{driver_state_flag},{track_ids[probable_hazard_idx] if len(track_ids) > 0 else 'None'},{hazard_caption if len(centroids) > 0 else 'None'}" +
            "".join([", , " for _ in range(22)]) +
            "\n"
        )

        # Update Previous Centroids
        previous_centroids = centroids
        frame_id += 1

    video_stream.release()
    print(f"Finished processing video: {video}")

# Parallel Video Processing
with ThreadPoolExecutor(max_workers=4) as executor:
    futures = []
    for video in sorted(annotations.keys()):
        futures.append(executor.submit(process_video, video, annotations, results_file))

    for future in futures:
        future.result()  # Wait for all tasks to complete

results_file.close()

Skipping video video_0004.mp4, file not found.Skipping video video_0002.mp4, file not found.
Skipping video video_0005.mp4, file not found.
Processing video: video_0001

Skipping video video_0007.mp4, file not found.
Skipping video video_0008.mp4, file not found.
Processing video: video_0003
Processing video: video_0009
Processing video: video_0006

Ultralytics 8.3.51 🚀 Python-3.10.15 torch-2.4.0+cu121 CPU (Intel Xeon 2.00GHz)

Ultralytics 8.3.51 🚀 Python-3.10.15 torch-2.4.0+cu121 CPU (Intel Xeon 2.00GHz)

Ultralytics 8.3.51 🚀 Python-3.10.15 torch-2.4.0+cu121 CPU (Intel Xeon 2.00GHz)

0: 384x640 1 person, 6 cars, 1 traffic light, 331.9ms
Speed: 2.2ms preprocess, 331.9ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)
Frame 0: Detected object at 1571,456,1919,620 with label 2.0
Frame 0: Detected object at 1792,399,1858,669 with label 0.0
Frame 0: Detected object at 983,432,1036,475 with label 2.0
Frame 0: Detected object at 923,436,955,459 with label 2.0
Frame 0: Detect



YOLOv8x summary (fused): 268 layers, 68,200,608 parameters, 0 gradients, 257.8 GFLOPs
YOLOv8x summary (fused): 268 layers, 68,200,608 parameters, 0 gradients, 257.8 GFLOPs
YOLOv8x summary (fused): 268 layers, 68,200,608 parameters, 0 gradients, 257.8 GFLOPs
0: 384x640 1 car, 405.1ms
Speed: 3.3ms preprocess, 405.1ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)
Frame 0: Detected object at 4,834,1917,1059 with label 2.0
0: 384x640 1 car, 375.6ms
Speed: 3.7ms preprocess, 375.6ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)
Frame 0: Detected object at 1246,537,1306,588 with label 2.0
Frame 0: Hazard identified as 'a blur of a person in the middle of a room'

0: 384x640 (no detections), 344.4ms
Speed: 8.6ms preprocess, 344.4ms inference, 1.4ms postprocess per image at shape (1, 3, 384, 640)

Frame 0: Hazard identified as 'a car is parked in a garage'

0: 384x640 1 person, 6 cars, 1 traffic light, 330.3ms
Speed: 2.9ms preprocess, 330.3ms inference, 1.5m