In [1]:
import os
import json
import numpy as np
import tensorflow as tf
import cv2
from concurrent.futures import ThreadPoolExecutor
from keras import layers

class ULSAMLayer(layers.Layer):
    """Keras layer that processes the channel axis in ``groups`` equal splits.

    The input is split along its last axis into ``groups`` pieces. Every piece
    is passed through the same three sub-layer instances (1x1 depthwise
    convolution, 3x3 stride-1 max pooling, 1x1 convolution with 80 filters),
    and the processed pieces are concatenated back along the last axis. The
    output therefore has ``groups * 80`` channels.

    Args:
        groups: Number of equal splits of the channel axis. ``tf.split`` is
            called with this integer, so the channel count must be divisible
            by it.
        **kwargs: Forwarded to ``layers.Layer``.
    """

    def __init__(self, groups=8, **kwargs):
        super(ULSAMLayer, self).__init__(**kwargs)
        self.groups = groups
        # Sub-layers are created once here and reused for every split in
        # ``process_split``. Their creation order is kept stable because
        # saved weights are matched to sub-layers by order.
        self.depthwise_conv = layers.DepthwiseConv2D(kernel_size=1, strides=1, padding='same')
        self.max_pool = layers.MaxPooling2D(pool_size=3, strides=1, padding='same')
        self.conv = layers.Conv2D(filters=80, kernel_size=1, strides=1, padding='same')

    def call(self, input_tensor):
        """Split the input along the last axis, process each split, concatenate."""
        channels = input_tensor.shape[-1]
        group_size = channels // self.groups
        splits = tf.split(input_tensor, num_or_size_splits=self.groups, axis=-1)
        output_splits = [self.process_split(split, group_size) for split in splits]
        return tf.concat(output_splits, axis=-1)

    def process_split(self, split, group_size):
        """Apply depthwise conv -> max pool -> 1x1 conv to one channel split.

        ``group_size`` is accepted for interface compatibility; the sub-layers
        do not need it.
        """
        x = self.depthwise_conv(split)
        x = self.max_pool(x)
        x = self.conv(x)
        return x

    def get_config(self):
        """Return the layer config including ``groups``.

        Without this override the constructor argument is not guaranteed to be
        written when a model containing the layer is saved, so a reloaded
        layer could silently fall back to the default ``groups=8``.
        """
        config = super(ULSAMLayer, self).get_config()
        config.update({"groups": self.groups})
        return config

def get_bounding_box(cam, img_size=(224, 224), threshold=0.5):
    """Return the normalized bounding box of the largest activated CAM region.

    Args:
        cam: 2-D activation map; it is resized with ``cv2.resize`` to
            ``img_size`` before thresholding.
        img_size: Target size passed to ``cv2.resize``; element 0 is used to
            normalize x/width and element 1 to normalize y/height.
        threshold: Values strictly greater than this count as activated.

    Returns:
        ``(x, y, w, h)`` of the bounding rectangle of the contour with the
        largest area, each divided by the matching ``img_size`` entry, or
        four zeros when no contour is found.
    """
    heatmap = cv2.resize(cam, img_size)
    mask = (heatmap > threshold).astype(np.uint8)
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    if contours:
        biggest = max(contours, key=cv2.contourArea)
        left, top, box_w, box_h = cv2.boundingRect(biggest)
        frame_w = img_size[0]
        frame_h = img_size[1]
        return (left / frame_w, top / frame_h, box_w / frame_w, box_h / frame_h)

    # Nothing exceeded the threshold.
    return 0.0, 0.0, 0.0, 0.0

def generate_cam(img, class_idx, cam_model):
    """Compute a Grad-CAM-style activation map for one class.

    Args:
        img: Model input; converted to a float32 tensor unless it already is
            a ``tf.Tensor``.
        class_idx: Index into the second output of ``cam_model`` (row 0)
            whose score is differentiated.
        cam_model: Callable returning ``(conv_output, predictions)``.

    Returns:
        NumPy array: the ReLU-ed, gradient-weighted sum of ``conv_output``
        over its last axis, divided by its maximum when that maximum is
        positive, with size-1 dimensions squeezed out.
    """
    if isinstance(img, tf.Tensor):
        inputs = img
    else:
        inputs = tf.convert_to_tensor(img, dtype=tf.float32)

    with tf.GradientTape() as tape:
        tape.watch(inputs)
        feature_maps, scores = cam_model(inputs)
        target_score = scores[0, class_idx]

    # Gradient of the class score w.r.t. the feature maps, averaged over
    # every axis except the last to give one weight per channel.
    gradients = tape.gradient(target_score, feature_maps)
    channel_weights = tf.reduce_mean(gradients, axis=(0, 1, 2))

    heatmap = tf.reduce_sum(tf.multiply(channel_weights, feature_maps), axis=-1)
    heatmap = tf.maximum(heatmap, 0)

    # Normalize to [0, 1] only when there is a positive peak to divide by.
    peak = tf.reduce_max(heatmap)
    if peak > 0:
        heatmap = heatmap / peak

    return tf.squeeze(heatmap).numpy()

def parse_image(img_path):
    """Load an image file as a normalized batch of one.

    The file is read, decoded with ``tf.image.decode_png`` as 3 channels,
    resized to 224x224, cast to float32 and divided by 255. The result gets
    a new leading axis via ``np.expand_dims``.
    """
    print(f"Parsing image: {os.path.basename(img_path)}")
    raw_bytes = tf.io.read_file(img_path)
    pixels = tf.image.resize(
        tf.image.decode_png(raw_bytes, channels=3),
        (224, 224),
    )
    scaled = tf.cast(pixels, tf.float32) / 255.0
    # Add the leading batch axis.
    return np.expand_dims(scaled, axis=0)

def read_mapping(file_path):
    """Read a comma-separated triplet mapping file.

    Blank lines and lines starting with ``#`` are ignored. For every other
    line the first four comma-separated fields are parsed as integers and
    stored as ``mapping[field0] = (field1, field2, field3)``. Lines that have
    fewer than four fields or contain a non-integer in the first four fields
    are skipped instead of aborting the load.

    Args:
        file_path: Path of the mapping file.

    Returns:
        dict mapping the triplet id to an ``(instrument, target, verb)``
        tuple, in the file's column order.
        NOTE(review): the names assume columns 2 and 3 are target and verb;
        confirm against the mapping file's header.
    """
    print(f"Reading mapping file from: {file_path}")
    mapping = {}
    with open(file_path, 'r') as f:
        for line in f:
            stripped = line.strip()
            if not stripped or stripped.startswith("#"):
                continue
            fields = stripped.split(",")
            try:
                # Unpacking fewer than four fields raises ValueError too, so
                # short lines are skipped the same way as non-numeric ones.
                triplet, instrument, target, verb = (
                    int(field.strip()) for field in fields[:4]
                )
            except ValueError:
                continue
            mapping[triplet] = (instrument, target, verb)
    print(f"Loaded {len(mapping)} triplet mappings")
    return mapping

def process_frame(args):
    """Process a single frame.

    Args:
        args: Tuple ``(image_path, model, cam_model, triplet_mapping)``.
            ``model`` is called on the parsed image; its output ``[1][0]`` is
            used as the per-triplet score vector and ``[0][0]`` is indexed
            by instrument id. ``triplet_mapping`` maps a triplet index to a
            tuple whose first element is the instrument index.

    Returns:
        ``(frame_id, result)`` where ``frame_id`` is the integer prefix of
        the file name as a string and ``result`` is a dict with keys
        ``"detection"`` (one entry per triplet scoring above 0.5) and
        ``"recognition"`` (all triplet scores as a list).
        ``result`` is ``None`` when processing failed. If the file name has
        no numeric prefix, ``(file_name, None)`` is returned so that one
        stray file in the folder cannot abort the whole video.
    """
    image_path, model, cam_model, triplet_mapping = args
    file_name = os.path.basename(image_path)

    try:
        frame_id = str(int(file_name.split(".")[0]))
    except ValueError:
        print(f"Skipping {file_name}: file name does not start with a numeric frame index")
        return file_name, None
    print(f"Processing frame {frame_id}")

    try:
        image = parse_image(image_path)

        print(f"Running model inference for frame {frame_id}")
        model_output = model(tf.convert_to_tensor(image))

        # Convert once to NumPy instead of touching tensors element by element.
        instrument_scores = model_output[0][0].numpy()
        triplet_scores = model_output[1][0].numpy()

        recognition = triplet_scores.tolist()
        confidence_indices = [
            (idx, conf) for idx, conf in enumerate(triplet_scores) if conf > 0.5
        ]
        print(f"Found {len(confidence_indices)} confident predictions in frame {frame_id}")

        # Several triplets can share an instrument; the CAM and its bounding
        # box depend only on the image and the instrument, so compute each once.
        bbox_by_instrument = {}
        list_triplet = []
        for idx, confidence in confidence_indices:
            print(f"Processing triplet {idx} with confidence {confidence:.3f} in frame {frame_id}")
            instrument_idx = triplet_mapping[idx][0]
            if instrument_idx not in bbox_by_instrument:
                cam = generate_cam(image, instrument_idx, cam_model)
                bbox_by_instrument[instrument_idx] = get_bounding_box(cam)
            bbox_x, bbox_y, bbox_w, bbox_h = bbox_by_instrument[instrument_idx]
            tool_prob = float(instrument_scores[instrument_idx])

            list_triplet.append({
                "triplet": idx,
                "instrument": [instrument_idx, tool_prob, bbox_x, bbox_y, bbox_w, bbox_h]
            })

        result = {
            "detection": list_triplet,
            "recognition": recognition
        }

        print(f"Completed processing frame {frame_id}")
        return frame_id, result

    except Exception as e:
        # Per-frame boundary: report and drop the frame, keep the video going.
        print(f"Error processing frame {frame_id}: {str(e)}")
        return frame_id, None

def process_video(video_folder, base_path, model, cam_model, triplet_mapping, max_workers=None):
    """Run ``process_frame`` over every file of one video folder using threads.

    Args:
        video_folder: Folder name below ``base_path`` holding the frames.
        base_path: Directory that contains the video folders.
        model, cam_model, triplet_mapping: Passed through to ``process_frame``.
        max_workers: Thread count for the ``ThreadPoolExecutor``.

    Returns:
        dict mapping frame id to the frame result, omitting frames whose
        result is ``None``.
    """
    print(f"\nStarting processing of video: {video_folder}")
    frames_dir = os.path.join(base_path, video_folder)

    # Sorted directory listing -> full paths.
    frame_files = sorted(os.listdir(frames_dir))
    image_paths = [os.path.join(frames_dir, name) for name in frame_files]
    print(f"Found {len(image_paths)} frames in {video_folder}")

    # One argument tuple per frame, as expected by process_frame.
    tasks = [(path, model, cam_model, triplet_mapping) for path in image_paths]

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        video_dict = {
            frame_id: outcome
            for frame_id, outcome in pool.map(process_frame, tasks)
            if outcome is not None
        }

    print(f"Completed processing video: {video_folder}")
    return video_dict

def main():
    """Run the full prediction pipeline and write the results to JSON.

    Loads the saved Keras model (registering ``ULSAMLayer`` as a custom
    object), builds a second model exposing the outputs of the layers named
    ``re_lu_34`` and ``instrument_output``, reads the triplet mapping, then
    processes each video folder in its own thread.

    Returns:
        dict mapping video folder name to that video's per-frame results.
        Videos whose processing raised are reported and left out.
    """
    print("Starting main execution")

    base_path = r"C:\Users\satya\OneDrive\Desktop\projects\BH\test\videos"
    desired_order = ["VID92", "VID96", "VID103", "VID110", "VID111"]
    file_path = r"C:\Users\satya\OneDrive\Desktop\projects\BH\dict\maps.txt"
    # Single source of truth for the output file, so the log message cannot
    # drift from the file that is actually written.
    output_path = "video_prediction.json"

    print("Loading model...")
    model = tf.keras.models.load_model(
        r"C:\Users\satya\OneDrive\Desktop\projects\BH\model_final.h5",
        custom_objects={'ULSAMLayer': ULSAMLayer}
    )
    print("Model loaded successfully")

    print("Creating CAM model...")
    cam_model = tf.keras.Model(
        inputs=model.input,
        outputs=[model.get_layer('re_lu_34').output, model.get_layer('instrument_output').output]
    )
    print("CAM model created")

    triplet_mapping = read_mapping(file_path)

    print(f"\nStarting video processing for {len(desired_order)} videos")
    overall_dict = {}

    # Process videos in parallel
    with ThreadPoolExecutor(max_workers=len(desired_order)) as executor:
        print("Initialized thread pool executor")
        future_to_video = {
            executor.submit(
                process_video,
                video_folder,
                base_path,
                model,
                cam_model,
                triplet_mapping,
                os.cpu_count()  # Thread count for the per-video frame pool
            ): video_folder
            for video_folder in desired_order
        }

        # Collect in submission order so the output keeps desired_order.
        for future, video_folder in future_to_video.items():
            try:
                print(f"\nWaiting for results from video {video_folder}")
                video_dict = future.result()
                overall_dict[video_folder] = video_dict
                print(f"Completed processing video {video_folder}")
            except Exception as e:
                print(f"Error processing video {video_folder}: {str(e)}")

    print(f"\nSaving results to {output_path}")
    with open(output_path, "w") as f:
        json.dump(overall_dict, f, indent=2)
        print("Processing completed successfully")
    return overall_dict

# Run the pipeline and keep the combined results available in the session.
overall_dict = main()

Starting main execution
Loading model...





Model loaded successfully
Creating CAM model...
CAM model created
Reading mapping file from: C:\Users\satya\OneDrive\Desktop\projects\BH\dict\maps.txt
Loaded 100 triplet mappings

Starting video processing for 5 videos
Initialized thread pool executor

Starting processing of video: VID92

Starting processing of video: VID96

Starting processing of video: VID103

Starting processing of video: VID110

Starting processing of video: VID111

Waiting for results from video VID92
Found 1707 frames in VID96
Found 2124 frames in VID92
Processing frame 0
Parsing image: 000000.png
Processing frame 0
Parsing image: 000000.png
Processing frame 1
Parsing image: 000001.png
Processing frame 1
Parsing image: 000001.png
Processing frame 2
Parsing image: 000002.png
Processing frame 2
Parsing image: 000002.png
Found 2146 frames in VID111
Found 2220 frames in VID103
Processing frame 3
Parsing image: 000003.png
Found 2177 frames in VID110
Processing frame 4
Parsing image: 000004.png
Processing frame 5
Parsi