# In [None]:
import os
import cv2
import json
import numpy as np
from tqdm import tqdm
from sklearn.cluster import KMeans
from skimage.color import rgb2lab, deltaE_cie76

# Reference palette: color name -> [R, G, B] with 8-bit channel values.
# Insertion order matters: it is the order in which names are compared when
# a cluster color is matched against the palette.
color_palette = dict(
    black=[0, 0, 0],
    blue=[0, 0, 255],
    brown=[150, 75, 0],
    green=[0, 128, 0],
    grey=[128, 128, 128],
    orange=[255, 165, 0],
    pink=[255, 192, 203],
    purple=[128, 0, 128],
    red=[255, 0, 0],
    white=[255, 255, 255],
    yellow=[255, 255, 0],
)

# CIELAB version of every palette entry, computed once at import time.
# Each RGB triple is wrapped as a 1x1x3 uint8 "image" before being handed
# to rgb2lab, so each value here has shape (1, 1, 3).
palette_lab = {
    name: rgb2lab(np.array(rgb, dtype=np.uint8).reshape(1, 1, 3))
    for name, rgb in color_palette.items()
}

def get_closest_color_name(lab_color):
    """Return the palette color name nearest to ``lab_color`` in CIELAB space.

    Every entry of the module-level ``palette_lab`` mapping is compared with
    ``lab_color`` using ``deltaE_cie76``. The name with the smallest distance
    is returned; on a tie the entry that appears first in ``palette_lab`` is
    kept because only a strictly smaller distance replaces the current best.
    Returns ``None`` when no entry produces a distance below infinity
    (for example when ``palette_lab`` is empty).
    """
    best_name = None
    best_distance = float('inf')
    for candidate in palette_lab:
        candidate_distance = deltaE_cie76(lab_color, palette_lab[candidate])
        if not candidate_distance < best_distance:
            continue
        best_name, best_distance = candidate, candidate_distance
    return best_name

def extract_dominant_colors_and_bboxes(image, num_clusters=5):
    """Cluster an image's pixels by color and return one box per cluster.

    The pixels are grouped with KMeans (fixed ``random_state=42``). Each
    cluster center is converted to CIELAB and mapped to the nearest palette
    name via ``get_closest_color_name``. For every cluster the axis-aligned
    bounding box enclosing all of its pixels is reported.

    Args:
        image: Array of shape (height, width, 3). The cluster centers are
            passed to ``rgb2lab``, so the channels are expected to be RGB.
        num_clusters: Number of clusters requested from KMeans.

    Returns:
        A list of dicts, each with a ``"bounding_box"`` (``xmin``, ``ymin``,
        ``width``, ``height`` as ints) and a ``"color_name"``. ``width`` and
        ``height`` are ``max - min`` of the pixel coordinates, so a cluster
        confined to a single column or row has width or height 0. Clusters
        that received no pixels are omitted, so the list may be shorter than
        ``num_clusters``.
    """
    # Unpacking three values also rejects inputs that are not 3-dimensional.
    _, w, _ = image.shape
    pixels = image.reshape((-1, 3))

    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(pixels)
    labels = kmeans.labels_

    # Truncate the float centers to ints, then view them as k separate
    # 1x1 uint8 "images" so rgb2lab yields an array of shape (k, 1, 3).
    dominant_colors = kmeans.cluster_centers_.astype(int)
    dominant_colors_lab = rgb2lab(np.uint8([[dominant_colors]]).reshape(-1, 1, 3))

    bounding_boxes = []
    for cluster_idx in range(num_clusters):
        flat_indices = np.flatnonzero(labels == cluster_idx)
        if flat_indices.size == 0:
            # With fewer distinct colors than requested clusters (e.g. a
            # solid-color frame) some labels are never assigned; min()/max()
            # on the empty index array would raise ValueError.
            continue

        # Labels follow the row-major flattening of the image, so a flat
        # index splits into (row, column) by dividing by the image width.
        pixel_y, pixel_x = np.divmod(flat_indices, w)
        xmin, xmax = pixel_x.min(), pixel_x.max()
        ymin, ymax = pixel_y.min(), pixel_y.max()

        bounding_boxes.append({
            "bounding_box": {
                "xmin": int(xmin),
                "ymin": int(ymin),
                "width": int(xmax - xmin),
                "height": int(ymax - ymin)
            },
            "color_name": get_closest_color_name(dominant_colors_lab[cluster_idx])
        })

    return bounding_boxes

def save_dominant_colors_to_json(image, output_path, num_clusters=5):
    """Write the dominant-color bounding boxes of ``image`` to a JSON file.

    The boxes are produced by ``extract_dominant_colors_and_bboxes`` and are
    stored under the single top-level key
    ``"dominant_colors_bounding_boxes"``. ``output_path`` is opened in write
    mode, so an existing file at that path is replaced.
    """
    payload = {
        "dominant_colors_bounding_boxes": extract_dominant_colors_and_bboxes(
            image, num_clusters=num_clusters
        ),
    }
    with open(output_path, 'w') as out_file:
        json.dump(payload, out_file, indent=4)

def process_images_in_directory(input_base_dir, output_base_dir, num_clusters=5):
    """Generate a color-metadata JSON file for every keyframe image.

    Walks ``input_base_dir/<video_folder>/<keyframe_subfolder>/<image>``,
    skipping video folders whose name contains ``'_reduced'``. For each file
    ending in ``.jpg``, ``.png`` or ``.jpeg`` the image is loaded, converted
    from BGR to RGB, and its dominant-color bounding boxes are written to
    ``output_base_dir/<video_folder>/<keyframe_subfolder>/<stem>.json``.

    Files that OpenCV cannot decode are reported and skipped instead of
    aborting the whole run.

    Args:
        input_base_dir: Root directory containing one folder per video.
        output_base_dir: Root directory for the JSON output; created if
            missing.
        num_clusters: Number of color clusters to extract per image.
    """
    os.makedirs(output_base_dir, exist_ok=True)

    for video_folder in os.listdir(input_base_dir):
        video_folder_path = os.path.join(input_base_dir, video_folder)
        if not os.path.isdir(video_folder_path) or '_reduced' in video_folder:
            continue

        for keyframe_subfolder in os.listdir(video_folder_path):
            keyframe_subfolder_path = os.path.join(video_folder_path, keyframe_subfolder)
            if not os.path.isdir(keyframe_subfolder_path):
                continue

            output_dir = os.path.join(output_base_dir, video_folder, keyframe_subfolder)

            for image_filename in tqdm(os.listdir(keyframe_subfolder_path)):
                if not image_filename.endswith(('.jpg', '.png', '.jpeg')):
                    continue

                image_path = os.path.join(keyframe_subfolder_path, image_filename)
                image = cv2.imread(image_path)
                if image is None:
                    # imread signals an unreadable or corrupt file by
                    # returning None; cvtColor would raise on it.
                    print(f"Skipping unreadable image {image_path}")
                    continue
                image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

                os.makedirs(output_dir, exist_ok=True)
                output_json_path = os.path.join(
                    output_dir, os.path.splitext(image_filename)[0] + '.json'
                )
                save_dominant_colors_to_json(image, output_json_path, num_clusters=num_clusters)
                print(f"Processed {image_filename}, saved output to {output_json_path}")

# Source keyframes and destination for the generated JSON metadata.
# Both paths are relative, so they resolve against the current working
# directory.
input_directory = '../keyframe_information/keyframe'
output_directory = '../keyframe_information/color_metadata'

# Run the extraction over the whole keyframe tree with five clusters per image.
process_images_in_directory(
    input_base_dir=input_directory,
    output_base_dir=output_directory,
    num_clusters=5,
)