# Imports

In [2]:
import os
import requests
import functools
import pathlib
import shutil
from tqdm import tqdm
import threading
from multiprocessing import Pool
import concurrent
from tqdm import tqdm
import pickle
import numpy as np
import cv2
import os

# Settings base variables and paths

In [3]:
# Set the base folder path for the project
output_path = "../output"
images_path = os.path.join(output_path, "images")
metadata_path = os.path.join(output_path, "metadata")
config_path = os.path.join(output_path, "config")

list_of_paths = [output_path, images_path, metadata_path, config_path]

# Set the base URL for the dataset
metadata_extension = "json"

yolo_cfg = "https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg"
yolo_weights = "https://pjreddie.com/media/files/yolov3.weights"
yolo_classes = "https://raw.githubusercontent.com/pjreddie/darknet/master/data/coco.names"

files_to_download = [
    (yolo_cfg, os.path.join(config_path, "yolov3.cfg")),
    (yolo_weights, os.path.join(config_path, "yolov3.weights")),
    (yolo_classes, os.path.join(config_path, "coco.names")),
]

# Create folder structure
The code creates the folder structure for the project. The folder structure is as follows:
- output
    - images
    - metadata
    - config

This method creates a folder with the given path if it doesn't already exist, It also outputs a message to inform the user if the folder was created or if it already exists.
This is useful for organizing and managing files in a project. By creating a folder to store data and resources, it keeps the working directory tidy and makes it easier to locate files. Additionally, by checking if the folder exists before creating it, it prevents the program from overwriting existing data or throwing an error.

In [4]:
def create_folder(path):
    """
    This function creates a folder at the specified path.
    If the folder already exists, it will print a message saying so.
    If there is an error creating the folder, it will print the error message.

    Parameters:
        :param path (str): The path of the folder to be created.

    Returns:
    None
    """
    try:
        # Use os.mkdir to create the folder at the specified path
        os.mkdir(path)
        print(f"Folder {path} created")
    except FileExistsError:
        # If the folder already exists, print a message saying so
        print(f"Folder {path} already exists")
    except Exception as e:
        # If there is an error creating the folder, print the error message
        print(f"Error creating folder {path}: {e}")

# Create the folder structure
This method initializes a list of folders by calling the create_folder method for each folder in the list.
The purpose of this method is to make sure that all necessary folders exist before the program continues its execution.
If a folder does not exist, the create_folder method will create it. If a folder already exists, the method will simply print a message indicating that the folder already exists. In case of any other error, the method will print the error message.

In [5]:
def init_folder(folder_names: list):
    for folder_name in folder_names:
        create_folder(folder_name)

In [6]:
init_folder(list_of_paths)

Folder ../output already exists
Folder ../output/images already exists
Folder ../output/metadata already exists
Folder ../output/config already exists


# Define methods for downloading the necessary files for yolo
The following code block is a method to download a file from a given URL and save it to a specified filename.
The method starts by creating a session (s = requests.Session()) and then mounting it to the URL (s.mount(url, requests.adapters.HTTPAdapter(max_retries=3))). This sets the maximum number of retries to 3 if the connection to the URL fails.
Then, the method makes a GET request to the URL (r = s.get(url, stream=True, allow_redirects=True)) and checks if it returns a successful response (r.raise_for_status()). If there was an HTTP error during the request, the error message is printed (print(f"HTTP error occurred while downloading dataset: {e}")).
The method also checks the file size specified in the response headers and assigns it to the variable file_size (file_size = int(r.headers.get(‘Content-Length’, 0))). If the file size is 0, a default value of “(Unknown total file size)” is assigned to the variable desc; otherwise, the variable desc is left empty.
Next, the method resolves the file path and creates a directory if it doesn’t already exist (path.parent.mkdir(parents=True, exist_ok=True)). The method then creates a tqdm progress bar to show the download progress (with tqdm.tqdm(total=file_size, unit=‘B’, unit_scale=True, desc=desc) as pbar:).
Finally, the method writes the contents of the file to disk in chunks (for chunk in r.iter_content(chunk_size=1024):), updating the progress bar for each chunk that is written to disk (pbar.update(len(chunk))). If an error occurred during the download, a message with the error is printed (print(f"Error occurred while downloading dataset: {e}")). The file path is returned when the method is finished.

In [7]:
def download(url, filename):
    """
    This download a file from a given URL and save it to a specified filename.

    Parameters:
        :param url (str): The URL of the file to be downloaded.
        :param filename (str): The filename to save the file as.

    Returns:
    path (str): The path of the downloaded file.
    """
    try:
        # Create a session object to persist the state of connection
        s = requests.Session()
        # Retry connecting to the URL up to 3 times
        s.mount(url, requests.adapters.HTTPAdapter(max_retries=3))
        # Send a GET request to the URL to start the download
        r = s.get(url, stream=True, allow_redirects=True)
        # Raise an error if the response is not 200 OK
        r.raise_for_status()
        # Get the file size from the Content-Length header, default to 0 if not present
        file_size = int(r.headers.get('Content-Length', 0))
        # Get the absolute path to the target file
        path = pathlib.Path(filename).expanduser().resolve()
        # Create parent directories if they don't exist
        path.parent.mkdir(parents=True, exist_ok=True)
        # Set the description to display while downloading, "(Unknown total file size)" if file size is 0
        desc = "(Unknown total file size)" if file_size == 0 else ""
        # Enable decoding the response content
        r.raw.read = functools.partial(r.raw.read, decode_content=True)
        # Use tqdm to display the download progress
        with tqdm(total=file_size, unit='B', unit_scale=True, desc=desc) as pbar:
            # Open the target file in binary write mode
            with path.open("wb") as f:
                # Write each chunk of data from the response to the file
                for chunk in r.iter_content(chunk_size=1024):
                    f.write(chunk)
                    pbar.update(len(chunk))
        # Return the path to the downloaded file
        return path
    # Handle HTTP error if the response is not 200 OK
    except requests.exceptions.HTTPError as e:
        print(f"HTTP error occurred while downloading dataset: {e}")
    # Handle any other exceptions that might occur while downloading the file
    except Exception as e:
        print(f"Error occurred while downloading dataset: {e}")

In [8]:
# Download the yolov3 files

In [9]:
for url, filename in files_to_download:
    # check if the file already exists
    if not pathlib.Path(filename).exists():
        # download the file
        download(url, filename)

print("All files are downloaded")

All files are downloaded


# Define methods to get all the image paths
The get_all_images method is used to retrieve all images present in the specified image path. It uses the os.walk function to traverse through all subdirectories within the image path and collects the file names that end with either '.png' or '.jpg' extensions. The full path of each image is then generated by joining the root directory and the file name. The method returns a list of all images' full paths. In case of any error, an error message is printed and an empty list is returned.

In [10]:
def get_all_images(path):
    """Get all images from the given path.

    Args:
    param: image_path (str): path to the directory containing the images.

    Returns:
    - list: a list of full path to all the images with png or jpg extensions.
    - empty list: an empty list if an error occurred while fetching images.
    """
    try:
        # use os.walk to traverse all the subdirectories and get all images
        return [os.path.join(root, name)
                for root, dirs, files in os.walk(path)
                for name in files
                if name.endswith((".png", ".jpg"))]
    except Exception as e:
        # return an empty list and log the error message if an error occurred
        print(f"An error occurred while fetching images: {e}")
        return []

In [11]:
import json


def get_labels(label_path):
    # check if the file exists
    if not pathlib.Path(label_path).exists():
        print(f"Label file {label_path} does not exist")
        return []
    return open(label_path).read().strip().split("\n")

def get_colors(labels):
    return np.random.randint(0, 255, size=(len(labels), 3), dtype="uint8")

def detect(
        path_name,
        output_folder = "../output/images_labelized/",
        label_path = "../output/config/coco.names",
        weights_path = "../output/config/yolov3.weights",
        config_path = "../output/config/yolov3.cfg",
        CONFIDENCE = 0.5,
        SCORE_THRESHOLD = 0.5,
        IOU_THRESHOLD = 0.5
):
    labels = get_labels(label_path)
    colors = get_colors(labels)

    # load the COCO class labels our YOLO model was trained on
    net = cv2.dnn.readNetFromDarknet(config_path, weights_path)

    image = cv2.imread(path_name)
    file_name = os.path.basename(path_name)
    filename, ext = file_name.split(".")

    h, w = image.shape[:2]
    # create 4D blob
    blob = cv2.dnn.blobFromImage(image, 1/255.0, (416, 416), swapRB=True, crop=False)

    # sets the blob as the input of the network
    net.setInput(blob)
    # get all the layer names
    ln = net.getLayerNames()
    try:
        ln = [ln[i[0] - 1] for i in net.getUnconnectedOutLayers()]
    except IndexError:
        # in case getUnconnectedOutLayers() returns 1D array when CUDA isn't available
        ln = [ln[i - 1] for i in net.getUnconnectedOutLayers()]
    # feed forward (inference) and get the network output
    # measure how much it took in seconds
    layer_outputs = net.forward(ln)

    font_scale = 1
    thickness = 1
    boxes, confidences, class_ids = [], [], []
    text_labels = []
    # loop over each of the layer outputs
    for output in layer_outputs:
        # loop over each of the object detections
        for detection in output:
            # extract the class id (label) and confidence (as a probability) of
            # the current object detection
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            # discard out weak predictions by ensuring the detected
            # probability is greater than the minimum probability
            if confidence > CONFIDENCE:
                # scale the bounding box coordinates back relative to the
                # size of the image, keeping in mind that YOLO actually
                # returns the center (x, y)-coordinates of the bounding
                # box followed by the boxes' width and height
                box = detection[:4] * np.array([w, h, w, h])
                (centerX, centerY, width, height) = box.astype("int")
                # use the center (x, y)-coordinates to derive the top and
                # and left corner of the bounding box
                x = int(centerX - (width / 2))
                y = int(centerY - (height / 2))
                # update our list of bounding box coordinates, confidences,
                # and class IDs
                boxes.append([x, y, int(width), int(height)])
                confidences.append(float(confidence))
                class_ids.append(class_id)

        # loop over the indexes we are keeping
        for i in range(len(boxes)):
            text_labels.append(labels[class_ids[i]])
            # extract the bounding box coordinates
            x, y = boxes[i][0], boxes[i][1]
            w, h = boxes[i][2], boxes[i][3]
            # draw a bounding box rectangle and label on the image
            color = [int(c) for c in colors[class_ids[i]]]
            cv2.rectangle(image, (x, y), (x + w, y + h), color=color, thickness=thickness)
            text = f"{labels[class_ids[i]]}: {confidences[i]:.2f}"
            # calculate text width & height to draw the transparent boxes as background of the text
            (text_width, text_height) = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, fontScale=font_scale, thickness=thickness)[0]
            text_offset_x = x
            text_offset_y = y - 5
            box_coords = ((text_offset_x, text_offset_y), (text_offset_x + text_width + 2, text_offset_y - text_height))
            overlay = image.copy()
            cv2.rectangle(overlay, box_coords[0], box_coords[1], color=color, thickness=cv2.FILLED)
            # add opacity (transparency to the box)
            image = cv2.addWeighted(overlay, 0.6, image, 0.4, 0)
            # now put the text (label: confidence %)
            cv2.putText(image, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX,
                        fontScale=font_scale, color=(0, 0, 0), thickness=thickness)

    cv2.imwrite(os.path.join(output_folder, filename + "." + ext), image)

    return text_labels



def update_tags(images):
    # Run the YOLOv3 algorithm on each image
    # display progress bar in the first thread only
    for image in tqdm(images, desc="Updating tags"):
        # read pickle file from ../output/metadata/file_name.pkl
        file_name = os.path.basename(image)
        file_name, ext = file_name.split(".")
        print(file_name, ext)
        try:
            with open(os.path.join("../output/metadata", file_name + "." + metadata_extension), "rb") as f:
                if metadata_extension == "json":
                    metadata = json.load(f)
                else:
                    metadata = pickle.load(f)

            if "tags" in metadata:
                continue

            labels = detect(image)

            # Remove duplicates from labels
            labels = list(set(labels))
            # add labels to metadata
            metadata["tags"] = labels
            # save metadata to pickle file
            with open(os.path.join("../output/metadata", file_name + "." + metadata_extension), "wb") as f:
                if metadata_extension == "json":
                    json.dump(metadata, f)
                else:
                    pickle.dump(metadata, f)
        except FileNotFoundError:
            print("File not found: ", file_name)
            continue
        except Exception as e:
            print("Error: ", e)
            continue


# Get the list of images
images = os.listdir("../output/images")
images = [os.path.join("../output/images", image) for image in images]

update_tags(images)

Updating tags: 100%|██████████| 100/100 [00:00<00:00, 10378.09it/s]

000000563629 jpg
000000002746 jpg
000000109161 jpg
000000300857 jpg
000000052171 jpg
000000023788 jpg
000000179486 jpg
000000483911 jpg
000000469310 jpg
000000368241 jpg
000000348191 jpg
000000479665 jpg
000000350906 jpg
000000574433 jpg
000000210488 jpg
000000481860 jpg
000000230758 jpg
000000083258 jpg
000000271062 jpg
000000454091 jpg
000000520062 jpg
000000462523 jpg
000000078954 jpg
000000472730 jpg
000000576224 jpg
000000328895 jpg
000000146138 jpg
000000319490 jpg
000000512946 jpg
000000161587 jpg
000000097205 jpg
000000439269 jpg
000000346194 jpg
000000248813 jpg
000000374898 jpg
000000501084 jpg
000000257933 jpg
000000365995 jpg
000000051336 jpg
000000561770 jpg
000000353599 jpg
000000517180 jpg
000000142438 jpg
000000164399 jpg
000000276043 jpg
000000071958 jpg
000000380919 jpg
000000366522 jpg
000000023978 jpg
000000098650 jpg
000000089985 jpg
000000532328 jpg
000000053247 jpg
000000348807 jpg
000000108257 jpg
000000128187 jpg
000000321669 jpg
000000272231 jpg
000000087764 j




### Now, find dominant colors in the images

In [12]:
import cv2
import numpy as np
from sklearn.cluster import KMeans
from collections import Counter
from matplotlib import pyplot as plt
from tqdm import tqdm


# Convert RGB to HEX (RGB is an array of 3 values)
def rgb_to_hex(rgb):
    return '#%02x%02x%02x' % (int(rgb[0]), int(rgb[1]), int(rgb[2]))


# Find 4 dominant colors in the image
def find_dominant_colors(image, k=4, image_processing_size=None):
    # Need to return a list of 4 colors
    assert k <= 4, "k needs to be less than or equal to 4"
    # Resize image if new dims provided, just to speed up processing
    if image_processing_size is not None:
        image = cv2.resize(image, image_processing_size, interpolation=cv2.INTER_AREA)
    # Convert to RGB from BGR
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Reshape the image to be a list of pixels
    image = image.reshape((image.shape[0] * image.shape[1], 3))
    # Cluster and assign labels to the pixels
    clt = KMeans(n_clusters=k, n_init=10)
    labels = clt.fit_predict(image)
    # Count labels to find most popular
    label_counts = Counter(labels)
    # Subset out most popular centroid
    dominant_colors = [clt.cluster_centers_[i] for i in label_counts.keys()]
    # For each color, convert to hex and append to list
    dominant_colors = [rgb_to_hex(color) for color in dominant_colors]

    # find percent of each color in the image
    percent = [int((label_counts[i] / len(labels)) * 100) for i in label_counts.keys()]

    # return the 4 dominant colors and their percent
    dominant_colors = list(zip(dominant_colors, percent))
    return dominant_colors


# For each image in the folder "../images_labelized"
for image in tqdm(images, desc="Finding dominant colors"):
    img = cv2.imread("../output/images/" + image)
    dominant_color = find_dominant_colors(img)
    # save the image in the folder "../images_labelized"
    print(dominant_color)


Finding dominant colors:   0%|          | 0/100 [00:00<?, ?it/s][ WARN:0@25.931] global loadsave.cpp:244 findDecoder imread_('../output/images/../output/images/000000563629.jpg'): can't open/read file: check file path/integrity
Finding dominant colors:   0%|          | 0/100 [00:00<?, ?it/s]


error: OpenCV(4.7.0) /Users/opencv-cn/GHA-OCV-3/_work/opencv-python/opencv-python/opencv/modules/imgproc/src/color.cpp:182: error: (-215:Assertion failed) !_src.empty() in function 'cvtColor'


In [None]:
# Create histogram of the dominant colors
import matplotlib.pyplot as plt
import numpy as np

# For each image in the folder "../images_labelized" get the line that contains the dominant color
for image in tqdm(images, desc="Creating color pie charts"):
    with open("../output/labels/" + image.split(".")[0] + ".txt", "r") as f:
        for line in f:
            if "dominant_color" in line:
                # Get the dominant color
                dominant_color = line.split(":")[1]
                dominant_color = eval(dominant_color)
                # Create the pie chart
                labels = [i[0] for i in dominant_color]
                sizes = [i[1] for i in dominant_color]
                fig1, ax1 = plt.subplots()
                # change the colors of the pie chart
                colors = [i[0] for i in dominant_color]
                ax1.pie(sizes, colors=colors, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90)
                ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
                # Save the pie chart in the folder "../histogram"
                plt.savefig("../output/histogram/" + image.split(".")[0] + ".png")
                plt.close()


# Group the images by multiple data
# there is multiple ways to group the images (in the folder labels, we can find the label, the dominant color), in metadata folder, we have the metadata of each image

In [None]:
import pandas as pd
import pickle

# Create a dataframe with the metadata of each image, one pikle file per image
df = pd.DataFrame(columns=["image", "label", "dominant_color", "metadata"])

# For each image in the folder "../images_labelized" get the line that contains the dominant color
for image in tqdm(images, desc="Creating dataframe"):
    # Example of lines :
    # label:person,posX:113,posY:120,width:134,height:360,confidence:0.9987273216247559%
    # label:horse,posX:325,posY:187,width:282,height:270,confidence:0.9985314607620239%
    # label:handbag,posX:143,posY:385,width:82,height:91,confidence:0.5130287408828735%
    # dominant_color:[('#3e4542', 37), ('#6d6f6d', 22), ('#a6a4a7', 9), ('#191b1d', 30)]

    # Get the label
    with open("../output/labels/" + image.split(".")[0] + ".txt", "r") as f:
        # it's possible to have more than one label per image
        label = ""
        dominant_color = ""

        for line in f:
            if "label" in line:
                raw_label = line.split(":")[1]
                # cut as the first , is the end of the label
                label += raw_label.split(",")[0]
                label += ","
            if "dominant_color" in line:
                dominant_color = line.split(":")[1]
                dominant_color = eval(dominant_color)
                dominant_color = [i[0] for i in dominant_color]
                dominant_color = ",".join(dominant_color)
        # Get the metadata
        with open("../output/metadata/" + image.split(".")[0] + ".pickle", "rb") as f:
            metadata = pickle.load(f)
            # Convert metadata to string : key:value,key:value
            metadata = ",".join([f"{key}:{value}" for key, value in metadata.items()])

        # Add the image to the dataframe by using panda.concat
        df = pd.concat([df, pd.DataFrame([[image, label, dominant_color, metadata]],
                                         columns=["image", "label", "dominant_color", "metadata"])])

# Save the dataframe
df.to_csv("../output/dataframe.csv", index=False)


In [None]:
# Group the images by label
import os

# Create a folder for each label
for label in df["label"].unique():
    if not os.path.exists("../output/grouped_by_label/" + label):
        os.makedirs("../output/grouped_by_label/" + label)

# For each image in the folder "../images_labelized" get the line that contains the dominant color
for image in tqdm(images, desc="Grouping images by label"):
    # Get the label
    with open("../output/labels/" + image.split(".")[0] + ".txt", "r") as f:
        for line in f:
            if "label" in line:
                label = line.split(":")[1]
                label = label.split(",")[0]
    # Copy the image in the folder "../grouped_by_label"
    os.system(f"cp ../output/images/{image} ../output/grouped_by_label/{label}/{image}")

In [None]:
# Group image by dominant color, take the color one by one and create a folder for each color
for colors in df["dominant_color"].unique():
    # split colors by , to get individual colors
    colors = colors.split(",")
    for color in colors:
        if not os.path.exists("../output/grouped_by_dominant_color/" + color):
            os.makedirs("../output/grouped_by_dominant_color/" + color)

# For each image in the folder "../images_labelized" get the line that contains the dominant color
for image in tqdm(images, desc="Grouping images by dominant color"):
    # Get the dominant color
    with open("../output/labels/" + image.split(".")[0] + ".txt", "r") as f:
        for line in f:
            if "dominant_color" in line:
                dominant_color = line.split(":")[1]
                dominant_color = eval(dominant_color)
                dominant_color = [i[0] for i in dominant_color]
                dominant_color = ",".join(dominant_color)
                # split colors by , to get individual colors
                dominant_color = dominant_color.split(",")
                for color in dominant_color:
                    # Copy the image in the folder "../grouped_by_dominant_color"
                    os.system(f"cp ../output/images/{image} ../output/grouped_by_dominant_color/{color}/{image}")

In [None]:
# Get all metadata and group by similar metadata

In [None]:
for metadata in df["metadata"].unique():
    # Metadata are in string formated as key:value,key:value
    # Split the metadata by , to get individual metadata
    metadata = metadata.split(",")
    for meta in metadata:
        try:
            # Split the metadata by : to get the key and the value
            key, value = meta.split(":")
            if not os.path.exists(f"../output/grouped_by_metadata/{key}/{value}"):
                os.makedirs(f"../output/grouped_by_metadata/{key}/{value}")
        except:
            continue

# For each image in the folder "../images_labelized" get the line that contains the dominant color
for image in tqdm(images, desc="Grouping images by metadata"):
    # Get the metadata
    with open("../output/metadata/" + image.split(".")[0] + ".pickle", "rb") as f:
        metadata = pickle.load(f)
        # Convert metadata to string : key:value,key:value
        metadata = ",".join([f"{key}:{value}" for key, value in metadata.items()])
        # Split the metadata by , to get individual metadata
        metadata = metadata.split(",")
        for meta in metadata:
            try:
                # Split the metadata by : to get the key and the value
                key, value = meta.split(":")
            except:
                continue
            # Copy the image in the folder "../grouped_by_metadata"
            os.system(f"cp ../output/images/{image} ../output/grouped_by_metadata/{key}/{value}/{image}")