#### Automatic labeling with YOLO

In [1]:
# Load the images from the folder "../images"
import os

# List the file names found in the folder "../output/images".
# os.listdir returns the entries in arbitrary order, so they are sorted to make
# every later cell walk through the images in the same, reproducible order.
images = sorted(os.listdir("../output/images"))


# Download these files and place them in the folder "../output/config":
# https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg
# https://pjreddie.com/media/files/yolov3.weights
# https://raw.githubusercontent.com/pjreddie/darknet/master/data/coco.names

def download(url, filename):
    """Download ``url`` to ``filename`` while showing a progress bar.

    The payload is streamed into a temporary ``<filename>.part`` file that is
    renamed to ``filename`` only once the transfer has finished. An interrupted
    download therefore never leaves a truncated file at the final location,
    which matters for callers that skip the download when the file exists.

    Args:
        url: Address of the file to fetch; redirects are followed.
        filename: Destination path; ``~`` is expanded and missing parent
            folders are created.

    Returns:
        pathlib.Path: The resolved absolute path of the downloaded file.

    Raises:
        requests.HTTPError: If the server answers with a 4xx or 5xx status.
        RuntimeError: If the server answers with any other non-200 status.
    """
    import functools
    import pathlib
    import shutil
    import requests
    from tqdm.auto import tqdm

    path = pathlib.Path(filename).expanduser().resolve()
    partial_path = path.with_name(path.name + ".part")

    # The context manager releases the connection even when an error occurs.
    with requests.get(url, stream=True, allow_redirects=True) as r:
        if r.status_code != 200:
            r.raise_for_status()  # Only raises for 4xx/5xx codes, so...
            raise RuntimeError(f"Request to {url} returned status code {r.status_code}")
        file_size = int(r.headers.get('Content-Length', 0))

        path.parent.mkdir(parents=True, exist_ok=True)

        desc = "(Unknown total file size)" if file_size == 0 else ""
        r.raw.read = functools.partial(r.raw.read, decode_content=True)  # Decompress if needed
        try:
            with tqdm.wrapattr(r.raw, "read", total=file_size, desc=desc) as r_raw:
                with partial_path.open("wb") as f:
                    shutil.copyfileobj(r_raw, f)
        except BaseException:
            # Remove the incomplete file (also on Ctrl-C) before re-raising.
            if partial_path.exists():
                partial_path.unlink()
            raise

    # Publish the file under its final name only after a complete transfer.
    partial_path.replace(path)
    return path


# Make sure the output folders written to by the cells below exist
import os

for required_dir in (
    "../output/images_labelized",
    "../output/labels",
    "../output/config",
):
    if not os.path.exists(required_dir):
        os.makedirs(required_dir)

print("All folders are created")

# Fetch each YOLO file only when it is not already on disk
yolo_files = (
    ("../output/config/yolov3.cfg",
     "https://raw.githubusercontent.com/pjreddie/darknet/master/cfg/yolov3.cfg"),
    ("../output/config/yolov3.weights",
     "https://pjreddie.com/media/files/yolov3.weights"),
    ("../output/config/coco.names",
     "https://raw.githubusercontent.com/pjreddie/darknet/master/data/coco.names"),
)
for target_file, source_url in yolo_files:
    if not os.path.isfile(target_file):
        download(source_url, target_file)

print("All files are downloaded")

All folders are created
All files are downloaded


In [2]:
import cv2
import numpy as np
from tqdm import tqdm

# Load the YOLOv3 network from its weights and configuration files
net = cv2.dnn.readNet("../output/config/yolov3.weights", "../output/config/yolov3.cfg")

# One class name per line of coco.names
with open("../output/config/coco.names", "r") as f:
    classes = [line.strip() for line in f.readlines()]

layer_names = net.getLayerNames()
# getUnconnectedOutLayers() yields 1-based layer indices. Older OpenCV releases
# return them as an (N, 1) array and newer ones as a flat array, so the result
# is flattened before indexing to support both.
output_layers = [layer_names[i - 1] for i in np.array(net.getUnconnectedOutLayers()).flatten()]

# One random BGR drawing colour per class
colors = np.random.uniform(0, 255, size=(len(classes), 3))

# Loading the first image (None when the image folder is empty)
img = cv2.imread("../output/images/" + images[0]) if images else None


def labelize(img, img_name):
    """Detect objects in ``img`` with YOLO, draw them and log them to a label file.

    Relies on the notebook globals ``net``, ``output_layers``, ``classes`` and
    ``colors``.

    Args:
        img: Image array as returned by ``cv2.imread``; it is drawn on in place.
        img_name: File name of the image. The text before its first "." names
            the label file in "../output/labels/" that the detections are
            appended to.

    Returns:
        The same ``img`` array, with a rectangle and the class name drawn for
        every detection kept by non-maximum suppression.
    """
    height, width = img.shape[:2]

    # Detecting objects: pixels scaled by 0.00392 (about 1/255), input resized
    # to 416x416, red and blue channels swapped.
    blob = cv2.dnn.blobFromImage(img, 0.00392, (416, 416), (0, 0, 0), True, crop=False)

    net.setInput(blob)
    outs = net.forward(output_layers)

    # Collect every candidate whose best class score exceeds 0.5
    class_ids = []
    confidences = []
    boxes = []
    for out in outs:
        for detection in out:
            scores = detection[5:]
            class_id = np.argmax(scores)
            confidence = scores[class_id]
            if confidence > 0.5:
                # The first four values are relative to the image size:
                # centre x, centre y, width, height.
                center_x = int(detection[0] * width)
                center_y = int(detection[1] * height)
                w = int(detection[2] * width)
                h = int(detection[3] * height)
                # Top-left corner of the rectangle
                x = int(center_x - w / 2)
                y = int(center_y - h / 2)
                boxes.append([x, y, w, h])
                confidences.append(float(confidence))
                class_ids.append(class_id)

    indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)
    # NMSBoxes may return an empty tuple, a flat array or an (N, 1) array;
    # normalise the result to plain ints, in ascending box order.
    kept = sorted({int(i) for i in np.array(indexes).flatten()})

    font = cv2.FONT_HERSHEY_PLAIN
    label_lines = []
    for i in kept:
        x, y, w, h = boxes[i]
        label = str(classes[class_ids[i]])
        # `colors` holds one row per class, so it is indexed by the class id.
        # Indexing it by the box number gave the same class different colours
        # and overflowed when there were more candidate boxes than classes.
        color = colors[class_ids[i]]
        cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
        cv2.putText(img, label, (x, y + 30), font, 2, color, 1)
        # The confidence is a 0-1 score; the trailing "%" is kept so the line
        # format stays identical to label files that were already written.
        label_lines.append(
            f"label:{label},posX:{x},posY:{y},width:{w},height:{h},confidence:{confidences[i]}%\n"
        )

    if label_lines:
        # Open the label file once per image and append, so that whatever is
        # already stored in it is preserved.
        with open("../output/labels/" + img_name.split(".")[0] + ".txt", "a") as f:
            f.writelines(label_lines)
    return img


# Run the detector on every entry of "../output/images" and save the annotated
# copy under the same name in "../output/images_labelized"
for image in tqdm(images, desc="Labelizing images"):
    img = cv2.imread("../output/images/" + image)
    if img is None:
        # cv2.imread returns None instead of raising when the entry cannot be
        # decoded (sub-folder, hidden file, corrupt image). Skip it so that one
        # bad entry does not abort a run over the whole folder.
        tqdm.write(f"Skipping unreadable image: {image}")
        continue
    img = labelize(img, image)
    cv2.imwrite("../output/images_labelized/" + image, img)


Labelizing images: 100%|██████████| 40671/40671 [1:58:13<00:00,  5.73it/s]     


### Now, find dominant colors in the images

In [None]:
import cv2


# Convert RGB to HEX (RGB is an array of 3 values)
def rgb_to_hex(rgb):
    """Return the "#rrggbb" string for an RGB triple.

    Only the first three items of ``rgb`` are read. Each one is truncated to an
    integer with ``int()`` and written as lowercase hexadecimal, zero-padded to
    at least two digits.
    """
    red, green, blue = (int(rgb[channel]) for channel in range(3))
    return "#" + "".join(format(component, "02x") for component in (red, green, blue))


# Find 4 dominant colors in the image
def find_dominant_colors(image, k=4, image_processing_size=None, random_state=None):
    """Cluster the pixels of ``image`` and return its dominant colors.

    Args:
        image: BGR image array as returned by ``cv2.imread``.
        k: Number of color clusters to look for; must be at most 4.
        image_processing_size: Optional ``(width, height)`` the image is
            resized to before clustering, to speed up processing.
        random_state: Optional seed forwarded to ``KMeans`` to make the
            clustering reproducible. ``None`` keeps it unseeded.

    Returns:
        list[tuple[str, int]]: One ``("#rrggbb", percent)`` pair per cluster
        that received pixels, ordered from the largest share to the smallest.
        ``percent`` is the share of pixels in the cluster, truncated to an
        integer.

    Raises:
        AssertionError: If ``k`` is greater than 4.
        ValueError: If ``image`` is None.
    """
    # Imported here so the function does not depend on imports made elsewhere.
    from collections import Counter
    from sklearn.cluster import KMeans

    # Need to return a list of at most 4 colors
    assert k <= 4, "k needs to be less than or equal to 4"
    if image is None:
        raise ValueError("image is None; cv2.imread returns None for unreadable files")

    # Resize image if new dims provided, just to speed up processing
    if image_processing_size is not None:
        image = cv2.resize(image, image_processing_size, interpolation=cv2.INTER_AREA)
    # Convert to RGB from BGR
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Reshape the image to be a list of pixels
    pixels = image.reshape((image.shape[0] * image.shape[1], 3))

    # Cluster the pixels and get the cluster index assigned to each of them
    clt = KMeans(n_clusters=k, n_init=10, random_state=random_state)
    labels = clt.fit_predict(pixels)

    # Count the pixels per cluster; most_common() orders the clusters from the
    # most to the least populated, so the first entry is the dominant color.
    label_counts = Counter(labels.tolist())
    total_pixels = len(labels)

    return [
        (rgb_to_hex(clt.cluster_centers_[cluster]), int((count / total_pixels) * 100))
        for cluster, count in label_counts.most_common()
    ]


# Compute the dominant colors of every entry of "../output/images" and append
# them to the matching label file in "../output/labels"
for image in tqdm(images, desc="Finding dominant colors"):
    img = cv2.imread("../output/images/" + image)
    if img is None:
        # cv2.imread returns None instead of raising when the entry cannot be
        # decoded; skip it rather than abort the whole run.
        tqdm.write(f"Skipping unreadable image: {image}")
        continue
    dominant_color = find_dominant_colors(img)
    with open("../output/labels/" + image.split(".")[0] + ".txt", "a") as f:
        # Terminate the record with a newline: the file is opened in append
        # mode, so without it the next write would continue on the same line
        # and the record could no longer be parsed.
        f.write(f"dominant_color:{dominant_color}\n")


In [None]:
# Create histogram of the dominant colors
import matplotlib.pyplot as plt
import numpy as np

# For each image, read the "dominant_color" line of its label file and save a
# pie chart of those colors in "../output/histogram"
import ast
import os

# savefig does not create missing folders, so make sure the target exists
os.makedirs("../output/histogram", exist_ok=True)

for image in tqdm(images, desc="Creating color pie charts"):
    label_path = "../output/labels/" + image.split(".")[0] + ".txt"
    if not os.path.isfile(label_path):
        tqdm.write(f"No label file for {image}, skipping")
        continue
    with open(label_path, "r") as f:
        for line in f:
            if "dominant_color" in line:
                # The value looks like [('#3e4542', 37), ('#6d6f6d', 22)].
                # literal_eval only accepts Python literals, whereas eval
                # would execute any expression found in the file.
                dominant_color = ast.literal_eval(line.split(":")[1].strip())
                # The hex codes serve both as slice labels and slice colors.
                # They are kept in their own name so the global `colors`
                # (used when drawing detections) is not overwritten.
                hex_colors = [entry[0] for entry in dominant_color]
                sizes = [entry[1] for entry in dominant_color]
                fig1, ax1 = plt.subplots()
                ax1.pie(sizes, colors=hex_colors, labels=hex_colors, autopct='%1.1f%%', shadow=True, startangle=90)
                ax1.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
                fig1.savefig("../output/histogram/" + image.split(".")[0] + ".png")
                # Close the figure so memory does not grow with every image
                plt.close(fig1)


# Group the images by several criteria
# There are multiple ways to group the images: the "labels" folder holds the detected labels and the dominant colors of each image, and the "metadata" folder holds the metadata of each image

In [None]:
import pandas as pd
import pickle

# Build a dataframe with one row per image, combining its label file and its
# metadata pickle (one pickle file per image)
import ast
import os

# Rows are collected in a list and turned into a dataframe once at the end.
# Concatenating a one-row dataframe per image copies the whole frame every
# time, which is quadratic in the number of images.
records = []

for image in tqdm(images, desc="Creating dataframe"):
    # Example of lines :
    # label:person,posX:113,posY:120,width:134,height:360,confidence:0.9987273216247559%
    # label:horse,posX:325,posY:187,width:282,height:270,confidence:0.9985314607620239%
    # label:handbag,posX:143,posY:385,width:82,height:91,confidence:0.5130287408828735%
    # dominant_color:[('#3e4542', 37), ('#6d6f6d', 22), ('#a6a4a7', 9), ('#191b1d', 30)]
    label_path = "../output/labels/" + image.split(".")[0] + ".txt"
    if not os.path.isfile(label_path):
        tqdm.write(f"No label file for {image}, skipping")
        continue

    # It's possible to have more than one label per image: they are stored as
    # "label1,label2," (every label is followed by a comma)
    label = ""
    dominant_color = ""
    with open(label_path, "r") as f:
        for line in f:
            if "label" in line:
                # The label is the text between the first ":" and the first ","
                label += line.split(":")[1].split(",")[0] + ","
            if "dominant_color" in line:
                # literal_eval only accepts Python literals, whereas eval
                # would execute any expression found in the file.
                parsed_colors = ast.literal_eval(line.split(":")[1].strip())
                # Keep only the hex codes, joined as "#aaaaaa,#bbbbbb"
                dominant_color = ",".join(entry[0] for entry in parsed_colors)

    # NOTE(review): pickle.load can execute arbitrary code. Only load metadata
    # files that were produced by a trusted source.
    with open("../output/metadata/" + image.split(".")[0] + ".pickle", "rb") as metadata_file:
        metadata = pickle.load(metadata_file)
    # Convert metadata to string : key:value,key:value
    metadata = ",".join([f"{key}:{value}" for key, value in metadata.items()])

    records.append([image, label, dominant_color, metadata])

df = pd.DataFrame(records, columns=["image", "label", "dominant_color", "metadata"])

# Save the dataframe
df.to_csv("../output/dataframe.csv", index=False)


In [None]:
# Group the images by label
import os

# Copy every image into one folder per detected label:
# "../output/grouped_by_label/<label>/<image>"
import shutil

for image in tqdm(images, desc="Grouping images by label"):
    label_path = "../output/labels/" + image.split(".")[0] + ".txt"
    if not os.path.isfile(label_path):
        tqdm.write(f"No label file for {image}, skipping")
        continue

    # Collect the distinct labels of this image in first-seen order. The list
    # starts empty for every image, so an image without any detection is not
    # filed under the label of the previous image.
    image_labels = []
    with open(label_path, "r") as f:
        for line in f:
            if "label" in line:
                # The label is the text between the first ":" and the first ","
                label = line.split(":")[1].split(",")[0]
                if label not in image_labels:
                    image_labels.append(label)

    for label in image_labels:
        # The folder is created per individual label, right where it is
        # needed, so the copy target always exists.
        target_dir = "../output/grouped_by_label/" + label
        os.makedirs(target_dir, exist_ok=True)
        # shutil.copy replaces the shell `cp`: labels or file names containing
        # spaces or shell metacharacters are handled correctly, and a failed
        # copy raises instead of being ignored.
        shutil.copy("../output/images/" + image, target_dir + "/" + image)

In [None]:
# Group images by dominant color: every image is copied into one folder per
# color, "../output/grouped_by_dominant_color/<hex color>/<image>"
import ast
import os
import shutil

for image in tqdm(images, desc="Grouping images by dominant color"):
    label_path = "../output/labels/" + image.split(".")[0] + ".txt"
    if not os.path.isfile(label_path):
        tqdm.write(f"No label file for {image}, skipping")
        continue
    with open(label_path, "r") as f:
        for line in f:
            if "dominant_color" in line:
                # The value looks like [('#3e4542', 37), ('#6d6f6d', 22)].
                # literal_eval only accepts Python literals, whereas eval
                # would execute any expression found in the file.
                parsed_colors = ast.literal_eval(line.split(":")[1].strip())
                # Loop names are chosen so the global `colors` (used when
                # drawing detections) is not overwritten by this cell.
                for hex_color in [entry[0] for entry in parsed_colors]:
                    target_dir = "../output/grouped_by_dominant_color/" + hex_color
                    # Create the folder where it is needed so the copy target
                    # always exists
                    os.makedirs(target_dir, exist_ok=True)
                    # shutil.copy replaces the shell `cp`, which breaks on
                    # file names with spaces or shell metacharacters
                    shutil.copy("../output/images/" + image, target_dir + "/" + image)

In [None]:
# Get all metadata and group by similar metadata

In [None]:
# Copy every image into "../output/grouped_by_metadata/<key>/<value>/" for each
# of its metadata entries
import shutil

for image in tqdm(images, desc="Grouping images by metadata"):
    # NOTE(review): pickle.load can execute arbitrary code. Only load metadata
    # files that were produced by a trusted source.
    with open("../output/metadata/" + image.split(".")[0] + ".pickle", "rb") as f:
        metadata = pickle.load(f)
    # Convert metadata to string : key:value,key:value
    metadata = ",".join([f"{key}:{value}" for key, value in metadata.items()])

    # Split the metadata by , to get individual metadata
    for meta in metadata.split(","):
        # Split the metadata by : to get the key and the value
        parts = meta.split(":")
        if len(parts) != 2:
            # Pieces that are not exactly one key and one value (the value
            # contained a ":" or a ",") are ignored
            continue
        key, value = parts
        target_dir = f"../output/grouped_by_metadata/{key}/{value}"
        try:
            # Create the folder where it is needed so the copy target exists
            os.makedirs(target_dir, exist_ok=True)
            # shutil.copy replaces the shell `cp`: metadata values are never
            # interpreted by a shell
            shutil.copy("../output/images/" + image, target_dir + "/" + image)
        except OSError as error:
            # Best effort: a value that cannot be used as a folder name is
            # reported and skipped instead of stopping the run
            tqdm.write(f"Could not group {image} by {key}: {error}")