In [2]:
import numpy as np
import cv2
import tqdm
import os
import sys

# Colour painted onto the pixels of each cluster, indexed by cluster label.
# NOTE(review): these triples are written to the output through OpenCV, which
# presumably interprets them in BGR channel order (so [0, 0, 255] would be red)
# -- confirm the intended colours.
GBR = [[0, 0, 255],
       [0, 128, 255],
       [255, 0, 0],
       [128, 0, 128],
       [255, 0, 255]]

# path configuration: the video is read from, and the result written to, the
# directory the notebook is started from
project_root = os.path.abspath('.')
output_path = project_root
input_path = project_root
# exist_ok=True creates the directory only when it is missing, without the
# race between a separate existence check and the creation
os.makedirs(output_path, exist_ok=True)

In [52]:
import time
def kmeans(data: np.ndarray, n_cl: int):
    """
    Cluster the rows of ``data`` into ``n_cl`` groups with Lloyd's k-means.

    Centroids are initialised from ``n_cl`` distinct rows picked with
    ``np.random.choice``, so the result depends on NumPy's global random
    state. Assignment and centroid update then alternate until the labels
    stop changing.

    :param data:    2-D array of shape (n_samples, n_features); non-float
                    input is converted to float64 so that differences do not
                    wrap around and means are not truncated
    :param n_cl:    number of clusters
    :return:        float array of shape (n_samples,) holding the index of the
                    nearest centroid (a value in ``range(n_cl)``) for each row
    :raises ValueError: if ``data`` is not 2-D, or (from ``np.random.choice``)
                    if ``n_cl`` is larger than the number of rows
    """
    samples = np.asarray(data)
    if not np.issubdtype(samples.dtype, np.floating):
        samples = samples.astype(np.float64)
    n_samples, _ = samples.shape

    # init centroids: n_cl distinct rows of the data. Fancy indexing returns a
    # copy, so updating the centroids never writes into the caller's array.
    indexes = np.random.choice(n_samples, n_cl, replace=False)
    centers = samples[indexes]

    old_labels = None
    while True:
        # Assignment step: keep a running minimum of the squared distance so
        # every sample ends up with its *nearest* centroid. Strict "<" keeps
        # the lowest centroid index on ties.
        new_labels = np.zeros(n_samples, dtype=np.intp)
        min_data_dist = np.full(n_samples, np.inf)
        for center_id, center in enumerate(centers):
            data_dist = np.square(samples - center).sum(axis=1)
            closer = data_dist < min_data_dist
            new_labels[closer] = center_id
            min_data_dist[closer] = data_dist[closer]

        # Converged once an assignment repeats; the centroids would not move.
        if old_labels is not None and np.array_equal(new_labels, old_labels):
            break

        # Update step: move each centroid to the mean of its members. A
        # cluster that lost all members keeps its previous centroid instead
        # of becoming NaN (the mean of an empty selection).
        for center_id in range(n_cl):
            members = samples[new_labels == center_id]
            if len(members):
                centers[center_id] = members.mean(axis=0)
        old_labels = new_labels

    # Labels are handed back as floats, the dtype callers have always received.
    return new_labels.astype(np.float64)

In [54]:
def detect(video, n_cl=2):
    """
    Segment every frame of a video with k-means and save the coloured result.

    Each frame is flattened to a list of pixels, clustered with ``kmeans`` and
    repainted so that every pixel takes the ``GBR`` colour of its cluster. The
    repainted frames are written to ``result_with_<n_cl>clz.mp4`` under
    ``output_path``.

    :param video:   path of the input video
    :param n_cl:    number of clusters per frame, at most ``len(GBR)``
    :raises ValueError: if ``n_cl`` needs more colours than ``GBR`` provides
    :raises IOError:    if the input video cannot be opened
    """
    # Fail before any file is touched rather than on the first pixel whose
    # label has no colour.
    if n_cl > len(GBR):
        raise ValueError("n_cl=%d exceeds the %d available cluster colours"
                         % (n_cl, len(GBR)))
    palette = np.asarray(GBR, dtype=np.uint8)

    cap = cv2.VideoCapture(video)
    video_writer = None
    try:
        if not cap.isOpened():
            raise IOError("Cannot open video: %s" % video)

        # number of frames (progress bar only) and shape of a frame
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        size = (int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
                int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)))

        # Play the result back at the source frame rate. When the container
        # does not report one, fall back to frame_count / 10, i.e. a clip
        # that lasts ten seconds.
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            fps = frame_count / 10

        # instantiate a video writer
        video_writer = cv2.VideoWriter(os.path.join(output_path, "result_with_%dclz.mp4" % n_cl),
                                       cv2.VideoWriter_fourcc(*'mp4v'),
                                       fps,
                                       size,
                                       isColor=True)

        print("Begin clustering with %d classes:" % n_cl)
        # The context manager closes the bar so repeated runs do not stack
        # stale progress bars in the cell output.
        with tqdm.tqdm(total=frame_count if frame_count > 0 else None) as bar:
            ret, frame = cap.read()
            while ret:
                h, w, c = frame.shape

                # k-means over the flattened pixels
                data = np.float32(frame).reshape((h * w, c))
                labels = kmeans(data, n_cl=n_cl)

                # dye every pixel with the colour of its cluster in a single
                # vectorised palette lookup
                new_frame = palette[np.asarray(labels).astype(np.intp)].reshape((h, w, c))
                video_writer.write(new_frame)

                ret, frame = cap.read()
                bar.update()
    finally:
        # release resources even when clustering or writing fails
        if video_writer is not None:
            video_writer.release()
        cap.release()
        cv2.destroyAllWindows()


# Cluster the sample video found under input_path into three colour classes;
# detect() writes the repainted video under output_path.
video_sample = os.path.join(input_path, "road_video.MOV")
detect(video_sample, n_cl=3)

Begin clustering with 3 classes:












[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A

1. It scales easily to large datasets and is cheap to train even with large amounts of data (it is guaranteed to converge, although possibly only to a local optimum). It can adapt to new samples. Also, it is easy to implement.
2. We have to choose K manually, and the result might not be very accurate.
3. If the data form roughly round clusters that are clearly separated from each other, then K-means is a suitable method for fitting the data.