In [1]:
import numpy as np
import cv2
from io import StringIO
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import mylibrary as mylib
from mylibrary import euclidean_distance
import os

%matplotlib inline

In [2]:
class Center:
    """
    Purpose:
        A single cluster: the location of its center plus the records assigned to it.
        pts holds the id of every record that belongs to this Center object.
        The id of a record is its row index in the data matrix, starting from 0.

    Note:
        Because __eq__ is defined without __hash__, instances are unhashable
        (they cannot be used as dict keys or set members).
    """
    def __init__(self, center, pts):
        self.center = center
        self.pts = pts

    def set_center(self, center):
        """Replace the location of this center."""
        self.center = center

    def set_pts(self, pts):
        """Replace the ids of the records assigned to this center."""
        self.pts = pts

    def __eq__(self, other):
        """
        Two Center objects are equal when their locations have the same shape and
        the same values. pts is not part of the comparison.
        """
        # Let Python fall back to its default handling for foreign types instead
        # of failing on a missing `.center` attribute.
        if not isinstance(other, Center):
            return NotImplemented
        # array_equal returns False on a shape mismatch instead of broadcasting;
        # bool() turns the numpy scalar into a plain Python bool.
        return bool(np.array_equal(self.center, other.center))

    def __repr__(self):
        return "cluster: " + str(self.center)

In [3]:
def init_given(data,k,seed):
    """
    Purpose:
        Build the initial list of Center objects from the three fixed starting
        locations given in task3 part A, then assign the records of data to them.
    Input:
        data: a two dimension matrix
        k: int, forwarded to assign_to_center
        seed: accepted so the signature matches the other init functions; not used here.
    Output:
        the list of Center objects with their pts filled in.
    """
    start_locations = np.array([[6.2,3.2],
                                [6.6,3.7],
                                [6.5,3.0]])
    initial_centers = [Center(location, np.array([])) for location in start_locations]
    return assign_to_center(data, initial_centers, k)

In [4]:
def init_centers_random(data, k, seed=20):
    """
    Purpose:
        Shuffle the record ids with a seeded generator and use the first k shuffled
        records as the initial center locations.
    Input:
        data: a two dimension matrix
        k: int, the number of centers
        seed: the seed of the random number generator.
    Output:
        centers: a list of Center objects with their pts filled in.
    """
    rng = np.random.RandomState(seed=seed)
    shuffled_ids = rng.permutation(data.shape[0])
    picked_records = data[shuffled_ids[:k]]
    initial_centers = [Center(record, np.array([])) for record in picked_records]
    return assign_to_center(data, initial_centers, k)

In [5]:
def assign_to_center(data, centers, k):
    """
    Purpose:
        Assign every record in data to its nearest center using (squared) Euclidean
        distance. The record id is the index of the record in the data matrix.
    Input:
        data: a two dimension matrix of real numbers.
        centers: a list of Center objects.
        k: int, the number of centers. Kept for backward compatibility; every
           object in centers is updated regardless of this value.
    Output:
        centers: the same list of Center objects, with each pts replaced by the
                 ids of the records closest to that center.
    """
    # Allocate the (centers x records) distance matrix once and fill it row by
    # row, instead of copying a growing matrix on every iteration.
    dis_matrix = np.empty((len(centers), data.shape[0]))
    for row, center in enumerate(centers):
        # Squared distance keeps the same ordering as the true distance.
        dis_matrix[row] = np.sum(np.square(data - center.center), axis=1)
    # For each record (column), the row index of the closest center.
    belongs = np.argmin(dis_matrix, axis=0)
    # Update every center so none keeps stale ids when k != len(centers).
    for row, center in enumerate(centers):
        center.pts = np.where(belongs == row)[0]
    return centers

In [6]:
def update_centers(data, centers, k):
    """
    Purpose:
        Generate a new list of Center objects from the previous one: each new
        center is located at the mean of the records assigned to the previous
        center, and the records are then re-assigned to the new centers.
    Input:
        data: a two dimension matrix
        centers: a list of Center objects, the previous iteration.
        k: int, the number of centers.
    Output:
        the new list of Center objects.
    """
    new_centers = []
    for center in centers:
        if len(center.pts) == 0:
            # The mean of an empty cluster is NaN, and argmin treats NaN as the
            # minimum, so a NaN center would capture every record on the next
            # assignment. Keep the previous location instead.
            location = center.center
        else:
            location = np.mean(data[center.pts], axis=0)
        new_centers.append(Center(location, np.array([])))
    return assign_to_center(data, new_centers, k)

In [7]:
def plot_k_mean(data, centers, itr, save_path="../task3_img/k_mean"):
    """
    Purpose:
        General purpose plot function for the k_mean function. Scatters the first
        two columns of the records of every cluster together with the cluster
        center, and saves the figure as <save_path>/iteration_<itr>.png.
    Input:
        data: a two dimension matrix; only columns 0 and 1 are drawn.
        centers: a list of Center objects.
        itr: the iteration number, used in the title and in the file name.
        save_path: the folder the image is written to; created if missing.
    """
    try:
        os.makedirs(save_path)
    except FileExistsError:
        print("use existing folder:", save_path)

    cluster_count = len(centers)
    # One RGBA row per cluster, spread over the rainbow colormap.
    palette = cm.rainbow(np.linspace(0, 1, cluster_count))
    for label in range(cluster_count):
        members = data[centers[label].pts]
        # Pass the RGBA row through color=, not c=: with c= a 4-value sequence is
        # ambiguous and is colormapped as data when the cluster has exactly 4 points.
        plt.scatter(members[:, 0], members[:, 1], s=20, color=palette[label],
                    alpha=0.3, label=label)
        plt.scatter(centers[label].center[0], centers[label].center[1], s=100,
                    color=palette[label], alpha=1.0, marker='x')
    plt.title("iteration: "+ str(itr))
    plt.legend(loc='best')
    plt.savefig(save_path+"/iteration_" + str(itr) + ".png")
    # Close the figure so successive iterations do not draw on top of each other.
    plt.close()

In [8]:
def plot_k_mean_a(data, centers, itr, save_path="../task3_img/k_mean"):
    """
    Purpose:
        The plot function specific to the cv project2 task3 k-mean part.
        Draws each cluster's records as triangles and its center as a filled
        circle, using a fixed red / green / blue palette, then saves the figure
        as <save_path>/iteration_<itr>.png.
    Input:
        data: a two dimension matrix; only columns 0 and 1 are drawn.
        centers: a list of Center objects (the palette only covers three).
        itr: the iteration number, used in the title and in the file name.
        save_path: the folder the image is written to; created if missing.
    """
    try:
        os.makedirs(save_path)
    except FileExistsError:
        print("use existing folder:", save_path)

    cluster_ids = range(len(centers))
    colour_of = dict(zip(cluster_ids, ['red','green','blue']))
    for cluster_id in cluster_ids:
        current = centers[cluster_id]
        members = data[current.pts]
        colour = colour_of[cluster_id]
        plt.scatter(members[:,0], members[:,1], s=50, c=colour,
                    alpha=0.3, label=cluster_id, marker='^')
        plt.scatter(current.center[0], current.center[1], s=50, color=colour,
                    alpha=1.0, marker='o')
    plt.title("iteration: "+ str(itr))
    plt.legend(loc='best')
    plt.savefig(save_path+"/iteration_" + str(itr) + ".png")
    plt.close()

In [19]:
def k_mean(data, k, init_fun, max_itr=50, seed=20, need_plot=False, plot_fun=None):
    """
    Purpose:
        Main function for the k mean algorithm.
    Input:
        data: a two dimension matrix
        k: the number of centers
        init_fun: the function used to generate the initial list of Center objects;
                  called as init_fun(data, k, seed).
        max_itr: the maximum number of center updates.
        seed: forwarded to init_fun.
        need_plot: If set true, the k_mean function will save a plot for each iteration.
        plot_fun: function, the plot function; called as plot_fun(data, centers, itr).
    Output:
        centers: a list of Center objects. The result for the final iteration.
    Raises:
        ValueError: if need_plot is true but no plot_fun was given.
    """
    # Fail before any work is done rather than with a TypeError inside the loop.
    if need_plot and plot_fun is None:
        raise ValueError("need_plot=True requires a plot_fun")

    itr = 0
    centers = init_fun(data,k,seed)
    # Strict '<' so that at most max_itr updates are performed.
    while itr < max_itr:
        if itr % 5 == 0:
            print("iteration :", itr)
        for center in centers:
            print(center)
            print(center.pts)
        if need_plot:
            plot_fun(data, centers, itr)

        new_centers = update_centers(data,centers,k)
        old_locations = np.asarray([c.center for c in centers])
        new_locations = np.asarray([c.center for c in new_centers])
        # Converged: no center moved, so the assignment cannot change any more.
        if np.all(old_locations == new_locations):
            break
        centers = new_centers
        itr += 1
    print("total iteration", itr)
    return centers

In [20]:
# NOTE(review): data_given is overwritten by the literal array assigned in the
# following cell, so this call only matters if mylib.generate_data() has side
# effects -- confirm, and consider removing it.
data_given = mylib.generate_data()

In [21]:
# Ten two-dimensional records used as the input of the k-mean demo below.
data_given = np.array([[5.9,3.2],
                 [4.6,2.9],
                 [6.2,2.8],
                 [4.7,3.2],
                 [5.5,4.2],
                 [5.0,3.0],
                 [4.9,3.1],
                 [6.7,3.1],
                 [5.1,3.8],
                 [6.0,3.0]])

In [22]:
# Settings for the k-mean demo runs.
# NOTE(review): of these, only data_a and k are passed to the k_mean calls in
# the cells shown below; seed, max_itr and itr are not referenced there.
data_a = data_given
seed = 20
k = 3
max_itr = 100
itr = 0

In [23]:
# Run k-mean from the fixed starting centers of init_given, saving one plot per
# iteration with plot_k_mean_a.
centers = k_mean(data_a,k,init_fun=init_given ,need_plot=True,plot_fun=plot_k_mean_a)

iteration : 0
cluster: [6.2 3.2]
[0 1 3 5 6 8 9]
cluster: [6.6 3.7]
[4]
cluster: [6.5 3. ]
[2 7]
use existing folder: ../task3_img/k_mean
cluster: [5.17142857 3.17142857]
[1 3 5 6]
cluster: [5.5 4.2]
[4 8]
cluster: [6.45 2.95]
[0 2 7 9]
use existing folder: ../task3_img/k_mean
cluster: [4.8  3.05]
[1 3 5 6]
cluster: [5.3 4. ]
[4 8]
cluster: [6.2   3.025]
[0 2 7 9]
use existing folder: ../task3_img/k_mean
total iteration 2


In [42]:
# Run k-mean with four randomly chosen starting centers (seed 110), saving one
# plot per iteration with the general plot_k_mean.
centers = k_mean(data_a,k=4,init_fun=init_centers_random ,need_plot=True,plot_fun=plot_k_mean,seed=110)

use existing folder: ../task3_img/k_mean
use existing folder: ../task3_img/k_mean
use existing folder: ../task3_img/k_mean
use existing folder: ../task3_img/k_mean
use existing folder: ../task3_img/k_mean
iteration : 5
total iteration 5


In [10]:
#data = mylib.generate_data()

### Color Quantization

In [27]:
def to_pixel_list(img):
    """
    Purpose:
        Flatten a two dimension image into a list of pixels, row by row.
    Input:
        img: a two dimension image. Both color (rows, cols, channels) and gray
             (rows, cols) images are fine.
    Output:
        pixel_list: a float matrix with one row per pixel and one column per
                    channel (a single column for a gray image).
    Raises:
        ValueError: if img is neither a gray nor a color image.
    """
    img = np.asarray(img)
    if img.ndim == 2:
        channels = 1
    elif img.ndim == 3:
        channels = img.shape[2]
    else:
        raise ValueError("img must have shape (rows, cols) or (rows, cols, channels)")
    # One reshape gives the same row-major pixel order as appending the image
    # rows one at a time, without re-copying the list for every row.
    return img.reshape(-1, channels).astype(np.float64)

In [28]:
def get_img(centers,shape):
    """
    Purpose:
        Use the k_means results to generate an image in which every pixel takes
        the color of the center it belongs to.
    Input:
        centers: a list of Center objects; pts holds row-major pixel ids, i.e.
                 pixel id x sits at row x // cols, column x % cols.
        shape: the shape of the original image, (rows, cols) or (rows, cols, channels).
    Output:
        the new image, dtype uint8. Pixels that belong to no center are 0.
    """
    row_num, col_num = shape[0], shape[1]
    # Work on a flat list of pixels so a pixel id can be used directly as an
    # index; zeros (not empty) keeps unassigned pixels deterministic.
    flat_img = np.zeros((row_num * col_num,) + tuple(shape[2:]), dtype=np.uint8)
    for center in centers:
        pixel = center.center.astype(np.uint8)
        # An empty pts array may carry a float dtype; force an index dtype.
        pixel_ids = np.asarray(center.pts, dtype=np.intp)
        flat_img[pixel_ids] = pixel
    return flat_img.reshape(shape)

In [29]:
def quantized_img(img, k, init_fun = init_centers_random, max_itr=10000, seed=20):
    """
    Purpose:
        Generate the color-quantized (k mean) version of an image.
    Input:
        img: a two dimension image matrix.
        k: the number of colors.
        init_fun: function, the function to init the centers for the k mean
                  algorithm. default: init_centers_random
        max_itr: int, the maximum number of iterations, forwarded to k_mean.
        seed: used in init_fun
    Output:
        the new k mean image.
    """
    pixel_list = to_pixel_list(img)
    # Forward the caller's max_itr instead of a hard-coded limit.
    centers = k_mean(pixel_list, k, init_fun, max_itr=max_itr, seed=seed)
    return get_img(centers,img.shape)

### Test

In [30]:
img = cv2.imread("../data/baboon.jpg")
# cv2.imread reports a missing or unreadable file by returning None rather than
# raising, so fail here with a clear message instead of later on img.shape.
if img is None:
    raise FileNotFoundError("could not read image: ../data/baboon.jpg")

In [31]:
# Quantize the image once per color count and write each result to disk.
# NOTE(review): the loop variable rebinds the notebook-level name k that was set
# in an earlier cell.
# NOTE(review): the return value of cv2.imwrite is not checked -- presumably the
# ../task3_img folder already exists; confirm.
ks = [3,5,10,20]
for k in ks:
    print("k =",k)
    new_img = quantized_img(img,k)
    cv2.imwrite("../task3_img/task3_baboon_"+str(k)+".jpg",new_img)
    print()

k = 3
iteration : 5
iteration : 10
iteration : 15
iteration : 20
iteration : 25
iteration : 30
total iteration 31

k = 5
iteration : 5
iteration : 10
iteration : 15
iteration : 20
iteration : 25
iteration : 30
iteration : 35
total iteration 35

k = 10
iteration : 5
iteration : 10
iteration : 15
iteration : 20
iteration : 25
iteration : 30
iteration : 35
iteration : 40
iteration : 45
iteration : 50
iteration : 55
iteration : 60
iteration : 65
iteration : 70
iteration : 75
iteration : 80
iteration : 85
iteration : 90
iteration : 95
iteration : 100
iteration : 105
iteration : 110
iteration : 115
iteration : 120
iteration : 125
iteration : 130
iteration : 135
iteration : 140
iteration : 145
total iteration 145

k = 20
iteration : 5
iteration : 10
iteration : 15
iteration : 20
iteration : 25
iteration : 30
iteration : 35
iteration : 40
iteration : 45
iteration : 50
iteration : 55
iteration : 60
iteration : 65
iteration : 70
iteration : 75
iteration : 80
iteration : 85
iteration : 90
iterati

## Deprecated

In [15]:
def get_img(centers,shape):
    """
    Purpose:
        Older variant of get_img: build an image in which every pixel takes the
        color of the center it belongs to.
    Input:
        centers: a list of Center objects; pts holds row-major pixel ids.
        shape: the shape of the original image, (rows, cols, channels).
    Output:
        the new image, dtype uint8. Pixels that belong to no center are 0.
    """
    # Use the shape argument rather than a notebook-level image variable.
    _, col_num, _ = shape
    new_img = np.zeros(shape)
    for center in centers:
        pixel = center.center.astype(np.uint8)
        # An empty pts array may carry a float dtype; force an index dtype.
        pixel_ids = np.asarray(center.pts, dtype=np.intp)
        # Index with separate row and column arrays so each id addresses one
        # pixel; a single stacked index array would select whole image rows.
        new_img[pixel_ids // col_num, pixel_ids % col_num] = pixel
    return new_img.astype(np.uint8)