In [35]:
import numpy as np
from io import StringIO
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import mylibrary as mylib
from mylibrary import euclidean_distance
import os

%matplotlib inline

In [36]:
class Center:
    """
    Purpose:
        One k-means cluster: the location of its center plus the records assigned to it.
    Attributes:
        center: array-like, the coordinates of the cluster center.
        pts: array of record ids that belong to this cluster. A record id is the
             index of the record (row) in the data matrix, starting from 0.
    Note:
        Because __eq__ is overridden without __hash__, instances are unhashable
        (they cannot be used as dict keys or set members).
    """
    def __init__(self, center, pts):
        self.center = center
        self.pts = pts

    def set_center(self, center):
        """Replace the location of this center."""
        self.center = center

    def set_pts(self, pts):
        """Replace the ids of the records assigned to this center."""
        self.pts = pts

    def __eq__(self, other):
        """Two centers are equal when their locations match element-wise (pts is ignored)."""
        # Let Python fall back to its default comparison for foreign types
        # instead of failing with AttributeError on `other.center`.
        if not isinstance(other, Center):
            return NotImplemented
        # array_equal also copes with shape mismatches; bool() avoids leaking numpy.bool_.
        return bool(np.array_equal(self.center, other.center))

    def __repr__(self):
        return "cluster: " + str(self.center)

In [37]:
def init_given(data,k,seed):
    """
    Purpose:
        Build the initial list of Center objects from the three fixed 2-D
        locations given in task3 part A, then assign every record to its
        nearest center.
    Input:
        data: a two dimension matrix; its records are assigned to the fixed centers.
        k: int, forwarded to assign_to_center.
        seed: unused; present so the signature matches the other init functions.
    Output:
        centers: a list of Center objects with their pts filled in.
    Note:
        A later cell of this notebook defines another init_given (five 4-D
        centers); once that cell runs it shadows this definition.
    """
    fixed_locations = np.array([[6.2,3.2],
                                [6.6,3.7],
                                [6.5,3.0]])
    # Every center starts with an empty membership list.
    initial = [Center(location, np.array([])) for location in fixed_locations]
    return assign_to_center(data, initial, k)

In [38]:
def init_centers_random(data, k, seed=20):
    """
    Purpose:
        Pick k records of data at random (seeded permutation of the row ids,
        first k taken) and use them as the initial centers.
    Input:
        data: a two dimension matrix
        k: int, the number of centers
        seed: the seed of the random number generator.
    Output:
        centers: a list of Center objects with their pts filled in.
    """
    rng = np.random.RandomState(seed=seed)
    shuffled_ids = rng.permutation(data.shape[0])
    chosen_rows = data[shuffled_ids[0:k]]
    # Every center starts with an empty membership list.
    initial = [Center(row, np.array([])) for row in chosen_rows]
    return assign_to_center(data, initial, k)

In [39]:
def assign_to_center(data, centers, k):
    """
    Purpose:
        Assign each record in data to its nearest center using (squared) Euclidean
        distance. The record id is the index of the record in the data matrix.
    Input:
        data: a two dimension matrix of real numbers.
        centers: a list of Center objects (anything with .center and .pts attributes).
        k: int, the number of centers. Kept for interface compatibility; the
           actual number of centers is taken from len(centers).
    Output:
        centers: the same list of Center objects, with each pts replaced by an
                 array of the record ids closest to that center. Ties go to the
                 center that appears first in the list.
    """
    # Nothing to assign to; argmin over zero centers would raise.
    if len(centers) == 0:
        return centers

    # One row per center, one column per record. Built in a single allocation
    # instead of growing the matrix with vstack on every iteration.
    dis_matrix = np.array([np.sum(np.square(data - center.center), axis=1)
                           for center in centers])
    belongs = np.argmin(dis_matrix, axis=0)

    # Iterate over the real centers rather than range(k), so a k that does not
    # match len(centers) can neither raise IndexError nor skip a center.
    for i, center in enumerate(centers):
        center.pts = np.where(belongs == i)[0]
    return centers

In [40]:
def update_centers(data, centers, k):
    """
    Purpose:
        Generate a new list of Center objects: each new center is the mean of the
        records assigned to the corresponding previous center, and the records are
        then re-assigned to the new centers.
    Input:
        data: a two dimension matrix
        centers: a list of Center objects from the previous iteration.
        k: int, the number of centers.
    Output:
        the new list of Center objects (the input list is not modified).
    """
    new_centers = []
    for center in centers:
        if len(center.pts) > 0:
            location = np.mean(data[center.pts], axis=0)
        else:
            # An empty cluster has no mean (np.mean would return NaN and the
            # center could never recover); keep its previous location instead.
            location = np.array(center.center)
        new_centers.append(Center(location, np.array([])))
    return assign_to_center(data, new_centers, k)

In [41]:
def plot_k_mean(data, centers, name="", save_path="../img/"):
    """
    Purpose:
        General purpose plot function for a k-means result. Scatters the first
        two columns of each cluster's records plus the cluster center, one colour
        per cluster, and saves the figure as <save_path>/<name>k_mean.png.
    Input:
        data: a two dimension matrix; only columns 0 and 1 are plotted.
        centers: a list of Center objects whose pts index into data.
        name: prefix for the plot title and file name.
        save_path: output folder; created if it does not exist.
    Output:
        None. The figure is written to disk and closed.
    """
    try:
        os.makedirs(save_path)
    except FileExistsError:
        print("use existing folder:", save_path)

    palette = cm.rainbow(np.linspace(0, 1, len(centers)))
    for label, (cluster, colour) in enumerate(zip(centers, palette)):
        members = data[cluster.pts]
        # `color=` (not `c=`): a bare RGBA row passed as `c` is ambiguous and is
        # read as four colormap values when the cluster has exactly four points.
        plt.scatter(members[:, 0], members[:, 1], s=20, color=colour,
                    alpha=0.3, label=label)
        plt.scatter(cluster.center[0], cluster.center[1], s=100, color=colour,
                    alpha=1.0, marker='x')
    name = str(name) + "k_mean"
    plt.title(name)
    plt.legend(loc='best')
    # os.path.join works whether or not save_path ends with a separator.
    plt.savefig(os.path.join(save_path, name + ".png"))
    plt.close()

In [42]:
def plot_k_mean_a(data, centers, itr, save_path="../task3_img/k_mean"):
    """
    Purpose:
        The plot function specific to the cv project2 task3 k-mean part. Scatters
        the first two columns of each cluster's records (triangles) and the
        cluster center (circle), and saves the figure as
        <save_path>/iteration_<itr>.png.
    Input:
        data: a two dimension matrix; only columns 0 and 1 are plotted.
        centers: a list of Center objects whose pts index into data.
        itr: the iteration number, used in the title and the file name.
        save_path: output folder; created if it does not exist.
    Output:
        None. The figure is written to disk and closed.
    """
    try:
        os.makedirs(save_path)
    except FileExistsError:
        print("use existing folder:", save_path)

    # Task 3 uses exactly red/green/blue. With more than three centers the fixed
    # list would run out (KeyError in the old dict lookup), so fall back to an
    # evenly spaced rainbow palette.
    task_colours = ['red', 'green', 'blue']
    if len(centers) <= len(task_colours):
        palette = task_colours[:len(centers)]
    else:
        palette = cm.rainbow(np.linspace(0, 1, len(centers)))

    for label, (cluster, colour) in enumerate(zip(centers, palette)):
        members = data[cluster.pts]
        plt.scatter(members[:, 0], members[:, 1], s=50, color=colour,
                    alpha=0.3, label=label, marker='^')
        plt.scatter(cluster.center[0], cluster.center[1], s=50, color=colour,
                    alpha=1.0, marker='o')
    plt.title("iteration: "+ str(itr))
    plt.legend(loc='best')
    plt.savefig(os.path.join(save_path, "iteration_" + str(itr) + ".png"))
    plt.close()

In [43]:
def k_mean(data, k, init_fun, max_itr=50, seed=20, need_plot=False, plot_fun=None):
    """
    Purpose:
        Main function for the k-means algorithm. Prints the centers at every
        iteration and stops when the centers no longer move or after max_itr
        updates, whichever comes first.
    Input:
        data: a two dimension matrix
        k: the number of centers
        init_fun: the function used to generate the initial list of Center
                  objects; called as init_fun(data, k, seed).
        max_itr: the maximum number of center updates.
        seed: forwarded to init_fun.
        need_plot: If set true, plot_fun is called for each iteration.
        plot_fun: function, the plot function; called as plot_fun(data, centers, itr).
    Output:
        centers: a list of Center objects. The result for the final iteration.
    Raises:
        TypeError: if need_plot is true but plot_fun is not callable.
    """
    # Fail early with a clear message instead of "'NoneType' object is not callable"
    # in the middle of the first iteration.
    if need_plot and not callable(plot_fun):
        raise TypeError("plot_fun must be callable when need_plot is True")

    itr = 0
    centers = init_fun(data, k, seed)
    # `<` rather than `<=`: itr counts completed updates, so this performs at
    # most max_itr updates (the old condition allowed max_itr + 1).
    while itr < max_itr:
        print("iteration :", itr)
        for center in centers:
            print(center)
        if need_plot:
            plot_fun(data, centers, itr)

        new_centers = update_centers(data, centers, k)
        old_locations = np.asarray([c.center for c in centers])
        new_locations = np.asarray([c.center for c in new_centers])
        # equal_nan=True: a NaN center (e.g. from an empty cluster) compares
        # unequal to itself under ==, which would block convergence forever.
        if np.array_equal(old_locations, new_locations, equal_nan=True):
            break
        centers = new_centers
        itr += 1
    print("total iteration", itr)
    return centers

## Test

In [34]:
# Load the cho dataset from ../data/cho.txt.
# NOTE(review): get_data is defined in mylibrary; presumably it returns
# (feature matrix, ground-truth labels) - confirm against mylibrary.
data, labels = mylib.get_data("../data/cho.txt")

In [12]:
seed = 20
max_itr = 100
k = 5
# seed and max_itr were defined above but never handed to k_mean, so the run
# silently used k_mean's defaults (max_itr=50). Pass them explicitly.
centers = k_mean(data, k, init_fun=init_centers_random, max_itr=max_itr, seed=seed,
                 need_plot=False, plot_fun=None)

iteration : 0
cluster: [-0.42  -0.57   0.08  -0.44  -0.36  -0.18  -0.15   0.56   0.63   0.48
 -0.67  -0.45   0.096 -0.02  -0.05   0.6  ]
cluster: [ 0.3  -0.84  0.58 -0.14 -0.58 -0.26 -0.7  -0.95 -0.12  1.17  0.55  0.73
 -0.03 -0.14 -0.33  0.25]
cluster: [-0.15 -0.6  -0.37 -0.15  0.59  0.46  0.02  0.21  0.04 -0.4  -0.48 -0.06
  0.21  0.44  0.34  0.26]
cluster: [-0.65  -0.69  -0.68  -0.05   0.75   0.88   0.79   0.17  -0.44  -0.29
 -0.32   0.22   0.386  0.55   0.47  -0.15 ]
cluster: [-0.18  0.73  0.27  0.02 -0.15  0.59  0.75 -0.53 -0.09 -0.95 -0.82 -0.35
  0.56  0.52  0.38 -0.32]
iteration : 1
cluster: [-0.38511594 -0.37724638 -0.55536232 -0.65985507 -0.51666667 -0.31797101
  0.12666667  0.4626087   0.6242029   0.40202899  0.03811594 -0.20511594
 -0.12285507  0.11972464  0.27695652  0.45376812]
cluster: [-0.23495294 -0.01088235  0.73323529  0.32870588 -0.166      -0.26247059
 -0.54841176 -0.73071765 -0.44329412  0.78129412  0.64354118  0.30102353
 -0.01404118 -0.25864706 -0.441      -0.23

In [13]:
# Save the clustering result as ../img/chok_mean.png (plot_k_mean appends
# "k_mean" to the name). Only the first two columns of data are plotted.
plot_k_mean(data, centers, name="cho", save_path="../img/")

use existing folder: ../img/


## For demo

In [None]:
0	5.0	3.0	1.6	0.2
1	6.3	2.8	5.1	1.5
2	5.0	2.0	3.5	1.0
3	4.7	3.2	1.6	0.2
4	6.1	3.0	4.6	1.4


In [20]:
def init_given(data,k,seed):
    """
    Purpose:
        Build the initial list of Center objects for the demo from five fixed
        4-D locations, then assign every record to its nearest center.
    Input:
        data: a two dimension matrix; its records are assigned to the fixed centers.
        k: int, forwarded to assign_to_center.
        seed: unused; present so the signature matches the other init functions.
    Output:
        centers: a list of Center objects with their pts filled in.
    Note:
        This redefines (shadows) the init_given declared earlier in the notebook.
    """
    demo_locations = np.array([[5.0,3.0,1.6,0.2],
                               [6.3,2.8,5.1,1.5],
                               [5.0,2.0,3.5,1.0],
                               [4.7,3.2,1.6,0.2],
                               [6.1,3.0,4.6,1.4]])
    # Every center starts with an empty membership list.
    initial = [Center(location, np.array([])) for location in demo_locations]
    return assign_to_center(data, initial, k)

In [18]:
# Load the demo dataset; this rebinds `data` and `labels` from the earlier cells.
# The path is relative to the notebook's working directory.
# NOTE(review): presumably returns (feature matrix, labels) - confirm against mylibrary.
data, labels = mylib.get_data("new_dataset_1.txt")

In [23]:
# Run k-means on the demo data starting from the fixed centers in init_given.
# NOTE(review): `k` is not set in this cell; it relies on the value assigned in
# an earlier cell (k = 5), which matches the five hard-coded centers.
centers = k_mean(data,k,init_fun=init_given ,need_plot=False,plot_fun=None)

iteration : 0
cluster: [5.  3.  1.6 0.2]
cluster: [6.3 2.8 5.1 1.5]
cluster: [5.  2.  3.5 1. ]
cluster: [4.7 3.2 1.6 0.2]
cluster: [6.1 3.  4.6 1.4]
iteration : 1
cluster: [5.15652174 3.34782609 1.46521739 0.23478261]
cluster: [6.65283019 2.9754717  5.5245283  1.97924528]
cluster: [5.28333333 2.425      3.7        1.16666667]
cluster: [4.87777778 3.47777778 1.46296296 0.25185185]
cluster: [6.00571429 2.86857143 4.38285714 1.39142857]
iteration : 2
cluster: [5.24583333 3.5125     1.47083333 0.24166667]
cluster: [6.80238095 3.05       5.66904762 2.0452381 ]
cluster: [5.35714286 2.44285714 3.71428571 1.16428571]
cluster: [4.78461538 3.33076923 1.45769231 0.24615385]
cluster: [6.03409091 2.83863636 4.55681818 1.48636364]
iteration : 3
cluster: [5.276 3.692 1.5   0.288]
cluster: [6.85       3.07368421 5.74210526 2.07105263]
cluster: [5.43529412 2.48235294 3.76470588 1.15882353]
cluster: [4.736 3.144 1.428 0.2  ]
cluster: [6.07777778 2.84888889 4.63111111 1.53777778]
iteration : 4
cluster: [

## After PCA

In [24]:
# Reload the cho dataset so this cell does not depend on which dataset an
# earlier cell left in `data`.
data, labels = mylib.get_data("../data/cho.txt")
seed = 20
max_itr = 100
k = 5

# NOTE(review): the printed centers below are 2-D, so mylib.pca appears to
# project the data to two components - confirm against mylibrary.
data_pca = mylib.pca(data)
# seed and max_itr were defined above but never handed to k_mean, so the run
# silently used k_mean's defaults (max_itr=50). Pass them explicitly.
centers = k_mean(data_pca, k, init_fun=init_centers_random, max_itr=max_itr, seed=seed,
                 need_plot=False, plot_fun=None)
plot_k_mean(data_pca, centers, name="cho", save_path="../img/")

iteration : 0
cluster: [-0.27565479  0.18065626]
cluster: [0.47496673 0.05843564]
cluster: [-0.44380992 -0.25134271]
cluster: [-1.08266087 -0.13488786]
cluster: [-0.22259243  0.68329737]
iteration : 1
cluster: [-0.277635    0.19360272]
cluster: [0.81011141 0.12581677]
cluster: [-0.38277885 -0.43236251]
cluster: [-1.44267641 -0.24272675]
cluster: [-0.11658362  0.69496911]
iteration : 2
cluster: [-0.258704    0.15764264]
cluster: [0.89589858 0.15720497]
cluster: [-0.33493676 -0.4542657 ]
cluster: [-1.52567939 -0.24407146]
cluster: [-1.56396355e-04  6.67018721e-01]
iteration : 3
cluster: [-0.26278107  0.14217749]
cluster: [0.95522417 0.16335718]
cluster: [-0.30657754 -0.46729685]
cluster: [-1.53357463 -0.23610084]
cluster: [0.16465883 0.56487611]
iteration : 4
cluster: [-0.31823849  0.13558215]
cluster: [1.03476613 0.16730377]
cluster: [-0.26334979 -0.47451314]
cluster: [-1.53357463 -0.23610084]
cluster: [0.28902543 0.42911332]
iteration : 5
cluster: [-0.42359563  0.19245478]
cluster: [1.