In [9]:
import numpy as np
from io import StringIO
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import mylibrary as mylib
from mylibrary import euclidean_distance
import os
import numpy as np
from io import StringIO
from sklearn import datasets as ds

%matplotlib inline

In [10]:
class Center:
    """
    Purpose:
        Each Center object saves the location of one center and the records that
        belong to this center.
    Attributes:
        center: location of the center (compared element-wise by __eq__).
        pts: ids of the records that belong to this Center object. The id of a
            record is the index of the record in the given data matrix, starting
            from 0.
    Note:
        The class defines __eq__ without __hash__, so instances are unhashable.
    """
    def __init__(self, center, pts):
        self.center = center
        self.pts = pts

    def set_center(self, center):
        """Replace the stored center location."""
        self.center = center

    def set_pts(self, pts):
        """Replace the stored record ids."""
        self.pts = pts

    def __eq__(self, other):
        """
        Two Center objects are equal when their center locations have the same
        shape and the same elements; pts is not compared.
        """
        if not isinstance(other, Center):
            # Let Python fall back to its default comparison instead of
            # failing on a missing .center attribute.
            return NotImplemented
        # array_equal returns False on a shape mismatch rather than relying on
        # broadcasting, and bool() yields a plain Python bool.
        return bool(np.array_equal(self.center, other.center))

    def __repr__(self):
        return "cluster: " + str(self.center)

In [11]:
def assign_to_center(data, centers, k=None):
    """
    Purpose:
        Assign records in data to centers using Euclidean distance. The record id is the index
        of the record in the data matrix.
    Input:
        data: a two dimension matrix of real numbers, one record per row.
        centers: a list of Center objects.
        k: int, the number of centers whose pts are updated. Expected to equal the
           length of centers; defaults to len(centers) when omitted. If a smaller k
           is passed, only the first k centers get new pts.
    Output:
        centers: the same list of Center objects, with pts replaced by an array of
                 the record ids closest to each center.
    """
    if k is None:
        k = len(centers)
    if len(centers) == 0:
        # Nothing to assign to; argmin over zero centers would raise.
        return centers

    # One row per center holding the squared distance to every record. Squared
    # distances give the same nearest center as Euclidean distances.
    dis_matrix = np.vstack(
        [np.sum(np.square(data - center.center), axis=1) for center in centers]
    )
    # For each record (column), the row index of the nearest center.
    belongs = np.argmin(dis_matrix, axis=0)
    for i in range(k):
        centers[i].pts = np.where(belongs == i)[0]
    return centers

In [13]:
def get_data(filename):
    """
    Purpose:
        Read a tab-delimited numeric file and split it into label and data columns.
    Input:
        filename: path of the tab-delimited file.
    Output:
        data: matrix holding every column from the third one on.
        label: the first two columns, cast to int.
    """
    raw_data = np.genfromtxt(filename, delimiter="\t")
    if raw_data.ndim == 1:
        # genfromtxt collapses a one-line file to a 1-D array; treat it as a
        # single record so the column slices below keep working.
        raw_data = raw_data.reshape(1, -1)
    label = raw_data[:, 0:2].astype(int)
    data = raw_data[:, 2:]
    return data, label

In [14]:
def get_center_res(filename):
    """
    Purpose:
        Read a tab-delimited numeric file of centers and split it into label and
        data columns.
    Input:
        filename: path of the tab-delimited file.
    Output:
        data: matrix holding every column from the second one on.
        label: the first column, cast to int and kept as a one-column matrix.
    """
    raw_data = np.genfromtxt(filename, delimiter="\t")
    if raw_data.ndim == 1:
        # genfromtxt collapses a one-line file to a 1-D array; treat it as a
        # single row so the column slices below keep working.
        raw_data = raw_data.reshape(1, -1)
    label = raw_data[:, 0:1].astype(int)
    data = raw_data[:, 1:]
    return data, label

## Test

In [66]:
def plot_k_mean(data, centers, name="", save_path="img/"):
    """
    Purpose:
        General purpose plot function for k-means results. Draws one scatter
        series per center from the first two columns of data and saves the figure
        as "<name>_k_mean.png" inside save_path.
    Input:
        data: two dimension matrix; only columns 0 and 1 are plotted.
        centers: list of objects whose pts attribute indexes the rows of data
                 that belong to that center.
        name: prefix used for the plot title and the file name.
        save_path: folder for the image; created when it does not exist. An empty
                   string saves into the current working directory.
    Output:
        None. The figure is written to disk and then closed.
    """
    if save_path:
        try:
            os.makedirs(save_path)
        except FileExistsError:
            print("use existing folder:", save_path)

    n_labels = len(centers)
    # One rainbow color per center, in center order.
    colors = cm.rainbow(np.linspace(0, 1, n_labels))

    fig, ax = plt.subplots()
    try:
        for label in range(n_labels):
            members = data[centers[label].pts]
            # color= (not c=) so a single RGBA tuple is never mistaken for
            # per-point values, e.g. when a cluster has exactly four points.
            ax.scatter(members[:, 0], members[:, 1], s=20, color=colors[label],
                       alpha=0.3, label=str(label))
        name = str(name) + "_k_mean"
        ax.set_title(name)
        ax.legend(loc='best')
        # join() keeps the file inside save_path even without a trailing separator.
        fig.savefig(os.path.join(save_path, name + ".png"))
    finally:
        # Always release the figure, even if drawing or saving fails.
        plt.close(fig)

In [67]:
# cho data set: load the records and the saved centers, assign every record to
# its nearest center, then plot the assignment on the output of mylib.pca.
data, labels = get_data("cho.txt")
res, _ = get_center_res("cho_centers_res.txt")
centers = [Center(row, np.array([])) for row in res]
centers = assign_to_center(data, centers, len(centers))
data_pca = mylib.pca(data)
plot_k_mean(data_pca, centers, name="cho")

use existing folder: img/


In [68]:
# iyer data set: load the records and the saved centers, assign every record to
# its nearest center, then plot the assignment on the output of mylib.pca.
data, labels = get_data("iyer.txt")
res, _ = get_center_res("iyer_centers_res.txt")
centers = [Center(row, np.array([])) for row in res]
centers = assign_to_center(data, centers, len(centers))
data_pca = mylib.pca(data)
plot_k_mean(data_pca, centers, name="iyer")

use existing folder: img/


In [61]:
# Display the location vector stored on the first Center in `centers`.
centers[0].center

array([ 0.07463793, -0.32827586, -0.67155172, -0.65465517, -0.59586207,
       -0.42431034, -0.50931034, -0.41465517,  0.15413793,  0.88724138,
        0.57758621,  0.30284483,  0.06941379,  0.08303448,  0.1205    ,
        0.4637931 ])

In [39]:
# Load cho_centers_res.txt: `res` gets every column after the first; the
# first-column labels returned alongside are discarded.
res, _ = get_center_res("cho_centers_res.txt")

In [15]:
# cho_pca data set: load the records and the saved centers, assign every record
# to its nearest center, and plot the records directly.
data, labels = get_data("cho_pca.txt")
res, _ = get_center_res("cho_pca_centers_res.txt")
centers = list(map(lambda x: Center(x,np.array([])), res))
# Use the actual number of loaded centers rather than a hard-coded count, so
# every center receives its pts regardless of how many rows the file holds.
centers = assign_to_center(data, centers, len(centers))
plot_k_mean(data, centers, name="cho_pca")

use existing folder: img/


In [16]:
# iyer_pca data set: load the records and the saved centers, assign every record
# to its nearest center, and plot the records directly.
data, labels = get_data("iyer_pca.txt")
res, _ = get_center_res("iyer_pca_centers_res.txt")
centers = list(map(lambda x: Center(x,np.array([])), res))
# Use the actual number of loaded centers rather than a hard-coded count, so
# every center receives its pts regardless of how many rows the file holds.
centers = assign_to_center(data, centers, len(centers))
plot_k_mean(data, centers, name="iyer_pca")

use existing folder: img/
