In [12]:
import csv
import random
import numpy as np
import matplotlib.pyplot as plt

from numpy.linalg import eig, norm
from scipy.linalg import inv

%matplotlib notebook

In [4]:
def ellipse(sigma, mean, scale=1):
    """Return the x and y coordinates of an ellipse outline derived from `sigma`.

    A circle of radius `scale`, sampled at 200 evenly spaced angles in
    [0, 2*pi), is mapped through ``V @ D**-0.5`` where ``D`` and ``V`` are the
    eigenvalues and eigenvectors of ``inv(sigma)``, then shifted by `mean`.
    When the eigenvectors are orthonormal (e.g. symmetric `sigma` with distinct
    eigenvalues) every returned point x satisfies
    ``(x - mean)^T inv(sigma) (x - mean) == scale**2``.

    Parameters
    ----------
    sigma : 2x2 invertible matrix (presumably a covariance matrix -- the
        product with the 2-row circle array fixes the size at 2x2).
    mean : indexable with at least two entries; ``mean[0]`` / ``mean[1]`` are
        added to the first / second coordinate.
    scale : radius of the circle before it is transformed.

    Returns
    -------
    (X1, X2) : two arrays of 200 values each.
    """
    eigenvalues, eigenvectors = eig(inv(sigma))
    # Columns of `transform` are the eigenvectors scaled by 1/sqrt(eigenvalue).
    transform = eigenvectors @ inv(np.sqrt(np.diag(eigenvalues)))

    n_points = 200
    angles = np.arange(0, n_points) * (2 * np.pi) / n_points

    # Circle of radius `scale`: row 0 holds the cosines, row 1 the sines.
    circle = np.array([scale * np.cos(angles), scale * np.sin(angles)])
    outline = transform.dot(circle)

    # Translate the outline so that it is centred on `mean`.
    return outline[0] + mean[0], outline[1] + mean[1]

In [3]:
def plot(samples, parameters):
    """Scatter the "a", "o" and "u" sample sets and overlay one ellipse per group.

    Parameters
    ----------
    samples : mapping with keys "a", "o" and "u"; each value must expose ``.T``
        and unpack into exactly two sequences (x values, y values) once
        transposed -- presumably an (n, 2) array.
    parameters : mapping with keys "sigma" and "mean", each itself a mapping
        keyed by "a", "o" and "u"; the entries are forwarded to ``ellipse``.

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    groups = ("a", "o", "u")
    colors = ("red", "green", "blue")
    scales = (2, 2, 2)

    # Look up every group first, then transpose each one into (xs, ys).
    raw_sets = [samples[group] for group in groups]
    transposed = tuple(raw.T for raw in raw_sets)

    # Single-axes figure that receives both the points and the outlines.
    ax = plt.figure().add_subplot(1, 1, 1)

    for (xs, ys), color, label in zip(transposed, colors, groups):
        ax.scatter(xs, ys, alpha=0.8, c=color, edgecolors='none', s=30, label=label)

    for color, group, scale in zip(colors, groups, scales):
        sigma = parameters["sigma"][group]
        mean = parameters["mean"][group]
        outline_x, outline_y = ellipse(sigma, mean, scale=scale)
        ax.plot(outline_x, outline_y, c=color)

    plt.title('A,O,U scatter')
    plt.legend(loc=2)
    plt.show()

In [9]:
def get_means(train_a, train_o, train_u, n_points=5):
    """Build one initial centroid per training set from randomly drawn rows.

    For each training set, `n_points` rows are drawn uniformly at random (with
    replacement, via ``random.randrange``) and averaged feature by feature.

    Parameters
    ----------
    train_a, train_o, train_u : non-empty indexable collections of points
        (e.g. arrays of shape (n, n_features)).
    n_points : number of random rows averaged for each centroid (default 5).

    Returns
    -------
    tuple of three arrays, one centroid of shape (n_features,) per training
    set, in the order (a, o, u).

    Raises
    ------
    ValueError
        From ``random.randrange`` if a training set is empty.
    """
    def _seed_centroid(train):
        picks = [train[random.randrange(len(train))] for _ in range(n_points)]
        # axis=0 averages across the picked rows and keeps one value per
        # feature; without it np.mean flattens everything into one scalar.
        return np.mean(picks, axis=0)

    return (_seed_centroid(train_a), _seed_centroid(train_o), _seed_centroid(train_u))

In [None]:
def k_means(samples, means, max_iter=1, tol=0.01):
    """Assign samples to their nearest centroid and recompute the centroids.

    Each pass labels every sample with the index of the closest centroid
    (Euclidean distance, ties go to the lowest index) and then replaces each
    centroid with the mean of the samples assigned to it. A centroid that
    receives no samples keeps its previous position.

    Parameters
    ----------
    samples : array-like of shape (n_samples, n_features).
    means : sequence of initial centroids, either vectors of length
        n_features or scalars (a scalar is used for every feature, matching
        how ``sample - scalar`` broadcasts).
    max_iter : maximum number of assignment/update passes. The default of 1
        performs a single pass.
    tol : when more than one pass is allowed, stop as soon as the distortion
        (sum of squared distances to the assigned centroid) improves by less
        than this amount between consecutive passes.

    Returns
    -------
    means : float array of shape (n_clusters, n_features), updated centroids.
    classes : int array of shape (n_samples,), the centroid index assigned to
        each sample during the last pass (i.e. before the final update).

    Raises
    ------
    ValueError
        If `samples` is not 2-D, `means` is empty or does not match the
        feature count, or `max_iter` is smaller than 1.
    """
    samples = np.asarray(samples, dtype=float)
    if samples.ndim != 2:
        raise ValueError("samples must have shape (n_samples, n_features)")

    # Work on a float copy so the caller's centroids are never modified.
    means = np.array(means, dtype=float)
    if means.ndim == 1:
        # Scalar centroids: repeat each one across all features.
        means = np.repeat(means[:, np.newaxis], samples.shape[1], axis=1)
    if means.ndim != 2 or means.shape[0] == 0 or means.shape[1] != samples.shape[1]:
        raise ValueError("means must be a non-empty sequence of centroids "
                         "matching the number of sample features")
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    n_samples = len(samples)
    n_clusters = len(means)
    classes = np.zeros(n_samples, dtype=int)
    previous_distortion = np.inf

    for _ in range(max_iter):
        # Distance from every sample to every centroid: (n_samples, n_clusters).
        distances = norm(samples[:, np.newaxis, :] - means[np.newaxis, :, :], axis=2)
        classes = np.argmin(distances, axis=1)
        distortion = np.sum(distances[np.arange(n_samples), classes] ** 2)

        # Per-cluster sums and counts; np.add.at accumulates repeated indices.
        sums = np.zeros_like(means)
        np.add.at(sums, classes, samples)
        counts = np.bincount(classes, minlength=n_clusters)

        # Only update clusters that own samples, avoiding a division by zero.
        occupied = counts > 0
        means[occupied] = sums[occupied] / counts[occupied, np.newaxis]

        if previous_distortion - distortion < tol:
            break
        previous_distortion = distortion

    return means, classes

In [13]:
def _load_points(path):
    """Read a tab-separated file and return its first two columns as an (n, 2) int array."""
    with open(path, newline='') as f:
        # csv.reader yields [] for blank lines; skip them instead of crashing.
        rows = [[int(row[0]), int(row[1])]
                for row in csv.reader(f, delimiter='\t') if row]
    # reshape keeps the (n, 2) layout even when the file has no rows.
    return np.array(rows, dtype=int).reshape(-1, 2)


def main():
    """Load the three point files, split them, and run one k-means step.

    Reads 'a.txt', 'o.txt' and 'u.txt' (tab-separated, two integer columns),
    uses the first 35 rows of each file for training and the remaining rows as
    the held-out test set, seeds the centroids with ``get_means`` and prints
    the centroids and class labels returned by ``k_means``.
    """
    # One split index for both halves so that no row falls between the
    # training slice and the test slice.
    n_train = 35

    # parse data
    points_a = _load_points('a.txt')
    points_o = _load_points('o.txt')
    points_u = _load_points('u.txt')

    # separate 'train' and 'test' datasets
    # (the test sets are held out; nothing below consumes them yet)
    train_a, test_a = points_a[:n_train], points_a[n_train:]
    train_o, test_o = points_o[:n_train], points_o[n_train:]
    train_u, test_u = points_u[:n_train], points_u[n_train:]

    # stack all training points into a single (n_samples, 2) array
    samples = np.concatenate([train_a, train_o, train_u])

    # obtain first means
    # by calc the mean of 5 random
    # points in samples
    means = get_means(train_a, train_o, train_u)

    means, classes = k_means(samples, means)

    print(means)
    print(classes)