In [1]:
import csv
import random
import numpy as np
import matplotlib.pyplot as plt

from numpy.linalg import eig, norm, det
from scipy.linalg import inv
from scipy.stats import multivariate_normal

%matplotlib notebook

In [3]:
def ellipse(sigma, mean, scale=1):
    """Trace the outline of the ellipse associated with a 2-D covariance.

    A circle of radius ``scale`` is sampled at 200 evenly spaced angles,
    stretched by a linear map built from the eigen-decomposition of
    ``inv(sigma)``, and finally shifted so that it is centred on ``mean``.

    Parameters
    ----------
    sigma : invertible covariance matrix (2x2, to match the two-row circle).
    mean : indexable whose entries 0 and 1 are the centre of the ellipse.
    scale : radius of the circle before it is stretched (default 1).

    Returns
    -------
    tuple of two arrays: the x-coordinates and the y-coordinates of the
    200 points on the ellipse.
    """
    n_points = 200

    # Right-multiplying the eigenvectors of the precision matrix by
    # inv(sqrt(diag(eigenvalues))) scales each eigenvector column by
    # 1/sqrt(eigenvalue); this is the map from the circle to the ellipse.
    eigvals, eigvecs = eig(inv(sigma))
    stretch = eigvecs @ inv(np.sqrt(np.diag(eigvals)))

    angles = np.arange(0, n_points) * (2*np.pi) / n_points
    circle = np.array([scale * np.cos(angles), scale * np.sin(angles)])

    xs, ys = stretch.dot(circle)

    # Translate the origin-centred ellipse to the requested centre.
    return xs + mean[0], ys + mean[1]

In [4]:
def plot(samples, parameters):
    """Scatter the three classes and overlay one ellipse per class.

    Parameters
    ----------
    samples : mapping with keys "1", "2" and "3"; the transpose of each
        value must unpack into an x row and a y row.
    parameters : mapping with "sigma" and "mean" entries, each of which is
        indexable by the class keys "1", "2" and "3".

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    groups = ("1", "2", "3")
    colors = ("red", "green", "blue")
    scales = (2, 2, 2)

    # Resolve every class up front so a missing key fails before a figure
    # is created.
    clouds = [samples[group].T for group in groups]

    # Create plot
    fig = plt.figure()
    ax = fig.add_subplot(1, 1, 1)

    # One scatter layer per class.
    for (x, y), color, group in zip(clouds, colors, groups):
        ax.scatter(x, y, alpha=0.8, c=color, edgecolors='none', s=30, label=group)

    # One ellipse outline per class, drawn in the class colour.
    for color, group, scale in zip(colors, groups, scales):
        outline = ellipse(parameters["sigma"][group], parameters["mean"][group], scale=scale)
        ax.plot(outline[0], outline[1], c=color)

    plt.title('EM - 1,2,3')
    plt.legend(loc=2)
    plt.show()

In [5]:
def get_random_means(train):
    """Draw 5 random points per class and return their centroids.

    Points are drawn with replacement (independent ``random.randrange``
    calls), class "1" first, then "2", then "3".

    Parameters
    ----------
    train : mapping with keys "1", "2" and "3", each an indexable
        collection of points.

    Returns
    -------
    means : list with one centroid (``np.mean(..., axis=0)``) per class.
    data : dict mapping "1"/"2"/"3" to the list of 5 points drawn for it.
    """
    labels = ("1", "2", "3")

    # Look the classes up before touching the random generator.
    pools = [train[label] for label in labels]

    data = {}
    for label, pool in zip(labels, pools):
        data[label] = [pool[random.randrange(0, len(pool))] for _ in range(5)]

    means = [np.mean(data[label], axis=0) for label in labels]

    return means, data

In [6]:
def get_sigma(data):
    """Return the average of the three per-class covariance matrices.

    Parameters
    ----------
    data : mapping with keys "1", "2" and "3"; each value is a collection
        of observations laid out one per row (``rowvar=False``).

    Returns
    -------
    The element-wise mean of ``np.cov`` evaluated on each of the three
    classes.
    """
    # Every class contributes its own covariance; the earlier version added
    # the class "2" covariance twice and never used class "3".
    covariances = [np.cov(data[label], rowvar=False) for label in ("1", "2", "3")]

    return (1/3) * (covariances[0] + covariances[1] + covariances[2])

In [7]:
def parse():
    """Load the three tab-separated point files and split them.

    Reads 'a.txt', 'o.txt' and 'u.txt' from the current working directory,
    converts the first two columns of every row to ``int``, and uses the
    first 35 rows of each file for training and the remaining rows for
    testing.

    Returns
    -------
    samples : array stacking the training rows of the three files, in the
        order a, o, u.
    data : dict with "train" and "test" entries, each mapping the class
        keys "1" (a), "2" (o) and "3" (u) to the corresponding array.
    """
    paths = ('a.txt', 'o.txt', 'u.txt')
    n_train = 35

    # parse data: read all files first, convert afterwards
    raw_rows = []
    for path in paths:
        with open(path) as f:
            raw_rows.append(list(csv.reader(f, delimiter='\t')))

    points = [[[int(row[0]), int(row[1])] for row in rows] for rows in raw_rows]

    # separate 'train' and 'test' datasets
    train = [np.array(rows[:n_train]) for rows in points]
    test = [np.array(rows[n_train:]) for rows in points]

    samples = np.array([*train[0], *train[1], *train[2]])

    data = {
        "train": dict(zip(("1", "2", "3"), train)),
        "test": dict(zip(("1", "2", "3"), test)),
    }

    return samples, data

In [8]:
# Load the point files: `samples` stacks the training rows of all three
# classes, `datasets` holds the per-class "train" / "test" arrays.
samples, datasets = parse()

## EM

In [14]:
def calc_L(samples, means, sigmas, pi):
    """Log-likelihood of ``samples`` under a Gaussian mixture.

    Computes ``sum_n log( sum_k pi[k] * N(samples[n] | means[k], sigmas[k]) )``.

    Parameters
    ----------
    samples : 2-D array-like, one observation per row.
    means : sequence with one mean vector per component.
    sigmas : sequence with one covariance matrix per component.
    pi : sequence with one mixing weight per component.

    Returns
    -------
    The log-likelihood as a float.
    """
    samples = np.asarray(samples)

    # Accumulate the weighted density of every component for all samples at
    # once; the number of components follows `means` instead of being fixed.
    mixture = np.zeros(len(samples))
    for k in range(len(means)):
        mixture += pi[k] * multivariate_normal.pdf(samples, mean=means[k], cov=sigmas[k])

    return np.log(mixture).sum()


# Backward-compatible alias for the original (misspelled) function name.
cacl_L = calc_L

In [22]:
def calc_means(samples, gammas, den):
    """M-step update of the component means.

    Each new mean is the responsibility-weighted average of the samples:
    ``mean_k = sum_n gammas[n, k] * samples[n] / den[k]``.

    Parameters
    ----------
    samples : array-like of shape (N, D), one observation per row.
    gammas : array-like of shape (N, K), responsibility of component k for
        sample n.
    den : array-like of length K, the normaliser of each component
        (expected to be ``gammas.sum(axis=0)``).

    Returns
    -------
    Array of shape (K, D) holding one mean per row. A component whose
    ``den`` entry is 0 yields non-finite values.
    """
    samples = np.asarray(samples, dtype=float)
    gammas = np.asarray(gammas, dtype=float)
    den = np.asarray(den, dtype=float)

    # (K, N) @ (N, D) gives the weighted sums; divide row k by den[k].
    return (gammas.T @ samples) / den[:, np.newaxis]

In [23]:
def calc_sigmas(samples, means, gammas, den):
    """M-step update of the component covariance matrices.

    ``sigma_k = sum_n gammas[n, k] * (x_n - mean_k)(x_n - mean_k)^T / den[k]``

    Parameters
    ----------
    samples : array-like of shape (N, D), one observation per row.
    means : sequence of K mean vectors of length D.
    gammas : array-like of shape (N, K), responsibility of component k for
        sample n.
    den : array-like of length K, the normaliser of each component
        (expected to be ``gammas.sum(axis=0)``).

    Returns
    -------
    Array of shape (K, D, D) holding one covariance matrix per component.
    A component whose ``den`` entry is 0 yields non-finite values.
    """
    samples = np.asarray(samples, dtype=float)
    gammas = np.asarray(gammas, dtype=float)
    den = np.asarray(den, dtype=float)

    sigmas = []
    for k in range(gammas.shape[1]):
        centred = samples - np.asarray(means[k], dtype=float)
        # Weight each row by its responsibility, then form the sum of the
        # outer products in one matrix product.
        weighted = gammas[:, k, np.newaxis] * centred
        sigmas.append(weighted.T @ centred / den[k])

    return np.array(sigmas)

In [24]:
def calc_pi(den, N):
    """M-step update of the mixing weights.

    Parameters
    ----------
    den : array-like of length K, the per-component normalisers (expected
        to be the column sums of the responsibilities).
    N : total number of samples.

    Returns
    -------
    Array of length K equal to ``den / N``.
    """
    return np.asarray(den, dtype=float) / N

In [25]:
def calc_denom(samples, gammas):
    """Per-component normaliser used by the M-step updates.

    Parameters
    ----------
    samples : unused; kept so the call signature stays unchanged.
    gammas : array-like of shape (N, K), responsibility of component k for
        sample n.

    Returns
    -------
    Array of length K with ``sum_n gammas[n, k]`` for each component.
    """
    return np.asarray(gammas, dtype=float).sum(axis=0)

In [9]:
def gamma(sample, means, sigmas, pi):
    """Responsibilities of every mixture component for one sample (E step).

    ``gamma_k = pi[k] * N(sample | means[k], sigmas[k]) / sum_j (same for j)``

    Parameters
    ----------
    sample : a single observation.
    means : sequence with one mean vector per component.
    sigmas : sequence with one covariance matrix per component.
    pi : sequence with one mixing weight per component.

    Returns
    -------
    Array with one entry per component; the entries sum to 1.
    """
    # The number of components follows `means` rather than being fixed at 3.
    nums = np.array([
        multivariate_normal.pdf(sample, mean=means[k], cov=sigmas[k]) * pi[k]
        for k in range(len(means))
    ])

    den = nums.sum()

    return nums / den

In [21]:
def em(samples, means, sigmas, pi, tol=0.01, max_iter=1000):
    """Run EM iterations until the log-likelihood stops changing.

    Each iteration computes the responsibilities with ``gamma`` (E step),
    then updates the parameters with ``calc_denom``, ``calc_means``,
    ``calc_sigmas`` and ``calc_pi`` (M step), and finally re-evaluates the
    log-likelihood with ``calc_L``.

    Parameters
    ----------
    samples : sequence of observations.
    means, sigmas, pi : initial means, covariances and mixing weights.
    tol : stop once the absolute change in log-likelihood between two
        consecutive iterations is at most this value (default 0.01).
    max_iter : upper bound on the number of iterations (default 1000).

    Returns
    -------
    means, sigmas : the parameters after the last M step.
    gammas : the responsibilities from the last E step, i.e. computed with
        the parameters that preceded the final M step.

    Raises
    ------
    ValueError : if ``max_iter`` is smaller than 1.
    """
    if max_iter < 1:
        raise ValueError("max_iter must be at least 1")

    # Plain scalars hold the previous/current log-likelihood; the earlier
    # version indexed an `L` container that was never created.
    prev_L = calc_L(samples, means, sigmas, pi)

    # At least one iteration always runs, so `gammas` is always bound.
    for _ in range(max_iter):

        ############
        # 'E' step #
        ############
        gammas = np.array([gamma(sample, means, sigmas, pi) for sample in samples])

        ############
        # 'M' step #
        ############

        # calc denominator
        den = calc_denom(samples, gammas)

        # calc new means
        means = calc_means(samples, gammas, den)

        # calc new sigmas (uses the means updated just above)
        sigmas = calc_sigmas(samples, means, gammas, den)

        # calc new pi
        pi = calc_pi(den, len(samples))

        cur_L = calc_L(samples, means, sigmas, pi)
        deltaL = cur_L - prev_L
        prev_L = cur_L

        if abs(deltaL) <= tol:
            break

    return means, sigmas, gammas

## EM classification

In [26]:
# Initial means: for every class, the centroid of 5 randomly drawn
# training points (the drawn points themselves are kept in `data`).
means, data = get_random_means(datasets["train"])

# Initial covariance: a single matrix computed by get_sigma from the drawn
# points, replicated once per mixture component.
sigma = get_sigma(data)
sigmas = np.array([sigma] * 3)

# Start from uniform mixing weights.
pi = np.array([1/3] * 3)

means, sigmas, gammas = em(samples, means, sigmas, pi)

# Sanity check on the sizes of the fitted quantities.
print(len(means))
print(len(sigmas))
print(len(gammas))