### Onur Can
### Clustering - Expectation Maximization (EM)
### 27.12.2021
#### 1. Data Import & Initial Plot --------------------- 2. Parameters & Initialization
#### 3. EM Algorithm Steps --------------------------- 4. Visualization
#### 5. Iterations ------------------------------------------ 6. Final Parameters & Plot

In [1]:
# Onur Can 
# Project is done for Prof. Mehmet Gönen's DASC 521: Introduction to Machine Learning @ Koç University MSc Data Science Program
# Thanks Prof Mehmet for the dataset generation and instructions
# importing necessary libraries
import numpy as np                # for array/matrix operations
import matplotlib.pyplot as plt   # for plotting
import scipy.spatial as spa       # for distance calculations
import scipy.stats as stats       # for contour multivariate normal pdf

### 1. Data Import

In [2]:
# Load the observations: one row per data point, one column per feature.
X = np.genfromtxt(
    "../input/EM-Cluster/EM_Clustering_data_set.csv",
    delimiter=",",
)
print(
    "Data matrix's shape is",
    X.shape,
    " \nWhere there are -",
    X.shape[0],
    "- observations with -",
    X.shape[1],
    "- features \n",
)

# Reference means and covariances of the five bivariate Gaussians.
# They are not used by the EM updates; they are only drawn as dashed
# contours in the final plot.
original_means = np.array([
    [-2.5, -2.5],
    [+2.5, +2.5],
    [+2.5, -2.5],
    [0.0, 0.0],
    [-2.5, +2.5],
])
original_cov = np.array([
    [[+0.8, -0.6], [-0.6, +0.8]],
    [[+0.8, -0.6], [-0.6, +0.8]],
    [[+0.8, +0.6], [+0.6, +0.8]],
    [[+1.6, -0.0], [+0.0, +1.6]],
    [[+0.8, +0.6], [+0.6, +0.8]],
])

print("---Original Bivariate Gaussian Means---")
print(original_means, original_means.shape)
print("---Original Bivariate Gaussian Covariances---")
print(original_cov, original_cov.shape)

### 1. Initial Data Plot

In [3]:
# Scatter the raw observations (first feature against second feature)
# before any clustering, to get a first look at the data.
plt.figure(figsize=(8, 8))
plt.plot(
    X[:, 0],
    X[:, 1],
    ".",
    markersize=10,
    color="black",
)
plt.xlabel("x1")
plt.ylabel("x2")
plt.title("Initial Data Visualization", fontsize=16)
plt.show()

### 2. Parameters & Initialization

In [4]:
# N: number of observations, D: number of features per observation.
N, D = X.shape
# K: number of clusters, fixed in advance (five cluster means are given).
K = 5

# Starting centroids are read from file; every observation is then
# hard-assigned to its nearest starting centroid.
initial_centroids = np.genfromtxt(
    "../input/EM-Cluster/EM_Clustering_initial_centroids.csv",
    delimiter=",",
)
initial_members = np.argmin(
    spa.distance_matrix(initial_centroids, X),
    axis=0,
)

# Per-cluster covariance of the assigned points around the starting
# centroid, normalized by the number of assigned points.
initial_cov = np.array([
    (deviations.T @ deviations) / len(deviations)
    for deviations in (
        X[initial_members == c] - initial_centroids[c] for c in range(K)
    )
])

# Prior of a cluster = fraction of observations hard-assigned to it.
initial_priors = [np.mean(initial_members == c) for c in range(K)]

print("---Initial Centroids--- \n", initial_centroids, initial_centroids.shape)
print("---Initial Covariances--- \n", initial_cov, initial_cov.shape)
print("---Initial Priors--- \n", initial_priors)

### 3. EM Algorithm Steps

In [5]:
def update_parameters(memberships, X):
    """M-step: re-estimate centroids, covariances and priors.

    Parameters
    ----------
    memberships : ndarray of shape (n_samples, n_clusters) or None
        Soft assignment weights; column ``c`` holds the weight of every
        sample for cluster ``c``. When ``None`` (no E-step has run yet) the
        module-level ``initial_centroids``, ``initial_cov`` and
        ``initial_priors`` are returned unchanged.
    X : ndarray of shape (n_samples, n_features)
        Data matrix.

    Returns
    -------
    tuple
        ``(centroids, cov, priors)`` with shapes ``(n_clusters, n_features)``,
        ``(n_clusters, n_features, n_features)`` and ``(n_clusters,)``.
    """
    if memberships is None:
        # Nothing to estimate from yet: hand back the initial estimates.
        return (initial_centroids, initial_cov, initial_priors)

    # Sizes come from the arguments themselves, so the function stays correct
    # for any data set / cluster count instead of relying on global N, D, K.
    X = np.asarray(X)
    memberships = np.asarray(memberships)
    n_samples, n_features = X.shape
    n_clusters = memberships.shape[1]

    centroids = np.empty((n_clusters, n_features))
    cov = np.empty((n_clusters, n_features, n_features))
    priors = np.empty(n_clusters)

    for c in range(n_clusters):
        weights = memberships[:, c][:, None]
        # Effective number of samples in cluster c; computed once and reused.
        total_weight = np.sum(memberships[:, c])

        # Weighted mean of the samples.
        centroids[c] = np.sum(np.multiply(X, weights), axis=0) / total_weight

        # Weighted scatter around the freshly updated centroid.
        deviations = X - centroids[c]
        cov[c] = (deviations.T @ np.multiply(deviations, weights)) / total_weight

        # Share of the total membership mass owned by cluster c.
        priors[c] = total_weight / n_samples

    return (centroids, cov, priors)
def update_memberships(centroids, cov, priors, X):
    """E-step: soft-assign every sample to every cluster.

    For each cluster the prior-weighted Gaussian density of every sample is
    evaluated (the ``(2*pi)^(-D/2)`` factor is omitted because it is the same
    for all clusters and cancels in the normalization), and each row is then
    normalized to sum to one.

    The computation is carried out in the log domain and each row is shifted
    by its maximum before exponentiating, so samples far away from every
    centroid get a well-defined assignment instead of 0/0 = NaN.

    Parameters
    ----------
    centroids : array-like of shape (n_clusters, n_features)
    cov : array-like of shape (n_clusters, n_features, n_features)
    priors : array-like of shape (n_clusters,)
    X : array-like of shape (n_samples, n_features)

    Returns
    -------
    ndarray of shape (n_samples, n_clusters)
        Membership probabilities; every row sums to 1.
    """
    X = np.asarray(X, dtype=float)
    centroids = np.asarray(centroids, dtype=float)
    cov = np.asarray(cov, dtype=float)
    n_samples = X.shape[0]
    n_clusters = centroids.shape[0]

    # A zero prior legitimately maps to -inf (zero membership); silence the
    # divide warning that np.log would emit for it.
    with np.errstate(divide="ignore"):
        log_priors = np.log(np.asarray(priors, dtype=float))

    log_scores = np.empty((n_samples, n_clusters))
    for c in range(n_clusters):
        deviations = X - centroids[c]
        # One linear solve per cluster replaces one matrix inversion per
        # sample; the row-wise dot product yields the squared Mahalanobis
        # distance of every sample at once.
        solved = np.linalg.solve(cov[c], deviations.T).T
        mahalanobis_sq = np.sum(deviations * solved, axis=1)
        _, log_det = np.linalg.slogdet(cov[c])
        log_scores[:, c] = log_priors[c] - 0.5 * log_det - 0.5 * mahalanobis_sq

    # Shifting by the row maximum leaves the normalized result unchanged but
    # guarantees at least one term equals exp(0) = 1, so the row sum is > 0.
    log_scores -= np.max(log_scores, axis=1, keepdims=True)
    scores = np.exp(log_scores)
    memberships = scores / np.sum(scores, axis=1, keepdims=True)
    return memberships

### 4. Visualization

In [6]:
def plot_current_state(centroids, memberships, X, cov, draw_Contour):
    """Plot the samples, the centroids and optionally density contours.

    Draws onto the current matplotlib figure; the caller is responsible for
    creating the figure and calling ``plt.show()``.

    Parameters
    ----------
    centroids : ndarray of shape (n_clusters, 2)
        Current cluster centers, drawn as colored squares.
    memberships : ndarray of shape (n_samples,) or None
        Hard cluster label per sample (callers pass the argmax of the soft
        memberships). When ``None`` all samples are drawn in black and no
        contours are drawn.
    X : ndarray of shape (n_samples, 2)
        Data matrix.
    cov : ndarray of shape (n_clusters, 2, 2)
        Current covariance estimates; only used when contours are drawn.
    draw_Contour : bool
        When true, draw for every cluster the 0.05 density level of the
        module-level reference Gaussian (``original_means`` /
        ``original_cov``, dashed black) and of the estimated Gaussian
        (solid, cluster color).
    """
    cluster_colors = np.array(["#1f78b4", "#33a02c", "#e31a1c", "#ff7f00", "#6a3d9a", "#b15928",
                               "#a6cee3", "#b2df8a", "#fb9a99", "#fdbf6f", "#cab2d6", "#ffff99"])
    # Use the number of centroids actually passed in rather than a global.
    n_clusters = centroids.shape[0]

    if memberships is None:
        plt.plot(X[:, 0], X[:, 1], ".", markersize=10, color="black")
    else:
        if draw_Contour:
            # The evaluation grid is only needed for contours, so it is not
            # built for the intermediate (contour-free) plots.
            x1 = np.linspace(-5, +5, 201)
            x2 = np.linspace(-5, +5, 201)
            x1_grid, x2_grid = np.meshgrid(x1, x2)
            X_test = np.column_stack((x1_grid.ravel(), x2_grid.ravel()))

        for c in range(n_clusters):
            if draw_Contour:
                # Density of the reference Gaussian on the grid (dashed).
                z_original = stats.multivariate_normal.pdf(
                    X_test, original_means[c], original_cov[c])
                plt.contour(x1_grid, x2_grid, np.reshape(z_original, x1_grid.shape),
                            levels=[0.05], colors="k", linestyles="dashed")
                # Density of the estimated Gaussian on the grid (solid).
                z_estimated = stats.multivariate_normal.pdf(
                    X_test, centroids[c], cov[c])
                plt.contour(x1_grid, x2_grid, np.reshape(z_estimated, x1_grid.shape),
                            levels=[0.05], colors=cluster_colors[c], linestyles="solid")
            # Samples assigned to cluster c, in the cluster's color.
            assigned = memberships == c
            plt.plot(X[assigned, 0], X[assigned, 1], ".", markersize=10,
                     color=cluster_colors[c])

    # Centroids are drawn last so they sit on top of the samples.
    for c in range(n_clusters):
        plt.plot(centroids[c, 0], centroids[c, 1], "s", markersize=12,
                 markerfacecolor=cluster_colors[c], markeredgecolor="black")
    plt.xlabel("x1")
    plt.ylabel("x2")

### 5. Iterations

In [7]:
# Run the EM iterations. Each pass first refreshes the parameters from the
# current memberships (M-step) and then recomputes the memberships from those
# parameters (E-step). On the very first pass memberships is None, so the
# parameter update simply returns the initial estimates.
iteration_limit = 101  # for N iterations enter (N + 1); the loop stops before reaching it
idv_ploting = False    # Change this to True for individual analysis/plots

memberships = None
# Placeholders sized from K and D so the names exist even if no pass runs.
centroids = np.empty((K, D))
cov = np.empty((K, D, D))
priors = np.empty(K)

# `iteration` ends up holding the number of completed passes
# (iteration_limit - 1), or 0 when the limit allows no pass at all; a
# non-positive limit therefore can no longer cause an endless loop.
iteration = 0
for iteration in range(1, iteration_limit):
    # M step followed by E step
    centroids, cov, priors = update_parameters(memberships, X)
    memberships = update_memberships(centroids, cov, priors, X)

    # Optional per-iteration report and plot (without contours); hidden by default
    if idv_ploting:
        print("Iteration#{}:".format(iteration))
        print("{} \n {} \n {}".format(centroids,cov,priors))
        plt.figure(figsize=(8, 8))
        plot_current_state(centroids, np.argmax(memberships, axis=1), X, cov, draw_Contour=False)
        plt.show()
    

### 6. Final Parameters & Plot

In [8]:
# Report the iteration count and the parameters held when the loop stopped.
print("Iteration#{}:".format(iteration))
print("---Centroids--- \n", centroids)
print("---Covariances--- \n", cov)
print("---Priors--- \n", priors)

# Final figure: every sample colored by its most probable cluster, with the
# reference (dashed) and estimated (solid) density contours overlaid.
plt.figure(figsize=(8, 8))
plot_current_state(
    centroids,
    np.argmax(memberships, axis=1),
    X,
    cov,
    draw_Contour=True,
)
plt.title(
    "Distribution of Clusters in {}th Iteration".format(iteration),
    fontsize=16,
)
plt.show()
 