In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from scipy.special import logsumexp
import random
from scipy.stats import multivariate_normal

In [2]:
np.random.seed(42)

In [3]:
train = np.array(pd.read_csv('./codon_usage.csv'))
# shuffle the dataset
train = np.delete(train, (486,5063), axis=0)
y = np.copy(train[:,0])
X = np.delete(train,(0,1,2,3,4), axis=1)
X = X.astype(np.float32)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
def one_hot_encoding(data):
    num_count = set()
    for i in data:
        num_count.add(i)
    num_count = list(num_count)
    data = np.copy(data)
    zeros = np.zeros((data.shape[0],len(num_count)))
    for i in range(data.shape[0]):
        zeros[i,num_count.index(data[i])] = 1
    return zeros
y = one_hot_encoding(y)

In [5]:
def assign_closest_points(centers, data_point):
    dis,idx = np.inf,0
    for i in range(len(centers)):
        distance = np.linalg.norm(data_point-centers[i])
        if distance<dis:
            dis = distance
            idx = i
    return (data_point,idx)
def cov(points):
    sum = np.copy(points[0])
    for i in range(1,len(points)):
        sum += points[i]
    mean = sum/len(points)
    points = points - mean
    sigma = (points[0].reshape(-1,1))@(points[0].reshape(-1,1).T)
    for i in range(1,len(points)):
        sigma += (points[i].reshape(-1,1))@(points[i].reshape(-1,1).T)
    return sigma/len(points),mean

In [6]:
mus = []
sigmas_list = []
labels = []
for iter in range(1,11):
    eps = 1e-3
    k = len(y[0])
    k_centers_index = np.random.randint(low = 0, high = X.shape[0], size = k)
    k_centers_points = np.copy(X[k_centers_index,:])
    points_idx = [assign_closest_points(k_centers_points,X[i]) for i in range(X.shape[0])]
    assigned_points = dict()
    for point,i in points_idx:
        if i in assigned_points.keys():
            assigned_points[i].append(point)
        else:
            assigned_points[i] = [point]
    sigma_vals = []
    for i in range(k):
        sigmas, mean = cov(assigned_points[i])
        k_centers_points[i] = np.copy(mean)
        sigma_vals.append(sigmas)
    prob = [len(assigned_points[i])/X.shape[0] for i in range(k)]
    mu = np.copy(k_centers_points)
    w = np.zeros((k, X.shape[0]))
    t = 0
    while np.sum([np.linalg.norm(mu[i]-k_centers_points[i]) for i in range(mu.shape[0])]) > eps or t == 0:
        k_centers_points = np.copy(mu)
        log_w = np.zeros(w.shape)
        for i in range(k):
            f_val = multivariate_normal.logpdf(X, mean = mu[i], cov = sigma_vals[i]+(1e-5)*np.eye(64), allow_singular=True)
            log_w[i,:] = np.log(2*eps)+f_val + np.log(prob[i])
        for j in range(X.shape[0]):
            w[:,j] = np.exp(log_w[:,j] - logsumexp(log_w[:,j]))
        for i in range(k):
            mu_numerator,denominator,sigma_numerator,P_numerator = 0,0,0,0
            for j in range(X.shape[0]):
                mu_numerator += w[i][j]*X[j,:] 
                denominator += w[i][j]
            mu[i] = mu_numerator/denominator
            centered = X - mu[i]
            for j in range(X.shape[0]):
                sigma_numerator += w[i][j]*(centered[j].reshape(-1,1)@centered[j].reshape(-1,1).T)
            sigma_vals[i] = np.copy(sigma_numerator/denominator)
            prob[i] = np.copy(denominator/X.shape[0])
        t = t+1
    label = np.argmax(y,axis = 1)
    pred  = np.argmax(w,axis = 0)
    labels.append(np.copy(label))
    mus.append(np.copy(mu))
    sigmas_list.append(np.copy(sigma_vals))
    contigency_table = np.zeros((k,k),dtype = np.integer)
    for i in range(len(pred)):
        contigency_table[label[i]][pred[i]] += 1
    col_sum = np.sum(contigency_table,axis = 0)
    row_sum = np.sum(contigency_table,axis = 1)
    H_T = -np.sum(col_sum/np.sum(col_sum)*np.log(col_sum/np.sum(col_sum)))
    H_C = -np.sum(row_sum/np.sum(row_sum)*np.log(row_sum/np.sum(row_sum)))
    I_CT = 0
    for i in range(k):
        for j in range(k):
            if (contigency_table[i][j] != 0):
                I_CT += (contigency_table[i][j]/np.sum(row_sum))*np.log((contigency_table[i][j]*np.sum(row_sum))/((row_sum[i])*(col_sum[j])))
    NMI = I_CT/(np.sqrt(H_C*H_T))
    print(f'finished with {t} iterations! NMI is {NMI}')




finished with 36 iterations! NMI is 0.35116187914114155
finished with 30 iterations! NMI is 0.3584236115108929
finished with 33 iterations! NMI is 0.43568224720570703
finished with 26 iterations! NMI is 0.38483766583580353
finished with 46 iterations! NMI is 0.3500553699572617
finished with 27 iterations! NMI is 0.4312192999626904
finished with 26 iterations! NMI is 0.403949733455893
finished with 30 iterations! NMI is 0.3657396920854319
finished with 30 iterations! NMI is 0.4116406880135222
finished with 41 iterations! NMI is 0.358570050235324


The Best NMI is 0.44 for the 3rd initialization value

In [24]:
best_mean = mus[2]
best_cov = sigmas_list[2]
best_cluster_dist = labels[2]
for i in range(k):
    print(f'The best mean for cluster {i} is:')
    print(best_mean[i])

The best mean for cluster 0 is:
[0.02368443 0.02065003 0.01341875 0.019096   0.0139325  0.01180214
 0.00597069 0.02213109 0.02686099 0.02436426 0.00919575 0.0266513
 0.01989228 0.01681875 0.01248011 0.02122705 0.02258515 0.02467105
 0.02078806 0.01787422 0.01090978 0.00916427 0.00977882 0.01257251
 0.01351222 0.02244222 0.02525175 0.01457154 0.00936678 0.01040212
 0.01188589 0.01004339 0.00816326 0.00888809 0.01332452 0.01310503
 0.01984521 0.01398706 0.01256277 0.01835881 0.01848054 0.01738437
 0.01837332 0.02223524 0.02526401 0.00620043 0.00777998 0.01051999
 0.01021071 0.03435555 0.02475781 0.01232014 0.01339227 0.0045479
 0.00526683 0.00631312 0.00427042 0.03041247 0.0234588  0.03617611
 0.02294369 0.00187262 0.00048844 0.00074027]
The best mean for cluster 1 is:
[0.02590151 0.02133166 0.01477784 0.01780349 0.02199082 0.01666818
 0.00894176 0.01128482 0.02749196 0.01678555 0.00925932 0.02187428
 0.02949071 0.01548961 0.01121105 0.01274967 0.03509164 0.01782137
 0.01607732 0.0072840

In [25]:
for i in range(k):
    print(f'The best covariance for cluster {i} is:')
    print(best_cov[i])

The best covariance for cluster 0 is:
[[ 1.06943349e-04 -5.32915647e-06  4.30145410e-05 ...  4.71258630e-07
   8.93837694e-07  1.19990068e-06]
 [-5.32915647e-06  1.41693527e-04 -5.21654765e-05 ... -6.76368529e-07
   1.18927153e-06 -5.12856161e-07]
 [ 4.30145410e-05 -5.21654765e-05  1.09537985e-04 ...  3.63676321e-07
   9.07564939e-08  9.40598511e-07]
 ...
 [ 4.71258630e-07 -6.76368529e-07  3.63676321e-07 ...  1.19190145e-06
  -4.65793519e-08 -1.07725441e-07]
 [ 8.93837694e-07  1.18927153e-06  9.07564939e-08 ... -4.65793519e-08
   3.68952215e-07 -4.56298181e-08]
 [ 1.19990068e-06 -5.12856161e-07  9.40598511e-07 ... -1.07725441e-07
  -4.56298181e-08  1.06634332e-06]]
The best covariance for cluster 1 is:
[[ 1.6396269e-04 -7.0384405e-05  5.3639746e-05 ... -2.8298236e-06
  -1.5219244e-06 -2.5582985e-06]
 [-7.0384405e-05  8.6239481e-05 -3.6793175e-05 ...  1.2957079e-06
   6.8616754e-07 -6.1490283e-07]
 [ 5.3639746e-05 -3.6793175e-05  5.8685520e-05 ...  3.9908267e-07
  -2.4216340e-07 -5.9299

In [26]:
print('The best cluster distribution is:')
for i in range(k):
    print(f'cluster {i}: {np.sum(best_cluster_dist == i)}')

The best cluster distribution is:
cluster 0: 2919
cluster 1: 1345
cluster 2: 215
cluster 3: 2831
cluster 4: 18
cluster 5: 2523
cluster 6: 220
cluster 7: 180
cluster 8: 126
cluster 9: 2077
cluster 10: 572
