In [151]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from scipy.special import logsumexp
import random
import math
from scipy.stats import multivariate_normal
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import MinMaxScaler

In [152]:
# Seed NumPy's global random number generator for reproducibility.
# NOTE(review): the stratified split created later is built with its own
# random_state, so it does not depend on this seed.
np.random.seed(42)

In [153]:
# Read the raw table into a NumPy array.
train = pd.read_csv('./codon_usage.csv').to_numpy()
# Remove rows 486 and 5063 (nothing is shuffled here).
# NOTE(review): presumably these two rows hold values that cannot be cast to
# float below -- confirm against the CSV.
train = np.delete(train, (486,5063), axis=0)
# Column 0 is kept as the label vector.
y = train[:, 0].copy()
# The first five columns are dropped; the remaining ones are the features.
X = train[:, 5:].astype(np.float32)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [154]:
# Rescale the feature matrix with a MinMaxScaler fitted on X itself; the
# fitted scaler stays available as `scaler`.
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

## Note: Only 20% of the data is used (the subset is a stratified sample, so it should not matter), as discussed in Thread #307

In [155]:
# Stratified shuffle splitter: 5 splits, each holding out 20% of the rows,
# with a fixed random_state so the splits are reproducible.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
# Last expression of the cell: displays the number of splits the splitter yields.
sss.get_n_splits(X, y)

5

In [156]:
def Gaussian_Kernel(x_t,x_i,h):
    """Gaussian kernel weight between one point and every row of a data matrix.

    Computes ``(2*pi)**(-d/2) * exp(-||x_i - x_t||**2 / (2*h**2))`` for each
    row of ``x_i``, where ``d`` is the number of columns of ``x_i``.  (The
    original hard-coded ``d = 64``; for 64-column input the result is
    identical.)

    Parameters
    ----------
    x_t : array-like, shape (d,)
        The query point (anything that broadcasts against the rows of ``x_i``).
    x_i : array-like, shape (n, d)
        Data matrix, one point per row.
    h : float
        Kernel bandwidth.

    Returns
    -------
    numpy.ndarray, shape (n, 1)
        One kernel value per row of ``x_i``.
    """
    x_i = np.asarray(x_i)
    # Take the dimensionality from the data rather than assuming 64 features.
    d = x_i.shape[-1]
    norm_const = 1/((2*math.pi)**(d/2))
    diff = x_i - x_t
    sq_dist = np.sum(diff*diff, axis=1, keepdims=True)
    return norm_const*np.exp(-sq_dist/(2*h**2))

In [157]:
def findAttractor(x,D,h,eps,max_iter=None):
    """Follow kernel-weighted mean updates from ``x`` until they stop moving.

    Each step replaces the current point by the average of the rows of ``D``
    weighted by ``Gaussian_Kernel(current, D, h)``.  Iteration stops once the
    step length is no longer ``>= eps``.

    Parameters
    ----------
    x : array-like, shape (d,)
        Starting point.
    D : numpy.ndarray, shape (n, d)
        Data matrix used for the weighted mean.
    h : float
        Kernel bandwidth passed to ``Gaussian_Kernel``.
    eps : float
        Convergence tolerance on the Euclidean step length.
    max_iter : int or None, optional
        Upper bound on the number of update steps.  ``None`` (the default)
        keeps the original behaviour of iterating until convergence.

    Returns
    -------
    numpy.ndarray, shape (d,)
        The last point reached.
    """
    x_next = np.copy(x)
    t = 0
    while max_iter is None or t < max_iter:
        x_prev = x_next
        k_i = Gaussian_Kernel(x_prev,D,h)
        denominator = np.sum(k_i)
        if denominator == 0:
            # Every kernel weight underflowed to zero: the weighted mean would
            # be 0/0 (NaN), so stay at the last well-defined point instead.
            break
        x_next = np.sum((k_i*D),axis = 0)/denominator
        t += 1
        # Written as "not >=" so that a NaN step length also ends the loop,
        # exactly as the original loop condition did.
        if not np.linalg.norm(x_next-x_prev) >= eps:
            break
    return x_next

In [158]:
def f_hat(x_star,n,h,d,D):
    """Kernel density estimate at ``x_star``.

    Sums ``Gaussian_Kernel(x_star, D, h)`` over all rows of ``D`` and divides
    the total by ``n * h**d``.

    Parameters
    ----------
    x_star : point at which the density is evaluated.
    n : sample count used in the normalisation.
    h : kernel bandwidth.
    d : dimensionality used as the exponent of ``h``.
    D : data matrix whose rows contribute kernel mass.

    Returns
    -------
    The scalar density estimate.
    """
    kernel_total = np.sum(Gaussian_Kernel(x_star, D, h))
    scale = n * h**d
    return kernel_total / scale

In [159]:
h = 0.2        # kernel bandwidth
eta = 5e16     # density threshold an attractor must reach to be kept
eps = 0.001    # convergence tolerance passed to findAttractor
alpha = list()       # attractors that passed the density threshold
R = dict()           # position in alpha -> row position in selected_train
f_hats = list()      # density of every attractor, kept or not
used_index = []      # row positions in selected_train whose attractor was kept
# Take the held-out indices of the first split once and reuse them, instead
# of running the splitter a second time to index X.
selected_index = next(sss.split(X,y))[1]
selected_train = X[selected_index]
# Read sample count and dimensionality from the data instead of hard-coding 64.
n_selected, n_features = selected_train.shape
for i,x in enumerate(selected_train):
    x_star = findAttractor(x,selected_train,h,eps)
    # NOTE(review): the density is summed over the full matrix X but normalised
    # by the size of the 20% sample. Kept as-is because eta was chosen against
    # these values; switching D to selected_train requires re-tuning eta.
    density = f_hat(x_star,n_selected,h,n_features,X)
    f_hats.append(density)
    if density >= eta:
        alpha.append(x_star)
        R[len(alpha)-1] = i
        used_index.append(i)

In [272]:
# Distance below which two attractors are linked (this reuses the name `eps`).
eps = 4e-3
n_alpha = len(alpha)
adj_table = []
for i in range(n_alpha):
    # Each close pair is recorded once, under the lower index, so the
    # adjacency lists only contain "forward" neighbours (j > i).
    near = [j for j in range(i + 1, n_alpha)
            if np.linalg.norm(alpha[i] - alpha[j]) <= eps]
    adj_table.append(near)

In [161]:
def DFS(temp, v, visited,adj_table):
    """Depth-first traversal from ``v``, appending visited vertices to ``temp``.

    Implemented with an explicit stack so that large components cannot hit
    Python's recursion limit; vertices are appended in the same order a
    recursive traversal would produce.

    Parameters
    ----------
    temp : list
        Output list; visited vertices are appended in visiting order.
    v : int
        Start vertex (always marked and appended).
    visited : list of bool
        Visit flags, updated in place.
    adj_table : sequence of sequences of int
        ``adj_table[u]`` lists the vertices reachable from ``u``.

    Returns
    -------
    list
        The same ``temp`` list.
    """
    visited[v] = True
    temp.append(v)
    # Push neighbours in reverse so they are popped in their listed order.
    stack = [u for u in reversed(adj_table[v]) if not visited[u]]
    while stack:
        node = stack.pop()
        if visited[node]:
            # Reached earlier through another path while waiting on the stack.
            continue
        visited[node] = True
        temp.append(node)
        stack.extend(u for u in reversed(adj_table[node]) if not visited[u])
    return temp

def connectedComponents(alpha,adj_table):
    """Connected components of the graph described by ``adj_table``.

    Edges are treated as undirected: an edge listed under only one of its
    endpoints still joins both vertices.  The input table is not modified.

    Parameters
    ----------
    alpha : sized collection
        Only ``len(alpha)`` is used; it gives the number of vertices.
    adj_table : sequence of sequences of int
        ``adj_table[u]`` lists neighbours of ``u``; each edge may appear in
        one direction or in both.

    Returns
    -------
    list of list of int
        One list of vertex indices per component, ordered by the smallest
        vertex of each component.
    """
    n = len(alpha)
    # Build a symmetric copy so an edge stored as u -> w is also followed w -> u.
    neighbours = [list(adj_table[u]) for u in range(n)]
    for u in range(n):
        for w in adj_table[u]:
            neighbours[w].append(u)
    visited = [False]*n
    cc = []
    for v in range(n):
        if not visited[v]:
            cc.append(DFS([], v, visited, neighbours))
    return cc

In [277]:
cc = connectedComponents(alpha,adj_table)

In [278]:
# Cluster id per selected sample; -1 marks samples that belong to no component.
pred = [-1 for _ in selected_index]
for cluster_id, members in enumerate(cc):
    for attractor_id in members:
        # R translates an attractor's position in alpha to its sample row.
        pred[R[attractor_id]] = cluster_id

In [274]:
def one_hot_encoding(data):
    """One-hot encode a 1-D collection of hashable, sortable values.

    Columns follow the sorted order of the distinct values, so the encoding is
    reproducible across interpreter runs (iteration order of a set of strings
    is not).

    Parameters
    ----------
    data : array-like, shape (n,)
        Category value of each sample.

    Returns
    -------
    numpy.ndarray, shape (n, k)
        Float matrix with a single 1 per row, where ``k`` is the number of
        distinct values in ``data``.
    """
    values = np.asarray(data).tolist()
    categories = sorted(set(values))
    # Dictionary lookup instead of a linear list.index search per row.
    column_of = {value: col for col, value in enumerate(categories)}
    zeros = np.zeros((len(values), len(categories)))
    for row, value in enumerate(values):
        zeros[row, column_of[value]] = 1
    return zeros


In [279]:
# Encode the labels of the selected rows. Reusing selected_index (instead of
# running the splitter again) ties the labels to exactly the rows that were
# clustered.
y_temp = one_hot_encoding(y[selected_index])
# Integer class id of each selected sample (column position of its 1).
label = np.argmax(y_temp,axis = 1)

In [280]:
# Keep only the samples whose attractor passed the density threshold; their
# row positions were collected in used_index.
# NOTE(review): label and pred are overwritten in place, so running this cell
# twice re-indexes already filtered arrays -- re-run the cells that build
# them first.
label = label[used_index]
pred = np.array(pred)[used_index]

In [281]:
# Contingency table: rows are true classes, columns are clusters.
# The class count comes from the one-hot matrix instead of a hard-coded 11,
# and a concrete integer dtype replaces the deprecated abstract np.integer.
n_classes = y_temp.shape[1]
contigency_table = np.zeros((n_classes,len(cc)),dtype = np.int64)
# Unbuffered add so repeated (class, cluster) pairs are each counted.
np.add.at(contigency_table, (label, pred), 1)
col_sum = np.sum(contigency_table,axis = 0)
row_sum = np.sum(contigency_table,axis = 1)
total = np.sum(contigency_table)
# Entropies over the non-empty marginals only: an empty class or cluster
# contributes nothing, whereas 0*log(0) would evaluate to NaN.
p_cluster = col_sum[col_sum > 0]/total
p_class = row_sum[row_sum > 0]/total
H_T = -np.sum(p_cluster*np.log(p_cluster))
H_C = -np.sum(p_class*np.log(p_class))
# Mutual information, accumulated over the non-zero cells in row-major order.
I_CT = 0
for i, j in zip(*np.nonzero(contigency_table)):
    n_ij = contigency_table[i][j]
    I_CT += (n_ij/total)*np.log((n_ij*total)/((row_sum[i])*(col_sum[j])))
# Geometric-mean normalisation; a zero entropy (a single class or a single
# cluster) means there is no shared information to report.
normaliser = np.sqrt(H_C*H_T)
NMI = I_CT/normaliser if normaliser > 0 else 0.0

  """Entry point for launching an IPython kernel.


In [282]:
# Summary of the run. The stratified sample comes from scikit-learn's
# StratifiedShuffleSplit, so the note names scikit-learn rather than scipy.
print("Note: Only 20% of the data are used (stratified using scikit-learn so it does not matter) as discussed in Thread #307 due to efficiency problem")
print(f"Class Size is: {len(cc)}")
print(f"The class distribution is: {[len(component) for component in cc]}")
print(f"The NMI Score is: {NMI}")

Note: Only 20% of the data are used (stratified using scipy so it does not matter) as discussed in Thread #307 due to efficiency problem
Class Size is: 9
The class distribution is: [1910, 3, 39, 427, 15, 14, 74, 21, 3]
The NMI Score is: 0.3362311094353266
