# Import Packages

In [1]:
import sys

sys.path.append('../')

In [2]:
import numpy as np
from numpy.random import RandomState
from tqdm.notebook import tqdm

In [3]:
from src.evaluation_metrics import (
    max_intra_distance,
    min_inter_distance
)
from src.gaussian_mixture import GMMUtils
from src.merge_components import (
    EntMergeComponents,
    NEnt1MergeComponents,
    DEMPMergeComponents,
    DEMP2MergeComponents,
    MCMergeComponents,
    NMCMergeComponents
)

# Helper Functions

In [4]:
def create_artificial_prob(seed, *, n_components=50, n_dims=2, n_samples=5000):
    """
    Create unmerged probabilities (prob_0) from the artificial dataset.

    A Gaussian mixture is set up with mixing weights drawn from a
    Dirichlet(1, ..., 1) distribution, component means drawn as three times
    a standard normal, and identity covariance matrices. Points are then
    sampled from it with ``GMMUtils.sample``.

    Args:
        seed (int): random seed, used both for drawing the mixture
            parameters and as ``random_state`` when sampling the points.
        n_components (int): number of mixture components (keyword-only).
        n_dims (int): dimensionality of the sampled points (keyword-only).
        n_samples (int): number of points to sample (keyword-only).
    Returns:
        Tuple[ndarray, ndarray]: X, prob_0, where prob_0 is the value of
            ``gmm.prob_latent(X)``.
    """
    rng = RandomState(seed)
    # Keep the draw order (weights first, then means): changing it would
    # change the mixture produced by a given seed.
    rho = rng.dirichlet(alpha=np.ones(n_components))
    means = rng.randn(n_components, n_dims) * 3
    covariances = [np.eye(n_dims) for _ in range(n_components)]
    gmm = GMMUtils(rho, means, covariances)
    X = gmm.sample(N=n_samples, random_state=seed)
    prob_0 = gmm.prob_latent(X)
    return X, prob_0

# Artificial Experiment

In [5]:
T = 100  # number of trials
L_list = [40, 30, 20, 10, 5]  # numbers of components at which the distances are measured

# Merging algorithms under comparison; a fresh instance of each is created
# for every trial. The result arrays are sized from this list so that adding
# or removing an algorithm needs no other change in this cell.
algorithm_classes = [
    EntMergeComponents,
    NEnt1MergeComponents,
    DEMPMergeComponents,
    DEMP2MergeComponents,
    MCMergeComponents,
    NMCMergeComponents,
]

# Axes: (algorithm, entry of L_list, trial).
results_intra = np.zeros([len(algorithm_classes), len(L_list), T])
results_inter = np.zeros([len(algorithm_classes), len(L_list), T])

for t in tqdm(range(T)):
    X, prob_0 = create_artificial_prob(seed=t + 100)
    algorithm_list = [algorithm_class() for algorithm_class in algorithm_classes]
    # fit
    for algorithm in algorithm_list:
        algorithm.fit(prob_0)
    # calculate distances
    for i, algorithm in enumerate(algorithm_list):
        for j, L in enumerate(L_list):
            # Compute the merged probabilities once and feed both metrics.
            # NOTE(review): assumes prob_merged is deterministic and that the
            # metric functions do not modify their input -- confirm.
            prob_merged = algorithm.prob_merged(prob_0, L)
            results_intra[i, j, t] = max_intra_distance(X, prob_merged)
            results_inter[i, j, t] = min_inter_distance(X, prob_merged)

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




# Create Ranking

In [6]:
def rank(a):
    """
    Return the 1-based ascending rank of every element of ``a``.

    Equal values share the average of the rank positions they occupy, e.g.
    ``[10, 20, 20, 30]`` is ranked ``[1, 2.5, 2.5, 4]``.

    Args:
        a (ndarray): values to rank.
    Returns:
        ndarray: float ranks, one per element of ``a``.
    """
    distinct, multiplicity = np.unique(a, return_counts=True)
    # The running total of the counts is the last (largest) rank position
    # held by each distinct value.
    last_position = np.cumsum(multiplicity)
    # Stepping back by half the width of the tie group gives its mean rank.
    tie_offset = 0.5 * (multiplicity - 1)
    mean_rank = last_position - tie_offset
    # Map each element back to the slot of its distinct value.
    slot = np.searchsorted(distinct, a)
    return mean_rank[slot]

In [7]:
# Rank the algorithms against each other for every (L, trial) pair by their
# intra distance: the smallest value receives rank 1.
# The output shape is taken from results_intra (algorithm, L, trial) rather
# than restated, so the two arrays cannot get out of sync.
ranks_intra = np.zeros_like(results_intra)
for j in range(results_intra.shape[1]):
    for t in range(results_intra.shape[2]):
        ranks_intra[:, j, t] = rank(results_intra[:, j, t])

# Mean rank over trials; rows are algorithms, columns are entries of L_list.
print(np.mean(ranks_intra, axis=2))

[[6.    6.    6.    5.94  5.58 ]
 [3.67  4.145 3.92  4.15  4.18 ]
 [4.305 4.56  4.065 2.57  2.13 ]
 [2.4   2.525 2.45  2.125 1.905]
 [2.255 2.04  3.215 4.525 4.94 ]
 [2.37  1.73  1.35  1.69  2.265]]


In [8]:
# Rank the algorithms against each other for every (L, trial) pair by their
# inter distance. The values are negated before ranking, so the largest
# distance receives rank 1.
# The output shape is taken from results_inter (algorithm, L, trial) rather
# than restated, so the two arrays cannot get out of sync.
ranks_inter = np.zeros_like(results_inter)
for j in range(results_inter.shape[1]):
    for t in range(results_inter.shape[2]):
        ranks_inter[:, j, t] = rank(- results_inter[:, j, t])

# Mean rank over trials; rows are algorithms, columns are entries of L_list.
print(np.mean(ranks_inter, axis=2))

[[5.02  5.145 5.375 5.505 5.19 ]
 [4.285 4.23  4.255 4.56  3.99 ]
 [4.99  4.97  4.815 2.88  2.635]
 [3.555 3.55  3.17  2.855 2.485]
 [2.09  2.06  2.35  3.905 4.86 ]
 [1.06  1.045 1.035 1.295 1.84 ]]
