In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import norm
from scipy.stats import invgamma
import utils
import hashlib
from sklearn.mixture import GaussianMixture
from mixtureModels import DiagGaussianFS
from mixtureModels import DiagGaussian
from mixtureModels import DiagGaussianCategorical
from sklearn.metrics.cluster import adjusted_rand_score
import time
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import multiprocessing
from ucimlrepo import fetch_ucirepo 
import os

In [2]:
seed = 3489
# seed = np.rendom.randint(1, 2**16 - 1)
np.random.seed(seed)

## Cell motif human [Gaurik] data

In [4]:
data_dir = "dataGaurik"
filename = "cell_motif_r_n_human"

X = utils.extractData(f"{data_dir}/{filename}.csv", "data")
z_true = utils.extractData(f"{data_dir}/{filename}.labels", "labels")
K = len(set(z_true))

#### Run Models

In [None]:
start_time = time.process_time()
training_runs_EM = 2

EM_gmm = GaussianMixture(n_components=K, n_init=training_runs_EM, covariance_type="diag")
EM_gmm.fit(X)

print(f"BIC: {EM_gmm.bic(X)}\nK: {K}")
z_pred_EM = EM_gmm.predict(X)
utils.saveData(f"out{data_dir}/{filename}.pred.em.labels", z_pred_EM, "labels")
utils.saveData(f"out{data_dir}/{filename}.pred.em.time", time.process_time() - start_time, "single")


In [5]:
start_time = time.process_time()
est = DiagGaussian(alpha = 1.0, m_0_partial = 0, k_0 = 0.03, v_0_partial = 3, S_0_partial = 0.3, K_initial = K, iterations=30, runs = 2)
est.sample(X)

bayes_pred = est.assignments()
utils.saveData(f"out{data_dir}/{filename}.pred.bayes.labels", bayes_pred, "labels")
utils.saveData(f"out{data_dir}/{filename}.pred.bayes.time", time.process_time() - start_time, "single")

29/30               
Run: 1, K:6, BIC: 53517060.17186609, logmax post: -26747357.68254354, max_post_iter: 23

Run:  2
29/30               
Run: 2, K:6, BIC: 53517047.2751954, logmax post: -26747351.234208193, max_post_iter: 17


'outdataGaurik/cell_motif_r_n_human.pred.bayes.time'

In [4]:
start_time = time.process_time()
est = DiagGaussianFS(alpha = 1.0, m_0_partial = 0, k_0 = 0.03, v_0_partial = 3, S_0_partial = 0.3, K_initial = K, iterations=10, runs = 2, FS=True)
est.sample(X)

bayes_pred_fs = est.assignments()
utils.saveData(f"out{data_dir}/{filename}.pred.bayes.fs.labels", bayes_pred_fs, "labels")
utils.saveData(f"out{data_dir}/{filename}.pred.bayes.fs.time", time.process_time() - start_time, "single")


Run:  1
9/10               
Run: 1, K:6, BIC: 51393784.20071914, logmax post: -25696709.10035957, max_post_iter: 2
Final features:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 1 1 ... 1 1 1]]

Run:  2
9/10               
Run: 2, K:6, BIC: 51456816.867024235, logmax post: -25728225.433512118, max_post_iter: 2
Final features:
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [1 1 1 ... 1 1 1]]


'outdataGaurik/cell_motif_r_n_human.pred.bayes.fs.time'

#### Evaluate the performance

In [5]:
EM_labels = utils.extractData(f"out{data_dir}/{filename}.pred.em.labels", "labels")
bayes_labels = utils.extractData(f"out{data_dir}/{filename}.pred.bayes.labels", "labels")
bayes_fs_labels = utils.extractData(f"out{data_dir}/{filename}.pred.bayes.fs.labels", "labels")

EM_ari = round(adjusted_rand_score(z_true, EM_labels), 3)
bayes_ari = round(adjusted_rand_score(z_true, bayes_labels), 3)
bayes_fs_ari = round(adjusted_rand_score(z_true, bayes_fs_labels), 3)

EM_time = round(float(utils.extractData(f"out{data_dir}/{filename}.pred.em.time", "single")), 3)
bayes_time = round(float(utils.extractData(f"out{data_dir}/{filename}.pred.bayes.time", "single")), 3)
bayes_fs_time = round(float(utils.extractData(f"out{data_dir}/{filename}.pred.bayes.fs.time", "single")), 3)

In [6]:
print(EM_ari, bayes_ari, bayes_fs_ari)
print(EM_time, bayes_time, bayes_fs_time)

0.16 0.194 0.19
674.929 678.136 12065.212


## UCI data

In [3]:
uci_data_dir = "dataUCI"

## Breast Cancer Wisconsin (Diagnostic)

https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic

In [34]:
breast = fetch_ucirepo(id=17)
bb = breast.variables
filename_bb = breast.metadata.name
bb

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,ID,ID,Categorical,,,,no
1,Diagnosis,Target,Categorical,,,,no
2,radius1,Feature,Continuous,,,,no
3,texture1,Feature,Continuous,,,,no
4,perimeter1,Feature,Continuous,,,,no
5,area1,Feature,Continuous,,,,no
6,smoothness1,Feature,Continuous,,,,no
7,compactness1,Feature,Continuous,,,,no
8,concavity1,Feature,Continuous,,,,no
9,concave_points1,Feature,Continuous,,,,no


In [35]:
bb_feature = bb.loc[bb['role'] == 'Feature']
bb_cont_features_df = bb_feature.loc[bb_feature['type'] == 'Continuous']
bb_cat_features_df = bb_feature.loc[bb_feature['type'] == 'Categorical']
bb_cont_features = bb_cont_features_df["name"].tolist()
bb_cat_features = bb_cat_features_df["name"].tolist()

In [260]:
y = np.ravel(breast.data.targets)
clusters = np.unique(y)
clusters_to_int = {clust: i for i, clust in enumerate(clusters)}
z_true_17 = np.vectorize(clusters_to_int.get)(y)
len(set(z_true_17))

2

In [37]:
print(bb_cat_features)
print(bb_cont_features)

[]
['radius1', 'texture1', 'perimeter1', 'area1', 'smoothness1', 'compactness1', 'concavity1', 'concave_points1', 'symmetry1', 'fractal_dimension1', 'radius2', 'texture2', 'perimeter2', 'area2', 'smoothness2', 'compactness2', 'concavity2', 'concave_points2', 'symmetry2', 'fractal_dimension2', 'radius3', 'texture3', 'perimeter3', 'area3', 'smoothness3', 'compactness3', 'concavity3', 'concave_points3', 'symmetry3', 'fractal_dimension3']


In [38]:
N = breast.data.features.shape[0]
gD = len(bb_cont_features)

X = np.zeros((N, gD))

for i in range(gD):
    X[:, i] = breast.data.features[bb_cont_features[i]]

utils.saveData(f"{uci_data_dir}/{filename_bb}.gauss.csv", X, "data")
utils.saveData(f"{uci_data_dir}/{filename_bb}.labels", z_true_17, "labels")


'dataUCI/Breast Cancer Wisconsin (Diagnostic).labels'

In [39]:
start_time = time.perf_counter()

os.system(f"julia mmm.jl '{uci_data_dir}/{filename_bb}.gauss.csv'")
z_pred_mmm_uci_17 = utils.extractData(f"{uci_data_dir}/{filename_bb}.gauss.csv.labels.thichet", "labels")

mmm_ari_uci_17 = round(adjusted_rand_score(z_pred_mmm_uci_17, z_true_17), 3)
mmm_time_uci_17 = round(time.perf_counter() - start_time ,3)

utils.saveData(f"{uci_data_dir}/{filename_bb}.pred.mmm.labels", z_pred_mmm_uci_17, "labels")
utils.saveData(f"{uci_data_dir}/{filename_bb}.pred.mmm.time", mmm_time_uci_17, "single")

└ @ ArgParse :-1


−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−
                    Parameters
−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−
c (pseudocount)                       : 0.5
μ₀ (normal-gamma param)               : 0.0
β₀ (normal-gamma param)               : 0.5
a₀ (normal-gamma param)               : 0.5
b₀ (normal-gamma param)               : 0.5
lrow_c (number of categorical columns): 0
lrow_n (number of numeric columns)    : 30
nrows (total number of rows)          : 569
maxnclust (max clusters, 0=no limit)  : 0
nclust (fixed #clusters, 0= not fixed): 0
model selection criterion             : Marginal likelihood (TI)
niter                                 : 10

Input filename                        : dataUCI/Breast Cancer Wisconsin (Diagnostic).gauss.csv
Output filename                       : dataUCI/Breast Cancer Wisconsin (Diagnostic).gauss.csv.labels.thichet
−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−


'dataUCI/Breast Cancer Wisconsin (Diagnostic).pred.mmm.time'

In [46]:
print(f"\n{uci_data_dir}/{filename_bb}")

start_time = time.perf_counter()
X = utils.extractData(f"{uci_data_dir}/{filename_bb}.gauss.csv", "data")

est = DiagGaussian(alpha = 1.0, m_0_partial = 0, k_0 = 0.03, v_0_partial = 3, S_0_partial = 0.3, iterations=50, runs = 3)

est.sample(X)
bayes_pred_uci_17 = est.assignments()
bayes_ari_uci_17 = round(adjusted_rand_score(bayes_pred_uci_17, z_true_17), 3)
bayes_time_uci_17 = round(time.perf_counter() - start_time ,3)
utils.saveData(f"{uci_data_dir}/{filename_bb}.pred.bayes.labels", bayes_pred_uci_17, "labels")
utils.saveData(f"{uci_data_dir}/{filename_bb}.pred.bayes.time", bayes_time_uci_17, "single")


dataUCI/Breast Cancer Wisconsin (Diagnostic)

Run:  1
49/50               
Run: 1, K:4, BIC: 24752.29423488369, logmax post: -11614.881465346685, max_post_iter: 30

Run:  2
49/50               
Run: 2, K:2, BIC: 19928.863226817462, logmax post: -9583.798787361151, max_post_iter: 33

Run:  3
49/50               
Run: 3, K:4, BIC: 24751.790192386452, logmax post: -11614.629444098066, max_post_iter: 39


'dataUCI/Breast Cancer Wisconsin (Diagnostic).pred.bayes.time'

In [203]:
print(f"\n{uci_data_dir}/{filename_bb}")

start_time = time.perf_counter()
X = utils.extractData(f"{uci_data_dir}/{filename_bb}.gauss.csv", "data")

est = DiagGaussianFS(alpha = 1.0, m_0_partial = 0, k_0 = 0.03, v_0_partial = 3, S_0_partial = 0.3, iterations=50, runs = 2, K_initial=len(set(z_true_17)), FS=True)

est.sample(X)
bayes_pred_uci_17 = est.assignments()
bayes_ari_uci_17 = round(adjusted_rand_score(bayes_pred_uci_17, z_true_17), 3)
bayes_time_uci_17 = round(time.perf_counter() - start_time ,3)
utils.saveData(f"{uci_data_dir}/{filename_bb}.pred.bayes.labels", bayes_pred_uci_17, "labels")
utils.saveData(f"{uci_data_dir}/{filename_bb}.pred.bayes.time", bayes_time_uci_17, "single")


dataUCI/Breast Cancer Wisconsin (Diagnostic)

Run:  1
49/50               
Run: 1, K:2, BIC: 6062.013518685587, logmax post: -3009.5067593427934, max_post_iter: 34
Final features:
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 1 1 1 0 0 1 0 0 0]]

Run:  2
49/50               
Run: 2, K:2, BIC: 6017.45162949529, logmax post: -2987.225814747645, max_post_iter: 2
Final features:
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 1 1 1 0 0 1 0 0 0]]


'dataUCI/Breast Cancer Wisconsin (Diagnostic).pred.bayes.time'

In [195]:
start_time = time.perf_counter()
training_runs_EM = 5
EM_gmm = GaussianMixture(n_components=len(set(z_true_17)), n_init=training_runs_EM)
X = utils.extractData(f"{uci_data_dir}/{filename_bb}.gauss.csv", "data")
EM_gmm.fit(X)
z_pred_EM = EM_gmm.predict(X)

em_ari_uci_17 = round(adjusted_rand_score(z_pred_EM, z_true_17), 3)
em_time_uci_17 = round(time.perf_counter() - start_time ,3)

utils.saveData(f"{uci_data_dir}/{filename_bb}.pred.em.labels", z_pred_EM, "labels")
utils.saveData(f"{uci_data_dir}/{filename_bb}.pred.em.time", em_time_uci_17, "single")


'dataUCI/Breast Cancer Wisconsin (Diagnostic).pred.em.time'

In [198]:
start_time = time.process_time()
training_runs_EM = 2

best_BIC = np.inf
X = utils.extractData(f"{uci_data_dir}/{filename_bb}.gauss.csv", "data")

for K in range(2, 21):
    EM_gmm = GaussianMixture(n_components=K, n_init=training_runs_EM)
    EM_gmm.fit(X)
    gmm_BIC = EM_gmm.bic(X)
    # print(f"BIC: {gmm_BIC} K: {K}")
    
    if gmm_BIC < best_BIC:
        best_BIC = gmm_BIC
        best_gmm = EM_gmm

z_pred_EM_unknownK = best_gmm.predict(X)
print(f"Best BIC: {best_BIC} K: {len(set(z_pred_EM_unknownK))}")

em_ari_uK_uci_17 = round(adjusted_rand_score(z_pred_EM_unknownK, z_true_17), 3)
em_time_uK_uci_17 = round(time.perf_counter() - start_time ,3)

utils.saveData(f"{uci_data_dir}/{filename_bb}.pred.em_unknown_K.labels", z_pred_EM_unknownK, "labels")
utils.saveData(f"{uci_data_dir}/{filename_bb}.pred.em_unknown_K.time", time.process_time() - start_time, "single")

Best BIC: -38148.47774325239 K: 2


'dataUCI/Breast Cancer Wisconsin (Diagnostic).pred.em_unknown_K.time'

In [200]:
len(set(z_true_17))

2

In [204]:
print(bayes_ari_uci_17, mmm_ari_uci_17, em_ari_uci_17, em_ari_uK_uci_17)
print(bayes_time_uci_17, mmm_time_uci_17, em_time_uci_17, em_time_uK_uci_17)

0.646 0.646 0.812 0.812
79.348 44.913 2.042 386864.413


## Abalone 

https://archive.ics.uci.edu/dataset/1/abalone

In [48]:
abalone = fetch_ucirepo(id=1)
aa = abalone.variables
filename_aa = abalone.metadata.name
aa

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,Sex,Feature,Categorical,,"M, F, and I (infant)",,no
1,Length,Feature,Continuous,,Longest shell measurement,mm,no
2,Diameter,Feature,Continuous,,perpendicular to length,mm,no
3,Height,Feature,Continuous,,with meat in shell,mm,no
4,Whole_weight,Feature,Continuous,,whole abalone,grams,no
5,Shucked_weight,Feature,Continuous,,weight of meat,grams,no
6,Viscera_weight,Feature,Continuous,,gut weight (after bleeding),grams,no
7,Shell_weight,Feature,Continuous,,after being dried,grams,no
8,Rings,Target,Integer,,+1.5 gives the age in years,,no


In [261]:
aa_feature = aa.loc[aa['role'] == 'Feature']
aa_cont_features_df = aa_feature.loc[aa_feature['type'] == 'Continuous']
aa_cat_features_df = aa_feature.loc[aa_feature['type'] == 'Categorical']
aa_cont_features = aa_cont_features_df["name"].tolist()
aa_cat_features = aa_cat_features_df["name"].tolist()
y = np.ravel(abalone.data.targets)
clusters = np.unique(y)
clusters_to_int = {clust: i for i, clust in enumerate(clusters)}
z_true_1 = np.vectorize(clusters_to_int.get)(y)

In [262]:
print(aa_cat_features)
print(aa_cont_features)
print(len(set(z_true_1)))

['Sex']
['Length', 'Diameter', 'Height', 'Whole_weight', 'Shucked_weight', 'Viscera_weight', 'Shell_weight']
28


In [53]:
N = abalone.data.features.shape[0]
gD = len(aa_cont_features)
cD =  len(aa_cat_features)

X = np.zeros((N, gD))
C_str = np.zeros((N, cD), 'str')

for i in range(gD):
    X[:, i] = abalone.data.features[aa_cont_features[i]]
for i in range(cD):
    C_str[:, i] = abalone.data.features[aa_cat_features[i]]


categories = np.unique(C_str)
category_to_int = {category: i for i, category in enumerate(categories)}
C = np.vectorize(category_to_int.get)(C_str)
mixed_data = np.concatenate((X, C), axis=1) 

utils.saveData(f"{uci_data_dir}/{filename_aa}.gauss.csv", X, "data")
utils.saveData(f"{uci_data_dir}/{filename_aa}.cat.csv", C, "data")
utils.saveData(f"{uci_data_dir}/{filename_aa}.mixed.csv", mixed_data, "data")
utils.saveData(f"{uci_data_dir}/{filename_aa}.labels", z_true_1, "labels")


'dataUCI/Abalone.labels'

In [None]:
start_time = time.perf_counter()

os.system(f"julia mmm.jl '{uci_data_dir}/{filename_aa}.mixed.csv'")
z_pred_mmm_uci_1 = utils.extractData(f"{uci_data_dir}/{filename_aa}.mixed.csv.labels.thichet", "labels")

mmm_ari_uci_1 = round(adjusted_rand_score(z_pred_mmm_uci_1, z_true_1), 3)
mmm_time_uci_1 = round(time.perf_counter() - start_time ,3)

utils.saveData(f"{uci_data_dir}/{filename_aa}.pred.mmm.labels", z_pred_mmm_uci_1, "labels")
utils.saveData(f"{uci_data_dir}/{filename_aa}.pred.mmm.time", mmm_time_uci_1, "single")

In [59]:
print(f"\n{uci_data_dir}/{filename_aa}")

start_time = time.perf_counter()
X = utils.extractData(f"{uci_data_dir}/{filename_aa}.gauss.csv", "data")
C = utils.extractData(f"{uci_data_dir}/{filename_aa}.cat.csv", "data_int")

est = DiagGaussianCategorical(alpha = 1.0, gamma=0.2, m_0_partial = 0, k_0 = 0.03, v_0_partial = 3, S_0_partial = 0.3, K_initial = len(set(z_true_1)), iterations=30, runs = 2)
est.sample(X, C)

bayes_pred_uci_1 = est.assignments()
bayes_ari_uci_1 = round(adjusted_rand_score(bayes_pred_uci_1, z_true_1), 3)
bayes_time_uci_1 = round(time.perf_counter() - start_time ,3)
utils.saveData(f"{uci_data_dir}/{filename_aa}.pred.bayes.labels", bayes_pred_uci_1, "labels")
utils.saveData(f"{uci_data_dir}/{filename_aa}.pred.bayes.time", bayes_time_uci_1, "single")



dataUCI/Abalone

Run:  1
29/30               
Run: 1, K:6, BIC: -233350.79709938006, logmax post: 58525.289617546194, max_post_iter: 17

Run:  2
29/30               
Run: 2, K:6, BIC: -232859.8696507404, logmax post: 58402.55775538628, max_post_iter: 8


'dataUCI/Abalone.pred.bayes.time'

In [257]:
print(f"\n{uci_data_dir}/{filename_aa}")

start_time = time.perf_counter()
X = utils.extractData(f"{uci_data_dir}/{filename_aa}.gauss.csv", "data")
C = utils.extractData(f"{uci_data_dir}/{filename_aa}.cat.csv", "data_int")

est = DiagGaussian(alpha = 1.0, m_0_partial = 0, k_0 = 0.03, v_0_partial = 3, S_0_partial = 0.3, K_initial = len(set(z_true_1)), iterations=30, runs = 2)
est.sample(X)

bayes_gauss_pred_uci_1 = est.assignments()
bayes_gauss_ari_uci_1 = round(adjusted_rand_score(bayes_gauss_pred_uci_1, z_true_1), 3)
bayes_gauss_time_uci_1 = round(time.perf_counter() - start_time ,3)
utils.saveData(f"{uci_data_dir}/{filename_aa}.pred.bayes.labels", bayes_gauss_pred_uci_1, "labels")
utils.saveData(f"{uci_data_dir}/{filename_aa}.pred.bayes.time", bayes_gauss_time_uci_1, "single")





dataUCI/Abalone

Run:  1
29/30               
Run: 1, K:6, BIC: -123588.84249535263, logmax post: 62144.58988738519, max_post_iter: 30

Run:  2
29/30               
Run: 2, K:6, BIC: -123733.81705021023, logmax post: 62217.07716481399, max_post_iter: 30


'dataUCI/Abalone.pred.bayes.time'

In [254]:
start_time = time.process_time()
training_runs_EM = 2

best_BIC = np.inf
X = utils.extractData(f"{uci_data_dir}/{filename_aa}.mixed.csv", "data")

for K in range(2, 31):
    EM_gmm = GaussianMixture(n_components=K, n_init=training_runs_EM)
    EM_gmm.fit(X)
    gmm_BIC = EM_gmm.bic(X)
    # print(f"BIC: {gmm_BIC} K: {K}")
    
    if gmm_BIC < best_BIC:
        best_BIC = gmm_BIC
        best_gmm = EM_gmm

z_pred_EM_unknownK = best_gmm.predict(X)
print(f"Best BIC: {best_BIC} K: {len(set(z_pred_EM_unknownK))}")

em_ari_uK_uci_1 = round(adjusted_rand_score(z_pred_EM_unknownK, z_true_1), 3)
em_time_uK_uci_1 = round(time.perf_counter() - start_time ,3)

utils.saveData(f"{uci_data_dir}/{filename_aa}.pred.em_unknown_K.labels", z_pred_EM_unknownK, "labels")
utils.saveData(f"{uci_data_dir}/{filename_aa}.pred.em_unknown_K.time", time.process_time() - start_time, "single")

Best BIC: -161159.37891073295 K: 15


'dataUCI/Abalone.pred.em_unknown_K.time'

In [258]:
print(len(set(z_pred_EM_unknownK)))
print(len(set(bayes_gauss_pred_uci_1)))
print(len(set(z_pred_mmm_uci_1)))

15
6
4


In [259]:
print(bayes_ari_uci_1, mmm_ari_uci_1, bayes_gauss_ari_uci_1, em_ari_uK_uci_1)
print(bayes_time_uci_1, mmm_time_uci_1, bayes_gauss_time_uci_1, em_time_uK_uci_1)

0.069 0.066 0.068 0.068
59.135 55.111 55.07 386953.553


## Sonar

In [146]:
sonar = fetch_ucirepo(id=151)
ss = sonar.variables
filename_ss = sonar.metadata.name
ss

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,Attribute1,Feature,Continuous,,,,no
1,Attribute2,Feature,Continuous,,,,no
2,Attribute3,Feature,Continuous,,,,no
3,Attribute4,Feature,Continuous,,,,no
4,Attribute5,Feature,Continuous,,,,no
...,...,...,...,...,...,...,...
56,Attribute57,Feature,Continuous,,,,no
57,Attribute58,Feature,Continuous,,,,no
58,Attribute59,Feature,Continuous,,,,no
59,Attribute60,Feature,Continuous,,,,no


In [147]:
ss_feature = ss.loc[ss['role'] == 'Feature']
ss_cont_features_df = ss_feature.loc[ss_feature['type'] == 'Continuous']
ss_cat_features_df = ss_feature.loc[ss_feature['type'] == 'Categorical']
ss_cont_features = ss_cont_features_df["name"].tolist()
ss_cat_features = ss_cat_features_df["name"].tolist()

In [148]:
y = np.ravel(sonar.data.targets)
clusters = np.unique(y)
clusters_to_int = {clust: i for i, clust in enumerate(clusters)}
z_true_sonar = np.vectorize(clusters_to_int.get)(y)

In [149]:
print(ss_cat_features)
print(ss_cont_features)

[]
['Attribute1', 'Attribute2', 'Attribute3', 'Attribute4', 'Attribute5', 'Attribute6', 'Attribute7', 'Attribute8', 'Attribute9', 'Attribute10', 'Attribute11', 'Attribute12', 'Attribute13', 'Attribute14', 'Attribute15', 'Attribute16', 'Attribute17', 'Attribute18', 'Attribute19', 'Attribute20', 'Attribute21', 'Attribute22', 'Attribute23', 'Attribute24', 'Attribute25', 'Attribute26', 'Attribute27', 'Attribute28', 'Attribute29', 'Attribute30', 'Attribute31', 'Attribute32', 'Attribute33', 'Attribute34', 'Attribute35', 'Attribute36', 'Attribute37', 'Attribute38', 'Attribute39', 'Attribute40', 'Attribute41', 'Attribute42', 'Attribute43', 'Attribute44', 'Attribute45', 'Attribute46', 'Attribute47', 'Attribute48', 'Attribute49', 'Attribute50', 'Attribute51', 'Attribute52', 'Attribute53', 'Attribute54', 'Attribute55', 'Attribute56', 'Attribute57', 'Attribute58', 'Attribute59', 'Attribute60']


In [150]:
N = sonar.data.features.shape[0]
gD = len(ss_cont_features)

X = np.zeros((N, gD))

for i in range(gD):
    X[:, i] = sonar.data.features[ss_cont_features[i]]

utils.saveData(f"{uci_data_dir}/{filename_ss}.gauss.csv", X, "data")
utils.saveData(f"{uci_data_dir}/{filename_ss}.labels", z_true_sonar, "labels")


'dataUCI/Connectionist Bench (Sonar, Mines vs. Rocks).labels'

In [30]:
start_time = time.perf_counter()

os.system(f"julia mmm.jl '{uci_data_dir}/{filename_ss}.gauss.csv'")
z_pred_mmm_uci_151 = utils.extractData(f"{uci_data_dir}/{filename_ss}.gauss.csv.labels.thichet", "labels")

mmm_ari_uci_151 = round(adjusted_rand_score(z_pred_mmm_uci_151, z_true_sonar), 3)
mmm_time_uci_151 = round(time.perf_counter() - start_time ,3)

utils.saveData(f"{uci_data_dir}/{filename_ss}.pred.mmm.labels", z_pred_mmm_uci_151, "labels")
utils.saveData(f"{uci_data_dir}/{filename_ss}.pred.mmm.time", mmm_time_uci_151, "single")


└ @ ArgParse :-1


−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−
                    Parameters
−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−
c (pseudocount)                       : 0.5
μ₀ (normal-gamma param)               : 0.0
β₀ (normal-gamma param)               : 0.5
a₀ (normal-gamma param)               : 0.5
b₀ (normal-gamma param)               : 0.5
lrow_c (number of categorical columns): 0
lrow_n (number of numeric columns)    : 60
nrows (total number of rows)          : 208
maxnclust (max clusters, 0=no limit)  : 0
nclust (fixed #clusters, 0= not fixed): 0
model selection criterion             : Marginal likelihood (TI)
niter                                 : 10

Input filename                        : dataUCI/Connectionist Bench (Sonar, Mines vs. Rocks).gauss.csv
Output filename                       : dataUCI/Connectionist Bench (Sonar, Mines vs. Rocks).gauss.csv.labels.thichet
−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−


'dataUCI/Connectionist Bench (Sonar, Mines vs. Rocks).pred.mmm.time'

In [154]:
print(f"\n{uci_data_dir}/{filename_ss}")

start_time = time.perf_counter()
X = utils.extractData(f"{uci_data_dir}/{filename_ss}.gauss.csv", "data")

est = DiagGaussianFS(alpha = 1.0, m_0_partial = 0, k_0 = 0.03, v_0_partial = 3, S_0_partial = 0.3, K_initial = 3, iterations=30, runs = 10)

est.sample(X)
bayes_pred_uci_151 = est.assignments()
bayes_ari_uci_151 = round(adjusted_rand_score(bayes_pred_uci_151, z_true_sonar), 3)
bayes_time_uci_151 = round(time.perf_counter() - start_time ,3)
utils.saveData(f"{uci_data_dir}/{filename_ss}.pred.bayes.labels", bayes_pred_uci_151, "labels")
utils.saveData(f"{uci_data_dir}/{filename_ss}.pred.bayes.time", bayes_time_uci_151, "single")



dataUCI/Connectionist Bench (Sonar, Mines vs. Rocks)

Run:  1
29/30               
Run: 1, K:3, BIC: -18139.63556296008, logmax post: 9109.81778148004, max_post_iter: 19
Final features:
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1
  1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


'dataUCI/Connectionist Bench (Sonar, Mines vs. Rocks).pred.bayes.time'

In [155]:
print(bayes_ari_uci_151, mmm_ari_uci_151)
print(bayes_time_uci_151, mmm_time_uci_151)

0.004 0.0
25.388 41.796


## Prima indians diabetes

In [93]:
kaggle_data_dir = "dataKaggle"
filename_diab = "diabetesRaw"

In [94]:
df = pd.read_csv(f'{outDir}/{filename}.csv')
z_true_diab = df['Outcome'].tolist()

In [120]:
df = pd.read_csv(f'{outDir}/{filename}.csv')
X = df.to_numpy()[1:, :-1]
X = X.astype(float)
# print(X)
# print(df)
X_gl = df['Glucose'].to_numpy()
X_bp = df['BloodPressure'].to_numpy()
X_bmi = df['BMI'].to_numpy()

In [123]:
X = np.zeros((len(X_gl), 3), float)

In [125]:
X[:, 0] = X_gl
X[:, 1] = X_bp
X[:, 2] = X_bmi
X

array([[148. ,  72. ,  33.6],
       [ 85. ,  66. ,  26.6],
       [183. ,  64. ,  23.3],
       ...,
       [121. ,  72. ,  26.2],
       [126. ,  60. ,  30.1],
       [ 93. ,  70. ,  30.4]])

In [126]:
utils.saveData(f"{kaggle_data_dir}/{filename_diab}.gauss.csv", X, "data")
utils.saveData(f"{kaggle_data_dir}/{filename_diab}.labels", z_true_diab, "labels")

'dataKaggle/diabetesRaw.labels'

In [127]:
start_time = time.perf_counter()

os.system(f"julia mmm.jl '{kaggle_data_dir}/{filename_diab}.gauss.csv'")
z_pred_mmm_kaggle_diab = utils.extractData(f"{kaggle_data_dir}/{filename_diab}.gauss.csv.labels.thichet", "labels")

mmm_ari_kaggle_diab = round(adjusted_rand_score(z_pred_mmm_kaggle_diab, z_true_diab), 3)
mmm_time_kaggle_diab = round(time.perf_counter() - start_time ,3)

utils.saveData(f"{kaggle_data_dir}/{filename_diab}.pred.mmm.labels", z_pred_mmm_kaggle_diab, "labels")
utils.saveData(f"{kaggle_data_dir}/{filename_diab}.pred.mmm.time", mmm_time_kaggle_diab, "single")

└ @ ArgParse :-1


−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−
                    Parameters
−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−
c (pseudocount)                       : 0.5
μ₀ (normal-gamma param)               : 0.0
β₀ (normal-gamma param)               : 0.5
a₀ (normal-gamma param)               : 0.5
b₀ (normal-gamma param)               : 0.5
lrow_c (number of categorical columns): 0
lrow_n (number of numeric columns)    : 3
nrows (total number of rows)          : 768
maxnclust (max clusters, 0=no limit)  : 0
nclust (fixed #clusters, 0= not fixed): 0
model selection criterion             : Marginal likelihood (TI)
niter                                 : 10

Input filename                        : dataKaggle/diabetesRaw.gauss.csv
Output filename                       : dataKaggle/diabetesRaw.gauss.csv.labels.thichet
−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−


'dataKaggle/diabetesRaw.pred.mmm.time'

In [128]:
print(f"\n{kaggle_data_dir}/{filename_diab}")

start_time = time.perf_counter()
X = utils.extractData(f"{kaggle_data_dir}/{filename_diab}.gauss.csv", "data")

est = DiagGaussian(alpha = 1.0, m_0_partial = 0, k_0 = 0.03, v_0_partial = 3, S_0_partial = 0.3, K_initial = len(set(z_true_diab)), iterations=50, runs = 5)
est.sample(X)

bayes_pred_kaggle_diab = est.assignments()
bayes_ari_kaggle_diab = round(adjusted_rand_score(bayes_pred_kaggle_diab, z_true_diab), 3)
bayes_time_kaggle_diab = round(time.perf_counter() - start_time ,3)
utils.saveData(f"{kaggle_data_dir}/{filename_diab}.pred.bayes.labels", bayes_pred_kaggle_diab, "labels")
utils.saveData(f"{kaggle_data_dir}/{filename_diab}.pred.bayes.time", bayes_time_kaggle_diab, "single")



dataKaggle/diabetesRaw

Run:  1
49/50               
Run: 1, K:2, BIC: 10819.960355293846, logmax post: -5370.117439248037, max_post_iter: 21

Run:  2
49/50               
Run: 2, K:2, BIC: 10819.293530364468, logmax post: -5369.784026783348, max_post_iter: 10

Run:  3
49/50               
Run: 3, K:2, BIC: 10875.306579475007, logmax post: -5397.790551338618, max_post_iter: 30

Run:  4
49/50               
Run: 4, K:2, BIC: 10847.66340670137, logmax post: -5383.968964951799, max_post_iter: 50

Run:  5
49/50               
Run: 5, K:2, BIC: 10862.826920401933, logmax post: -5391.55072180208, max_post_iter: 45


'dataKaggle/diabetesRaw.pred.bayes.time'

In [129]:
print(bayes_ari_kaggle_diab, mmm_ari_kaggle_diab)
print(bayes_time_kaggle_diab, mmm_time_kaggle_diab)

0.007 0.099
36.579 38.007


## Ecoli

In [130]:
ecoli = fetch_ucirepo(id=39)
ee = ecoli.variables
filename_ee = ecoli.metadata.name
ee

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,Sequence,ID,Categorical,,Accession number for the SWISS-PROT database,,no
1,mcg,Feature,Continuous,,McGeoch's method for signal sequence recognition,,no
2,gvh,Feature,Continuous,,von Heijne's method for signal sequence recogn...,,no
3,lip,Feature,Binary,,von Heijne's Signal Peptidase II consensus seq...,,no
4,chg,Feature,Binary,,Presence of charge on N-terminus of predicted ...,,no
5,aac,Feature,Continuous,,score of discriminant analysis of the amino ac...,,no
6,alm1,Feature,Continuous,,score of the ALOM membrane spanning region pre...,,no
7,alm2,Feature,Continuous,,score of ALOM program after excluding putative...,,no
8,class,Target,Categorical,,,,no


In [264]:
ee_feature = ee.loc[ee['role'] == 'Feature']
ee_cont_features_df = ee_feature.loc[ee_feature['type'] == 'Continuous']
ee_cat_features_df = ee_feature.loc[ee_feature['type'] == 'Binary']
ee_cont_features = ee_cont_features_df["name"].tolist()
ee_cat_features = ee_cat_features_df["name"].tolist()
y = np.ravel(ecoli.data.targets)
clusters = np.unique(y)
clusters_to_int = {clust: i for i, clust in enumerate(clusters)}
z_true_ee = np.vectorize(clusters_to_int.get)(y)
len(set(z_true_ee))

8

In [132]:
print(ee_cat_features)
print(ee_cont_features)

['lip', 'chg']
['mcg', 'gvh', 'aac', 'alm1', 'alm2']


In [133]:
N = ecoli.data.features.shape[0]
gD = len(ee_cont_features)
cD =  len(ee_cat_features)

X = np.zeros((N, gD))
C_str = np.zeros((N, cD), 'str')

for i in range(gD):
    X[:, i] = ecoli.data.features[ee_cont_features[i]]
for i in range(cD):
    C_str[:, i] = ecoli.data.features[ee_cat_features[i]]


categories = np.unique(C_str)
category_to_int = {category: i for i, category in enumerate(categories)}
C = np.vectorize(category_to_int.get)(C_str)
mixed_data = np.concatenate((X, C), axis=1) 

utils.saveData(f"{uci_data_dir}/{filename_ee}.gauss.csv", X, "data")
utils.saveData(f"{uci_data_dir}/{filename_ee}.cat.csv", C, "data")
utils.saveData(f"{uci_data_dir}/{filename_ee}.mixed.csv", mixed_data, "data")
utils.saveData(f"{uci_data_dir}/{filename_ee}.labels", z_true_1, "labels")


'dataUCI/Ecoli.labels'

In [135]:
start_time = time.perf_counter()

os.system(f"julia mmm.jl '{uci_data_dir}/{filename_ee}.mixed.csv'")
z_pred_mmm_uci_39 = utils.extractData(f"{uci_data_dir}/{filename_ee}.mixed.csv.labels.thichet", "labels")

mmm_ari_uci_39 = round(adjusted_rand_score(z_pred_mmm_uci_39, z_true_ee), 3)
mmm_time_uci_39 = round(time.perf_counter() - start_time ,3)

utils.saveData(f"{uci_data_dir}/{filename_ee}.pred.mmm.labels", z_pred_mmm_uci_39, "labels")
utils.saveData(f"{uci_data_dir}/{filename_ee}.pred.mmm.time", mmm_time_uci_39, "single")

└ @ ArgParse :-1


−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−
                    Parameters
−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−
c (pseudocount)                       : 0.5
μ₀ (normal-gamma param)               : 0.0
β₀ (normal-gamma param)               : 0.5
a₀ (normal-gamma param)               : 0.5
b₀ (normal-gamma param)               : 0.5
lrow_c (number of categorical columns): 0
lrow_n (number of numeric columns)    : 7
nrows (total number of rows)          : 336
maxnclust (max clusters, 0=no limit)  : 0
nclust (fixed #clusters, 0= not fixed): 0
model selection criterion             : Marginal likelihood (TI)
niter                                 : 10

Input filename                        : dataUCI/Ecoli.mixed.csv
Output filename                       : dataUCI/Ecoli.mixed.csv.labels.thichet
−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−


'dataUCI/Ecoli.pred.mmm.time'

In [144]:
print(f"\n{uci_data_dir}/{filename_ee}")

start_time = time.perf_counter()
X = utils.extractData(f"{uci_data_dir}/{filename_ee}.gauss.csv", "data")
C = utils.extractData(f"{uci_data_dir}/{filename_ee}.cat.csv", "data_int")

est = DiagGaussianCategorical(alpha = 1.0, gamma=0.2, m_0_partial = 0, k_0 = 0.03, v_0_partial = 3, S_0_partial = 0.3, iterations=200, runs = 1)
est.sample(X, C)

bayes_pred_uci_39 = est.assignments()
bayes_ari_uci_39 = round(adjusted_rand_score(bayes_pred_uci_39, z_true_ee), 3)
bayes_time_uci_39 = round(time.perf_counter() - start_time ,3)
utils.saveData(f"{uci_data_dir}/{filename_ee}.pred.bayes.labels", bayes_pred_uci_39, "labels")
utils.saveData(f"{uci_data_dir}/{filename_ee}.pred.bayes.time", bayes_time_uci_39, "single")



dataUCI/Ecoli

Run:  1
199/200               
Run: 1, K:2, BIC: -8561.22974974561, logmax post: 2175.2101043961816, max_post_iter: 142


'dataUCI/Ecoli.pred.bayes.time'

In [230]:
print(f"\n{uci_data_dir}/{filename_ee}")

start_time = time.perf_counter()
X = utils.extractData(f"{uci_data_dir}/{filename_ee}.gauss.csv", "data")
C = utils.extractData(f"{uci_data_dir}/{filename_ee}.cat.csv", "data_int")

est = DiagGaussian(alpha = 1.0, K_initial = 2, m_0_partial = 0, k_0 = 0.03, v_0_partial = 3, S_0_partial = 0.3, iterations=200, runs = 1)
est.sample(X)

bayesgauss_pred_uci_39 = est.assignments()
bayesgauss_ari_uci_39 = round(adjusted_rand_score(bayesgauss_pred_uci_39, z_true_ee), 3)
bayesgauss_time_uci_39 = round(time.perf_counter() - start_time ,3)
utils.saveData(f"{uci_data_dir}/{filename_ee}.pred.bayes.labels", bayesgauss_pred_uci_39, "labels")
utils.saveData(f"{uci_data_dir}/{filename_ee}.pred.bayes.time", bayesgauss_time_uci_39, "single")



dataUCI/Ecoli

Run:  1
5/200               
Run: 1, K:2, BIC: -5392.091696004724, logmax post: 2777.485404241847, max_post_iter: 3


'dataUCI/Ecoli.pred.bayes.time'

In [227]:

start_time = time.process_time()
training_runs_EM = 2

best_BIC = np.inf
X = utils.extractData(f"{uci_data_dir}/{filename_ee}.gauss.csv", "data")

for K in range(2, 11):
    EM_gmm = GaussianMixture(n_components=K, n_init=training_runs_EM)
    EM_gmm.fit(X)
    gmm_BIC = EM_gmm.bic(X)
    # print(f"BIC: {gmm_BIC} K: {K}")
    
    if gmm_BIC < best_BIC:
        best_BIC = gmm_BIC
        best_gmm = EM_gmm


z_pred_EM_unknownK = best_gmm.predict(X)
print(f"Best BIC: {best_BIC} K: {len(set(z_pred_EM_unknownK))}")

em_ari_uK_uci_39 = round(adjusted_rand_score(z_pred_EM_unknownK, z_true_ee), 3)
em_time_uK_uci_39 = round(time.perf_counter() - start_time ,3)

utils.saveData(f"{uci_data_dir}/{filename_aa}.pred.em_unknown_K.labels", z_pred_EM_unknownK, "labels")
utils.saveData(f"{uci_data_dir}/{filename_aa}.pred.em_unknown_K.time", time.process_time() - start_time, "single")

Best BIC: -2902.918112620195 K: 4


'dataUCI/Abalone.pred.em_unknown_K.time'

In [231]:
print(bayes_ari_uci_39, mmm_ari_uci_39, bayesgauss_ari_uci_39, em_ari_uK_uci_39 )
print(bayes_time_uci_39, mmm_time_uci_39, bayesgauss_time_uci_39, em_time_uK_uci_39 )

0.391 0.448 0.038 0.59
19.567 37.6 0.939 385458.25


In [229]:
print(len(set(em_pred_uci_39)))
print(len(set(bayes_pred_uci_39)))
print(len(set(z_pred_mmm_uci_39)))
print(len(set(z_pred_EM_unknownK)))

2
2
2
4


## Ionosphere

In [156]:
iono = fetch_ucirepo(id=52)
ii = iono.variables
filename_ii = iono.metadata.name
ii

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,Attribute1,Feature,Continuous,,,,no
1,Attribute2,Feature,Continuous,,,,no
2,Attribute3,Feature,Continuous,,,,no
3,Attribute4,Feature,Continuous,,,,no
4,Attribute5,Feature,Continuous,,,,no
5,Attribute6,Feature,Continuous,,,,no
6,Attribute7,Feature,Continuous,,,,no
7,Attribute8,Feature,Continuous,,,,no
8,Attribute9,Feature,Continuous,,,,no
9,Attribute10,Feature,Continuous,,,,no


In [157]:
ii_feature = ii.loc[ii['role'] == 'Feature']
ii_cont_features_df = ii_feature.loc[ii_feature['type'] == 'Continuous']
ii_cat_features_df = ii_feature.loc[ii_feature['type'] == 'Categorical']
ii_cont_features = ii_cont_features_df["name"].tolist()
ii_cat_features = ii_cat_features_df["name"].tolist()

In [158]:
y = np.ravel(iono.data.targets)
clusters = np.unique(y)
clusters_to_int = {clust: i for i, clust in enumerate(clusters)}
z_true_52 = np.vectorize(clusters_to_int.get)(y)

In [159]:
print(ii_cat_features)
print(ii_cont_features)

[]
['Attribute1', 'Attribute2', 'Attribute3', 'Attribute4', 'Attribute5', 'Attribute6', 'Attribute7', 'Attribute8', 'Attribute9', 'Attribute10', 'Attribute11', 'Attribute12', 'Attribute13', 'Attribute14', 'Attribute15', 'Attribute16', 'Attribute17', 'Attribute18', 'Attribute19', 'Attribute20', 'Attribute21', 'Attribute22', 'Attribute23', 'Attribute24', 'Attribute25', 'Attribute26', 'Attribute27', 'Attribute28', 'Attribute29', 'Attribute30', 'Attribute31', 'Attribute32', 'Attribute33', 'Attribute34']


In [160]:
N = iono.data.features.shape[0]
gD = len(ii_cont_features)

X = np.zeros((N, gD))

for i in range(gD):
    X[:, i] = iono.data.features[ii_cont_features[i]]

utils.saveData(f"{uci_data_dir}/{filename_ii}.gauss.csv", X, "data")
utils.saveData(f"{uci_data_dir}/{filename_ii}.labels", z_true_52, "labels")


'dataUCI/Ionosphere.labels'

In [161]:
start_time = time.perf_counter()

os.system(f"julia mmm.jl '{uci_data_dir}/{filename_ii}.gauss.csv'")
z_pred_mmm_uci_52 = utils.extractData(f"{uci_data_dir}/{filename_ii}.gauss.csv.labels.thichet", "labels")

mmm_ari_uci_52 = round(adjusted_rand_score(z_pred_mmm_uci_52, z_true_52), 3)
mmm_time_uci_52 = round(time.perf_counter() - start_time ,3)

utils.saveData(f"{uci_data_dir}/{filename_ii}.pred.mmm.labels", z_pred_mmm_uci_52, "labels")
utils.saveData(f"{uci_data_dir}/{filename_ii}.pred.mmm.time", mmm_time_uci_52, "single")

└ @ ArgParse :-1


−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−
                    Parameters
−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−
c (pseudocount)                       : 0.5
μ₀ (normal-gamma param)               : 0.0
β₀ (normal-gamma param)               : 0.5
a₀ (normal-gamma param)               : 0.5
b₀ (normal-gamma param)               : 0.5
lrow_c (number of categorical columns): 0
lrow_n (number of numeric columns)    : 34
nrows (total number of rows)          : 351
maxnclust (max clusters, 0=no limit)  : 0
nclust (fixed #clusters, 0= not fixed): 0
model selection criterion             : Marginal likelihood (TI)
niter                                 : 10

Input filename                        : dataUCI/Ionosphere.gauss.csv
Output filename                       : dataUCI/Ionosphere.gauss.csv.labels.thichet
−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−−


'dataUCI/Ionosphere.pred.mmm.time'

In [188]:
print(f"\n{uci_data_dir}/{filename_ii}")

start_time = time.perf_counter()
X = utils.extractData(f"{uci_data_dir}/{filename_ii}.gauss.csv", "data")

est = DiagGaussianFS(alpha = 1.0, m_0_partial = 0, k_0 = 0.03, v_0_partial = 3, S_0_partial = 0.3, iterations=30, runs = 1)

est.sample(X)
bayes_pred_uci_52 = est.assignments()
bayes_ari_uci_52 = round(adjusted_rand_score(bayes_pred_uci_52, z_true_52), 3)
bayes_time_uci_52 = round(time.perf_counter() - start_time ,3)
utils.saveData(f"{uci_data_dir}/{filename_ii}.pred.bayes.labels", bayes_pred_uci_52, "labels")
utils.saveData(f"{uci_data_dir}/{filename_ii}.pred.bayes.time", bayes_time_uci_52, "single")


dataUCI/Ionosphere

Run:  1
29/30               
Run: 1, K:22, BIC: -2088.2958250546326, logmax post: 1724.1479125273163, max_post_iter: 10
Final features:
[[1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 ...
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]
 [1 1 1 ... 1 1 1]]


'dataUCI/Ionosphere.pred.bayes.time'

In [247]:
print(f"\n{uci_data_dir}/{filename_ii}")

start_time = time.perf_counter()
X = utils.extractData(f"{uci_data_dir}/{filename_ii}.gauss.csv", "data")

est = DiagGaussian(alpha = 1.0, m_0_partial = 0, k_0 = 0.03, v_0_partial = 3, S_0_partial = 0.3, iterations=100, runs = 3, K_initial=10)

est.sample(X)
bayes_pred_uci_52 = est.assignments()
bayes_ari_uci_52 = round(adjusted_rand_score(bayes_pred_uci_52, z_true_52), 3)
bayes_time_uci_52 = round(time.perf_counter() - start_time ,3)
utils.saveData(f"{uci_data_dir}/{filename_ii}.pred.bayes.labels", bayes_pred_uci_52, "labels")
utils.saveData(f"{uci_data_dir}/{filename_ii}.pred.bayes.time", bayes_time_uci_52, "single")


dataUCI/Ionosphere

Run:  1
99/100               
Run: 1, K:10, BIC: 7823.391701546194, logmax post: -1919.0285347947024, max_post_iter: 99

Run:  2
99/100               
Run: 2, K:10, BIC: 7920.199163207733, logmax post: -1967.4322656254724, max_post_iter: 97

Run:  3
99/100               
Run: 3, K:10, BIC: 7175.629730584053, logmax post: -1595.147549313632, max_post_iter: 52


'dataUCI/Ionosphere.pred.bayes.time'

In [189]:
print(f"\n{uci_data_dir}/{filename_ii}")

start_time = time.perf_counter()
training_runs_EM = 2
EM_gmm = GaussianMixture(n_components=len(set(z_true_52)), n_init=training_runs_EM, covariance_type = "diag")
X = utils.extractData(f"{uci_data_dir}/{filename_ii}.gauss.csv", "data")
EM_gmm.fit(X)

# print(f"BIC: {EM_gmm.bic(X)}\nK: {K}")
em_pred_uci_52 = EM_gmm.predict(X)
em_ari_uci_52 = round(adjusted_rand_score(em_pred_uci_52, z_true_52), 3)
em_time_uci_52 = round(time.perf_counter() - start_time ,3)
utils.saveData(f"{uci_data_dir}/{filename}.pred.em_diag.gauss.labels", em_pred_uci_52, "labels")
utils.saveData(f"{uci_data_dir}/{filename}.pred.em_diag.gauss.time", em_time_uci_52, "single")



dataUCI/Ionosphere


'dataUCI/diabetesRaw.pred.em_diag.gauss.time'

In [251]:

start_time = time.process_time()
training_runs_EM = 2

best_BIC = np.inf
X = utils.extractData(f"{uci_data_dir}/{filename_ii}.gauss.csv", "data")

for K in range(2, 11):
    EM_gmm = GaussianMixture(n_components=K, n_init=training_runs_EM)
    EM_gmm.fit(X)
    gmm_BIC = EM_gmm.bic(X)
    # print(f"BIC: {gmm_BIC} K: {K}")
    
    if gmm_BIC < best_BIC:
        best_BIC = gmm_BIC
        best_gmm = EM_gmm


z_pred_EM_unknownK = best_gmm.predict(X)
print(f"Best BIC: {best_BIC} K: {len(set(z_pred_EM_unknownK))}")

em_ari_uK_uci_52 = round(adjusted_rand_score(z_pred_EM_unknownK, z_true_52), 3)
em_time_uK_uci_52 = round(time.perf_counter() - start_time ,3)

utils.saveData(f"{uci_data_dir}/{filename_ii}.pred.em_unknown_K.labels", z_pred_EM_unknownK, "labels")
utils.saveData(f"{uci_data_dir}/{filename_ii}.pred.em_unknown_K.time", time.process_time() - start_time, "single")

Best BIC: -1280.912398062057 K: 3


'dataUCI/Ionosphere.pred.em_unknown_K.time'

In [252]:
print(bayes_ari_uci_52, em_ari_uci_52, mmm_ari_uci_52, em_ari_uK_uci_52)
print(bayes_time_uci_52, em_ari_uci_52, mmm_time_uci_52, em_time_uK_uci_52)

0.233 0.213 0.212 0.51
31.12 0.213 47.31 386956.282


In [253]:
print(len(set(em_pred_uci_52)))
print(len(set(bayes_pred_uci_52)))
print(len(set(z_pred_mmm_uci_52)))
print(len(set(z_pred_EM_unknownK)))

2
10
5
3


In [237]:
len(set(z_true_52))

2

In [None]:
##### save data
def saveUCIData(uci_data_dir, id):
    data = fetch_ucirepo(id=id)
    data_var = data.variables
    filename = data.metadata['name']
    X_raw = data.data.features

    bb_feature = data_var.loc[data_var['role'] == 'Feature']
    bb_cont_features_df = bb_feature.loc[bb_feature['type'] == 'Continuous']
    bb_cat_features_df = bb_feature.loc[bb_feature['type'] == 'Categorical']
    bb_cont_features = bb_cont_features_df["name"].tolist()
    bb_cat_features = bb_cat_features_df["name"].tolist()
    
    z_true = np.ravel(data.data.targets)
    
    N = X_raw.shape[0]
    gD = len(bb_cont_features)
    cD = len(bb_cat_features)
    
    X = np.zeros((N, gD))
    C_str = np.zeros((N, cD), "str")

    for i in range(gD):
        X[:, i] = X_raw[bb_cont_features[i]]

    for i in range(cD):
        C_str[:, i] = X_raw[bb_cat_features[i]]

    if len(C_str[0]) != 0:
        categories = np.unique(C_str)
        category_to_int = {category: i for i, category in enumerate(categories)}
        C = np.vectorize(category_to_int.get)(C_str)
    else:
        C = np.zeros((N, 0), int)
    print(C)
    mixed_data = np.concatenate((X, C), axis=1) 

    utils.saveData(f"{uci_data_dir}/{filename}.gauss.csv", X, "data")
    utils.saveData(f"{uci_data_dir}/{filename}.cat.csv", C, "data")
    utils.saveData(f"{uci_data_dir}/{filename}.mixed.csv", mixed_data, "data")
    utils.saveData(f"{uci_data_dir}/{filename}.labels", z_true, "labels")

    return filename



def runBayes(data_dir, filename):

    print(f"\n{data_dir}/{filename}")

    start_time = time.process_time()
    X = utils.extractData(f"{data_dir}/{filename}.gauss.csv", "data")
    C = utils.extractData(f"{data_dir}/{filename}.cat.csv", "data_int")
    
    est = DiagGaussianCategorical(alpha = 1.0, m_0_partial = 0, k_0 = 0.03, v_0_partial = 3, S_0_partial = 0.3, gamma = 0.2, K_initial = 3, iterations=30, runs = 5)
    
    est.sample(X, C)
    bayes_pred = est.assignments()
    utils.saveData(f"{data_dir}/{filename}.pred.bayes.labels", bayes_pred, "labels")
    utils.saveData(f"{data_dir}/{filename}.pred.bayes.time", time.process_time() - start_time, "single")

def runMMM(data_dir, filename):
    start_time = time.process_time()

    X = utils.extractData(f"{data_dir}/{filename}.gauss.csv", "data")
    C = utils.extractData(f"{data_dir}/{filename}.cat.csv", "data_int")
    mixed_data = np.concatenate((X, C), axis=1)

    utils.saveData(f"{data_dir}/{filename}.mixed.csv", mixed_data, 'data')
    os.system(f"julia mmm.jl {data_dir}/{filename}.mixed.csv")
    z_pred_mmm = utils.extractData(f"{data_dir}/{filename}.mixed.csv.labels.thichet", "labels")
    
    utils.saveData(f"{data_dir}/{filename}.pred.mmm.labels", z_pred_mmm, "labels")
    utils.saveData(f"{data_dir}/{filename}.pred.mmm.time", time.process_time() - start_time, "single")



def runBayesGauss(data_dir, filename):
    
    print(f"\n{filename}")

    start_time = time.process_time()
    
    X = utils.extractData(f"{data_dir}/{filename}.gauss.csv", "data")
    C = utils.extractData(f"{data_dir}/{filename}.cat.csv", "data_int")
    one_hot_C = np.zeros((N, cD*M))
    encoder = OneHotEncoder(sparse=False)
    one_hot_C = encoder.fit_transform(C)
    mixed_data_one_hot = np.concatenate((X, one_hot_C), axis=1)
    
    est = DiagGaussianFS(alpha = 1.0, m_0_partial = 0, k_0 = 0.03, v_0_partial = 3, S_0_partial = 0.3, K_initial = K, iterations=30, runs = 2)
    est.sample(mixed_data_one_hot)
    bayes_pred = est.assignments()
    utils.saveData(f"{data_dir}/{filename}.pred.bayes.gauss.labels", bayes_pred, "labels")
    utils.saveData(f"{data_dir}/{filename}.pred.bayes.gauss.time", time.process_time() - start_time, "single")



def runEM_diag(data_dir, filename):
    
    print(f"\n{filename}")

    start_time = time.process_time()
    training_runs_EM = 2
    EM_gmm = GaussianMixture(n_components=K, n_init=training_runs_EM, covariance_type = "diag")
    X = utils.extractData(f"{data_dir}/{filename}.gauss.csv", "data")
    C = utils.extractData(f"{data_dir}/{filename}.cat.csv", "data_int")
    one_hot_C = np.zeros((N, cD*M))
    encoder = OneHotEncoder(sparse=False)
    one_hot_C = encoder.fit_transform(C)
    mixed_data_one_hot = np.concatenate((X, one_hot_C), axis=1)
    EM_gmm.fit(mixed_data_one_hot)
    
    print(f"BIC: {EM_gmm.bic(mixed_data_one_hot)}\nK: {K}")
    z_pred_EM = EM_gmm.predict(mixed_data_one_hot)
    utils.saveData(f"{data_dir}/{filename}.pred.em_diag.labels", z_pred_EM, "labels")
    utils.saveData(f"{data_dir}/{filename}.pred.em_diag.time", time.process_time() - start_time, "single")

