In [1]:
%config Completer.use_jedi = False

In [2]:
import logging
logging.basicConfig(level=logging.INFO)

import sys
sys.path.insert(0, "/home/laadd/DDP/B-SOID/")

import seaborn as sns
from BSOID.bsoid import *
from scipy.cluster.hierarchy import *
from scipy.spatial.distance import squareform



In [3]:
from BSOID.similarity import *
def estimate_similarity_between_clusters(templates, labels, points_per_class=600):
    """Compute a symmetric matrix of pairwise similarities between clusters.

    The inputs are first passed through ``find_templates`` with a point budget
    of ``points_per_class * (labels.max() + 1)``; the points it returns are
    grouped by their (returned) label, and ``density_separation_similarity``
    with the euclidean metric is evaluated once for every unordered pair of
    groups.

    Parameters
    ----------
    templates : data handed to ``find_templates`` together with ``labels``
    labels : array of cluster labels for ``templates``
    points_per_class : int, per-class multiplier for the point budget

    Returns
    -------
    numpy.ndarray of shape (k, k), where k is the number of distinct labels
    returned by ``find_templates``. Entry (i, j) holds the similarity between
    the i-th and j-th group (in sorted label order); the diagonal is left at 0.
    """
    original_classes = np.unique(labels)
    budget = points_per_class * (original_classes.max() + 1)

    X, labels = find_templates(labels, templates, num_points=budget)

    # One array of points per distinct label, in sorted label order.
    clusters = [X[np.where(labels == lab)[0]] for lab in np.unique(labels)]
    nclasses = len(clusters)

    sim_mat = np.zeros((nclasses, nclasses))
    for first, second in combinations(range(nclasses), 2):
        # Warnings raised while scoring a pair are deliberately silenced.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            pair_sim = density_separation_similarity(
                clusters[first], clusters[second], metric="euclidean"
            )
        # Each unordered pair is scored once and mirrored across the diagonal.
        sim_mat[first, second] = pair_sim
        sim_mat[second, first] = pair_sim

    return sim_mat
# sim = estimate_similarity_between_clusters(points_per_class=600)        

In [4]:
def plot(x, **kwargs):
    """Scatter-plot the first two columns of ``x`` via ``plt.scatter``.

    Parameters
    ----------
    x : 2-D array supporting ``x[:, 0]`` and ``x[:, 1]``
    **kwargs : forwarded unchanged to ``plt.scatter``; the marker size ``s``
        defaults to 1 when the caller does not supply it.
    """
    # The default has to be written into kwargs to reach scatter; a bare local
    # variable named `s` would never be passed on.
    kwargs.setdefault('s', 1)
    plt.scatter(x[:,0], x[:,1], **kwargs)

In [23]:
def get_cophenetic_mat(labels):
    """Return the binary co-membership matrix of a single labelling.

    Parameters
    ----------
    labels : numpy.ndarray of cluster labels, one per item

    Returns
    -------
    numpy.ndarray of dtype int64 and shape (labels.size, labels.size) whose
    entry (i, j) is 1 when items i and j carry the same label and 0 otherwise.
    The diagonal is always 1.
    """
    flat = labels.ravel()
    n_items = flat.size

    # Compare every item against every other item in one broadcast step.
    same_label = flat[:, None] == flat[None, :]

    # An item always co-occurs with itself, whatever the comparison says.
    same_label = same_label | np.identity(n_items, dtype=bool)
    return same_label.astype(np.int64)

def generate_evidence_accumulation_matrix(nruns=200):
    """Build an evidence-accumulation (co-association) matrix over clusters.

    The pooled dataset is loaded through ``BSOID``, each template is assigned
    a cluster by the saved model, and then ``nruns`` independent Ray tasks
    each (1) estimate the inter-cluster similarity matrix, (2) embed it with
    UMAP and (3) group the embedded clusters with ``cluster_with_hdbscan``.
    The co-membership matrices of all runs are summed and divided by
    ``nruns``.

    Parameters
    ----------
    nruns : int, number of independent similarity/embedding/clustering runs

    Returns
    -------
    numpy.ndarray of dtype float64 and shape (n_clusters, n_clusters), where
    ``n_clusters = clustering.max() + 1``. Entry (i, j) is the fraction of
    runs in which clusters i and j received the same group label.
    """
    bsoid = BSOID("../config/config.yaml")
    templates, _ = bsoid.load_pooled_dataset()

    os.environ["NUMBA_NUM_THREADS"] = "1"

    # joblib.load unpickles the file: only point this at model files you trust.
    with open("/home/laadd/data/models/max_label.model", "rb") as f:
        model = joblib.load(f)

    clustering = model.predict(templates).flatten()

    # Prepend the project directories to PYTHONPATH in the environment
    # (presumably so Ray worker processes can import them — confirm).
    os.environ["PYTHONPATH"] = "/home/laadd/DDP/B-SOID/:" + os.environ.get("PYTHONPATH", "")
    os.environ["PYTHONPATH"] = "/home/laadd/DDP/B-SOID/BSOID:" + os.environ.get("PYTHONPATH", "")
    os.environ["PYTHONPATH"] = "/home/laadd/DDP/B-SOID/stability:" + os.environ.get("PYTHONPATH", "")

    # psutil returns None when the physical core count cannot be determined;
    # fall back to 1 so the batch-size arithmetic below stays valid.
    num_cpus = psutil.cpu_count(logical=False) or 1
    ray.init(num_cpus=num_cpus)

    @ray.remote
    def get_labels(data):
        """Run one similarity -> UMAP -> HDBSCAN pass and return its labels."""
        templates, clustering = data
        sim_mat = estimate_similarity_between_clusters(templates, clustering, points_per_class=600)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            # NOTE(review): `sim_mat` is passed as a precomputed metric;
            # confirm it behaves as a distance rather than a similarity.
            embedding = umap.UMAP(metric="precomputed", min_dist=0.0, n_neighbors=2).fit_transform(sim_mat)
        labels = cluster_with_hdbscan(embedding, verbose=False, min_samples=1, prediction_data=True, cluster_range=[3, 5, 5], core_dist_n_jobs=1)[2]
        return labels

    results = []
    pbar = None
    try:
        # Store the shared inputs once in the object store instead of
        # serialising them for every task.
        data_id = ray.put([templates, clustering])
        futures = [get_labels.remote(data_id) for _ in range(nruns)]
        pbar = tqdm(total=len(futures))
        while futures:
            # Collect finished tasks in batches of at most `num_cpus`.
            n = min(len(futures), num_cpus)
            fin, futures = ray.wait(futures, num_returns=n)
            results.extend(ray.get(fin))
            pbar.update(len(fin))
    finally:
        # Always release the progress bar and the Ray runtime, even when a
        # task fails, so the next call can run ray.init() again.
        if pbar is not None:
            pbar.close()
        ray.shutdown()

    n_clusters = clustering.max() + 1
    ea_mat = np.zeros((n_clusters, n_clusters), dtype=np.int64)
    # Accumulate over the collected per-run labellings. `labels` only exists
    # inside get_labels, so it must not be iterated here.
    # Assumes every run yields exactly one label per cluster (length
    # n_clusters); otherwise the in-place addition fails on shape — TODO confirm.
    for run_labels in results:
        ea_mat += get_cophenetic_mat(run_labels)

    return ea_mat.astype(np.float64) / nruns
# embedding = umap.UMAP(metric="precomputed", min_dist=0.0, n_neighbors=2).fit_transform(sim)
# labels = cluster_with_hdbscan(embedding, verbose=True, min_samples=1, prediction_data=True, cluster_range=[3, 5, 5])[2]
# plot(embedding, c=labels, s=40, cmap="Set3")

In [None]:
# Build the evidence-accumulation matrix with the default number of runs (nruns=200).
ea_mat = generate_evidence_accumulation_matrix()

    Run ID       : dis
 Save Location   : /home/laadd/data/bsoid_data/dis/output
      FPS        : 30
 Min. Confidence : 0.3
  Stride Window  : 100ms



In [None]:
def cluster_eac(M):
    """Single-linkage hierarchical clustering of an evidence-accumulation matrix.

    Parameters
    ----------
    M : square, symmetric numpy.ndarray of co-association scores in which
        larger values mean "more often grouped together" and the diagonal
        holds the maximum value.

    Returns
    -------
    numpy.ndarray linkage matrix as produced by
    ``scipy.cluster.hierarchy.linkage`` (one row per merge).

    Raises
    ------
    ValueError
        From ``squareform`` when the derived distance matrix is not symmetric
        or its diagonal is not zero.
    """
    # Turn co-association into a distance: the largest score maps to 0.
    # (M.max() - M is never negative, so no absolute value is needed.)
    dist = M.max() - M

    # linkage() treats a 2-D array as raw observation vectors, so a square
    # distance matrix must be condensed first.
    condensed = squareform(dist)
    Z = linkage(condensed, method="single")
    return Z

In [22]:
# Shut down the Ray runtime manually (presumably cleanup after an interrupted run — confirm).
ray.shutdown()