In [1]:
import logging
logging.basicConfig(level=logging.INFO)

import sys
import random
sys.path.insert(0, "D:/IIT/DDP/DDP/B-SOID")

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from new_clustering import *
from joblib import Parallel, delayed

In [2]:
# Directory (relative to this notebook) that holds the serialized clustering artifacts.
data_dir = "../../../data/2clustering"
# Features, embeddings and cluster labels saved by the strain-wise clustering step.
# NOTE(review): joblib.load unpickles arbitrary objects -- only load files from a trusted source.
with open(os.path.join(data_dir, "strainwise_labels.sav"), "rb") as f:
    feats, embedding, labels = joblib.load(f)
# Pairwise similarity data together with a threshold value stored in the same file.
# NOTE(review): the exact layout of `sim` is not visible here -- confirm against the code that wrote it.
with open(os.path.join(data_dir, "pairwise_sim.sav"), "rb") as f:
    sim, thresh = joblib.load(f)

# Try Similarity Measures

In [3]:
# Gather the per-strain clusters (keyed by cluster id) using a fixed 0.6 threshold.
clusters = collect_strainwise_clusters(feats, labels, embedding, thresh=0.6)

# Index the cluster ids by strain: the strain name is the text before the
# first ':' of every cluster id.
strain2idx = {}
for cluster_idx in clusters:
    strain = cluster_idx.split(':')[0]
    strain2idx.setdefault(strain, []).append(cluster_idx)

INFO:new_clustering:Strain: 129P3/J ; Features: (53990, 22) ; Embedding: (53990, 12) ; Labels: (53990,)
INFO:new_clustering:Strain: 129S1/SvlmJ ; Features: (53990, 22) ; Embedding: (53990, 12) ; Labels: (53990,)
INFO:new_clustering:Strain: 129X1/SvJ ; Features: (53990, 22) ; Embedding: (53990, 12) ; Labels: (53990,)
INFO:new_clustering:Strain: A/J ; Features: (53990, 22) ; Embedding: (53990, 12) ; Labels: (53990,)
INFO:new_clustering:Strain: AKR/J ; Features: (53990, 22) ; Embedding: (53990, 12) ; Labels: (53990,)
INFO:new_clustering:Strain: B6129PF1/J ; Features: (53990, 22) ; Embedding: (53990, 12) ; Labels: (53990,)
INFO:new_clustering:Strain: B6129SF1/J ; Features: (53990, 22) ; Embedding: (53990, 12) ; Labels: (53990,)
INFO:new_clustering:Strain: B6AF1/J ; Features: (53990, 22) ; Embedding: (53990, 12) ; Labels: (53990,)
INFO:new_clustering:Strain: B6C3F1/J ; Features: (53990, 22) ; Embedding: (53990, 12) ; Labels: (53990,)
INFO:new_clustering:Strain: B6CBAF1/J ; Features: (53990,

INFO:new_clustering:pooling 24 clusters from CB6F1/J with entropy ratio 0.9571596173349464
INFO:new_clustering:pooling 33 clusters from CBA/CaJ with entropy ratio 0.9331836210293518
INFO:new_clustering:pooling 41 clusters from CBA/J with entropy ratio 0.9568402811531074
INFO:new_clustering:pooling 29 clusters from CByB6F1/J with entropy ratio 0.9333462730131035
INFO:new_clustering:pooling 65 clusters from CZECHII/EiJ with entropy ratio 0.9661655802182814
INFO:new_clustering:pooling 16 clusters from DBA/1J with entropy ratio 0.6279957381926006
INFO:new_clustering:pooling 38 clusters from DBA/2J with entropy ratio 0.9524914122606575
INFO:new_clustering:pooling 5 clusters from FVB/NJ with entropy ratio 0.9399498293294577
INFO:new_clustering:pooling 52 clusters from I/LnJ with entropy ratio 0.9312728323399928
INFO:new_clustering:pooling 20 clusters from KK/HiJ with entropy ratio 0.7708926536941607
INFO:new_clustering:pooling 25 clusters from LG/J with entropy ratio 0.8918462878242392
INFO:

In [4]:
def cluster_similarity(cluster1, cluster2):
    """
    Score how well a linear discriminant separates two clusters.

    The feature rows of both clusters are pooled and labelled 0 (``cluster1``)
    or 1 (``cluster2``). An LDA model is fitted on the pooled data, the samples
    are projected onto the discriminant axis, and the ROC AUC of that
    projection against the true labels is returned.

    Parameters
    ----------
    cluster1, cluster2 : mapping
        Each must provide a ``"feats"`` entry holding a 2-D array of shape
        (n_samples, n_features); both arrays need the same number of columns.

    Returns
    -------
    float
        ROC AUC of the 1-D LDA projection. A value close to 0.5 means the
        clusters are hard to tell apart. The orientation of the discriminant
        axis is not tied to label 1, so the score may fall on either side of
        0.5 -- fold it around 0.5 if only the separability matters.
    """
    feats1, feats2 = cluster1["feats"], cluster2["feats"]
    X = np.vstack([feats1, feats2])
    # The two clusters need *different* labels; with a single class LDA has
    # nothing to discriminate and the AUC is undefined.
    y = np.hstack([np.zeros(feats1.shape[0]), np.ones(feats2.shape[0])])

    # Shuffle samples and labels together so the classes are interleaved.
    idx = np.random.permutation(X.shape[0])
    X, y = X[idx], y[idx]

    model = LinearDiscriminantAnalysis()
    model.fit(X, y)

    # Two classes give a single discriminant component; flatten it to 1-D scores.
    scores = model.transform(X).ravel()
    return roc_auc_score(y, scores)

In [33]:
def plot_random_strain(thresh):
    """
    Embed and plot the clusters of one randomly chosen, well-balanced strain.

    Strains are visited in random order and the first one whose cluster-size
    entropy ratio is at least ``thresh`` is selected. Its pooled features are
    standardised, embedded in 2-D with UMAP, grouped with
    ``cluster_with_hdbscan`` and shown as a scatter plot coloured by group.

    Parameters
    ----------
    thresh : float
        Minimum entropy ratio a strain must reach to be eligible.

    Raises
    ------
    ValueError
        If no strain in ``strain2idx`` reaches ``thresh``.
    """
    # Visit every strain at most once in random order: the pick is still
    # uniform over the eligible strains, but the search always terminates.
    candidates = list(strain2idx.keys())
    random.shuffle(candidates)

    for strain in candidates:
        X = [clusters[cluster_idx]["feats"] for cluster_idx in strain2idx[strain]]
        # label every row with the position of the cluster it came from
        y = [i * np.ones((x.shape[0],)) for i, x in enumerate(X)]
        X, y = np.vstack(X), np.hstack(y)
        counts = np.unique(y, return_counts=True)[1]
        prop = [x / y.size for x in counts]
        entropy_ratio = -sum(p * np.log2(p) for p in prop) / max_entropy(len(prop))
        if entropy_ratio >= thresh:
            print(f"found: {strain} with entropy: {entropy_ratio}")
            break
    else:
        raise ValueError(f"no strain has an entropy ratio >= {thresh}")

    mapper = umap.UMAP(min_dist=0.0, n_neighbors=500, n_components=2, densmap=True).fit(StandardScaler().fit_transform(X))
    embed = mapper.embedding_
    # NOTE(review): the meaning of [0.5, 1.0, 11] is defined by cluster_with_hdbscan -- confirm there.
    _, _, glabels, _ = cluster_with_hdbscan(embed, [0.5, 1.0, 11], {"prediction_data": True, "min_samples": 1})

    plt.figure(figsize=(10,10))
    plt.scatter(embed[:,0], embed[:,1], c=glabels, cmap="Spectral", s=0.2, alpha=0.75)
    plt.show()

In [None]:
# Plot one randomly chosen strain whose cluster-size entropy ratio is at least 0.9.
plot_random_strain(0.9)

found: CZECHII/EiJ with entropy: 0.9661655802182814


# Trim identified clusters

In [None]:
def get_entropy_ratio_for_strain(strain, clusters):
    """
    Measure how evenly a strain's samples are spread over its clusters.

    Parameters
    ----------
    strain : str
        Strain name, i.e. the part of a cluster id before the first ':'.
    clusters : dict
        Maps cluster ids to dicts that contain a ``"feats"`` array.

    Returns
    -------
    The Shannon entropy (base 2) of the strain's cluster-size proportions
    divided by ``max_entropy(number of clusters of that strain)``.
    """
    sizes = [
        info["feats"].shape[0]
        for key, info in clusters.items()
        if key.split(':')[0] == strain
    ]

    total = sum(sizes)
    fractions = [size / total for size in sizes]
    entropy = -sum(frac * np.log2(frac) for frac in fractions)

    return entropy / max_entropy(len(sizes))

def trim_clusters(clusters, sim_mat, thresh):
    """
    Drop all clusters of poorly balanced strains and re-index what is left.

    Cluster ids have the form ``"<strain>:<idx>:<k>"`` where ``k`` is the
    row/column of the cluster in ``sim_mat``. Every strain whose entropy ratio
    (see ``get_entropy_ratio_for_strain``) is below ``thresh`` is removed.

    Parameters
    ----------
    clusters : dict
        Maps cluster ids to their data.
    sim_mat : 2-D array
        Square matrix indexed by the ``k`` part of the cluster ids.
    thresh : float
        Minimum entropy ratio a strain needs to be kept.

    Returns
    -------
    (sim_mat, new_clusters)
        ``sim_mat`` restricted to the retained clusters, with rows/columns in
        ascending order of the original ``k``; and the retained clusters
        re-keyed as ``"<strain>:<idx>:<new k>"`` where ``new k`` is the
        position of the cluster in the trimmed matrix.
    """
    # find strains below threshold
    strains = {cluster_id.split(':')[0] for cluster_id in clusters}
    remove_strains = {
        strain for strain in strains
        if get_entropy_ratio_for_strain(strain, clusters) < thresh
    }

    # collect (matrix index, cluster id) for every cluster that survives
    retained = []
    for cluster_id in clusters:
        strain, _, k = cluster_id.split(':')
        if strain not in remove_strains:
            retained.append((int(k), cluster_id))

    print(f"Retained {len(retained)} out of {len(clusters)} clusters")

    # Slice the matrix in the SAME (ascending) order that the new indices are
    # assigned in. Slicing in dict-iteration order while numbering by sorted
    # order would mislabel rows whenever the ids are not already sorted by k.
    retain_k = sorted(k for k, _ in retained)
    sim_mat = sim_mat[:, retain_k]
    sim_mat = sim_mat[retain_k, :]
    idxmap = {k: i for i, k in enumerate(retain_k)}

    new_clusters = {}
    for k, cluster_id in retained:
        strain, idx, _ = cluster_id.split(':')
        new_clusters[f"{strain}:{idx}:{idxmap[k]}"] = clusters[cluster_id]

    return sim_mat, new_clusters

# Grouping Analysis

In [None]:
def embed_and_group_clusters(sim_mat):
    """
    Embed the rows of a similarity matrix in 2-D and group them.

    The matrix is reduced with UMAP, the embedding is passed to
    ``cluster_with_hdbscan`` and the points with a non-negative assignment are
    drawn as a scatter plot coloured by their group label.

    Returns the group labels for all rows, including those left out of the plot.
    """
    reducer = umap.UMAP(min_dist=0.0, n_neighbors=50, n_components=2)
    reducer.fit(sim_mat)
    points = reducer.embedding_

    assgn, _, glabels, _ = cluster_with_hdbscan(points, [1.5, 3], HDBSCAN_PARAMS)

    # only draw the points that received a non-negative assignment
    assigned = assgn >= 0
    shown = points[assigned]
    plt.scatter(shown[:, 0], shown[:, 1], c=glabels[assigned], s=5, cmap="Spectral")
    plt.show()

    return glabels

def same_strain_grouping_frac(glabels, clusters):
    """
    For every strain, compute (number of distinct groups) / (number of clusters).

    Parameters
    ----------
    glabels : sequence
        Group label of each cluster, indexed by the ``k`` part of the cluster id.
    clusters : dict
        Keys are cluster ids of the form ``"<strain>:<idx>:<k>"``.

    Returns
    -------
    dict
        Strain name -> fraction rounded to two decimals. A low value means the
        strain's clusters were mostly merged into the same groups.
    """
    label_of = dict(enumerate(glabels))

    labels_by_strain = {}
    for cluster_id in clusters:
        strain, _, k = cluster_id.split(':')
        labels_by_strain.setdefault(strain, []).append(label_of[int(k)])

    fractions = {}
    for strain, labs in labels_by_strain.items():
        fractions[strain] = round(np.unique(labs).size / len(labs), 2)
    return fractions

def avg_group_sim(glabels, clusters, sim_mat):
    """
    Average cluster similarity within each group and between each pair of groups.

    Parameters
    ----------
    glabels : array of int
        Group label of every cluster; position ``k`` refers to row/column ``k``
        of ``sim_mat``. Negative labels are treated as "no group".
    clusters : dict
        Unused; kept so the signature matches the other grouping helpers.
    sim_mat : 2-D array
        Square cluster-by-cluster similarity matrix.

    Returns
    -------
    within_group_sim : dict
        Group label -> mean similarity over all pairs of clusters in the group.
        Groups with fewer than two clusters have no pairs and map to NaN.
    between_group_sim : ndarray of shape (n, n), n = max label + 1
        Mean similarity over all cross pairs of two groups, folded around 0.5
        (``|x - 0.5| + 0.5``). Entries that are never filled, such as the
        diagonal, start at 0 and therefore come out as 1.0 after the fold.
    """
    members = {}
    for k, lab in enumerate(glabels):
        members.setdefault(lab, []).append(k)

    within_group_sim = {}
    for group, cluster_idx in members.items():
        pair_sims = [sim_mat[i, j] for i, j in combinations(cluster_idx, 2)]
        # a singleton group has no pairs: report NaN without taking the mean
        # of an empty array (which only adds a RuntimeWarning)
        within_group_sim[group] = np.mean(pair_sims) if pair_sims else np.nan

    n = int(np.max(glabels)) + 1
    between_group_sim = np.zeros((n, n))
    # Negative labels must not be used as matrix indices: -1 would wrap around
    # and overwrite the row/column of the last real group.
    real_groups = [group for group in members if group >= 0]
    for i, j in combinations(real_groups, 2):
        cross_sims = [sim_mat[c1, c2] for c1 in members[i] for c2 in members[j]]
        between_group_sim[i, j] = between_group_sim[j, i] = np.mean(cross_sims)

    between_group_sim = np.abs(between_group_sim - 0.5) + 0.5
    return within_group_sim, between_group_sim

## w/o NN imputing

In [None]:
# Baseline run: group the clusters using the similarity data exactly as loaded.
# NOTE(review): `thresh` is the value loaded from pairwise_sim.sav, whereas the
# first collect_strainwise_clusters call in this notebook used thresh=0.6 -- confirm this is intended.
clusters = collect_strainwise_clusters(feats, labels, embedding, thresh)
# Keep only strains with an entropy ratio of at least 0.5 and re-index the matrix accordingly.
tsim, tclusters = trim_clusters(clusters, similarity_matrix(sim), thresh=0.5)
# Embed the trimmed similarity matrix, group the clusters and plot the embedding.
no_impute_glabs = embed_and_group_clusters(tsim)
# Per strain: distinct groups / number of clusters.
no_impute_group_frac = same_strain_grouping_frac(no_impute_glabs, tclusters)
# Mean similarity within each group and between every pair of groups.
no_impute_wgs, no_impute_bgs = avg_group_sim(no_impute_glabs, tclusters, tsim)

## w/ NN imputing

In [None]:
# Same pipeline as the run without imputing, but the similarity data first
# goes through impute_same_strain_values.
clusters = collect_strainwise_clusters(feats, labels, embedding, thresh)
# NOTE(review): presumably fills in similarity values for cluster pairs of the same strain -- confirm in new_clustering.
imputing_sim = impute_same_strain_values(sim, clusters)
# Keep only strains with an entropy ratio of at least 0.5 and re-index the matrix accordingly.
impute_tsim, impute_tclusters = trim_clusters(clusters, similarity_matrix(imputing_sim), thresh=0.5)
# Embed the trimmed similarity matrix, group the clusters and plot the embedding.
impute_glabs = embed_and_group_clusters(impute_tsim)
# Per strain: distinct groups / number of clusters.
impute_group_frac = same_strain_grouping_frac(impute_glabs, impute_tclusters)
# Mean similarity within each group and between every pair of groups.
impute_wgs, impute_bgs = avg_group_sim(impute_glabs, impute_tclusters, impute_tsim)

In [None]:
# Long-format table with two rows per strain ("impute" first, then "no impute"),
# which is the layout seaborn needs for a grouped bar plot.
records = []
for strain in impute_group_frac.keys():
    records.append({"strain": strain, "type": "impute", "frac": impute_group_frac[strain]})
    records.append({"strain": strain, "type": "no impute", "frac": no_impute_group_frac[strain]})
group_frac = pd.DataFrame(records, columns=["strain", "type", "frac"])

# One pair of horizontal bars per strain, with a dashed reference line at 0.5.
plt.figure(figsize=(5, 10))
sns.barplot(data=group_frac, x="frac", y="strain", hue="type")
plt.plot([0.5, 0.5], plt.ylim(), "--", c="0.8")
plt.title("Strainwise Grouping Fraction")
plt.show()

In [None]:
fig, axs = plt.subplots(2, 2, figsize=(15, 15))

# Top row: run without imputing; bottom row: run with imputing.
# Left column: within-group similarity bars with a dashed reference line at 0.5.
# Right column: heatmap of the between-group similarity matrix.
for row, (wgs, bgs) in enumerate(((no_impute_wgs, no_impute_bgs), (impute_wgs, impute_bgs))):
    bar_ax, heat_ax = axs[row][0], axs[row][1]
    bar_ax.barh(list(wgs.keys()), list(wgs.values()))
    bar_ax.plot([0.5, 0.5], bar_ax.get_ylim(), '--', color='0.8')
    sns.heatmap(bgs, ax=heat_ax)
plt.show()

In [None]:
# Matrix form of the loaded pairwise similarity data, as produced by similarity_matrix.
M = similarity_matrix(sim)

In [None]:
import cv2

# Show the similarity matrix in a native OpenCV window and block until a key
# is pressed. The window is destroyed afterwards (even if the wait is
# interrupted) so that no unresponsive window is left attached to the kernel.
cv2.imshow("heatmap", M)
try:
    cv2.waitKey(0)
finally:
    cv2.destroyAllWindows()

In [None]:
from sklearn.datasets import make_blobs

# Toy check of the LDA projection: two unbalanced Gaussian blobs (2000 vs 500
# samples) in n dimensions, projected onto the single discriminant axis.
n = 22
# One centre per class, each with n coordinates. When `centers` is an array,
# make_blobs takes the number of features from its second dimension, so the
# centres must be n-dimensional for the n_features argument to take effect.
centers = np.vstack((np.zeros(n), np.ones(n)))
X, y = make_blobs([2000, 500], n, centers=centers, cluster_std=1, shuffle=True)
model = LinearDiscriminantAnalysis().fit(X, y)
Xproj = model.transform(X)

# 1-D projection drawn along the x-axis, coloured by true class.
plt.scatter(Xproj, np.zeros_like(Xproj), c=y, s=1)
plt.show()

# Cluster Data with Frames

In [None]:
from prediction import extract_data_from_video

# Project object built from the YAML config; passed to the extraction helper below.
bsoid = BSOID("../config/config.yaml")
video_dir = "../../../data/videos"
# Videos (.avi) and raw files (.h5) live in the same directory.
video_files = sorted([os.path.join(video_dir, f) for f in os.listdir(video_dir) if f.endswith(".avi")])
raw_files = sorted([os.path.join(video_dir, f) for f in os.listdir(video_dir) if f.endswith(".h5")])

# Files are paired purely by position in the two sorted lists.
# NOTE(review): this assumes every .avi has a matching .h5 that sorts to the same
# position; zip silently drops the surplus if the counts differ -- confirm.
video_data = [extract_data_from_video(bsoid, raw_file, video_file) for raw_file, video_file in zip(raw_files, video_files)]
for 