In [1]:
import os
import numpy as np
import pandas as pd
# Swap in Intel's accelerated scikit-learn implementations. This is called
# before any `sklearn` import in the notebook (the first one is in a later
# cell); sklearnex documents that ordering as required for the patch to apply.
from sklearnex import patch_sklearn
patch_sklearn()

from local.caching import load, save

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Cache key of the pairwise distance matrix that is later fed to HDBSCAN
# with metric="precomputed".
# NOTE(review): judging by the key this is cosine distance between
# metabolites — confirm against the notebook that wrote the cache.
cosdis_save = "metabolite_cos_dist"
cosdis = load(cosdis_save)

recovering & decompressing cached data from [{WORKSPACE}/main/pre_cluster_y/cache/metabolite_cos_dist.pkl.gz]


In [3]:
from plotly import graph_objects as go, subplots as sp
from sklearn.neighbors import KNeighborsClassifier

# settings

# Two shared colours: a faint black for grid / zero lines and a fully
# transparent one that hides the axis line itself.
axis_col = 'rgba(0, 0, 0, 0.15)'
no_col = 'rgba(0, 0, 0, 0)'

# Axis styling applied to every x/y axis of the figures in this notebook.
axis_desc: dict = {
    "linecolor": no_col,
    "gridcolor": axis_col,
    "zerolinecolor": axis_col,
    "zerolinewidth": 1,
}

# Base figure layout; individual plots copy this dict and override entries.
layout = {
    "autosize": False,
    "width": 1400,
    "height": 650,
    "margin": {"l": 25, "r": 25, "b": 25, "t": 50, "pad": 5},
    # paper_bgcolor="white",
    "font_family": "Times New Roman",
    "font_color": "black",
    "font_size": 20,
    "plot_bgcolor": 'white',
    # All four axis slots point at the same axis_desc object.
    "xaxis": axis_desc,
    "yaxis": axis_desc,
    "xaxis2": axis_desc,
    "yaxis2": axis_desc,
}

In [4]:
# Cached 2-D coordinates used as scatter positions by the clustering plots;
# the plotting code iterates this as (x, y) pairs, one per clustered item.
# NOTE(review): the cache key suggests a t-SNE embedding with perplexity
# 100 — confirm against the notebook that produced it.
positions = load("tsne_p100")

recovering & decompressing cached data from [{WORKSPACE}/main/pre_cluster_y/cache/tsne_p100.pkl.gz]


In [7]:
from hdbscan import HDBSCAN

# Cast the distance matrix to float64 once, outside the epsilon sweep; this
# is the array HDBSCAN is fitted on with metric="precomputed".
# NOTE(review): presumably required because hdbscan's precomputed path
# rejects float32 input — confirm.
cosdis_d = cosdis.astype(np.double)
def try_one(e: float):
    """Cluster ``cosdis_d`` with HDBSCAN at one epsilon and plot the result.

    Fits HDBSCAN (``min_samples=5``, precomputed metric) with
    ``cluster_selection_epsilon=e``, draws one scatter trace per cluster at
    the coordinates in ``positions``, shows the figure, and returns the
    clusters.

    Args:
        e: Value passed as ``cluster_selection_epsilon``; also used verbatim
            as the figure title.

    Returns:
        A list of ``(label, member_indices)`` tuples sorted by cluster size,
        largest first. ``member_indices`` are row indices into ``cosdis_d``
        in ascending order. Label ``-1`` is HDBSCAN's noise label.

    Raises:
        ValueError: If ``positions`` and the fitted labels differ in length,
            which means the two caches are out of sync.
    """
    model = HDBSCAN(
        min_samples=5, metric="precomputed", core_dist_n_jobs=12,
        cluster_selection_epsilon=float(e),
    )
    model.fit(cosdis_d)
    group_assignments = model.labels_

    # Append in place: rebuilding the list with `+ [i]` for every point
    # copies it each time and is quadratic in cluster size.
    groups = {}
    for i, g in enumerate(group_assignments):
        groups.setdefault(g, []).append(i)
    sgroups = sorted(groups.items(), key=lambda t: len(t[1]), reverse=True)

    # Materialise the coordinates once so each trace can index its members
    # directly instead of scanning every point with a list-membership test.
    coords = [(x, y) for x, y in positions]
    if len(coords) != len(group_assignments):
        raise ValueError(
            f"positions has {len(coords)} points but HDBSCAN produced "
            f"{len(group_assignments)} labels"
        )

    fig = sp.make_subplots(
        rows=1, cols=1, shared_xaxes=True, shared_yaxes=True, horizontal_spacing=0.02,
    )

    s, o = 2, 0.5  # marker size and opacity shared by every trace
    for g, members in sgroups:
        # Noise and very small groups are drawn in a fixed grey; the rest
        # get plotly's default colour cycle.
        muted = g == -1 or len(members) < 5
        fig.add_trace(
            go.Scatter(
                x=[coords[i][0] for i in members],
                y=[coords[i][1] for i in members],
                mode='markers',
                marker=dict(size=s, opacity=o, color="#555555") if muted else dict(size=s, opacity=o),
                name=f"{g}",
            ),
            row=1, col=1,
        )

    # Shallow copy is enough: only top-level keys are replaced below, so the
    # shared module-level `layout` is left untouched.
    _layout = layout.copy()
    _layout.update(dict(
        xaxis=dict(title="", **axis_desc),
        yaxis=dict(title="", **axis_desc),
        width=500,
        height=450,
    ))
    fig.update_annotations(font_size=24)
    fig.update_layout(go.Layout(title=f"{e}", **_layout))
    fig.show()

    return sgroups

# Sweep cluster_selection_epsilon. labels[k] holds the size-sorted clusters
# returned by try_one for the k-th value below.
labels = []
for v in (0, 0.05, 0.07, 0.1, 0.2):
    labels += [try_one(v)]

In [10]:
df: pd.DataFrame = load('biocyc_metabolite_usage', alt_workspace="../prep/")
min_freq = 0.01
# Keep the columns whose column-wise total reaches min_freq of the row count.
# The positional indices are kept as a plain list of ints.
# NOTE(review): later cells index fdf.columns with cluster member indices,
# which assumes the distance matrix was built over exactly these columns in
# this order — confirm.
indexes_kept = np.flatnonzero((df.sum(axis=0) >= len(df)*min_freq).to_numpy()).tolist()
fdf = df.iloc[:, indexes_kept]
fdf.shape, df.shape

recovering & decompressing cached data from [../prep/cache/biocyc_metabolite_usage.pkl.gz]


((19999, 9849), (19999, 32681))

In [11]:
# Dump every cluster from the third epsilon run (labels[2]), skipping label
# -1: its size, one member column name per line, then a banner line padded
# by blank lines.
for g, members in labels[2]:
    if g != -1:
        print(len(members))
        for m in members:
            print(fdf.columns[m])
        print("", "#"*150, "", sep="\n")

1059
i_UTP
i_PPI
i_GLC-1-P
i_PROTON
i_Hydroxymethylbilane-Synthase-ES2
i_PORPHOBILINOGEN
i_HYDROGEN-PEROXIDE
i_MET
i_WATER
i_CPD0-2351
i_NAD
i_Dihydro-Lipoyl-Proteins
i_NADH
i_2-D-THREO-HYDROXY-3-CARBOXY-ISOCAPROATE
i_CPD-9451
i_S-ADENOSYLMETHIONINE
i_ATP
i_Acceptor
i_Hydroxymethylbilane-Synthase-ES3
i_Pi
i_Oxo-glutarate-dehydrogenase-DH-lipoyl
i_DIHYDROXY-ACETONE-PHOSPHATE
i_GTP
i_L-CITRULLINE
i_CARBAMOYL-P
i_Single-Stranded-DNAs
i_Ribonucleoside-Triphosphates
i_Hydroxymethylbilane-Synthase-ES4
i_23S-rRNA-guanine-2551
i_PROTEIN-LIPOYLLYSINE
i_DIHYDROLIPOYL-GCVH
i_NADP
i_NADPH
i_CPD-19169
i_CO-A
i_3-CARBOXY-3-HYDROXY-ISOCAPROATE
i_G3P
i_2-PG
i_FUM
i_R-3-Hydroxypalmitoyl-ACPs
i_L-ASPARTATE
i_O-UREIDOHOMOSERINE
i_Protein-Histidines
i_23-DIPHOSPHOGLYCERATE
i_Protein-pi-phospho-L-histidines
i_CPD-16758
i_SUC
i_ACETYL-COA
i_NAD-P-OR-NOP
i_HYPOXANTHINE
i_E-
i_ETR-Quinones
i_Beta-hydroxydecanoyl-ACPs
i_UDP-N-ACETYL-D-GLUCOSAMINE
i_OXALACETIC_ACID
i_FRUCTOSE-6P
i_CPD-15709
i_ETF-Oxidized
i_CPD