In [1]:
import os
import numpy as np
import pandas as pd
from sklearnex import patch_sklearn
patch_sklearn()

from local.caching import load, save

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
# Load the cached 'biocyc_metabolite_usage' table via the project's caching helper,
# reading from the sibling "../prep/" workspace instead of this notebook's own cache.
# NOTE(review): later cells sum and filter the columns and compute column-to-column
# distances, so columns presumably correspond to metabolites — confirm against the
# prep notebook that produced this file.
df: pd.DataFrame = load('biocyc_metabolite_usage', alt_workspace="../prep/")

recovering & decompressing cached data from [../prep/cache/biocyc_metabolite_usage.pkl.gz]


In [3]:
# Keep only the columns whose column total reaches `min_freq` times the number of rows.
min_freq = 0.01
column_totals = df.sum(axis=0)
min_total = len(df) * min_freq
# Positional indices (plain Python ints) of the columns that pass the threshold.
indexes_kept = np.flatnonzero((column_totals >= min_total).to_numpy()).tolist()
fdf = df.iloc[:, indexes_kept]
# Display filtered vs. original shape as the cell output.
fdf.shape, df.shape

((19999, 9849), (19999, 32681))

In [4]:
# Pairwise cosine distances between the columns of `fdf`.
# The result is computed once, stored through the caching helper, and simply
# reloaded on later runs when the cache file is already present.
cosdis_save = "metabolite_cos_dist"
if os.path.exists(f"cache/{cosdis_save}.pkl.gz"):
    cosdis = load(cosdis_save)
else:
    from scipy import sparse
    from sklearn.metrics.pairwise import pairwise_distances

    # Divide every column by its own mean and store the result as float32.
    mat = (fdf / fdf.mean(axis=0)).astype(dtype=np.float32)
    mat_sparse = sparse.csr_matrix(mat)
    print('start pairwise calc')
    # Transposed so that each column of `fdf` is one sample in the distance matrix.
    cosdis = pairwise_distances(mat_sparse.T, metric="cosine", n_jobs=6)
    save(cosdis_save, cosdis)

recovering & decompressing cached data from [{WORKSPACE}/main/pre_cluster_y/cache/metabolite_cos_dist.pkl.gz]


In [5]:
from plotly import graph_objects as go, subplots as sp
from sklearn.manifold import TSNE

# Shared plot settings, reused by every figure in this notebook.

# Faint black for grid and zero lines; fully transparent for the axis line itself.
axis_col = 'rgba(0, 0, 0, 0.15)'
no_col = 'rgba(0, 0, 0, 0)'
axis_desc: dict = {
    "linecolor": no_col,
    "gridcolor": axis_col,
    "zerolinecolor": axis_col,
    "zerolinewidth": 1,
}

# Base figure layout: fixed size, white plot background, and the same axis
# styling applied to the axes of both subplot columns.
layout = {
    "autosize": False,
    "width": 1400,
    "height": 650,
    "margin": {"l": 25, "r": 25, "b": 25, "t": 50, "pad": 5},
    # "paper_bgcolor": "white",
    "font_family": "Times New Roman",
    "font_color": "black",
    "font_size": 20,
    "plot_bgcolor": 'white',
    "xaxis": axis_desc,
    "yaxis": axis_desc,
    "xaxis2": axis_desc,
    "yaxis2": axis_desc,
}

In [6]:
def try_one(perp, random_state=0):
    """Run t-SNE on the precomputed distance matrix for one perplexity, cache and plot it.

    The 2-D embedding is computed from the module-level `cosdis` matrix, saved
    under the name ``tsne_p{perp}`` via the caching helper, and shown as a
    scatter plot titled with the perplexity.

    Args:
        perp: t-SNE perplexity; also used in the figure title and the cache name.
        random_state: Seed forwarded to ``TSNE``. With ``init="random"`` the
            embedding depends on it, so a fixed default keeps the cached
            positions and figures reproducible across runs. Pass ``None`` to
            get the previous unseeded behaviour.

    Returns:
        The embedded positions returned by ``TSNE.fit_transform`` (one row per
        sample, two columns).
    """
    model = TSNE(
        n_components=2,
        learning_rate='auto',
        init="random",  # required by scikit-learn when metric="precomputed"
        perplexity=perp,
        metric="precomputed",
        random_state=random_state,
    )
    positions = model.fit_transform(cosdis)

    # Persist before plotting so a rendering failure cannot discard the
    # expensive embedding.
    save(f"tsne_p{perp}", positions)

    def make_traces():
        """Build the scatter traces, one per subplot column that should be filled."""
        s, o = 5, 0.3
        return [
            go.Scatter(
                x=positions[:, 0],
                y=positions[:, 1],
                mode='markers',
                marker=dict(
                    size=s,
                    color='#222222',
                    opacity=o
                ),
            ),
        ]

    # NOTE(review): the grid has two columns but only one trace is produced, so
    # the right-hand panel stays empty — confirm whether that is intended.
    fig = sp.make_subplots(
        rows=1, cols=2, shared_xaxes=True, shared_yaxes=True, horizontal_spacing=0.02,
    )
    for i, tr in enumerate(make_traces()):
        fig.add_trace(tr, row=1, col=i+1)

    # Shallow copy so the shared `layout` dict is not modified by the overrides.
    _layout = layout.copy()
    _layout.update(dict(
        xaxis=dict(title="", **axis_desc),
        yaxis=dict(title="", **axis_desc),
    ))
    fig.update_annotations(font_size=24)
    fig.update_layout(go.Layout(title=f"perp:{perp}", **_layout))
    fig.show()

    return positions

# Run the embedding once per perplexity value, from smallest to largest;
# each call shows a figure and caches its positions.
for p in (3, 10, 33, 100, 500, 1000):
    try_one(p)

compressing & caching data to [{WORKSPACE}/main/pre_cluster_y/cache/tsne_p3.pkl.gz]


compressing & caching data to [{WORKSPACE}/main/pre_cluster_y/cache/tsne_p10.pkl.gz]


compressing & caching data to [{WORKSPACE}/main/pre_cluster_y/cache/tsne_p33.pkl.gz]


compressing & caching data to [{WORKSPACE}/main/pre_cluster_y/cache/tsne_p100.pkl.gz]


compressing & caching data to [{WORKSPACE}/main/pre_cluster_y/cache/tsne_p500.pkl.gz]


compressing & caching data to [{WORKSPACE}/main/pre_cluster_y/cache/tsne_p1000.pkl.gz]
