In [15]:
import numpy as np
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from typing import Any
from plotly import graph_objects as go, subplots as sp

from local.caching import save, load, save_exists

# this is just an optimization package
# may not be available for some systems
try:
    from sklearnex import patch_sklearn
    patch_sklearn()
except: pass

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [16]:
# mappings = load("1k_cy_mapping")
mappings = load("ad_mapping")
print(len(mappings))

recovering & decompressing cached data from [{WORKSPACE}/main/scratch/cache/ad_mapping.pkl.gz]
2495


In [17]:
original_x = np.array([emb for doi, t, a, emb in mappings])
original_x.shape

(2495, 1536)

In [18]:
model = PCA(n_components=2)
pca_x = model.fit_transform(original_x)
pca_x.shape

(2495, 2)

In [19]:
save_name = "latentx"
regen = False
regen = True

R = 80
if not regen and save_exists(save_name):
    latentx = load(save_name)
else:
    rand_seed = 36
    model = TSNE(n_components=2, random_state=rand_seed, perplexity=30)
    latentx = model.fit_transform(original_x)
    a, b = latentx.min(axis=0), latentx.max(axis=0)
    middle = (a+b)/2
    scale = (b-a)/2
    latentx[:, 0] -= middle[0]
    latentx[:, 1] -= middle[1]
    latentx[:,0] /= scale[0]
    latentx[:,1] /= scale[1]
    latentx *= R*0.90
    
    save(save_name, latentx)
latentx.shape



compressing & caching data to [{WORKSPACE}/main/scratch/cache/latentx.pkl.gz]


(2495, 2)

In [24]:
# query = "Comparative genomics uncovers the prolific and distinctive"
# query = "Phylum-wide comparative genomics unravel the diversity of secondary metabolism in Cyanobacteria"
# query = "Minimal genomes, maximal productivity: comparative genomics of the photosystem"
# query = "Comparative genomics analysis of NtcA regulons in cyanobacteria: regulation of nitrogen"
# query = "cTFbase: a database for comparative genomics of transcription factors in cyanobacteria"

# query = "a novel method for"
query = "Genome enrichment"

def divide_chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i + n]
        
found_x, found_y, found_title, found_is = 0, 0, "", []
def _print(x, y, doi, ftitle, fabstract):
    if doi != "": print(f"https://doi.org/{doi}")
    print(f"loc: {x:0.2f}, {y:0.2f}")
    print(ftitle)
    print()
    print("\n".join(" ".join(c) for c in divide_chunks(fabstract.split(" "), 12)))
    print()

query = query.lower()
for i, (doi, ftitle, fabstract, x, y) in enumerate([(doi, t, a, x, y) for (doi, t, a, e), (x, y) in zip(mappings, latentx)]):
    if query not in ftitle.lower(): continue
    found_is.append(i)
    found_x, found_y, found_title = x, y, ftitle
    _print(x, y, doi, ftitle, fabstract)

https://doi.org/10.1101/gr.277266.122
loc: 10.03, -16.53
Genome enrichment of rare and unknown species from complicated microbiome by nanopore selective sequencing.

Rare species are vital members of a microbial community, but retrieving their
genomes is difficult due to their low abundance. The ReadUntil (RU) approach
allows nanopore devices to sequence specific DNA molecules selectively in real-time, which
provides an opportunity for enriching rare species. Despite the robustness of enriching
rare species by reducing the sequencing depth of known host sequences, such
as the human genome, there is still a gap in RU-based enriching
of rare species in environmental samples whose community composition is unclear, and
many rare species have poor or incomplete reference genomes in public databases.
Therefore, here we present metaRUpore to overcome this challenge. We applied metaRUpore
to a thermophilic anaerobic digester (TAD) community and human gut microbial community,
it reduced coverag

In [25]:
proximity, count = 10, 99
# proximity, count = 99, 5

closest = []
for doi, ftitle, fabstract, x, y in [(doi, t, a, x, y) for (doi, t, a, e), (x, y) in zip(mappings, latentx)]:
    dist = np.sqrt((x-found_x)**2 + (y-found_y)**2)
    if dist > proximity: continue
    if ftitle == found_title: continue
    closest.append((dist, (x, y, doi, ftitle, fabstract)))

closest = sorted(closest, key=lambda t: t[0])[:count]
for dist, (x, y, doi, ftitle, fabstract) in closest:
    print(f"distance: {dist:0.2f}")
    _print(x, y, doi, ftitle, fabstract)

for i, (doi, ftitle, fabstract, x, y) in enumerate([(doi, t, a, x, y) for (doi, t, a, e), (x, y) in zip(mappings, latentx)]):
    for dist, (x, y, doi, _ftitle, fabstract) in closest:
        if _ftitle == ftitle: found_is.append(i)

distance: 1.62
https://doi.org/10.1186/2049-2618-2-11
loc: 9.90, -14.91
CopyRighter: a rapid tool for improving the accuracy of microbial community profiles through lineage-specific gene copy number correction.

Culture-independent molecular surveys targeting conserved marker genes, most notably 16S rRNA, to
assess microbial diversity remain semi-quantitative due to variations in the number of
gene copies between species.
Based on 2,900 sequenced reference genomes, we show that
16S rRNA gene copy number (GCN) is strongly linked to microbial phylogenetic
taxonomy, potentially under-representing Archaea in amplicon microbial profiles. Using this relationship, we
inferred the GCN of all bacterial and archaeal lineages in the Greengenes
database within a phylogenetic framework. We created CopyRighter, new software which uses
these estimates to correct 16S rRNA amplicon microbial profiles and associated quantitative
(q)PCR total abundance. CopyRighter parses microbial profiles and, because 

In [26]:
# settings
axis_col = 'rgba(0, 0, 0, 0.15)'
no_col = 'rgba(0, 0, 0, 0)'
axis_desc: dict = dict(linecolor=no_col, gridcolor=axis_col, zerolinecolor=axis_col, zerolinewidth=1)
layout = dict(
    autosize=False,
    width=800,
    height=800,
    margin=dict(
        l=25, r=25, b=25, t=25, pad=5
    ),
    # paper_bgcolor="white",
    font_family="Times New Roman",
    font_color="black",
    font_size=20,
    plot_bgcolor='white',
    xaxis=axis_desc,
    yaxis=axis_desc,
    xaxis2=axis_desc,
    yaxis2=axis_desc,
)

fig = sp.make_subplots(
    rows=1, cols=1, shared_xaxes=True, shared_yaxes=True, horizontal_spacing=0.02,
    # x_title="% Completeness"
)

s, o = 5, 0.3
fig.add_trace(
    go.Scatter(
        x = [x for i, (x, y) in enumerate(latentx) if i not in found_is],
        y = [y for i, (x, y) in enumerate(latentx) if i not in found_is],
        mode='markers',
        marker=dict(
            size=s,
            color='#3679c6',
            opacity=o
        ),
        showlegend=False,
        # text=[f"{t}<br><br>{'<br>'.join([' '.join(c) for c in divide_chunks(a.split(' '), 18)])}" for doi, t, a, e in mappings],
        text=['<br>'.join(' '.join(c) for c in divide_chunks(t.split(" "), 12)) for i, (doi, t, a, e) in enumerate(mappings) if i not in found_is],
    ),
    row=1, col=1,
)

fig.add_trace(
    go.Scatter(
        x = [found_x], y = [found_y],
        mode='markers',
        marker=dict(
            size=s*2,
            color='#f76e21',
            opacity=1
        ),
        showlegend=False,
        text=['<br>'.join(' '.join(c) for c in divide_chunks(found_title.split(" "), 12))],
    ),
    row=1, col=1,
)

fig.add_trace(
    go.Scatter(
        x = [x for i, (x, y, doi, ftitle, fabstract) in closest],
        y = [y for i, (x, y, doi, ftitle, fabstract) in closest],

        # x = [x for x, y in pca_x],
        # y = [y for x, y in pca_x],
        mode='markers',
        marker=dict(
            size=s*1.2,
            color='#32cd32',
            opacity=0.6
        ),
        showlegend=False,
        text=['<br>'.join(' '.join(c) for c in divide_chunks(ftitle.split(" "), 12)) for i, (x, y, doi, ftitle, fabstract) in closest]
    ),
    row=1, col=1,
)

# for i, ((doi, t, a, e), (x, y)) in enumerate(zip(mappings, latentx)):
#     if i % 250 != 0: continue

#     fig.add_annotation(
#         x=x, y=y,
#         xshift=-10, yshift=10,
#         text="<br>".join(" ".join(c) for c in divide_chunks(t.split(" "), 12)),
#         showarrow=True,
#         arrowhead=2,
#     )

_layout: dict[Any, Any] = layout.copy()
_layout.update(dict(
    hoverlabel=dict(
        # bgcolor="rgba(0, 0, 0, 0.2)",
        font_size=12,
        # font_family="Rockwell"
    ),
    xaxis=dict(title="", range=(-R, R), **axis_desc),
    yaxis=dict(title="", range=(-R, R), **axis_desc),
))
fig.update_annotations(font_size=10)
fig.update_layout(go.Layout(**_layout))
fig.show()