Code to reproduce the t-SNE figures shown in the main text and the supplementary material.


In [1]:
#@title Imports
from sklearn.decomposition import PCA
import plotly.graph_objects as go
from sklearn.manifold import TSNE
import plotly.express as px
from tqdm import tqdm
import pandas as pd
import numpy as np
import random

In [None]:
# Locations of the four CSV inputs: LazBF_sequences.csv, LazBF_sample.csv,
# LazDEF_sequences.csv and LazDEF_sample.csv.
# Edit these if the files are not in their default location, /LazBFDEF/data/.
LazBF_sequences_path, LazBF_sample_path = (
    '../data/LazBF_sequences.csv',
    '../data/LazBF_sample.csv',
)
LazDEF_sequences_path, LazDEF_sample_path = (
    '../data/LazDEF_sequences.csv',
    '../data/LazDEF_sample.csv',
)

In [2]:
# Read the four CSV files; each provides a 'sequences' and a 'labels' column,
# which are unpacked into two plain Python lists.
df = pd.read_csv(LazBF_sequences_path)
LazBF_sequences, LazBF_labels = (df[col].tolist() for col in ('sequences', 'labels'))

df = pd.read_csv(LazBF_sample_path)
LazBF_sample, LazBF_sample_labels = (df[col].tolist() for col in ('sequences', 'labels'))

df = pd.read_csv(LazDEF_sequences_path)
LazDEF_sequences, LazDEF_labels = (df[col].tolist() for col in ('sequences', 'labels'))

df = pd.read_csv(LazDEF_sample_path)
LazDEF_sample, LazDEF_sample_labels = (df[col].tolist() for col in ('sequences', 'labels'))

cutoff = 5000

# Window applied to every embedding array and to the sample labels: skip the
# last `cutoff` rows and keep the 5000 rows immediately before them.
# NOTE(review): the window length (5000) is hard-coded independently of
# `cutoff`, while the plotting cells also derive their t-SNE settings from
# `cutoff` -- the two values have to be kept in sync by hand.
_window = slice(-cutoff - 5000, -cutoff)

# Embeddings: load each .npy file and keep only the windowed rows.
lazbf_mlm_none = np.load("../Embeddings/LazBF_mlm_none.npy")[_window]
lazdef_mlm_none = np.load("../Embeddings/LazDEF_mlm_none.npy")[_window]

lazbf_mlm_lazbf = np.load("../Embeddings/LazBF_mlm_LazBF.npy")[_window]
lazdef_mlm_lazbf = np.load("../Embeddings/LazDEF_mlm_LazBF.npy")[_window]

lazbf_mlm_lazdef = np.load("../Embeddings/LazBF_mlm_LazDEF.npy")[_window]
lazdef_mlm_lazdef = np.load("../Embeddings/LazDEF_mlm_LazDEF.npy")[_window]

# The sample labels get the same window so they stay aligned row-for-row
# with the embeddings they are plotted against.
LazBF_sample_labels = LazBF_sample_labels[_window]
LazDEF_sample_labels = LazDEF_sample_labels[_window]

In [3]:
# t-sne for lazbf_mlm_none
# PCA (10 components) followed by a 2-D t-SNE of `lazbf_mlm_none`, drawn as a
# scatter plot coloured by `LazBF_sample_labels` and saved to lazbf_mlm_none.png.

# t-SNE (and PCA's randomized solver, when scikit-learn selects it) is
# stochastic; a fixed seed makes re-running the cell give the same figure.
seed = 0

pca = PCA(n_components=10, random_state=seed)
X_reduced = pca.fit_transform(lazbf_mlm_none)

tsne = TSNE(n_components=2, perplexity=cutoff/100, learning_rate=cutoff/12, random_state=seed)
X_embedded = tsne.fit_transform(X_reduced)

df = pd.DataFrame({
    't-SNE 1': X_embedded[:, 0],
    't-SNE 2': X_embedded[:, 1],
    'labels': LazBF_sample_labels,
})
# A scalar opacity applies to every marker; no per-point list is needed.
fig = px.scatter(
    df,
    x='t-SNE 1',
    y='t-SNE 2',
    color='labels',
    color_continuous_scale=['royalblue', 'crimson'],
    opacity=0.85,
)

fig.update_layout(
    autosize=False,
    width=500,
    height=500,
    # Equal data units on both axes: the y axis is anchored to the x axis.
    # An axis anchored to itself is ignored by plotly, so only one of the two
    # axes carries the constraint and it must reference the other one.
    yaxis=dict(scaleanchor="x", scaleratio=1, showgrid=False, zeroline=False, color='black'),
    xaxis=dict(showgrid=False, zeroline=False, color='black'),
    plot_bgcolor='#fff',
    paper_bgcolor='#fff',
    showlegend=False,
    coloraxis_showscale=False,
    font=dict(family="Arial"),
)

# Larger axis titles and tick labels.
fig.update_xaxes(title_font=dict(size=24), tickfont=dict(size=24))
fig.update_yaxes(title_font=dict(size=24), tickfont=dict(size=24))

# Frame the point cloud with a box 10 units outside the data extent.
x_range = [df['t-SNE 1'].min()-10, df['t-SNE 1'].max()+10]
y_range = [df['t-SNE 2'].min()-10, df['t-SNE 2'].max()+10]
fig.add_shape(
    type="rect",
    x0=x_range[0], y0=y_range[0], x1=x_range[1], y1=y_range[1],
    line=dict(color="black", width=1),
)

fig.write_image("lazbf_mlm_none.png", scale=3)

fig.show()

In [6]:
#@title t-sne for lazbf_mlm_lazdef
# PCA (10 components) followed by a 2-D t-SNE of `lazbf_mlm_lazdef`, drawn as a
# scatter plot coloured by `LazBF_sample_labels` and saved to lazbf_mlm_lazdef.png.

# t-SNE (and PCA's randomized solver, when scikit-learn selects it) is
# stochastic; a fixed seed makes re-running the cell give the same figure.
seed = 0

pca = PCA(n_components=10, random_state=seed)
X_reduced = pca.fit_transform(lazbf_mlm_lazdef)

tsne = TSNE(n_components=2, perplexity=cutoff/100, learning_rate=cutoff/12, random_state=seed)
X_embedded = tsne.fit_transform(X_reduced)

df = pd.DataFrame({
    't-SNE 1': X_embedded[:, 0],
    't-SNE 2': X_embedded[:, 1],
    'labels': LazBF_sample_labels,
})
# A scalar opacity applies to every marker; no per-point list is needed.
fig = px.scatter(
    df,
    x='t-SNE 1',
    y='t-SNE 2',
    color='labels',
    color_continuous_scale=['royalblue', 'crimson'],
    opacity=0.85,
)

fig.update_layout(
    autosize=False,
    width=500,
    height=500,
    # Equal data units on both axes: the y axis is anchored to the x axis.
    # An axis anchored to itself is ignored by plotly, so only one of the two
    # axes carries the constraint and it must reference the other one.
    yaxis=dict(scaleanchor="x", scaleratio=1, showgrid=False, zeroline=False, color='black'),
    xaxis=dict(showgrid=False, zeroline=False, color='black'),
    plot_bgcolor='#fff',
    paper_bgcolor='#fff',
    showlegend=False,
    coloraxis_showscale=False,
    font=dict(family="Arial"),
)

# Larger axis titles and tick labels.
fig.update_xaxes(title_font=dict(size=24), tickfont=dict(size=24))
fig.update_yaxes(title_font=dict(size=24), tickfont=dict(size=24))

# Frame the point cloud with a box 10 units outside the data extent.
x_range = [df['t-SNE 1'].min()-10, df['t-SNE 1'].max()+10]
y_range = [df['t-SNE 2'].min()-10, df['t-SNE 2'].max()+10]
fig.add_shape(
    type="rect",
    x0=x_range[0], y0=y_range[0], x1=x_range[1], y1=y_range[1],
    line=dict(color="black", width=1),
)

fig.write_image("lazbf_mlm_lazdef.png", scale=3)

fig.show()

In [13]:
#@title t-sne for lazbf_mlm_lazbf
# PCA (10 components) followed by a 2-D t-SNE of `lazbf_mlm_lazbf`, drawn as a
# scatter plot coloured by `LazBF_sample_labels` and saved to lazbf_mlm_lazbf.png.

# t-SNE (and PCA's randomized solver, when scikit-learn selects it) is
# stochastic; a fixed seed makes re-running the cell give the same figure.
seed = 0

pca = PCA(n_components=10, random_state=seed)
X_reduced = pca.fit_transform(lazbf_mlm_lazbf)

tsne = TSNE(n_components=2, perplexity=cutoff/100, learning_rate=cutoff/12, random_state=seed)
X_embedded = tsne.fit_transform(X_reduced)

df = pd.DataFrame({
    't-SNE 1': X_embedded[:, 0],
    't-SNE 2': X_embedded[:, 1],
    'labels': LazBF_sample_labels,
})
# A scalar opacity applies to every marker; no per-point list is needed.
fig = px.scatter(
    df,
    x='t-SNE 1',
    y='t-SNE 2',
    color='labels',
    color_continuous_scale=['royalblue', 'crimson'],
    opacity=0.85,
)

fig.update_layout(
    autosize=False,
    width=500,
    height=500,
    # Equal data units on both axes: the y axis is anchored to the x axis.
    # An axis anchored to itself is ignored by plotly, so only one of the two
    # axes carries the constraint and it must reference the other one.
    yaxis=dict(scaleanchor="x", scaleratio=1, showgrid=False, zeroline=False, color='black'),
    xaxis=dict(showgrid=False, zeroline=False, color='black'),
    plot_bgcolor='#fff',
    paper_bgcolor='#fff',
    showlegend=False,
    coloraxis_showscale=False,
    font=dict(family="Arial"),
)

# Larger axis titles and tick labels.
fig.update_xaxes(title_font=dict(size=24), tickfont=dict(size=24))
fig.update_yaxes(title_font=dict(size=24), tickfont=dict(size=24))

# Frame the point cloud with a box 10 units outside the data extent.
x_range = [df['t-SNE 1'].min()-10, df['t-SNE 1'].max()+10]
y_range = [df['t-SNE 2'].min()-10, df['t-SNE 2'].max()+10]
fig.add_shape(
    type="rect",
    x0=x_range[0], y0=y_range[0], x1=x_range[1], y1=y_range[1],
    line=dict(color="black", width=1),
)

fig.write_image("lazbf_mlm_lazbf.png", scale=3)

fig.show()

In [8]:
#@title t-sne for lazdef_mlm_none
# PCA (10 components) followed by a 2-D t-SNE of `lazdef_mlm_none`, drawn as a
# scatter plot coloured by `LazDEF_sample_labels` and saved to lazdef_mlm_none.png.

# t-SNE (and PCA's randomized solver, when scikit-learn selects it) is
# stochastic; a fixed seed makes re-running the cell give the same figure.
seed = 0

pca = PCA(n_components=10, random_state=seed)
X_reduced = pca.fit_transform(lazdef_mlm_none)

tsne = TSNE(n_components=2, perplexity=cutoff/100, learning_rate=cutoff/12, random_state=seed)
X_embedded = tsne.fit_transform(X_reduced)

df = pd.DataFrame({
    't-SNE 1': X_embedded[:, 0],
    't-SNE 2': X_embedded[:, 1],
    'labels': LazDEF_sample_labels,
})
# A scalar opacity applies to every marker; no per-point list is needed.
fig = px.scatter(
    df,
    x='t-SNE 1',
    y='t-SNE 2',
    color='labels',
    color_continuous_scale=['royalblue', 'crimson'],
    opacity=0.85,
)

fig.update_layout(
    autosize=False,
    width=500,
    height=500,
    # Equal data units on both axes: the y axis is anchored to the x axis.
    # An axis anchored to itself is ignored by plotly, so only one of the two
    # axes carries the constraint and it must reference the other one.
    yaxis=dict(scaleanchor="x", scaleratio=1, showgrid=False, zeroline=False, color='black'),
    xaxis=dict(showgrid=False, zeroline=False, color='black'),
    plot_bgcolor='#fff',
    paper_bgcolor='#fff',
    showlegend=False,
    coloraxis_showscale=False,
    font=dict(family="Arial"),
)

# Larger axis titles and tick labels.
fig.update_xaxes(title_font=dict(size=24), tickfont=dict(size=24))
fig.update_yaxes(title_font=dict(size=24), tickfont=dict(size=24))

# Frame the point cloud with a box 10 units outside the data extent.
x_range = [df['t-SNE 1'].min()-10, df['t-SNE 1'].max()+10]
y_range = [df['t-SNE 2'].min()-10, df['t-SNE 2'].max()+10]
fig.add_shape(
    type="rect",
    x0=x_range[0], y0=y_range[0], x1=x_range[1], y1=y_range[1],
    line=dict(color="black", width=1),
)

fig.write_image("lazdef_mlm_none.png", scale=3)

fig.show()

In [9]:
#@title t-sne for lazdef_mlm_lazbf
# PCA (10 components) followed by a 2-D t-SNE of `lazdef_mlm_lazbf`, drawn as a
# scatter plot coloured by `LazDEF_sample_labels` and saved to lazdef_mlm_lazbf.png.

# t-SNE (and PCA's randomized solver, when scikit-learn selects it) is
# stochastic; a fixed seed makes re-running the cell give the same figure.
seed = 0

pca = PCA(n_components=10, random_state=seed)
X_reduced = pca.fit_transform(lazdef_mlm_lazbf)

tsne = TSNE(n_components=2, perplexity=cutoff/100, learning_rate=cutoff/12, random_state=seed)
X_embedded = tsne.fit_transform(X_reduced)

df = pd.DataFrame({
    't-SNE 1': X_embedded[:, 0],
    't-SNE 2': X_embedded[:, 1],
    'labels': LazDEF_sample_labels,
})
# A scalar opacity applies to every marker; no per-point list is needed.
fig = px.scatter(
    df,
    x='t-SNE 1',
    y='t-SNE 2',
    color='labels',
    color_continuous_scale=['royalblue', 'crimson'],
    opacity=0.85,
)

fig.update_layout(
    autosize=False,
    width=500,
    height=500,
    # Equal data units on both axes: the y axis is anchored to the x axis.
    # An axis anchored to itself is ignored by plotly, so only one of the two
    # axes carries the constraint and it must reference the other one.
    yaxis=dict(scaleanchor="x", scaleratio=1, showgrid=False, zeroline=False, color='black'),
    xaxis=dict(showgrid=False, zeroline=False, color='black'),
    plot_bgcolor='#fff',
    paper_bgcolor='#fff',
    showlegend=False,
    coloraxis_showscale=False,
    font=dict(family="Arial"),
)

# Larger axis titles and tick labels.
fig.update_xaxes(title_font=dict(size=24), tickfont=dict(size=24))
fig.update_yaxes(title_font=dict(size=24), tickfont=dict(size=24))

# Frame the point cloud with a box 10 units outside the data extent.
x_range = [df['t-SNE 1'].min()-10, df['t-SNE 1'].max()+10]
y_range = [df['t-SNE 2'].min()-10, df['t-SNE 2'].max()+10]
fig.add_shape(
    type="rect",
    x0=x_range[0], y0=y_range[0], x1=x_range[1], y1=y_range[1],
    line=dict(color="black", width=1),
)

fig.write_image("lazdef_mlm_lazbf.png", scale=3)

fig.show()

In [10]:
#@title t-sne for lazdef_mlm_lazdef
# PCA (10 components) followed by a 2-D t-SNE of `lazdef_mlm_lazdef`, drawn as a
# scatter plot coloured by `LazDEF_sample_labels` and saved to lazdef_mlm_lazdef.png.

# t-SNE (and PCA's randomized solver, when scikit-learn selects it) is
# stochastic; a fixed seed makes re-running the cell give the same figure.
seed = 0

pca = PCA(n_components=10, random_state=seed)
X_reduced = pca.fit_transform(lazdef_mlm_lazdef)

tsne = TSNE(n_components=2, perplexity=cutoff/100, learning_rate=cutoff/12, random_state=seed)
X_embedded = tsne.fit_transform(X_reduced)

df = pd.DataFrame({
    't-SNE 1': X_embedded[:, 0],
    't-SNE 2': X_embedded[:, 1],
    'labels': LazDEF_sample_labels,
})
# A scalar opacity applies to every marker; no per-point list is needed.
fig = px.scatter(
    df,
    x='t-SNE 1',
    y='t-SNE 2',
    color='labels',
    color_continuous_scale=['royalblue', 'crimson'],
    opacity=0.85,
)

fig.update_layout(
    autosize=False,
    width=500,
    height=500,
    # Equal data units on both axes: the y axis is anchored to the x axis.
    # An axis anchored to itself is ignored by plotly, so only one of the two
    # axes carries the constraint and it must reference the other one.
    yaxis=dict(scaleanchor="x", scaleratio=1, showgrid=False, zeroline=False, color='black'),
    xaxis=dict(showgrid=False, zeroline=False, color='black'),
    plot_bgcolor='#fff',
    paper_bgcolor='#fff',
    showlegend=False,
    coloraxis_showscale=False,
    font=dict(family="Arial"),
)

# Larger axis titles and tick labels.
fig.update_xaxes(title_font=dict(size=24), tickfont=dict(size=24))
fig.update_yaxes(title_font=dict(size=24), tickfont=dict(size=24))

# Frame the point cloud with a box 10 units outside the data extent.
x_range = [df['t-SNE 1'].min()-10, df['t-SNE 1'].max()+10]
y_range = [df['t-SNE 2'].min()-10, df['t-SNE 2'].max()+10]
fig.add_shape(
    type="rect",
    x0=x_range[0], y0=y_range[0], x1=x_range[1], y1=y_range[1],
    line=dict(color="black", width=1),
)

fig.write_image("lazdef_mlm_lazdef.png", scale=3)

fig.show()

In [11]:
#@title t-sne for lazdef_mlm_pa
# Load the two "PA" embedding files, then run PCA (10 components) followed by
# a 2-D t-SNE of `lazdef_mlm_pa`, drawn as a scatter plot coloured by
# `LazDEF_sample_labels` and saved to lazdef_mlm_pa.png.

# Same row window as the other embeddings: skip the last `cutoff` rows and
# keep the 5000 rows before them. `lazbf_mlm_pa` is not plotted here; it is
# loaded for the lazbf_mlm_pa cell that follows.
lazbf_mlm_pa = np.load("../Embeddings/LazBF_mlm_PA.npy")[-cutoff-5000:-cutoff]
lazdef_mlm_pa = np.load("../Embeddings/LazDEF_mlm_PA.npy")[-cutoff-5000:-cutoff]

# t-SNE (and PCA's randomized solver, when scikit-learn selects it) is
# stochastic; a fixed seed makes re-running the cell give the same figure.
seed = 0

pca = PCA(n_components=10, random_state=seed)
X_reduced = pca.fit_transform(lazdef_mlm_pa)

tsne = TSNE(n_components=2, perplexity=cutoff/100, learning_rate=cutoff/12, random_state=seed)
X_embedded = tsne.fit_transform(X_reduced)

df = pd.DataFrame({
    't-SNE 1': X_embedded[:, 0],
    't-SNE 2': X_embedded[:, 1],
    'labels': LazDEF_sample_labels,
})
# A scalar opacity applies to every marker; no per-point list is needed.
fig = px.scatter(
    df,
    x='t-SNE 1',
    y='t-SNE 2',
    color='labels',
    color_continuous_scale=['royalblue', 'crimson'],
    opacity=0.85,
)

fig.update_layout(
    autosize=False,
    width=500,
    height=500,
    # Equal data units on both axes: the y axis is anchored to the x axis.
    # An axis anchored to itself is ignored by plotly, so only one of the two
    # axes carries the constraint and it must reference the other one.
    yaxis=dict(scaleanchor="x", scaleratio=1, showgrid=False, zeroline=False, color='black'),
    xaxis=dict(showgrid=False, zeroline=False, color='black'),
    plot_bgcolor='#fff',
    paper_bgcolor='#fff',
    showlegend=False,
    coloraxis_showscale=False,
    font=dict(family="Arial"),
)

# Larger axis titles and tick labels.
fig.update_xaxes(title_font=dict(size=24), tickfont=dict(size=24))
fig.update_yaxes(title_font=dict(size=24), tickfont=dict(size=24))

# Frame the point cloud with a box 10 units outside the data extent.
x_range = [df['t-SNE 1'].min()-10, df['t-SNE 1'].max()+10]
y_range = [df['t-SNE 2'].min()-10, df['t-SNE 2'].max()+10]
fig.add_shape(
    type="rect",
    x0=x_range[0], y0=y_range[0], x1=x_range[1], y1=y_range[1],
    line=dict(color="black", width=1),
)

fig.write_image("lazdef_mlm_pa.png", scale=3)

fig.show()

In [12]:
#@title t-sne for lazbf_mlm_pa
# PCA (10 components) followed by a 2-D t-SNE of `lazbf_mlm_pa`, drawn as a
# scatter plot coloured by `LazBF_sample_labels` and saved to lazbf_mlm_pa.png.
# `lazbf_mlm_pa` is loaded in the lazdef_mlm_pa cell, which must be run first.

# t-SNE (and PCA's randomized solver, when scikit-learn selects it) is
# stochastic; a fixed seed makes re-running the cell give the same figure.
seed = 0

pca = PCA(n_components=10, random_state=seed)
X_reduced = pca.fit_transform(lazbf_mlm_pa)

tsne = TSNE(n_components=2, perplexity=cutoff/100, learning_rate=cutoff/12, random_state=seed)
X_embedded = tsne.fit_transform(X_reduced)

df = pd.DataFrame({
    't-SNE 1': X_embedded[:, 0],
    't-SNE 2': X_embedded[:, 1],
    'labels': LazBF_sample_labels,
})
# A scalar opacity applies to every marker; no per-point list is needed.
fig = px.scatter(
    df,
    x='t-SNE 1',
    y='t-SNE 2',
    color='labels',
    color_continuous_scale=['royalblue', 'crimson'],
    opacity=0.85,
)

fig.update_layout(
    autosize=False,
    width=500,
    height=500,
    # Equal data units on both axes: the y axis is anchored to the x axis.
    # An axis anchored to itself is ignored by plotly, so only one of the two
    # axes carries the constraint and it must reference the other one.
    yaxis=dict(scaleanchor="x", scaleratio=1, showgrid=False, zeroline=False, color='black'),
    xaxis=dict(showgrid=False, zeroline=False, color='black'),
    plot_bgcolor='#fff',
    paper_bgcolor='#fff',
    showlegend=False,
    coloraxis_showscale=False,
    font=dict(family="Arial"),
)

# Larger axis titles and tick labels.
fig.update_xaxes(title_font=dict(size=24), tickfont=dict(size=24))
fig.update_yaxes(title_font=dict(size=24), tickfont=dict(size=24))

# Frame the point cloud with a box 10 units outside the data extent.
x_range = [df['t-SNE 1'].min()-10, df['t-SNE 1'].max()+10]
y_range = [df['t-SNE 2'].min()-10, df['t-SNE 2'].max()+10]
fig.add_shape(
    type="rect",
    x0=x_range[0], y0=y_range[0], x1=x_range[1], y1=y_range[1],
    line=dict(color="black", width=1),
)

fig.write_image("lazbf_mlm_pa.png", scale=3)

fig.show()