In [26]:
import h5py
import numpy as np
from umap import UMAP

# import matplotlib.pyplot as plt
# import seaborn as sns
import plotly.graph_objects as go

In [3]:
# Root directory; each clade's embedding file lives in a sub-folder named after it.
base_path = "./saved/timout"

# Plot colour for every insect order. The insertion order here is also the
# order in which the clades are listed and loaded.
color_map = dict(
    coleoptera="red",
    diptera="blue",
    hemiptera="green",
    homoptera="orange",
    hymenoptera="purple",
    lepidoptera="pink",
    odonata="brown",
    orthoptera="yellow",
)

# Clade names, taken from the colour table so the two cannot disagree.
clades = list(color_map)


In [25]:
def print_keys(filename):
    """Print the list of top-level entry names stored in an HDF5 file.

    Args:
        filename: Path of the ``.h5`` file; it is opened read-only.
    """
    with h5py.File(filename, 'r') as h5_file:
        entry_names = [name for name in h5_file.keys()]
    print(entry_names)

# Inspect one clade's .h5 file to see its structure: this prints the names of
# its top-level entries (the same names later used as hover labels in the plots).
print_keys(f'{base_path}/coleoptera/per_protein_embeddings.h5')

['CAK1651321_1 Protein timeless homolog [Acanthoscelides obtectus]', 'CAK1651322_1 Protein timeless homolog [Acanthoscelides obtectus]', 'CAK1651328_1 Protein timeless homolog [Acanthoscelides obtectus]', 'CAK1651332_1 Protein timeless homolog [Acanthoscelides obtectus]', 'CAK1657700_1 Protein timeless homolog [Acanthoscelides obtectus]', 'CAK1664182_1 Protein timeless homolog [Acanthoscelides obtectus]', 'KAF7271216_1 hypothetical protein GWI33_015878 [Rhynchophorus ferrugineus]', 'KAF7275727_1 hypothetical protein GWI33_011329, partial [Rhynchophorus ferrugineus]', 'KAF7276150_1 hypothetical protein GWI33_010874, partial [Rhynchophorus ferrugineus]', 'KAF7276725_1 hypothetical protein GWI33_009884, partial [Rhynchophorus ferrugineus]', 'KAF7277062_1 hypothetical protein GWI33_009485, partial [Rhynchophorus ferrugineus]', 'KAF7285717_1 hypothetical protein GWI33_010138 [Rhynchophorus ferrugineus]', 'XP_008201051_1 protein timeless homolog isoform X1 [Tribolium castaneum]', 'XP_0158407

In [32]:
def load_embeddings(path):
    """Load every entry of an HDF5 file, together with the matching entry names.

    Args:
        path: Path of the ``.h5`` file; it is opened read-only.

    Returns:
        A ``(embeddings, keys)`` tuple. ``embeddings`` is ``np.array`` of the
        entries that loaded successfully, in the order the file yields them.
        ``keys`` is the list of names of those same entries, so ``keys[i]``
        always labels ``embeddings[i]``.

    Entries that raise while being read are reported on stdout and left out of
    BOTH outputs, so the names and the data never go out of step.
    """
    embeddings = []
    keys = []
    with h5py.File(path, 'r') as file:
        for key in file.keys():
            try:
                vector = np.array(file[key])
            except Exception as e:
                print(f"Could not load {key} due to: {e}")
                continue
            # Record the name only once the data is in hand; a skipped entry
            # must not shift the labels relative to the embeddings.
            embeddings.append(vector)
            keys.append(key)
    return np.array(embeddings), keys


In [44]:
# Read each clade's embedding file once, keeping the vectors and their entry
# names in two dictionaries keyed by clade name.
embeddings_data, keys_data = {}, {}
for clade in clades:
    file_path = f"{base_path}/{clade}/per_protein_embeddings.h5"
    clade_vectors, clade_names = load_embeddings(file_path)
    embeddings_data[clade] = clade_vectors
    keys_data[clade] = clade_names

{'coleoptera': ['CAK1651321_1 Protein timeless homolog [Acanthoscelides obtectus]', 'CAK1651322_1 Protein timeless homolog [Acanthoscelides obtectus]', 'CAK1651328_1 Protein timeless homolog [Acanthoscelides obtectus]', 'CAK1651332_1 Protein timeless homolog [Acanthoscelides obtectus]', 'CAK1657700_1 Protein timeless homolog [Acanthoscelides obtectus]', 'CAK1664182_1 Protein timeless homolog [Acanthoscelides obtectus]', 'KAF7271216_1 hypothetical protein GWI33_015878 [Rhynchophorus ferrugineus]', 'KAF7275727_1 hypothetical protein GWI33_011329, partial [Rhynchophorus ferrugineus]', 'KAF7276150_1 hypothetical protein GWI33_010874, partial [Rhynchophorus ferrugineus]', 'KAF7276725_1 hypothetical protein GWI33_009884, partial [Rhynchophorus ferrugineus]', 'KAF7277062_1 hypothetical protein GWI33_009485, partial [Rhynchophorus ferrugineus]', 'KAF7285717_1 hypothetical protein GWI33_010138 [Rhynchophorus ferrugineus]', 'XP_008201051_1 protein timeless homolog isoform X1 [Tribolium castaneum

In [39]:
# Stack every clade's embeddings into one matrix, and build a parallel array
# that holds the clade name of each row (used later to pick out rows per clade).
all_embeddings_3d = np.concatenate(tuple(embeddings_data.values()))
all_labels_3d = np.concatenate(
    [[clade] * len(data) for clade, data in embeddings_data.items()]
)

# 3-component UMAP. random_state=42 fixes the seed; per the warning this cell
# emits, setting it also forces n_jobs=1.
umap_reducer_3d = UMAP(
    n_components=3,
    n_neighbors=30,
    min_dist=0.0,
    random_state=42,
)

# Fit on the stacked matrix and keep the projected coordinates.
embedded_data_3d = umap_reducer_3d.fit_transform(all_embeddings_3d)


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [46]:
# Start from an empty figure and add one 3D scatter trace per clade.
fig = go.Figure()

for clade in embeddings_data:
    # Boolean mask selecting the projected rows that belong to this clade.
    clade_mask = all_labels_3d == clade
    clade_points = embedded_data_3d[clade_mask]
    fig.add_trace(
        go.Scatter3d(
            x=clade_points[:, 0],
            y=clade_points[:, 1],
            z=clade_points[:, 2],
            mode='markers',
            marker={"size": 5, "color": color_map[clade]},
            text=keys_data[clade],  # entry names shown on hover
            hoverinfo="text",
            name=clade,  # legend entry
        )
    )

# Title, axis captions, legend heading and canvas size.
fig.update_layout(
    title='3D UMAP Projection of Protein Embeddings by Insect Clade',
    scene={
        "xaxis_title": 'UMAP Dimension 1',
        "yaxis_title": 'UMAP Dimension 2',
        "zaxis_title": 'UMAP Dimension 3',
    },
    legend_title="Clade",
    width=1200,
    height=1000,
)

# Render the interactive figure.
fig.show()

In [18]:
# Stack every clade's embeddings into one matrix, and build a parallel array
# that holds the clade name of each row (used later to pick out rows per clade).
all_embeddings_2d = np.concatenate(tuple(embeddings_data.values()))
all_labels_2d = np.concatenate(
    [[clade] * len(data) for clade, data in embeddings_data.items()]
)

# 2-component UMAP. random_state=42 fixes the seed; per the warning this cell
# emits, setting it also forces n_jobs=1.
umap_reducer_2d = UMAP(
    n_components=2,
    n_neighbors=30,
    min_dist=0.0,
    random_state=42,
)

# Fit on the stacked matrix and keep the projected coordinates.
embedded_data_2d = umap_reducer_2d.fit_transform(all_embeddings_2d)


n_jobs value -1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [48]:
import plotly.graph_objects as go
import numpy as np

# Prerequisites: `embedded_data_2d` and `all_labels_2d` from the 2D UMAP cell,
# `embeddings_data` and `keys_data` from the loading cell, and `color_map`
# from the configuration cell.
# The colour table is deliberately not redefined here: the configuration
# cell's `color_map` is the single source of truth, so the 2D and 3D plots
# cannot drift apart in colouring.

# Create a new figure for 2D plotting
fig = go.Figure()

# Add one scatter trace per clade
for clade in embeddings_data.keys():
    # Boolean mask selecting the projected rows that belong to this clade
    indices = (all_labels_2d == clade)
    fig.add_trace(go.Scatter(
        x=embedded_data_2d[indices, 0],
        y=embedded_data_2d[indices, 1],
        mode='markers',
        marker=dict(
            size=5,
            color=color_map[clade],  # Color for each clade
        ),
        text=keys_data[clade],  # Entry names shown on hover
        hoverinfo="text",
        name=clade  # Name of the clade for the legend
    ))

# Set the title, axis labels, legend heading and canvas size
fig.update_layout(
    title='2D UMAP Projection of Protein Embeddings by Insect Clade',
    xaxis_title='UMAP Dimension 1',
    yaxis_title='UMAP Dimension 2',
    legend_title="Clade",
    width=1200,
    height=1000,
)

# Show plot
fig.show()