In [2]:
import numpy as np
import os
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.express as px
import plotly.graph_objects as go

# Directory scanned for saved NumPy arrays; every '*.npy' file becomes one
# point in the plots below.
input_dir = '/root/autodl-tmp/Wendell/pca'

# os.listdir() returns entries in arbitrary, filesystem-dependent order.
# Sorting fixes the row order of the embedding matrix, so the seeded t-SNE
# below yields the same layout on every machine and every re-run.
embedding_files = sorted(f for f in os.listdir(input_dir) if f.endswith('.npy'))
if not embedding_files:
    # np.stack([]) would fail anyway with an opaque message; fail early with
    # the same exception type and say which directory was empty.
    raise ValueError(f"No .npy embedding files found in {input_dir!r}")

# Collapse each array to a single vector by averaging over its first axis
# (presumably the per-residue/token axis -- TODO confirm), then stack the
# vectors into one matrix with a row per file, aligned with embedding_files.
all_embeddings = []
for fname in embedding_files:
    arr = np.load(os.path.join(input_dir, fname))
    all_embeddings.append(arr.mean(axis=0))
all_embeddings = np.stack(all_embeddings)


# Number of points (one row per embedding file).
n_samples = all_embeddings.shape[0]

# t-SNE into at most 50 dimensions (fewer when there are few samples).
# scikit-learn requires perplexity < n_samples; the default of 30 therefore
# breaks on small collections, so cap it the same way n_components is capped.
# With 31 or more files this evaluates to the default 30.0, i.e. no change.
# method='exact' is needed because the default Barnes-Hut solver only
# supports n_components < 4.
# NOTE(review): running t-SNE first and PCA afterwards is the reverse of the
# usual "PCA to ~50 dims, then t-SNE to 2/3 dims" recipe -- confirm that this
# order is intended.
tsne = TSNE(n_components=min(50, n_samples - 1),
            perplexity=min(30.0, n_samples - 1),
            random_state=42,
            method='exact',
            init='random')
embeddings_tsne = tsne.fit_transform(all_embeddings)


# Reduce the t-SNE output with two independent PCA fits: one keeping two
# components (for the 2D plot) and one keeping three (for the 3D plot).
pca2d, pca3d = (
    PCA(n_components=dims).fit_transform(embeddings_tsne) for dims in (2, 3)
)

# One row per embedding file: the file name plus its 2D (x2, y2) and
# 3D (x3, y3, z3) coordinates. Transposing turns each component into a row so
# it can be paired with its column name.
df = pd.DataFrame({
    'File': embedding_files,
    **dict(zip(('x2', 'y2'), pca2d.T)),
    **dict(zip(('x3', 'y3', 'z3'), pca3d.T)),
})


# 2D scatter of the first two principal components; every point is labelled
# with (and shows on hover) the name of the file it came from.
fig2d = px.scatter(
    df,
    x='x2',
    y='y2',
    text='File',
    hover_name='File',
    title='2D Protein Embedding Visualization',
    labels={'x2': 'PC1', 'y2': 'PC2'},
)

# Bigger, slightly transparent markers with small labels placed above them.
fig2d.update_traces(
    marker={'size': 12, 'opacity': 0.8},
    textfont={'size': 10},
    textposition='top center',
)
fig2d.update_layout(width=1200, height=800)

# Inline rendering needs the `nbformat` package (>=4.2.0); without it this
# call raises "ValueError: Mime type rendering requires nbformat>=4.2.0".
fig2d.show()


ValueError: Mime type rendering requires nbformat>=4.2.0 but it is not installed

In [None]:


# 3D scatter of the three-component PCA coordinates stored in `df` (built in
# the previous cell); each marker carries its file name as a text label.
fig3d = go.Figure(data=[
    go.Scatter3d(
        x=df['x3'],
        y=df['y3'],
        z=df['z3'],
        text=df['File'],
        mode='markers+text',
        textposition="top center",
        marker={'size': 8, 'opacity': 0.8},
    )
])

# Title, axis captions and canvas size.
fig3d.update_layout(
    title='3D Protein Embedding Visualization',
    scene={
        'xaxis_title': 'PC1',
        'yaxis_title': 'PC2',
        'zaxis_title': 'PC3',
    },
    width=1200,
    height=1000,
)
fig3d.show()
