PROJECT CODE - SUCCESSFUL MAIN

In [23]:
import plotly.express as px
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from umap import UMAP
from sklearn.metrics import accuracy_score
import numpy as np

# Load the exported table. The embedding columns listed below are parsed later
# by parse_embedding, which expects each cell to be a bracketed numeric string.
df = pd.read_csv('isaac_db_playground_read_out_embeddings _updated.csv')
# Replace missing phenotypes with an explicit '<null>' label so they can be
# grouped in plots and filtered out before classification.
df['p.phenotype'] = df['p.phenotype'].fillna('<null>')

# Names of the DataFrame columns that are projected and evaluated below.
embedding_columns = [
    'embedding_prot_t5_xl_bfd',
    'embedding_esmc_600m',
    'embedding_esmc_300m',
    'embedding_facebook/esm2_t33_650M_UR50D',
    'embedding_prot_t5_xl_uniref50',
    'embedding_facebook/esm2_t36_3B_UR50D',
    'embedding_facebook/esm2_t12_35M_UR50D',
    'embedding_facebook/esm2_t6_8M_UR50D',
    'embedding_facebook/esm2_t30_150M_UR50D'
]

def parse_embedding(col):
    """Parse one embedding column of the global ``df`` into NumPy vectors.

    Each cell is expected to be a bracketed string of numbers such as
    ``'[0.1 0.2 0.3]'``. Values may be separated by whitespace and/or commas.
    A literal ``'...'`` token is skipped (presumably the ellipsis printed when
    a long array repr is abbreviated), so such a cell yields a shorter vector
    than a complete one.

    Args:
        col: Name of the column in ``df`` to parse.

    Returns:
        A pandas Series aligned with ``df`` whose values are 1-D float arrays.
        Cells that cannot be parsed (non-string values such as NaN/None, or
        non-numeric tokens) become an empty array so callers can detect and
        filter them via ``.size``.
    """
    def safe_parse(x):
        try:
            # Treat commas as whitespace so both '[1 2 3]' and '[1, 2, 3]' parse.
            tokens = x.strip('[]').replace(',', ' ').split()
            return np.array([float(tok) for tok in tokens if tok != '...'])
        except (AttributeError, ValueError):
            # AttributeError: the cell is not a string (e.g. NaN or None).
            # ValueError: a token is not a valid float literal.
            return np.array([])
    return df[col].apply(safe_parse)

def project_and_plot(col_name):
    """Project one embedding column to 2-D with UMAP and show a scatter plot.

    Points are coloured by ``p.phenotype``. The plot also marks the mean
    position of each phenotype and the mean position of all plotted points.

    Rows whose embedding could not be parsed (``parse_embedding`` returns an
    empty array for them) are left out of the projection and the plot, because
    stacking them with real vectors would raise a shape error.

    Args:
        col_name: Name of the embedding column in the global ``df``.

    Returns:
        None. The figure is displayed with ``fig.show()``.
    """
    parsed = parse_embedding(col_name)
    has_embedding = np.array([vec.size > 0 for vec in parsed], dtype=bool)
    n_dropped = int((~has_embedding).sum())
    if n_dropped:
        print(f"Skipping {n_dropped} row(s) without a parseable '{col_name}' embedding.")

    embeddings = np.vstack(parsed[has_embedding].to_list())
    reducer = UMAP(n_components=2, random_state=42)
    projection = reducer.fit_transform(embeddings)

    # Keep only the rows that were projected so the coordinates line up.
    df_plot = df.loc[has_embedding].copy()
    df_plot['UMAP_1'] = projection[:, 0]
    df_plot['UMAP_2'] = projection[:, 1]

    # Mean 2-D position of every phenotype group.
    phenotype_centers = (
        df_plot.groupby('p.phenotype')[['UMAP_1', 'UMAP_2']]
        .mean()
        .reset_index()
        .rename(columns={'UMAP_1': 'center_UMAP_1', 'UMAP_2': 'center_UMAP_2'})
    )

    # Mean 2-D position of all plotted points.
    global_center = df_plot[['UMAP_1', 'UMAP_2']].mean()

    fig = px.scatter(
        df_plot,
        x='UMAP_1', y='UMAP_2',
        color='p.phenotype',
        hover_data={'p.accession_id': True, 'p.name': True, 'p.phenotype': True},
        title=f'UMAP Projection - {col_name}',
        labels={'color': 'p.phenotype'}
    )

    fig.add_scatter(
        x=phenotype_centers['center_UMAP_1'],
        y=phenotype_centers['center_UMAP_2'],
        mode='markers+text',
        text=phenotype_centers['p.phenotype'],
        marker=dict(size=12, color='white', symbol='x'),
        textposition='top center',
        name='Phenotype Center'
    )

    fig.add_scatter(
        x=[global_center['UMAP_1']],
        y=[global_center['UMAP_2']],
        mode='markers+text',
        text=['Global Center'],
        marker=dict(size=14, color='red', symbol='star'),
        textposition='bottom right',
        name='Global Center',
        showlegend=True
    )

    # Small black dot drawn on top of the red star at the same coordinates.
    fig.add_scatter(
        x=[global_center['UMAP_1']],
        y=[global_center['UMAP_2']],
        mode='markers',
        marker=dict(size=6, color='black'),
        name='Global Center Point',
        showlegend=False
    )
    fig.update_layout(width=1200, height=700)
    fig.show()



def project_and_plot_with_knn(col_name, n_neighbors=5):
    """Plot a UMAP projection of one embedding column with KNN predictions.

    Rows labelled ``'<null>'`` and rows whose embedding could not be parsed
    are excluded. The remaining embeddings are reduced to 2-D with UMAP, a
    ``KNeighborsClassifier`` is fitted on the 2-D points, and the scatter plot
    shows the true phenotype as colour and the KNN prediction as marker symbol.
    Phenotype centres and the global centre are added as extra markers.

    Args:
        col_name: Name of the embedding column in the global ``df``.
        n_neighbors: Number of neighbours used by the KNN classifier.

    Returns:
        None. The accuracy is printed and the figure is displayed.
    """
    parsed = parse_embedding(col_name)
    labels = df['p.phenotype'].fillna('<null>').values

    # Build the mask BEFORE stacking: unparseable rows are empty arrays and
    # np.vstack would raise if they were stacked together with real vectors.
    has_embedding = np.array([vec.size > 0 for vec in parsed], dtype=bool)
    valid_mask = (labels != '<null>') & has_embedding
    embeddings = np.vstack(parsed[valid_mask].to_list())
    labels = labels[valid_mask]

    reducer = UMAP(n_components=2, random_state=42)
    projection = reducer.fit_transform(embeddings)

    # NOTE: the classifier is scored on the same points it was fitted on, so
    # this is a resubstitution accuracy rather than a held-out estimate.
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(projection, labels)
    knn_preds = knn.predict(projection)

    accuracy = accuracy_score(labels, knn_preds)
    print(f"Percentage of true phenotypes matching KNN predicted phenotypes: {accuracy * 100:.2f}%")

    df_plot = pd.DataFrame({
        'UMAP_1': projection[:, 0],
        'UMAP_2': projection[:, 1],
        'True Phenotype': labels,
        'KNN Prediction': knn_preds
    })

    # Mean 2-D position of every true-phenotype group.
    phenotype_centers = (
        df_plot.groupby('True Phenotype')[['UMAP_1', 'UMAP_2']]
        .mean()
        .reset_index()
        .rename(columns={'UMAP_1': 'center_UMAP_1', 'UMAP_2': 'center_UMAP_2'})
    )
    # Mean 2-D position of all plotted points.
    global_center = df_plot[['UMAP_1', 'UMAP_2']].mean()

    fig = px.scatter(
        df_plot,
        x='UMAP_1',
        y='UMAP_2',
        color='True Phenotype',
        symbol='KNN Prediction',
        hover_data=['True Phenotype', 'KNN Prediction'],
        title=f'UMAP Projection with KNN on {col_name}',
        labels={'color': 'True Phenotype', 'symbol': 'KNN Prediction'}
    )

    fig.add_scatter(
        x=phenotype_centers['center_UMAP_1'],
        y=phenotype_centers['center_UMAP_2'],
        mode='markers+text',
        text=phenotype_centers['True Phenotype'],
        marker=dict(size=12, color='white', symbol='x'),
        textposition='top center',
        name='Phenotype Center'
    )

    fig.add_scatter(
        x=[global_center['UMAP_1']],
        y=[global_center['UMAP_2']],
        mode='markers+text',
        text=['Global Center'],
        marker=dict(size=14, color='red', symbol='star'),
        textposition='bottom right',
        name='Global Center',
        showlegend=True
    )

    # Small black dot drawn on top of the red star at the same coordinates.
    fig.add_scatter(
        x=[global_center['UMAP_1']],
        y=[global_center['UMAP_2']],
        mode='markers',
        marker=dict(size=6, color='black'),
        name='Global Center Point',
        showlegend=False
    )

    fig.update_layout(width=1200, height=700)
    fig.show()

# Show the plain UMAP scatter for the 'embedding_prot_t5_xl_bfd' column.
project_and_plot('embedding_prot_t5_xl_bfd')

def project_and_get_knn_accuracy(col_name, n_neighbors=5):
    """Return the KNN accuracy on the 2-D UMAP projection of one embedding column.

    Rows labelled ``'<null>'`` and rows whose embedding could not be parsed
    are excluded. The remaining embeddings are reduced to 2-D with UMAP
    (``random_state=42``), a ``KNeighborsClassifier`` is fitted on the 2-D
    points, and its predictions for those same points are scored.

    Args:
        col_name: Name of the embedding column in the global ``df``.
        n_neighbors: Number of neighbours used by the KNN classifier.

    Returns:
        The fraction of points whose predicted phenotype equals the true one,
        as returned by ``accuracy_score``.
    """
    parsed = parse_embedding(col_name)
    labels = df['p.phenotype'].fillna('<null>').values

    # Build the mask BEFORE stacking: unparseable rows are empty arrays and
    # np.vstack would raise if they were stacked together with real vectors.
    has_embedding = np.array([vec.size > 0 for vec in parsed], dtype=bool)
    valid_mask = (labels != '<null>') & has_embedding
    embeddings = np.vstack(parsed[valid_mask].to_list())
    labels = labels[valid_mask]

    reducer = UMAP(n_components=2, random_state=42)
    projection = reducer.fit_transform(embeddings)

    # NOTE: the classifier is scored on the same points it was fitted on, so
    # this is a resubstitution accuracy rather than a held-out estimate.
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(projection, labels)
    knn_preds = knn.predict(projection)

    return accuracy_score(labels, knn_preds)

# Values of k evaluated for every embedding column.
neighbors = [3, 5, 7, 9, 11, 13, 15, 17, 19]

# accuracies[name][i] is the KNN accuracy of embedding `name` at neighbors[i].
accuracies = {name: [] for name in embedding_columns}

# NOTE: project_and_get_knn_accuracy refits UMAP on every call even though the
# projection does not depend on k, so this sweep repeats the projection once
# per (k, column) pair.
for k in neighbors:
    for name in embedding_columns:
        acc = project_and_get_knn_accuracy(name, n_neighbors=k)
        accuracies[name].append(acc)


for name, acc_list in accuracies.items():
    formatted = [f"{a * 100:.2f}%" for a in acc_list]
    print(f"Accuracies for {name}:", formatted)

project_and_plot_with_knn('embedding_prot_t5_xl_bfd', n_neighbors=5)

# Per-model mean accuracy, averaged over all values of k.
mean_accuracies = {
    name: np.mean(acc_list) * 100 for name, acc_list in accuracies.items()
}


sorted_means = sorted(mean_accuracies.items(), key=lambda x: x[1], reverse=True)

print("\nMean Accuracies (Sorted):")
for name, mean in sorted_means:
    print(f"{name}: {mean:.2f}%")

# Per-k mean accuracy, averaged over all models. This is what belongs on a
# chart whose x-axis is the number of neighbours; the per-model means above
# are averaged over k and must not be paired with individual k values.
neighborsX = list(neighbors)
per_k_mean = np.mean(
    np.array([accuracies[name] for name in embedding_columns]), axis=0
) * 100
accuracyY = np.round(per_k_mean, 2).tolist()

# Use a dedicated name: rebinding `df` here would replace the dataset that
# parse_embedding and the plotting functions read from.
accuracy_df = pd.DataFrame({
    'Number of Neighbors': neighborsX,
    'Accuracy (%)': accuracyY
})

fig = px.line(
    accuracy_df,
    x='Number of Neighbors',
    y='Accuracy (%)',
    markers=True,
    text=accuracy_df['Accuracy (%)'],
    title='Mean Accuracy of all Models'
)

fig.update_traces(textposition='top center')

fig.update_layout(
    xaxis=dict(tickmode='array', tickvals=neighborsX),
    yaxis=dict(range=[0, 100], dtick=10)
)

fig.show()




'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.




'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.

Accuracies for embedding_prot_t5_xl_bfd: ['80.71%', '75.71%', '75.00%', '75.00%', '75.71%', '75.71%', '74.29%', '75.71%', '72.14%']
Accuracies for embedding_esmc_600m: ['73.57%', '66.43%', '62.14%', '65.00%', '65.71%', '67.14%', '67.14%', '66.43%', '69.29%']
Accuracies for embedding_esmc_300m: ['84.29%', '84.29%', '82.86%', '82.86%', '82.86%', '81.43%', '81.43%', '80.71%', '79.29%']
Accuracies for embedding_facebook/esm2_t33_650M_UR50D: ['86.43%', '79.29%', '81.43%', '81.43%', '80.71%', '79.29%', '77.86%', '78.57%', '76.43%']
Accuracies for embedding_prot_t5_xl_uniref50: ['78.57%', '73.57%', '72.86%', '72.86%', '70.71%', '67.86%', '69.29%', '65.71%', '65.00%']
Accuracies for embedding_facebook/esm2_t36_3B_UR50D: ['80.71%', '77.14%', '71.43%', '72.86%', '73.57%', '73.57%', '73.57%', '75.00%', '73.57%']
Accuracies for embedding_facebook/esm2_t12_35M_UR50D: ['87.14%', '82.14%', '82.86%', '83.57%', '82.14%', '80.00%', '80.71%', '80.00%', '79.29%']
Accuracies for embedding_facebook/esm2_t6_


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.




Mean Accuracies (Sorted):
embedding_esmc_300m: 82.22%
embedding_facebook/esm2_t12_35M_UR50D: 81.98%
embedding_facebook/esm2_t30_150M_UR50D: 80.24%
embedding_facebook/esm2_t33_650M_UR50D: 80.16%
embedding_facebook/esm2_t6_8M_UR50D: 77.86%
embedding_prot_t5_xl_bfd: 75.56%
embedding_facebook/esm2_t36_3B_UR50D: 74.60%
embedding_prot_t5_xl_uniref50: 70.71%
embedding_esmc_600m: 66.98%
