## Find Candidate Building Blocks

In [None]:
## Run once cell
# Run this cell exactly once per kernel session: os.chdir('..') is relative,
# so every additional execution moves the working directory up one more level.

# Enable the autoreload extension in mode 2.
%load_ext autoreload
%autoreload 2

import os
# Make the parent directory the working directory; later cells use paths and
# imports relative to it (e.g. "data/final/" and the `helpers` package).
os.chdir('..')

In [None]:
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from helpers.edges import connect_edges, CascadingEdges
from helpers.cascades import Cascade, MakeCascade, Metrics, MetricsPair, MakeMetricsPair

In [None]:
# Both pickles live in the same directory, relative to the working directory.
path_to_data = "data/final/"
df = pd.read_pickle(f"{path_to_data}master_dataframe.pkl")
edges = pd.read_pickle(f"{path_to_data}master_edges.pkl")
# Displayed as the cell result: dataframe shape and number of edges.
(df.shape, len(edges))

In [None]:
## Initialize the cascading edges
cascading_edges = CascadingEdges(edges)

### DBSCAN Detour

In [None]:
%%time

from sklearn.cluster import DBSCAN

def cluster_scalar_reps(df, level, eps=0.5, min_samples=5):
    """Cluster the scalar representations of one level with DBSCAN.

    Args:
        df: DataFrame with a 'level' column and a 'scalar_rep' column whose
            entries can be stacked with ``np.stack`` (one vector per row).
        level: Value of the 'level' column selecting the rows to cluster.
        eps: DBSCAN ``eps``; distances are cosine distances.
        min_samples: DBSCAN ``min_samples``.

    Returns:
        A copy of the rows of ``df`` at ``level`` with an added 'cluster'
        column holding the DBSCAN labels. When no row matches ``level`` the
        returned frame is empty (but still has the 'cluster' column).
    """
    # Work on a copy so the caller's dataframe never gains a 'cluster' column.
    level_df = df[df['level'] == level].copy()

    # np.stack raises on an empty sequence, so an unknown/empty level is
    # answered with an empty, correctly shaped result instead of a crash.
    if level_df.empty:
        level_df['cluster'] = np.empty(0, dtype=int)
        return level_df

    # 'scalar_rep' holds one vector per row; stack them into a 2-D matrix.
    scalar_reps = np.stack(level_df['scalar_rep'].values)

    db = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine').fit(scalar_reps)

    # One label per row, in the same order as level_df.
    level_df['cluster'] = db.labels_

    return level_df


# Candidate DBSCAN eps values per level, each list in ascending order. They are
# passed to cluster_scalar_reps, which clusters with metric='cosine'.
# NOTE(review): how these particular values were chosen is not recorded here.
lvl2_eps = [8e-10, 5.8e-5, 2.88e-4, 5.29e-4, 1.321e-3]
lvl3_eps = [9.9e-10, 1.1e-4, 2.8e-4, 4.4e-4, 1.98e-3]
lvl4_eps = [8.7e-11, 1.2e-5, 2.89e-5, 4.8e-5, 1.7e-4]

# One panel per level: eps value plotted against its position in the list.
fig, ax = plt.subplots(1, 3, figsize=(15, 5))
ax[0].plot(lvl2_eps)
ax[1].plot(lvl3_eps)
ax[2].plot(lvl4_eps)

In [None]:
# One clustered dataframe per eps value, collected per level.
df2s, df3s, df4s = [], [], []

# (level, eps schedule, destination list) — processed in level order so the
# module-level name `eps` ends up bound to the last level-4 value, as before.
for _level, _eps_schedule, _bucket in (
    (2, lvl2_eps, df2s),
    (3, lvl3_eps, df3s),
    (4, lvl4_eps, df4s),
):
    for eps in _eps_schedule:
        _bucket.append(cluster_scalar_reps(df, level=_level, eps=eps, min_samples=5))


# df2 = cluster_scalar_reps(df, level=2, eps=8e-10, min_samples=10)
# df3 = cluster_scalar_reps(df, level=3, eps=10e-10, min_samples=10)
# df4 = cluster_scalar_reps(df, level=4, eps=8.7e-11, min_samples=10)


In [None]:

# For every level and eps value: bars show how many points DBSCAN labelled as
# noise (-1) versus assigned to a cluster; the line shows the number of
# distinct cluster labels.
fig, axs = plt.subplots(1, 3, figsize=(18, 6))

# Titles for subplots
titles = ['Level 2 Clusters vs Eps', 'Level 3 Clusters vs Eps', 'Level 4 Clusters vs Eps']
# DataFrames lists and eps values for each level
dfs_lists = [df2s, df3s, df4s]
eps_values = [lvl2_eps, lvl3_eps, lvl4_eps]

bar_width = 0.35

for i, (level_dfs, level_eps) in enumerate(zip(dfs_lists, eps_values)):
    # Distinct labels per run (the noise label -1 counts as one when present).
    num_clusters = [len(clustered['cluster'].unique()) for clustered in level_dfs]
    # Number of points labelled noise / assigned to any cluster, per run.
    num_minus_one_clusters = [int((clustered['cluster'] == -1).sum()) for clustered in level_dfs]
    num_other_clusters = [int((clustered['cluster'] != -1).sum()) for clustered in level_dfs]

    # The x axis is categorical: one slot per eps value.
    index = np.arange(len(level_eps))

    axs[i].bar(index, num_minus_one_clusters, bar_width, alpha=0.8, label='Noise (-1) Cluster')
    axs[i].bar(index + bar_width, num_other_clusters, bar_width, alpha=0.8, label='Other Clusters')

    # Draw the line at the tick positions. Plotting against the raw eps values
    # (all << 1) would squash every marker onto x ~ 0, away from the bars.
    axs[i].plot(index + bar_width / 2, num_clusters, marker='o', color='black',
                label='Number of cluster labels')

    axs[i].set_xticks(index + bar_width / 2)
    axs[i].set_xticklabels([f"{e:.1e}" for e in level_eps])
    # Legend is created after all artists exist so the line is listed too.
    axs[i].legend()
    axs[i].set_xlabel('Eps Value', fontsize=15)
    axs[i].set_title(titles[i], fontsize=16)
    axs[i].grid(True)
axs[0].set_ylabel('Size of Clusters', fontsize=15)

plt.tight_layout()
plt.show()

In [None]:

def clusters_for_pdb(dfxs, pdb_id):
    """Print where the rows of one PDB entry ended up in each clustering.

    Args:
        dfxs: Iterable of DataFrames that have 'pdb_id' and 'cluster' columns.
        pdb_id: Identifier to look for. It is matched case-insensitively as a
            literal substring of the 'pdb_id' column, so stored ids that carry
            extra characters (presumably a chain suffix) still match.

    Returns:
        None. One line is printed per dataframe with the cluster labels and
        index labels of the matching rows.
    """
    needle = pdb_id.lower()
    for dfx in dfxs:
        # regex=False: the id is plain text, not a pattern.
        # na=False: rows with a missing pdb_id count as "no match"; without it
        # the mask contains NaN and boolean indexing raises.
        mask = dfx['pdb_id'].str.lower().str.contains(needle, regex=False, na=False)
        nodes = dfx[mask]
        print(f"Indices for PDB ID: {pdb_id} in the clusters {nodes['cluster'].unique()}: {nodes.index.values}")

# Hand-picked PDB ids used as lookup targets for clusters_for_pdb (which
# lower-cases both sides, so the mixed upper/lower case below does not matter).
# NOTE(review): the variable names suggest CD20-related, MS-related and
# beta-helix entries; the selection criteria are not recorded here — confirm.
cd20s = ['6PE9', '6TKB', '6PE8', '6TKF', '6TKE', '6TKD', '6TKC', '1QSC', '6BRB', '3LKJ',
         '6PE7', '1ALY']
ms_related = ['6H24', '1PY9', '5HIU', '6FG1', '6FG2', '4Q6R', '4GMV']

beta_helix_and_friends = ['1kzq', '4mzu', '1wpc', '1fnu', '4g6r', '4jj2', '3hno', '1lxa', 
                          '6ria', '1hg9', '1dcq', '1cb7', '3a1m', '4zu7', '1acc', '1l5j', 
                          '6rib', '2jer', '1air', '2d40', '2fla', '1qte', '2kl8', '1dbv', 
                          '2obg', '7jvi', '2z0q', '1yox', '1f6w', '3i48', '3zds', '4puq', 
                          '1qre', '6e5c', '1cts', '1hin', '2qnz', '3ub3', '1idj', '3obw', 
                          '1dab', '3uxh', '4osd', '4aq6', '4aq2', '4fl6', '2ln3', '1znp'
                          ]


# Both ids also appear in beta_helix_and_friends above.
term_pdbs = ['3hno', '1cb7']
# term_pdbs = ['1m8n', '1ezg']


# for pdb in beta_helix_and_friends:
#     clusters_for_pdb(df3s[0:1], pdb)

In [None]:
class Idx2Datum:
    """Callable that maps dataframe index labels to their 'datum' entries."""

    def __init__(self, df):
        # DataFrame that must provide a 'datum' column.
        self.df = df

    def __call__(self, *idxs):
        """Return the 'datum' values for the given index labels.

        The labels may be passed separately (``lookup(3, 7)``) or as one
        collection (``lookup([3, 7])``); both forms give the same result.

        Returns:
            Array of the 'datum' entries, in the order the labels were given.

        Raises:
            KeyError: if a label is not present in the dataframe index.
        """
        # A single collection argument arrives wrapped in the varargs tuple;
        # unwrap it so .loc receives a flat list of labels.
        if len(idxs) == 1 and isinstance(idxs[0], (list, tuple, np.ndarray, pd.Index, pd.Series)):
            labels = list(idxs[0])
        else:
            labels = list(idxs)
        return self.df.loc[labels, 'datum'].values


# Look up the datum stored at index label 1000 and hand it to
# MakeCascade.plot_datums; .show() is called on whatever that returns.
# Note: the list is passed as ONE positional argument, so Idx2Datum.__call__
# receives idxs == ([1000],).
indices = [1000]
MakeCascade.plot_datums(Idx2Datum(df)(indices)).show()


In [None]:
from tqdm import tqdm

def map_lower_to_upper_clusters(lower_df, upper_df):
    """
    Map every non-noise cluster of lower_df to the upper_df cluster labels
    reached from its members.

    For each cluster in lower_df (label -1 is skipped), every member index is
    passed to ``CascadingEdges(edges)``; the last element of the returned
    cascade is used as a row label into upper_df, and that row's 'cluster'
    value is collected.

    Args:
    lower_df (DataFrame): DataFrame containing lower level clusters
        (needs a 'cluster' column).
    upper_df (DataFrame): DataFrame containing upper level clusters
        (needs a 'cluster' column).

    Returns:
    Dict: keys are cluster labels from lower_df; each value is a LIST with one
          collected upper_df 'cluster' lookup per member whose lookup
          succeeded. The list is neither de-duplicated nor reduced to a count.

    Notes:
    - Reads the module-level ``edges`` to build a fresh CascadingEdges object.
    - Members whose cascade end is not a label of upper_df raise KeyError
      inside ``.loc``; these are skipped silently.
    - Prints the number of unique lower (noise excluded) and upper clusters.
    - NOTE(review): what ``cascades[-1]`` holds (a single label or a
      collection) depends on CascadingEdges — confirm.
    """

    make_cascades = CascadingEdges(edges)

    # Drop DBSCAN noise: only real clusters are mapped.
    lower_df = lower_df[lower_df['cluster'] != -1]
    
    # lower cluster label -> list of collected upper cluster lookups
    cluster_coverage = {}
    
    # Iterate over unique clusters in lower_df
    for lower_cluster in tqdm(lower_df['cluster'].unique()):
        # Index labels of the rows that belong to the current lower cluster
        lower_indices = lower_df[lower_df['cluster'] == lower_cluster].index
        
        # A list (not a set) so repeated hits on the same upper cluster are kept
        covered_clusters = []
        
        # Follow the cascade of every member of the current lower cluster
        for lower_index in lower_indices:
            cascades = make_cascades(lower_index)
            
            # Look up the upper cluster at the end of the cascade; a label
            # that upper_df does not contain is ignored (best effort).
            try:
                covered_clusters.append(upper_df.loc[cascades[-1], 'cluster'])
            except KeyError:
                pass
        
        # Store the full list of collected lookups for this lower cluster
        cluster_coverage[lower_cluster] = covered_clusters

    # Print the number of unique clusters in lower_df and upper_df
    print(f"Number of unique lower clusters: {len(lower_df['cluster'].unique())}")
    print(f"Number of unique upper clusters: {len(upper_df['cluster'].unique())}")
    
    return cluster_coverage

# Example usage: map the clusters of the last level-2 run onto the last level-4 run.
coverage_result = map_lower_to_upper_clusters(df2s[-1], df4s[-1])

# Report only the lower clusters whose members land in more than one upper cluster.
for lower_cluster, covered_clusters in coverage_result.items():
    unique_covered_clusters = set(covered_clusters)
    if len(unique_covered_clusters) <= 1:
        continue
    print(f"Lower Cluster {lower_cluster} covers these unique upper clusters: {unique_covered_clusters}")

# Full mapping, for reference.
print(coverage_result)


#### Search through DBSCAN clusters

In [None]:
import random

make_cascades = CascadingEdges(edges)


def get_cluster(cluster_label, df):
    """Build a cascade for one randomly chosen member of a cluster.

    Args:
        cluster_label: Value of the 'cluster' column to pick a member from.
        df: DataFrame with a 'cluster' column.

    Returns:
        ``MakeCascade(df, make_cascades(index))`` for a member index drawn
        with ``random.choice``, or None when no row carries ``cluster_label``.
    """
    # Index labels of every row in the requested cluster.
    member_labels = df[df['cluster'] == cluster_label].index.tolist()

    # Nothing to sample from.
    if len(member_labels) == 0:
        return None

    chosen_label = random.choice(member_labels)
    # Uses the module-level CascadingEdges instance `make_cascades`.
    return MakeCascade(df, make_cascades(chosen_label))



### Search for candidates

Algorithm for searching candidates based on bottom-level threshold and upper-level threshold

In [None]:
from helpers.neighborhood import GetNeighbors, NeighborMetrics, MakeNeighborMetrics
from helpers.candidates import MakeCandidate

# Earlier, disabled experiment: direct neighbour query around one index.
# get_neighbors = GetNeighbors(df)
# query_index = 249972
# dists, neighbors = get_neighbors(query_index, radius=0.00426)
# # dists, neighbors = get_neighbors(query_index, n_neighbors=4)
# print(dists)
# neighbors


# Build a candidate around index label 249972 (the same label as in the
# disabled query above) using a neighbour-count threshold of 10.
# NOTE(review): 249972 is a hard-coded label and must exist in df.index.
make_candidate = MakeCandidate(df, edges, 249972)
candidate = make_candidate(n_neighbors_threshold=10)
# candidate.eval(divergence_threshold=0.0002)
# neighbors = candidate_eval.search_candidates(0.00426, divergence_threshold=0.02)




In [None]:
print(candidate)

In [None]:
# ubi = "MQIFVKTLTG KTITLEVEPS DTIENVKAKI QDKEGIPPDQ QRLIFAGKQL EDGRTLSDYN IQKESTLHLV LRLRGG"
ubiquitin_scaffold = "MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG"
# MQIFVKTLT-[Motif]-GKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG

def scaffolded_motif(motif, scaffold=ubiquitin_scaffold, insert_at=9):
    """Insert ``motif`` into ``scaffold`` and return the combined sequence.

    Args:
        motif: Sequence to insert.
        scaffold: Sequence that receives the motif (default: the scaffold
            string defined above).
        insert_at: Number of leading scaffold characters kept in front of the
            motif. The default of 9 places the motif after "MQIFVKTLT" in the
            default scaffold. Values beyond the scaffold length append the
            motif at the end.

    Returns:
        ``scaffold[:insert_at] + motif + scaffold[insert_at:]``.

    Raises:
        ValueError: if ``insert_at`` is negative (a negative slice bound would
            silently count from the end of the scaffold).

    Side effect: prints the motif length.
    """
    if insert_at < 0:
        raise ValueError(f"insert_at must be non-negative, got {insert_at}")
    print(f"Length of motif: {len(motif)}")
    return f"{scaffold[:insert_at]}{motif}{scaffold[insert_at:]}"


In [None]:
# Now we do a random search for candidates

def get_random_candidates_by_level(df, level, num_candidates, max_tries=500, n_neighbors_threshold=10, divergence_threshold=0.00007, radius_threshold=0.2):
    """Randomly probe one level for indices that yield an accepted candidate.

    Index labels of the given level are drawn without replacement; for each
    one a ``MakeCandidate(df, edges, idx)`` is built (``edges`` is read from
    module scope) and the resulting candidate is kept when
    ``candidate.eval(divergence_threshold=...)`` is truthy.

    Args:
        df: DataFrame with a 'level' column.
        level: Level whose rows are sampled.
        num_candidates: Maximum number of candidates to return (capped at the
            number of rows in the level).
        max_tries: Maximum number of indices to probe (capped at the number
            of rows in the level).
        n_neighbors_threshold: Passed to the candidate maker only when
            ``radius_threshold`` is None.
        divergence_threshold: Passed to ``candidate.eval``.
        radius_threshold: Passed to the candidate maker. The default 0.2 is
            the value this function previously hard-coded; pass None to
            search by ``n_neighbors_threshold`` instead.

    Returns:
        List of accepted candidates, at most ``num_candidates`` long. Empty
        when the level has no rows or nothing was accepted.
    """
    # Filter the dataframe for the given level
    level_df = df[df['level'] == level]

    # Neither more candidates nor more probes than rows are possible; sampling
    # more than len(level_df) labels without replacement would raise.
    num_candidates = min(num_candidates, len(level_df))
    n_tries = min(max_tries, len(level_df))

    candidates = []
    total_count = 0  # number of indices actually probed

    if n_tries > 0 and num_candidates > 0:
        # Random index labels of this level, each at most once.
        sampled_labels = np.random.choice(level_df.index, n_tries, replace=False)

        for idx in sampled_labels:
            # Stop as soon as the requested number has been collected.
            if len(candidates) >= num_candidates:
                break
            total_count += 1

            make_candidate = MakeCandidate(df, edges, idx)
            if radius_threshold is not None:
                candidate = make_candidate(radius_threshold=radius_threshold)
            else:
                candidate = make_candidate(n_neighbors_threshold=n_neighbors_threshold)

            # Skip indices without a candidate or whose candidate is rejected.
            if candidate is None or not candidate.eval(divergence_threshold=divergence_threshold):
                continue
            candidates.append(candidate)

    print(f"Total candidates sampled: {total_count}")
    if not candidates:
        print("No candidates found!")
    return candidates

# Example usage: sample candidates from level 1.
# NOTE(review): the sampling inside get_random_candidates_by_level uses
# np.random and no seed is set in this cell, so results differ between runs.
# NOTE(review): get_random_candidates_by_level calls
# make_candidate(radius_threshold=0.2), so n_neighbors_threshold=7 does not
# influence the search as called here.
selected_level = 1
num_candidates = 5
random_candidates = get_random_candidates_by_level(df, selected_level, 
                                                   num_candidates,
                                                   n_neighbors_threshold=7,
                                                   divergence_threshold=4e-2)
# The loop variable rebinds the module-level name `candidate`.
for candidate in random_candidates:
    print(candidate)


In [None]:
# Position in random_candidates of the candidate to inspect; -1 selects the
# last one. Indexing raises IndexError when random_candidates is empty.
CANDIDATE_IDX = -1

# print(scaffolded_motif("TGYLRN"))
# NOTE(review): the meaning of the positional argument 5 is defined by
# neighbor_metrics.plot in helpers — confirm.
random_candidates[CANDIDATE_IDX].neighbor_metrics.plot(5)



In [None]:
# This is an algorithm that will loop through a list of n_neighbors thresholds and find the best candidate for each threshold

def experiment(df, levels, n_neighbors_thresholds, divergence_thresholds):
    """Count accepted random candidates for every threshold pair, per level.

    For each level, every combination of a neighbour threshold and a
    divergence threshold is passed to ``get_random_candidates_by_level``
    (asking for 5 candidates with ``max_tries=50``) and the number of
    returned candidates is recorded.

    Returns:
        Dict keyed by level. Each value is a dict of three parallel lists:
        'n_neighbors_threshold', 'divergence_threshold' and 'num_candidates',
        with one entry per combination in nested-loop order (neighbour
        threshold outer, divergence threshold inner).
    """
    results = {}
    for level in levels:
        print(f"Level: {level}")

        neighbor_column, divergence_column, count_column = [], [], []
        results[level] = {
            'n_neighbors_threshold': neighbor_column,
            'divergence_threshold': divergence_column,
            'num_candidates': count_column,
        }

        # Same visiting order as two nested loops.
        grid = [
            (n_thr, div_thr)
            for n_thr in n_neighbors_thresholds
            for div_thr in divergence_thresholds
        ]
        for n_thr, div_thr in grid:
            found = get_random_candidates_by_level(
                df, level, 5, max_tries=50,
                n_neighbors_threshold=n_thr,
                divergence_threshold=div_thr,
            )
            count_column.append(len(found))
            neighbor_column.append(n_thr)
            divergence_column.append(div_thr)
    return results


results = experiment(df, [2, 3], [5, 8, 10], [7e-5, 3e-5, 1e-5])

In [None]:
# NOTE(review): this cell repeats the scaffold constant and the function from
# an earlier cell of this notebook; whichever cell runs last wins.
# ubi = "MQIFVKTLTG KTITLEVEPS DTIENVKAKI QDKEGIPPDQ QRLIFAGKQL EDGRTLSDYN IQKESTLHLV LRLRGG"
ubiquitin_scaffold = "MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG"
# MQIFVKTLT-[Motif]-GKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQKESTLHLVLRLRGG

def scaffolded_motif(motif, scaffold=ubiquitin_scaffold, insert_at=9):
    """Insert ``motif`` into ``scaffold`` and return the combined sequence.

    Args:
        motif: Sequence to insert.
        scaffold: Sequence that receives the motif (default: the scaffold
            string defined above).
        insert_at: Number of leading scaffold characters kept in front of the
            motif. The default of 9 places the motif after "MQIFVKTLT" in the
            default scaffold. Values beyond the scaffold length append the
            motif at the end.

    Returns:
        ``scaffold[:insert_at] + motif + scaffold[insert_at:]``.

    Raises:
        ValueError: if ``insert_at`` is negative (a negative slice bound would
            silently count from the end of the scaffold).

    Side effect: prints the motif length.
    """
    if insert_at < 0:
        raise ValueError(f"insert_at must be non-negative, got {insert_at}")
    print(f"Length of motif: {len(motif)}")
    return f"{scaffold[:insert_at]}{motif}{scaffold[insert_at:]}"

scaffolded_motif("NRFKTIEECRRTC")

In [None]:
from moleculib.protein.datum import ProteinDatum
from moleculib.protein.alphabet import all_residues
from helpers.utils import aa_map, residue_map

def datum_to_sequence(datum):
    """Return the datum's residues as a list, one entry per residue token.

    Each token in ``datum.residue_token`` is used to index ``all_residues``.
    """
    sequence = []
    for token in datum.residue_token:
        sequence.append(all_residues[token])
    return sequence

# Load the datum for PDB entry "1c5e" and print its residue tokens after
# mapping them with residue_map.
# NOTE(review): fetch_pdb_id presumably retrieves the file from a remote
# source and therefore needs network access — confirm.
full_protein = ProteinDatum.fetch_pdb_id("1c5e", format="pdb")
print(residue_map(full_protein.residue_token))

### Specific Neighbor Search

In [87]:
def clusters_for_pdb(dfxs, pdb_id):
    """Print the first index labels of the rows that belong to a PDB entry.

    NOTE(review): an earlier cell of this notebook defines a function with
    the same name that also prints cluster labels; this definition replaces
    it once the cell runs.

    Args:
        dfxs: Iterable of DataFrames that have a 'pdb_id' column.
        pdb_id: Identifier to look for. It is matched case-insensitively as a
            literal substring of the 'pdb_id' column, so stored ids that carry
            extra characters (presumably a chain suffix) still match.

    Returns:
        None. One line is printed per dataframe with at most the first five
        matching index labels.
    """
    needle = pdb_id.lower()
    for dfx in dfxs:
        # regex=False: the id is plain text, not a pattern.
        # na=False: rows with a missing pdb_id count as "no match"; without it
        # the mask contains NaN and boolean indexing raises.
        mask = dfx['pdb_id'].str.lower().str.contains(needle, regex=False, na=False)
        nodes = dfx[mask]
        print(f"Indices for PDB ID: {pdb_id}: {nodes.index.values[:5]}")


# Look up every id of beta_helix_and_friends in a single clustering:
# df2s[1:2] is a one-element list holding the level-2 run for the second
# eps value (lvl2_eps[1]).
for pdb in beta_helix_and_friends:
    clusters_for_pdb(df2s[1:2], pdb)

Indices for PDB ID: 1kzq: [215858 215859 215860 215861 215862]
Indices for PDB ID: 4mzu: [216314 216315 216316 216317 216318]
Indices for PDB ID: 1wpc: [219050 219051 219052 219053 219054]
Indices for PDB ID: 1fnu: [219264 219265 219266 219267 219268]
Indices for PDB ID: 4g6r: []
Indices for PDB ID: 4jj2: [220031 220032 220033 220034 220035]
Indices for PDB ID: 3hno: [220371 220372 220373 220374 220375]
Indices for PDB ID: 1lxa: [221283 221284 221285 221286 221287]
Indices for PDB ID: 6ria: [221431 221432 221433 221434 221435]
Indices for PDB ID: 1hg9: []
Indices for PDB ID: 1dcq: [224263 224264 224265 224266 224267]
Indices for PDB ID: 1cb7: [224491 224492 224493 224494 224495]
Indices for PDB ID: 3a1m: [225140 225141 225142 225143 225144]
Indices for PDB ID: 4zu7: [225714 225715 225716 225717 225718]
Indices for PDB ID: 1acc: [226812 226813 226814 226815 226816]
Indices for PDB ID: 1l5j: [227040 227041 227042 227043 227044]
Indices for PDB ID: 6rib: [227422 227423 227424 227425 22742

In [98]:

# Index label to search around; the commented alternatives are other labels
# that were tried.
# query_index = 233899
query_index = 188414
# query_index = 225747
# Request 8 neighbours for the query; the call returns three values, of which
# only neighbor_metrics is used in this cell.
neighbor_metrics, distances, top_vectors =  MakeNeighborMetrics(df, edges, query_index)(n_neighbors=8)
# print(neighbor_metrics)
neighbor_metrics.plot()

Query: 1ayoA. part sequence: QDIPVRDLKPAIV


Neighbor 0 at index 188290, PDB ID: 1ayoB. part sequence: QDIPVRDLKPAIV -- Alignment: 13.0, RMSD: 0.0552, cosine: 0.000041
Full sequence: QDIPVRDLKPAIVKVYDYYETDEFAVAEYSAPCS -- part sequence at indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


Neighbor 1 at index 76332, PDB ID: 1bcrA. part sequence: GGGGGGGGGGGGG -- Alignment: 0.0, RMSD: 4.9122, cosine: 0.000513
Full sequence: HGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG -- part sequence at indices: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]


Neighbor 2 at index 124481, PDB ID: 1cfbA. part sequence: ADQPTFVKYLIKV -- Alignment: 6.0, RMSD: 1.7505, cosine: 0.000527
Full sequence: WRQNNIVIADQPTFVKYLIKVVAINDRGESNVAAEEVVGYSGEDR -- part sequence at indices: [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


Neighbor 3 at index 4603, PDB ID: 1cfmA. part sequence: ASAAGKIVAITAL -- Alignment: 4.0, RMSD: 1.8097, cosine: 0.000547
Full sequence: PDGKKSNNTIYNASAAGKIVAITALSEKKGGFEVSIEKANGEVVVDKIPAGPDLIVKEGQT -- part sequence at indices: [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]


Neighbor 4 at index 85375, PDB ID: 1dqiD. part sequence: LKTKKKGKLYALS -- Alignment: 3.0, RMSD: 2.5928, cosine: 0.000555
Full sequence: LKTKKKGKLYALSYCNIHGLWENEVTLE -- part sequence at indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


Neighbor 5 at index 190137, PDB ID: 1eo8A. part sequence: GGGGGGGGGGGGG -- Alignment: 0.0, RMSD: 24.6327, cosine: 0.000561
Full sequence: GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG -- part sequence at indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


Neighbor 6 at index 208562, PDB ID: 3lkjA. part sequence: FELQPGASVFVNV -- Alignment: 4.0, RMSD: 4.8410, cosine: 0.000562
Full sequence: LGGVFELQPGASVFVNVTDPSQVSHGTGFTSFGLLKL -- part sequence at indices: [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]


Neighbor 7 at index 226516, PDB ID: 4zu7C. part sequence: LYIGPLVWHEMFD -- Alignment: 4.0, RMSD: 4.2463, cosine: 0.000582
Full sequence: ELNDPSVGLYIGPLVWHEMFDFTEGCVLLVLASEYYDETDYIRNYDFYIDEAKKRFLE -- part sequence at indices: [1, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


In [None]:
# df.groupby("level").size()

def sample_for_level(df, level, n_samples=1):
    """Return ``n_samples`` randomly drawn rows of ``df`` at the given level.

    Raises KeyError (from ``get_group``) when no row has that level.
    """
    rows_by_level = df.groupby("level")
    level_rows = rows_by_level.get_group(level)
    return level_rows.sample(n=n_samples)

# Print the shape of 'scalar_rep' for one randomly drawn row of each level
# 1 through 4 (the row differs between runs; no seed is set here).
for level in range(1, 5):
    print(sample_for_level(df, level, 1)['scalar_rep'].values[0].shape)



In [None]:
# display(neighbor_metrics.metrics)
for metric in neighbor_metrics.metrics:
    print(metric[0])

In [99]:
neighbor_metrics.plot()

Query: 1ayoA. part sequence: QDIPVRDLKPAIV


Neighbor 0 at index 188290, PDB ID: 1ayoB. part sequence: QDIPVRDLKPAIV -- Alignment: 13.0, RMSD: 0.0552, cosine: 0.000041
Full sequence: QDIPVRDLKPAIVKVYDYYETDEFAVAEYSAPCS -- part sequence at indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


Neighbor 1 at index 76332, PDB ID: 1bcrA. part sequence: GGGGGGGGGGGGG -- Alignment: 0.0, RMSD: 4.9122, cosine: 0.000513
Full sequence: HGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG -- part sequence at indices: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]


Neighbor 2 at index 124481, PDB ID: 1cfbA. part sequence: ADQPTFVKYLIKV -- Alignment: 6.0, RMSD: 1.7505, cosine: 0.000527
Full sequence: WRQNNIVIADQPTFVKYLIKVVAINDRGESNVAAEEVVGYSGEDR -- part sequence at indices: [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


Neighbor 3 at index 4603, PDB ID: 1cfmA. part sequence: ASAAGKIVAITAL -- Alignment: 4.0, RMSD: 1.8097, cosine: 0.000547
Full sequence: PDGKKSNNTIYNASAAGKIVAITALSEKKGGFEVSIEKANGEVVVDKIPAGPDLIVKEGQT -- part sequence at indices: [12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]


Neighbor 4 at index 85375, PDB ID: 1dqiD. part sequence: LKTKKKGKLYALS -- Alignment: 3.0, RMSD: 2.5928, cosine: 0.000555
Full sequence: LKTKKKGKLYALSYCNIHGLWENEVTLE -- part sequence at indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


Neighbor 5 at index 190137, PDB ID: 1eo8A. part sequence: GGGGGGGGGGGGG -- Alignment: 0.0, RMSD: 24.6327, cosine: 0.000561
Full sequence: GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG -- part sequence at indices: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]


Neighbor 6 at index 208562, PDB ID: 3lkjA. part sequence: FELQPGASVFVNV -- Alignment: 4.0, RMSD: 4.8410, cosine: 0.000562
Full sequence: LGGVFELQPGASVFVNVTDPSQVSHGTGFTSFGLLKL -- part sequence at indices: [4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]


Neighbor 7 at index 226516, PDB ID: 4zu7C. part sequence: LYIGPLVWHEMFD -- Alignment: 4.0, RMSD: 4.2463, cosine: 0.000582
Full sequence: ELNDPSVGLYIGPLVWHEMFDFTEGCVLLVLASEYYDETDYIRNYDFYIDEAKKRFLE -- part sequence at indices: [1, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]


Try to structure by the distance of the top nodes. 