# Thesis

For edges, we will try to add cascading edges across all levels, as there is a many-to-one relationship from lower levels to upper levels.

### Imports and load

In [1]:
%load_ext autoreload
%autoreload 2

import os 
import sys
# Make the parent and grandparent directories importable so that project
# modules living above this notebook (e.g. helpers_new) can be imported.
module_path1, module_path2 = (
    os.path.abspath(os.path.join(relative)) for relative in ('../..', '..')
)
for _module_path in (module_path1, module_path2):
    # Only append when missing, so re-running the cell does not grow sys.path.
    if _module_path not in sys.path:
        sys.path.append(_module_path)

import numpy as np
import pandas as pd
import jax
import jax.numpy as jnp
import math

from typing import List, Dict, Tuple, Union 
from tqdm import tqdm
import pickle

import matplotlib.pyplot as plt
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from moleculib.protein.datum import ProteinDatum
from moleculib.graphics.py3Dmol import plot_py3dmol, plot_py3dmol_grid
from moleculib.protein.alphabet import all_residues

# Metrics computation
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from scipy.spatial.distance import cdist, pdist, euclidean, cosine
from sklearn.neighbors import radius_neighbors_graph, sort_graph_by_row_values
from scipy.sparse import csr_matrix

from Bio import Align
from einops import rearrange

from helpers_new import populate_representations, get_column, get_scalars, whatis


In [3]:
FOLDER_PREAMBLE = "../scripts/"
FOLDER = FOLDER_PREAMBLE + "denim-energy-1008-embeddings"
FOLDER_SMALL_FILES = FOLDER_PREAMBLE + "test-save"
embeddings_file = "encoded_dataset.pkl"
sliced_proteins_file = "sliced_dataset.pkl"


def _read_pickle(folder, file_name):
    """Unpickle and return the object stored at ``{folder}/{file_name}``."""
    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # files produced by our own scripts.
    with open(f"{folder}/{file_name}", "rb") as handle:
        return pickle.load(handle)


# Full dataset: encoded embeddings and the matching sliced proteins.
encoded_dataset = _read_pickle(FOLDER, embeddings_file)
sliced_dataset = _read_pickle(FOLDER, sliced_proteins_file)

# Small dataset with the same two files, handy for fast iteration.
encoded_dataset_small = _read_pickle(FOLDER_SMALL_FILES, embeddings_file)
sliced_dataset_small = _read_pickle(FOLDER_SMALL_FILES, sliced_proteins_file)

# Build the representation objects (the second return value is discarded)
# and flatten each of them into a DataFrame.
reps, _ = populate_representations(encoded_dataset, sliced_dataset)
reps_small, _ = populate_representations(encoded_dataset_small, sliced_dataset_small)
df = reps.to_dataframe()
df_small = reps_small.to_dataframe()

print(f"Loaded big and small: {df.shape}, {df_small.shape}")


## Process 

# Count the "None" datums
n_none_datums = int(df['datum'].isnull().sum())
print(f"Number of None datums: {n_none_datums}")

# Slice into a partial DataFrame, getting roughly 20% of each
# (pdb_id, level) group. The seed is fixed so that Restart & Run All
# reproduces the same sample and every number derived from it.
SAMPLE_FRACTION = 0.2
SAMPLE_SEED = 0
df_sample = (
    df.groupby(['pdb_id', 'level'])
    .sample(frac=SAMPLE_FRACTION, random_state=SAMPLE_SEED)
    .reset_index(drop=True)
)
print(df_sample.shape)

# Verify that the sample has about 20% of each level. A bare expression in
# the middle of a cell is discarded, so the counts are displayed explicitly.
sample_counts = df_sample.groupby(['pdb_id', 'level']).size().reset_index(name='counts')
display(sample_counts.head())

# Keep the unfiltered sample as df_original, then rebind df_sample to the
# rows that actually carry a datum.
df_original = df_sample.copy()
df_sample = df_original[df_original['datum'].notna()]
df_sample.head()



Loaded big and small: (431465, 7), (6004, 7)
Number of None datums: 5375
(86893, 7)






Unnamed: 0,pdb_id,level,level_idx,scalar_rep,datum,pos,color
0,12asA,0,97,"[-0.20669149, -0.669156, -2.9356103, 0.2499248...",(((<moleculib.protein.datum.ProteinDatum objec...,,
1,12asA,0,156,"[-0.21359211, -0.63425595, -2.955626, 0.234090...",(((<moleculib.protein.datum.ProteinDatum objec...,,
2,12asA,0,177,"[-0.21777605, -0.609324, -3.0176435, 0.2322384...",(((<moleculib.protein.datum.ProteinDatum objec...,,
3,12asA,0,224,"[-0.2298039, -0.6022485, -3.0240812, 0.1349030...",(((<moleculib.protein.datum.ProteinDatum objec...,,
4,12asA,0,47,"[-0.45050523, -0.6064707, -3.0410047, 0.036642...",(((<moleculib.protein.datum.ProteinDatum objec...,,


### Plotting Functionality

In [112]:
from helpers_new import PlotProteinDatum

# Plotting helper bound to the small dataset. Calling the instance with index
# lists yields an object with a .show() method (see the commented example).
plot_protein_datum = PlotProteinDatum(df_small)
# plot_protein_datum([1,2]).show()

### Edges Code

In [113]:
from helpers_new import connect_edges, CascadingEdges

# Window parameters handed to connect_edges.
# NOTE(review): presumably these mirror the encoder's pooling kernel/stride — confirm.
kernel_size, stride = 5, 2
# connect_edges returns two edge dictionaries plus a count of edges it could
# not connect. In the recorded run, edges_top_down maps a parent index to a
# list of child indices and edges_bottom_up maps a child index to one parent.
edges_top_down, edges_bottom_up, n_misses = connect_edges(df_small, kernel_size, stride)
print(f"Missed: {n_misses} edges")
whatis(edges_top_down, edges_bottom_up)

# Callable that, given a start index, returns the list of indices reached by
# following the bottom-up edges (used as make_cascades(u) further down).
make_cascades = CascadingEdges(edges_bottom_up)


Missed: 1524 edges
Object 0: ({1680: [1621, 1622, 1623, 1624, 1625], 1682: [1623...) is a dictionary with length 1513
Object 1: ({1621: 1680, 1622: 1680, 1623: 1682, 1624: 1682, 1...) is a dictionary with length 3261


### All Cascades

We want to write the following: given two indices on one level, generate, for each of them, the list of all indices that cascade upwards.

Then, from this list of indices, calculate cosine distances, get protein datum object, etc...

In [57]:
# Pick sample candidates
# Work only with rows that have a datum attached.
main_df = df_small.dropna(subset=['datum'])
print(main_df.shape)

# Index labels of the two rows to compare (both are level-2 rows in the
# recorded output).
u, v = 1342, 3834

# display(main_df.loc[df_small['level'] == 2])
display(main_df.loc[[u, v]])
# Each cascade is the list of indices from the start row upwards; the helper
# prints a message when it reaches a row with no parent.
us, vs = make_cascades(u), make_cascades(v)
print(us, vs)

(5889, 7)


Unnamed: 0,pdb_id,level,level_idx,scalar_rep,datum,pos,color
1342,1azzA,2,5,"[-0.032442138, -0.37545, 1.0176944, -0.9997409...",(((<moleculib.protein.datum.ProteinDatum objec...,,
3834,1eerA,2,7,"[0.13381311, -0.35669148, 0.92978686, -0.96813...",(((<moleculib.protein.datum.ProteinDatum objec...,,


Stopped cascading at 1429: no further parent found.
Stopped cascading at 3900: no further parent found.
[1342, 1400, 1429] [3834, 3877, 3900]


In [141]:
from moleculib.protein.alphabet import all_residues
from helpers_new import calculate_cosine_distances
from helpers_new import DistanceMapMetric, DistanceSeqMetric



class Comparison:
    """Compare two cascades of hierarchical indices, pair by pair.

    ``us`` and ``vs`` are parallel lists of index labels of ``df``. Entry ``i``
    of each list is compared against entry ``i`` of the other. Pairs are built
    with ``zip``, so surplus entries of the longer list are ignored.

    Attributes:
        df: The frame used for lookups (rows without a datum removed when
            ``drop_na`` is true).
        us, vs: The index lists as passed in.
        scores: Dict with keys ``'vector'``, ``'structure'`` and ``'sequence'``,
            each a list with one entry per compared pair. Filled by
            :meth:`cascade_scores`.
        u_datums, v_datums: The ``'datum'`` cell of every paired row.
        u_seqs, v_seqs: Residue sequences derived from those datums.
        struct_metric, seq_metric: ``DistanceMapMetric`` / ``DistanceSeqMetric``
            instances, called as ``metric(datum1, datum2)``.
    """

    def __init__(self, df, us: List[int], vs: List[int], drop_na=True):
        """Collect the datums and sequences for every ``(u, v)`` pair.

        Args:
            df: DataFrame with at least the ``'datum'`` and ``'scalar_rep'``
                columns, indexed by the labels used in ``us`` / ``vs``.
            us: Index labels of the first cascade.
            vs: Index labels of the second cascade.
            drop_na: When true, rows whose ``'datum'`` is missing are removed
                before any lookup, so a label pointing at such a row raises
                ``KeyError`` here.
        """
        self.df = df[df['datum'].notna()] if drop_na else df
        self.us = us
        self.vs = vs

        # Return attributes
        self.scores = self._empty_scores()

        # Data attributes
        self.u_datums: List[ProteinDatum] = []
        self.v_datums: List[ProteinDatum] = []
        self.u_seqs: List[str] = []
        self.v_seqs: List[str] = []
        for u, v in zip(us, vs):
            u_datum = self.df.loc[u, 'datum']
            v_datum = self.df.loc[v, 'datum']
            self.u_datums.append(u_datum)
            self.v_datums.append(v_datum)
            self.u_seqs.append(self._datum_to_sequence(u_datum))
            self.v_seqs.append(self._datum_to_sequence(v_datum))

        self.struct_metric = DistanceMapMetric()
        self.seq_metric = DistanceSeqMetric()

    @staticmethod
    def _empty_scores():
        """Return a fresh, empty score container."""
        return dict(vector=[], structure=[], sequence=[])

    def cascade_scores(self, **kwargs):
        """Score every ``(u, v)`` pair and store the results in ``self.scores``.

        For each pair this records the cosine distance between the two
        ``'scalar_rep'`` vectors, the value of ``struct_metric`` and the value
        of ``seq_metric`` on the two datums.

        The scores are rebuilt from scratch on every call, so running the
        method twice does not duplicate entries.

        Keyword Args:
            verbose: When true (the default) print the indices and vector
                shapes of every pair, as earlier versions always did.
            Any other keyword is accepted and ignored.

        Returns:
            The ``self.scores`` dictionary.
        """
        verbose = kwargs.get('verbose', True)
        self.scores = self._empty_scores()

        pairs = zip(self.us, self.vs, self.u_datums, self.v_datums)
        for u, v, datum1, datum2 in pairs:
            if verbose:
                print(u, v)
            # Vector score (cosine distance)
            vec1 = self.df.loc[u, 'scalar_rep']
            vec2 = self.df.loc[v, 'scalar_rep']
            if verbose:
                print(f"Shape of vec1: {vec1.shape}, vec2: {vec2.shape}")
            struct_map = self.struct_metric(datum1, datum2)
            seq_map = self.seq_metric(datum1, datum2)

            # Append scores
            self.scores['vector'].append(cosine(vec1, vec2))
            self.scores['structure'].append(struct_map)
            self.scores['sequence'].append(seq_map)  # (alignment, hamming distance)

        return self.scores

    def _datum_to_sequence(self, datum):
        """Map every entry of ``datum.residue_token`` through ``all_residues``."""
        return [all_residues[token] for token in datum.residue_token]


def compare_two_proteins(datum1: ProteinDatum, datum2: ProteinDatum):
    """Score a pair of datums with both protein metrics.

    Returns:
        A 2-tuple: the ``DistanceMapMetric`` result followed by the
        ``DistanceSeqMetric`` result for ``(datum1, datum2)``.
    """
    # Fresh metric objects are built on every call; both are created before
    # either one is applied.
    map_metric, sequence_metric = DistanceMapMetric(), DistanceSeqMetric()
    return map_metric(datum1, datum2), sequence_metric(datum1, datum2)

# Compare the two cascades level by level. drop_na=False keeps df_small
# unfiltered, so every label in us/vs is looked up in the full frame.
compare = Comparison(df_small, us, vs, drop_na=False)
# Fills compare.scores (one entry per (u, v) pair under each key).
compare.cascade_scores()
compare.scores



1342 3834
Shape of vec1: (46,), vec2: (46,)
1400 3877
Shape of vec1: (64,), vec2: (64,)
1429 3900
Shape of vec1: (89,), vec2: (89,)


{'vector': [0.005611203900133366, 0.005151189013532287, 0.0004284890255344953],
 'structure': [9.05007441167522, 28.9157780390867, 83.05165414765368],
 'sequence': [(4.0, 12), (9.0, 27), (20.0, 58)]}

In [131]:
# Echo the two cascades (lists of index labels) being compared.
us, vs

([1342, 1400, 1429], [3834, 3877, 3900])

In [134]:
# Rows along the first cascade, one per level. The trailing ['scalr'] lookup
# was dropped: no such column exists (the vector column is 'scalar_rep'), so
# it raised KeyError, and the recorded output is the full frame.
df_small.loc[us]

Unnamed: 0,pdb_id,level,level_idx,scalar_rep,datum,pos,color
1342,1azzA,2,5,"[-0.032442138, -0.37545, 1.0176944, -0.9997409...",(((<moleculib.protein.datum.ProteinDatum objec...,,
1400,1azzA,3,4,"[-0.2859857, 0.35381454, -0.69783026, -0.18327...",(((<moleculib.protein.datum.ProteinDatum objec...,,
1429,1azzA,4,4,"[-0.99477684, -0.070955195, -0.6756319, -0.904...",(((<moleculib.protein.datum.ProteinDatum objec...,,


In [135]:
# Rows along the second cascade, one per level.
df_small.loc[vs]

Unnamed: 0,pdb_id,level,level_idx,scalar_rep,datum,pos,color
3834,1eerA,2,7,"[0.13381311, -0.35669148, 0.92978686, -0.96813...",(((<moleculib.protein.datum.ProteinDatum objec...,,
3877,1eerA,3,6,"[-0.16497591, 0.3875209, -0.6433863, -0.218885...",(((<moleculib.protein.datum.ProteinDatum objec...,,
3900,1eerA,4,6,"[-0.981238, -0.013244436, -0.7671829, -0.87235...",(((<moleculib.protein.datum.ProteinDatum objec...,,


In [140]:
# 3877 is an index label taken from `vs`, so it is looked up by label (.loc)
# rather than by position, in a single indexing step.
df_small.loc[3877, 'scalar_rep'].shape

(64,)

In [99]:
from moleculib.protein.alphabet import all_residues
from helpers_new import calculate_cosine_distances
from helpers_new import DistanceMapMetric, DistanceSeqMetric


"""Get Sequences and Datums as objects"""

def datum_to_sequence(datum):
    """Translate the tokens in ``datum.residue_token`` via ``all_residues``.

    Returns a list holding ``all_residues[token]`` for every token, in order.
    """
    residues = []
    for token in datum.residue_token:
        residues.append(all_residues[token])
    return residues

def cascade_sequences(df, us, vs):
    """Collect the residue sequences for every ``(u, v)`` index pair.

    Looks up the ``'datum'`` cell of each paired row, converts it with
    ``datum_to_sequence`` and prints a notice whenever the two sequences of a
    pair differ in length.

    Returns:
        ``(u_seqs, v_seqs, seq_lengths)``, where ``seq_lengths`` holds one
        ``(len_u, len_v)`` tuple per pair.
    """
    u_seqs, v_seqs, seq_lengths = [], [], []
    for u_idx, v_idx in zip(us, vs):
        seq_u = datum_to_sequence(df.loc[u_idx, 'datum'])
        u_seqs.append(seq_u)
        seq_v = datum_to_sequence(df.loc[v_idx, 'datum'])
        v_seqs.append(seq_v)

        len1, len2 = len(seq_u), len(seq_v)
        seq_lengths.append((len1, len2))
        if not len1 == len2:
            print(f"Lengths of sequences are different: {len1}, {len2}")
    return u_seqs, v_seqs, seq_lengths

def cascade_datums(df, us, vs):
    """Return the ``'datum'`` cells for every ``(u, v)`` index pair.

    Pairs are formed with ``zip``, so the two returned lists always have the
    same length (that of the shorter input).
    """
    paired = [
        (df.loc[u_idx, 'datum'], df.loc[v_idx, 'datum'])
        for u_idx, v_idx in zip(us, vs)
    ]
    u_datums = [first for first, _ in paired]
    v_datums = [second for _, second in paired]
    return u_datums, v_datums


"""Vector-based Scores"""

def cascade_scores(df, us, vs):
    """Cosine distance between the ``'scalar_rep'`` vectors of each pair.

    Returns one ``scipy.spatial.distance.cosine`` value per ``(u, v)`` pair
    produced by ``zip(us, vs)``.
    """
    return [
        cosine(df.loc[u_idx, 'scalar_rep'], df.loc[v_idx, 'scalar_rep'])
        for u_idx, v_idx in zip(us, vs)
    ]


"""Structure maps and alignment scores"""


def compare_two_proteins(datum1: ProteinDatum, datum2: ProteinDatum):
    """Score a pair of datums with both protein metrics.

    NOTE(review): a function with this name and the same body is also defined
    in an earlier cell of this notebook; whichever cell ran last wins.

    Returns:
        A 2-tuple: the ``DistanceMapMetric`` result followed by the
        ``DistanceSeqMetric`` result for ``(datum1, datum2)``.
    """
    # Both metric objects are created before either one is applied.
    map_metric, sequence_metric = DistanceMapMetric(), DistanceSeqMetric()
    return map_metric(datum1, datum2), sequence_metric(datum1, datum2)


def cascade_protein_metrics(df, us, vs):
    """Run ``compare_two_proteins`` on the datums of every ``(u, v)`` pair.

    Returns:
        ``(structure_map_metrics, sequence_metrics)``: two lists holding the
        first and second element of each ``compare_two_proteins`` result.
    """
    structure_map_metrics, sequence_metrics = [], []
    for u_idx, v_idx in zip(us, vs):
        structure_score, sequence_score = compare_two_proteins(
            df.loc[u_idx, 'datum'],
            df.loc[v_idx, 'datum'],
        )
        structure_map_metrics += [structure_score]
        sequence_metrics += [sequence_score]
    return structure_map_metrics, sequence_metrics


"""Eval"""


# Show the selected rows in the dataframe
display(df_small.loc[[u, v]])

# Get sequences
# One entry per level of the cascades; u_datums / v_datums are reused by
# later cells.
u_seqs, v_seqs, seq_lengths = cascade_sequences(df_small, us, vs)
u_datums, v_datums = cascade_datums(df_small, us, vs)

# print(f"Sequence for index {u}: {u_seqs}")
# print(f"Sequence for index {v}: {v_seqs}")
print(f"Sequence lengths: {seq_lengths}")
print()

# Calculate cosine distances
# Uses the helper imported from helpers_new, not the cascade_scores function
# defined in this cell.
scores = calculate_cosine_distances(df_small, us, vs)
print(f"Cosine Scores: {scores}")
print()

# Protein datum metrics
print("Calculating protein metrics:")
struct_scores, seq_scores = cascade_protein_metrics(df_small, us, vs)
print(f"Structure map scores: {struct_scores}")
print(f"Sequence scores: {seq_scores}")


# Plot the proteins

PlotProteinDatum(df_small)(us, vs).show()



Unnamed: 0,pdb_id,level,level_idx,scalar_rep,datum,pos,color
1342,1azzA,2,5,"[-0.032442138, -0.37545, 1.0176944, -0.9997409...",(((<moleculib.protein.datum.ProteinDatum objec...,,
3834,1eerA,2,7,"[0.13381311, -0.35669148, 0.92978686, -0.96813...",(((<moleculib.protein.datum.ProteinDatum objec...,,


Sequence lengths: [(13, 13), (29, 29), (61, 61)]

Cosine Scores: [0.005611203900133366, 0.005151189013532287, 0.0004284890255344953]

Calculating protein metrics:
Structure map scores: [9.05007441167522, 28.9157780390867, 83.05165414765368]
Sequence scores: [(4.0, 12), (9.0, 27), (20.0, 58)]


In [63]:
# Datums of the two start rows (the first level of each cascade).
datum1, datum2 = df_small.loc[u, 'datum'], df_small.loc[v, 'datum']
# align_to returns a ProteinDatum (see the cell output); the result is only
# displayed, not stored.
# NOTE(review): whether datum1 itself is modified depends on moleculib — confirm.
datum1.align_to(datum2)

<moleculib.protein.datum.ProteinDatum at 0x7f9f018c0fd0>

In [104]:
# Both datums come from the first cascade (its first two levels), so this
# compares residue tokens across levels of the same protein.
bb1, bb2 = u_datums[0], u_datums[1]
bb1.residue_token, bb2.residue_token

(array([ 6,  6, 15, 21, 16,  7, 10, 10, 18, 13, 12, 18, 17]),
 array([17,  9, 20, 12, 13, 19,  3,  3, 11,  7, 15,  6, 10,  3, 10, 16, 22,
         6, 22, 22, 13, 10,  3, 11,  5, 12,  4,  9,  6]))

In [142]:
# Length of the first-level datum of cascade u (13 in the recorded run).
len(datum1)

13

In [80]:
# A single grid row showing the two datums twice, side by side.
plot_py3dmol_grid([[datum1, datum2, datum1, datum2]])

<py3Dmol.view at 0x7f9f396d70d0>

In [97]:
import py3Dmol
from colour import Color

# Palette used when plot_py3dmol is called with color="DEFAULT".
DEFAULT_COLORS = [
    "cyan",
    "orange",
    "lime",
]


def plot_py3dmol(
        data,
        color: str = "DEFAULT",
        **kwargs,
    ):
    """Draw every datum in ``data`` into one py3Dmol view and return the view.

    NOTE(review): this definition shadows the ``plot_py3dmol`` imported from
    ``moleculib.graphics.py3Dmol`` at the top of the notebook.

    Args:
        data: Sized sequence of objects exposing ``plot(view, color=..., **kwargs)``.
        color: ``"DEFAULT"`` cycles through ``DEFAULT_COLORS``. Any other value
            colours the items along a gradient from green (first item) to red
            (last item).
        **kwargs: Forwarded unchanged to every ``datum.plot`` call.

    Returns:
        The ``py3Dmol.view`` the items were drawn into.
    """
    v = py3Dmol.view()
    n_items = len(data)
    if color == "DEFAULT":
        # Wrap around the palette so more items than colours no longer raises
        # IndexError.
        colors = [DEFAULT_COLORS[i % len(DEFAULT_COLORS)] for i in range(n_items)]
    else:
        # Gradient from green to red, one hex colour per item.
        gradient = Color('green').range_to(Color('red'), n_items)
        colors = [c.get_hex_l() for c in gradient]

    for datum, datum_color in zip(data, colors):
        datum.plot(v, color=datum_color, **kwargs)

    return v

# plot_py3dmol([datum1], color="DEFAULT").show()




# Manual py3Dmol rendering of datum1 to try out styling options.
# `viewer` is passed as the viewer= argument of every call below.
viewer = (0, 0)
view = py3Dmol.view(linked=True, width=300, height=300)
# Load the datum as a PDB string; {'model': -1} in the style calls selects the
# most recently added model.
view.addModel(datum1.to_pdb_str(), 'pdb', viewer=viewer)
view.addStyle({'model': -1}, {'cartoon': {'color': "spectrum"}}, viewer=viewer)
# view.addStyle({'model': -1}, {'sphere': {'radius': 0.3}}, viewer=viewer)
view.addStyle({'model': -1}, {'stick': {'radius': 0.2}}, viewer=viewer)
# color = 'spectrum'
# view.addStyle({'model': -1}, {'stick': {'radius': 0.2, 'color': color}}, viewer=viewer)
# view.setStyle({'model': -1}, {}, viewer=viewer)

"""Try Slicing"""
# Restyle residues 7-10 only, as white spheres. setStyle (unlike addStyle)
# replaces the styles previously set on the selected atoms.
view.setStyle({'model': -1, 'resi': '7-10'}, {"sphere": {'color': 'white'}}, viewer=viewer)


view.zoomTo()
# NOTE(review): the second argument looks like the background opacity (0 = transparent) — confirm.
view.setBackgroundColor("rgb(0,0,0)", 0)

view.show()


# if sphere:
#     view.addStyle({'model': -1}, {'sphere': {'radius': 0.3}}, viewer=viewer)

# if ribbon:
#     view.addStyle({'model': -1}, {'cartoon': {'color': color}}, viewer=viewer)

# if sidechain:
#     if color != 'spectrum':
#         view.addStyle({'model': -1}, {'stick': {'radius': 0.2, 'color': color}}, viewer=viewer)
#     else:
#         view.addStyle({'model': -1}, {'stick': {'radius': 0.2}}, viewer=viewer)

