## Analysis Dataset

Make an analysis dataset with both encoded representations and custom requests.

This notebook takes embeddings already produced by the ophiuchus model and processes them (removing `None` datums and the amino-acid embedding level, i.e. level 0). It then appends encodings generated from other files to those embeddings, producing a single, coherent dataset that can be analysed and compared in a separate notebook.

The use case is when the standard dataset used to mass-generate embeddings with ophiuchus does not contain particular pairs that are of interest to us. We can then use this notebook to combine multiple embedded datasets.

In [1]:
# Run once cell
# NOTE: os.chdir("..") below is relative to the current working directory, so
# re-running this cell without restarting the kernel moves up one more level.

# Enable autoreload so edits to imported modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

import os
os.chdir("..")

In [2]:
import sys
# Add the directory one level above the current working directory to sys.path
# (only if it is not already there) so modules located there can be imported.
# '..' is resolved against the working directory set by the previous cell.
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from typing import List, Dict, Tuple, Union 

import numpy as np
import pandas as pd

import pickle
import json

from moleculib.protein.datum import ProteinDatum
from moleculib.graphics.py3Dmol import plot_py3dmol, plot_py3dmol_grid


from helpers.database import populate_representations, whatis
from helpers.edges import connect_edges, CascadingEdges
from helpers.data_processing import LoadData, save_df, save_edges

from helpers.utils import CheckPDBs, aa_map, residue_map

In [3]:

## General useful functions

def load_data(folder, embeddings_file, sliced_proteins_file, tsne_file):
    """Load an embedded dataset and flatten it into a DataFrame.

    Args:
        folder: location of the dataset files, forwarded to ``LoadData``.
        embeddings_file: name of the encoded-dataset file.
        sliced_proteins_file: name of the sliced-protein file.
        tsne_file: name of a t-SNE file, or ``None`` to build the
            representations without t-SNE data.

    Returns:
        The DataFrame returned by ``to_dataframe()`` on the populated
        representations. Its shape is printed as a side effect.
    """
    loader = LoadData(folder, embeddings_file, sliced_proteins_file, tsne_file)
    loader.load_all()

    # The t-SNE dataset is only forwarded when a t-SNE file was requested.
    datasets = [loader.encoded_dataset, loader.sliced_dataset]
    if tsne_file is not None:
        datasets.append(loader.tsne_dataset)
    representations, _ = populate_representations(*datasets)

    frame = representations.to_dataframe()
    print(f"Loaded full dataset: {frame.shape}")
    return frame

def get_info(df, verbose=True):
    """Print basic DataFrame information (nothing is returned).

    Counts the rows per ``level`` and the rows whose ``datum`` is null.
    When ``verbose`` is true, the per-level counts are shown with
    ``display`` and the null count is printed; otherwise the function is
    silent.
    """
    # Both statistics are computed up front, regardless of `verbose`.
    level_counts = df.groupby('level').size()
    missing_datum = df['datum'].isnull()
    n_none_datums = df[missing_datum].shape[0]

    if not verbose:
        return

    print("Shape of nodes per level: {}".format(level_counts.shape))
    display(level_counts)
    print(f"Number of None datums: {n_none_datums}")

def process_data(df):
    """Basic clean-up of a representations DataFrame.

    Keeps only the rows that have a non-null ``datum`` and whose ``level``
    is not 0 (the amino acid-level embeddings), renumbers the index from 0
    and prints the resulting shape. The input frame is left untouched.
    """
    has_datum = df['datum'].notna()
    above_level_zero = df['level'] != 0
    df_sample = df[has_datum & above_level_zero].reset_index(drop=True)
    print(f"Shape of sample after drops: {df_sample.shape}")
    return df_sample


def make_edges(df, kernel, stride):
    """Build the edge mappings for ``df`` and wrap them for cascading.

    Calls ``connect_edges(df, kernel, stride)``, prints the number of misses
    it reports, and returns ``(edges_top_down, edges_bottom_up, cascades)``
    where ``cascades`` is a ``CascadingEdges`` built from the bottom-up edges.
    """
    top_down, bottom_up, miss_count = connect_edges(df, kernel, stride)
    cascades = CascadingEdges(bottom_up)
    print(f"Misses: {miss_count}")
    return top_down, bottom_up, cascades


### General Embeddings

Load the embeddings and sliced datums of standard ophiuchus.

In [4]:
# Load the standard dataset
# Since we will do TSNE together later, don't load it now.

FOLDER = "data/denim-energy-1008-embeddings"
embeddings_file = "encoded_dataset.pkl"
sliced_proteins_file = "sliced_dataset.pkl"
# NOTE: tsne_file is defined here but deliberately not passed below
# (load_data is called with tsne_file=None).
tsne_file = "encoded_dataset_tsne.json"

df = load_data(FOLDER, embeddings_file=embeddings_file, 
               sliced_proteins_file=sliced_proteins_file, 
               tsne_file=None)
# Print per-level row counts and the number of None datums of the raw frame.
get_info(df)
# df_sample: raw frame without None datums and without level-0 rows.
df_sample = process_data(df)


Loading embeddings from data/denim-energy-1008-embeddings/encoded_dataset.pkl
Loading sliced proteins from data/denim-energy-1008-embeddings/sliced_dataset.pkl
Loaded full dataset: (431465, 7)
Shape of nodes per level: (5,)


level
0    221142
1    111222
2     56215
3     28602
4     14284
dtype: int64

Number of None datums: 5375
Shape of sample after drops: (204948, 7)


### Get other data sources of interest

In [5]:
# Load the custom embeddings. This follows the same steps as load_data, with
# no t-SNE file given to LoadData.
customloader = LoadData(FOLDER="data/custom-embeddings",
                        embeddings_file="encoded_dataset_custom.pkl",
                        sliced_proteins_file="sliced_dataset_custom.pkl")
customloader.load_all()

# Make objects
custom_reps, _ = populate_representations(customloader.encoded_dataset, customloader.sliced_dataset)
custom_df = custom_reps.to_dataframe()
print(f"Loaded full dataset: {custom_df.shape}")

# Process as before
# (drops None datums and level-0 rows, then renumbers the index)
custom_sample = process_data(custom_df)

Loading embeddings from data/custom-embeddings/encoded_dataset_custom.pkl
Loading sliced proteins from data/custom-embeddings/sliced_dataset_custom.pkl
Loaded full dataset: (99206, 7)
Shape of sample after drops: (47114, 7)


In [6]:
# Append the rows of custom_sample to df_sample
# ignore_index=True discards both original indices and renumbers the rows 0..n-1.
new_df = pd.concat([df_sample, custom_sample], ignore_index=True)

# Check that the indices are unique and increasing
print(new_df.index.is_monotonic_increasing)
print(new_df.index.is_unique)
print(f"Shape of new_df: {new_df.shape}")

True
True
Shape of new_df: (252062, 7)


In [7]:
# Count missing values per column. 'pos' and 'color' are expected to be
# entirely missing at this point because no t-SNE file was loaded above.
new_df.isna().sum()

pdb_id             0
level              0
level_idx          0
scalar_rep         0
datum              0
pos           252062
color         252062
dtype: int64

In [8]:
# Now do edges for the new dataframe
# make_edges also prints the number of misses reported by connect_edges.

edges_top_down, edges_bottom_up, make_cascades = make_edges(new_df, kernel=5, stride=2)
# Print a short summary (type and length) of both edge mappings.
whatis(edges_top_down, edges_bottom_up)

Misses: 11933
Object 0: ({81411: [81286, 81287, 81288, 81289, 81290], 81412...) is a dictionary with length 116399
Object 1: ({81286: 81411, 81287: 81411, 81288: 81412, 81289: ...) is a dictionary with length 236217


In [25]:
# Now save the new dataframe and edges. This is what will be used for the rest of the analysis.
# Only base names are passed; judging by the messages these helpers print, they
# add the folder prefix and the ".json" extension themselves.
save_df(new_df, "encoded_dataset_combined")
print("Saved new_df")
save_edges(edges_bottom_up, "edges_bottom_up")

DataFrame saved as JSON to data//encoded_dataset_combined.json
Saved new_df
edges_bottom_up has been saved to data//edges_bottom_up.json


In [26]:
%%time

# Save the DataFrame as a CSV file
new_df.to_csv("data/master_dataframe.csv")
print("DataFrame saved as CSV to data/combined_dataset.csv")

# Save the edges_bottom_up dictionary as a pickle file
import pickle
with open("data/master_edges.pkl", "wb") as f:
    pickle.dump(edges_bottom_up, f)
print("edges_bottom_up dictionary saved as pickle to data/edges_bottom_up.pkl")


DataFrame saved as CSV to data/combined_dataset.csv
edges_bottom_up dictionary saved as pickle to data/edges_bottom_up.pkl
CPU times: user 1min 28s, sys: 797 ms, total: 1min 29s
Wall time: 1min 29s


In [27]:
%%time

# Save the DataFrame as a pickle file
# NOTE(review): this cell sits before the t-SNE section, so on a top-to-bottom
# run the pickled frame does not yet have its 'pos'/'color' columns filled in.
with open("data/master_dataframe.pkl", "wb") as f:
    pickle.dump(new_df, f)
print("DataFrame saved as pickle to data/master_dataframe.pkl")


DataFrame saved as pickle to data/master_dataframe.pkl
CPU times: user 13.6 s, sys: 2.31 s, total: 15.9 s
Wall time: 16.2 s


### TSNE Time

In [11]:
# Prepare the dataframe for TSNE processing

from collections import defaultdict
from sklearn.manifold import TSNE


def compute_tsne(df: pd.DataFrame, perplexity: float = 3, random_state=None):
    """Compute per-level t-SNE positions and colours, aligned with ``df`` rows.

    For every distinct value in ``df['level']`` two independent t-SNE
    embeddings of that level's ``scalar_rep`` vectors are fitted: a 2-D one
    used as the position and a 3-D one that is shifted/scaled to 0..255 and
    formatted as an ``'rgb(r, g, b)'`` string.

    The returned lists follow the row order of ``df`` (entry ``i`` belongs to
    ``df.iloc[i]``), so they can be assigned directly as columns, e.g.
    ``df['pos'] = positions``. They are deliberately NOT grouped by level:
    assigning a list to a column is positional, so level-grouped output would
    attach values to the wrong rows whenever ``df`` is not sorted by level.

    Args:
        df: frame with a ``level`` column and a ``scalar_rep`` column holding
            one vector per row (vectors of one level must share a length).
        perplexity: t-SNE perplexity used for both embeddings. scikit-learn
            requires it to be smaller than the number of rows in each level.
        random_state: forwarded to ``TSNE`` for reproducible embeddings;
            ``None`` keeps the run non-deterministic.

    Returns:
        ``(positions, colors)``: two lists of length ``len(df)``; positions
        are ``[x, y]`` float lists and colors are ``'rgb(r, g, b)'`` strings.
    """
    n_rows = len(df)
    positions_lst = [None] * n_rows
    colors_lst = [None] * n_rows

    level_of_row = df['level'].to_numpy()
    for level in sorted(df['level'].unique()):
        # Positional (not label-based) row numbers of this level within df.
        row_numbers = np.flatnonzero(level_of_row == level)
        level_data = np.array(df['scalar_rep'].iloc[row_numbers].tolist())

        print(f'computing position tsne for level {level}: {level_data.shape}')
        position = TSNE(n_components=2, perplexity=perplexity, learning_rate='auto',
                        init='random', random_state=random_state).fit_transform(level_data)
        print(f'computing color tsne for level {level}: {level_data.shape}')
        colors = TSNE(n_components=3, perplexity=perplexity, learning_rate='auto',
                      init='random', random_state=random_state).fit_transform(level_data)

        # Map the 3-D embedding onto 0..255 using one global min/max for all
        # three channels; a constant embedding maps to black instead of NaN.
        colors = colors - colors.min()
        color_range = colors.max()
        if color_range > 0:
            colors = colors * 255 / color_range
        colors = colors.astype(np.int32)
        color_strings = [f'rgb({r}, {g}, {b})' for r, g, b in colors]

        # Write each result back to the slot of the row it was computed from.
        for row, pos, color in zip(row_numbers, position.tolist(), color_strings):
            positions_lst[row] = pos
            colors_lst[row] = color

    return positions_lst, colors_lst

    #     cumsum = 0
    #     pdb_groups = df_for_level.groupby('pdb_id')
    #     for pdb, group in pdb_groups:
    #         len_ = group.shape[0]
    #         encoded_dataset_tsne[pdb][level]['pos'] = position[cumsum:cumsum+len_].tolist()
    #         encoded_dataset_tsne[pdb][level]['colors'] = colors[cumsum:cumsum+len_]
    #         cumsum += len_

    # return encoded_dataset_tsne



In [12]:
# Get a 0.5% random sample (frac=0.005) of the dataframe to try t-SNE on a
# small subset first; random_state fixes which rows are sampled.
small_df = new_df.sample(frac=0.005, random_state=42)
print(small_df.shape)
positions, colors = compute_tsne(small_df)



(1260, 7)
computing position tsne for level 1: (672, 33)
computing color tsne for level 1: (672, 33)
computing position tsne for level 2: (358, 46)
computing color tsne for level 2: (358, 46)
computing position tsne for level 3: (165, 64)
computing color tsne for level 3: (165, 64)
computing position tsne for level 4: (65, 89)
computing color tsne for level 4: (65, 89)


In [13]:
# Assigning a plain list to a column is positional (element i -> row i), so
# both lists must be in the same row order as small_df.
small_df['pos'] = positions
small_df['color'] = colors
print(small_df.shape)
small_df.head()

(1260, 7)


Unnamed: 0,pdb_id,level,level_idx,scalar_rep,datum,pos,color
97653,1d3bD,1,25,"[-0.13821925, 0.24543715, -0.33401486, 0.01873...",(((<moleculib.protein.datum.ProteinDatum objec...,"[-35.692317962646484, 32.21705627441406]","rgb(84, 108, 228)"
73996,1bvyF,1,20,"[-0.08123147, 0.25039104, -0.2238249, 0.043804...",(((<moleculib.protein.datum.ProteinDatum objec...,"[12.129412651062012, 0.21517214179039001]","rgb(165, 112, 189)"
177959,1eovA,1,58,"[-0.1622783, 0.27444598, -0.31894395, 0.078085...",(((<moleculib.protein.datum.ProteinDatum objec...,"[-0.19315895438194275, 16.106788635253906]","rgb(150, 79, 201)"
58740,1ez0A,1,33,"[0.03047585, 0.23298316, -0.13053995, 0.007102...",(((<moleculib.protein.datum.ProteinDatum objec...,"[46.32844161987305, -10.210537910461426]","rgb(195, 139, 114)"
154858,1axtL,1,105,"[-0.46172005, -0.21491313, -0.6202147, 0.16822...",(((<moleculib.protein.datum.ProteinDatum objec...,"[38.54891586303711, -34.35532760620117]","rgb(235, 126, 160)"


In [15]:
# Keep an independent copy of new_df before its 'pos'/'color' columns are
# overwritten further down.
new_df_copied = new_df.copy()

In [16]:
# Let's try the full thing
# (show the size of the full frame before running t-SNE on it)
new_df.shape

(252062, 7)

In [17]:
%%time
# Run t-SNE on every level of the full combined frame (no sub-sampling).
full_positions, full_colors = compute_tsne(new_df)

# This cell took 56 minutes to run on my machine.

computing position tsne for level 1: (135663, 33)
computing color tsne for level 1: (135663, 33)
computing position tsne for level 2: (67370, 46)
computing color tsne for level 2: (67370, 46)
computing position tsne for level 3: (33193, 64)
computing color tsne for level 3: (33193, 64)
computing position tsne for level 4: (15836, 89)
computing color tsne for level 4: (15836, 89)
CPU times: user 7h 12min 9s, sys: 8min 15s, total: 7h 20min 24s
Wall time: 56min 38s


In [18]:
# Save the full positions and full colors to pickle files
# Since they are Python lists, we need to use pickle directly
# NOTE: the bare file names below are relative, so both files are written to
# the current working directory rather than under data/.
import pickle

with open('full_positions.pkl', 'wb') as f:
    pickle.dump(full_positions, f)

with open('full_colors.pkl', 'wb') as f:
    pickle.dump(full_colors, f)




In [19]:
## Append
# Assigning a plain list to a column is positional (element i -> row i), so
# both lists must be in new_df's row order. This mutates new_df in place.

new_df['pos'] = full_positions
new_df['color'] = full_colors
print(new_df.shape)
new_df.head()

(252062, 7)


Unnamed: 0,pdb_id,level,level_idx,scalar_rep,datum,pos,color
0,1f00I,1,0,"[0.081032045, 0.62376326, 0.28515857, 0.197421...",(((<moleculib.protein.datum.ProteinDatum objec...,"[-8.188774108886719, 106.9360122680664]","rgb(117, 49, 59)"
1,1f00I,1,1,"[-0.28887343, 0.001341799, -0.54696304, 0.1838...",(((<moleculib.protein.datum.ProteinDatum objec...,"[-80.61167907714844, 33.45246887207031]","rgb(141, 117, 56)"
2,1f00I,1,2,"[-0.11274243, 0.2764013, -0.36209202, 0.011574...",(((<moleculib.protein.datum.ProteinDatum objec...,"[71.27960205078125, 38.42264175415039]","rgb(92, 128, 174)"
3,1f00I,1,3,"[-0.12116315, 0.50699997, -0.15239324, 0.09882...",(((<moleculib.protein.datum.ProteinDatum objec...,"[45.77057647705078, -70.40210723876953]","rgb(91, 76, 112)"
4,1f00I,1,4,"[-0.14587262, 0.10403667, -0.38717338, 0.06709...",(((<moleculib.protein.datum.ProteinDatum objec...,"[38.595611572265625, 48.74850845336914]","rgb(93, 210, 190)"


In [20]:
# Check for None datums in the new_df DataFrame
# (process_data already dropped them from both inputs, so 0 is expected)
none_datums_count = new_df['datum'].isnull().sum()
print(f"Number of None datums in new_df: {none_datums_count}")


Number of None datums in new_df: 0


In [21]:
# Now we compute edges on this 
# Same kernel/stride as the earlier edge computation; the results overwrite
# the previously computed edge variables.
edges_top_down, edges_bottom_up, make_cascades = make_edges(new_df, kernel=5, stride=2)

Misses: 11933


In [22]:
# Summarise the bottom-up edge mapping (presumably child row index -> parent
# row index, judging by the printed sample — TODO confirm).
whatis(edges_bottom_up)

Object 0: ({81286: 81411, 81287: 81411, 81288: 81412, 81289: ...) is a dictionary with length 236217


Note that there is a problem whereby some datums have length 0. At this point we should just filter those rows out; there should be only 1024 of them.

In [None]:


# Drop rows where the length of the datum object is 0.
# Filter `new_df` (the combined frame, which has no None datums and carries
# the t-SNE columns). The raw `df` still contains None datums, on which `len`
# raises a TypeError.
final_df = new_df[new_df['datum'].apply(len) > 0]
print(final_df.shape)

# Recompute the edges so that they describe the filtered frame
# (kernel=5, stride=2, as in the earlier edge computations).
edges_top_down, edges_bottom_up, n_misses = connect_edges(final_df, 5, 2)

# from helpers.data_processing import save_df, save_edges

# Save like this
# (disabled on purpose; flip the condition to overwrite the master pickles)
if False:
    with open("data/master_dataframe.pkl", "wb") as f:
        pickle.dump(final_df, f)
    print("final_df saved as pickle.")

    with open("data/master_edges.pkl", "wb") as f:
        pickle.dump(edges_bottom_up, f)
    print("edges_bottom_up saved as pickle.")


In [23]:
# NOTE(review): in the earlier save cell these helpers reported writing
# "data//<filename>.json", i.e. they appear to prepend a folder and append
# ".json" themselves. Passing "data/master_dataframe.pkl" as the filename may
# therefore not produce a pickle at that path — TODO confirm against
# helpers.data_processing.
save_df(new_df, "data/master_dataframe.pkl")
save_edges(edges_bottom_up, "data/master_edges.pkl")

In [None]:
from helpers.data_processing import save_edges, save_df

# One final processing scheme to make the data usable in react.

# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
path_to_visual = "/Users/moniradev/Documents/projects/foreign/picture-picture/src/data"

# Export the filtered frame (final_df) rather than the raw `df`, leaving out
# the embedding vectors and the datum objects.
df_for_visual = final_df.drop(columns=['scalar_rep', 'datum'])
print(final_df.shape, df_for_visual.shape)
df_for_visual.tail()

save_df(df_for_visual, filename="master_dataframe", folder=path_to_visual)
# `edges_bottom_up` is the mapping stored as "master_edges" elsewhere in this
# notebook; the bare name `edges` is never defined.
save_edges(edges_bottom_up, filename="master_edges", folder=path_to_visual)


(251038, 7) (251038, 5)
DataFrame saved as JSON to /Users/moniradev/Documents/projects/foreign/picture-picture/src/data/master_dataframe.json
edges_bottom_up has been saved to /Users/moniradev/Documents/projects/foreign/picture-picture/src/data/master_edges.json


In [42]:
# Distinct pdb_id values present in the (unprocessed) custom dataset.
custom_pdbs = custom_df['pdb_id'].unique()
print(f"Number of unique pdbs: {len(custom_pdbs)}")

# Drop the last character of each id before de-duplicating
# (presumably the chain letter, e.g. "1f00I" -> "1f00" — TODO confirm).
custom_assemblies = pd.Series([pdb[:-1] for pdb in custom_pdbs]).unique()
print(f"Number of unique assemblies: {len(custom_assemblies)}")

# Per-level row counts and number of None datums of the custom dataset.
get_info(custom_df)

Number of unique pdbs: 287
Number of unique assemblies: 66
Shape of nodes per level: (5,)


level
0    51090
1    25599
2    12849
3     6484
4     3184
dtype: int64

Number of None datums: 1002
