## Picture Picture

This notebook exists for one purpose: making pictures of proteins.

In [1]:
## Run once cell

%load_ext autoreload
%autoreload 2

import os
os.chdir('..')

In [2]:
import sys
from typing import List, Tuple, Dict, Any
from functools import reduce

import numpy as np
import pandas as pd

from moleculib.assembly.datum import AssemblyDatum
from moleculib.protein.datum import ProteinDatum
from moleculib.graphics.py3Dmol import plot_py3dmol_grid

from moleculib.protein.transform import (
    ProteinCrop,
    TokenizeSequenceBoundaries,
    ProteinPad,
    MaybeMirror,
    BackboneOnly,
    DescribeChemistry
)


from helpers.edges import connect_edges, CascadingEdges
from helpers.cascades import Cascade, MakeCascade, Metrics, MetricsPair, MakeMetricsPair
from helpers.neighborhood import GetNeighbors, NeighborMetrics, MakeNeighborMetrics
from helpers.candidates import MakeCandidate



# Shared size passed to both ProteinCrop (crop_size) and ProteinPad (pad_size) below.
max_chain_len = 253  # max length for denim-energy model
# Ordered pipeline: `transform()` folds a datum through these objects in list
# order, calling each one's `.transform(datum)` method, so the order matters.
protein_transform = [
    ProteinCrop(crop_size=max_chain_len),
    TokenizeSequenceBoundaries(),
    MaybeMirror(hand='left'),
    ProteinPad(pad_size=max_chain_len, random_position=False),
    BackboneOnly(filter=True),
    DescribeChemistry(),
]



# Directory holding the pickled inputs, relative to the current working directory.
path_to_data = "data/final/"
# NOTE(review): unpickling can execute arbitrary code; only load files from a
# trusted source.
df = pd.read_pickle(os.path.join(path_to_data, "master_dataframe.pkl"))
edges = pd.read_pickle(os.path.join(path_to_data, "master_edges.pkl"))
# A bare expression in the middle of a cell is evaluated and thrown away, so
# the sizes are printed explicitly to make this sanity check visible.
print(f"df shape: {df.shape}, number of edges: {len(edges)}")

## Initialize the cascading edges
cascading_edges = CascadingEdges(edges)


### Boilerplate code

In [3]:

# Given a list of PDB ids, pull them from moleculib and visualize


def transform(datum):
    """Pass `datum` through every entry of `protein_transform`, in list order.

    Each entry's ``transform`` method receives the output of the previous one;
    the output of the last entry is returned. With an empty pipeline the input
    is returned unchanged.
    """
    current = datum
    for step in protein_transform:
        current = step.transform(current)
    return current

class FetchPDBids:
    """Fetch PDB ids as AssemblyDatums and keep their protein datums for plotting.

    Typical use: build with a list of ids, call the instance to fetch, then use
    ``togrid()`` to get a list of rows of datums.
    """

    def __init__(self, pdb_ids: List[str]):
        """Store the ids to fetch, lower-cased.

        Args:
            pdb_ids: PDB id strings. A single bare string is also accepted and
                treated as a one-element list.
        """
        # Without this guard a bare string would be iterated character by character.
        if isinstance(pdb_ids, str):
            pdb_ids = [pdb_ids]
        self.pdb_ids = [pdb_id.lower() for pdb_id in pdb_ids]
        self.datums = []  # protein datums collected by __call__
        self.transformed = []  # list of transformed ProteinDatums

    def __call__(self, chain_ids=None):
        """Fetch every id and collect the protein datums of each assembly.

        Each call starts from an empty ``datums`` list and clears
        ``transformed``, so calling the instance again re-fetches instead of
        appending duplicates. Datums are stored as they arrive, so whatever was
        fetched before an exception remains available.

        Args:
            chain_ids: Currently unused; accepted for interface compatibility.
        """
        self.datums = []
        self.transformed = []  # anything computed from the old datums is stale
        print(f"Fetching {len(self.pdb_ids)} PDB IDs...", end=" ")
        for pdb_id in self.pdb_ids:
            assembly = AssemblyDatum.fetch_pdb_id(pdb_id)
            self.datums.extend(assembly.protein_data)
            print(f"{pdb_id}, ", end="")
        print("\nDone")

    def togrid(self, k=None, num_columns=3, use_transformed=False):
        """Return the first `k` datums arranged as rows of `num_columns`.

        Args:
            k: Number of datums to include; ``None`` means ``len(self.datums)``.
            num_columns: Row length of the grid (the last row may be shorter).
            use_transformed: Use ``self.transformed`` instead of ``self.datums``.
                The transformed list is (re)computed first when its length does
                not match ``self.datums``.

        Returns:
            A list of lists of datums.
        """
        if k is None:
            k = len(self.datums)
        if use_transformed:
            # Covers both "never transformed" and "datums changed since".
            if len(self.transformed) != len(self.datums):
                self.transform()
            source = self.transformed
        else:
            source = self.datums
        return self.make_grid(source[:k], num_columns)

    @staticmethod
    def make_grid(datums: List["ProteinDatum"], num_columns=3):
        """Split `datums` into consecutive rows of at most `num_columns` items.

        Raises:
            ValueError: If `num_columns` is smaller than 1. A negative value
                would otherwise silently produce an empty grid.
        """
        if num_columns < 1:
            raise ValueError(f"num_columns must be >= 1, got {num_columns}")
        return [datums[i:i + num_columns] for i in range(0, len(datums), num_columns)]

    def transform(self):
        """Rebuild ``self.transformed`` by applying the module-level ``transform`` to each datum."""
        # Inside a method body the bare name `transform` resolves to the
        # module-level function, not to this method.
        self.transformed = [transform(datum) for datum in self.datums]


In [19]:
# PDB ids grouped under the label `beta_helix`.
beta_helix = [
    '2jp7', '1prp', '3nxq', '1gca', '1pcl', '1xiq', '2pqe', '1kzq', '4mzu', '1wpc',
    '1fnu', '4g6r', '4jj2', '3hno', '1lxa', '6ria', '1hg9', '1dcq', '1cb7', '3a1m',
    '4zu7', '1acc', '1l5j', '6rib', '2jer', '1air', '2d40', '2fla', '1qte', '2kl8',
    '1dbv', '2obg', '7jvi', '2z0q', '1yox', '1f6w', '3i48', '3zds', '4puq', '1qre',
    '6e5c', '1cts', '1hin', '2qnz', '3ub3', '1idj', '3obw', '1dab', '3uxh', '4osd',
    '4aq6', '4aq2', '4fl6', '2ln3', '1znp',
]

# PDB ids grouped under the label `dna_binding`.
dna_binding = ['4xqk']

# Earlier version of the next list, by enzyme name instead of PDB id:
# dna_restriction_enzymes = ['EcoRI', 'HindIII', 'BglII', 'PstI', 'SmaI', 'NcoI', 'PvuII']
dna_restriction_enzymes = [
    '2OXV',
    '2E52',
    '1BAM',
    '2IXS',
    '1PVI',
]

In [None]:

def find_pdb(pdb_id, level=None):
    """Find rows of the global `df` whose `pdb_id` column contains `pdb_id`.

    Matching is case-insensitive and goes through ``Series.str.contains`` with
    its default pattern handling, so `pdb_id` is interpreted as a regular
    expression. Rows with a missing `pdb_id` never match.

    Args:
        pdb_id: Substring (or pattern) to look for in the `pdb_id` column.
        level: If given, additionally keep only rows whose `level` column
            equals this value.

    Returns:
        The matching subset of `df`.
    """
    # na=False keeps the mask strictly boolean; a NaN in the mask would make
    # the `df[mask]` lookup below raise.
    mask = df['pdb_id'].str.contains(pdb_id, case=False, na=False)
    if level is not None:
        mask = mask & (df['level'] == level)
    return df[mask]


# term_indices = [232853,232854,232855,250110,250111,250112]

# Small id list used by the commented-out single-structure variants below.
list_of_pdbs = ['2kl8', '2ln3']

# find_pdb(list_of_pdbs[1], level=2)

# fetcher = FetchPDBids([list_of_pdbs[1]])
# Active variant: fetch every id listed in `beta_helix`.
fetcher = FetchPDBids(beta_helix)

# Order matters: fetch fills `fetcher.datums`, transform() derives
# `fetcher.transformed` from it, and only then is the grid built.
# NOTE(review): one AssemblyDatum.fetch_pdb_id call per id — presumably a
# download each, so this cell can be slow; confirm.
fetcher()
fetcher.transform()
print(f"Number of fetched datums: {len(fetcher.datums)}")

# Show the first 10 transformed datums, three per row.
plot_py3dmol_grid(fetcher.togrid(k=10, num_columns=3, use_transformed=True))

In [21]:
# Show the first 30 transformed datums of `fetcher`, three per row.
plot_py3dmol_grid(fetcher.togrid(k=30, num_columns=3, use_transformed=True))

<py3Dmol.view at 0x7f899e9617b0>

In [16]:
# sample = beta_helix[20:25]
sample = dna_binding
# Uses its own fetcher instance; the `fetcher` variable is not modified here.
fetcher_sample = FetchPDBids(sample)
fetcher_sample()
fetcher_sample.transform()
# k=10 is only an upper bound: togrid slices with [:k], so fewer datums are all shown.
plot_py3dmol_grid(fetcher_sample.togrid(k=10, num_columns=3, use_transformed=True))

Fetching 1 PDB IDs... 4xqk, 
Done


<py3Dmol.view at 0x7f89392130d0>

In [9]:
# Plot them: the first 15 transformed datums of `fetcher`, three per row.

plot_py3dmol_grid(fetcher.togrid(k=15, num_columns=3, use_transformed=True))

<py3Dmol.view at 0x7f8938472080>

In [None]:
from helpers.utils import aa_map, residue_map
# Feeds the `residue_token` field of the first fetched datum through
# residue_map, then aa_map (wrapped in a one-element list, result unwrapped with [0]).
# NOTE(review): presumably this decodes the tokens into an amino-acid sequence —
# confirm against helpers.utils. Requires `fetcher` to have fetched at least one datum.
aa_map([residue_map(fetcher.datums[0].residue_token)])[0]

