# Implementing protein components
This Jupyter notebook gives you quick access to the implementation of protein components.


In [None]:
import gufe

## Dev function

In [None]:
# Input: relative path of the PDB file that the cells below parse.
pdb_path= "./thrombin_protein.pdb"


In [None]:
#Imports
import json, ast
from collections import defaultdict

from rdkit import Chem
from rdkit.Chem.rdchem import Mol, Atom, Conformer, EditableMol, BondType

from gufe.components.sub_files.pdbfile import PDBFile #Vendored code - import

from rdkit import Chem
from rdkit.Chem.rdchem import Mol, Atom, Conformer, EditableMol, BondType


In [None]:
# Parse the input PDB with the PDBFile class vendored in gufe.
openmm_PDBFile = PDBFile(pdb_path)
# Name stored further below on the RDKit molecule as the "ofe-name" property.
# NOTE(review): spelled "trhomb" here while later cells use "thrombin" - confirm this is intended.
name ="trhomb"

In [None]:
# Maps an OpenMM bond order to the RDKit bond type used when adding bonds;
# a bond without an order (None) is treated as a single bond.
bond_types = {
    1: BondType.SINGLE,
    2: BondType.DOUBLE,
    3: BondType.TRIPLE,
    None: BondType.SINGLE,
}

# Atom names recognised as ions when an atom has no bonds.
positive_ions = ["NA", "MG"]
negative_ions = ["CL"]

## OpenMM to rdkit

In [None]:
# RDKit periodic table (queried for default valences further below) and the
# OpenMM topology that was parsed from the PDB file.
periodicTable = Chem.GetPeriodicTable()
mol_topology = openmm_PDBFile.topology

# Atoms and bonds are first collected in an editable molecule.
rd_mol = Mol()
editable_rdmol = EditableMol(rd_mol)

# Build Topology
# Both lookups are keyed by "<residue index>_<residue name>":
#   _residue_atom_map    -> atom indices of every residue
#   histidine_resi_atoms -> atom names of every HIS residue (for the histidine fix)
_residue_atom_map = defaultdict(list)
histidine_resi_atoms = defaultdict(list)

# Add Atoms
for omm_atom in mol_topology.atoms():
    omm_residue = omm_atom.residue
    atom_index = int(omm_atom.index)
    residue_name = omm_residue.name
    residue_index = int(omm_residue.index)
    chain_index = int(omm_residue.chain.index)
    residue_key = str(residue_index) + "_" + residue_name

    # Mirror the OpenMM atom as an RDKit atom carrying the PDB bookkeeping as properties.
    rd_atom = Atom(omm_atom.element.atomic_number)
    rd_atom.SetAtomMapNum(atom_index)
    rd_atom.SetProp("name", omm_atom.name)
    rd_atom.SetIntProp("id", atom_index)
    rd_atom.SetProp("resName", residue_name)
    rd_atom.SetIntProp("resId", residue_index)
    rd_atom.SetIntProp("chainId", chain_index)

    # Remember histidine atom names; the formal-charge step inspects them.
    if omm_residue.name == "HIS":
        histidine_resi_atoms[residue_key].append(omm_atom.name)
    _residue_atom_map[residue_key].append(atom_index)

    editable_rdmol.AddAtom(rd_atom)

# Add Bonds
for omm_bond in mol_topology.bonds():
    rd_bond_type = bond_types[omm_bond.order]
    editable_rdmol.AddBond(
        beginAtomIdx=omm_bond.atom1.index,
        endAtomIdx=omm_bond.atom2.index,
        order=rd_bond_type,
    )

# Set Positions
# WIP: Make multi frame safe
rd_mol = editable_rdmol.GetMol()
conformer = Conformer(0)
for position_index, xyz in enumerate(openmm_PDBFile.positions._value):
    conformer.SetAtomPosition(position_index, list(xyz))  # unit: nm
rd_mol.AddConformer(conformer)


# Add Additionals
# Formal Charge
# For every atom, compare the summed bond orders ("connectivity") with the
# element's default valence and store the difference as the formal charge.
# The per-atom charges are accumulated in `netcharge`.
# Bond types of histidine rings may be modified inside this loop, so the
# connectivity seen by later atoms depends on the iteration order.
atoms = rd_mol.GetAtoms()
netcharge = 0
for a in atoms:
    atomic_num = a.GetAtomicNum()
    atom_name = a.GetProp("name")
    resn = a.GetProp("resName") 

    # Sum of the integer values of the bond types on this atom.
    connectivity = sum([int(bond.GetBondType()) for bond in a.GetBonds()]) #
    
    default_valence = periodicTable.GetDefaultValence(atomic_num)
    
    # HISTIDINE FIX  resonance
    # Due to the resonance of the Ns in His (which are frequently de/protonating in proteins), there can be bond type changes between ND1-CE1-NE2. 
    # Applies to HIS atoms whose name contains "N" and is longer than one
    # character, i.e. the backbone atom named "N" is skipped.
    if("HIS" == resn and "N" in atom_name and len(atom_name)>1):
        resi = int(a.GetProp("resId"))
        dict_key = str(resi)+"_"+resn

        histidine_atoms = histidine_resi_atoms[dict_key]
        # Is the hydrogen named like this nitrogen (every "N" replaced by "H") present in the residue?
        own_prot = atom_name.replace("N", "H") in histidine_atoms
        # First other atom name of the residue that starts with "N" and has more
        # than one character; raises IndexError if the residue has none.
        other_N = list(filter(lambda x: x.startswith("N") and len(x) > 1 and not atom_name== x, histidine_atoms))[0]
        other_prot = other_N.replace("N", "H") in histidine_atoms

        # Only this nitrogen carries its hydrogen and its bond orders do not
        # match the default valence: move the double bond to the other nitrogen.
        if(own_prot and not other_prot and connectivity != default_valence):
            #change bond-order
            # Bond between this nitrogen and the atom named CE1 -> single.
            bond_change = [bond for bond in a.GetBonds() if("CE1" in (bond.GetBeginAtom().GetProp("name"),
                                                                    bond.GetEndAtom().GetProp("name")))][0]
            bond_change.SetBondType(bond_types[1])
            
            # Find the other nitrogen by scanning all atoms of the molecule.
            # NOTE(review): "resId" is stored with SetIntProp but compared as a
            # string here - assumes GetProp returns the string form; confirm.
            alternate_atom = [atomB for atomB in rd_mol.GetAtoms() if(atomB.GetProp("resId") == str(resi) and atomB.GetProp("name") == str(other_N))][0]
            # Bond between the other nitrogen and CE1 -> double.
            bond_change = [bond for bond in alternate_atom.GetBonds() if("CE1" in (bond.GetBeginAtom().GetProp("name"),
                                                                                bond.GetEndAtom().GetProp("name")))][0]
            bond_change.SetBondType(bond_types[2])  
        # Recompute, because the bond types of this atom may just have changed.
        connectivity = sum([int(bond.GetBondType()) for bond in a.GetBonds()])

    ### HISTIDINE FIX DONE
    
    # Atoms without bonds are treated as ions and must be listed in
    # positive_ions / negative_ions; anything else raises ValueError.
    if(connectivity == 0): #ions:
        if(atom_name in positive_ions):
            fc = default_valence  #e.g. Sodium ions
        elif(atom_name in negative_ions):
            fc = -default_valence  #e.g. Chlorine ions
        else:
            raise ValueError("I don't know this Ion! \t"+atom_name)  
    # Bonded atoms: fewer bond orders than the default valence gives a negative
    # charge, more gives a positive charge.
    elif(default_valence > connectivity):
        fc = -(default_valence-connectivity) # negative charge
    elif(default_valence < connectivity):
        fc = +(connectivity-default_valence) # positive charge
    else:
        fc = 0 # neutral

    a.SetFormalCharge(fc)
    # Strict property-cache update for this atom after the charge was set.
    a.UpdatePropertyCache(strict=True)
    
    netcharge+=fc

# Molecule props
# Store topology-level information as properties on the RDKit molecule.
# List- and dict-valued data is stored as its str() representation.

# Adding nums:
rd_mol.SetProp("ofe-name", name)
rd_mol.SetIntProp("NumAtoms", mol_topology.getNumAtoms())
rd_mol.SetIntProp("NumBonds", mol_topology.getNumBonds())
rd_mol.SetIntProp("NumChains", mol_topology.getNumChains())
rd_mol.SetDoubleProp("NetCharge", netcharge)

# Chains
# "chain_names" holds the chain indices (c.index) and "_chain_resi" the residue
# indices of each chain, in topology order.
rd_mol.SetProp("chain_names", str([c.index for c in mol_topology.chains()]))
rd_mol.SetProp("_chain_resi", str([[r.index for r in c.residues()] for c in mol_topology.chains()]))

# Residues
res_seq = " ".join([r.name for r in mol_topology.residues()])
rd_mol.SetProp("sequence", res_seq)
rd_mol.SetProp("_residue_atom_map", str(dict(_residue_atom_map)))

# Box dimensions
# A topology without periodic box information returns None for the box vectors;
# accessing `._value` on it would raise AttributeError. In that case both
# properties are stored as the string "None" instead.
box_vectors = mol_topology.getPeriodicBoxVectors()
if box_vectors is None:
    pbcVs = None
    unitCellDim = None
else:
    pbcVs = list(map(list, box_vectors._value)) #unit: nm
    unitCellDim = list(map(float, mol_topology.getUnitCellDimensions()._value)) #unit: nm
rd_mol.SetProp("PeriodicBoxVectors", str(pbcVs))
rd_mol.SetProp("UnitCellDimensions", str(unitCellDim))


rd_mol.UpdatePropertyCache(strict=True)

In [None]:
openmm_PDBFile.getTopology()

In [None]:
rd_mol

## Protein Component - Class

### Class Implementation - from_pdb

In [None]:
import ast, json

from rdkit import Chem
from gufe.components.sub_files.pdbfile import PDBFile
from gufe.components.sub_files.pdbstructure import PdbStructure

from openmm.unit import nanometers, angstroms, is_quantity, norm, Quantity


from gufe import ProteinComponent
# Build a ProteinComponent named "thrombin" from the PDB file and display it.
pdb_path= "./thrombin_protein.pdb"

prot = ProteinComponent.from_pdbfile(pdb_path, name="thrombin")
prot

In [None]:
prot._rdkit

In [None]:
### WIP: to OpenMM/PDB

In [None]:
# WIP sketch: create a PdbStructure without arguments and wrap it in a PDBFile.
# NOTE(review): confirm that the vendored PdbStructure/PDBFile accept being
# constructed this way (no input stream, structure object instead of a path).
pdb_structure = PdbStructure()
openmm_PDBFile = PDBFile(pdb_structure)

# Parse the box vectors back from their string property on the RDKit molecule.
periodic_box_vectors = ast.literal_eval(prot._rdkit.GetProp("PeriodicBoxVectors"))
# NOTE(review): the parsed vectors are not used; None is passed instead -
# presumably a placeholder, confirm the intended argument.
openmm_PDBFile.topology.setPeriodicBoxVectors(None)



### Class Implementation - dict from and to

In [None]:
from gufe import ProteinComponent
# Build a ProteinComponent named "thrombin" from the PDB file and display it.
pdb_path= "./thrombin_protein.pdb"

prot = ProteinComponent.from_pdbfile(pdb_path, name="thrombin")
prot

In [None]:
prot._rdkit.GetProp("_chain_residues")

In [None]:
# Round trip: component -> dict -> component -> dict, so that the two
# dictionaries can be compared afterwards.
dict_prot = prot.to_dict()
sec_prot = prot.from_dict(dict_prot)
dict_sec_prot = sec_prot.to_dict()

In [None]:
prot.to_rdkit() == sec_prot.to_rdkit()

In [None]:
# Print, for every top-level key, whether the original and the round-tripped
# dictionary hold equal values.
for key in dict_prot:
    print(key, dict_prot[key] == dict_sec_prot[key])
    

### Class Implementation - to-> OpenMM files

In [1]:
from gufe import ProteinComponent
# Build a ProteinComponent named "thrombin" from the PDB file and display it.
pdb_path= "./thrombin_protein.pdb"

prot = ProteinComponent.from_pdbfile(pdb_path, name="thrombin")
prot



ProteinComponent(name=thrombin)

In [2]:
from gufe.components.sub_files import topology
from openmm.app import Atom, Residue, Chain

In [7]:

dict_prot['molecules']["_residue_index"]

{'15_ACE': 0,
 '16_ILE': 1,
 '17_VAL': 2,
 '18_GLU': 3,
 '19_GLY': 4,
 '20_SER': 5,
 '21_ASP': 6,
 '22_ALA': 7,
 '23_GLU': 8,
 '24_ILE': 9,
 '25_GLY': 10,
 '26_MET': 11,
 '27_SER': 12,
 '28_PRO': 13,
 '29_TRP': 14,
 '30_GLN': 15,
 '31_VAL': 16,
 '32_MET': 17,
 '33_LEU': 18,
 '34_PHE': 19,
 '35_ARG': 20,
 '36_LYS': 21,
 '37_SER': 22,
 '38_PRO': 23,
 '39_GLN': 24,
 '40_GLU': 25,
 '41_LEU': 26,
 '42_LEU': 27,
 '43_CYS': 28,
 '44_GLY': 29,
 '45_ALA': 30,
 '46_SER': 31,
 '47_LEU': 32,
 '48_ILE': 33,
 '49_SER': 34,
 '50_ASP': 35,
 '51_ARG': 36,
 '52_TRP': 37,
 '53_VAL': 38,
 '54_LEU': 39,
 '55_THR': 40,
 '56_ALA': 41,
 '57_ALA': 42,
 '58_HIS': 43,
 '59_CYS': 44,
 '60_LEU': 45,
 '61_LEU': 46,
 '62_TYR': 47,
 '63_PRO': 48,
 '64_PRO': 49,
 '65_TRP': 50,
 '66_ASP': 51,
 '67_LYS': 52,
 '68_ASN': 53,
 '69_PHE': 54,
 '70_THR': 55,
 '71_GLU': 56,
 '72_ASN': 57,
 '73_ASP': 58,
 '74_LEU': 59,
 '75_LEU': 60,
 '76_VAL': 61,
 '77_ARG': 62,
 '78_ILE': 63,
 '79_GLY': 64,
 '80_LYS': 65,
 '81_HIS': 66,
 '82_

In [6]:
dict_prot['molecules']["_residue_name_id"]

{'15': 'LEU',
 '16': 'GLU',
 '17': 'ASP',
 '18': 'LYS',
 '19': 'THR',
 '20': 'GLU',
 '21': 'ARG',
 '22': 'GLU',
 '23': 'LEU',
 '24': 'LEU',
 '25': 'GLU',
 '26': 'SER',
 '27': 'TYR',
 '28': 'ILE',
 '29': 'NME',
 '30': 'GLN',
 '31': 'VAL',
 '32': 'MET',
 '33': 'LEU',
 '34': 'PHE',
 '35': 'ARG',
 '36': 'LYS',
 '37': 'SER',
 '38': 'PRO',
 '39': 'GLN',
 '40': 'GLU',
 '41': 'LEU',
 '42': 'LEU',
 '43': 'CYS',
 '44': 'GLY',
 '45': 'ALA',
 '46': 'SER',
 '47': 'LEU',
 '48': 'ILE',
 '49': 'SER',
 '50': 'ASP',
 '51': 'ARG',
 '52': 'TRP',
 '53': 'VAL',
 '54': 'LEU',
 '55': 'THR',
 '56': 'ALA',
 '57': 'ALA',
 '58': 'HIS',
 '59': 'CYS',
 '60': 'LEU',
 '61': 'LEU',
 '62': 'TYR',
 '63': 'PRO',
 '64': 'PRO',
 '65': 'TRP',
 '66': 'ASP',
 '67': 'LYS',
 '68': 'ASN',
 '69': 'PHE',
 '70': 'THR',
 '71': 'GLU',
 '72': 'ASN',
 '73': 'ASP',
 '74': 'LEU',
 '75': 'LEU',
 '76': 'VAL',
 '77': 'ARG',
 '78': 'ILE',
 '79': 'GLY',
 '80': 'LYS',
 '81': 'HIS',
 '82': 'SER',
 '83': 'ARG',
 '84': 'THR',
 '85': 'ARG',
 '86':

In [54]:
from openmm import Vec3
from openmm import app
from openmm import unit as omm_unit


# Rebuild an OpenMM Topology (`top`) from the dictionary form of the component.
dict_prot = prot.to_dict()
top = app.Topology()

# Chains
# One OpenMM chain per stored chain name; the list position serves as the
# chain index in the residue loop below.
chains = []
for chain_name in dict_prot['molecules']["chain_names"]:
    c = top.addChain(id=chain_name)
    chains.append(c)

# Residues:
# Created in ascending residue-index order. `residues` maps
# "<chain id>_<residue number>" to the created Residue for the atom loop.
residues = {}
for res_lab, resind in sorted(dict_prot['molecules']["_residue_index"].items(), key=lambda x: x[1]):
    # Labels have the form "<residue number>_<residue name>".
    resi, resn = res_lab.split("_")

    icode = dict_prot['molecules']["_residue_icode"][res_lab]
    resi = int(resi)

    # The chain is the first entry of "_chain_residues" that lists this residue
    # index; raises IndexError if no chain contains it.
    chain_id = int([i for i, v in enumerate(dict_prot['molecules']["_chain_residues"]) if(resind in v)][0])
    chain = chains[chain_id]

    # NOTE(review): the residue index (not the PDB residue number) is passed as
    # `id` - confirm this is intended.
    r = top.addResidue(name=resn, id=resind, chain=chain, insertionCode=icode)
    residues[chain.id + "_" + str(resi)] = r


# Atoms
# Added in ascending "id" order. Each entry is a tuple with the atomic number
# at position 0, the atom name at position 1 and a property dict at position 5.
atoms = {}
for atom_entry in sorted(dict_prot['atoms'], key=lambda x: x[5]["id"]):
    atom_props = atom_entry[5]
    key = atom_props["chainName"] + "_" + str(atom_props["resId"])
    r = residues[key]
    aid = atom_props["id"]
    omm_atom = top.addAtom(name=atom_entry[1],
                           residue=r,
                           id=aid,
                           element=app.Element.getByAtomicNumber(atom_entry[0])
                           )
    # NOTE(review): keyed by the OpenMM atom index; the bond loop assumes the
    # bond entries use the same indices - confirm (original comment: "true?").
    atoms[omm_atom.index] = omm_atom

# Bonds
# NOTE(review): bond[2] is passed as both `type` and `order` - confirm that
# both arguments accept the stored value.
for bond in dict_prot['bonds']:
    top.addBond(atom1=atoms[bond[0]],
                atom2=atoms[bond[1]],
                type=bond[2],
                order=bond[2])

(6, 'CH3', 0, False, '', {'molAtomMapNumber': 1, 'name': 'CH3', 'id': 1, '_posIndex': 0, 'resName': 'ACE', 'resId': 15, 'insertionCode': ' ', 'chainName': 'H', 'chainId': 0, 'hetatom': 'False'})
(6, 'C', 0, False, '', {'molAtomMapNumber': 2, 'name': 'C', 'id': 2, '_posIndex': 1, 'resName': 'ACE', 'resId': 15, 'insertionCode': ' ', 'chainName': 'H', 'chainId': 0, 'hetatom': 'False'})
(8, 'O', 0, False, '', {'molAtomMapNumber': 3, 'name': 'O', 'id': 3, '_posIndex': 2, 'resName': 'ACE', 'resId': 15, 'insertionCode': ' ', 'chainName': 'H', 'chainId': 0, 'hetatom': 'False'})
(1, 'H1', 0, False, '', {'molAtomMapNumber': 4, 'name': 'H1', 'id': 4, '_posIndex': 3, 'resName': 'ACE', 'resId': 15, 'insertionCode': ' ', 'chainName': 'H', 'chainId': 0, 'hetatom': 'False'})
(1, 'H2', 0, False, '', {'molAtomMapNumber': 5, 'name': 'H2', 'id': 5, '_posIndex': 4, 'resName': 'ACE', 'resId': 15, 'insertionCode': ' ', 'chainName': 'H', 'chainId': 0, 'hetatom': 'False'})
(1, 'H3', 0, False, '', {'molAtomMapN

In [59]:
from gufe.molhashing import hashmol, deserialize_numpy, serialize_numpy
from openmm import Vec3
from openmm import unit as omm_unit

# Target of the PDB output: a str is treated as a file path to write to; any
# other value (e.g. None) is handed to PDBFile.writeFile unchanged.
out_path = None
# get pos:
# First stored conformer, converted into a list of Vec3 with angstrom units.
np_pos = deserialize_numpy(prot.to_dict()["conformers"][0])
openmm_pos = list(map(lambda x: Vec3(*x), np_pos))*omm_unit.angstrom

#write file
if(isinstance(out_path, str)):
    # Open for writing (not reading), pass the handle itself to the writer and
    # close it reliably afterwards.
    with open(out_path, "w") as out_file:
        PDBFile.writeFile(topology=top, positions=openmm_pos, file=out_file)
else:
    PDBFile.writeFile(topology=top, positions=openmm_pos, file=out_path)

REMARK   1 CREATED WITH OPENMM 2022-09-05
HETATM    1  CH3 ACE A   1       5.063  -8.435  15.788  1.00  0.00           C  
HETATM    2  C   ACE A   1       4.153  -8.168  16.931  1.00  0.00           C  
HETATM    3  O   ACE A   1       2.974  -7.915  16.724  1.00  0.00           O  
HETATM    4  H1  ACE A   1       4.536  -8.994  15.016  1.00  0.00           H  
HETATM    5  H2  ACE A   1       5.414  -7.493  15.386  1.00  0.00           H  
HETATM    6  H3  ACE A   1       5.929  -8.982  16.153  1.00  0.00           H  
ATOM      7  N   ILE A   2       4.745  -8.189  18.124  1.00  0.00           N  
ATOM      8  CA  ILE A   2       4.141  -7.876  19.396  1.00  0.00           C  
ATOM      9  C   ILE A   2       3.045  -8.904  19.712  1.00  0.00           C  
ATOM     10  O   ILE A   2       1.991  -8.904  19.080  1.00  0.00           O  
ATOM     11  CB  ILE A   2       3.549  -6.434  19.432  1.00  0.00           C  
ATOM     12  CG1 ILE A   2       4.558  -5.336  19.004  1.00  0.00 

In [58]:
app.Vec3

AttributeError: module 'openmm.app' has no attribute 'Vec3'

In [None]:
Chain(index, topology=, id=)

In [None]:
Residue(name="", index=, chain=, id= , insertionCode=)

In [None]:
Atom(name=, element=, index= , residue= , id=)

## Build a bond ordered xml

In [None]:
import xml.etree.ElementTree as etree

# Read the original residue templates, annotate every <Bond> with an "order"
# attribute and write the result to a new XML file.
out_path = "../gufe/components/sub_files/data/residues.xml"
in_path = "../gufe/components/sub_files/data/residues_orig.xml"


# Bonds that receive an order other than 1. Keys are the two atom names of a
# bond; "resns" is either the string "all" or a tuple of residue names.
# Single-residue entries need the trailing comma: ("ARG") is just the string
# "ARG", and `resn in "ARG"` is a substring test that is also true for
# residue names such as "A" or "G".
exception_bond_keys = {
            # AminoAcids
            ##Backbone
            ('C', 'O'): { "order": 2, "resns": "all"},

            ## Carbonyls in R
            ("CZ", "NH2"): { "order": 2, "resns": ("ARG",)},
            ("CG", "OD1"):{ "order": 2, "resns":  ("ASP", "ASN")},
            ("CD", "OE1"):{ "order": 2, "resns": ("GLN", "GLU")},
            ("CD", "OE"):{ "order": 2, "resns": ("PCA",)},

            ## Aromatics:
            ("CD2", "CG"):{ "order": 2, "resns": ("HIS",)},
            ("CE1", "ND1"):{ "order": 2, "resns": ("HIS",)},

            ("CG", "CD1"):{ "order": 2, "resns": ("PHE", "TYR", "TRP")},
            ("CE1", "CZ"):{ "order": 2, "resns": ("PHE", "TYR")},
            ("CE2", "CD2"):{ "order": 2, "resns": ("PHE", "TYR")},

            ("CD2", "CE3"):{ "order": 2, "resns": ("TRP",)},
            ("CE2", "CZ2"):{ "order": 2, "resns": ("TRP",)},
            ("CZ3", "CH2"):{ "order": 2, "resns": ("TRP",)},

            # NucleicAcids
            ## Phosphates
            ("OP1", "P"):{ "order": 2, "resns": ("U", "G", "A", "C", "DT", "DG", "DC", "DA")},

            ## Pyrimidines: Uracil, Thymine and Cytosine
            ("C2", "O2"):{ "order": 2, "resns": ("U", "DT", "C", "DC")},
            ("C5", "C6"):{ "order": 2, "resns": ("U", "DT", "C", "DC")},
            ("C4", "O4"):{ "order": 2, "resns": ("U", "DT")},
            ("C4", "N3"):{ "order": 2, "resns": ("C", "DC")},

            ## Purines: Guanine, Adenine
            ("C2", "N3"):{ "order": 2, "resns": ("G", "DG", "A", "DA")},
            ("C4", "C5"):{ "order": 2, "resns": ("G", "DG", "A", "DA")},
            ("N7", "C8"):{ "order": 2, "resns": ("G", "DG", "A", "DA")},
            ("C6", "O6"):{ "order": 2, "resns": ("G", "DG")},
            ("C6", "N1"):{ "order": 2, "resns": ("A", "DA")},
             }
# Sort the atom names of every key so that a lookup does not depend on the
# direction in which a bond is written in the XML file.
exception_bond_keys = {tuple(sorted(key)): value for key, value in exception_bond_keys.items()}


tree = etree.parse(in_path)

for residue in tree.getroot().findall('Residue'):
    resn = residue.get("name")
    for bond in residue.findall("Bond"):
        bond_atoms = tuple(sorted([bond.get("from"), bond.get("to")]))
        rule = exception_bond_keys.get(bond_atoms)
        # A rule applies when it is marked "all" or lists this residue name;
        # every other bond is written with order 1.
        if rule is not None and (rule["resns"] == "all" or resn in rule["resns"]):
            bond.set("order", str(rule["order"]))
        else:
            bond.set("order", str(1))

tree.write(out_path)