# generatePocketPDBs

We want to attempt to run our SOAP pipeline just on the pockets, to eliminate the influence of the surface. 

For each protein in the given xyz file, download its structure, run fpocket on the relevant chain, collect the
residues touching the detected pockets, then write them all back out to a new xyz file.

In [43]:
import glob
import os
import subprocess

import ase
import ase.io
import quippy

In [2]:
# xyz file listing the proteins to process; the cells below read a "tag" parameter from every entry.
input_xyz_file = "intermediates/samples_filtered.xyz"

In [7]:
# Collect a short label for every structure in the input file: the first five
# characters of its tag (used below as a 4-character PDB id plus a chain letter).
input_file = quippy.AtomsReader(input_xyz_file, format="xyz")
labels = [entry.params["tag"][:5] for entry in input_file]


In [10]:
# Download each structure and keep only the ATOM records of the requested chain.
# The chain-filtered files are written to temp/<label>.pdb.
if not os.path.isdir("temp"):
    # A plain os.mkdir raises when the directory is left over from an earlier run.
    os.mkdir("temp")

for label in labels:
    pdb_ref = label[:4]
    chain_ref = label[4]
    pdb_filename = "{}.pdb".format(pdb_ref)

    # Several chains can belong to the same PDB entry; fetch each entry only once.
    if not os.path.exists(pdb_filename):
        subprocess.call(["wget", "http://www.rcsb.org/pdb/files/{}.pdb".format(pdb_ref)])

    try:
        with open(pdb_filename) as flines:
            pdb_data = flines.readlines()
    except IOError:
        # Skip this label. Carrying on would either fail (first label) or silently
        # write the previous structure's atoms under this label.
        print("IO Error: could not read {}, skipping {}".format(pdb_filename, label))
        continue

    # The chain identifier sits in column 22 (index 21) of an ATOM record.
    single_chain_data = [
        line.strip()
        for line in pdb_data
        if line.startswith("ATOM") and len(line) > 21 and line[21] == chain_ref
    ]
    if not single_chain_data:
        # An empty file would only make the pocket detection step fail later on.
        print("No ATOM records for chain {} in {}, skipping {}".format(chain_ref, pdb_filename, label))
        continue

    with open("temp/{}.pdb".format(label), "w") as flines:
        flines.write("\n".join(single_chain_data))
    

In [25]:
# For each chain-filtered pdb file in the temporary directory, run fpocket (stored in ~/.local/bin here)
# and concatenate the per-pocket atom files it produces into temp/<label>_pockets.pdb.
for path in os.listdir("temp"):
    # Only the chain files are inputs. Skip fpocket's *_out directories and the
    # *_pockets.pdb / *_fullpocket.pdb files that an earlier run of this notebook leaves behind.
    if not path.endswith(".pdb") or path.endswith(("_pockets.pdb", "_fullpocket.pdb")):
        continue
    if not os.path.isfile(os.path.join("temp", path)):
        continue
    print(path)
    stem = path[:-len(".pdb")]

    # Run inside temp/ via cwd with an argument list, so no shell string has to be assembled.
    if subprocess.call(["fpocket", "-f", path], cwd="temp") != 0:
        print("fpocket failed for {}".format(path))
        continue

    # Sorted to mirror the order in which a shell glob would hand the files to `cat`.
    atom_files = sorted(glob.glob(os.path.join("temp", stem + "_out", "pockets", "*_atm.pdb")))
    if not atom_files:
        print("No pocket atom files found for {}".format(path))
        continue

    with open(os.path.join("temp", stem + "_pockets.pdb"), "w") as merged:
        for atom_file in atom_files:
            with open(atom_file) as flines:
                merged.write(flines.read())

5i9eA.pdb
1oagA.pdb
1ew6A.pdb
4n4oA.pdb
4jouA.pdb
1etpA.pdb
4gu7A.pdb
4i91A.pdb
3qjtA.pdb
2czsA.pdb
1cxyA.pdb
1ebtA.pdb
1gdiA.pdb
5burA.pdb
5lthA.pdb
5d9hB.pdb
1schB.pdb
1hbhB.pdb
3meyB.pdb
2biyA.pdb
5f1cB.pdb
2j0pA.pdb
4xhoA.pdb
1yhuC.pdb
1y56A.pdb
5v2lA.pdb
4zqxA.pdb
2j3mB.pdb
1x9fB.pdb
4dxlA.pdb
5cx7E.pdb
2d2mA.pdb
3ie7A.pdb
1fj0A.pdb
1m7sA.pdb
1ashA.pdb
3tf8B.pdb
3mk7A.pdb
1ytmA.pdb
4f6tA.pdb
5uj7A.pdb
3innD.pdb
3sl2A.pdb
2xszC.pdb
3vliA.pdb
3epsA.pdb
1x9fA.pdb
1bvbA.pdb
1j77A.pdb
1cxaA.pdb
4x2dA.pdb
4uiqA.pdb
3b42A.pdb
4itmA.pdb
4v02A.pdb
1f1cA.pdb
1gcvA.pdb
1hbgA.pdb
3qx9C.pdb
4gxqB.pdb
4mb2B.pdb
4h2uB.pdb
1tc0B.pdb
2v7iA.pdb
3umoA.pdb
1xexA.pdb
1ko5A.pdb
3gw9A.pdb
5vqaA.pdb
5i0nA.pdb
1ji0A.pdb
2r79A.pdb
4mwhA.pdb
4nh0B.pdb
2rsfA.pdb
1gksA.pdb
5m7nA.pdb
4oqrA.pdb
5mcpH.pdb
4n4jA.pdb
4ejgE.pdb
3zhwA.pdb
3hyuA.pdb
4b8nB.pdb
1it2A.pdb
2vzwB.pdb
1esqC.pdb
1ccrA.pdb
1prhA.pdb
5tgcD.pdb
5mb9B.pdb
1yfrA.pdb
4kr7A.pdb
4nk2A.pdb
1mjhB.pdb
1zaoA.pdb
2kmxA.pdb
2zooA.pdb
3amtA.pdb
1ls9A.pdb


In [31]:
def getIndicesOfPocketResidues(pathname):
    """Return the set of residue numbers found on the ATOM lines of a PDB file.

    The number is parsed from ``line[22:26]`` of every line that begins with
    ``ATOM``; all other lines are ignored.
    """
    residueIds = set()
    with open(pathname) as flines:
        for record in flines:
            if record.startswith("ATOM"):
                residueIds.add(int(record[22:26].strip()))
    return residueIds

In [34]:
def getPocketAtoms(pathname):
    """Write all atoms of the pocket-lining residues to ``<label>_fullpocket.pdb``.

    ``pathname`` must be a ``<label>_pockets.pdb`` file whose source structure
    ``<label>.pdb`` lives in the same directory. Every line of the source file
    whose residue number (``line[22:26]``) occurs in the pockets file is copied
    to ``<label>_fullpocket.pdb`` next to it. The source and output paths are
    printed as progress information.

    Raises:
        ValueError: if ``pathname`` does not end in ``_pockets.pdb``.
    """
    suffix = "_pockets.pdb"
    if not pathname.endswith(suffix):
        # Slicing a fixed number of characters off an unexpected name would silently
        # point at the wrong source file.
        raise ValueError("Expected a path ending in {!r}, got {!r}".format(suffix, pathname))

    residueIds = getIndicesOfPocketResidues(pathname)
    stem = pathname[:-len(suffix)]
    sourcePdb = stem + ".pdb"
    print(sourcePdb)

    data = []
    with open(sourcePdb) as flines:
        for line in flines:
            try:
                residueId = int(line[22:26])
            except ValueError:
                # Blank or non-coordinate lines carry no residue number; skip them instead of crashing.
                continue
            if residueId in residueIds:
                data.append(line.strip())

    outPdb = stem + "_fullpocket.pdb"
    print(outPdb)
    with open(outPdb, "w") as outflines:
        outflines.write("\n".join(data))

In [35]:
# Build a *_fullpocket.pdb file for every *_pockets.pdb file found in temp/.
for pocket_path in glob.glob("temp/*_pockets.pdb"):
    getPocketAtoms(pocket_path)

temp/3w1gA.pdb
temp/3w1gA_fullpocket.pdb
temp/1gcvA.pdb
temp/1gcvA_fullpocket.pdb
temp/1ccrA.pdb
temp/1ccrA_fullpocket.pdb
temp/3gw9A.pdb
temp/3gw9A_fullpocket.pdb
temp/5f1cB.pdb
temp/5f1cB_fullpocket.pdb
temp/3bxuB.pdb
temp/3bxuB_fullpocket.pdb
temp/3umoA.pdb
temp/3umoA_fullpocket.pdb
temp/1cpqA.pdb
temp/1cpqA_fullpocket.pdb
temp/2olpA.pdb
temp/2olpA_fullpocket.pdb
temp/3innD.pdb
temp/3innD_fullpocket.pdb
temp/2j3mB.pdb
temp/2j3mB_fullpocket.pdb
temp/1ntfA.pdb
temp/1ntfA_fullpocket.pdb
temp/3sl2A.pdb
temp/3sl2A_fullpocket.pdb
temp/1prhA.pdb
temp/1prhA_fullpocket.pdb
temp/5mcpH.pdb
temp/5mcpH_fullpocket.pdb
temp/4gu7A.pdb
temp/4gu7A_fullpocket.pdb
temp/2jjxB.pdb
temp/2jjxB_fullpocket.pdb
temp/5m7nA.pdb
temp/5m7nA_fullpocket.pdb
temp/2d2mA.pdb
temp/2d2mA_fullpocket.pdb
temp/1xq5A.pdb
temp/1xq5A_fullpocket.pdb
temp/1nyrA.pdb
temp/1nyrA_fullpocket.pdb
temp/3tf8B.pdb
temp/3tf8B_fullpocket.pdb
temp/4uiqA.pdb
temp/4uiqA_fullpocket.pdb
temp/1fj0A.pdb
temp/1fj0A_fullpocket.pdb
temp/1ecaA.pdb
t

In [36]:
# Spot-check one of the generated full-pocket files.
!cat temp/2biyA_fullpocket.pdb

ATOM     25  N   LYS A  76      33.606  57.689  35.415  1.00 42.38           N
ATOM     26  CA  LYS A  76      32.547  58.689  35.429  1.00 40.48           C
ATOM     27  C   LYS A  76      32.844  59.716  36.511  1.00 37.69           C
ATOM     28  O   LYS A  76      33.215  59.360  37.630  1.00 36.99           O
ATOM     29  CB  LYS A  76      31.180  58.044  35.649  1.00 40.61           C
ATOM     30  CG  LYS A  76      30.054  58.727  34.877  1.00 43.50           C
ATOM     31  CD  LYS A  76      30.020  58.357  33.382  1.00 43.96           C
ATOM     32  CE  LYS A  76      28.711  58.804  32.728  1.00 44.97           C
ATOM     33  NZ  LYS A  76      28.840  59.255  31.312  1.00 44.29           N
ATOM     34  N   LYS A  77      32.735  60.990  36.161  1.00 34.18           N
ATOM     35  CA  LYS A  77      32.946  62.052  37.128  1.00 32.65           C
ATOM     36  C   LYS A  77      31.739  62.152  38.077  1.00 31.20           C
ATOM     37  O   LYS A  77      30.689  

We now have the full pocket structures for every protein. Re-attach the original tags and write everything into a single .xyz file.

In [38]:
# Re-read the input file to collect the complete tag of every structure
# (not just a 5-character prefix), so pocket files can be matched back to them.
input_file = quippy.AtomsReader(input_xyz_file, format="xyz")
full_labels = [protein.params["tag"] for protein in input_file]


In [45]:
# Load every full-pocket structure and pair it with the complete tag of the protein it came from.
# protein_list[i] and labels[i] always describe the same structure.
protein_list = []
labels = []
# Sorted so the order of the output file is reproducible between runs.
for path in sorted(glob.glob("temp/*_fullpocket.pdb")):
    # Take the 5-character label from the file name itself rather than a fixed offset into the path.
    reference = os.path.basename(path)[:5]
    matches = [full_label for full_label in full_labels if full_label.startswith(reference)]
    if not matches:
        raise IOError("No tag starting with {!r} found for {}".format(reference, path))

    # Only load the structure once its tag is known, so the two lists cannot drift apart.
    pocket = quippy.Atoms(ase.io.read(path, format='proteindatabank')) # done like this to catch warning
    protein_list.append(pocket)
    # If several tags share the prefix, the last one wins.
    labels.append(matches[-1])

In [46]:
# Write all pocket structures to one xyz file.
testFamily = quippy.AtomsList(protein_list)
# The tagging step below pairs the i-th label with the i-th structure, so the counts must agree.
assert len(testFamily) == len(labels), "number of structures and number of labels differ"
print("{} proteins converted to xyz".format(len(testFamily)))
print("{} labels listed".format(len(labels)))
testFamily.write("intermediates/samples_filtered_pockets.xyz")

(165, 'proteins converted to xyz')
(165, 'labels listed')


In [47]:
# Append tag="<label>" to the header line of every structure in the pockets xyz file
# and save the result as a separate, tagged file.
xyzFileName = "intermediates/samples_filtered_pockets.xyz"
with open(xyzFileName) as flines:
    data = [raw.strip() for raw in flines]

newdata = []
counter = 0
for line in data:
    # Header lines are recognised by their leading "cutoff" key.
    if line.startswith("cutoff"):
        label = labels[counter]
        line = line + " tag=\"{}\" ".format(label)
        counter += 1
    newdata.append(line)

# Every line must be carried over, and the number of tagged headers must equal the number of labels.
assert len(newdata) == len(data)
assert len(labels) == counter

with open("intermediates/samples_filtered_pockets_tagged.xyz", "w") as flines:
    flines.write("\n".join(newdata))
