# generateDatasetForPocketAnalysis


- Choose 100 HEM binders, 100 ATP binders
- Re-pull all the PDBs and compile into an XYZ file
- Sanity-check the size of each structure in the XYZ file (previously an NMR entry slipped through with about 21,000 atoms)


In [15]:
import quippy
import ase
import os
import subprocess


In [24]:
def _extract_chain_atoms(pdb_lines, chain_ref, ligand_name):
    """Return the ATOM records of one chain, or None if the entry is unusable.

    An entry is rejected (None) when a MODEL record is encountered (treated
    as a multi-model/NMR entry) or when no HETATM record of `ligand_name`
    is found on chain `chain_ref`.

    pdb_lines   -- stripped lines of a PDB file
    chain_ref   -- single-character chain identifier (column 22 of a record)
    ligand_name -- residue name of the required ligand, e.g. "HEM" or "ATP"
    """
    atom_lines = []
    ligand_found = False
    for line in pdb_lines:
        if line.startswith("MODEL"):
            return None
        # Truncated records cannot carry a chain identifier; skipping them
        # avoids an IndexError on line[21].
        if len(line) <= 21 or line[21] != chain_ref:
            continue
        if line.startswith("HETATM"):
            if line[17:20].strip() == ligand_name:
                ligand_found = True
        elif line.startswith("ATOM"):
            atom_lines.append(line)
    if not ligand_found:
        return None
    return atom_lines


def _collect_binders(candidates, ligand_name, output_dir, class_label, max_count):
    """Convert up to `max_count` candidate chains that bind `ligand_name`.

    candidates  -- file names whose first four characters are the PDB id and
                   whose fifth character is the chain id
    ligand_name -- residue name of the ligand that must be present on the chain
    output_dir  -- existing directory receiving the per-chain PDB files
    class_label -- class name used in the output file name and in the labels
    max_count   -- maximum number of structures to collect for this class

    Returns (structures, labels): a list of quippy.Atoms objects and a
    parallel list of [pdb_id + chain_id, class_label] pairs.
    """
    # Import the submodule explicitly: `import ase` alone is not guaranteed
    # to make `ase.io` available as an attribute.
    import ase.io

    structures = []
    labels = []
    for candidate in candidates:
        if len(candidate) < 5:
            # Too short to hold a PDB id plus a chain id.
            continue
        pdb_ref = candidate[:4]
        chain_ref = candidate[4]
        pdb_file = "{}.pdb".format(pdb_ref)

        if not os.path.exists(pdb_file):
            subprocess.call(["wget", "http://www.rcsb.org/pdb/files/{}.pdb".format(pdb_ref), "-q"])

        # Some entries are not available from the PDB; the download then
        # leaves no file behind and the candidate is skipped.
        try:
            with open(pdb_file) as flines:
                pdb_data = [line.strip() for line in flines]
        except IOError:
            continue

        raw_data = _extract_chain_atoms(pdb_data, chain_ref, ligand_name)
        if raw_data is None:
            continue
        print(pdb_ref, chain_ref, len(raw_data))

        newLabel = os.path.join(output_dir, "{}{}_{}.pdb".format(pdb_ref, chain_ref, class_label))
        with open(newLabel, "w") as flines:
            flines.write("\n".join(raw_data))

        # Read back through ase and wrap in quippy.Atoms (done like this to catch warning).
        structures.append(quippy.Atoms(ase.io.read(newLabel, format='proteindatabank')))
        labels.append([pdb_ref + chain_ref, class_label])

        if len(labels) == max_count:
            break
    return structures, labels


def _tag_xyz_frames(xyz_path, labels):
    """Append a tag="<id>_<class>" entry to each frame header in `xyz_path`.

    Frame headers are recognised as lines starting with "cutoff"
    (assumes that is how quippy writes each frame's comment line -- TODO confirm).
    The file is rewritten in place.
    """
    with open(xyz_path) as flines:
        data = [line.strip() for line in flines]

    newdata = []
    counter = 0
    for line in data:
        if line.startswith("cutoff"):
            assert counter < len(labels), "more frames than labels in {}".format(xyz_path)
            label = labels[counter][0] + "_" + labels[counter][1]
            line += " tag=\"{}\" ".format(label)
            counter += 1
        newdata.append(line)

    assert len(newdata) == len(data)
    assert len(labels) == counter

    with open(xyz_path, "w") as flines:
        flines.write("\n".join(newdata))


def generateXYZfile():
    """Build a tagged XYZ dataset of heme-binding and ATP-binding chains.

    Candidate chains are listed from "inputs/protein-heme" and
    "inputs/protein-nucleotide". For each class up to 100 chains are kept
    whose PDB entry has no MODEL record and carries the HEM (respectively
    ATP) ligand on the candidate chain. Missing PDB files are downloaded
    with wget into the working directory.

    Outputs written to the working directory:
      hemes_converted/, nucleos_converted/ -- per-chain PDB files (ATOM records only)
      regenerated_labels.txt               -- one "<id><chain> <class>" line per structure
      reparsed_proteinstructures.xyz       -- all structures, each frame header tagged

    The limit of 100 is applied per class, so a shortfall of heme binders
    does not cause extra nucleotide binders to be collected.
    """
    maxPerClass = 100
    xyzFileName = "reparsed_proteinstructures.xyz"

    # Candidate proteins are stored here
    hemeInputDir = "inputs/protein-heme"
    hemeProteins = os.listdir(hemeInputDir)
    print(len(hemeProteins), "proteins in heme directory")

    nucleoInputDir = "inputs/protein-nucleotide"
    nucleoProteins = os.listdir(nucleoInputDir)
    print(len(nucleoProteins), "proteins in nucleotide directory")

    # Choose 100 with the HEM and ATP ligand respectively. Ignore all NMR data, it's bad.
    for outputDir in ("hemes_converted", "nucleos_converted"):
        if not os.path.exists(outputDir):
            os.mkdir(outputDir)

    hemeStructures, hemeLabels = _collect_binders(
        hemeProteins, "HEM", "hemes_converted", "heme", maxPerClass)
    nucleoStructures, nucleoLabels = _collect_binders(
        nucleoProteins, "ATP", "nucleos_converted", "nucleo", maxPerClass)

    # Heme structures first, then nucleotide ones; labels stay in the same order.
    proteinHolder = hemeStructures + nucleoStructures
    labels = hemeLabels + nucleoLabels

    with open("regenerated_labels.txt", "w") as flines:
        flines.write("\n".join(" ".join(x) for x in labels))

    testFamily = quippy.AtomsList(proteinHolder)
    print(len(testFamily), "proteins converted to xyz")
    print(len(labels), "labels listed")
    testFamily.write(xyzFileName)

    _tag_xyz_frames(xyzFileName, labels)

In [25]:
# Run the full pipeline; it prints the candidate counts and one
# (pdb id, chain, number of ATOM records) line per accepted chain.
generateXYZfile()


(596, 'proteins in heme directory')
(1553, 'proteins in nucleotide directory')
('1u17', 'A', 1447)
('4uyl', 'B', 3752)
('1tgu', 'C', 4017)
('3t3z', 'B', 3793)
('2civ', 'A', 2345)
('1oag', 'A', 1905)
('1bz1', 'A', 1077)
('3s1i', 'A', 1050)
('1f4u', 'A', 3000)
('1ew6', 'A', 1096)
('3ia8', 'A', 1267)
('4n4o', 'A', 4018)
('2qpp', 'B', 1794)
('3mgx', 'A', 2966)
('4jou', 'A', 1259)
('1etp', 'A', 1378)
('4gl5', 'A', 3658)
('1dxt', 'B', 1131)
('5fuj', 'A', 1820)
('2hpd', 'B', 3678)
('2zbo', 'A', 639)
('1d06', 'A', 1019)
('3wfx', 'A', 1176)
('4uzv', 'A', 1070)
('3bcq', 'A', 1124)
('4gu7', 'A', 2363)
('4i91', 'A', 3757)
('3qjt', 'A', 4409)
('2czs', 'A', 564)
('1cxy', 'A', 639)
('1ebt', 'A', 1036)
('4u8u', 'C', 1180)
('1v4w', 'B', 1143)
('1gdi', 'A', 1177)
('3qns', 'A', 2538)
('1mjt', 'A', 2857)
('4qoq', 'A', 3960)
('4yt3', 'A', 3160)
('2bkm', 'A', 1054)
('5lth', 'A', 2756)
('1a4e', 'D', 3932)
('1sch', 'B', 2180)
('3riv', 'A', 2127)
('1hbh', 'B', 1138)
('1out', 'B', 1136)
('1hbr', 'A', 1054)
('1d

In [11]:
# NOTE(review): checkMismatch is not defined in any cell shown here (the
# recorded output below is a NameError); run its defining cell first.
# The path also differs from the file generateXYZfile writes
# ("reparsed_proteinstructures.xyz") -- confirm which file is intended.
checkMismatch(path_to_xyz="test_proteinstructures_tagged.xyz")

NameError: name 'checkMismatch' is not defined