In [49]:
import os
import pathlib
import glob
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [7]:
# attributes.
REQUIRED_ATTRIBUTES = [
    "record",
    "name",
    "b",
    "q",
    "coor",
    "resn",
    "resi",
    "icode",
    "e",
    "charge",
    "chain",
    "altloc",
]

In [10]:
pwd

'/Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/notebook'

In [11]:
with open('/Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/pdb_redo_data/1a0j_final.pdb', 'r') as f:
    lines = f.readlines()

In [12]:
len(lines)

3337

In [13]:
lines[:10]

['HEADER                                                        1A0J\n',
 'COMPND    MOL_ID: 1;\n',
 'COMPND   2 MOLECULE: ;\n',
 'COMPND   3 CHAIN: A, B, C, D\n',
 'SOURCE    MOL_ID: 1\n',
 'EXPDTA    X-RAY DIFFRACTION\n',
 'REMARK   2\n',
 'REMARK   2 RESOLUTION.    1.70 ANGSTROMS.\n',
 'REMARK   3 \n',
 'REMARK   3 REFINEMENT.\n']

In [14]:
headings=[line.split()[0] for line in lines]

In [17]:
# Unique headings in the PDB file.
np.unique(headings)

array(['ATOM', 'COMPND', 'CRYST1', 'EXPDTA', 'FORMUL', 'HEADER', 'HET',
       'HETNAM', 'LINK', 'REMARK', 'SEQRES', 'SOURCE', 'SSBOND', 'TER'],
      dtype='<U6')

In [19]:
# "SEQRES": sequence of residues in each chain.
# "ATOM": atom information.
# "TER": end of chain.
# "HETATM"/"HET": heterogen atom information.
# "CONECT": connectivity information.
# "END": end of file.
# "REMARK": remark.
# "MODEL": model information. 
# "SSBOND": disulfide bond information. 
# "CRYST1": unit cell information.  

In [27]:
# Collect all residues from the line "SEQRES"
# Example: "SEQRES   1 D  223  ILE VAL GLY GLY TYR GLU CYS ARG LYS ASN SER ALA SER"
residues_dict={"chain":[], "total_res": [],"residues":[]} 
# "chain": chain ID. "total_res": total number of residues in the chain. "residues": list of residues in the chain.
# residues = []
for line in lines:
    if line.startswith("SEQRES"):
        residues_dict["chain"].append(line.split()[2])
        residues_dict["total_res"].append(line.split()[3])
        residues_dict["residues"].append(line.split()[4:])



In [28]:
len(residues_dict["residues"])

72

In [32]:
# residues_dict["residues"]

In [31]:
# residues_dict['chain']
residues_dict['total_res']

['223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223',
 '223']

In [39]:
# Rearrange the dictionary.
chains = list(set(residues_dict["chain"]))
chains 
clean_residues_dict = {"chain":[], "total_res": [], "residues":[]}
for c in chains: # B, D, A, C.
    res_per_chain = []
    for i, chain in enumerate(residues_dict["chain"]):
        # Process on chain at a time.
        if chain == c:
            res_per_chain.append(residues_dict["residues"][i])
            total = residues_dict["total_res"][i]
    clean_residues_dict["residues"].append(res_per_chain)
    clean_residues_dict["total_res"].append(total) # Only add once the total number of residues.
clean_residues_dict["chain"] = chains


In [40]:
print(clean_residues_dict["total_res"])
#  clean_residues_dict["residues"]
print(clean_residues_dict["chain"])

['223', '223', '223', '223']
['B', 'D', 'A', 'C']


In [41]:
pd.DataFrame(clean_residues_dict)

Unnamed: 0,chain,total_res,residues
0,B,223,"[[ILE, VAL, GLY, GLY, TYR, GLU, CYS, ARG, LYS,..."
1,D,223,"[[ILE, VAL, GLY, GLY, TYR, GLU, CYS, ARG, LYS,..."
2,A,223,"[[ILE, VAL, GLY, GLY, TYR, GLU, CYS, ARG, LYS,..."
3,C,223,"[[ILE, VAL, GLY, GLY, TYR, GLU, CYS, ARG, LYS,..."


In [46]:
def get_residues_dict(file):
    # Generate a dictionary of residues from a PDB file. Input: PDB file. Output: dictionary of residues.
    with open(file, 'r') as f:
        lines = f.readlines()

    # Collect all residues from the line "SEQRES"
    # Example: "SEQRES   1 D  223  ILE VAL GLY GLY TYR GLU CYS ARG LYS ASN SER ALA SER"
    residues_dict={"chain":[], "total_res": [],"residues":[]} 
    # "chain": chain ID. "total_res": total number of residues in the chain. "residues": list of residues in the chain.
    # residues = []
    for line in lines:
        if line.startswith("SEQRES"):
            residues_dict["chain"].append(line.split()[2])
            residues_dict["total_res"].append(int(line.split()[3]))
            residues_dict["residues"].append(line.split()[4:])   

    clean_residues_dict = {"chain":[], "total_res": [], "residues":[]}
    for c in chains: # B, D, A, C.
        res_per_chain = []
        for i, chain in enumerate(residues_dict["chain"]):
            # Process on chain at a time.
            if chain == c:
                res_per_chain.append(residues_dict["residues"][i])
                total = residues_dict["total_res"][i]
        clean_residues_dict["residues"].append(res_per_chain)
        clean_residues_dict["total_res"].append(total) # Only add once the total number of residues.
    clean_residues_dict["chain"] = chains 

    return clean_residues_dict

In [56]:
def get_residues_dict_test(file):
    # Generate a dictionary of residues from a PDB file. Input: PDB file. Output: dictionary of residues.
    with open(file, 'r') as f:
        lines = f.readlines()

    # Collect all residues from the line "SEQRES"
    # Example: "SEQRES   1 D  223  ILE VAL GLY GLY TYR GLU CYS ARG LYS ASN SER ALA SER"
    residues_dict={"chain":[], "total_res": [],"residues":[]} 
    # "chain": chain ID. "total_res": total number of residues in the chain. "residues": list of residues in the chain.
    # residues = []
    for line in lines:
        if line.startswith("SEQRES"):
            residues_dict["chain"].append(line.split()[2])
            residues_dict["total_res"].append(int(line.split()[3]))
            residues_dict["residues"].append(line.split()[4:])   

    clean_residues_dict = {"chain":[], "total_res": [], "residues":[]}
    for c in chains: # B, D, A, C.
        res_per_chain = []
        total = None  # Initialize total before the loop
        for i, chain in enumerate(residues_dict["chain"]):
            # Process on chain at a time.
            if chain == c:
                res_per_chain.append(residues_dict["residues"][i])
                total = residues_dict["total_res"][i]
        clean_residues_dict["residues"].append(res_per_chain)
        if total is not None:  # Only append if total was assigned
            clean_residues_dict["total_res"].append(total)
        else:
            # Handle the case where no matching chain was found
            clean_residues_dict["total_res"].append(0)  # or some other default value
        clean_residues_dict["chain"] = chains 

    return clean_residues_dict

In [47]:
# The total number of residues in each protein is the sum of residue # in each chain.
file = '/Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/pdb_redo_data/1a0j_final.pdb'
res_dict = get_residues_dict(file)
np.sum(res_dict["total_res"])

np.int64(892)

In [50]:
# Loop through all proteins.
pdb_path = pathlib.Path('/Users/mingbin/Desktop/Wankowicz_lab_Mac/Projects/water/data/pdb_redo_data')
pdbs = list(pdb_path.glob('*.pdb'))

In [51]:
len(pdbs)

40863

In [53]:
pdbs[0].stem

'7lmj_final'

In [58]:
test_dict=get_residues_dict(pdbs[0])

UnboundLocalError: local variable 'total' referenced before assignment

In [57]:
num_res_pdbs={"name":[], "num_res":[]}
for pdb in pdbs:
    name = pdb.stem.split('_')[0]
    # res_dict = get_residues_dict(pdb)
    res_dict = get_residues_dict_test(pdb)
    num_res = np.sum(res_dict["total_res"])
    num_res_pdbs["name"].append(name)
    num_res_pdbs["num_res"].append(num_res)
pd.DataFrame(num_res_pdbs)



Unnamed: 0,name,num_res
0,7lmj,0
1,4pqw,91
2,4zax,0
3,6zw2,0
4,8otp,0
...,...,...
40858,6gd4,0
40859,3av9,300
40860,2z6t,151
40861,1yux,402
