In [30]:
import Bio.PDB as PDB 
import numpy as np 
import freesasa
import glob
from Bio.PDB.DSSP import DSSP 

# Calculate parameters

In [2]:
def surface_per_residue(surface, n_residues):
    """Return `surface / n_residues`, or NaN when the structure has no residues.

    A file that contains no amino-acid residues (e.g. ligand- or
    nucleic-acid-only) would otherwise abort the whole loop with a
    ZeroDivisionError; NaN keeps the three feature lists index-aligned.
    """
    if not n_residues:
        return float("nan")
    return surface / n_residues


# Per-structure features: entry i of every list belongs to the same PDB file.
surfaces = []      # total solvent-accessible surface area reported by freesasa
rsas = []          # amino-acid residue count (a sequence length, despite the name)
surface_seq = []   # surface area divided by residue count

# sorted() fixes the row order; plain glob order depends on the filesystem.
for file in sorted(glob.glob("data/training/crystal_structs/*.pdb")):

    # parse the pdb file with Biopython (used for residue counting)
    p = PDB.PDBParser(QUIET=True)
    s = p.get_structure(file, file)

    # get the total surface area with freesasa (it reads the file itself)
    structure = freesasa.Structure(file)
    result = freesasa.calc(structure)
    surface = result.totalArea()
    surfaces.append(surface)

    # get the sequence length: amino-acid residues summed over all chains
    seq = sum(
        1
        for chain in s.get_chains()
        for residue in chain.get_residues()
        if PDB.is_aa(residue)
    )
    rsas.append(seq)

    # calculate surface / sequence length (NaN if there are no residues)
    surface_seq.append(surface_per_residue(surface, seq))

## Extracting secondary structure 

We distinguish between α (helix), β (strand) and coil residues that are
buried in the protein core (relative solvent accessibility ≤ 20%),
moderately buried (between 20% and 50%), and solvent
exposed (≥ 50%).

In [52]:
# Worked example on one structure: derive burial / secondary-structure
# fractions from DSSP output.
EXAMPLE_PDB = "data/training/crystal_structs/A0A140NA.pdb"


def class_fraction(burial_classes, structure_classes, burial, structure):
    """Fraction of residues of one secondary-structure class that fall in one burial class.

    Parameters
    ----------
    burial_classes : sequence of int
        Per-residue burial class (0 buried, 1 moderately buried, 2 exposed).
    structure_classes : sequence of int
        Per-residue three-state class (0 helix, 1 beta, 2 coil), same order.
    burial, structure : int
        The burial class to count and the secondary-structure class to
        normalise by.

    Returns
    -------
    float
        count(burial and structure) / count(structure), or NaN when the
        structure contains no residue of class `structure` (e.g. an all-alpha
        protein has no beta residues to divide by).
    """
    members = [b for b, s in zip(burial_classes, structure_classes) if s == structure]
    if not members:
        return float("nan")
    return members.count(burial) / len(members)


# QUIET=True matches the parser settings used for the surface-area loop.
p = PDB.PDBParser(QUIET=True)
# Use the path as structure id instead of a loop variable left over from another cell.
structure = p.get_structure(EXAMPLE_PDB, EXAMPLE_PDB)

# DSSP is run on the first model only
model = structure[0]
dssp = DSSP(model, EXAMPLE_PDB)

# DSSP data is accessed by a tuple (chain_id, res_id)
all_residues = list(dssp.keys())

dssp_info = [dssp[i] for i in all_residues]

# Field 3 of each DSSP record is used as the relative solvent accessibility.
# NOTE(review): Biopython may report a non-numeric placeholder here for
# non-standard residues, which would break the comparison below - confirm.
asa = [dssp[i][3] for i in all_residues]

# 0 = buried (<= 20%), 1 = moderately buried, 2 = exposed (>= 50%)
burried = [0 if i <= 0.2 else 2 if i >= 0.5 else 1 for i in asa]

# Field 2 of each DSSP record is used as the 8-state secondary structure code.
secondary_q8 = [dssp[i][2] for i in all_residues]

# helix = H, G, I
# beta = B, E
# loop = rest
# 0 is alpha, 1 is beta, 2 is coil
secondary_q3 = [0 if i in ['H', 'G', 'I'] else 1 if i in ['B', 'E'] else 2 for i in secondary_q8]

count_helices = secondary_q3.count(0)
count_sheets = secondary_q3.count(1)

# fraction of moderately buried beta residues
frac_mod_beta = class_fraction(burried, secondary_q3, burial=1, structure=1)
print(frac_mod_beta)

# fraction of moderately buried alpha residues
frac_mod_alfa = class_fraction(burried, secondary_q3, burial=1, structure=0)
print(frac_mod_alfa)

# fraction of exposed alpha residues
frac_exp_alfa = class_fraction(burried, secondary_q3, burial=2, structure=0)
print(frac_exp_alfa)

# TODO: fraction of buried beta residues

# TODO: fraction of each of the 20 amino acid types

# TODO: fraction of K minus fraction of R

# TODO: fraction of negatively charged residues

# TODO: fraction of charged residues

# TODO: fraction of positively minus negatively charged residues


0.37037037037037035
0.2571428571428571
0.4


In [None]:
# Sanity check: run DSSP on every training structure and print the parsed
# structure together with one sample DSSP record.
pdb_pattern = "data/training/crystal_structs/*.pdb"

for file in glob.glob(pdb_pattern):
    pdb_parser = PDB.PDBParser()
    structure = pdb_parser.get_structure(file, file)
    print(structure)

    # DSSP is computed for the first model of the structure
    dssp = DSSP(structure[0], file)

    # DSSP records are keyed by (chain_id, res_id); show the third one
    dssp_keys = list(dssp.keys())
    third_key = dssp_keys[2]
    print(dssp[third_key])

## Saving file 

In [None]:
# Assemble the per-structure features into one table: one row per structure,
# columns in the order surface area, residue count, surface area / residue count.
feature_columns = [np.array(column) for column in (surfaces, rsas, surface_seq)]
arr = np.column_stack(feature_columns)

# Write the table as comma-separated values (no header row).
np.savetxt("parameters.csv", arr, delimiter=",")