In [1]:
import requests

In [2]:
def get_data_from_RCSB(pdb_id, timeout=30):
    """Download a PDB entry from RCSB and return its text as a list of lines.

    Args:
        pdb_id: PDB identifier; it is upper-cased before being put in the URL.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without a
            timeout a stalled connection would block the kernel indefinitely.

    Returns:
        A list containing every line of the response body with leading and
        trailing whitespace stripped, or an empty list when the HTTP status
        is not OK.
    """
    rcsb_url = f'https://files.rcsb.org/view/{pdb_id.upper()}.pdb'

    response = requests.get(rcsb_url, timeout=timeout)

    if not response.ok:
        return []
    return [dataline.strip() for dataline in response.text.split('\n')]

In [3]:
# Fetch entry 1LKX once (network call); later cells parse this list of lines.
pdb_data = get_data_from_RCSB('1LKX')

In [4]:
# Three-letter residue name -> one-letter code for the 20 standard amino acids.
_THREE_TO_ONE = {
    'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',
    'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N',
    'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W',
    'ALA': 'A', 'VAL': 'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M',
}


def change_aa_to_symbols(seq):
    """Translate whitespace-separated three-letter residue names to one-letter codes.

    Args:
        seq: String of residue names separated by whitespace, e.g. ``'GLY VAL PRO'``.

    Returns:
        A string with one character per residue name. Names that are not one
        of the 20 standard amino acids (modified residues, nucleotides, ...)
        are rendered as ``'X'`` instead of raising ``KeyError``, so one
        unusual residue does not abort parsing of the whole file.
    """
    return ''.join(_THREE_TO_ONE.get(aa, 'X') for aa in seq.split())

def get_helix(data):
    """Collect helix residue ranges per chain from HELIX records.

    Args:
        data: Iterable of PDB text lines.

    Returns:
        Dict mapping the chain character (index 19 of the line) to a list of
        ``(first, last)`` integer pairs read from line[21:25] and line[33:37],
        in the order the HELIX lines appear.
    """
    ranges_by_chain = {}
    helix_records = (record for record in data if record.startswith('HELIX'))
    for record in helix_records:
        chain_id = record[19]
        first_res = int(record[21:25].strip())
        last_res = int(record[33:37].strip())
        ranges_by_chain.setdefault(chain_id, []).append((first_res, last_res))
    return ranges_by_chain

def get_sheet(data):
    """Collect strand residue ranges per chain from SHEET records.

    Args:
        data: Iterable of PDB text lines.

    Returns:
        Dict mapping the chain character (index 21 of the line) to a list of
        ``(first, last)`` integer pairs read from line[22:26] and line[33:37],
        in the order the SHEET lines appear.
    """
    strands = {}
    for entry in data:
        if not entry.startswith('SHEET'):
            continue
        chain_id = entry[21]
        first_res = int(entry[22:26].strip())
        last_res = int(entry[33:37].strip())
        strands.setdefault(chain_id, []).append((first_res, last_res))
    return strands

def get_seqres(data):
    """Build the one-letter sequence of each chain from SEQRES records.

    Args:
        data: Iterable of PDB text lines. The lines passed in are the ones
            parsed; the function no longer reads the notebook-global
            ``pdb_data``.

    Returns:
        Dict mapping the chain character (index 11 of the line) to the
        concatenation, in file order, of the residue names found from index 19
        onward, converted by ``change_aa_to_symbols``.
    """
    seq = {}

    for line in data:
        if not line.startswith('SEQRES'):
            continue
        chain = line[11]
        seq[chain] = seq.get(chain, '') + change_aa_to_symbols(line[19:])
    return seq

def get_atom(data):
    """Build the one-letter sequence of each chain from its CA ATOM records.

    Args:
        data: Iterable of PDB text lines. The lines passed in are the ones
            parsed; the function no longer reads the notebook-global
            ``pdb_data``.

    Returns:
        Dict mapping the chain character (index 21 of the line) to a string
        with one letter per matching record, in file order. A record matches
        when the line starts with ``'ATOM'`` and line[12:16], stripped, equals
        ``'CA'``; the residue name is taken from line[17:20].
    """
    atom = {}

    for line in data:
        # Matching records are not de-duplicated: each one adds a letter.
        if line.startswith('ATOM') and line[12:16].strip() == 'CA':
            chain = line[21]
            atom[chain] = atom.get(chain, '') + change_aa_to_symbols(line[17:20])
    return atom

In [5]:
def get_secondary_seq(helix, sheet, atom):
    """Render a per-chain secondary-structure string of 'H', 'E' and 'C'.

    Args:
        helix: Dict chain -> list of ``(first, last)`` inclusive ranges marked 'H'.
        sheet: Dict chain -> list of ``(first, last)`` inclusive ranges marked 'E'.
        atom: Dict chain -> sequence string; its length fixes the output length.

    Returns:
        Dict with one entry per chain of ``atom``. Every position starts as
        'C'; helix ranges are applied first and sheet ranges second, so 'E'
        wins where the two overlap. Chains absent from ``helix`` or ``sheet``
        are treated as having no ranges (previously a ``KeyError``).
    """
    secondary = {}
    for chain, seq in atom.items():
        states = ['C'] * len(seq)
        labelled_ranges = (
            ('H', helix.get(chain, ())),
            ('E', sheet.get(chain, ())),
        )
        for label, ranges in labelled_ranges:
            for span in ranges:
                # NOTE(review): range values are used directly as 0-based
                # positions in the sequence string; confirm this matches the
                # residue numbering of the entry being analysed.
                # Clamp to the valid index range: positions past the end are
                # ignored, and negative values no longer wrap around to the
                # end of the list through Python's negative indexing.
                first = max(span[0], 0)
                last = min(span[1], len(states) - 1)
                for place in range(first, last + 1):
                    states[place] = label
        secondary[chain] = ''.join(states)
    return secondary

In [6]:
# Per-chain H/E/C string for the downloaded entry, built from its HELIX, SHEET and CA ATOM records.
get_secondary_seq(get_helix(pdb_data), get_sheet(pdb_data), get_atom(pdb_data))

{'A': 'CCCCCCCCCCCCHHHHHCCCHHHHHHHHHHHHHHHCCCEEECCCCEEEECCCCCCCCCHHHHHHHHHCCHHHHHCCHHHHHHHHHHHHHHHHHCCEEEEEECCCCCHHHHHHHHHHHHHHHHHCCCHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCCCCCEEEEEEEECCCCCEEEEEEEEECCHHHHHHCCCCCCCCHHHHHHHHHCCHHHHHHHHHCCCHHHHHHHHHHCCCCCCCCCHHHHHHHHHHHHHHHHCCHHHHHHHHHHHHHHHHHHHCCCEEEECCCCCCCCEEEECHHHHHHHHHHHHHCHHHHHHHHHHHCCCCCCCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCEEEEEECCCCCCCCCCHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCCCCCHHHHHHHCCCCCHHHHHHHHHHHHCCCHHHHHHHHHHHHHCCCCCEECCCCCCCCCCCCCEEEEEECCEEEEEECCHHHHHHHHCCHHHHHHHHHHCCHHHHHHHHCCCCCCCCCCCCCHHHHHHHHHHHHHHHHHHCCEEEEEEEECCCCCCCCCCCHHHHHHHHHHHHCHHHHHHHHHHHCCCCCCHHHHHHCCCCCCCCCCCCCC',
 'B': 'CCCCCCCCCCCCHHHHHCCCHHHHHHHHHHHHHHHCCCEEECCCCEEEECCCCCCCCCHHHHHHHHHCCHHHHHCCHHHHHHHHHHHHHHHHHCCEEEEEECCCCCHHHHHHHHHHHHHHHHHCCCHHHHHHHHHHHHHHHHHHHHHHHCCCCCCCCCCCCEEEEEEEECCCCCEEEEEEEEECCHHHHHHCCCCCCCCHHHHHHHHHCCHHHHHHHHHCCCHHHHHHHHHHCCCCCCCCCHHHHHHHHHHHHHHHHCCHHHHHHHHHHHHHHHHHHHCCCEEEECCCCCCCCEEEECHHHHHHHHHHHHHCHHHHHHHHHHHCCCCCCCCCC

In [7]:
def compare_seqres_and_atom(seqres, atom):
    """Mark which SEQRES residues also appear in the ATOM-derived sequence.

    Each chain's SEQRES string is walked left to right alongside a cursor into
    the chain's ATOM string. When the SEQRES letter equals the letter under
    the cursor it is copied to the output and the cursor advances; otherwise
    '-' is emitted. This is a greedy first-match walk, not a full alignment.

    Args:
        seqres: Dict chain -> one-letter sequence from SEQRES records.
        atom: Dict chain -> one-letter sequence from ATOM records. It is only
            read; the caller's dict is left unchanged.

    Returns:
        Dict chain -> string of the same length as ``seqres[chain]``. A chain
        that is missing from ``atom`` (or has an empty string there) yields
        all '-'.
    """
    final = {}
    for chain, full_seq in seqres.items():
        observed = atom.get(chain, '')
        cursor = 0  # index of the next unmatched letter in `observed`
        aligned = []
        for letter in full_seq:
            if cursor < len(observed) and letter == observed[cursor]:
                aligned.append(letter)
                cursor += 1
            else:
                aligned.append('-')
        final[chain] = ''.join(aligned)
    return final

In [8]:
# Show, per chain, which SEQRES residues are matched by the CA ATOM sequence ('-' marks unmatched positions).
compare_seqres_and_atom(get_seqres(pdb_data), get_atom(pdb_data))

{'A': '--------GVPDFVLLNQITENAFIENLTMRHKSDNIYTYIGDVVISTNPFKNLNIYKESDIKAYNGRYKYEMPPHMYALANDAYRSMRQSQENQCVIISGESGAGKTEASKKIMQFLTFVSSNQSPNGERISKMLLDSNPLLEAFGNAKTLRNDNSSRFGKYMEMQFNAVGSPIGGKITNYLLEKSRVVGRTQGERSFHIFYQMLKGLSQSKLDELGLTPNAPAYEYLKKSGCFDVSTIDDSGEFKIIVKAMETLGLKESDQNSIWRILAAILHIGNITFAEAAEQ-T-T--VKVSDTKSLAAAASCLKTDQQSLSIALCYRS----V------ISVPMDCNQAAYSRDALAKALYERLFNWLVSKINTIINCTTEKGPVIGILDIYGFEVFQNNSFEQLNINFCNEKLQQLFIELTLKSEQEEYVREGIEWKNIEYFNNKPICELIEKKPIGLISLLDEACLIAKSTDQTFLDSICKQFEKNPHLQSYVVSKDRSIGDTCFRLKHYAGDVTYDVRGFLDKNKDTLFGDLISSMQSSSDPLVQGLFP----E-------TAGSQFRNAMNALITTLLACSPHYVRCIKSNDNKQAGVIDEDRVRHQVRYLGLLENVRVRRAGFAGRIEYTRFYNRYKMLCKK---------KQATELILQQHNIDKEEIRMGKTKVFIRNPTTLFYFEEKR-----',
 'B': '--------GVPDFVLLNQITENAFIENLTMRHKSDNIYTYIGDVVISTNPFKNLNIYKESDIKAYNGRYKYEMPPHMYALANDAYRSMRQSQENQCVIISGESGAGKTEASKKIMQFLTFVSSNQSPNGERISKMLLDSNPLLEAFGNAKTLRNDNSSRFGKYMEMQFNAVGSPIGGKITNYLLEKSRVVGRTQGERSFHIFYQMLKGLSQSKLDELGLTPNAPAYEYLKKSGCFDVSTIDDSGEFKIIVKAMETLGLKESDQNSIWRILAAILHIGNITFAEAAE