In [1]:
import pynmrstar
import pandas as pd
from Bio.Data.IUPACData import protein_letters_3to1
import os

### Dataframe of NMR Proteins

In [2]:
# Load the table of comparable AF2/NMR structures. The first line of the file is
# skipped and the column names are supplied explicitly.
df = pd.read_csv(
    'comparable_af2_nmr_structures.txt',
    sep=r'\s+',  # whitespace-separated; replaces the deprecated delim_whitespace=True
    names=['UniProt-ID', 'PDB-ID', 'chain', 'BMRB-ID',
           'residue-range-AF2', 'residue-range-NMR'],
    skiprows=1,
)
df.head(10)

Unnamed: 0,UniProt-ID,PDB-ID,chain,BMRB-ID,residue-range-AF2,residue-range-NMR
0,A1A4S6,2MIO,A,19684,728-786,12-70
1,O00168,2JO1,A,16168,21-92,1-72
2,O00170,2LKN,A,18001,2-166,2-166
3,O00175,1EIH,A,4590,27-99,1-73
4,O00206,5NAM,A,34108,623-670,1-48
5,O00206,5NAO,A,34109,623-657,1-35
6,O00213,2E45,A,7365,241-290,1-50
7,O00244,2LQ9,A,18299,1-68,1-68
8,O00257,2K28,A,15695,8-65,3-60
9,O00267,6EQY,A,34184,533-647,16-130


## Extract an assignment and build an anonymised peak list from the assignment

In [3]:
def get_assignment_df_and_sequence(bmrb_id):
    """Fetch a BMRB entry and return its chemical-shift table and its sequence.

    Parameters
    ----------
    bmrb_id
        Entry identifier handed to ``pynmrstar.Entry.from_database``.

    Returns
    -------
    tuple
        ``(assignment_df, sequence)``. ``assignment_df`` is built from the first
        ``Atom_chem_shift`` loop of the entry and has the columns
        ``Comp_index_ID``, ``Comp_ID``, ``Atom_ID``, ``Val``, ``Entry_ID`` and
        ``Comp_ID_sl`` (one-letter residue code). ``sequence`` is the
        ``Polymer_seq_one_letter_code`` value of the first ``entity`` saveframe
        with all whitespace removed.

    Notes
    -----
    Both lookups take element ``[0]`` of the returned collection, so an entry
    without a matching loop or saveframe presumably surfaces as an
    ``IndexError`` - confirm against the pynmrstar version in use.
    """
    columns = ['Comp_index_ID', 'Comp_ID', 'Atom_ID', 'Val', 'Entry_ID']

    nmr_entry = pynmrstar.Entry.from_database(bmrb_id, convert_data_types=True)

    # Only the first chemical-shift loop of the entry is used
    shift_loop = nmr_entry.get_loops_by_category("Atom_chem_shift")[0]
    assignment_df = pd.DataFrame(shift_loop.get_tag(columns), columns=columns)

    # Capitalise the three-letter codes before the 3-letter -> 1-letter lookup
    capitalised = assignment_df['Comp_ID'].map(str.capitalize)
    assignment_df['Comp_ID'] = capitalised
    assignment_df['Comp_ID_sl'] = capitalised.map(protein_letters_3to1)

    # The stored one-letter sequence may contain whitespace; strip all of it
    first_entity = nmr_entry.get_saveframes_by_category('entity')[0]
    raw_sequence = first_entity['Polymer_seq_one_letter_code'][0]
    sequence = ''.join(raw_sequence.split())

    return assignment_df, sequence

def get_sequence_from_assignment_df(df):
    """Rebuild a one-letter sequence from the residues present in an assignment table.

    Rows are grouped by ``Comp_index_ID`` (groups come out in sorted key order)
    and one ``Comp_ID_sl`` value is taken per residue via ``GroupBy.first``.
    The per-residue letters are then concatenated into a single string.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``Comp_index_ID`` and ``Comp_ID_sl`` columns.

    Returns
    -------
    str
        One character per distinct ``Comp_index_ID``.
    """
    letters_by_residue = df.groupby('Comp_index_ID')['Comp_ID_sl'].first()
    one_letter_codes = letters_by_residue.tolist()
    return ''.join(one_letter_codes)

def remove_unwanted_atom_types(df, atom_list=('N', 'H', 'C', 'CA', 'CB')):
    """Keep only the rows whose ``Atom_ID`` is one of the wanted atom names.

    Parameters
    ----------
    df : pandas.DataFrame
        Assignment table with an ``Atom_ID`` column. It is not modified.
    atom_list : iterable of str, optional
        Atom names to keep. Defaults to ``N``, ``H``, ``C``, ``CA`` and ``CB``,
        the set that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the matching rows with their original index labels.
    """
    keep_mask = df['Atom_ID'].isin(list(atom_list))
    # Copy only the surviving rows instead of copying the whole frame first
    return df[keep_mask].copy()

def get_anonymised_assignment(df):
    """Stack, for each residue after the first, its rows plus the preceding residue's carbons.

    The distinct ``Comp_index_ID`` values are sorted. For every residue except
    the first, the selection contains all rows of that residue together with
    the ``CA``, ``CB`` and ``C`` rows of the residue that precedes it in the
    sorted list. The selections are concatenated in residue order, so a carbon
    row can appear twice (once with its own residue, once with the next one).

    Parameters
    ----------
    df : pandas.DataFrame
        Assignment table with ``Comp_index_ID`` and ``Atom_ID`` columns.

    Returns
    -------
    pandas.DataFrame
        The concatenated selections with the original columns and index labels.
        An empty frame with the same columns is returned when ``df`` holds
        fewer than two distinct residues (``pd.concat`` would otherwise raise
        ``ValueError`` on an empty list).
    """
    # NOTE(review): "preceding" means the previous residue *present in df*, which
    # is not necessarily Comp_index_ID - 1 when residues are missing - confirm
    # that this is the intended pairing.
    residues = sorted(df['Comp_index_ID'].unique())

    # Loop-invariant: which rows are carbons that may be borrowed by the next residue
    is_carried_atom = df['Atom_ID'].isin(['CA', 'CB', 'C'])

    selections = [
        df[(df['Comp_index_ID'] == current)
           | ((df['Comp_index_ID'] == previous) & is_carried_atom)]
        for previous, current in zip(residues, residues[1:])
    ]

    if not selections:
        # Nothing to pair up: zero or one residue
        return df.iloc[0:0].copy()

    return pd.concat(selections)#[['Atom_ID', 'Val']]


### Process a single Protein

In [8]:
# Worked example: BMRB entry 34184 (listed in the table above with NMR residue range 16-130)
assignment_df, sequence = get_assignment_df_and_sequence(34184)
#assignment_df = remove_unwanted_atom_types(assignment_df)

# Sequence rebuilt from every residue that appears in the assignment table
print(get_sequence_from_assignment_df(assignment_df))

# Same reconstruction, restricted to residues numbered 16 and above
from_residue_16 = assignment_df['Comp_index_ID'] >= 16
print(get_sequence_from_assignment_df(assignment_df[from_residue_16]))

# Sequence stored in the entry itself, for comparison
print(sequence)

### Read All Assignments and Store in Single dataframe

In [None]:
# Get Protein Assignments for all proteins on list
all_assignments_list = []
for protein_id in df['BMRB-ID']:

    # Catch errors caused by entries without assignment lists - Skip these entries
    try:
        assignment_df, sequence = get_assignment_df_and_sequence(protein_id)
    except IndexError as e:
        # Name the skipped entry so that gaps in the output can be traced back
        print(f'Skipping BMRB entry {protein_id}: {e}')
        continue

    # Attach the entry's full sequence to every row of its assignment table
    assignment_df['sequence'] = sequence
    all_assignments_list.append(assignment_df)

# Combine all entries, keep only the wanted atom types, and persist the result
all_assignments_df = pd.concat(all_assignments_list)
all_assignments_df = remove_unwanted_atom_types(all_assignments_df)
all_assignments_df.to_csv('All_Assignments.csv')

### Build Anonymised assignments

In [None]:

    




    # anonymised_assigned_df = get_anonymised_assignment(assignment_df)


    # # Catch errors caused by sequences having 3 letter codes that don't exist in letter code dictionary
    # try:
    #     assignment_sequence = get_sequence_from_assignment_df(assignment_df)
    # except TypeError as e:
    #     print(e)
    #     continue