This notebook checks which PDB entries contain mutations, based on the SEQADV records in each PDB file.

In [1]:
#########################################################################################################################
#################################################    who (P)DB          #################################################
#########################################################################################################################

## file with general functions and variables useful for a lot of MSEAL stuff
import pandas as pd
import numpy as np
import os
import math
## Use fully qualified option names: the short forms are pattern-matched by pandas, and
## 'max_rows' is ambiguous on newer pandas (it also matches 'styler.render.max_rows'),
## which makes set_option raise OptionError.
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_rows', 10000)

In [2]:
## Root folder for downloaded PDB files. Set the PDB_DATA_DIR environment variable to point
## the notebook at another location; the original hard-coded path remains the default.
DATA_DIR = os.environ.get("PDB_DATA_DIR", "/Users/ryan/Desktop/data")
## Both sub-folder constants keep their trailing "/" because other cells format paths with them.
PDB_DIR = "%s/PDBs/" % (DATA_DIR)
PDB_chain_DIR  = "%s/PDB_chains/" % (DATA_DIR)

In [3]:
#####################################################################
######             CREATE WORKABLE PDB DATAFRAME               ######
#####################################################################


## simple dataframe from pdb (with NMR adjustment)
def get_PDB_DF(pdb, PDB_file=None):
    """Load a PDB file into a line-per-row DataFrame, downloading it first if needed.

    Parameters
    ----------
    pdb : str
        PDB name; only its first four characters are used as the PDB id.
    PDB_file : str, optional
        Path of the PDB file. Defaults to "<PDB_DIR>/<id[0]>/<id[1]>/<id>.pdb".

    Returns
    -------
    pandas.DataFrame
        Column 'raw' holds each line truncated to 80 characters and column 'key'
        holds the first six characters of the line with spaces removed.
        When the file has several MODEL records only the first model is kept.

    Raises
    ------
    FileNotFoundError
        If the file is still missing after the download attempt.
    """
    import subprocess

    pdb_id = pdb[:4]
    if PDB_file is None:
        PDB_file = "%s/%s/%s/%s.pdb" %(PDB_DIR, pdb_id[0], pdb_id[1], pdb_id)
    ## download pdb file if not already downloaded
    if not os.path.isfile(PDB_file):
        ## curl does not create missing folders, so make sure the target folder exists
        target_dir = os.path.dirname(PDB_file)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        ## argument list instead of a shell string: paths with spaces or shell
        ## metacharacters are passed to curl unchanged
        subprocess.run(["curl", "https://files.rcsb.org/view/%s.pdb" % pdb_id, "-o", PDB_file])

    ## context manager so the file handle is always closed
    with open(PDB_file, "r") as f:
        pdb_lines = f.readlines()
    pdb_file_df = pd.DataFrame(pdb_lines, columns = ['raw'])
    pdb_file_df['raw'] = pdb_file_df['raw'].str[:80]
    pdb_file_df['key'] = pdb_file_df['raw'].str[:6].str.replace(' ', '')
    ## only keep first model, mostly for NMR models
    pdb_file_df = get_first_model(pdb_file_df)
    return(pdb_file_df)

def get_first_model(PDB_DF):
    """Drop every model after the first one from a PDB line DataFrame.

    Parameters
    ----------
    PDB_DF : pandas.DataFrame
        Frame with a 'key' column holding the record name of each line. The row
        labels must be consecutive integers because rows are selected by label
        arithmetic (label - 1 / label + 1).

    Returns
    -------
    pandas.DataFrame
        PDB_DF itself when it has fewer than two MODEL records. Otherwise a new
        frame with the rows from the second MODEL record through the last ENDMDL
        record removed and the index renumbered from 0.
    """
    MODEL_DF = PDB_DF[PDB_DF['key'] == 'MODEL']
    ## return if NMR does not use multiple models
    if len(MODEL_DF) < 2:
        return(PDB_DF)

    ENDMDL_DF = PDB_DF[PDB_DF['key'] == 'ENDMDL']
    ## get range of all non first model coordinate lines
    index_srt_scnd_MDL = MODEL_DF.index[1]
    index_end_MDLs = ENDMDL_DF.index[-1]
    ## remove all non first model coordinate lines
    ## (pd.concat replaces DataFrame.append, which was removed in pandas 2.0)
    newPDB_DF = pd.concat([
        PDB_DF.loc[:index_srt_scnd_MDL-1],
        PDB_DF.loc[index_end_MDLs+1:],
    ])
    newPDB_DF = newPDB_DF.reset_index(drop=True)
    return(newPDB_DF)

In [4]:
## 'ATOM' and 'HETATM' records: use get_ATOM_DF for an atom level dataframe
def get_SEQADV_DF(SEQADV):
    """Turn the SEQADV rows of a PDB line DataFrame into one column per field.

    SEQADV must have a 'raw' column with the record text. The result has the
    columns idCode, resName, chainID, seqNum, iCode, database, dbIdCode, dbRes,
    dbSeq and conflict, in that order, as produced by split_SEQADV.
    """
    field_names = (
        'idCode', 'resName', 'chainID', 'seqNum', 'iCode',
        'database', 'dbIdCode', 'dbRes', 'dbSeq', 'conflict',
    )
    field_values = split_SEQADV(SEQADV['raw'])
    ## pair each field name with the Series at the same position
    return pd.DataFrame(dict(zip(field_names, field_values)))


def split_SEQADV(raw):
    """Cut SEQADV record lines into their fixed-width fields.

    Parameters
    ----------
    raw : pandas.Series of str
        SEQADV record lines.

    Returns
    -------
    tuple of 10 pandas.Series
        (idCode, resName, chainID, seqNum, iCode, database, dbIdCode, dbRes,
        dbSeq, conflict), each with all whitespace removed, so a conflict such
        as "ENGINEERED MUTATION" becomes "ENGINEEREDMUTATION".

    Slices follow the wwPDB SEQADV column layout (1-based columns -> 0-based slices):
    idCode 8-11, resName 13-15, chainID 17, seqNum 19-22, iCode 23,
    database 25-28, dbAccession 30-38, dbRes 40-42, dbSeq 44-48, conflict 50-.
    """
    def field(start, stop):
        ## remove every whitespace character, not only spaces, so the newline left
        ## on lines shorter than 80 characters does not end up in the last field
        return raw.str[start:stop].str.replace(r'\s+', '', regex=True)

    return(  field(7, 11),  field(12, 15),  field(16, 17), \
            field(18, 22),  field(22, 23),  field(24, 28), \
            field(29, 38),  field(39, 42),  field(43, 48), \
            field(49, None) )

def get_mutated_atom_df(pdb, pdb_df):
    """Collect the atom rows of every residue listed in the SEQADV records.

    Parameters
    ----------
    pdb : str
        PDB name, passed through to get_ATOM_DF.
    pdb_df : pandas.DataFrame
        PDB line DataFrame with a 'key' column.

    Returns
    -------
    pandas.DataFrame
        Atom rows (ATOM and HETATM) whose chainID and seqNum match a SEQADV
        record, with that record's 'conflict' value in a 'conflict' column.
        When there are no SEQADV records, an empty frame with the atom columns.

    Notes
    -----
    get_ATOM_DF is defined outside this section; this assumes it returns frames
    with 'chainID' and 'seqNum' columns comparable to the SEQADV strings -- TODO confirm.
    NOTE(review): residues are matched on chainID and seqNum only, the insertion
    code is not compared.
    """
    ## ATOM rows plus HETATM rows (unadjusted modified residues);
    ## pd.concat replaces DataFrame.append, which was removed in pandas 2.0
    atom_df = pd.concat([
        get_ATOM_DF(pdb_df[pdb_df['key']=="ATOM"], pdb),
        get_ATOM_DF(pdb_df[pdb_df['key']=="HETATM"], pdb),
    ])

    seqadv_df = get_SEQADV_DF(pdb_df[pdb_df['key']=="SEQADV"])

    mutated_atoms = []
    for _, row in seqadv_df.iterrows():
        at_site = (atom_df['chainID']==row['chainID']) & (atom_df['seqNum']==row['seqNum'])
        ## copy after filtering so the new column is written to an independent frame
        mutated_atom = atom_df[at_site].copy()
        mutated_atom['conflict'] = row['conflict']
        mutated_atoms.append(mutated_atom)
    if len(mutated_atoms) == 0:
        mutated_atoms_df = pd.DataFrame(columns = atom_df.columns)
    else:
        mutated_atoms_df = pd.concat(mutated_atoms, ignore_index=True)
    return(mutated_atoms_df)

In [5]:
## SEQADV conflict values (spaces already removed) that are counted as mutations
MUTATION_CONFLICTS = ["ENGINEERED", "ENGINEEREDMUTATION", "DELETION", "INSERTION", "CONFLICT"]
def pdb_mutations(pdb_name):
    """Summarise the SEQADV conflicts of one PDB entry.

    Only the first four characters of pdb_name are used to load the PDB file.

    Returns the tuple
    (pdb_name, mutated_pdb, mutated_conflicts, conflicted_pdb, all_conflicts):
    conflicted_pdb is True when the entry has at least one SEQADV record and
    all_conflicts lists the distinct conflict values; mutated_pdb and
    mutated_conflicts are the same restricted to MUTATION_CONFLICTS.
    """
    pdb_df = get_PDB_DF(pdb_name[:4])
    seqadv_records = get_SEQADV_DF(pdb_df[pdb_df['key']=="SEQADV"])
    mutation_records = seqadv_records.loc[seqadv_records['conflict'].isin(MUTATION_CONFLICTS)]

    conflicted_pdb = len(seqadv_records) >= 1
    mutated_pdb = len(mutation_records) >= 1
    all_conflicts = seqadv_records['conflict'].unique().tolist() if conflicted_pdb else []
    mutated_conflicts = mutation_records['conflict'].unique().tolist() if mutated_pdb else []

    return(pdb_name, mutated_pdb, mutated_conflicts, conflicted_pdb, all_conflicts)

pdb_mutations("5FOK")

('5FOK', False, [], True, ['EXPRESSIONTAG'])

In [6]:
## membrane beta-barrel PDB names: headerless text file read into a single 'pdb_name' column
membrane_bbarrel_pdbs = pd.read_table(
    "membranBbarrelPDBs.txt",
    names=['pdb_name'],
    header=None,
)
## preview the first two names, then report (rows, columns)
display(membrane_bbarrel_pdbs.head(2))
print(membrane_bbarrel_pdbs.shape)

Unnamed: 0,pdb_name
0,5FOK
1,4FSP


(579, 1)


In [7]:
## run pdb_mutations over every distinct membrane beta-barrel PDB name
mutant_records = []
pdb_list = membrane_bbarrel_pdbs['pdb_name'].unique().tolist()
print(len(pdb_list), ' pdbs')
## enumerate supplies the position directly; looking it up with pdb_list.index(pdb)
## rescans the list on every iteration
for pdb_position, pdb in enumerate(pdb_list):
    ## progress: a new line labelled with the running count every 100 PDBs, one dot per PDB
    if pdb_position % 100 == 0:
        print('\n%s' % (pdb_position), end='')
    print('.', end='')
    mutant_records.append(pdb_mutations(pdb))
mutant_data = pd.DataFrame(mutant_records, columns=['PDB', 'mutated_pdb', 'mutated_conflicts', 'conflicted_pdb', 'all_conflicts'])
## flatten the conflict lists into "A, B" strings; joining the lists gives the same text as
## stripping brackets and quotes from str(list) without depending on how str.replace
## interprets '[' in a given pandas version
for list_column in ('all_conflicts', 'mutated_conflicts'):
    mutant_data[list_column] = mutant_data[list_column].map(', '.join)
mutant_data

579  pdbs

0....................................................................................................
100....................................................................................................
200....................................................................................................
300....................................................................................................
400....................................................................................................
500...............................................................................

Unnamed: 0,PDB,mutated_pdb,mutated_conflicts,conflicted_pdb,all_conflicts
0,5FOK,False,,True,EXPRESSIONTAG
1,4FSP,False,,True,EXPRESSIONTAG
2,5FQ6,False,,False,
3,3PGR,True,ENGINEEREDMUTATION,True,"ENGINEEREDMUTATION, EXPRESSIONTAG"
4,5MDQ,False,,False,
5,2W76,True,CONFLICT,True,CONFLICT
6,2VDF,True,CONFLICT,True,CONFLICT
7,1MPQ,False,,False,
8,6U3T,True,CONFLICT,True,CONFLICT
9,3DZM,True,INSERTION,True,INSERTION


In [8]:
## write the membrane beta-barrel mutation summary to CSV without the index column
## NOTE(review): "Membane" in the file name looks like a typo for "Membrane"; it is left
## unchanged here because anything that reads this CSV would have to be renamed with it.
mutant_data.to_csv("MembaneBbarrelMutationData.csv", index=False)

In [10]:
## soluble beta-barrel PDB names: headerless text file, names upper-cased after loading
soluble_bbarrel_pdbs = (
    pd.read_table("SolubleBbarrelPDBs.txt", header=None, names=['pdb_name'])
    .assign(pdb_name=lambda names_df: names_df['pdb_name'].str.upper())
)
## preview the first two names, then report (rows, columns)
display(soluble_bbarrel_pdbs.head(2))
print(soluble_bbarrel_pdbs.shape)

Unnamed: 0,pdb_name
0,2B97
1,1R2M


(1464, 1)


In [13]:
## load the soluble beta-barrel PDB names (headerless text file) and upper-case them
soluble_bbarrel_pdbs = pd.read_table("SolubleBbarrelPDBs.txt",header=None,names=['pdb_name'])
soluble_bbarrel_pdbs['pdb_name'] = soluble_bbarrel_pdbs['pdb_name'].str.upper()
display(soluble_bbarrel_pdbs.head(2))
print(soluble_bbarrel_pdbs.shape)

## run pdb_mutations over every distinct soluble beta-barrel PDB name
soluble_mutant_records = []
pdb_list = soluble_bbarrel_pdbs['pdb_name'].unique().tolist()
print(len(pdb_list), ' pdbs')
## enumerate supplies the position directly; looking it up with pdb_list.index(pdb)
## rescans the list on every iteration
for pdb_position, pdb in enumerate(pdb_list):
    ## progress: a new line labelled with the running count every 100 PDBs, one dot per PDB
    if pdb_position % 100 == 0:
        print('\n%s' % (pdb_position), end='')
    print('.', end='')
    soluble_mutant_records.append(pdb_mutations(pdb))

soluble_mutant_data = pd.DataFrame(soluble_mutant_records, columns=['PDB', 'mutated_pdb', 'mutated_conflicts', 'conflicted_pdb', 'all_conflicts'])
## flatten the conflict lists into "A, B" strings; joining the lists gives the same text as
## stripping brackets and quotes from str(list) without depending on how str.replace
## interprets '[' in a given pandas version
for list_column in ('all_conflicts', 'mutated_conflicts'):
    soluble_mutant_data[list_column] = soluble_mutant_data[list_column].map(', '.join)
display(soluble_mutant_data.head(10))
print(soluble_mutant_data.shape)

Unnamed: 0,pdb_name
0,2B97
1,1R2M


(1464, 1)
1464  pdbs

0....................................................................................................
100....................................................................................................
200....................................................................................................
300....................................................................................................
400....................................................................................................
500....................................................................................................
600....................................................................................................
700....................................................................................................
800....................................................................................................
900.........................................

Unnamed: 0,PDB,mutated_pdb,mutated_conflicts,conflicted_pdb,all_conflicts
0,2B97,False,,False,
1,1R2M,False,,False,
2,2PL7,False,,False,
3,3QQT,False,,False,
4,2PL6,False,,False,
5,2GVM,False,,False,
6,2FZ6,False,,False,
7,4BWH,False,,True,EXPRESSIONTAG
8,4AOG,False,,True,CLONINGARTIFACT
9,1ZVC,False,,True,CLONINGARTIFACT


(1464, 5)


In [14]:
## write the soluble beta-barrel mutation summary to CSV without the index column
soluble_mutant_data.to_csv("SolubleBbarrelMutationData.csv", index=False)