# Getting protein name and other synonyms 

## Source: UniProt

In [39]:
#import library
import pandas as pd
import re

In [None]:
#install the library if you don't have it
!pip install bioservices
from bioservices import UniProt

In [3]:
#read the cleaned human kinase csv file into a dataframe
#index_col=0 uses the first csv column as the index (displayed as "family" in the output below)
kinase_df = pd.read_csv('clean_human_kinase.csv', index_col=0)
kinase_df.tail()

Unnamed: 0_level_0,gene_name,uniprot_identifier,uniprot_number
family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Atypical: PI3/PI4-kinase family,PRKDC,PRKDC_HUMAN,P78527
Atypical: PI3/PI4-kinase family,SMG1,SMG1_HUMAN,Q96Q15
Atypical: RIO-type Ser/Thr kinase family,RIOK1,RIOK1_HUMAN,Q9BRS2
Atypical: RIO-type Ser/Thr kinase family,RIOK2,RIOK2_HUMAN,Q9BVS4
Atypical: RIO-type Ser/Thr kinase family,RIOK3,RIOK3_HUMAN,O14730


In [7]:
#uniprot_number column exactly as read from the csv, before any whitespace clean-up
unprocessed_list = kinase_df['uniprot_number'].tolist()
#preview only the first 10 entries
unprocessed_list[:10]

['P31749',
 'P31751',
 'Q9Y243',
 'Q5VT25',
 'Q9Y5S2',
 'Q6DT37',
 'O14578',
 'Q09013',
 'Q15835',
 'P25098']

In [8]:
#turn the uniprot_number column (not the gene names) into a list
up_num_list = kinase_df['uniprot_number'].str.rstrip().tolist() #rstrip removes trailing whitespace only; leading whitespace is kept
up_num_list

['P31749',
 'P31751',
 'Q9Y243',
 'Q5VT25',
 'Q9Y5S2',
 'Q6DT37',
 'O14578',
 'Q09013',
 'Q15835',
 'P25098',
 'P35626',
 'P32298',
 'P34947',
 'P43250',
 'Q8WTQ7',
 'O95835',
 'Q9NRM7',
 'Q9Y2H9',
 'Q6P0Q8',
 'O60307',
 'O15021',
 'Q96GX5',
 'O15530',
 'Q6A1A2',
 'Q16512',
 'Q16513',
 'Q6P5Z2',
 'P17612',
 'P22694',
 'P22612',
 'P17252',
 'P05771',
 'Q05655',
 'Q02156',
 'P05129',
 'P24723',
 'P41743',
 'Q04759',
 'Q05513',
 'Q13976',
 'Q13237',
 'P51817',
 'O43930',
 'Q13464',
 'O75116',
 'Q15418',
 'Q15349',
 'P51812',
 'O75676',
 'O75582',
 'Q9UK32',
 'P23443',
 'Q9UBS0',
 'O00141',
 'Q9HBY8',
 'Q96BR1',
 'Q15208',
 'Q9Y2H1',
 'Q8TDC3',
 'Q8IWQ3',
 'Q14012',
 'Q8IU85',
 'Q96NX5',
 'Q9UQM7',
 'Q13554',
 'Q13557',
 'Q13555',
 'Q16566',
 'Q8NCB2',
 'O14936',
 'O14757',
 'O96017',
 'P53355',
 'Q9UIK4',
 'O43293',
 'O15075',
 'Q8N568',
 'Q9C098',
 'P57058',
 'O60229',
 'P49137',
 'Q16644',
 'Q8IW41',
 'Q9P0L2',
 'Q7KZI7',
 'P27448',
 'Q96L34',
 'Q14680',
 'Q9BUB5',
 'Q9HBH9',
 'Q15746',

In [13]:
#using bioservices
#create the UniProt client `u` (reused by get_meta further down) and try it on one accession;
#the reply is a single tab-separated string: a header line followed by one data line
u = UniProt(verbose=False)
u.search("id:P31751", limit=1, columns="id,protein names,entry name")

'Entry\tProtein names\tEntry name\nP31751\tRAC-beta serine/threonine-protein kinase (EC 2.7.11.1) (Protein kinase Akt-2) (Protein kinase B beta) (PKB beta) (RAC-PK-beta)\tAKT2_HUMAN\n'

In [18]:
#request the gene-name columns as well, then split the raw reply on tabs and newlines
#so that every header label and every value becomes one list element
example = u.search("id:Q5VT25", limit=1, columns="id,protein names, genes(PREFERRED), genes, entry name")
re.split('\t|\n', example)

['Entry',
 'Protein names',
 'Gene names  (primary )',
 'Gene names',
 'Entry name',
 'Q5VT25',
 'Serine/threonine-protein kinase MRCK alpha (EC 2.7.11.1) (CDC42-binding protein kinase alpha) (DMPK-like alpha) (Myotonic dystrophy kinase-related CDC42-binding kinase alpha) (MRCK alpha) (Myotonic dystrophy protein kinase-like alpha)',
 'CDC42BPA',
 'CDC42BPA KIAA0451',
 'MRCKA_HUMAN',
 '']

In [19]:
#retrieve the protein name
#the five header labels occupy indices 0-4 and the accession is index 5, so index 6 is the
#"Protein names" value; everything from the first '(' onwards (EC number, alternative names) is dropped
ex_str = re.split('\t|\n', example)
ex_str[6].split('(')[0].rstrip()

'Serine/threonine-protein kinase MRCK alpha'

In [20]:
#create a function to get the uniprot_number, gene_name, protein_name, gene_aliases
def get_meta(uniprot_num):
    """
    Look up one UniProt accession with the module-level UniProt client `u`.

    Takes in a uniprot number.
    Returns the 5-tuple
    (uniprot_number, gene_name, protein_name, gene_aliases, uniprot_entry),
    where gene_aliases is a list of gene symbols and protein_name is the
    text before the first '(' of the "Protein names" field.

    If the reply contains no usable record (not a string, no data line after
    the header, or fewer than five columns), returns the 1-tuple
    (uniprot_num,) instead of raising. Callers can detect a miss by checking
    len(result) < 5.
    """
    query = "id:{}".format(uniprot_num)
    output = u.search(query, limit=1, columns="id,protein names, genes(PREFERRED), genes, entry name")

    # A reply that is not text cannot be parsed; treat it as "no record".
    if not isinstance(output, str):
        return (uniprot_num,)

    # First non-empty line is the header, second is the record (limit=1).
    rows = [line for line in output.split('\n') if line]
    if len(rows) < 2:
        return (uniprot_num,)

    # Expected column order: Entry, Protein names, primary gene, all genes, Entry name.
    fields = rows[1].split('\t')
    if len(fields) < 5:
        return (uniprot_num,)

    # Keep only the recommended name; the parenthesised parts are the EC
    # number and alternative names.
    protein_name = fields[1].split('(')[0].rstrip()
    gene_name = fields[2]
    gene_aliases = fields[3].split()
    uniprot_entry = fields[4]
    return (uniprot_num, gene_name, protein_name, gene_aliases, uniprot_entry)
get_meta('Q5VT25')

('Q5VT25',
 'CDC42BPA',
 'Serine/threonine-protein kinase MRCK alpha',
 ['CDC42BPA', 'KIAA0451'],
 'MRCKA_HUMAN')

In [21]:
#query uniprot for every accession and sort the results into found / not found
total, unavailable = [], []
for accession in up_num_list:
    record = get_meta(accession)
    #a complete record has five fields; anything shorter counts as unavailable
    if len(record) >= 5:
        total.append(record)
    else:
        unavailable.append(accession)
print(len(unavailable))
print(total[:5])

0
[('P31749', 'AKT1', 'RAC-alpha serine/threonine-protein kinase', ['AKT1', 'PKB', 'RAC'], 'AKT1_HUMAN'), ('P31751', 'AKT2', 'RAC-beta serine/threonine-protein kinase', ['AKT2'], 'AKT2_HUMAN'), ('Q9Y243', 'AKT3', 'RAC-gamma serine/threonine-protein kinase', ['AKT3', 'PKBG'], 'AKT3_HUMAN'), ('Q5VT25', 'CDC42BPA', 'Serine/threonine-protein kinase MRCK alpha', ['CDC42BPA', 'KIAA0451'], 'MRCKA_HUMAN'), ('Q9Y5S2', 'CDC42BPB', 'Serine/threonine-protein kinase MRCK beta', ['CDC42BPB', 'KIAA1124'], 'MRCKB_HUMAN')]


In [22]:
#convert the list of tuples into a dataframe
#the column order matches the order of the tuple returned by get_meta
meta_df = pd.DataFrame(total, columns=['Uniprot_number','Gene_name','Protein_name','Gene_aliases', 'Uniprot_entry'])
meta_df.head()

Unnamed: 0,Uniprot_number,Gene_name,Protein_name,Gene_aliases,Uniprot_entry
0,P31749,AKT1,RAC-alpha serine/threonine-protein kinase,"[AKT1, PKB, RAC]",AKT1_HUMAN
1,P31751,AKT2,RAC-beta serine/threonine-protein kinase,[AKT2],AKT2_HUMAN
2,Q9Y243,AKT3,RAC-gamma serine/threonine-protein kinase,"[AKT3, PKBG]",AKT3_HUMAN
3,Q5VT25,CDC42BPA,Serine/threonine-protein kinase MRCK alpha,"[CDC42BPA, KIAA0451]",MRCKA_HUMAN
4,Q9Y5S2,CDC42BPB,Serine/threonine-protein kinase MRCK beta,"[CDC42BPB, KIAA1124]",MRCKB_HUMAN


In [23]:
#turn the meta df into csv file (header row kept, dataframe index not written)
#NOTE(review): Gene_aliases holds python lists, so each cell is presumably written as its
#string form and would come back as a plain string when the csv is read again - confirm before reloading
meta_df.to_csv('meta_names.csv', header=True, index=False)

The metadata is now available in `meta_names.csv`.

# Checking whether Mo's kinases are in my list of human kinases

In [24]:
#read in mo's kinase-substrate csv
#no index_col is given; the "Unnamed: 0.1" / "Unnamed: 0" columns in the output below
#look like previously saved index columns - TODO confirm and drop if not needed
kinase_substrate_df = pd.read_csv('clean_human_kinase_substrates.csv')
kinase_substrate_df.head()

Unnamed: 0.1,Unnamed: 0,GENE,KINASE,KIN_ACC_ID,SUBSTRATE,SUB_GENE_ID,SUB_ACC_ID,SUB_GENE,SUB_MOD_RSD,SITE_GRP_ID,SITE_+/-7_AA,DOMAIN
0,7,EIF2AK1,HRI,Q9BQI3,eIF2-alpha,1965.0,P05198,EIF2S1,S52,447635,MILLsELsRRRIRsI,S1
1,8,EIF2AK1,HRI,Q9BQI3,eIF2-alpha,1965.0,P05198,EIF2S1,S49,450210,IEGMILLsELsRRRI,S1
2,11,PRKCD,PKCD,Q05655,HDAC5,10014.0,Q9UQL6,HDAC5,S259,447995,FPLRkTAsEPNLKVR,
3,12,PRKCD,PKCD,Q05655,PTPRA iso2,5786.0,P18433-2,PTPRA,S204,447612,PLLARSPsTNRKYPP,
4,13,PRKCD,PKCD,Q05655,hnRNP K,3190.0,P61978,HNRNPK,S302,457408,GrGGrGGsrArNLPL,


In [25]:
#show the first rows of meta_df again, next to the substrate table above
meta_df.head()

Unnamed: 0,Uniprot_number,Gene_name,Protein_name,Gene_aliases,Uniprot_entry
0,P31749,AKT1,RAC-alpha serine/threonine-protein kinase,"[AKT1, PKB, RAC]",AKT1_HUMAN
1,P31751,AKT2,RAC-beta serine/threonine-protein kinase,[AKT2],AKT2_HUMAN
2,Q9Y243,AKT3,RAC-gamma serine/threonine-protein kinase,"[AKT3, PKBG]",AKT3_HUMAN
3,Q5VT25,CDC42BPA,Serine/threonine-protein kinase MRCK alpha,"[CDC42BPA, KIAA0451]",MRCKA_HUMAN
4,Q9Y5S2,CDC42BPB,Serine/threonine-protein kinase MRCK beta,"[CDC42BPB, KIAA1124]",MRCKB_HUMAN


In [49]:
#build a lookup from each kinase's aliases to its primary gene name
#the alias lists are converted to tuples because dictionary keys must be hashable
kinase_dict = {
    tuple(aliases): primary_gene
    for aliases, primary_gene in zip(meta_df['Gene_aliases'], meta_df['Gene_name'])
}
kinase_dict

{('AKT1', 'PKB', 'RAC'): 'AKT1',
 ('AKT2',): 'AKT2',
 ('AKT3', 'PKBG'): 'AKT3',
 ('CDC42BPA', 'KIAA0451'): 'CDC42BPA',
 ('CDC42BPB', 'KIAA1124'): 'CDC42BPB',
 ('CDC42BPG', 'DMPK2'): 'CDC42BPG',
 ('CIT', 'CRIK', 'KIAA0949', 'STK21'): 'CIT',
 ('DMPK', 'DM1PK', 'MDPK'): 'DMPK',
 ('GRK1', 'RHOK'): 'GRK1',
 ('GRK2', 'ADRBK1', 'BARK', 'BARK1'): 'GRK2',
 ('GRK3', 'ADRBK2', 'BARK2'): 'GRK3',
 ('GRK4', 'GPRK2L', 'GPRK4'): 'GRK4',
 ('GRK5', 'GPRK5'): 'GRK5',
 ('GRK6', 'GPRK6'): 'GRK6',
 ('GRK7', 'GPRK7'): 'GRK7',
 ('LATS1', 'WARTS'): 'LATS1',
 ('LATS2', 'KPM'): 'LATS2',
 ('MAST1', 'KIAA0973', 'SAST'): 'MAST1',
 ('MAST2', 'KIAA0807', 'MAST205'): 'MAST2',
 ('MAST3', 'KIAA0561'): 'MAST3',
 ('MAST4', 'KIAA0303'): 'MAST4',
 ('MASTL', 'GW', 'GWL', 'THC2'): 'MASTL',
 ('PDPK1', 'PDK1'): 'PDPK1',
 ('PDPK2P', 'PDPK2'): 'PDPK2P',
 ('PKN1', 'PAK1', 'PKN', 'PRK1', 'PRKCL1'): 'PKN1',
 ('PKN2', 'PRK2', 'PRKCL2'): 'PKN2',
 ('PKN3', 'PKNBETA'): 'PKN3',
 ('PRKACA', 'PKACA'): 'PRKACA',
 ('PRKACB',): 'PRKACB',
 ('P

In [None]:
#turn the GENE column of mo's dataframe into a list (one entry per row, so genes can repeat)
mo_kinase_gene = kinase_substrate_df['GENE'].to_list()
mo_kinase_gene

In [67]:
#flatten the alias tuples (the keys of kinase_dict) into one flat list of gene symbols
kinase_list = [gene for aliases in kinase_dict for gene in aliases]
kinase_list

['AKT1',
 'PKB',
 'RAC',
 'AKT2',
 'AKT3',
 'PKBG',
 'CDC42BPA',
 'KIAA0451',
 'CDC42BPB',
 'KIAA1124',
 'CDC42BPG',
 'DMPK2',
 'CIT',
 'CRIK',
 'KIAA0949',
 'STK21',
 'DMPK',
 'DM1PK',
 'MDPK',
 'GRK1',
 'RHOK',
 'GRK2',
 'ADRBK1',
 'BARK',
 'BARK1',
 'GRK3',
 'ADRBK2',
 'BARK2',
 'GRK4',
 'GPRK2L',
 'GPRK4',
 'GRK5',
 'GPRK5',
 'GRK6',
 'GPRK6',
 'GRK7',
 'GPRK7',
 'LATS1',
 'WARTS',
 'LATS2',
 'KPM',
 'MAST1',
 'KIAA0973',
 'SAST',
 'MAST2',
 'KIAA0807',
 'MAST205',
 'MAST3',
 'KIAA0561',
 'MAST4',
 'KIAA0303',
 'MASTL',
 'GW',
 'GWL',
 'THC2',
 'PDPK1',
 'PDK1',
 'PDPK2P',
 'PDPK2',
 'PKN1',
 'PAK1',
 'PKN',
 'PRK1',
 'PRKCL1',
 'PKN2',
 'PRK2',
 'PRKCL2',
 'PKN3',
 'PKNBETA',
 'PRKACA',
 'PKACA',
 'PRKACB',
 'PRKACG',
 'PRKCA',
 'PKCA',
 'PRKACA',
 'PRKCB',
 'PKCB',
 'PRKCB1',
 'PRKCD',
 'PRKCE',
 'PKCE',
 'PRKCG',
 'PKCG',
 'PRKCH',
 'PKCL',
 'PRKCL',
 'PRKCI',
 'DXS1179E',
 'PRKCQ',
 'PRKCT',
 'PRKCZ',
 'PKC2',
 'PRKG1',
 'PRKG1B',
 'PRKGR1A',
 'PRKGR1B',
 'PRKG2',
 'PRKGR2',
 '

In [68]:
#loop through mo's genes to find any that are not available in my alias list
#each distinct gene is classified exactly once, in order of first appearance
known_genes = set(kinase_list)  #set lookup instead of scanning the whole list for every gene
not_available = []
available = []
#dict.fromkeys drops repeated genes while keeping their original order
for gene_query in dict.fromkeys(mo_kinase_gene):
    if gene_query in known_genes:
        available.append(gene_query)
    else:
        not_available.append(gene_query)
print(len(available))
print(not_available)

340
['GTF2F1', 'PRKAG2', 'BCR/ABL', 'NME1', 'HSPA5', 'BCR', 'PIKFYVE', 'PHKA1', 'BAZ1B', 'TAF1', 'TGM2', 'ENPP3', 'PGK1', 'NME2', 'COL4A3BP', 'NPM/ALK', 'PRKAB1', 'PKM', 'BLVRA', 'CSNK2B', 'BRD4', 'FAM20C']


In [None]:
#output the unavailable genes into a txt file, one gene per line
#the file handle gets its own name so it does not overwrite the `unavailable` list built earlier
with open('unavailable_kinase.txt', 'w') as out_file:
    out_file.writelines('{}\n'.format(gene) for gene in not_available)