# Retrieve GO terms for gene from the Uniprot database based on the classes

In [95]:
import numpy as np
import pandas as pd
import string
import os
from collections import Counter
from collections import defaultdict

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re
from bioservices import *
import collections
%pylab inline --no-import-all

Populating the interactive namespace from numpy and matplotlib


In [60]:
train = pd.read_csv('..//bases/training_variants')
test = pd.read_csv('..//bases/test_variants')

In [61]:
# only use gene from train data -> contains the classes
all_genes = set(train.Gene)
print(len(all_genes))
print(all_genes)

264
{'CDK8', 'TERT', 'RAD51B', 'FGF3', 'NTRK3', 'EIF1AX', 'MAP2K2', 'PTPN11', 'CEBPA', 'AKT1', 'APC', 'ROS1', 'KDM5C', 'ATM', 'SPOP', 'ARID5B', 'CDK6', 'MTOR', 'ERBB2', 'PIK3CA', 'CARD11', 'BRCA2', 'CCND2', 'HLA-B', 'ARID2', 'RAD50', 'PBRM1', 'ETV1', 'ARID1B', 'FGFR1', 'FLT3', 'CDKN1A', 'CREBBP', 'GATA3', 'PTCH1', 'CIC', 'NTRK2', 'SF3B1', 'HNF1A', 'PTEN', 'DDR2', 'FLT1', 'SRSF2', 'RAD54L', 'NKX2-1', 'MYD88', 'INPP4B', 'RICTOR', 'CTNNB1', 'ERCC2', 'SOX9', 'ERG', 'MAP2K1', 'IKBKE', 'RAD51C', 'PIK3R1', 'LATS2', 'ESR1', 'CARM1', 'XRCC2', 'SMO', 'BRCA1', 'CHEK2', 'SDHC', 'PIK3CD', 'RAD21', 'TCF3', 'RASA1', 'U2AF1', 'CCND1', 'XPO1', 'SMARCA4', 'ARID1A', 'SETD2', 'MEF2B', 'FGFR3', 'MYOD1', 'NCOR1', 'DNMT3B', 'KMT2C', 'AXL', 'PPP2R1A', 'EPAS1', 'KEAP1', 'FBXW7', 'KMT2B', 'H3F3A', 'TSC1', 'BTK', 'HIST1H1C', 'ERRFI1', 'FUBP1', 'CDKN2B', 'PIK3CB', 'HRAS', 'KNSTRN', 'CCND3', 'RAC1', 'AGO2', 'IL7R', 'PAX8', 'RHEB', 'CDKN2A', 'FAM58A', 'PMS1', 'BCL2', 'PIK3R2', 'PMS2', 'RHOA', 'JAK2', 'ATRX', 'HLA-A

In [43]:
u = UniProt()

In [44]:
res = u.search("ZAP70_HUMAN")
print(res)

Entry	Entry name	Status	Protein names	Gene names	Organism	Length
P43403	ZAP70_HUMAN	reviewed	Tyrosine-protein kinase ZAP-70 (EC 2.7.10.2) (70 kDa zeta-chain associated protein) (Syk-related tyrosine kinase)	ZAP70 SRK	Homo sapiens (Human)	619



In [62]:
u.debugLevel = "INFO"
u.timeout = 100   # some queries are long and requires much more time; default is 1000 seconds

In [63]:
# just an example of query
a = u.search('SLC16A1+AND+organism:9606', frmt='tab', limit=1,
               columns="entry name")

In [64]:
[s.strip() for s in a.splitlines()]

['Entry name', 'MOT1_HUMAN']

In [70]:
gene_entry_dict = {} # here we will keep the gene_entries together with their classes
class_dict = {}
for gene in all_genes:
    gene_classes = list(train.Class[train.Gene==gene])
    keyword = 'gene:%s+AND+organism:9606' %gene #to query database, with gene and organism 9606 is Homo Sapien (human)
    entry_name_tab = u.search(keyword, frmt='tab', limit=1, columns="entry name") 
    entry_name = [s.strip() for s in entry_name_tab.splitlines()][1] # gets the entry name from uniprot i.e. second position in tab
    gene_entry_dict[gene] = entry_name
    class_dict[entry_name] = gene_classes

In [71]:
gene_entry_dict

{'ABL1': 'ABL1_HUMAN',
 'ACVR1': 'ACVR1_HUMAN',
 'AGO2': 'AGO2_HUMAN',
 'AKT1': 'AKT1_HUMAN',
 'AKT2': 'AKT2_HUMAN',
 'AKT3': 'AKT3_HUMAN',
 'ALK': 'TGFR1_HUMAN',
 'APC': 'APC_HUMAN',
 'AR': 'ANDR_HUMAN',
 'ARAF': 'ARAF_HUMAN',
 'ARID1A': 'ARI1A_HUMAN',
 'ARID1B': 'ARI1B_HUMAN',
 'ARID2': 'ARID2_HUMAN',
 'ARID5B': 'ARI5B_HUMAN',
 'ASXL1': 'ASXL1_HUMAN',
 'ASXL2': 'ASXL2_HUMAN',
 'ATM': 'ATM_HUMAN',
 'ATR': 'ATR_HUMAN',
 'ATRX': 'ATRX_HUMAN',
 'AURKA': 'AURKA_HUMAN',
 'AURKB': 'AURKB_HUMAN',
 'AXIN1': 'AXIN1_HUMAN',
 'AXL': 'UFO_HUMAN',
 'B2M': 'B2MG_HUMAN',
 'BAP1': 'RING2_HUMAN',
 'BARD1': 'BARD1_HUMAN',
 'BCL10': 'BCL10_HUMAN',
 'BCL2': 'BCL2_HUMAN',
 'BCL2L11': 'B2L11_HUMAN',
 'BCOR': 'BCOR_HUMAN',
 'BRAF': 'BRAF_HUMAN',
 'BRCA1': 'BRCA1_HUMAN',
 'BRCA2': 'BRCA2_HUMAN',
 'BRD4': 'BRD4_HUMAN',
 'BRIP1': 'FANCJ_HUMAN',
 'BTK': 'BTK_HUMAN',
 'CARD11': 'CAR11_HUMAN',
 'CARM1': 'CARM1_HUMAN',
 'CASP8': 'CASP8_HUMAN',
 'CBL': 'CBL_HUMAN',
 'CCND1': 'CCND1_HUMAN',
 'CCND2': 'CCND2_HUMAN',


In [72]:
class_dict

{'1A02_HUMAN': [1, 1],
 '1B07_HUMAN': [1],
 '2AAA_HUMAN': [7, 1, 1, 1, 1, 6, 1, 6, 1, 1, 1, 1],
 'ABL1_HUMAN': [7,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  7,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2,
  2],
 'ACVR1_HUMAN': [7, 2, 7],
 'AGO2_HUMAN': [1, 1, 1, 2, 1],
 'AKT1_HUMAN': [2,
  7,
  3,
  7,
  8,
  3,
  7,
  2,
  7,
  7,
  7,
  7,
  8,
  7,
  7,
  7,
  5,
  5,
  7,
  5,
  7,
  7,
  7,
  7,
  7,
  7,
  3,
  2],
 'AKT2_HUMAN': [7, 9, 7, 7, 7, 9, 2, 7, 7, 7, 7],
 'AKT3_HUMAN': [7, 7, 7, 2],
 'ANDR_HUMAN': [7, 1, 6, 6, 7, 1, 7, 7, 6, 7, 7, 7, 7, 5, 2, 5, 5, 7, 6, 7],
 'APC_HUMAN': [1, 4, 1, 4, 1],
 'ARAF_HUMAN': [7, 2, 7, 7, 7, 7, 7],
 'ARI1A_HUMAN': [1],
 'ARI1B_HUMAN': [1, 1],
 'ARI5B_HUMAN': [1],
 'ARID2_HUMAN': [1, 1],
 'ASXL1_HUMAN': [1],
 'ASXL2_HUMAN': [1],
 'ATM_HUMAN': [1, 1, 1, 4, 4, 5],
 'ATRX_HUMAN': [1, 1],
 'ATR_HUMAN': [1],
 'AURKA_HUMAN': [7, 7, 4],
 'AURKB_HUMAN': [2],
 'AXIN1_HUMAN': [1],
 'B2L11_HUMAN': [1, 1],
 'B2MG_HUMAN': [4, 

In [73]:
gene_entries = list(gene_entry_dict.values())
len(gene_entries)

264

In [74]:
df = u.get_df(gene_entries)
df

INFO:root:fetching information from uniprot for 262 entries
INFO:root:uniprot.get_df 1/2
INFO:root:uniprot.get_df 2/2
INFO:root:uniprot.get_df 3/2


Unnamed: 0,Entry,Entry name,Gene names,Gene names (primary ),Gene names (synonym ),Gene names (ordered locus ),Gene names (ORF ),Organism,Organism ID,Protein names,...,Miscellaneous [CC],Keywords,Protein existence,Status,Sequence annotation (Features),Protein families,Version,Comments,Cross-reference (null),Pathway.1
0,P35222,CTNB1_HUMAN,[CTNNB1 CTNNB OK/SW-cl.35 PRO2286],CTNNB1,CTNNB,,OK/SW-cl.35 PRO2286,Homo sapiens (Human),9606,Catenin beta-1 (Beta-catenin),...,,"[3D-structure, Acetylation, Activator, Alterna...",Evidence at protein level,reviewed,,[Beta-catenin family],216,"[Alternative products (1), Caution (1), Functi...",,
1,Q12809,KCNH2_HUMAN,[KCNH2 ERG ERG1 HERG],KCNH2,ERG ERG1 HERG,,,Homo sapiens (Human),9606,Potassium voltage-gated channel subfamily H me...,...,,"[3D-structure, Alternative splicing, Cell memb...",Evidence at protein level,reviewed,,"[Potassium channel family, H (Eag) (TC 1.A.1.2...",199,"[Alternative products (1), Caution (3), Domain...",,
2,Q12888,TP53B_HUMAN,[TP53BP1],TP53BP1,,,,Homo sapiens (Human),9606,TP53-binding protein 1 (53BP1) (p53-binding pr...,...,,"[3D-structure, Activator, Alternative splicing...",Evidence at protein level,reviewed,,[],190,"[Alternative products (1), Caution (2), Domain...",,
3,P46531,NOTC1_HUMAN,[NOTCH1 TAN1],NOTCH1,TAN1,,,Homo sapiens (Human),9606,Neurogenic locus notch homolog protein 1 (Notc...,...,,"[3D-structure, ANK repeat, Activator, Angiogen...",Evidence at protein level,reviewed,,[NOTCH family],211,"[Function (1), Involvement in disease (2), Pos...",,
4,P42336,PK3CA_HUMAN,[PIK3CA],PIK3CA,,,,Homo sapiens (Human),9606,"Phosphatidylinositol 4,5-bisphosphate 3-kinase...",...,MISCELLANEOUS: The avian sarcoma virus 16 geno...,"[3D-structure, ATP-binding, Angiogenesis, Comp...",Evidence at protein level,reviewed,,[PI3/PI4-kinase family],189,"[Catalytic activity (2), Domain (1), Function ...",,
5,P11362,FGFR1_HUMAN,[FGFR1 BFGFR CEK FGFBR FLG FLT2 HBGFR],FGFR1,BFGFR CEK FGFBR FLG FLT2 HBGFR,,,Homo sapiens (Human),9606,Fibroblast growth factor receptor 1 (FGFR-1) (...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Tyr protein kinas...",233,"[Alternative products (1), Catalytic activity ...",,
6,Q09472,EP300_HUMAN,[EP300 P300],EP300,P300,,,Homo sapiens (Human),9606,Histone acetyltransferase p300 (p300 HAT) (EC ...,...,,"[3D-structure, Acetylation, Acyltransferase, B...",Evidence at protein level,reviewed,,[],222,"[Catalytic activity (1), Domain (1), Function ...",,
7,P28482,MK01_HUMAN,[MAPK1 ERK2 PRKM1 PRKM2],MAPK1,ERK2 PRKM1 PRKM2,,,Homo sapiens (Human),9606,Mitogen-activated protein kinase 1 (MAP kinase...,...,,"[3D-structure, ATP-binding, Acetylation, Alter...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, CMGC Ser/Thr prot...",205,"[Alternative products (1), Catalytic activity ...",,
8,P42771,CDN2A_HUMAN,[CDKN2A CDKN2 MTS1],CDKN2A,CDKN2 MTS1,,,Homo sapiens (Human),9606,Cyclin-dependent kinase inhibitor 2A (Cyclin-d...,...,,"[3D-structure, ANK repeat, Acetylation, Altern...",Evidence at protein level,reviewed,,[CDKN2 cyclin-dependent kinase inhibitor family],197,"[Alternative products (1), Caution (2), Functi...",,
9,P12830,CADH1_HUMAN,[CDH1 CDHE UVO],CDH1,CDHE UVO,,,Homo sapiens (Human),9606,Cadherin-1 (CAM 120/80) (Epithelial cadherin) ...,...,,"[3D-structure, Alternative splicing, Calcium, ...",Evidence at protein level,reviewed,,[],219,"[Alternative products (1), Domain (1), Functio...",,


In [75]:
df_new = df[df['Gene ontology (molecular function)'].notnull()] # don't consider genes with no molecular function


In [76]:
df_new['Gene ontology (molecular function)'] = df_new['Gene ontology (molecular function)'].apply(lambda x: x.split('; ')) #split functions based on ;

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [77]:
GO_terms_dict = dict(zip(df_new['Entry name'], df_new['Gene ontology (molecular function)']))

In [78]:
GO_terms_dict

{'1A02_HUMAN': ['beta-2-microglobulin binding [GO:0030881]',
  'peptide antigen binding [GO:0042605]',
  'receptor binding [GO:0005102]',
  'RNA binding [GO:0003723]',
  'TAP binding [GO:0046977]',
  'T cell receptor binding [GO:0042608]'],
 '1B07_HUMAN': ['peptide antigen binding [GO:0042605]',
  'receptor binding [GO:0005102]'],
 '1B14_HUMAN': ['peptide antigen binding [GO:0042605]',
  'TAP binding [GO:0046977]'],
 '2AAA_HUMAN': ['antigen binding [GO:0003823]',
  'protein heterodimerization activity [GO:0046982]',
  'protein phosphatase regulator activity [GO:0019888]',
  'protein serine/threonine phosphatase activity [GO:0004722]'],
 'ABL1_HUMAN': ['actin filament binding [GO:0051015]',
  'actin monomer binding [GO:0003785]',
  'ATP binding [GO:0005524]',
  'DNA binding [GO:0003677]',
  'ephrin receptor binding [GO:0046875]',
  'magnesium ion binding [GO:0000287]',
  'manganese ion binding [GO:0030145]',
  'mitogen-activated protein kinase binding [GO:0051019]',
  'nicotinate-nucleo

In [79]:
# Find most common GO terms to use as features
def flatten(l): # taken from https://stackoverflow.com/questions/33900770/most-frequent-values-in-a-dictionary
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, str): #replaced basestring with str for Python3
            for sub in flatten(el):
                yield sub
        else:
            yield el



In [142]:
All_GO_terms = list(flatten(GO_terms_dict.values()))
len(set(All_GO_terms))

561

In [143]:
# Initialization of the 2056 new features with 0's
for terms in All_GO_terms:
    train[terms] = 0

In [130]:
# looping through all classes and getting terms for each class
'''terms_per_class = defaultdict(list)
for entry, terms in GO_terms_dict.items():
    if entry in class_dict:
        gene_classes = class_dict[entry]
        for gene_class in gene_classes:
            terms_per_class[gene_class].extend(terms)
           
        
terms_per_class'''

In [127]:
# code if we want most commons
'''counter_dict = {}
for classes in terms_per_class:
    counter_dict[classes] = Counter(terms_per_class[classes]).most_common(50)'''


In [144]:
# adds the molecular function GO terms to each gene in train data frame
for i in train.index:
    gene = train.Gene[i]
    gene_entry = gene_entry_dict[gene]
    if gene_entry in GO_terms_dict:
        GO_terms = GO_terms_dict[gene_entry]
        train.loc[i, GO_terms] = 1

train.shape

(3321, 565)

In [145]:
train

Unnamed: 0,ID,Gene,Variation,Class,alpha-catenin binding [GO:0045294],androgen receptor binding [GO:0050681],cadherin binding [GO:0045296],chromatin binding [GO:0003682],disordered domain specific binding [GO:0097718],enzyme binding [GO:0019899],...,serine-type peptidase activity [GO:0008236],nuclear localization sequence binding [GO:0008139],cytokine binding [GO:0019955],stem cell factor receptor activity [GO:0005020],adenylate kinase activity [GO:0004017],double-stranded telomeric DNA binding [GO:0003691],G-quadruplex DNA binding [GO:0051880],single-stranded telomeric DNA binding [GO:0043047],importin-alpha family protein binding [GO:0061676],RING-like zinc finger domain binding [GO:0071535]
0,0,FAM58A,Truncating Mutations,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,CBL,W802*,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,CBL,Q249E,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,CBL,N454D,3,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,CBL,L399V,4,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,5,CBL,V391I,4,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,6,CBL,V430M,5,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,7,CBL,Deletion,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,8,CBL,Y371H,4,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,9,CBL,C384R,4,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [147]:
train.to_csv("uniprot_result/train_uniprot.csv",index=False)