# Retrieve most common GO terms for gene from the Uniprot database

In [1]:
import numpy as np
import pandas as pd
import string
import os
from collections import Counter

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re
from bioservices import *
import collections
%pylab inline --no-import-all

Populating the interactive namespace from numpy and matplotlib


In [2]:
train = pd.read_csv('..//bases/training_variants')
test = pd.read_csv('..//bases/test_variants')

In [3]:
data_all = pd.concat((train, test), axis=0, ignore_index=True)

In [4]:
all_genes = set(data_all.Gene)
print(len(all_genes))
print(all_genes)

1507
{'GFI1', 'SLC4A4', 'GJB3', 'SUGCT', 'KDM5C', 'MTOR', 'IYD', 'TNFRSF13B', 'LRP2', 'CAV3', 'HABP2', 'DHCR7', 'PRKAG2', 'DOCK8', 'GCKR', 'NKX2-1', 'RBM20', 'ADAMTS10', 'CHMP4B', 'OAT', 'C6', 'POU3F4', 'RTN4R', 'XPO1', 'HFE2', 'SYNE1', 'COL4A5', 'ZNF41', 'CD207', 'HMGCS2', 'GUCA1B', 'SERPINF2', 'CISD2', 'CNNM4', 'CCND3', 'ERF', 'IL7R', 'CDKN2A', 'COCH', 'B4GALT7', 'CD96', 'SLC25A20', 'PMM2', 'ATRX', 'SLC6A5', 'ARAF', 'CNGB1', 'ZNF592', 'ANTXR1', 'RBBP8', 'MMAA', 'RRAS2', 'MMP13', 'CLCN2', 'SLC16A1', 'DARS2', 'BCOR', 'OTOF', 'MLPH', 'RAPSN', 'DNM1L', 'ARFGEF2', 'KRT81', 'AMT', 'TSPAN7', 'AICDA', 'ETHE1', 'TRMU', 'GPD1L', 'STAT3', 'PDSS1', 'CYBB', 'ZFYVE27', 'ARL13B', 'KCNQ2', 'MINPP1', 'SDHB', 'AIPL1', 'CASR', 'FGFR2', 'PIP5K1C', 'MATN3', 'XPC', 'ITGA2B', 'DGUOK', 'MMAB', 'TLR3', 'CCNE1', 'HEXA', 'AKT3', 'CPT1A', 'TET2', 'FBLN5', 'APCDD1', 'TSEN2', 'SEPT9', 'ATR', 'FRMD7', 'BMP15', 'AURKC', 'MYCN', 'EHMT1', 'CDK8', 'SPG7', 'KRT12', 'DIS3', 'TRIP11', 'ACVRL1', 'APC', 'ATP7B', 'SPOP', 'C

In [5]:
u = UniProt()

In [6]:
res = u.search("ZAP70_HUMAN")
print(res)

Entry	Entry name	Status	Protein names	Gene names	Organism	Length
P43403	ZAP70_HUMAN	reviewed	Tyrosine-protein kinase ZAP-70 (EC 2.7.10.2) (70 kDa zeta-chain associated protein) (Syk-related tyrosine kinase)	ZAP70 SRK	Homo sapiens (Human)	619



In [7]:
u.debugLevel = "INFO"
u.timeout = 100   # some queries are long and requires much more time; default is 1000 seconds

In [8]:
# just an example of query
a = u.search('SLC16A1+AND+organism:9606', frmt='tab', limit=1,
               columns="entry name")

In [9]:
[s.strip() for s in a.splitlines()]

['Entry name', 'MOT1_HUMAN']

In [10]:
gene_entry_dict = {}
class_dict = {}
for gene in all_genes:
    keyword = 'gene:%s+AND+organism:9606' %gene #to query database, with gene and organism 9606 is Homo Sapien (human)
    entry_name_tab = u.search(keyword, frmt='tab', limit=1, columns="entry name") 
    entry_name = [s.strip() for s in entry_name_tab.splitlines()][1] # gets the entry name = in second position in list
    gene_entry_dict[gene] = entry_name

In [11]:
gene_entry_dict

{'GFI1': 'GFI1_HUMAN',
 'SLC4A4': 'S4A4_HUMAN',
 'GJB3': 'CXB3_HUMAN',
 'SUGCT': 'SUCHY_HUMAN',
 'KDM5C': 'KDM5C_HUMAN',
 'MTOR': 'MTOR_HUMAN',
 'IYD': 'IYD1_HUMAN',
 'TNFRSF13B': 'TR13B_HUMAN',
 'LRP2': 'LRP2_HUMAN',
 'CAV3': 'CAV3_HUMAN',
 'HABP2': 'HABP2_HUMAN',
 'DHCR7': 'DHCR7_HUMAN',
 'PRKAG2': 'AAKG2_HUMAN',
 'DOCK8': 'DOCK8_HUMAN',
 'GCKR': 'GCKR_HUMAN',
 'NKX2-1': 'NKX21_HUMAN',
 'RBM20': 'RBM20_HUMAN',
 'ADAMTS10': 'ATS10_HUMAN',
 'CHMP4B': 'CHM4B_HUMAN',
 'OAT': 'S22A6_HUMAN',
 'C6': 'BRCC3_HUMAN',
 'POU3F4': 'PO3F4_HUMAN',
 'RTN4R': 'RTN4R_HUMAN',
 'XPO1': 'XPO1_HUMAN',
 'HFE2': 'RGMC_HUMAN',
 'SYNE1': 'SYNE1_HUMAN',
 'COL4A5': 'CO4A5_HUMAN',
 'ZNF41': 'ZNF41_HUMAN',
 'CD207': 'CLC4K_HUMAN',
 'HMGCS2': 'HMCS2_HUMAN',
 'GUCA1B': 'GUC1B_HUMAN',
 'SERPINF2': 'A2AP_HUMAN',
 'CISD2': 'CISD2_HUMAN',
 'CNNM4': 'CNNM4_HUMAN',
 'CCND3': 'CCND3_HUMAN',
 'ERF': 'TISB_HUMAN',
 'IL7R': 'IL7RA_HUMAN',
 'CDKN2A': 'CDN2A_HUMAN',
 'COCH': 'COCH_HUMAN',
 'B4GALT7': 'B4GT7_HUMAN',
 'CD96': 'T

In [12]:
gene_entries = list(gene_entry_dict.values())
len(gene_entries)

1507

In [13]:
df = u.get_df(gene_entries)
df

INFO:root:fetching information from uniprot for 1499 entries
INFO:root:uniprot.get_df 1/14
INFO:root:uniprot.get_df 2/14
INFO:root:uniprot.get_df 3/14
INFO:root:uniprot.get_df 4/14
INFO:root:uniprot.get_df 5/14
INFO:root:uniprot.get_df 6/14
INFO:root:uniprot.get_df 7/14
INFO:root:uniprot.get_df 8/14
INFO:root:uniprot.get_df 9/14
INFO:root:uniprot.get_df 10/14
INFO:root:uniprot.get_df 11/14
INFO:root:uniprot.get_df 12/14
INFO:root:uniprot.get_df 13/14
INFO:root:uniprot.get_df 14/14
INFO:root:uniprot.get_df 15/14


Unnamed: 0,Entry,Entry name,Gene names,Gene names (primary ),Gene names (synonym ),Gene names (ordered locus ),Gene names (ORF ),Organism,Organism ID,Protein names,...,Miscellaneous [CC],Keywords,Protein existence,Status,Sequence annotation (Features),Protein families,Version,Comments,Cross-reference (null),Pathway.1
0,Q12809,KCNH2_HUMAN,[KCNH2 ERG ERG1 HERG],KCNH2,ERG ERG1 HERG,,,Homo sapiens (Human),9606,Potassium voltage-gated channel subfamily H me...,...,,"[3D-structure, Alternative splicing, Cell memb...",Evidence at protein level,reviewed,,"[Potassium channel family, H (Eag) (TC 1.A.1.2...",199,"[Alternative products (1), Caution (3), Domain...",,
1,O00429,DNM1L_HUMAN,[DNM1L DLP1 DRP1],DNM1L,DLP1 DRP1,,,Homo sapiens (Human),9606,Dynamin-1-like protein (EC 3.6.5.5) (Dnm1p/Vps...,...,,"[3D-structure, Acetylation, Alternative splici...",Evidence at protein level,reviewed,,"[TRAFAC class dynamin-like GTPase superfamily,...",160,"[Alternative products (1), Catalytic activity ...",,
2,P35670,ATP7B_HUMAN,[ATP7B PWD WC1 WND],ATP7B,PWD WC1 WND,,,Homo sapiens (Human),9606,Copper-transporting ATPase 2 (EC 3.6.3.54) (Co...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,[Cation transport ATPase (P-type) (TC 3.A.3) f...,198,"[Alternative products (1), Catalytic activity ...",,
3,P11362,FGFR1_HUMAN,[FGFR1 BFGFR CEK FGFBR FLG FLT2 HBGFR],FGFR1,BFGFR CEK FGFBR FLG FLT2 HBGFR,,,Homo sapiens (Human),9606,Fibroblast growth factor receptor 1 (FGFR-1) (...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Tyr protein kinas...",233,"[Alternative products (1), Catalytic activity ...",,
4,P00740,FA9_HUMAN,[F9],F9,,,,Homo sapiens (Human),9606,Coagulation factor IX (EC 3.4.21.22) (Christma...,...,"MISCELLANEOUS: In 1952, one of the earliest re...","[3D-structure, Alternative splicing, Blood coa...",Evidence at protein level,reviewed,,[Peptidase S1 family],235,"[Alternative products (1), Catalytic activity ...",,
5,P04839,CY24B_HUMAN,[CYBB NOX2],CYBB,NOX2,,,Homo sapiens (Human),9606,Cytochrome b-245 heavy chain (EC 1.-.-.-) (CGD...,...,,"[3D-structure, Cell membrane, Chronic granulom...",Evidence at protein level,reviewed,,[],194,"[Cofactor (1), Function (1), Involvement in di...",,
6,Q9BZS1,FOXP3_HUMAN,[FOXP3 IPEX JM2],FOXP3,IPEX,,JM2,Homo sapiens (Human),9606,Forkhead box protein P3 (Scurfin) [Cleaved int...,...,,"[3D-structure, Acetylation, Activator, Alterna...",Evidence at protein level,reviewed,,[],174,"[Alternative products (1), Caution (1), Domain...",,
7,Q9NVV9,THAP1_HUMAN,[THAP1],THAP1,,,,Homo sapiens (Human),9606,THAP domain-containing protein 1,...,,"[3D-structure, Alternative splicing, Cell cycl...",Evidence at protein level,reviewed,,[THAP1 family],138,"[Alternative products (1), Caution (1), Functi...",,
8,P04049,RAF1_HUMAN,[RAF1 RAF],RAF1,RAF,,,Homo sapiens (Human),9606,RAF proto-oncogene serine/threonine-protein ki...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, TKL Ser/Thr prote...",214,"[Alternative products (1), Catalytic activity ...",,
9,Q12791,KCMA1_HUMAN,[KCNMA1 KCNMA SLO],KCNMA1,KCNMA SLO,,,Homo sapiens (Human),9606,Calcium-activated potassium channel subunit al...,...,MISCELLANEOUS: The protein was initially thoug...,"[3D-structure, Alternative splicing, Calcium, ...",Evidence at protein level,reviewed,,"[Potassium channel family, Calcium-activated (...",174,"[Alternative products (1), Caution (1), Domain...",,


In [15]:
df_new = df[df['Gene ontology (molecular function)'].notnull()] # don't consider genes with no molecular function


In [16]:
df_new['Gene ontology (molecular function)'] = df_new['Gene ontology (molecular function)'].apply(lambda x: x.split('; ')) #split functions based on ;

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [17]:
GO_terms_dict = dict(zip(df_new['Entry name'], df_new['Gene ontology (molecular function)']))

In [18]:
GO_terms_dict

{'KCNH2_HUMAN': ['C3HC4-type RING finger domain binding [GO:0055131]',
  'delayed rectifier potassium channel activity [GO:0005251]',
  'identical protein binding [GO:0042802]',
  'inward rectifier potassium channel activity [GO:0005242]',
  'phosphorelay sensor kinase activity [GO:0000155]',
  'protein homodimerization activity [GO:0042803]',
  'scaffold protein binding [GO:0097110]',
  'ubiquitin protein ligase binding [GO:0031625]',
  'voltage-gated potassium channel activity [GO:0005249]',
  'voltage-gated potassium channel activity involved in cardiac muscle cell action potential repolarization [GO:0086008]',
  'voltage-gated potassium channel activity involved in ventricular cardiac muscle cell action potential repolarization [GO:1902282]'],
 'DNM1L_HUMAN': ['BH2 domain binding [GO:0051433]',
  'clathrin binding [GO:0030276]',
  'GTPase activator activity [GO:0005096]',
  'GTPase activity [GO:0003924]',
  'GTP binding [GO:0005525]',
  'GTP-dependent protein binding [GO:0030742]',

In [19]:
# Find most common GO terms to use as features
def flatten(l): # taken from https://stackoverflow.com/questions/33900770/most-frequent-values-in-a-dictionary
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, str): #replaced basestring with str for Python3
            for sub in flatten(el):
                yield sub
        else:
            yield el


In [20]:
All_GO_terms = list(flatten(GO_terms_dict.values()))
len(All_GO_terms)

7062

In [21]:
GO_terms_count = collections.Counter(All_GO_terms) 
GO_most_common = GO_terms_count.most_common(30)

In [22]:
GO_most_common

[('ATP binding [GO:0005524]', 225),
 ('metal ion binding [GO:0046872]', 203),
 ('protein homodimerization activity [GO:0042803]', 149),
 ('identical protein binding [GO:0042802]', 138),
 ('DNA binding [GO:0003677]', 122),
 ('transcription factor activity, sequence-specific DNA binding [GO:0003700]',
  99),
 ('RNA binding [GO:0003723]', 91),
 ('zinc ion binding [GO:0008270]', 82),
 ('protein heterodimerization activity [GO:0046982]', 79),
 ('calcium ion binding [GO:0005509]', 75),
 ('protein kinase binding [GO:0019901]', 71),
 ('chromatin binding [GO:0003682]', 65),
 ('transcription factor binding [GO:0008134]', 64),
 ('enzyme binding [GO:0019899]', 63),
 ('RNA polymerase II core promoter proximal region sequence-specific DNA binding [GO:0000978]',
  63),
 ('transcriptional activator activity, RNA polymerase II core promoter proximal region sequence-specific binding [GO:0001077]',
  56),
 ('ubiquitin protein ligase binding [GO:0031625]', 53),
 ('sequence-specific DNA binding [GO:0043565

In [23]:
#adding those features as dummy variables on data set
features_list = []
for common in GO_most_common:
    term = common[0]
    features_list.append(term)
    data_all[term] = 0

data_all

Unnamed: 0,Class,Gene,ID,Variation,ATP binding [GO:0005524],metal ion binding [GO:0046872],protein homodimerization activity [GO:0042803],identical protein binding [GO:0042802],DNA binding [GO:0003677],"transcription factor activity, sequence-specific DNA binding [GO:0003700]",...,receptor binding [GO:0005102],transcription coactivator activity [GO:0003713],protein C-terminus binding [GO:0008022],protein complex binding [GO:0032403],protein domain specific binding [GO:0019904],GTP binding [GO:0005525],protein tyrosine kinase activity [GO:0004713],protein kinase activity [GO:0004672],cadherin binding [GO:0045296],GTPase activity [GO:0003924]
0,1.0,FAM58A,0,Truncating Mutations,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,CBL,1,W802*,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2.0,CBL,2,Q249E,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3.0,CBL,3,N454D,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4.0,CBL,4,L399V,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4.0,CBL,5,V391I,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,5.0,CBL,6,V430M,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1.0,CBL,7,Deletion,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,4.0,CBL,8,Y371H,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,4.0,CBL,9,C384R,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
GO_terms_dict_filtered = {}

for entry, terms in GO_terms_dict.items():
    GO_terms_dict_filtered[entry] = list(set(terms).intersection(features_list)) #only keeps elements from feature_list


In [25]:
GO_terms_dict_filtered

{'KCNH2_HUMAN': ['identical protein binding [GO:0042802]',
  'protein homodimerization activity [GO:0042803]',
  'ubiquitin protein ligase binding [GO:0031625]'],
 'DNM1L_HUMAN': ['protein complex binding [GO:0032403]',
  'protein homodimerization activity [GO:0042803]',
  'identical protein binding [GO:0042802]',
  'GTPase activity [GO:0003924]',
  'ubiquitin protein ligase binding [GO:0031625]',
  'GTP binding [GO:0005525]'],
 'ATP7B_HUMAN': ['ATP binding [GO:0005524]'],
 'FGFR1_HUMAN': ['identical protein binding [GO:0042802]',
  'protein tyrosine kinase activity [GO:0004713]',
  'protein homodimerization activity [GO:0042803]',
  'ATP binding [GO:0005524]'],
 'FA9_HUMAN': ['calcium ion binding [GO:0005509]'],
 'CY24B_HUMAN': ['metal ion binding [GO:0046872]',
  'protein heterodimerization activity [GO:0046982]'],
 'FOXP3_HUMAN': ['transcription factor activity, sequence-specific DNA binding [GO:0003700]',
  'protein homodimerization activity [GO:0042803]',
  'sequence-specific DNA 

In [31]:
feature_dict = {}
for gene in all_genes:
    entry = gene_entry_dict[gene]
    if entry in GO_terms_dict_filtered: 
        GO_terms = GO_terms_dict_filtered[entry]
        feature_dict[gene] = GO_terms

In [32]:
feature_dict

{'GFI1': ['transcription regulatory region DNA binding [GO:0044212]',
  'metal ion binding [GO:0046872]'],
 'SLC4A4': [],
 'GJB3': [],
 'SUGCT': [],
 'KDM5C': ['zinc ion binding [GO:0008270]', 'DNA binding [GO:0003677]'],
 'MTOR': ['protein serine/threonine kinase activity [GO:0004674]',
  'protein domain specific binding [GO:0019904]',
  'protein kinase binding [GO:0019901]',
  'protein kinase activity [GO:0004672]',
  'ATP binding [GO:0005524]'],
 'IYD': [],
 'TNFRSF13B': [],
 'LRP2': ['calcium ion binding [GO:0005509]'],
 'CAV3': ['protein complex binding [GO:0032403]',
  'protein C-terminus binding [GO:0008022]'],
 'HABP2': ['calcium ion binding [GO:0005509]'],
 'DHCR7': [],
 'PRKAG2': ['ATP binding [GO:0005524]', 'protein kinase binding [GO:0019901]'],
 'DOCK8': [],
 'GCKR': [],
 'NKX2-1': ['transcription regulatory region DNA binding [GO:0044212]',
  'enzyme binding [GO:0019899]',
  'transcription factor activity, sequence-specific DNA binding [GO:0003700]',
  'DNA binding [GO:00

In [33]:
for i in data_all.index:
    print(i)
    gene = data_all.Gene[i]
    if gene in feature_dict:
        GO_terms = feature_dict[gene]
        data_all.loc[i, GO_terms] = 1

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

3498
3499
3500
3501
3502
3503
3504
3505
3506
3507
3508
3509
3510
3511
3512
3513
3514
3515
3516
3517
3518
3519
3520
3521
3522
3523
3524
3525
3526
3527
3528
3529
3530
3531
3532
3533
3534
3535
3536
3537
3538
3539
3540
3541
3542
3543
3544
3545
3546
3547
3548
3549
3550
3551
3552
3553
3554
3555
3556
3557
3558
3559
3560
3561
3562
3563
3564
3565
3566
3567
3568
3569
3570
3571
3572
3573
3574
3575
3576
3577
3578
3579
3580
3581
3582
3583
3584
3585
3586
3587
3588
3589
3590
3591
3592
3593
3594
3595
3596
3597
3598
3599
3600
3601
3602
3603
3604
3605
3606
3607
3608
3609
3610
3611
3612
3613
3614
3615
3616
3617
3618
3619
3620
3621
3622
3623
3624
3625
3626
3627
3628
3629
3630
3631
3632
3633
3634
3635
3636
3637
3638
3639
3640
3641
3642
3643
3644
3645
3646
3647
3648
3649
3650
3651
3652
3653
3654
3655
3656
3657
3658
3659
3660
3661
3662
3663
3664
3665
3666
3667
3668
3669
3670
3671
3672
3673
3674
3675
3676
3677
3678
3679
3680
3681
3682
3683
3684
3685
3686
3687
3688
3689
3690
3691
3692
3693
3694
3695
3696
3697


6774
6775
6776
6777
6778
6779
6780
6781
6782
6783
6784
6785
6786
6787
6788
6789
6790
6791
6792
6793
6794
6795
6796
6797
6798
6799
6800
6801
6802
6803
6804
6805
6806
6807
6808
6809
6810
6811
6812
6813
6814
6815
6816
6817
6818
6819
6820
6821
6822
6823
6824
6825
6826
6827
6828
6829
6830
6831
6832
6833
6834
6835
6836
6837
6838
6839
6840
6841
6842
6843
6844
6845
6846
6847
6848
6849
6850
6851
6852
6853
6854
6855
6856
6857
6858
6859
6860
6861
6862
6863
6864
6865
6866
6867
6868
6869
6870
6871
6872
6873
6874
6875
6876
6877
6878
6879
6880
6881
6882
6883
6884
6885
6886
6887
6888
6889
6890
6891
6892
6893
6894
6895
6896
6897
6898
6899
6900
6901
6902
6903
6904
6905
6906
6907
6908
6909
6910
6911
6912
6913
6914
6915
6916
6917
6918
6919
6920
6921
6922
6923
6924
6925
6926
6927
6928
6929
6930
6931
6932
6933
6934
6935
6936
6937
6938
6939
6940
6941
6942
6943
6944
6945
6946
6947
6948
6949
6950
6951
6952
6953
6954
6955
6956
6957
6958
6959
6960
6961
6962
6963
6964
6965
6966
6967
6968
6969
6970
6971
6972
6973


In [40]:
print(data_all[data_all['protein kinase binding [GO:0019901]']==1])

      Class     Gene    ID             Variation  ATP binding [GO:0005524]  \
67      7.0     RHEB    67                  Y35C                         0   
68      7.0     RHEB    68                  Y35N                         0   
69      7.0     RHEB    69                  Y35H                         0   
71      2.0    CCND2    71         Amplification                         0   
75      2.0    CCND3    75         Amplification                         0   
76      2.0    CCND3    76                 P284L                         0   
77      7.0    CCND3    77                 Q276*                         0   
78      7.0    CCND3    78                 T286A                         0   
79      7.0    CCND3    79                 T283A                         0   
80      2.0    CCND3    80                 I290A                         0   
81      2.0    CCND3    81                 I290R                         0   
86      7.0    CCNE1    86         Amplification                

[895 rows x 34 columns]
