# Retrieve GO terms for gene from the Uniprot database based on the classes

In [38]:
import numpy as np
import pandas as pd
import string
import os
from collections import Counter
from collections import defaultdict

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re
from bioservices import *
import collections
%pylab inline --no-import-all

Populating the interactive namespace from numpy and matplotlib


In [39]:
train = pd.read_csv('..//bases/training_variants')
test = pd.read_csv('..//bases/test_variants')

In [40]:
# only use gene from train data -> contains the classes
all_genes = set(train.Gene)
print(len(all_genes))
print(all_genes)

264
{'RAD51D', 'SRSF2', 'DNMT3A', 'EIF1AX', 'CEBPA', 'KEAP1', 'MEN1', 'EWSR1', 'ARID1B', 'HRAS', 'FOXA1', 'SPOP', 'MYOD1', 'BTK', 'AKT3', 'XRCC2', 'TCF3', 'CCND2', 'EPCAM', 'BRCA2', 'PRDM1', 'PTPRD', 'IGF1R', 'BCL2L11', 'FGFR3', 'TSC2', 'CDK8', 'ARID2', 'SRC', 'RBM10', 'RAD51C', 'ERCC2', 'MAP2K1', 'PIK3CD', 'ALK', 'CARM1', 'FLT3', 'PMS2', 'CARD11', 'IKBKE', 'IL7R', 'FGFR4', 'GNA11', 'MYCN', 'PAX8', 'SDHB', 'STAG2', 'RRAS2', 'PIK3CB', 'SOS1', 'TMPRSS2', 'CDK12', 'JUN', 'ARID5B', 'STAT3', 'GNAQ', 'NCOR1', 'SMARCA4', 'HIST1H1C', 'NRAS', 'ARAF', 'CDKN1B', 'RXRA', 'BRCA1', 'SF3B1', 'ERBB2', 'FANCA', 'RAD51B', 'PDGFRB', 'RHOA', 'INPP4B', 'DDR2', 'NKX2-1', 'BAP1', 'FAM58A', 'SHOC2', 'MAP2K4', 'AXL', 'CIC', 'CTNNB1', 'PIK3R2', 'MGA', 'EP300', 'PIK3R1', 'TERT', 'VHL', 'ASXL1', 'TCF7L2', 'AKT2', 'NTRK3', 'H3F3A', 'SMAD2', 'NF1', 'KNSTRN', 'ROS1', 'BCOR', 'RAB35', 'RB1', 'CDKN2C', 'ERBB3', 'AR', 'CASP8', 'CDKN2B', 'JAK1', 'EPAS1', 'ETV1', 'DICER1', 'MPL', 'BCL10', 'MSH6', 'KMT2A', 'NUP93', 'FOXL2

In [41]:
u = UniProt()

In [42]:
res = u.search("ZAP70_HUMAN")
print(res)

Entry	Entry name	Status	Protein names	Gene names	Organism	Length
P43403	ZAP70_HUMAN	reviewed	Tyrosine-protein kinase ZAP-70 (EC 2.7.10.2) (70 kDa zeta-chain associated protein) (Syk-related tyrosine kinase)	ZAP70 SRK	Homo sapiens (Human)	619



In [43]:
u.debugLevel = "INFO"
u.timeout = 100   # some queries are long and requires much more time; default is 1000 seconds

In [44]:
# just an example of query
a = u.search('SLC16A1+AND+organism:9606', frmt='tab', limit=1,
               columns="entry name")

In [45]:
[s.strip() for s in a.splitlines()]

['Entry name', 'MOT1_HUMAN']

In [46]:
gene_entry_dict = {} # here we will keep the gene_entries together with their classes
class_dict = {}
for gene in all_genes:
    gene_classes = list(train.Class[train.Gene==gene])
    keyword = 'gene:%s+AND+organism:9606' %gene #to query database, with gene and organism 9606 is Homo Sapien (human)
    entry_name_tab = u.search(keyword, frmt='tab', limit=1, columns="entry name") 
    entry_name = [s.strip() for s in entry_name_tab.splitlines()][1] # gets the entry name from uniprot i.e. second position in tab
    gene_entry_dict[gene] = entry_name
    class_dict[entry_name] = gene_classes

In [47]:
gene_entry_dict

{'ABL1': 'ABL1_HUMAN',
 'ACVR1': 'ACVR1_HUMAN',
 'AGO2': 'AGO2_HUMAN',
 'AKT1': 'AKT1_HUMAN',
 'AKT2': 'AKT2_HUMAN',
 'AKT3': 'AKT3_HUMAN',
 'ALK': 'TGFR1_HUMAN',
 'APC': 'APC_HUMAN',
 'AR': 'ANDR_HUMAN',
 'ARAF': 'ARAF_HUMAN',
 'ARID1A': 'ARI1A_HUMAN',
 'ARID1B': 'ARI1B_HUMAN',
 'ARID2': 'ARID2_HUMAN',
 'ARID5B': 'ARI5B_HUMAN',
 'ASXL1': 'ASXL1_HUMAN',
 'ASXL2': 'ASXL2_HUMAN',
 'ATM': 'ATM_HUMAN',
 'ATR': 'ATR_HUMAN',
 'ATRX': 'ATRX_HUMAN',
 'AURKA': 'AURKA_HUMAN',
 'AURKB': 'AURKB_HUMAN',
 'AXIN1': 'AXIN1_HUMAN',
 'AXL': 'UFO_HUMAN',
 'B2M': 'B2MG_HUMAN',
 'BAP1': 'RING2_HUMAN',
 'BARD1': 'BARD1_HUMAN',
 'BCL10': 'BCL10_HUMAN',
 'BCL2': 'BCL2_HUMAN',
 'BCL2L11': 'B2L11_HUMAN',
 'BCOR': 'BCOR_HUMAN',
 'BRAF': 'BRAF_HUMAN',
 'BRCA1': 'BRCA1_HUMAN',
 'BRCA2': 'BRCA2_HUMAN',
 'BRD4': 'BRD4_HUMAN',
 'BRIP1': 'FANCJ_HUMAN',
 'BTK': 'BTK_HUMAN',
 'CARD11': 'CAR11_HUMAN',
 'CARM1': 'CARM1_HUMAN',
 'CASP8': 'CASP8_HUMAN',
 'CBL': 'CBL_HUMAN',
 'CCND1': 'CCND1_HUMAN',
 'CCND2': 'CCND2_HUMAN',


In [48]:
gene_entries = list(gene_entry_dict.values())
len(gene_entries)

264

In [49]:
df = u.get_df(gene_entries)
df

INFO:root:fetching information from uniprot for 262 entries
INFO:root:uniprot.get_df 1/2
INFO:root:uniprot.get_df 2/2
INFO:root:uniprot.get_df 3/2


Unnamed: 0,Entry,Entry name,Gene names,Gene names (primary ),Gene names (synonym ),Gene names (ordered locus ),Gene names (ORF ),Organism,Organism ID,Protein names,...,Miscellaneous [CC],Keywords,Protein existence,Status,Sequence annotation (Features),Protein families,Version,Comments,Cross-reference (null),Pathway.1
0,P27986,P85A_HUMAN,[PIK3R1 GRB1],PIK3R1,GRB1,,,Homo sapiens (Human),9606,Phosphatidylinositol 3-kinase regulatory subun...,...,,"[3D-structure, Acetylation, Alternative splici...",Evidence at protein level,reviewed,,[PI3K p85 subunit family],214,"[Alternative products (1), Caution (1), Domain...",,
1,Q12809,KCNH2_HUMAN,[KCNH2 ERG ERG1 HERG],KCNH2,ERG ERG1 HERG,,,Homo sapiens (Human),9606,Potassium voltage-gated channel subfamily H me...,...,,"[3D-structure, Alternative splicing, Cell memb...",Evidence at protein level,reviewed,,"[Potassium channel family, H (Eag) (TC 1.A.1.2...",199,"[Alternative products (1), Caution (3), Domain...",,
2,P36897,TGFR1_HUMAN,[TGFBR1 ALK5 SKR4],TGFBR1,ALK5 SKR4,,,Homo sapiens (Human),9606,TGF-beta receptor type-1 (TGFR-1) (EC 2.7.11.3...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, TKL Ser/Thr prote...",203,"[Alternative products (1), Catalytic activity ...",,
3,P35968,VGFR2_HUMAN,[KDR FLK1 VEGFR2],KDR,FLK1 VEGFR2,,,Homo sapiens (Human),9606,Vascular endothelial growth factor receptor 2 ...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Tyr protein kinas...",200,"[Alternative products (1), Catalytic activity ...",,
4,P21802,FGFR2_HUMAN,[FGFR2 BEK KGFR KSAM],FGFR2,BEK KGFR KSAM,,,Homo sapiens (Human),9606,Fibroblast growth factor receptor 2 (FGFR-2) (...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Tyr protein kinas...",228,"[Alternative products (1), Catalytic activity ...",,
5,P31749,AKT1_HUMAN,[AKT1 PKB RAC],AKT1,PKB RAC,,,Homo sapiens (Human),9606,RAC-alpha serine/threonine-protein kinase (EC ...,...,,"[3D-structure, ATP-binding, Acetylation, Alter...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, AGC Ser/Thr prote...",216,"[Alternative products (1), Catalytic activity ...",,
6,Q06124,PTN11_HUMAN,[PTPN11 PTP2C SHPTP2],PTPN11,PTP2C SHPTP2,,,Homo sapiens (Human),9606,Tyrosine-protein phosphatase non-receptor type...,...,,"[3D-structure, Acetylation, Alternative splici...",Evidence at protein level,reviewed,,"[Protein-tyrosine phosphatase family, Non-rece...",213,"[Alternative products (1), Catalytic activity ...",,
7,P40692,MLH1_HUMAN,[MLH1 COCA2],MLH1,COCA2,,,Homo sapiens (Human),9606,DNA mismatch repair protein Mlh1 (MutL protein...,...,,"[3D-structure, ATP-binding, Acetylation, Alter...",Evidence at protein level,reviewed,,[DNA mismatch repair MutL/HexB family],200,"[Alternative products (1), Caution (1), Functi...",,
8,P63092,GNAS2_HUMAN,[GNAS GNAS1 GSP],GNAS,GNAS1 GSP,,,Homo sapiens (Human),9606,Guanine nucleotide-binding protein G(s) subuni...,...,MISCELLANEOUS: This protein is produced by a b...,"[3D-structure, ADP-ribosylation, Alternative s...",Evidence at protein level,reviewed,,"[G-alpha family, G(s) subfamily]",153,"[Alternative products (1), Caution (5), Functi...",,
9,P08069,IGF1R_HUMAN,[IGF1R],IGF1R,,,,Homo sapiens (Human),9606,Insulin-like growth factor 1 receptor (EC 2.7....,...,,"[3D-structure, ATP-binding, Cell membrane, Cle...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Tyr protein kinas...",207,"[Catalytic activity (1), Enzyme regulation (1)...",,


In [50]:
df_new = df[df['Gene ontology (molecular function)'].notnull()] # don't consider genes with no molecular function


In [51]:
df_new['Gene ontology (molecular function)'] = df_new['Gene ontology (molecular function)'].apply(lambda x: x.split('; ')) #split functions based on ;

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [52]:
GO_terms_dict = dict(zip(df_new['Entry name'], df_new['Gene ontology (molecular function)']))

In [53]:
GO_terms_dict

{'1A02_HUMAN': ['beta-2-microglobulin binding [GO:0030881]',
  'peptide antigen binding [GO:0042605]',
  'receptor binding [GO:0005102]',
  'RNA binding [GO:0003723]',
  'TAP binding [GO:0046977]',
  'T cell receptor binding [GO:0042608]'],
 '1B07_HUMAN': ['peptide antigen binding [GO:0042605]',
  'receptor binding [GO:0005102]'],
 '1B14_HUMAN': ['peptide antigen binding [GO:0042605]',
  'TAP binding [GO:0046977]'],
 '2AAA_HUMAN': ['antigen binding [GO:0003823]',
  'protein heterodimerization activity [GO:0046982]',
  'protein phosphatase regulator activity [GO:0019888]',
  'protein serine/threonine phosphatase activity [GO:0004722]'],
 'ABL1_HUMAN': ['actin filament binding [GO:0051015]',
  'actin monomer binding [GO:0003785]',
  'ATP binding [GO:0005524]',
  'DNA binding [GO:0003677]',
  'ephrin receptor binding [GO:0046875]',
  'magnesium ion binding [GO:0000287]',
  'manganese ion binding [GO:0030145]',
  'mitogen-activated protein kinase binding [GO:0051019]',
  'nicotinate-nucleo

In [54]:
# Find most common GO terms to use as features
def flatten(l): # taken from https://stackoverflow.com/questions/33900770/most-frequent-values-in-a-dictionary
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, str): #replaced basestring with str for Python3
            for sub in flatten(el):
                yield sub
        else:
            yield el



In [55]:
All_GO_terms = list(flatten(GO_terms_dict.values()))
len(set(All_GO_terms))

561

In [56]:
# Initialization of the 2056 new features with 0's
for terms in All_GO_terms:
    train[terms] = 0

In [21]:
# looping through all classes and getting terms for each class
'''terms_per_class = defaultdict(list)
for entry, terms in GO_terms_dict.items():
    if entry in class_dict:
        gene_classes = class_dict[entry]
        for gene_class in gene_classes:
            terms_per_class[gene_class].extend(terms)
           
        
terms_per_class'''

'terms_per_class = defaultdict(list)\nfor entry, terms in GO_terms_dict.items():\n    if entry in class_dict:\n        gene_classes = class_dict[entry]\n        for gene_class in gene_classes:\n            terms_per_class[gene_class].extend(terms)\n           \n        \nterms_per_class'

In [22]:
# code if we want most commons
'''counter_dict = {}
for classes in terms_per_class:
    counter_dict[classes] = Counter(terms_per_class[classes]).most_common(50)'''


'counter_dict = {}\nfor classes in terms_per_class:\n    counter_dict[classes] = Counter(terms_per_class[classes]).most_common(50)'

In [57]:
# adds the molecular function GO terms to each gene in train data frame
for i in train.index:
    gene = train.Gene[i]
    gene_entry = gene_entry_dict[gene]
    if gene_entry in GO_terms_dict:
        GO_terms = GO_terms_dict[gene_entry]
        train.loc[i, GO_terms] = 1

train.shape

(3321, 565)

In [22]:
train

Unnamed: 0,ID,Gene,Variation,Class,1-phosphatidylinositol-3-kinase activity [GO:0016303],1-phosphatidylinositol-3-kinase regulator activity [GO:0046935],ErbB-3 class receptor binding [GO:0043125],insulin binding [GO:0043559],insulin-like growth factor receptor binding [GO:0005159],insulin receptor binding [GO:0005158],...,alpha-actinin binding [GO:0051393],mRNA 5'-UTR binding [GO:0048027],protein kinase A binding [GO:0051018],"translation repressor activity, nucleic acid binding [GO:0000900]",RING-like zinc finger domain binding [GO:0071535],connexin binding [GO:0071253],growth factor receptor binding [GO:0070851],hormone receptor binding [GO:0051427],endodeoxyribonuclease activity [GO:0004520],TFIID-class transcription factor binding [GO:0001094]
0,0,FAM58A,Truncating Mutations,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,CBL,W802*,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,CBL,Q249E,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,CBL,N454D,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,CBL,L399V,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,5,CBL,V391I,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,6,CBL,V430M,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,7,CBL,Deletion,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,8,CBL,Y371H,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,9,CBL,C384R,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# saving the train set together with all features from uniprot
train.to_csv("train_uniprot.csv",index=False)

In [58]:
# loading the XGboost most important 190 features
feature_scores = np.load("features_ranking.npy")

In [59]:
features = []
for feature_score in feature_scores:
    feature = feature_score[0]
    features.append(feature)

In [66]:
features

['magnesium ion binding [GO:0000287]',
 'protein tyrosine kinase activity [GO:0004713]',
 'enzyme binding [GO:0019899]',
 'ATP binding [GO:0005524]',
 'kinase activity [GO:0016301]',
 'transcription corepressor activity [GO:0003714]',
 'protein C-terminus binding [GO:0008022]',
 'H3 histone acetyltransferase activity [GO:0010484]',
 'damaged DNA binding [GO:0003684]',
 'protein kinase activity [GO:0004672]',
 'zinc ion binding [GO:0008270]',
 'ATPase activity [GO:0016887]',
 'tubulin binding [GO:0015631]',
 'protein homodimerization activity [GO:0042803]',
 'protein heterodimerization activity [GO:0046982]',
 'protein kinase binding [GO:0019901]',
 'protein serine/threonine kinase activity [GO:0004674]',
 'ubiquitin-protein transferase activity [GO:0004842]',
 'identical protein binding [GO:0042802]',
 'protein complex binding [GO:0032403]',
 'GTPase activity [GO:0003924]',
 'RNA polymerase II core promoter sequence-specific DNA binding [GO:0000979]',
 'transmembrane receptor protein t

In [91]:
# adding only the 190 most important features from XGboost + the dummy variables of gene
train_features = train[features]
train_original = pd.read_csv('..//bases/training_variants')
train_dummy = pd.get_dummies(train_original.Gene) 
train_new = pd.concat([train_original, train_features, train_dummy], axis=1)
train_new.shape

(3321, 458)

In [93]:
train_new


0       1
1       0
2       0
3       0
4       0
5       0
6       0
7       0
8       0
9       0
10      0
11      0
12      0
13      0
14      0
15      0
16      0
17      0
18      0
19      0
20      0
21      0
22      0
23      0
24      0
25      0
26      0
27      0
28      0
29      0
       ..
3291    0
3292    0
3293    0
3294    0
3295    0
3296    0
3297    0
3298    0
3299    0
3300    0
3301    0
3302    0
3303    0
3304    0
3305    0
3306    0
3307    0
3308    0
3309    0
3310    0
3311    0
3312    0
3313    0
3314    0
3315    0
3316    0
3317    0
3318    0
3319    0
3320    0
Name: FAM58A, Length: 3321, dtype: uint8

In [None]:
# save train_new somewhere
