# Retrieve GO molecular-function terms from the UniProt database for the genes of the class-labelled variants

In [2]:
import numpy as np
import pandas as pd
import string
import os
from collections import Counter
from collections import defaultdict

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re
from bioservices import *
import collections
%pylab inline --no-import-all

Populating the interactive namespace from numpy and matplotlib


In [38]:
# Load the labelled training variants, the test variants, and the file holding
# class labels for a subset of the test rows.
train = pd.read_csv('..//..//bases/training_variants')
leaks = pd.read_csv('..//..//bases/soluce_oneclass.csv')
test = pd.read_csv('..//..//bases/test_variants')

# Materialise the selected test rows as an independent frame before adding a
# column: assigning into the bare `iloc` slice raises SettingWithCopyWarning.
# NOTE(review): rows are picked by *position* using leaks.ID; this assumes a
# test ID equals its row position -- confirm against the data.
test_leaks = test.iloc[leaks.ID].reset_index(drop=True).copy()
# Index-aligned assignment: both frames now carry a 0..n-1 index.
test_leaks['Class'] = leaks['class']

# Append the newly labelled test rows to the training rows.
train = pd.concat((train, test_leaks), axis=0, ignore_index=True)
train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0,ID,Gene,Variation,Class
0,0,FAM58A,Truncating Mutations,1
1,1,CBL,W802*,2
2,2,CBL,Q249E,2
3,3,CBL,N454D,3
4,4,CBL,L399V,4
5,5,CBL,V391I,4
6,6,CBL,V430M,5
7,7,CBL,Deletion,1
8,8,CBL,Y371H,4
9,9,CBL,C384R,4


In [39]:
# Gene symbols are collected from the labelled frame only, because that is the
# frame that carries the classes.
all_genes = {gene for gene in train['Gene']}
print(len(all_genes))
print(all_genes)

269
{'RICTOR', 'IDH1', 'PIK3CB', 'ATRX', 'DNMT3B', 'JAK1', 'GNAS', 'ERG', 'AXIN1', 'STAG2', 'NUP93', 'RAD51C', 'MED12', 'MAP2K1', 'PRDM1', 'RAD51B', 'INPP4B', 'MYCN', 'CHEK2', 'FANCC', 'YAP1', 'KNSTRN', 'CCNE1', 'TMPRSS2', 'H3F3A', 'CDK8', 'BRCA2', 'RXRA', 'CDK4', 'RHOA', 'JUN', 'CBL', 'ROS1', 'PMS1', 'SOS1', 'CDK6', 'HIST1H1C', 'IGF1R', 'CCND1', 'RARA', 'KDM6A', 'FOXO1', 'CDH1', 'SOX17', 'PAX8', 'CIC', 'LATS1', 'ARID2', 'KMT2C', 'PPM1D', 'RNF43', 'RIT1', 'AR', 'CARD11', 'CDKN1B', 'SRSF2', 'PDGFRA', 'FOXL2', 'FGFR3', 'BRD4', 'ERBB4', 'NTRK1', 'FGF19', 'IKZF1', 'SMO', 'MET', 'BRIP1', 'EZH2', 'FOXP1', 'MSH6', 'CDKN1A', 'HLA-B', 'FANCA', 'PIK3R2', 'MYD88', 'BCOR', 'CCND2', 'SPOP', 'STAT3', 'KMT2B', 'PIM1', 'FGF4', 'KIT', 'CDKN2A', 'SMARCB1', 'ABL1', 'VHL', 'SRC', 'SPEN', 'RAD51D', 'GNAQ', 'FGFR4', 'SDHB', 'MDM4', 'MTOR', 'KEAP1', 'BRCA1', 'AGO2', 'SDHC', 'RAC1', 'NCOR1', 'BARD1', 'TGFBR2', 'SMAD2', 'TSC2', 'GNA11', 'TCF7L2', 'FAT1', 'NKX2-1', 'ARAF', 'AKT2', 'DUSP4', 'RAB35', 'TERT', 'MPL

In [10]:
# UniProt client (name comes from the `from bioservices import *` star import);
# reused by every query below.
u = UniProt()

Creating directory C:\Users\Franck\AppData\Local\bioservices\bioservices\Cache 
Welcome to Bioservices
It looks like you do not have a configuration file.
We are creating one with default values in C:\Users\Franck\AppData\Local\bioservices\bioservices\bioservices.cfg .
Done


In [11]:
# Sanity check: look up a single known entry name and print the raw response.
res = u.search("ZAP70_HUMAN")
print(res)

Entry	Entry name	Status	Protein names	Gene names	Organism	Length
P43403	ZAP70_HUMAN	reviewed	Tyrosine-protein kinase ZAP-70 (EC 2.7.10.2) (70 kDa zeta-chain associated protein) (Syk-related tyrosine kinase)	ZAP70 SRK	Homo sapiens (Human)	619



In [12]:
# Client settings used for the queries below.
# NOTE(review): the previous comment said long queries need *more* time and that
# the default is 1000 seconds, which would make 100 a reduction -- confirm the
# actual default and unit in the bioservices documentation.
u.debugLevel = "INFO"
u.timeout = 100   # request timeout for slow queries (presumably seconds -- TODO confirm)

In [13]:
# Example query: ask for the entry name of the first hit for SLC16A1 restricted
# to organism 9606, returned as tab-separated text.
a = u.search('SLC16A1+AND+organism:9606', frmt='tab', limit=1,
               columns="entry name")

In [14]:
# Split the tab response into stripped lines: header first, then the hit.
[s.strip() for s in a.splitlines()]

['Entry name', 'MOT1_HUMAN']

In [15]:
# Map every gene symbol to a UniProt entry name and remember the classes seen
# for that entry.
#   gene_entry_dict : gene symbol        -> UniProt entry name
#   class_dict      : UniProt entry name -> list of classes of the mapped gene(s)
# NOTE(review): only the first hit of a `gene:` search is kept, and some
# recorded mappings look wrong (e.g. 'ALK' -> 'TGFR1_HUMAN' and
# 'BAP1' -> 'RING2_HUMAN' in the displayed dict) -- consider checking the hit's
# primary gene name before trusting the mapping.
gene_entry_dict = {}
class_dict = {}
unmapped_genes = []  # genes for which the search returned no usable hit
for gene in all_genes:
    gene_classes = list(train.loc[train.Gene == gene, 'Class'])
    # query by gene name, restricted to organism 9606 (Homo sapiens)
    keyword = 'gene:%s+AND+organism:9606' % gene
    entry_name_tab = u.search(keyword, frmt='tab', limit=1, columns="entry name")
    # Expected response: a header line followed by one line with the entry name.
    response_lines = ([s.strip() for s in entry_name_tab.splitlines()]
                      if isinstance(entry_name_tab, str) else [])
    if len(response_lines) < 2:
        # No hit (or a non-text response): record the gene instead of failing
        # with an IndexError in the middle of the loop.
        unmapped_genes.append(gene)
        continue
    entry_name = response_lines[1]
    gene_entry_dict[gene] = entry_name
    # Several genes can resolve to the same entry; accumulate their classes
    # rather than letting the last gene overwrite the earlier ones.
    class_dict.setdefault(entry_name, []).extend(gene_classes)

if unmapped_genes:
    print('No UniProt entry found for %d gene(s): %s' % (len(unmapped_genes), unmapped_genes))

In [16]:
# Inspect the gene symbol -> UniProt entry name mapping.
gene_entry_dict

{'ABL1': 'ABL1_HUMAN',
 'ACVR1': 'ACVR1_HUMAN',
 'AGO2': 'AGO2_HUMAN',
 'AKT1': 'AKT1_HUMAN',
 'AKT2': 'AKT2_HUMAN',
 'AKT3': 'AKT3_HUMAN',
 'ALK': 'TGFR1_HUMAN',
 'APC': 'APC_HUMAN',
 'AR': 'ANDR_HUMAN',
 'ARAF': 'ARAF_HUMAN',
 'ARID1A': 'ARI1A_HUMAN',
 'ARID1B': 'ARI1B_HUMAN',
 'ARID2': 'ARID2_HUMAN',
 'ARID5B': 'ARI5B_HUMAN',
 'ASXL1': 'ASXL1_HUMAN',
 'ASXL2': 'ASXL2_HUMAN',
 'ATM': 'ATM_HUMAN',
 'ATR': 'ATR_HUMAN',
 'ATRX': 'ATRX_HUMAN',
 'AURKA': 'AURKA_HUMAN',
 'AURKB': 'AURKB_HUMAN',
 'AXIN1': 'AXIN1_HUMAN',
 'AXL': 'UFO_HUMAN',
 'B2M': 'B2MG_HUMAN',
 'BAP1': 'RING2_HUMAN',
 'BARD1': 'BARD1_HUMAN',
 'BCL10': 'BCL10_HUMAN',
 'BCL2': 'BCL2_HUMAN',
 'BCL2L11': 'B2L11_HUMAN',
 'BCOR': 'BCOR_HUMAN',
 'BRAF': 'BRAF_HUMAN',
 'BRCA1': 'BRCA1_HUMAN',
 'BRCA2': 'BRCA2_HUMAN',
 'BRD4': 'BRD4_HUMAN',
 'BRIP1': 'FANCJ_HUMAN',
 'BTK': 'BTK_HUMAN',
 'CARD11': 'CAR11_HUMAN',
 'CARM1': 'CARM1_HUMAN',
 'CASP8': 'CASP8_HUMAN',
 'CBL': 'CBL_HUMAN',
 'CCND1': 'CCND1_HUMAN',
 'CCND2': 'CCND2_HUMAN',


In [17]:
# Entry names to fetch from UniProt: the values of the gene -> entry mapping
# (one item per gene, so repeated entry names are kept).
gene_entries = [entry_name for entry_name in gene_entry_dict.values()]
len(gene_entries)

264

In [18]:
# Fetch the UniProt annotation table for the collected entry names as a
# DataFrame and display it.
df = u.get_df(gene_entries)
df

INFO:root:fetching information from uniprot for 262 entries
INFO:root:uniprot.get_df 1/2
INFO:root:uniprot.get_df 2/2
INFO:root:uniprot.get_df 3/2


Unnamed: 0,Entry,Entry name,Gene names,Gene names (primary ),Gene names (synonym ),Gene names (ordered locus ),Gene names (ORF ),Organism,Organism ID,Protein names,...,Miscellaneous [CC],Keywords,Protein existence,Status,Sequence annotation (Features),Protein families,Version,Comments,Cross-reference (null),Pathway.1
0,P35222,CTNB1_HUMAN,[CTNNB1 CTNNB OK/SW-cl.35 PRO2286],CTNNB1,CTNNB,,OK/SW-cl.35 PRO2286,Homo sapiens (Human),9606,Catenin beta-1 (Beta-catenin),...,,"[3D-structure, Acetylation, Activator, Alterna...",Evidence at protein level,reviewed,,[Beta-catenin family],216,"[Alternative products (1), Caution (1), Functi...",,
1,P36897,TGFR1_HUMAN,[TGFBR1 ALK5 SKR4],TGFBR1,ALK5 SKR4,,,Homo sapiens (Human),9606,TGF-beta receptor type-1 (TGFR-1) (EC 2.7.11.3...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, TKL Ser/Thr prote...",203,"[Alternative products (1), Catalytic activity ...",,
2,P46531,NOTC1_HUMAN,[NOTCH1 TAN1],NOTCH1,TAN1,,,Homo sapiens (Human),9606,Neurogenic locus notch homolog protein 1 (Notc...,...,,"[3D-structure, ANK repeat, Activator, Angiogen...",Evidence at protein level,reviewed,,[NOTCH family],211,"[Function (1), Involvement in disease (2), Pos...",,
3,P21802,FGFR2_HUMAN,[FGFR2 BEK KGFR KSAM],FGFR2,BEK KGFR KSAM,,,Homo sapiens (Human),9606,Fibroblast growth factor receptor 2 (FGFR-2) (...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Tyr protein kinas...",228,"[Alternative products (1), Catalytic activity ...",,
4,P38398,BRCA1_HUMAN,[BRCA1 RNF53],BRCA1,RNF53,,,Homo sapiens (Human),9606,Breast cancer type 1 susceptibility protein (E...,...,,"[3D-structure, Acetylation, Activator, Alterna...",Evidence at protein level,reviewed,,[],228,"[Alternative products (1), Catalytic activity ...",,Protein modification; protein ubiquitination.
5,P31749,AKT1_HUMAN,[AKT1 PKB RAC],AKT1,PKB RAC,,,Homo sapiens (Human),9606,RAC-alpha serine/threonine-protein kinase (EC ...,...,,"[3D-structure, ATP-binding, Acetylation, Alter...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, AGC Ser/Thr prote...",216,"[Alternative products (1), Catalytic activity ...",,
6,P04626,ERBB2_HUMAN,[ERBB2 HER2 MLN19 NEU NGL],ERBB2,HER2 MLN19 NEU NGL,,,Homo sapiens (Human),9606,Receptor tyrosine-protein kinase erbB-2 (EC 2....,...,,"[3D-structure, ATP-binding, Activator, Alterna...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Tyr protein kinas...",227,"[Alternative products (1), Catalytic activity ...",,
7,P42771,CDN2A_HUMAN,[CDKN2A CDKN2 MTS1],CDKN2A,CDKN2 MTS1,,,Homo sapiens (Human),9606,Cyclin-dependent kinase inhibitor 2A (Cyclin-d...,...,,"[3D-structure, ANK repeat, Acetylation, Altern...",Evidence at protein level,reviewed,,[CDKN2 cyclin-dependent kinase inhibitor family],197,"[Alternative products (1), Caution (2), Functi...",,
8,P17948,VGFR1_HUMAN,[FLT1 FLT FRT VEGFR1],FLT1,FLT FRT VEGFR1,,,Homo sapiens (Human),9606,Vascular endothelial growth factor receptor 1 ...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Tyr protein kinas...",208,"[Alternative products (1), Catalytic activity ...",,
9,P84022,SMAD3_HUMAN,[SMAD3 MADH3],SMAD3,MADH3,,,Homo sapiens (Human),9606,Mothers against decapentaplegic homolog 3 (MAD...,...,,"[3D-structure, ADP-ribosylation, Acetylation, ...",Evidence at protein level,reviewed,,[Dwarfin/SMAD family],163,"[Alternative products (1), Caution (2), Domain...",,


In [19]:
# Keep only entries that have a molecular-function GO annotation. The explicit
# .copy() makes df_new an independent frame, so assigning columns on it later
# does not raise SettingWithCopyWarning.
df_new = df[df['Gene ontology (molecular function)'].notnull()].copy()


In [20]:
# Turn the '; '-separated GO string of every entry into a list of terms.
# df_new is detached from its parent frame first so the column assignment
# writes to an independent object (no SettingWithCopyWarning).
df_new = df_new.copy()
df_new['Gene ontology (molecular function)'] = (
    df_new['Gene ontology (molecular function)'].apply(lambda x: x.split('; '))
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [21]:
# UniProt entry name -> list of molecular-function GO terms for that entry.
GO_terms_dict = {
    entry_name: go_terms
    for entry_name, go_terms in zip(df_new['Entry name'],
                                    df_new['Gene ontology (molecular function)'])
}

In [22]:
# Inspect the entry name -> GO terms mapping.
GO_terms_dict

{'1A02_HUMAN': ['beta-2-microglobulin binding [GO:0030881]',
  'peptide antigen binding [GO:0042605]',
  'receptor binding [GO:0005102]',
  'RNA binding [GO:0003723]',
  'TAP binding [GO:0046977]',
  'T cell receptor binding [GO:0042608]'],
 '1B07_HUMAN': ['peptide antigen binding [GO:0042605]',
  'receptor binding [GO:0005102]'],
 '1B14_HUMAN': ['peptide antigen binding [GO:0042605]',
  'TAP binding [GO:0046977]'],
 '2AAA_HUMAN': ['antigen binding [GO:0003823]',
  'protein heterodimerization activity [GO:0046982]',
  'protein phosphatase regulator activity [GO:0019888]',
  'protein serine/threonine phosphatase activity [GO:0004722]'],
 'ABL1_HUMAN': ['actin filament binding [GO:0051015]',
  'actin monomer binding [GO:0003785]',
  'ATP binding [GO:0005524]',
  'DNA binding [GO:0003677]',
  'ephrin receptor binding [GO:0046875]',
  'magnesium ion binding [GO:0000287]',
  'manganese ion binding [GO:0030145]',
  'mitogen-activated protein kinase binding [GO:0051019]',
  'nicotinate-nucleo

In [30]:
# Find most common GO terms to use as features
def flatten(l): # taken from https://stackoverflow.com/questions/33900770/most-frequent-values-in-a-dictionary
    """Lazily yield the leaf items of an arbitrarily nested iterable.

    Any iterable element is descended into recursively; strings are treated as
    atomic values and yielded whole rather than split into characters.

    Parameters
    ----------
    l : iterable
        Possibly nested iterable (e.g. the values of a dict of lists).

    Yields
    ------
    object
        Each non-iterable (or string) item, in depth-first, left-to-right order.
    """
    # The `collections.Iterable` alias was removed in Python 3.10; the ABC lives
    # in `collections.abc`. Imported locally so the cell is self-contained.
    from collections.abc import Iterable

    for el in l:
        if isinstance(el, Iterable) and not isinstance(el, str): #replaced basestring with str for Python3
            yield from flatten(el)
        else:
            yield el



In [31]:
# Flatten the per-entry term lists into one list (repeats kept), then show how
# many distinct terms there are.
All_GO_terms = [term for term in flatten(GO_terms_dict.values())]
len(frozenset(All_GO_terms))

561

In [36]:
# Initialise one 0-filled indicator column per distinct GO term in both frames.
# All_GO_terms repeats a term once per entry annotated with it, so iterate over
# the distinct terms only (dict.fromkeys keeps first-seen order, which yields
# the same column order as assigning every repeat would).
for terms in dict.fromkeys(All_GO_terms):
    train[terms] = 0
    test[terms] = 0

In [26]:
# looping through all classes and getting terms for each class
# NOTE: the code below is disabled -- it is wrapped in a triple-quoted string,
# so running this cell only echoes that string and defines nothing.
'''terms_per_class = defaultdict(list)
for entry, terms in GO_terms_dict.items():
    if entry in class_dict:
        gene_classes = class_dict[entry]
        for gene_class in gene_classes:
            terms_per_class[gene_class].extend(terms)
           
        
terms_per_class'''

'terms_per_class = defaultdict(list)\nfor entry, terms in GO_terms_dict.items():\n    if entry in class_dict:\n        gene_classes = class_dict[entry]\n        for gene_class in gene_classes:\n            terms_per_class[gene_class].extend(terms)\n           \n        \nterms_per_class'

In [27]:
# code if we want most commons
# NOTE: disabled as well -- the snippet is a string literal, so `counter_dict`
# is never created by this cell.
'''counter_dict = {}
for classes in terms_per_class:
    counter_dict[classes] = Counter(terms_per_class[classes]).most_common(50)'''


'counter_dict = {}\nfor classes in terms_per_class:\n    counter_dict[classes] = Counter(terms_per_class[classes]).most_common(50)'

In [39]:
# Set the molecular-function GO indicator columns to 1 for every row of train,
# according to the row's gene.
# The work is done once per gene with a boolean row mask instead of once per
# row, which gives the same result with far fewer .loc writes. Genes without a
# mapped entry, or whose entry has no GO terms, keep their 0-filled columns.
for gene, gene_entry in gene_entry_dict.items():
    GO_terms = GO_terms_dict.get(gene_entry)
    if GO_terms is None:
        continue
    train.loc[train.Gene == gene, GO_terms] = 1

train.shape

(3321, 565)

In [49]:
# Gene column of train, and its distinct values in order of first appearance.
list_of_genes_intrain = train.loc[:, "Gene"]
genes = list_of_genes_intrain.drop_duplicates().values

In [56]:
# saving the train set together with all features from uniprot
# (written to the current working directory, without the index column)
train.to_csv("train_uniprot.csv",index=False)

In [57]:
# loading the XGboost most important 190 features
# NOTE(review): the file is produced outside this notebook; each item is later
# indexed with [0] to get the feature name, so it presumably holds
# (name, score) pairs -- confirm. If it is an object array, recent numpy
# versions need allow_pickle=True here.
feature_scores = np.load("features_ranking.npy")

In [58]:
# The feature name is the first element of every ranking item.
features = [ranking_item[0] for ranking_item in feature_scores]

In [59]:
# adding only the 190 most important features from XGboost + the dummy variables of gene
# NOTE(review): this path ('..//bases/...') differs from the one used when the
# data was first loaded ('..//..//bases/...') -- confirm which is correct.
train_original = pd.read_csv('..//bases/training_variants')
# `train` may also contain the labelled test rows appended after the original
# training rows (appended with ignore_index=True, so the original rows keep
# positions 0..n-1). Align the feature rows to the original training rows so
# the column-wise concat below does not produce extra, partly-NaN rows.
train_features = train[features].reindex(train_original.index)
train_dummy = pd.get_dummies(train_original.Gene)
train_new = pd.concat([train_original, train_features, train_dummy], axis=1)
train_new.shape

(3321, 458)

In [61]:
# save train_new (original columns + selected GO features + gene dummies) to
# the current working directory, without the index column
train_new.to_csv("train_metafeaturesl2.csv",index=False)