# Retrieve GO terms for gene from the Uniprot database based on the classes

In [1]:
import numpy as np
import pandas as pd
import string
import os
from collections import Counter
from collections import defaultdict

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RandomizedLasso

import re
from bioservices import *
import collections
%pylab inline --no-import-all

Populating the interactive namespace from numpy and matplotlib


In [2]:
train = pd.read_csv('..//..//../bases/new_training_variants.csv')
test = pd.read_csv('..//..//../bases/new_test_variants.csv')


In [3]:
# only use gene from train data -> contains the classes
all_genes = set(train.Gene)
print(len(all_genes))
print(all_genes)

269
{'RIT1', 'GNA11', 'ERCC2', 'RAC1', 'CDK6', 'ARID2', 'PTPN11', 'DDR2', 'MPL', 'AR', 'NCOR1', 'YAP1', 'FBXW7', 'NTRK2', 'MED12', 'SHOC2', 'CDK4', 'CREBBP', 'TET1', 'PRDM1', 'DUSP4', 'TMPRSS2', 'AXIN1', 'PPM1D', 'BAP1', 'EP300', 'NOTCH2', 'BARD1', 'CASP8', 'PIK3R3', 'CCNE1', 'STK11', 'CTNNB1', 'NTRK3', 'BCL2', 'NUP93', 'CDKN1B', 'GATA3', 'ETV6', 'KRAS', 'ELF3', 'PIK3CA', 'KDM6A', 'ALK', 'PDGFRB', 'KMT2A', 'NF1', 'RAB35', 'ERF', 'NOTCH1', 'SMAD2', 'ASXL2', 'ARAF', 'FANCC', 'RXRA', 'NRAS', 'CTCF', 'HLA-B', 'RAD51D', 'GNAS', 'NFKBIA', 'BRAF', 'APC', 'HIST1H1C', 'FOXO1', 'IDH1', 'AKT2', 'MYCN', 'PTPRD', 'NTRK1', 'FGFR4', 'CTLA4', 'PTCH1', 'ERCC3', 'RNF43', 'PDGFRA', 'RAD21', 'RARA', 'NKX2-1', 'BRCA1', 'FOXA1', 'EPCAM', 'TP53', 'FAT1', 'HNF1A', 'SF3B1', 'DIS3', 'BCOR', 'SMARCB1', 'MEN1', 'PIK3CD', 'RUNX1', 'CCND2', 'CDKN1A', 'CDKN2C', 'XRCC2', 'MAPK1', 'AGO2', 'FGF19', 'ERBB4', 'MSH6', 'RICTOR', 'KMT2C', 'PIM1', 'MET', 'RAD50', 'MDM4', 'EIF1AX', 'ATRX', 'CDKN2A', 'ATM', 'CCND3', 'ARID1A', 

In [4]:
u = UniProt()

In [5]:
res = u.search("ZAP70_HUMAN")
print(res)

Entry	Entry name	Status	Protein names	Gene names	Organism	Length
P43403	ZAP70_HUMAN	reviewed	Tyrosine-protein kinase ZAP-70 (EC 2.7.10.2) (70 kDa zeta-chain associated protein) (Syk-related tyrosine kinase)	ZAP70 SRK	Homo sapiens (Human)	619



In [6]:
u.debugLevel = "INFO"
u.timeout = 100   # some queries are long and requires much more time; default is 1000 seconds

In [7]:
# just an example of query
a = u.search('SLC16A1+AND+organism:9606', frmt='tab', limit=1,
               columns="entry name")

In [8]:
[s.strip() for s in a.splitlines()]

['Entry name', 'MOT1_HUMAN']

In [9]:
gene_entry_dict = {} # here we will keep the gene_entries together with their classes
class_dict = {}
for gene in all_genes:
    gene_classes = list(train.Class[train.Gene==gene])
    keyword = 'gene:%s+AND+organism:9606' %gene #to query database, with gene and organism 9606 is Homo Sapien (human)
    entry_name_tab = u.search(keyword, frmt='tab', limit=1, columns="entry name") 
    entry_name = [s.strip() for s in entry_name_tab.splitlines()][1] # gets the entry name from uniprot i.e. second position in tab
    gene_entry_dict[gene] = entry_name
    class_dict[entry_name] = gene_classes

In [10]:
gene_entry_dict

{'ABL1': 'ABL1_HUMAN',
 'ACVR1': 'ACVR1_HUMAN',
 'AGO2': 'AGO2_HUMAN',
 'AKT1': 'AKT1_HUMAN',
 'AKT2': 'AKT2_HUMAN',
 'AKT3': 'AKT3_HUMAN',
 'ALK': 'TGFR1_HUMAN',
 'APC': 'APC_HUMAN',
 'AR': 'ANDR_HUMAN',
 'ARAF': 'ARAF_HUMAN',
 'ARID1A': 'ARI1A_HUMAN',
 'ARID1B': 'ARI1B_HUMAN',
 'ARID2': 'ARID2_HUMAN',
 'ARID5B': 'ARI5B_HUMAN',
 'ASXL1': 'ASXL1_HUMAN',
 'ASXL2': 'ASXL2_HUMAN',
 'ATM': 'ATM_HUMAN',
 'ATR': 'ATR_HUMAN',
 'ATRX': 'ATRX_HUMAN',
 'AURKA': 'AURKA_HUMAN',
 'AURKB': 'AURKB_HUMAN',
 'AXIN1': 'AXIN1_HUMAN',
 'AXIN2': 'AXIN2_HUMAN',
 'AXL': 'UFO_HUMAN',
 'B2M': 'B2MG_HUMAN',
 'BAP1': 'RING2_HUMAN',
 'BARD1': 'BARD1_HUMAN',
 'BCL10': 'BCL10_HUMAN',
 'BCL2': 'BCL2_HUMAN',
 'BCL2L11': 'B2L11_HUMAN',
 'BCOR': 'BCOR_HUMAN',
 'BRAF': 'BRAF_HUMAN',
 'BRCA1': 'BRCA1_HUMAN',
 'BRCA2': 'BRCA2_HUMAN',
 'BRD4': 'BRD4_HUMAN',
 'BRIP1': 'FANCJ_HUMAN',
 'BTK': 'BTK_HUMAN',
 'CARD11': 'CAR11_HUMAN',
 'CARM1': 'CARM1_HUMAN',
 'CASP8': 'CASP8_HUMAN',
 'CBL': 'CBL_HUMAN',
 'CCND1': 'CCND1_HUMAN',


In [11]:
gene_entries = list(gene_entry_dict.values())
len(gene_entries)

269

In [12]:
df = u.get_df(gene_entries)
df

INFO:root:fetching information from uniprot for 267 entries
INFO:root:uniprot.get_df 1/2
INFO:root:uniprot.get_df 2/2
INFO:root:uniprot.get_df 3/2


Unnamed: 0,Entry,Entry name,Gene names,Gene names (primary ),Gene names (synonym ),Gene names (ordered locus ),Gene names (ORF ),Organism,Organism ID,Protein names,...,Miscellaneous [CC],Keywords,Protein existence,Status,Sequence annotation (Features),Protein families,Version,Comments,Cross-reference (null),Pathway.1
0,Q12809,KCNH2_HUMAN,[KCNH2 ERG ERG1 HERG],KCNH2,ERG ERG1 HERG,,,Homo sapiens (Human),9606,Potassium voltage-gated channel subfamily H me...,...,,"[3D-structure, Alternative splicing, Cell memb...",Evidence at protein level,reviewed,,"[Potassium channel family, H (Eag) (TC 1.A.1.2...",199,"[Alternative products (1), Caution (3), Domain...",,
1,Q12888,TP53B_HUMAN,[TP53BP1],TP53BP1,,,,Homo sapiens (Human),9606,TP53-binding protein 1 (53BP1) (p53-binding pr...,...,,"[3D-structure, Activator, Alternative splicing...",Evidence at protein level,reviewed,,[],190,"[Alternative products (1), Caution (2), Domain...",,
2,P21802,FGFR2_HUMAN,[FGFR2 BEK KGFR KSAM],FGFR2,BEK KGFR KSAM,,,Homo sapiens (Human),9606,Fibroblast growth factor receptor 2 (FGFR-2) (...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Tyr protein kinas...",228,"[Alternative products (1), Catalytic activity ...",,
3,P31749,AKT1_HUMAN,[AKT1 PKB RAC],AKT1,PKB RAC,,,Homo sapiens (Human),9606,RAC-alpha serine/threonine-protein kinase (EC ...,...,,"[3D-structure, ATP-binding, Acetylation, Alter...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, AGC Ser/Thr prote...",216,"[Alternative products (1), Catalytic activity ...",,
4,Q06124,PTN11_HUMAN,[PTPN11 PTP2C SHPTP2],PTPN11,PTP2C SHPTP2,,,Homo sapiens (Human),9606,Tyrosine-protein phosphatase non-receptor type...,...,,"[3D-structure, Acetylation, Alternative splici...",Evidence at protein level,reviewed,,"[Protein-tyrosine phosphatase family, Non-rece...",213,"[Alternative products (1), Catalytic activity ...",,
5,Q09472,EP300_HUMAN,[EP300 P300],EP300,P300,,,Homo sapiens (Human),9606,Histone acetyltransferase p300 (p300 HAT) (EC ...,...,,"[3D-structure, Acetylation, Acyltransferase, B...",Evidence at protein level,reviewed,,[],222,"[Catalytic activity (1), Domain (1), Function ...",,
6,P04626,ERBB2_HUMAN,[ERBB2 HER2 MLN19 NEU NGL],ERBB2,HER2 MLN19 NEU NGL,,,Homo sapiens (Human),9606,Receptor tyrosine-protein kinase erbB-2 (EC 2....,...,,"[3D-structure, ATP-binding, Activator, Alterna...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Tyr protein kinas...",227,"[Alternative products (1), Catalytic activity ...",,
7,P28482,MK01_HUMAN,[MAPK1 ERK2 PRKM1 PRKM2],MAPK1,ERK2 PRKM1 PRKM2,,,Homo sapiens (Human),9606,Mitogen-activated protein kinase 1 (MAP kinase...,...,,"[3D-structure, ATP-binding, Acetylation, Alter...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, CMGC Ser/Thr prot...",205,"[Alternative products (1), Catalytic activity ...",,
8,P84022,SMAD3_HUMAN,[SMAD3 MADH3],SMAD3,MADH3,,,Homo sapiens (Human),9606,Mothers against decapentaplegic homolog 3 (MAD...,...,,"[3D-structure, ADP-ribosylation, Acetylation, ...",Evidence at protein level,reviewed,,[Dwarfin/SMAD family],163,"[Alternative products (1), Caution (2), Domain...",,
9,P38936,CDN1A_HUMAN,[CDKN1A CAP20 CDKN1 CIP1 MDA6 PIC1 SDI1 WAF1],CDKN1A,CAP20 CDKN1 CIP1 MDA6 PIC1 SDI1 WAF1,,,Homo sapiens (Human),9606,Cyclin-dependent kinase inhibitor 1 (CDK-inter...,...,,"[3D-structure, Acetylation, Cell cycle, Comple...",Evidence at protein level,reviewed,,[CDI family],199,"[Domain (2), Function (1), Induction (1), Post...",,


In [17]:
df_new = df[df['Gene ontology (cellular component)'].notnull()] # don't consider genes with no biological process

In [19]:
df_new['Gene ontology (cellular component)'] = df_new['Gene ontology (cellular component)'].apply(lambda x: x.split('; ')) #split functions based on ;
df_new['Gene ontology (cellular component)']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


0      [cell surface [GO:0009986], perinuclear region...
1      [condensed chromosome kinetochore [GO:0000777]...
2      [cell cortex [GO:0005938], cell surface [GO:00...
3      [cell-cell junction [GO:0005911], ciliary basa...
4      [cytoplasm [GO:0005737], cytosol [GO:0005829],...
5      [cytosol [GO:0005829], histone acetyltransfera...
6      [apical plasma membrane [GO:0016324], basolate...
7      [axon [GO:0030424], azurophil granule lumen [G...
8      [cytoplasm [GO:0005737], cytosol [GO:0005829],...
9      [cyclin-dependent protein kinase holoenzyme co...
10     [axon [GO:0030424], cell surface [GO:0009986],...
11     [cytoplasm [GO:0005737], cytosol [GO:0005829],...
12     [cytosol [GO:0005829], early endosome [GO:0005...
13     [cytoplasm [GO:0005737], cytosol [GO:0005829],...
14     [actin cytoskeleton [GO:0015629], cell leading...
15     [Cul4A-RING E3 ubiquitin ligase complex [GO:00...
16     [cytoplasm [GO:0005737], cytosol [GO:0005829],...
17     [cytoplasm [GO:0005737],

In [22]:
GO_terms_dict = dict(zip(df_new['Entry name'], df_new['Gene ontology (cellular component)']))

In [23]:
GO_terms_dict

{'1A02_HUMAN': ['cell surface [GO:0009986]',
  'early endosome membrane [GO:0031901]',
  'endoplasmic reticulum [GO:0005783]',
  'endoplasmic reticulum exit site [GO:0070971]',
  'endoplasmic reticulum membrane [GO:0005789]',
  'ER to Golgi transport vesicle membrane [GO:0012507]',
  'Golgi apparatus [GO:0005794]',
  'Golgi medial cisterna [GO:0005797]',
  'Golgi membrane [GO:0000139]',
  'integral component of lumenal side of endoplasmic reticulum membrane [GO:0071556]',
  'MHC class I protein complex [GO:0042612]',
  'phagocytic vesicle membrane [GO:0030670]',
  'plasma membrane [GO:0005886]',
  'recycling endosome membrane [GO:0055038]'],
 '1B07_HUMAN': ['cell surface [GO:0009986]',
  'early endosome membrane [GO:0031901]',
  'endoplasmic reticulum [GO:0005783]',
  'ER to Golgi transport vesicle membrane [GO:0012507]',
  'extracellular exosome [GO:0070062]',
  'Golgi apparatus [GO:0005794]',
  'Golgi membrane [GO:0000139]',
  'integral component of lumenal side of endoplasmic reticu

In [24]:
# Find most common GO terms to use as features
def flatten(l): # taken from https://stackoverflow.com/questions/33900770/most-frequent-values-in-a-dictionary
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, str): #replaced basestring with str for Python3
            for sub in flatten(el):
                yield sub
        else:
            yield el



In [26]:
All_GO_terms = set(list(flatten(GO_terms_dict.values())))
len(All_GO_terms)


351

In [27]:
# Initialization of the 2056 new features with 0's
for terms in All_GO_terms:
    train[terms] = 0

In [26]:
# looping through all classes and getting terms for each class
'''terms_per_class = defaultdict(list)
for entry, terms in GO_terms_dict.items():
    if entry in class_dict:
        gene_classes = class_dict[entry]
        for gene_class in gene_classes:
            terms_per_class[gene_class].extend(terms)
           
        
terms_per_class'''

'terms_per_class = defaultdict(list)\nfor entry, terms in GO_terms_dict.items():\n    if entry in class_dict:\n        gene_classes = class_dict[entry]\n        for gene_class in gene_classes:\n            terms_per_class[gene_class].extend(terms)\n           \n        \nterms_per_class'

In [27]:
# code if we want most commons
'''counter_dict = {}
for classes in terms_per_class:
    counter_dict[classes] = Counter(terms_per_class[classes]).most_common(50)'''


'counter_dict = {}\nfor classes in terms_per_class:\n    counter_dict[classes] = Counter(terms_per_class[classes]).most_common(50)'

In [28]:
# adds the molecular function GO terms to each gene in train data frame
for i in train.index:
    gene = train.Gene[i]
    gene_entry = gene_entry_dict[gene]
    if gene_entry in GO_terms_dict:
        GO_terms = GO_terms_dict[gene_entry]
        train.loc[i, GO_terms] = 1

train.shape

(3689, 355)

In [30]:
train

Unnamed: 0,Class,Gene,ID,Variation,"chromosome, telomeric region [GO:0000781]",nuclear membrane [GO:0031965],transcriptional repressor complex [GO:0017053],transcriptional preinitiation complex [GO:0097550],extrinsic component of membrane [GO:0019898],mRNA cap binding complex [GO:0005845],...,cyclin E1-CDK2 complex [GO:0097134],pronucleus [GO:0045120],nuclear pericentric heterochromatin [GO:0031618],nuclear heterochromatin [GO:0005720],cortical actin cytoskeleton [GO:0030864],excitatory synapse [GO:0060076],Scrib-APC-beta-catenin complex [GO:0034750],podosome [GO:0002102],ciliary tip [GO:0097542],early endosome [GO:0005769]
0,1,FAM58A,0,Truncating Mutations,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,CBL,1,W802*,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,CBL,2,Q249E,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,CBL,3,N454D,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,CBL,4,L399V,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4,CBL,5,V391I,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,5,CBL,6,V430M,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,CBL,7,Deletion,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,4,CBL,8,Y371H,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,4,CBL,9,C384R,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# fit the input X and output Y for the feature selection
X = train[list(All_GO_terms)]
y = train['Class']
names = X.columns

In [33]:
# Lasso model
lasso = Lasso(alpha=.001, random_state = 3).fit(X,y)
features_lasso = names[np.nonzero(lasso.coef_)]
len(features_lasso) # 164 in total

152

In [34]:
# saving the train set together with all features from uniprot
np.save("..//cellular_bases/features_cellular_function", features_lasso)