# Retrieve GO terms for gene from the Uniprot database based on the classes

In [60]:
import numpy as np
import pandas as pd
import string
import os
from collections import Counter
from collections import defaultdict

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RandomizedLasso

import re
from bioservices import *
import collections
%pylab inline --no-import-all

Populating the interactive namespace from numpy and matplotlib


In [11]:
train = pd.read_csv('..//..//../bases/new_training_variants.csv')
test = pd.read_csv('..//..//../bases/new_test_variants.csv')


In [12]:
# only use gene from train data -> contains the classes
all_genes = set(train.Gene)
print(len(all_genes))
print(all_genes)

269
{'MAP2K2', 'IKBKE', 'RAD51B', 'TSC1', 'CARD11', 'ROS1', 'SF3B1', 'EP300', 'NPM1', 'ETV1', 'TCF7L2', 'RAC1', 'MYD88', 'PTCH1', 'FGF19', 'PPP2R1A', 'KMT2B', 'PRDM1', 'ATM', 'DNMT3B', 'ATRX', 'FOXO1', 'MYOD1', 'GLI1', 'MDM2', 'NF1', 'HLA-B', 'FLT3', 'MSH2', 'RIT1', 'XRCC2', 'RNF43', 'FAM58A', 'FOXA1', 'MAPK1', 'FGF4', 'PPM1D', 'ERG', 'NCOR1', 'NFE2L2', 'CIC', 'AKT3', 'PTPN11', 'ELF3', 'INPP4B', 'KMT2C', 'NOTCH2', 'SRC', 'RARA', 'KMT2D', 'ESR1', 'EZH2', 'ERBB2', 'CCND2', 'PIK3CA', 'RXRA', 'FANCC', 'NFKBIA', 'ATR', 'CCNE1', 'SETD2', 'CDKN1A', 'RAD51C', 'FUBP1', 'BCL2L11', 'SPOP', 'MLH1', 'MDM4', 'PMS2', 'FOXP1', 'DICER1', 'RHEB', 'CDKN2A', 'TCF3', 'VEGFA', 'APC', 'TMPRSS2', 'RAD51D', 'SPEN', 'RICTOR', 'ALK', 'FGFR1', 'SOX9', 'PIM1', 'GNAS', 'PDGFRA', 'BTK', 'PTPRD', 'EPCAM', 'CEBPA', 'DUSP4', 'CDK8', 'ERCC2', 'AR', 'ACVR1', 'AXIN1', 'SMARCA4', 'AXL', 'WHSC1L1', 'GNA11', 'MYCN', 'AGO2', 'ERCC4', 'RUNX1', 'NKX2-1', 'STAG2', 'ASXL2', 'CDK4', 'JUN', 'SDHC', 'MYC', 'NSD1', 'BRAF', 'SMAD3', '

In [13]:
u = UniProt()

In [14]:
res = u.search("ZAP70_HUMAN")
print(res)

Entry	Entry name	Status	Protein names	Gene names	Organism	Length
P43403	ZAP70_HUMAN	reviewed	Tyrosine-protein kinase ZAP-70 (EC 2.7.10.2) (70 kDa zeta-chain associated protein) (Syk-related tyrosine kinase)	ZAP70 SRK	Homo sapiens (Human)	619



In [15]:
u.debugLevel = "INFO"
u.timeout = 100   # some queries are long and requires much more time; default is 1000 seconds

In [16]:
# just an example of query
a = u.search('SLC16A1+AND+organism:9606', frmt='tab', limit=1,
               columns="entry name")

In [17]:
[s.strip() for s in a.splitlines()]

['Entry name', 'MOT1_HUMAN']

In [18]:
gene_entry_dict = {} # here we will keep the gene_entries together with their classes
class_dict = {}
for gene in all_genes:
    gene_classes = list(train.Class[train.Gene==gene])
    keyword = 'gene:%s+AND+organism:9606' %gene #to query database, with gene and organism 9606 is Homo Sapien (human)
    entry_name_tab = u.search(keyword, frmt='tab', limit=1, columns="entry name") 
    entry_name = [s.strip() for s in entry_name_tab.splitlines()][1] # gets the entry name from uniprot i.e. second position in tab
    gene_entry_dict[gene] = entry_name
    class_dict[entry_name] = gene_classes

In [19]:
gene_entry_dict

{'ABL1': 'ABL1_HUMAN',
 'ACVR1': 'ACVR1_HUMAN',
 'AGO2': 'AGO2_HUMAN',
 'AKT1': 'AKT1_HUMAN',
 'AKT2': 'AKT2_HUMAN',
 'AKT3': 'AKT3_HUMAN',
 'ALK': 'TGFR1_HUMAN',
 'APC': 'APC_HUMAN',
 'AR': 'ANDR_HUMAN',
 'ARAF': 'ARAF_HUMAN',
 'ARID1A': 'ARI1A_HUMAN',
 'ARID1B': 'ARI1B_HUMAN',
 'ARID2': 'ARID2_HUMAN',
 'ARID5B': 'ARI5B_HUMAN',
 'ASXL1': 'ASXL1_HUMAN',
 'ASXL2': 'ASXL2_HUMAN',
 'ATM': 'ATM_HUMAN',
 'ATR': 'ATR_HUMAN',
 'ATRX': 'ATRX_HUMAN',
 'AURKA': 'AURKA_HUMAN',
 'AURKB': 'AURKB_HUMAN',
 'AXIN1': 'AXIN1_HUMAN',
 'AXIN2': 'AXIN2_HUMAN',
 'AXL': 'UFO_HUMAN',
 'B2M': 'B2MG_HUMAN',
 'BAP1': 'RING2_HUMAN',
 'BARD1': 'BARD1_HUMAN',
 'BCL10': 'BCL10_HUMAN',
 'BCL2': 'BCL2_HUMAN',
 'BCL2L11': 'B2L11_HUMAN',
 'BCOR': 'BCOR_HUMAN',
 'BRAF': 'BRAF_HUMAN',
 'BRCA1': 'BRCA1_HUMAN',
 'BRCA2': 'BRCA2_HUMAN',
 'BRD4': 'BRD4_HUMAN',
 'BRIP1': 'FANCJ_HUMAN',
 'BTK': 'BTK_HUMAN',
 'CARD11': 'CAR11_HUMAN',
 'CARM1': 'CARM1_HUMAN',
 'CASP8': 'CASP8_HUMAN',
 'CBL': 'CBL_HUMAN',
 'CCND1': 'CCND1_HUMAN',


In [20]:
gene_entries = list(gene_entry_dict.values())
len(gene_entries)

269

In [21]:
df = u.get_df(gene_entries)
df

INFO:root:fetching information from uniprot for 267 entries
INFO:root:uniprot.get_df 1/2
INFO:root:uniprot.get_df 2/2
INFO:root:uniprot.get_df 3/2


Unnamed: 0,Entry,Entry name,Gene names,Gene names (primary ),Gene names (synonym ),Gene names (ordered locus ),Gene names (ORF ),Organism,Organism ID,Protein names,...,Miscellaneous [CC],Keywords,Protein existence,Status,Sequence annotation (Features),Protein families,Version,Comments,Cross-reference (null),Pathway.1
0,P27986,P85A_HUMAN,[PIK3R1 GRB1],PIK3R1,GRB1,,,Homo sapiens (Human),9606,Phosphatidylinositol 3-kinase regulatory subun...,...,,"[3D-structure, Acetylation, Alternative splici...",Evidence at protein level,reviewed,,[PI3K p85 subunit family],214,"[Alternative products (1), Caution (1), Domain...",,
1,P36897,TGFR1_HUMAN,[TGFBR1 ALK5 SKR4],TGFBR1,ALK5 SKR4,,,Homo sapiens (Human),9606,TGF-beta receptor type-1 (TGFR-1) (EC 2.7.11.3...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, TKL Ser/Thr prote...",203,"[Alternative products (1), Catalytic activity ...",,
2,P46531,NOTC1_HUMAN,[NOTCH1 TAN1],NOTCH1,TAN1,,,Homo sapiens (Human),9606,Neurogenic locus notch homolog protein 1 (Notc...,...,,"[3D-structure, ANK repeat, Activator, Angiogen...",Evidence at protein level,reviewed,,[NOTCH family],211,"[Function (1), Involvement in disease (2), Pos...",,
3,Q09472,EP300_HUMAN,[EP300 P300],EP300,P300,,,Homo sapiens (Human),9606,Histone acetyltransferase p300 (p300 HAT) (EC ...,...,,"[3D-structure, Acetylation, Acyltransferase, B...",Evidence at protein level,reviewed,,[],222,"[Catalytic activity (1), Domain (1), Function ...",,
4,P42771,CDN2A_HUMAN,[CDKN2A CDKN2 MTS1],CDKN2A,CDKN2 MTS1,,,Homo sapiens (Human),9606,Cyclin-dependent kinase inhibitor 2A (Cyclin-d...,...,,"[3D-structure, ANK repeat, Acetylation, Altern...",Evidence at protein level,reviewed,,[CDKN2 cyclin-dependent kinase inhibitor family],197,"[Alternative products (1), Caution (2), Functi...",,
5,P12830,CADH1_HUMAN,[CDH1 CDHE UVO],CDH1,CDHE UVO,,,Homo sapiens (Human),9606,Cadherin-1 (CAM 120/80) (Epithelial cadherin) ...,...,,"[3D-structure, Alternative splicing, Calcium, ...",Evidence at protein level,reviewed,,[],219,"[Alternative products (1), Domain (1), Functio...",,
6,P37173,TGFR2_HUMAN,[TGFBR2],TGFBR2,,,,Homo sapiens (Human),9606,TGF-beta receptor type-2 (TGFR-2) (EC 2.7.11.3...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, TKL Ser/Thr prote...",208,"[Alternative products (1), Catalytic activity ...",,
7,Q03164,KMT2A_HUMAN,[KMT2A ALL1 CXXC7 HRX HTRX MLL MLL1 TRX1],KMT2A,ALL1 CXXC7 HRX HTRX MLL MLL1 TRX1,,,Homo sapiens (Human),9606,Histone-lysine N-methyltransferase 2A (Lysine ...,...,,"[3D-structure, Acetylation, Alternative splici...",Evidence at protein level,reviewed,,[Class V-like SAM-binding methyltransferase su...,214,"[Alternative products (1), Catalytic activity ...",,
8,P04629,NTRK1_HUMAN,[NTRK1 MTC TRK TRKA],NTRK1,MTC TRK TRKA,,,Homo sapiens (Human),9606,High affinity nerve growth factor receptor (EC...,...,MISCELLANEOUS: Trk also stands for tropomyosin...,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Tyr protein kinas...",226,"[Alternative products (1), Catalytic activity ...",,
9,Q15831,STK11_HUMAN,[STK11 LKB1 PJS],STK11,LKB1 PJS,,,Homo sapiens (Human),9606,Serine/threonine-protein kinase STK11 (EC 2.7....,...,,"[3D-structure, ATP-binding, Acetylation, Alter...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, CAMK Ser/Thr prot...",193,"[Alternative products (1), Catalytic activity ...",,


In [70]:
df_new = df[df['Gene ontology (biological process)'].notnull()] # don't consider genes with no biological process

In [71]:
df_new['Gene ontology (biological process)'] = df_new['Gene ontology (biological process)'].apply(lambda x: x.split('; ')) #split functions based on ;


In [72]:
GO_terms_dict = dict(zip(df_new['Entry name'], df_new['Gene ontology (biological process)']))

In [73]:
GO_terms_dict

{'1A02_HUMAN': ['antibacterial humoral response [GO:0019731]',
  'antigen processing and presentation of endogenous peptide antigen via MHC class I [GO:0019885]',
  'antigen processing and presentation of endogenous peptide antigen via MHC class I via ER pathway, TAP-independent [GO:0002486]',
  'antigen processing and presentation of exogenous peptide antigen via MHC class I, TAP-dependent [GO:0002479]',
  'antigen processing and presentation of exogenous peptide antigen via MHC class I, TAP-independent [GO:0002480]',
  'antigen processing and presentation of peptide antigen via MHC class I [GO:0002474]',
  'defense response to Gram-positive bacterium [GO:0050830]',
  'interferon-gamma-mediated signaling pathway [GO:0060333]',
  'positive regulation of CD8-positive, alpha-beta T cell activation [GO:2001187]',
  'positive regulation of CD8-positive, alpha-beta T cell proliferation [GO:2000566]',
  'positive regulation of interferon-gamma production [GO:0032729]',
  'positive regulation

In [26]:
# Find most common GO terms to use as features
def flatten(l): # taken from https://stackoverflow.com/questions/33900770/most-frequent-values-in-a-dictionary
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, str): #replaced basestring with str for Python3
            for sub in flatten(el):
                yield sub
        else:
            yield el



In [48]:
All_GO_terms = set(list(flatten(GO_terms_dict.values())))
len(All_GO_terms)


2935

In [49]:
# Initialization of the 2056 new features with 0's
for terms in All_GO_terms:
    train[terms] = 0

In [26]:
# looping through all classes and getting terms for each class
'''terms_per_class = defaultdict(list)
for entry, terms in GO_terms_dict.items():
    if entry in class_dict:
        gene_classes = class_dict[entry]
        for gene_class in gene_classes:
            terms_per_class[gene_class].extend(terms)
           
        
terms_per_class'''

'terms_per_class = defaultdict(list)\nfor entry, terms in GO_terms_dict.items():\n    if entry in class_dict:\n        gene_classes = class_dict[entry]\n        for gene_class in gene_classes:\n            terms_per_class[gene_class].extend(terms)\n           \n        \nterms_per_class'

In [27]:
# code if we want most commons
'''counter_dict = {}
for classes in terms_per_class:
    counter_dict[classes] = Counter(terms_per_class[classes]).most_common(50)'''


'counter_dict = {}\nfor classes in terms_per_class:\n    counter_dict[classes] = Counter(terms_per_class[classes]).most_common(50)'

In [50]:
# adds the molecular function GO terms to each gene in train data frame
for i in train.index:
    gene = train.Gene[i]
    gene_entry = gene_entry_dict[gene]
    if gene_entry in GO_terms_dict:
        GO_terms = GO_terms_dict[gene_entry]
        train.loc[i, GO_terms] = 1

train.shape

(3689, 2939)

In [53]:
train

Unnamed: 0,Class,Gene,ID,Variation,mammary gland duct morphogenesis [GO:0060603],negative regulation of insulin receptor signaling pathway [GO:0046627],regulation of mRNA 3'-end processing [GO:0031440],ribonucleoprotein complex assembly [GO:0022618],response to stress [GO:0006950],positive regulation of ERK1 and ERK2 cascade [GO:0070374],...,cardiac right atrium morphogenesis [GO:0003213],cellular response to nitric oxide [GO:0071732],regulation of RNA polymerase II regulatory region sequence-specific DNA binding [GO:1903025],regulation of transcription from RNA polymerase III promoter [GO:0006359],positive regulation of epithelial cell apoptotic process [GO:1904037],defense response to virus [GO:0051607],skeletal system morphogenesis [GO:0048705],ear development [GO:0043583],negative regulation of multicellular organism growth [GO:0040015],peptidyl-lysine modification [GO:0018205]
0,1,FAM58A,0,Truncating Mutations,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,CBL,1,W802*,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,CBL,2,Q249E,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,CBL,3,N454D,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,CBL,4,L399V,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4,CBL,5,V391I,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,5,CBL,6,V430M,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,CBL,7,Deletion,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,4,CBL,8,Y371H,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,4,CBL,9,C384R,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [54]:
# fit the input X and output Y for the feature selection
X = train[list(All_GO_terms)]
y = train['Class']
names = X.columns

In [67]:
# Lasso model
lasso = Lasso(alpha=.002, random_state = 3).fit(X,y)
features_lasso = names[np.nonzero(lasso.coef_)]
len(features_lasso) # 164 in total

164

In [68]:
# saving the train set together with all features from uniprot
np.save("..//biological_bases/features_biological_function", features_lasso)