# Get the uniprot GO terms of biological process ready for modelling


In [3]:
# Gene Ontology can be found here: http://geneontology.org/page/ontology-documentation
import numpy as np
import pandas as pd
import string
import os
from collections import Counter
from collections import defaultdict

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD

import re
from bioservices import *
import collections
%pylab inline --no-import-all

Populating the interactive namespace from numpy and matplotlib


In [6]:
train = pd.read_csv('..//..//bases/new_training_variants.csv')
test = pd.read_csv('..//..//bases/new_test_variants.csv')

In [36]:
data_all = pd.concat((train, test), axis=0, ignore_index=True)

In [8]:
all_genes = set(data_all.Gene)
print(len(all_genes))
print(all_genes)

401
{'DNMT3A', 'DIS3', 'RB1', 'AGO2', 'TET1', 'CDK12', 'SLC33A1', 'KDM5C', 'ASS1', 'SDHB', 'ARID1A', 'KERA', 'WNT4', 'MYOT', 'SYT6', 'SEPT9', 'SF3B1', 'DNAH5', 'KMT2C', 'ETV6', 'PBRM1', 'GRM6', 'AXIN2', 'PRKRA', 'ALK', 'NRAS', 'KMT2D', 'MEN1', 'KDR', 'KCNQ4', 'EGFR', 'TGM5', 'TRPM1', 'PPP6C', 'RHOA', 'AKT2', 'KLF4', 'RPS19', 'SOX17', 'PMS2', 'PPM1D', 'CDK8', 'ERCC3', 'ITM2B', 'PIM1', 'OTOF', 'SPOP', 'SRSF2', 'CCNE1', 'ERG', 'SLC17A5', 'PTCH1', 'EPCAM', 'AURKB', 'RAD51B', 'IDH2', 'MTOR', 'FGFR1', 'ESR1', 'PIK3CD', 'PRDM1', 'RRAS2', 'JAK1', 'RAD50', 'POLH', 'FOXO1', 'ERF', 'DPM1', 'GLI1', 'U2AF1', 'DCC', 'DYNC2H1', 'APC', 'SMAD3', 'PLA2G6', 'HLA-A', 'SLC25A12', 'GALK1', 'CARM1', 'INPP4B', 'ARAF', 'RAB35', 'STK19', 'GPHN', 'TRPC6', 'SLC6A5', 'TSHR', 'LARGE1', 'SMARCB1', 'PNPO', 'NF2', 'TSC1', 'LRP4', 'CTNNB1', 'FLT1', 'CASP8', 'ADGRG1', 'STK11', 'BCOR', 'B4GALT7', 'CDK6', 'FANCC', 'KDM5A', 'IKZF1', 'FGFR4', 'NPM1', 'TCF7L2', 'DNMT3B', 'PIK3CA', 'BRCA1', 'VHL', 'PAX8', 'KISS1R', 'ACVR1', '

In [9]:
u = UniProt()

In [10]:
u.debugLevel = "INFO"
u.timeout = 100   # some queries are long and requires much more time; default is 1000 seconds

In [11]:
gene_entry_dict = {}
class_dict = {}
for gene in all_genes:
    keyword = 'gene:%s+AND+organism:9606' %gene #to query database, with gene and organism 9606 is Homo Sapien (human)
    entry_name_tab = u.search(keyword, frmt='tab', limit=1, columns="entry name") 
    entry_name = [s.strip() for s in entry_name_tab.splitlines()][1] # gets the entry name = in second position in list
    gene_entry_dict[gene] = entry_name

In [12]:
gene_entries = list(gene_entry_dict.values())
len(gene_entries)

401

In [13]:
df = u.get_df(gene_entries) # searches in uniprot -> gets results back 
df

INFO:root:fetching information from uniprot for 399 entries
INFO:root:uniprot.get_df 1/3
INFO:root:uniprot.get_df 2/3
INFO:root:uniprot.get_df 3/3
INFO:root:uniprot.get_df 4/3


Unnamed: 0,Entry,Entry name,Gene names,Gene names (primary ),Gene names (synonym ),Gene names (ordered locus ),Gene names (ORF ),Organism,Organism ID,Protein names,...,Miscellaneous [CC],Keywords,Protein existence,Status,Sequence annotation (Features),Protein families,Version,Comments,Cross-reference (null),Pathway.1
0,P36897,TGFR1_HUMAN,[TGFBR1 ALK5 SKR4],TGFBR1,ALK5 SKR4,,,Homo sapiens (Human),9606,TGF-beta receptor type-1 (TGFR-1) (EC 2.7.11.3...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, TKL Ser/Thr prote...",203,"[Alternative products (1), Catalytic activity ...",,
1,P35968,VGFR2_HUMAN,[KDR FLK1 VEGFR2],KDR,FLK1 VEGFR2,,,Homo sapiens (Human),9606,Vascular endothelial growth factor receptor 2 ...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Tyr protein kinas...",200,"[Alternative products (1), Catalytic activity ...",,
2,P21802,FGFR2_HUMAN,[FGFR2 BEK KGFR KSAM],FGFR2,BEK KGFR KSAM,,,Homo sapiens (Human),9606,Fibroblast growth factor receptor 2 (FGFR-2) (...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Tyr protein kinas...",228,"[Alternative products (1), Catalytic activity ...",,
3,P38398,BRCA1_HUMAN,[BRCA1 RNF53],BRCA1,RNF53,,,Homo sapiens (Human),9606,Breast cancer type 1 susceptibility protein (E...,...,,"[3D-structure, Acetylation, Activator, Alterna...",Evidence at protein level,reviewed,,[],228,"[Alternative products (1), Catalytic activity ...",,Protein modification; protein ubiquitination.
4,P42336,PK3CA_HUMAN,[PIK3CA],PIK3CA,,,,Homo sapiens (Human),9606,"Phosphatidylinositol 4,5-bisphosphate 3-kinase...",...,MISCELLANEOUS: The avian sarcoma virus 16 geno...,"[3D-structure, ATP-binding, Angiogenesis, Comp...",Evidence at protein level,reviewed,,[PI3/PI4-kinase family],189,"[Catalytic activity (2), Domain (1), Function ...",,
5,P11362,FGFR1_HUMAN,[FGFR1 BFGFR CEK FGFBR FLG FLT2 HBGFR],FGFR1,BFGFR CEK FGFBR FLG FLT2 HBGFR,,,Homo sapiens (Human),9606,Fibroblast growth factor receptor 1 (FGFR-1) (...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Tyr protein kinas...",233,"[Alternative products (1), Catalytic activity ...",,
6,P27986,P85A_HUMAN,[PIK3R1 GRB1],PIK3R1,GRB1,,,Homo sapiens (Human),9606,Phosphatidylinositol 3-kinase regulatory subun...,...,,"[3D-structure, Acetylation, Alternative splici...",Evidence at protein level,reviewed,,[PI3K p85 subunit family],214,"[Alternative products (1), Caution (1), Domain...",,
7,Q09472,EP300_HUMAN,[EP300 P300],EP300,P300,,,Homo sapiens (Human),9606,Histone acetyltransferase p300 (p300 HAT) (EC ...,...,,"[3D-structure, Acetylation, Acyltransferase, B...",Evidence at protein level,reviewed,,[],222,"[Catalytic activity (1), Domain (1), Function ...",,
8,P12830,CADH1_HUMAN,[CDH1 CDHE UVO],CDH1,CDHE UVO,,,Homo sapiens (Human),9606,Cadherin-1 (CAM 120/80) (Epithelial cadherin) ...,...,,"[3D-structure, Alternative splicing, Calcium, ...",Evidence at protein level,reviewed,,[],219,"[Alternative products (1), Domain (1), Functio...",,
9,P17948,VGFR1_HUMAN,[FLT1 FLT FRT VEGFR1],FLT1,FLT FRT VEGFR1,,,Homo sapiens (Human),9606,Vascular endothelial growth factor receptor 1 ...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Tyr protein kinas...",208,"[Alternative products (1), Catalytic activity ...",,


In [14]:
df_new = df[df['Gene ontology (biological process)'].notnull()] # don't consider genes with no biological process

In [15]:
df_new['Gene ontology (biological process)'] = df_new['Gene ontology (biological process)'].apply(lambda x: x.split('; ')) #split functions based on ;


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [16]:
GO_terms_dict = dict(zip(df_new['Entry name'], df_new['Gene ontology (biological process)']))

In [17]:
GO_terms_dict

{'1A02_HUMAN': ['antibacterial humoral response [GO:0019731]',
  'antigen processing and presentation of endogenous peptide antigen via MHC class I [GO:0019885]',
  'antigen processing and presentation of endogenous peptide antigen via MHC class I via ER pathway, TAP-independent [GO:0002486]',
  'antigen processing and presentation of exogenous peptide antigen via MHC class I, TAP-dependent [GO:0002479]',
  'antigen processing and presentation of exogenous peptide antigen via MHC class I, TAP-independent [GO:0002480]',
  'antigen processing and presentation of peptide antigen via MHC class I [GO:0002474]',
  'defense response to Gram-positive bacterium [GO:0050830]',
  'interferon-gamma-mediated signaling pathway [GO:0060333]',
  'positive regulation of CD8-positive, alpha-beta T cell activation [GO:2001187]',
  'positive regulation of CD8-positive, alpha-beta T cell proliferation [GO:2000566]',
  'positive regulation of interferon-gamma production [GO:0032729]',
  'positive regulation

In [18]:
# Find most common GO terms to use as features
def flatten(l): # taken from https://stackoverflow.com/questions/33900770/most-frequent-values-in-a-dictionary
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, str): #replaced basestring with str for Python3
            for sub in flatten(el):
                yield sub
        else:
            yield el


In [19]:
All_GO_terms = set(list(flatten(GO_terms_dict.values())))
len(All_GO_terms)

3458

In [32]:
# loading the XGboost most important 190 features
features = np.load("biological_bases/features_biological_function.npy")

In [34]:
len(features)

164

In [37]:
# initialize data with the features 
for feature in features:
    data_all[feature] = 0

data_all

Unnamed: 0,Class,Gene,ID,Variation,"negative regulation of transcription, DNA-templated [GO:0045892]",regulation of small GTPase mediated signal transduction [GO:0051056],positive regulation of focal adhesion assembly [GO:0051894],intracellular signal transduction [GO:0035556],regulation of extracellular matrix disassembly [GO:0010715],protein deubiquitination [GO:0016579],...,negative regulation of Ras protein signal transduction [GO:0046580],2-oxoglutarate metabolic process [GO:0006103],positive regulation of smooth muscle cell proliferation [GO:0048661],transcription initiation from RNA polymerase II promoter [GO:0006367],"regulation of alternative mRNA splicing, via spliceosome [GO:0000381]",spermatogenesis [GO:0007283],male gonad development [GO:0008584],protein transport [GO:0015031],cell chemotaxis [GO:0060326],positive regulation of oligodendrocyte differentiation [GO:0048714]
0,1.0,FAM58A,0,Truncating Mutations,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,CBL,1,W802*,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2.0,CBL,2,Q249E,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3.0,CBL,3,N454D,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4.0,CBL,4,L399V,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4.0,CBL,5,V391I,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,5.0,CBL,6,V430M,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1.0,CBL,7,Deletion,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,4.0,CBL,8,Y371H,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,4.0,CBL,9,C384R,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# add 1 if the GO term is inside the gene_entry_dict for a particular gene
for i in data_all.index:
    gene = data_all.Gene[i]
    gene_entry = gene_entry_dict[gene]
    if gene_entry in GO_terms_dict:
        GO_terms = GO_terms_dict[gene_entry]
        features_inside = list(set(GO_terms).intersection(features))# get only features in the GO_terms that we need
        data_all.loc[i, features_inside] = 1

In [39]:
data_all.shape

(4675, 168)

In [40]:
data_all

Unnamed: 0,Class,Gene,ID,Variation,"negative regulation of transcription, DNA-templated [GO:0045892]",regulation of small GTPase mediated signal transduction [GO:0051056],positive regulation of focal adhesion assembly [GO:0051894],intracellular signal transduction [GO:0035556],regulation of extracellular matrix disassembly [GO:0010715],protein deubiquitination [GO:0016579],...,negative regulation of Ras protein signal transduction [GO:0046580],2-oxoglutarate metabolic process [GO:0006103],positive regulation of smooth muscle cell proliferation [GO:0048661],transcription initiation from RNA polymerase II promoter [GO:0006367],"regulation of alternative mRNA splicing, via spliceosome [GO:0000381]",spermatogenesis [GO:0007283],male gonad development [GO:0008584],protein transport [GO:0015031],cell chemotaxis [GO:0060326],positive regulation of oligodendrocyte differentiation [GO:0048714]
0,1.0,FAM58A,0,Truncating Mutations,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,CBL,1,W802*,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,2.0,CBL,2,Q249E,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,3.0,CBL,3,N454D,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,4.0,CBL,4,L399V,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,4.0,CBL,5,V391I,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
6,5.0,CBL,6,V430M,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
7,1.0,CBL,7,Deletion,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
8,4.0,CBL,8,Y371H,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
9,4.0,CBL,9,C384R,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [41]:
# Save the 190 features into one csv file in case we will use it again
data_all.to_csv("biological_bases/all_biological_functions.csv",index=False)

In [42]:
# Do an SVD on the molecular functions to get a reduction to 25 features
svd = TruncatedSVD(n_components=25, n_iter=20, random_state=20)
feature_columns = data_all.iloc[:,4:] #starting from the 4th column we have our features
truncated_molecular = pd.DataFrame(svd.fit_transform(feature_columns.values))


In [43]:
# add truncated molecular functions to our data 
data_new = pd.concat((train, test), axis=0, ignore_index=True)
data_SVD = pd.concat((data_new, truncated_molecular), axis = 1)
data_SVD

Unnamed: 0,Class,Gene,ID,Variation,0,1,2,3,4,5,...,15,16,17,18,19,20,21,22,23,24
0,1.0,FAM58A,0,Truncating Mutations,-3.609233e-21,-1.109024e-14,2.711342e-14,-6.087664e-14,-2.589498e-14,3.769184e-14,...,3.111606e-14,-1.736549e-14,1.714310e-14,-2.791821e-16,-4.559932e-14,3.646797e-14,-6.716253e-15,-1.849650e-15,-2.197315e-14,5.620231e-15
1,2.0,CBL,1,W802*,3.636432e-01,2.706280e-02,-7.453553e-02,5.555070e-02,2.618679e-01,-1.935565e-01,...,1.324105e-02,-6.315577e-02,1.094998e-01,9.069633e-02,-1.415679e-01,-1.573675e-01,5.380147e-01,-4.048268e-02,3.026497e-01,8.245131e-01
2,2.0,CBL,2,Q249E,3.636432e-01,2.706280e-02,-7.453553e-02,5.555070e-02,2.618679e-01,-1.935565e-01,...,1.324105e-02,-6.315577e-02,1.094998e-01,9.069633e-02,-1.415679e-01,-1.573675e-01,5.380147e-01,-4.048268e-02,3.026497e-01,8.245131e-01
3,3.0,CBL,3,N454D,3.636432e-01,2.706280e-02,-7.453553e-02,5.555070e-02,2.618679e-01,-1.935565e-01,...,1.324105e-02,-6.315577e-02,1.094998e-01,9.069633e-02,-1.415679e-01,-1.573675e-01,5.380147e-01,-4.048268e-02,3.026497e-01,8.245131e-01
4,4.0,CBL,4,L399V,3.636432e-01,2.706280e-02,-7.453553e-02,5.555070e-02,2.618679e-01,-1.935565e-01,...,1.324105e-02,-6.315577e-02,1.094998e-01,9.069633e-02,-1.415679e-01,-1.573675e-01,5.380147e-01,-4.048268e-02,3.026497e-01,8.245131e-01
5,4.0,CBL,5,V391I,3.636432e-01,2.706280e-02,-7.453553e-02,5.555070e-02,2.618679e-01,-1.935565e-01,...,1.324105e-02,-6.315577e-02,1.094998e-01,9.069633e-02,-1.415679e-01,-1.573675e-01,5.380147e-01,-4.048268e-02,3.026497e-01,8.245131e-01
6,5.0,CBL,6,V430M,3.636432e-01,2.706280e-02,-7.453553e-02,5.555070e-02,2.618679e-01,-1.935565e-01,...,1.324105e-02,-6.315577e-02,1.094998e-01,9.069633e-02,-1.415679e-01,-1.573675e-01,5.380147e-01,-4.048268e-02,3.026497e-01,8.245131e-01
7,1.0,CBL,7,Deletion,3.636432e-01,2.706280e-02,-7.453553e-02,5.555070e-02,2.618679e-01,-1.935565e-01,...,1.324105e-02,-6.315577e-02,1.094998e-01,9.069633e-02,-1.415679e-01,-1.573675e-01,5.380147e-01,-4.048268e-02,3.026497e-01,8.245131e-01
8,4.0,CBL,8,Y371H,3.636432e-01,2.706280e-02,-7.453553e-02,5.555070e-02,2.618679e-01,-1.935565e-01,...,1.324105e-02,-6.315577e-02,1.094998e-01,9.069633e-02,-1.415679e-01,-1.573675e-01,5.380147e-01,-4.048268e-02,3.026497e-01,8.245131e-01
9,4.0,CBL,9,C384R,3.636432e-01,2.706280e-02,-7.453553e-02,5.555070e-02,2.618679e-01,-1.935565e-01,...,1.324105e-02,-6.315577e-02,1.094998e-01,9.069633e-02,-1.415679e-01,-1.573675e-01,5.380147e-01,-4.048268e-02,3.026497e-01,8.245131e-01


In [44]:
print(svd.explained_variance_ratio_.sum())

0.799525384063


In [45]:
new_names = [] 
for i in range(25):
    new_names.append('biological_SVD_'+str(i+1))

data_SVD.columns = data_SVD.columns[:4].tolist() + new_names

In [46]:
# Save the 25 svd's features into one file 
data_SVD.to_csv("biological_bases/svd25_biological_functions.csv",index=False)