# Get the UniProt GO cellular component terms ready for modelling


In [1]:
# Gene Ontology can be found here: http://geneontology.org/page/ontology-documentation
import numpy as np
import pandas as pd
import string
import os
from collections import Counter
from collections import defaultdict

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD

import re
from bioservices import *
import collections
%pylab inline --no-import-all

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load the training and test variant tables; the paths are relative to the notebook's working directory.
train = pd.read_csv('..//..//bases/new_training_variants.csv')
test = pd.read_csv('..//..//bases/new_test_variants.csv')

In [3]:
# Stack train on top of test with a fresh 0..n-1 index so both sets get the same features
data_all = pd.concat([train, test], axis=0, ignore_index=True)

In [4]:
# Distinct gene symbols appearing in either train or test
all_genes = set(data_all['Gene'])
print(len(all_genes))
print(all_genes)

401
{'EWSR1', 'ITM2B', 'MYD88', 'MCC', 'LRP6', 'ADGRG1', 'MAP3K1', 'ATP2C1', 'WHSC1L1', 'LATS2', 'ALG10', 'CDKN1A', 'BARD1', 'STK19', 'ERBB4', 'SCO1', 'SLC22A4', 'KMT2B', 'NUP93', 'TP53BP1', 'ATM', 'SMARCA4', 'DNAI1', 'KDM6A', 'SIX3', 'BCL2L11', 'ARID1A', 'FGFR3', 'MSH2', 'PHOX2B', 'FLT1', 'CCND2', 'GALK1', 'BRD4', 'XRCC2', 'CDKN1B', 'RET', 'WNT4', 'SOS1', 'STK11', 'SOX17', 'CDK4', 'NF1', 'RP1', 'CDKN2A', 'MOCS2', 'RAD51B', 'FAT1', 'PIK3CA', 'RARA', 'B4GALT7', 'RBBP8', 'EPHA5', 'BRAF', 'NKX2-1', 'CDK8', 'RBM10', 'CTCF', 'SDHB', 'RASA1', 'CDKN2B', 'HLA-A', 'ERF', 'SLC19A2', 'TGFBR1', 'APC', 'HABP2', 'KRIT1', 'AGO2', 'SLC22A5', 'RPS19', 'FOXO1', 'AKT1', 'SLC7A7', 'PIK3CD', 'CLDN16', 'TET1', 'VEGFA', 'BCS1L', 'SF3B2', 'PTCH1', 'MAP2K4', 'RUNX1', 'RAD51D', 'MAP2K2', 'RYBP', 'ARID5B', 'PIK3CB', 'RNF6', 'KRAS', 'KCNQ4', 'MLH1', 'COX15', 'KMT2D', 'EIF2B5', 'PBRM1', 'KLF11', 'PPP6C', 'SF3B1', 'SHQ1', 'SLC4A4', 'U2AF1', 'CDK6', 'FANCC', 'MYC', 'MYOD1', 'BRCA2', 'MET', 'KDR', 'KDM5A', 'NTRK2', '

In [5]:
# UniProt client used by the search() and get_df() calls below
# (presumably bioservices.UniProt, brought in by the star import — confirm).
u = UniProt()

In [6]:
# Log at INFO level so progress messages from the client are shown.
u.debugLevel = "INFO"
# Some queries are long and require more time, so set the timeout explicitly.
# NOTE(review): the original note said "default is 1000 seconds", which sits oddly with
# setting 100 to allow more time — confirm the real default and unit in the bioservices docs.
u.timeout = 100

In [7]:
# Map every gene symbol to the entry name of its first UniProt search hit.
gene_entry_dict = {}
class_dict = {}  # not used in the cells visible here; kept in case other code relies on it
for gene in all_genes:
    # Query by gene name, restricted to organism 9606 (Homo sapiens / human)
    keyword = 'gene:%s+AND+organism:9606' % gene
    entry_name_tab = u.search(keyword, frmt='tab', limit=1, columns="entry name")
    # The tab-formatted reply is parsed as: line 0 = column header, line 1 = entry name.
    # Anything that is not a string (e.g. an error value) is treated as "no result".
    reply_lines = (
        [s.strip() for s in entry_name_tab.splitlines()]
        if isinstance(entry_name_tab, str) else []
    )
    if len(reply_lines) < 2 or not reply_lines[1]:
        # Fail loudly with the offending gene instead of an opaque IndexError;
        # later cells look up gene_entry_dict[gene] for every gene in the data.
        raise ValueError('UniProt returned no entry name for gene %r (query: %s)' % (gene, keyword))
    entry_name = reply_lines[1]
    gene_entry_dict[gene] = entry_name

In [8]:
# Entry names to fetch from UniProt, one per gene symbol
gene_entries = [*gene_entry_dict.values()]
len(gene_entries)

401

In [9]:
# Fetch the UniProt annotation table for all entry names as a DataFrame.
# NOTE(review): the saved log reports 399 entries fetched for 401 genes, which suggests
# some genes resolved to the same entry name — confirm.
df = u.get_df(gene_entries)
df

INFO:root:fetching information from uniprot for 399 entries
INFO:root:uniprot.get_df 1/3
INFO:root:uniprot.get_df 2/3
INFO:root:uniprot.get_df 3/3
INFO:root:uniprot.get_df 4/3


Unnamed: 0,Entry,Entry name,Gene names,Gene names (primary ),Gene names (synonym ),Gene names (ordered locus ),Gene names (ORF ),Organism,Organism ID,Protein names,...,Miscellaneous [CC],Keywords,Protein existence,Status,Sequence annotation (Features),Protein families,Version,Comments,Cross-reference (null),Pathway.1
0,O00429,DNM1L_HUMAN,[DNM1L DLP1 DRP1],DNM1L,DLP1 DRP1,,,Homo sapiens (Human),9606,Dynamin-1-like protein (EC 3.6.5.5) (Dnm1p/Vps...,...,,"[3D-structure, Acetylation, Alternative splici...",Evidence at protein level,reviewed,,"[TRAFAC class dynamin-like GTPase superfamily,...",160,"[Alternative products (1), Catalytic activity ...",,
1,Q06124,PTN11_HUMAN,[PTPN11 PTP2C SHPTP2],PTPN11,PTP2C SHPTP2,,,Homo sapiens (Human),9606,Tyrosine-protein phosphatase non-receptor type...,...,,"[3D-structure, Acetylation, Alternative splici...",Evidence at protein level,reviewed,,"[Protein-tyrosine phosphatase family, Non-rece...",213,"[Alternative products (1), Catalytic activity ...",,
2,P17948,VGFR1_HUMAN,[FLT1 FLT FRT VEGFR1],FLT1,FLT FRT VEGFR1,,,Homo sapiens (Human),9606,Vascular endothelial growth factor receptor 1 ...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Tyr protein kinas...",208,"[Alternative products (1), Catalytic activity ...",,
3,P38936,CDN1A_HUMAN,[CDKN1A CAP20 CDKN1 CIP1 MDA6 PIC1 SDI1 WAF1],CDKN1A,CAP20 CDKN1 CIP1 MDA6 PIC1 SDI1 WAF1,,,Homo sapiens (Human),9606,Cyclin-dependent kinase inhibitor 1 (CDK-inter...,...,,"[3D-structure, Acetylation, Cell cycle, Comple...",Evidence at protein level,reviewed,,[CDI family],199,"[Domain (2), Function (1), Induction (1), Post...",,
4,P04629,NTRK1_HUMAN,[NTRK1 MTC TRK TRKA],NTRK1,MTC TRK TRKA,,,Homo sapiens (Human),9606,High affinity nerve growth factor receptor (EC...,...,MISCELLANEOUS: Trk also stands for tropomyosin...,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Tyr protein kinas...",226,"[Alternative products (1), Catalytic activity ...",,
5,Q15831,STK11_HUMAN,[STK11 LKB1 PJS],STK11,LKB1 PJS,,,Homo sapiens (Human),9606,Serine/threonine-protein kinase STK11 (EC 2.7....,...,,"[3D-structure, ATP-binding, Acetylation, Alter...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, CAMK Ser/Thr prot...",193,"[Alternative products (1), Catalytic activity ...",,
6,P27986,P85A_HUMAN,[PIK3R1 GRB1],PIK3R1,GRB1,,,Homo sapiens (Human),9606,Phosphatidylinositol 3-kinase regulatory subun...,...,,"[3D-structure, Acetylation, Alternative splici...",Evidence at protein level,reviewed,,[PI3K p85 subunit family],214,"[Alternative products (1), Caution (1), Domain...",,
7,O75581,LRP6_HUMAN,[LRP6],LRP6,,,,Homo sapiens (Human),9606,Low-density lipoprotein receptor-related prote...,...,,"[3D-structure, Cell membrane, Complete proteom...",Evidence at protein level,reviewed,,[LDLR family],156,"[Domain (2), Function (1), Induction (1), Invo...",,
8,P46527,CDN1B_HUMAN,[CDKN1B KIP1],CDKN1B,KIP1,,,Homo sapiens (Human),9606,Cyclin-dependent kinase inhibitor 1B (Cyclin-d...,...,"MISCELLANEOUS: Decreased levels of p27Kip1, ma...","[3D-structure, Cell cycle, Complete proteome, ...",Evidence at protein level,reviewed,,[CDI family],192,"[Domain (1), Function (1), Induction (1), Invo...",,
9,P82279,CRUM1_HUMAN,[CRB1],CRB1,,,,Homo sapiens (Human),9606,Protein crumbs homolog 1,...,,"[3D-structure, Alternative splicing, Calcium, ...",Evidence at protein level,reviewed,,[Crumbs protein family],175,"[Alternative products (1), Caution (2), Functi...",,


In [12]:
# Keep only the entries that have a cellular-component GO annotation.
# .copy() detaches df_new from df, so assigning columns on it later does not raise
# pandas' SettingWithCopyWarning.
df_new = df[df['Gene ontology (cellular component)'].notnull()].copy()

In [13]:
# Turn each '; '-separated annotation string into a list of GO terms.
# Work on an independent copy so the column assignment does not write into a slice of df
# (the cause of the SettingWithCopyWarning).
df_new = df_new.copy()
# Only strings are split, so re-running this cell leaves already-split lists untouched.
df_new['Gene ontology (cellular component)'] = df_new['Gene ontology (cellular component)'].apply(
    lambda x: x.split('; ') if isinstance(x, str) else x
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [15]:
# Map each UniProt entry name to its list of cellular-component GO terms
GO_terms_dict = {
    uniprot_entry: go_terms
    for uniprot_entry, go_terms in zip(df_new['Entry name'], df_new['Gene ontology (cellular component)'])
}

In [16]:
GO_terms_dict

{'1A02_HUMAN': ['cell surface [GO:0009986]',
  'early endosome membrane [GO:0031901]',
  'endoplasmic reticulum [GO:0005783]',
  'endoplasmic reticulum exit site [GO:0070971]',
  'endoplasmic reticulum membrane [GO:0005789]',
  'ER to Golgi transport vesicle membrane [GO:0012507]',
  'Golgi apparatus [GO:0005794]',
  'Golgi medial cisterna [GO:0005797]',
  'Golgi membrane [GO:0000139]',
  'integral component of lumenal side of endoplasmic reticulum membrane [GO:0071556]',
  'MHC class I protein complex [GO:0042612]',
  'phagocytic vesicle membrane [GO:0030670]',
  'plasma membrane [GO:0005886]',
  'recycling endosome membrane [GO:0055038]'],
 '1B07_HUMAN': ['cell surface [GO:0009986]',
  'early endosome membrane [GO:0031901]',
  'endoplasmic reticulum [GO:0005783]',
  'ER to Golgi transport vesicle membrane [GO:0012507]',
  'extracellular exosome [GO:0070062]',
  'Golgi apparatus [GO:0005794]',
  'Golgi membrane [GO:0000139]',
  'integral component of lumenal side of endoplasmic reticu

In [17]:
# Find most common GO terms to use as features
def flatten(l): # adapted from https://stackoverflow.com/questions/33900770/most-frequent-values-in-a-dictionary
    """Yield the leaf elements of an arbitrarily nested iterable, depth first.

    Any element that is itself iterable is descended into recursively; strings
    are treated as leaves and yielded whole rather than character by character.

    Parameters
    ----------
    l : iterable
        The (possibly nested) iterable to flatten, e.g. a dict's values view
        whose items are lists of strings.

    Yields
    ------
    object
        Each non-iterable element, and each string, in encounter order.
    """
    # collections.Iterable was removed in Python 3.10; the ABC lives in collections.abc.
    from collections.abc import Iterable

    for el in l:
        if isinstance(el, Iterable) and not isinstance(el, str):
            yield from flatten(el)
        else:
            yield el


In [18]:
# Every distinct GO term that occurs for at least one entry
All_GO_terms = set(flatten(GO_terms_dict.values()))
len(All_GO_terms)

430

In [19]:
# Load the GO-term feature names previously selected as most important by XGBoost.
# NOTE(review): this comment originally said 190 features, but the saved len(features)
# output below is 152 — confirm which number is current.
features = np.load("cellular_bases/features_cellular_function.npy")

In [20]:
len(features)

152

In [21]:
# Create one indicator column per selected GO term, all starting at 0
for go_term in features:
    data_all[go_term] = 0

data_all

Unnamed: 0,Class,Gene,ID,Variation,transcriptional preinitiation complex [GO:0097550],extrinsic component of membrane [GO:0019898],nuclear periphery [GO:0034399],apical junction complex [GO:0043296],mitochondrial nucleoid [GO:0042645],Cul4A-RING E3 ubiquitin ligase complex [GO:0031464],...,mitochondrial matrix [GO:0005759],nucleoplasm [GO:0005654],epsilon DNA polymerase complex [GO:0008622],basal plasma membrane [GO:0009925],immunological synapse [GO:0001772],cyclin E1-CDK2 complex [GO:0097134],pronucleus [GO:0045120],cortical actin cytoskeleton [GO:0030864],ciliary tip [GO:0097542],early endosome [GO:0005769]
0,1.0,FAM58A,0,Truncating Mutations,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,CBL,1,W802*,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2.0,CBL,2,Q249E,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3.0,CBL,3,N454D,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4.0,CBL,4,L399V,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4.0,CBL,5,V391I,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,5.0,CBL,6,V430M,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1.0,CBL,7,Deletion,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,4.0,CBL,8,Y371H,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,4.0,CBL,9,C384R,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [25]:
# Set an indicator column to 1 for every selected GO term annotated on the gene's UniProt entry.
# The annotation depends only on the gene, so it is resolved once per gene and all of that
# gene's rows are written in a single .loc assignment instead of one lookup and write per row.
for gene, row_labels in data_all.groupby('Gene').groups.items():
    gene_entry = gene_entry_dict[gene]
    if gene_entry in GO_terms_dict:
        GO_terms = GO_terms_dict[gene_entry]
        features_inside = list(set(GO_terms).intersection(features))  # only the GO terms that are feature columns
        if features_inside:  # nothing to write when the gene has none of the selected terms
            data_all.loc[row_labels, features_inside] = 1

In [26]:
data_all.shape

(4675, 156)

In [27]:
data_all

Unnamed: 0,Class,Gene,ID,Variation,transcriptional preinitiation complex [GO:0097550],extrinsic component of membrane [GO:0019898],nuclear periphery [GO:0034399],apical junction complex [GO:0043296],mitochondrial nucleoid [GO:0042645],Cul4A-RING E3 ubiquitin ligase complex [GO:0031464],...,mitochondrial matrix [GO:0005759],nucleoplasm [GO:0005654],epsilon DNA polymerase complex [GO:0008622],basal plasma membrane [GO:0009925],immunological synapse [GO:0001772],cyclin E1-CDK2 complex [GO:0097134],pronucleus [GO:0045120],cortical actin cytoskeleton [GO:0030864],ciliary tip [GO:0097542],early endosome [GO:0005769]
0,1.0,FAM58A,0,Truncating Mutations,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,CBL,1,W802*,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2.0,CBL,2,Q249E,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3.0,CBL,3,N454D,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4.0,CBL,4,L399V,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4.0,CBL,5,V391I,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,5.0,CBL,6,V430M,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1.0,CBL,7,Deletion,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,4.0,CBL,8,Y371H,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,4.0,CBL,9,C384R,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [29]:
# Save the full table (metadata plus one 0/1 indicator column per GO-term feature) for reuse.
# NOTE(review): the original comment said 190 features; the saved data_all.shape output
# (4675, 156) implies 152 feature columns — confirm.
data_all.to_csv("cellular_bases/all_cellular_functions.csv",index=False)

In [30]:
# Reduce the GO cellular-component indicator columns to 25 components with a truncated SVD
feature_columns = data_all.iloc[:, 4:]  # the feature columns start at position 4 (0-based)
svd = TruncatedSVD(n_components=25, n_iter=20, random_state=20)
svd_components = svd.fit_transform(feature_columns.values)
truncated_molecular = pd.DataFrame(svd_components)


In [31]:
# Rebuild the plain train+test table and append the SVD components as extra columns
data_new = pd.concat([train, test], axis=0, ignore_index=True)
data_SVD = pd.concat([data_new, truncated_molecular], axis=1)
data_SVD

Unnamed: 0,Class,Gene,ID,Variation,0,1,2,3,4,5,...,15,16,17,18,19,20,21,22,23,24
0,1.0,FAM58A,0,Truncating Mutations,0.489843,-0.122159,-0.108659,-0.053036,-0.161419,0.336464,...,-0.205091,-0.033233,0.307529,-0.164920,0.026095,0.169900,0.111213,-0.053869,0.108243,0.081136
1,2.0,CBL,1,W802*,1.510922,0.661364,0.581828,0.182276,-0.619431,-0.034273,...,-0.886259,0.210519,0.032528,0.235492,-0.279222,0.358453,-0.108003,-0.113755,-0.174818,0.111583
2,2.0,CBL,2,Q249E,1.510922,0.661364,0.581828,0.182276,-0.619431,-0.034273,...,-0.886259,0.210519,0.032528,0.235492,-0.279222,0.358453,-0.108003,-0.113755,-0.174818,0.111583
3,3.0,CBL,3,N454D,1.510922,0.661364,0.581828,0.182276,-0.619431,-0.034273,...,-0.886259,0.210519,0.032528,0.235492,-0.279222,0.358453,-0.108003,-0.113755,-0.174818,0.111583
4,4.0,CBL,4,L399V,1.510922,0.661364,0.581828,0.182276,-0.619431,-0.034273,...,-0.886259,0.210519,0.032528,0.235492,-0.279222,0.358453,-0.108003,-0.113755,-0.174818,0.111583
5,4.0,CBL,5,V391I,1.510922,0.661364,0.581828,0.182276,-0.619431,-0.034273,...,-0.886259,0.210519,0.032528,0.235492,-0.279222,0.358453,-0.108003,-0.113755,-0.174818,0.111583
6,5.0,CBL,6,V430M,1.510922,0.661364,0.581828,0.182276,-0.619431,-0.034273,...,-0.886259,0.210519,0.032528,0.235492,-0.279222,0.358453,-0.108003,-0.113755,-0.174818,0.111583
7,1.0,CBL,7,Deletion,1.510922,0.661364,0.581828,0.182276,-0.619431,-0.034273,...,-0.886259,0.210519,0.032528,0.235492,-0.279222,0.358453,-0.108003,-0.113755,-0.174818,0.111583
8,4.0,CBL,8,Y371H,1.510922,0.661364,0.581828,0.182276,-0.619431,-0.034273,...,-0.886259,0.210519,0.032528,0.235492,-0.279222,0.358453,-0.108003,-0.113755,-0.174818,0.111583
9,4.0,CBL,9,C384R,1.510922,0.661364,0.581828,0.182276,-0.619431,-0.034273,...,-0.886259,0.210519,0.032528,0.235492,-0.279222,0.358453,-0.108003,-0.113755,-0.174818,0.111583


In [32]:
print(svd.explained_variance_ratio_.sum())

0.848634838301


In [33]:
# Name the 25 SVD component columns cellular_SVD_1 ... cellular_SVD_25,
# keeping the first four column names as they are
new_names = ['cellular_SVD_' + str(component) for component in range(1, 26)]

data_SVD.columns = data_SVD.columns[:4].tolist() + new_names

In [34]:
# Save the leading metadata columns together with the 25 SVD component columns to one CSV file
data_SVD.to_csv("cellular_bases/svd25_cellular.csv",index=False)