# Get the uniprot GO terms of biological process ready for modelling


In [2]:
# Gene Ontology can be found here: http://geneontology.org/page/ontology-documentation
import numpy as np
import pandas as pd
import string
import os
from collections import Counter
from collections import defaultdict

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD

import re
from bioservices import *
import collections
%pylab inline --no-import-all

Populating the interactive namespace from numpy and matplotlib


In [3]:
train = pd.read_csv('..//bases/training_variants')
test = pd.read_csv('..//bases/test_variants')

In [4]:
data_all = pd.concat((train, test), axis=0, ignore_index=True)

In [5]:
all_genes = set(data_all.Gene)
print(len(all_genes))
print(all_genes)

1507
{'GNPTG', 'FAAH', 'SUMF1', 'CDKL5', 'EWSR1', 'FOXF1', 'MINPP1', 'IDUA', 'C1GALT1C1', 'PLP1', 'TGIF1', 'DPM1', 'PLOD3', 'ANKK1', 'BCS1L', 'ERBB3', 'GFM1', 'L1CAM', 'MEF2B', 'SLC2A9', 'ATP7B', 'DYNC2H1', 'C7', 'ZMPSTE24', 'SC5D', 'AGTR2', 'UPK3A', 'TET1', 'OXCT1', 'DLL3', 'CANT1', 'CDAN1', 'AMHR2', 'ALDH3A2', 'BSND', 'CLCN2', 'SLC10A2', 'ARHGAP9', 'EDA', 'CAV3', 'NHLRC1', 'GTF2H5', 'NCOR1', 'RAC1', 'F9', 'MKKS', 'SMS', 'RAD21', 'CDH1', 'CNGB3', 'TMEM43', 'PON2', 'SLC34A3', 'KCNJ11', 'BBS2', 'CA2', 'ATP2C1', 'NPHP4', 'ATP6V0A4', 'TRPM4', 'CLN5', 'PAFAH1B1', 'ERCC8', 'RGR', 'FOXC1', 'FBN2', 'CBS', 'PLCE1', 'IGFALS', 'TUBB1', 'GARS', 'CPN1', 'HPD', 'DNMT3B', 'CHMP4B', 'SLC26A2', 'CLCF1', 'ACVR2B', 'IRS2', 'ABCA12', 'F12', 'PPARGC1B', 'MAP2K4', 'CDKN1A', 'PER2', 'AICDA', 'GLB1', 'INF2', 'IKZF1', 'STK11', 'NDUFS3', 'CLCNKA', 'PAPSS2', 'HMCN1', 'FREM1', 'TCF7L2', 'NKX2-1', 'RBM20', 'ESCO2', 'SLC19A2', 'GHRL', 'EXT2', 'ALK', 'TPCN2', 'BMPR1A', 'DPYD', 'LAMP2', 'LZTS1', 'KMT2D', 'MAPK1', 'F

In [6]:
u = UniProt()

In [7]:
u.debugLevel = "INFO"
u.timeout = 100   # some queries are long and requires much more time; default is 1000 seconds

In [8]:
gene_entry_dict = {}
class_dict = {}
for gene in all_genes:
    keyword = 'gene:%s+AND+organism:9606' %gene #to query database, with gene and organism 9606 is Homo Sapien (human)
    entry_name_tab = u.search(keyword, frmt='tab', limit=1, columns="entry name") 
    entry_name = [s.strip() for s in entry_name_tab.splitlines()][1] # gets the entry name = in second position in list
    gene_entry_dict[gene] = entry_name

In [9]:
gene_entries = list(gene_entry_dict.values())
len(gene_entries)

1507

In [10]:
df = u.get_df(gene_entries) # searches in uniprot -> gets results back 
df

INFO:root:fetching information from uniprot for 1499 entries
INFO:root:uniprot.get_df 1/14
INFO:root:uniprot.get_df 2/14
INFO:root:uniprot.get_df 3/14
INFO:root:uniprot.get_df 4/14
INFO:root:uniprot.get_df 5/14
INFO:root:uniprot.get_df 6/14
INFO:root:uniprot.get_df 7/14
INFO:root:uniprot.get_df 8/14
INFO:root:uniprot.get_df 9/14
INFO:root:uniprot.get_df 10/14
INFO:root:uniprot.get_df 11/14
INFO:root:uniprot.get_df 12/14
INFO:root:uniprot.get_df 13/14
INFO:root:uniprot.get_df 14/14
INFO:root:uniprot.get_df 15/14


Unnamed: 0,Entry,Entry name,Gene names,Gene names (primary ),Gene names (synonym ),Gene names (ordered locus ),Gene names (ORF ),Organism,Organism ID,Protein names,...,Miscellaneous [CC],Keywords,Protein existence,Status,Sequence annotation (Features),Protein families,Version,Comments,Cross-reference (null),Pathway.1
0,Q06124,PTN11_HUMAN,[PTPN11 PTP2C SHPTP2],PTPN11,PTP2C SHPTP2,,,Homo sapiens (Human),9606,Tyrosine-protein phosphatase non-receptor type...,...,,"[3D-structure, Acetylation, Alternative splici...",Evidence at protein level,reviewed,,"[Protein-tyrosine phosphatase family, Non-rece...",213,"[Alternative products (1), Catalytic activity ...",,
1,P27986,P85A_HUMAN,[PIK3R1 GRB1],PIK3R1,GRB1,,,Homo sapiens (Human),9606,Phosphatidylinositol 3-kinase regulatory subun...,...,,"[3D-structure, Acetylation, Alternative splici...",Evidence at protein level,reviewed,,[PI3K p85 subunit family],214,"[Alternative products (1), Caution (1), Domain...",,
2,P08069,IGF1R_HUMAN,[IGF1R],IGF1R,,,,Homo sapiens (Human),9606,Insulin-like growth factor 1 receptor (EC 2.7....,...,,"[3D-structure, ATP-binding, Cell membrane, Cle...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Tyr protein kinas...",207,"[Catalytic activity (1), Enzyme regulation (1)...",,
3,P51587,BRCA2_HUMAN,[BRCA2 FACD FANCD1],BRCA2,FACD FANCD1,,,Homo sapiens (Human),9606,Breast cancer type 2 susceptibility protein (F...,...,,"[3D-structure, Cell cycle, Complete proteome, ...",Evidence at protein level,reviewed,,[],197,"[Function (1), Involvement in disease (5), Pos...",,
4,P13498,CY24A_HUMAN,[CYBA],CYBA,,,,Homo sapiens (Human),9606,Cytochrome b-245 light chain (Cytochrome b(558...,...,,"[3D-structure, Cell membrane, Chronic granulom...",Evidence at protein level,reviewed,,[P22phox family],179,"[Function (1), Involvement in disease (1), Pos...",,
5,P82279,CRUM1_HUMAN,[CRB1],CRB1,,,,Homo sapiens (Human),9606,Protein crumbs homolog 1,...,,"[3D-structure, Alternative splicing, Calcium, ...",Evidence at protein level,reviewed,,[Crumbs protein family],175,"[Alternative products (1), Caution (2), Functi...",,
6,Q13402,MYO7A_HUMAN,[MYO7A USH1B],MYO7A,USH1B,,,Homo sapiens (Human),9606,Unconventional myosin-VIIa,...,,"[ATP-binding, Actin-binding, Alternative splic...",Evidence at protein level,reviewed,,[TRAFAC class myosin-kinesin ATPase superfamil...,198,"[Alternative products (1), Caution (2), Develo...",,
7,O00255,MEN1_HUMAN,[MEN1 SCG2],MEN1,SCG2,,,Homo sapiens (Human),9606,Menin,...,,"[3D-structure, Alternative splicing, Chromatin...",Evidence at protein level,reviewed,,[],174,"[Alternative products (1), Function (1), Invol...",,
8,O60566,BUB1B_HUMAN,[BUB1B BUBR1 MAD3L SSK1],BUB1B,BUBR1 MAD3L SSK1,,,Homo sapiens (Human),9606,Mitotic checkpoint serine/threonine-protein ki...,...,,"[3D-structure, ATP-binding, Acetylation, Alter...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Ser/Thr protein k...",179,"[Alternative products (1), Catalytic activity ...",,
9,O95255,MRP6_HUMAN,[ABCC6 ARA MRP6],ABCC6,ARA MRP6,,,Homo sapiens (Human),9606,Multidrug resistance-associated protein 6 (ATP...,...,,"[ATP-binding, Alternative splicing, Cell membr...",Evidence at protein level,reviewed,,"[ABC transporter superfamily, ABCC family, Con...",188,"[Alternative products (1), Caution (1), Functi...",,


In [11]:
df_new = df[df['Gene ontology (biological process)'].notnull()] # don't consider genes with no biological process

In [19]:
df_new['Gene ontology (biological process)'] = df_new['Gene ontology (biological process)'].apply(lambda x: x.split('; ')) #split functions based on ;


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [20]:
GO_terms_dict = dict(zip(df_new['Entry name'], df_new['Gene ontology (biological process)']))

In [21]:
GO_terms_dict

{'PTN11_HUMAN': ['abortive mitotic cell cycle [GO:0033277]',
  'activation of MAPK activity [GO:0000187]',
  'atrioventricular canal development [GO:0036302]',
  'axon guidance [GO:0007411]',
  'Bergmann glial cell differentiation [GO:0060020]',
  'brain development [GO:0007420]',
  'cellular response to epidermal growth factor stimulus [GO:0071364]',
  'cellular response to mechanical stimulus [GO:0071260]',
  'cerebellar cortex formation [GO:0021697]',
  'DNA damage checkpoint [GO:0000077]',
  'ephrin receptor signaling pathway [GO:0048013]',
  'epidermal growth factor receptor signaling pathway [GO:0007173]',
  'ERBB signaling pathway [GO:0038127]',
  'face morphogenesis [GO:0060325]',
  'fibroblast growth factor receptor signaling pathway [GO:0008543]',
  'genitalia development [GO:0048806]',
  'glucose homeostasis [GO:0042593]',
  'heart development [GO:0007507]',
  'homeostasis of number of cells within a tissue [GO:0048873]',
  'hormone-mediated signaling pathway [GO:0009755]',


In [23]:
# Find most common GO terms to use as features
def flatten(l): # taken from https://stackoverflow.com/questions/33900770/most-frequent-values-in-a-dictionary
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, str): #replaced basestring with str for Python3
            for sub in flatten(el):
                yield sub
        else:
            yield el


In [25]:
All_GO_terms = set(list(flatten(GO_terms_dict.values())))
len(All_GO_terms)

6188

In [25]:
# loading the XGboost most important 190 features
feature_scores = np.load("features_molecular_function.npy")

In [26]:
features = []
for feature_score in feature_scores:
    feature = feature_score[0]
    features.append(feature)

In [27]:
len(features)

190

In [108]:
# initialize data with the features 
for feature in features:
    data_all[feature] = 0

In [109]:
# add 1 if the GO term is inside the gene_entry_dict for a particular gene
for i in data_all.index:
    gene = data_all.Gene[i]
    gene_entry = gene_entry_dict[gene]
    if gene_entry in GO_terms_dict:
        GO_terms = GO_terms_dict[gene_entry]
        features_inside = list(set(GO_terms).intersection(features))# get only features in the GO_terms that we need
        data_all.loc[i, features_inside] = 1

In [110]:
data_all.shape

(8989, 194)

In [111]:
data_all

Unnamed: 0,Class,Gene,ID,Variation,magnesium ion binding [GO:0000287],protein tyrosine kinase activity [GO:0004713],enzyme binding [GO:0019899],ATP binding [GO:0005524],kinase activity [GO:0016301],transcription corepressor activity [GO:0003714],...,epidermal growth factor receptor binding [GO:0005154],phospholipase binding [GO:0043274],patched binding [GO:0005113],integrin binding [GO:0005178],hepatocyte growth factor-activated receptor activity [GO:0005008],transcription cofactor binding [GO:0001221],core promoter sequence-specific DNA binding [GO:0001046],platelet-derived growth factor binding [GO:0048407],"phosphatidylinositol-3,4-bisphosphate binding [GO:0043325]",interleukin-12 receptor binding [GO:0005143]
0,1.0,FAM58A,0,Truncating Mutations,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,CBL,1,W802*,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,2.0,CBL,2,Q249E,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,3.0,CBL,3,N454D,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,4.0,CBL,4,L399V,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,4.0,CBL,5,V391I,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6,5.0,CBL,6,V430M,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,1.0,CBL,7,Deletion,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8,4.0,CBL,8,Y371H,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,4.0,CBL,9,C384R,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [112]:
# Save the 190 features into one csv file in case we will use it again
data_all.to_csv("all_molecular_functions.csv",index=False)

In [102]:
# Do an SVD on the molecular functions to get a reduction to 25 features
svd = TruncatedSVD(n_components=40, n_iter=20, random_state=18)
feature_columns = data_all.iloc[:,4:] #starting from the 4th column we have our features
truncated_molecular = pd.DataFrame(svd.fit_transform(feature_columns.values))


In [103]:
# add truncated molecular functions to our data 
data_new = pd.concat((train, test), axis=0, ignore_index=True)
data_SVD = pd.concat((data_new, truncated_molecular), axis = 1)
data_SVD

Unnamed: 0,Class,Gene,ID,Variation,0,1,2,3,4,5,...,30,31,32,33,34,35,36,37,38,39
0,1.0,FAM58A,0,Truncating Mutations,0.007159,0.013617,0.001422,-0.031347,0.006252,-0.030863,...,-0.119820,-0.042245,-0.024528,0.045054,0.043003,0.030717,0.003989,-0.099006,-0.077494,0.046300
1,2.0,CBL,1,W802*,0.368832,-0.153924,0.132930,0.052214,0.012198,0.354159,...,-0.700573,1.000566,-0.244497,-0.171683,-0.142837,0.279335,0.170585,-0.199989,0.271046,0.106479
2,2.0,CBL,2,Q249E,0.368832,-0.153924,0.132930,0.052214,0.012198,0.354159,...,-0.700573,1.000566,-0.244497,-0.171683,-0.142837,0.279335,0.170585,-0.199989,0.271046,0.106479
3,3.0,CBL,3,N454D,0.368832,-0.153924,0.132930,0.052214,0.012198,0.354159,...,-0.700573,1.000566,-0.244497,-0.171683,-0.142837,0.279335,0.170585,-0.199989,0.271046,0.106479
4,4.0,CBL,4,L399V,0.368832,-0.153924,0.132930,0.052214,0.012198,0.354159,...,-0.700573,1.000566,-0.244497,-0.171683,-0.142837,0.279335,0.170585,-0.199989,0.271046,0.106479
5,4.0,CBL,5,V391I,0.368832,-0.153924,0.132930,0.052214,0.012198,0.354159,...,-0.700573,1.000566,-0.244497,-0.171683,-0.142837,0.279335,0.170585,-0.199989,0.271046,0.106479
6,5.0,CBL,6,V430M,0.368832,-0.153924,0.132930,0.052214,0.012198,0.354159,...,-0.700573,1.000566,-0.244497,-0.171683,-0.142837,0.279335,0.170585,-0.199989,0.271046,0.106479
7,1.0,CBL,7,Deletion,0.368832,-0.153924,0.132930,0.052214,0.012198,0.354159,...,-0.700573,1.000566,-0.244497,-0.171683,-0.142837,0.279335,0.170585,-0.199989,0.271046,0.106479
8,4.0,CBL,8,Y371H,0.368832,-0.153924,0.132930,0.052214,0.012198,0.354159,...,-0.700573,1.000566,-0.244497,-0.171683,-0.142837,0.279335,0.170585,-0.199989,0.271046,0.106479
9,4.0,CBL,9,C384R,0.368832,-0.153924,0.132930,0.052214,0.012198,0.354159,...,-0.700573,1.000566,-0.244497,-0.171683,-0.142837,0.279335,0.170585,-0.199989,0.271046,0.106479


In [104]:
print(svd.explained_variance_ratio_.sum())

0.815991690466


In [32]:
genes_train = set(train.Gene)
len(genes_train)

264

In [67]:
#### Add the dummy variables for gene to the data
# first initialize with the 264 different genes
for gene in genes_train:
    data_SVD[gene] = 0

In [68]:
for i in data_SVD.index:
    gene = data_SVD.Gene[i]
    if gene in genes_train:
        data_SVD.loc[i, gene] = 1 

In [75]:
# Do an SVD on the molecular functions to get a reduction to 25 features
feature_columns = data_SVD.iloc[:,29:] #starting from the 4th column we have our features
feature_columns
truncated_molecular = pd.DataFrame(svd.fit_transform(feature_columns.values))
svd.

ValueError: n_components must be < n_features; got 25 >= 0

In [74]:
# add truncated molecular functions to our data 
data_new = pd.concat((train, test), axis=0, ignore_index=True)
data_SVD = pd.concat((data_new, truncated_molecular), axis = 1)
data_SVD

Unnamed: 0,Class,Gene,ID,Variation,0,1,2,3,4,5,...,15,16,17,18,19,20,21,22,23,24
0,1.0,FAM58A,0,Truncating Mutations,-1.161481e-23,-3.698891e-19,9.225550e-20,-5.680264e-20,2.335886e-21,3.214391e-22,...,1.034778e-25,1.829459e-24,4.086044e-24,-1.926360e-24,-1.971919e-24,1.443782e-25,-1.019774e-22,5.252388e-23,3.190730e-22,-7.535701e-22
1,2.0,CBL,1,W802*,-2.701430e-20,-6.268249e-16,8.218057e-16,1.716798e-15,9.189731e-16,1.639306e-14,...,3.684259e-08,-8.060685e-07,9.679444e-08,3.063973e-06,1.935051e-06,3.781824e-06,1.559796e-05,-3.240568e-04,8.344012e-05,-2.725758e-04
2,2.0,CBL,2,Q249E,-7.113213e-20,5.214321e-16,-1.857296e-14,1.550278e-14,7.155532e-15,1.639546e-14,...,3.684259e-08,-8.060685e-07,9.679444e-08,3.063973e-06,1.935051e-06,3.781824e-06,1.559796e-05,-3.240568e-04,8.344012e-05,-2.725758e-04
3,3.0,CBL,3,N454D,2.597770e-20,-1.683205e-15,2.054382e-14,-1.677792e-14,1.310019e-14,1.668098e-14,...,3.684259e-08,-8.060685e-07,9.679444e-08,3.063973e-06,1.935051e-06,3.781824e-06,1.559796e-05,-3.240568e-04,8.344012e-05,-2.725758e-04
4,4.0,CBL,4,L399V,-2.913790e-20,-5.029867e-16,-3.304991e-16,1.260797e-15,-1.293249e-15,1.597275e-14,...,3.684259e-08,-8.060685e-07,9.679444e-08,3.063973e-06,1.935051e-06,3.781824e-06,1.559796e-05,-3.240568e-04,8.344012e-05,-2.725758e-04
5,4.0,CBL,5,V391I,-2.809506e-20,-5.249856e-16,5.780590e-17,9.225718e-16,-9.752923e-16,1.656248e-14,...,3.684259e-08,-8.060685e-07,9.679444e-08,3.063973e-06,1.935051e-06,3.781824e-06,1.559796e-05,-3.240568e-04,8.344012e-05,-2.725758e-04
6,5.0,CBL,6,V430M,-2.808945e-20,-5.251659e-16,6.103512e-17,9.201825e-16,-9.789602e-16,1.636419e-14,...,3.684259e-08,-8.060685e-07,9.679444e-08,3.063973e-06,1.935051e-06,3.781824e-06,1.559796e-05,-3.240568e-04,8.344012e-05,-2.725758e-04
7,1.0,CBL,7,Deletion,-2.808909e-20,-5.251614e-16,6.096250e-17,9.199872e-16,-9.775340e-16,1.639267e-14,...,3.684259e-08,-8.060685e-07,9.679444e-08,3.063973e-06,1.935051e-06,3.781824e-06,1.559796e-05,-3.240568e-04,8.344012e-05,-2.725758e-04
8,4.0,CBL,8,Y371H,-2.808910e-20,-5.251615e-16,6.096299e-17,9.199790e-16,-9.775404e-16,1.639356e-14,...,3.684259e-08,-8.060685e-07,9.679444e-08,3.063973e-06,1.935051e-06,3.781824e-06,1.559796e-05,-3.240568e-04,8.344012e-05,-2.725758e-04
9,4.0,CBL,9,C384R,-2.808910e-20,-5.251615e-16,6.096308e-17,9.199773e-16,-9.775422e-16,1.639365e-14,...,3.684259e-08,-8.060685e-07,9.679444e-08,3.063973e-06,1.935051e-06,3.781824e-06,1.559796e-05,-3.240568e-04,8.344012e-05,-2.725758e-04
