# Get the uniprot GO terms of molecular function ready for modelling

In [95]:
# Gene Ontology can be found here: http://geneontology.org/page/ontology-documentation
import numpy as np
import pandas as pd
import string
import os
from collections import Counter
from collections import defaultdict

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD

import re
from bioservices import *
import collections
%pylab inline --no-import-all

Populating the interactive namespace from numpy and matplotlib


In [3]:
train = pd.read_csv('..//bases/training_variants')
test = pd.read_csv('..//bases/test_variants')

In [106]:
data_all = pd.concat((train, test), axis=0, ignore_index=True)

In [5]:
all_genes = set(data_all.Gene)
print(len(all_genes))
print(all_genes)

1507
{'PEX13', 'GNAS', 'ADGRV1', 'MESP2', 'NDUFS8', 'OPCML', 'EFHC1', 'CNNM4', 'TBX5', 'CASR', 'PHEX', 'QDPR', 'TRPM7', 'FAT1', 'RSPO4', 'CACNB2', 'ZDHHC9', 'ARSB', 'ERCC8', 'SMARCAL1', 'MYO1A', 'SCN5A', 'FANCI', 'CBX2', 'ETFDH', 'PLEKHG5', 'PCDH19', 'SAMD9', 'EGLN1', 'PIKFYVE', 'DPAGT1', 'NOG', 'SLC34A3', 'FAM20C', 'KCNQ3', 'PPARG', 'HGD', 'PHF8', 'SIL1', 'WHSC1L1', 'PROP1', 'RANBP2', 'POMT1', 'GYG1', 'DOCK8', 'PLA2G7', 'PROK2', 'HMCN1', 'HFE', 'CASP8', 'CD40LG', 'NLRP1', 'APOA5', 'CYLD', 'PAH', 'ZNF81', 'FREM1', 'PEX10', 'CX3CR1', 'ADAMTSL2', 'DNAAF1', 'CRELD1', 'KRT85', 'PCCA', 'OGG1', 'SHOC2', 'PKLR', 'TMPRSS6', 'LIPA', 'MYO6', 'GDAP1', 'CASP10', 'SIX1', 'LYST', 'USH1C', 'ZIC3', 'SLC2A10', 'UMPS', 'HFE2', 'PYY', 'PTPRJ', 'HAMP', 'SCN1B', 'TREX1', 'SNRNP200', 'BBS10', 'LRP8', 'POLG2', 'KDR', 'AMACR', 'PNPLA6', 'IMPG2', 'PIK3R3', 'BAG3', 'JAK1', 'AGXT', 'ATM', 'POU4F3', 'GNE', 'PITX2', 'TAS2R38', 'TWIST1', 'AXL', 'ACSL4', 'ATP7B', 'SLC35D1', 'SUOX', 'CDKN1B', 'ABCD1', 'BCKDHB', 'ABL1

In [6]:
u = UniProt()

In [7]:
u.debugLevel = "INFO"
u.timeout = 100   # some queries are long and requires much more time; default is 1000 seconds

In [8]:
gene_entry_dict = {}
class_dict = {}
for gene in all_genes:
    keyword = 'gene:%s+AND+organism:9606' %gene #to query database, with gene and organism 9606 is Homo Sapien (human)
    entry_name_tab = u.search(keyword, frmt='tab', limit=1, columns="entry name") 
    entry_name = [s.strip() for s in entry_name_tab.splitlines()][1] # gets the entry name = in second position in list
    gene_entry_dict[gene] = entry_name

In [12]:
gene_entries = list(gene_entry_dict.values())
len(gene_entries)

1507

In [13]:
df = u.get_df(gene_entries) # searches in uniprot -> gets results back 
df

INFO:root:fetching information from uniprot for 1499 entries
INFO:root:uniprot.get_df 1/14
INFO:root:uniprot.get_df 2/14
INFO:root:uniprot.get_df 3/14
INFO:root:uniprot.get_df 4/14
INFO:root:uniprot.get_df 5/14
INFO:root:uniprot.get_df 6/14
INFO:root:uniprot.get_df 7/14
INFO:root:uniprot.get_df 8/14
INFO:root:uniprot.get_df 9/14
INFO:root:uniprot.get_df 10/14
INFO:root:uniprot.get_df 11/14
INFO:root:uniprot.get_df 12/14
INFO:root:uniprot.get_df 13/14
INFO:root:uniprot.get_df 14/14
INFO:root:uniprot.get_df 15/14


Unnamed: 0,Entry,Entry name,Gene names,Gene names (primary ),Gene names (synonym ),Gene names (ordered locus ),Gene names (ORF ),Organism,Organism ID,Protein names,...,Miscellaneous [CC],Keywords,Protein existence,Status,Sequence annotation (Features),Protein families,Version,Comments,Cross-reference (null),Pathway.1
0,P46531,NOTC1_HUMAN,[NOTCH1 TAN1],NOTCH1,TAN1,,,Homo sapiens (Human),9606,Neurogenic locus notch homolog protein 1 (Notc...,...,,"[3D-structure, ANK repeat, Activator, Angiogen...",Evidence at protein level,reviewed,,[NOTCH family],211,"[Function (1), Involvement in disease (2), Pos...",,
1,P42336,PK3CA_HUMAN,[PIK3CA],PIK3CA,,,,Homo sapiens (Human),9606,"Phosphatidylinositol 4,5-bisphosphate 3-kinase...",...,MISCELLANEOUS: The avian sarcoma virus 16 geno...,"[3D-structure, ATP-binding, Angiogenesis, Comp...",Evidence at protein level,reviewed,,[PI3/PI4-kinase family],189,"[Catalytic activity (2), Domain (1), Function ...",,
2,P51532,SMCA4_HUMAN,[SMARCA4 BAF190A BRG1 SNF2B SNF2L4],SMARCA4,BAF190A BRG1 SNF2B SNF2L4,,,Homo sapiens (Human),9606,Transcription activator BRG1 (EC 3.6.4.-) (ATP...,...,,"[3D-structure, ATP-binding, Acetylation, Activ...",Evidence at protein level,reviewed,,[SNF2/RAD54 helicase family],199,"[Alternative products (1), Caution (1), Functi...",,
3,P10275,ANDR_HUMAN,[AR DHTR NR3C4],AR,DHTR NR3C4,,,Homo sapiens (Human),9606,Androgen receptor (Dihydrotestosterone recepto...,...,"MISCELLANEOUS: In the absence of ligand, stero...","[3D-structure, Activator, Alternative splicing...",Evidence at protein level,reviewed,,"[Nuclear hormone receptor family, NR3 subfamily]",253,"[Alternative products (1), Caution (2), Domain...",,
4,Q8IU80,TMPS6_HUMAN,[TMPRSS6 UNQ354/PRO618],TMPRSS6,,,UNQ354/PRO618,Homo sapiens (Human),9606,Transmembrane protease serine 6 (EC 3.4.21.-) ...,...,,"[Alternative splicing, Cell membrane, Complete...",Evidence at protein level,reviewed,,[Peptidase S1 family],144,"[Alternative products (1), Caution (2), Domain...",,
5,P63000,RAC1_HUMAN,[RAC1 TC25 MIG5],RAC1,TC25,,MIG5,Homo sapiens (Human),9606,Ras-related C3 botulinum toxin substrate 1 (Ce...,...,,"[3D-structure, ADP-ribosylation, Alternative s...",Evidence at protein level,reviewed,,"[Small GTPase superfamily, Rho family]",170,"[Alternative products (1), Domain (1), Enzyme ...",,
6,O95999,BCL10_HUMAN,[BCL10 CIPER CLAP],BCL10,CIPER CLAP,,,Homo sapiens (Human),9606,B-cell lymphoma/leukemia 10 (B-cell CLL/lympho...,...,,"[3D-structure, Acetylation, Apoptosis, Chromos...",Evidence at protein level,reviewed,,[],158,"[Function (1), Involvement in disease (3), Pos...",,
7,Q9NQ11,AT132_HUMAN,[ATP13A2 PARK9],ATP13A2,PARK9,,,Homo sapiens (Human),9606,Cation-transporting ATPase 13A2 (EC 3.6.3.-),...,,"[ATP-binding, Alternative splicing, Complete p...",Evidence at protein level,reviewed,,[Cation transport ATPase (P-type) (TC 3.A.3) f...,160,"[Alternative products (1), Catalytic activity ...",,
8,Q8WWY3,PRP31_HUMAN,[PRPF31 PRP31],PRPF31,PRP31,,,Homo sapiens (Human),9606,U4/U6 small nuclear ribonucleoprotein Prp31 (P...,...,,"[3D-structure, Acetylation, Alternative splici...",Evidence at protein level,reviewed,,[PRP31 family],141,"[Alternative products (1), Caution (3), Domain...",,
9,O94972,TRI37_HUMAN,[TRIM37 KIAA0898 MUL POB1],TRIM37,KIAA0898 MUL POB1,,,Homo sapiens (Human),9606,E3 ubiquitin-protein ligase TRIM37 (EC 2.3.2.2...,...,MISCELLANEOUS: Acts as a proto-oncogene via it...,"[3D-structure, Acetylation, Alternative splici...",Evidence at protein level,reviewed,,[TRIM/RBCC family],162,"[Alternative products (1), Catalytic activity ...",,Protein modification; protein ubiquitination.


In [14]:
df_new = df[df['Gene ontology (molecular function)'].notnull()] # don't consider genes with no molecular function

In [113]:
df_new['Gene ontology (molecular function)'] = df_new['Gene ontology (molecular function)'].apply(lambda x: x.split('; ')) #split functions based on ;


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [117]:
GO_terms_dict = dict(zip(df_new['Entry name'], df_new['Gene ontology (molecular function)']))

In [118]:
GO_terms_dict

{'NOTC1_HUMAN': ['calcium ion binding [GO:0005509]',
  'chromatin DNA binding [GO:0031490]',
  'core promoter binding [GO:0001047]',
  'enzyme binding [GO:0019899]',
  'enzyme inhibitor activity [GO:0004857]',
  'Notch binding [GO:0005112]',
  'protein heterodimerization activity [GO:0046982]',
  'receptor activity [GO:0004872]',
  'sequence-specific DNA binding [GO:0043565]',
  'transcriptional activator activity, RNA polymerase II transcription factor binding [GO:0001190]',
  'transcription factor activity, sequence-specific DNA binding [GO:0003700]'],
 'PK3CA_HUMAN': ['1-phosphatidylinositol-3-kinase activity [GO:0016303]',
  '1-phosphatidylinositol-4-phosphate 3-kinase activity [GO:0035005]',
  'ATP binding [GO:0005524]',
  'insulin receptor substrate binding [GO:0043560]',
  'kinase activity [GO:0016301]',
  'phosphatidylinositol 3-kinase activity [GO:0035004]',
  'phosphatidylinositol-4,5-bisphosphate 3-kinase activity [GO:0046934]',
  'protein kinase activator activity [GO:00302

In [119]:
# Find most common GO terms to use as features
def flatten(l): # taken from https://stackoverflow.com/questions/33900770/most-frequent-values-in-a-dictionary
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, str): #replaced basestring with str for Python3
            for sub in flatten(el):
                yield sub
        else:
            yield el


In [120]:
All_GO_terms = list(flatten(GO_terms_dict.values()))
len(set(All_GO_terms))

1742

In [25]:
# loading the XGboost most important 190 features
feature_scores = np.load("features_molecular_function.npy")

In [26]:
features = []
for feature_score in feature_scores:
    feature = feature_score[0]
    features.append(feature)

In [27]:
len(features)

190

In [108]:
# initialize data with the features 
for feature in features:
    data_all[feature] = 0

In [109]:
# add 1 if the GO term is inside the gene_entry_dict for a particular gene
for i in data_all.index:
    gene = data_all.Gene[i]
    gene_entry = gene_entry_dict[gene]
    if gene_entry in GO_terms_dict:
        GO_terms = GO_terms_dict[gene_entry]
        GO_terms = GO_terms.split('; ')# we have to split those again because seperated by ;
        features_inside = list(set(GO_terms).intersection(features))# get only features in the GO_terms that we need
        data_all.loc[i, features_inside] = 1

In [121]:
data_all.shape

(8989, 194)

In [122]:
data_all

Unnamed: 0,Class,Gene,ID,Variation,magnesium ion binding [GO:0000287],protein tyrosine kinase activity [GO:0004713],enzyme binding [GO:0019899],ATP binding [GO:0005524],kinase activity [GO:0016301],transcription corepressor activity [GO:0003714],...,epidermal growth factor receptor binding [GO:0005154],phospholipase binding [GO:0043274],patched binding [GO:0005113],integrin binding [GO:0005178],hepatocyte growth factor-activated receptor activity [GO:0005008],transcription cofactor binding [GO:0001221],core promoter sequence-specific DNA binding [GO:0001046],platelet-derived growth factor binding [GO:0048407],"phosphatidylinositol-3,4-bisphosphate binding [GO:0043325]",interleukin-12 receptor binding [GO:0005143]
0,1.0,FAM58A,0,Truncating Mutations,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,CBL,1,W802*,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,2.0,CBL,2,Q249E,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,3.0,CBL,3,N454D,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,4.0,CBL,4,L399V,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,4.0,CBL,5,V391I,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6,5.0,CBL,6,V430M,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,1.0,CBL,7,Deletion,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8,4.0,CBL,8,Y371H,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,4.0,CBL,9,C384R,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [112]:
# Save the 190 features into one csv file in case we will use it again
data_all.to_csv("all_molecular_functions.csv",index=False)

In [129]:
# Do an SVD on the molecular functions to get a reduction to 25 features
svd = TruncatedSVD(n_components=60, n_iter=20, random_state=18)
feature_columns = data_all.iloc[:,4:] #starting from the 4th column we have our features
truncated_molecular = pd.DataFrame(svd.fit_transform(feature_columns.values))


In [130]:
# add truncated molecular functions to our data 
data_new = pd.concat((train, test), axis=0, ignore_index=True)
data_SVD = pd.concat((data_new, truncated_molecular), axis = 1)
data_SVD

Unnamed: 0,Class,Gene,ID,Variation,0,1,2,3,4,5,...,50,51,52,53,54,55,56,57,58,59
0,1.0,FAM58A,0,Truncating Mutations,0.007349,0.014057,0.000510,-0.031896,0.005534,-0.031307,...,-0.019335,0.096690,-0.008022,0.042252,-0.044389,-0.006457,0.044739,-0.007923,-0.015082,0.110292
1,2.0,CBL,1,W802*,0.370714,-0.161395,0.143519,0.047481,0.022149,0.348765,...,-0.002525,-0.064886,-0.123281,0.100598,0.320971,-0.081573,0.172307,-0.169171,-0.113876,-0.074413
2,2.0,CBL,2,Q249E,0.370714,-0.161395,0.143519,0.047481,0.022149,0.348765,...,-0.002525,-0.064886,-0.123281,0.100598,0.320971,-0.081573,0.172307,-0.169171,-0.113876,-0.074413
3,3.0,CBL,3,N454D,0.370714,-0.161395,0.143519,0.047481,0.022149,0.348765,...,-0.002525,-0.064886,-0.123281,0.100598,0.320971,-0.081573,0.172307,-0.169171,-0.113876,-0.074413
4,4.0,CBL,4,L399V,0.370714,-0.161395,0.143519,0.047481,0.022149,0.348765,...,-0.002525,-0.064886,-0.123281,0.100598,0.320971,-0.081573,0.172307,-0.169171,-0.113876,-0.074413
5,4.0,CBL,5,V391I,0.370714,-0.161395,0.143519,0.047481,0.022149,0.348765,...,-0.002525,-0.064886,-0.123281,0.100598,0.320971,-0.081573,0.172307,-0.169171,-0.113876,-0.074413
6,5.0,CBL,6,V430M,0.370714,-0.161395,0.143519,0.047481,0.022149,0.348765,...,-0.002525,-0.064886,-0.123281,0.100598,0.320971,-0.081573,0.172307,-0.169171,-0.113876,-0.074413
7,1.0,CBL,7,Deletion,0.370714,-0.161395,0.143519,0.047481,0.022149,0.348765,...,-0.002525,-0.064886,-0.123281,0.100598,0.320971,-0.081573,0.172307,-0.169171,-0.113876,-0.074413
8,4.0,CBL,8,Y371H,0.370714,-0.161395,0.143519,0.047481,0.022149,0.348765,...,-0.002525,-0.064886,-0.123281,0.100598,0.320971,-0.081573,0.172307,-0.169171,-0.113876,-0.074413
9,4.0,CBL,9,C384R,0.370714,-0.161395,0.143519,0.047481,0.022149,0.348765,...,-0.002525,-0.064886,-0.123281,0.100598,0.320971,-0.081573,0.172307,-0.169171,-0.113876,-0.074413


In [128]:
print(svd.explained_variance_ratio_.sum())

0.886322626557


In [133]:
# Save the 60 svd's features into one file 
data_all.to_csv("molecular_bases/svd60_molecular_functions.csv",index=False)

In [32]:
genes_train = set(train.Gene)
len(genes_train)

264

In [67]:
#### Add the dummy variables for gene to the data
# first initialize with the 264 different genes
for gene in genes_train:
    data_SVD[gene] = 0

In [68]:
for i in data_SVD.index:
    gene = data_SVD.Gene[i]
    if gene in genes_train:
        data_SVD.loc[i, gene] = 1 

In [75]:
# Do an SVD on the molecular functions to get a reduction to 25 features
feature_columns = data_SVD.iloc[:,29:] #starting from the 4th column we have our features
feature_columns
truncated_molecular = pd.DataFrame(svd.fit_transform(feature_columns.values))
svd.

ValueError: n_components must be < n_features; got 25 >= 0

In [74]:
# add truncated molecular functions to our data 
data_new = pd.concat((train, test), axis=0, ignore_index=True)
data_SVD = pd.concat((data_new, truncated_molecular), axis = 1)
data_SVD

Unnamed: 0,Class,Gene,ID,Variation,0,1,2,3,4,5,...,15,16,17,18,19,20,21,22,23,24
0,1.0,FAM58A,0,Truncating Mutations,-1.161481e-23,-3.698891e-19,9.225550e-20,-5.680264e-20,2.335886e-21,3.214391e-22,...,1.034778e-25,1.829459e-24,4.086044e-24,-1.926360e-24,-1.971919e-24,1.443782e-25,-1.019774e-22,5.252388e-23,3.190730e-22,-7.535701e-22
1,2.0,CBL,1,W802*,-2.701430e-20,-6.268249e-16,8.218057e-16,1.716798e-15,9.189731e-16,1.639306e-14,...,3.684259e-08,-8.060685e-07,9.679444e-08,3.063973e-06,1.935051e-06,3.781824e-06,1.559796e-05,-3.240568e-04,8.344012e-05,-2.725758e-04
2,2.0,CBL,2,Q249E,-7.113213e-20,5.214321e-16,-1.857296e-14,1.550278e-14,7.155532e-15,1.639546e-14,...,3.684259e-08,-8.060685e-07,9.679444e-08,3.063973e-06,1.935051e-06,3.781824e-06,1.559796e-05,-3.240568e-04,8.344012e-05,-2.725758e-04
3,3.0,CBL,3,N454D,2.597770e-20,-1.683205e-15,2.054382e-14,-1.677792e-14,1.310019e-14,1.668098e-14,...,3.684259e-08,-8.060685e-07,9.679444e-08,3.063973e-06,1.935051e-06,3.781824e-06,1.559796e-05,-3.240568e-04,8.344012e-05,-2.725758e-04
4,4.0,CBL,4,L399V,-2.913790e-20,-5.029867e-16,-3.304991e-16,1.260797e-15,-1.293249e-15,1.597275e-14,...,3.684259e-08,-8.060685e-07,9.679444e-08,3.063973e-06,1.935051e-06,3.781824e-06,1.559796e-05,-3.240568e-04,8.344012e-05,-2.725758e-04
5,4.0,CBL,5,V391I,-2.809506e-20,-5.249856e-16,5.780590e-17,9.225718e-16,-9.752923e-16,1.656248e-14,...,3.684259e-08,-8.060685e-07,9.679444e-08,3.063973e-06,1.935051e-06,3.781824e-06,1.559796e-05,-3.240568e-04,8.344012e-05,-2.725758e-04
6,5.0,CBL,6,V430M,-2.808945e-20,-5.251659e-16,6.103512e-17,9.201825e-16,-9.789602e-16,1.636419e-14,...,3.684259e-08,-8.060685e-07,9.679444e-08,3.063973e-06,1.935051e-06,3.781824e-06,1.559796e-05,-3.240568e-04,8.344012e-05,-2.725758e-04
7,1.0,CBL,7,Deletion,-2.808909e-20,-5.251614e-16,6.096250e-17,9.199872e-16,-9.775340e-16,1.639267e-14,...,3.684259e-08,-8.060685e-07,9.679444e-08,3.063973e-06,1.935051e-06,3.781824e-06,1.559796e-05,-3.240568e-04,8.344012e-05,-2.725758e-04
8,4.0,CBL,8,Y371H,-2.808910e-20,-5.251615e-16,6.096299e-17,9.199790e-16,-9.775404e-16,1.639356e-14,...,3.684259e-08,-8.060685e-07,9.679444e-08,3.063973e-06,1.935051e-06,3.781824e-06,1.559796e-05,-3.240568e-04,8.344012e-05,-2.725758e-04
9,4.0,CBL,9,C384R,-2.808910e-20,-5.251615e-16,6.096308e-17,9.199773e-16,-9.775422e-16,1.639365e-14,...,3.684259e-08,-8.060685e-07,9.679444e-08,3.063973e-06,1.935051e-06,3.781824e-06,1.559796e-05,-3.240568e-04,8.344012e-05,-2.725758e-04
