# Get the uniprot GO terms of molecular function ready for modelling

In [1]:
# Gene Ontology can be found here: http://geneontology.org/page/ontology-documentation
import numpy as np
import pandas as pd
import string
import os
from collections import Counter
from collections import defaultdict

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD

import re
from bioservices import *
import collections
%pylab inline --no-import-all

Populating the interactive namespace from numpy and matplotlib


In [3]:
new_test=pd.read_csv('..//..//bases/new_test_variants.csv')
new_test_texts = pd.read_csv('..//..//bases/new_test_text.csv', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")
new_test_final=pd.merge(new_test,new_test_texts,how="left",on="ID")

In [5]:
leaks=pd.read_csv('..//..//bases/s1_add_train.csv')
leaks_1=pd.DataFrame([leaks["ID"],leaks.drop("ID",axis=1).idxmax(axis=1).map(lambda x: x.lstrip('class'))])
leaks_2=leaks_1.T
leaks_2.columns=["ID","Class"]

In [6]:
train = pd.read_csv('..//..//bases/training_variants')
test = pd.read_csv('..//..//bases/test_variants')

In [7]:
train_texts = pd.read_csv('..//..//bases/training_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")
test_texts = pd.read_csv('..//..//bases/test_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")

In [8]:
train = pd.merge(train, train_texts, how='left', on='ID')
test = pd.merge(test, test_texts, how='left', on='ID')

In [9]:
leaks_3=pd.merge(leaks_2,test[test.ID.isin(leaks_2.ID)])
leaks_final=pd.merge(leaks_3,test_texts[test_texts.ID.isin(leaks_3.ID)])

In [10]:
train_all = pd.concat([train,leaks_final]) #adding first stage
train_all

Unnamed: 0,Class,Gene,ID,Text,Variation
0,1,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations
1,2,CBL,1,Abstract Background Non-small cell lung canc...,W802*
2,2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E
3,3,CBL,3,Recent evidence has demonstrated that acquired...,N454D
4,4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V
5,4,CBL,5,Oncogenic mutations in the monomeric Casitas B...,V391I
6,5,CBL,6,Oncogenic mutations in the monomeric Casitas B...,V430M
7,1,CBL,7,CBL is a negative regulator of activated recep...,Deletion
8,4,CBL,8,Abstract Juvenile myelomonocytic leukemia (JM...,Y371H
9,4,CBL,9,Abstract Juvenile myelomonocytic leukemia (JM...,C384R


In [11]:
merge_match = new_test.merge(train_all, left_on=['Gene', 'Variation'], right_on = ['Gene', 'Variation'])
Index_leak = merge_match.ID_x - 1
new_test_index = [item for item in new_test_final.index if item not in list(Index_leak)]
test_no_leaks = new_test_final.iloc[new_test_index]
test_no_leaks

Unnamed: 0,ID,Gene,Variation,Text
2,3,WNT4,E216G,Mycosis fungoides and Sézary syndrome are prim...
3,4,SUCLA2,G118R,Regulated progression through the cell cycle ...
5,6,CHEK2,E239K,The nuclei that laboratories solution p53 KIT ...
6,7,CHST3,T141M,Myeloid differentiation 88 (MyD88) is the key ...
7,8,RNF6,G244D,Human ESCCs 2 occur frequently worldwide (1) ....
8,9,SPAST,C448Y,large were of activity growth this product tol...
10,11,SCN4A,V445M,Endometrial carcinoma is the most common gynec...
14,15,ERBB2,G746S,The protein-kinase family is the most frequent...
15,16,TP53,Y234S,Among the best-studied therapeutic targets in ...
16,17,RAB27A,A87P,"Introduction In recent years, a better unders..."


In [12]:
train_all['Substitutions_var'] = train_all.Variation.apply(lambda x: bool(re.search('^[A-Z]\\d+[A-Z*]$', x))*1)
new_train = train_all[train_all['Substitutions_var']==1]

In [66]:
data_all = pd.concat((new_train, test_no_leaks), axis=0, ignore_index=True)
data_all = data_all[['Class', 'Gene', 'ID', 'Variation', 'Text']] # just reordering
data_all_backup = data_all[:] ## We keep backup in case we need to use again
data_all

Unnamed: 0,Class,Gene,ID,Variation,Text
0,2,CBL,1,W802*,Abstract Background Non-small cell lung canc...
1,2,CBL,2,Q249E,Abstract Background Non-small cell lung canc...
2,3,CBL,3,N454D,Recent evidence has demonstrated that acquired...
3,4,CBL,4,L399V,Oncogenic mutations in the monomeric Casitas B...
4,4,CBL,5,V391I,Oncogenic mutations in the monomeric Casitas B...
5,5,CBL,6,V430M,Oncogenic mutations in the monomeric Casitas B...
6,4,CBL,8,Y371H,Abstract Juvenile myelomonocytic leukemia (JM...
7,4,CBL,9,C384R,Abstract Juvenile myelomonocytic leukemia (JM...
8,4,CBL,10,P395A,Oncogenic mutations in the monomeric Casitas B...
9,4,CBL,11,K382E,Noonan syndrome is an autosomal dominant conge...


In [67]:
all_genes = set(data_all.Gene)
print(len(all_genes))
print(all_genes)

319
{'CDH1', 'BTK', 'KERA', 'ESR1', 'BRCA2', 'BCS1L', 'HRAS', 'SLC33A1', 'PDGFRA', 'MPDU1', 'ABCB11', 'NKX2-1', 'PIK3CA', 'TSC2', 'MYD88', 'TSHR', 'LRP6', 'MYOT', 'SLC6A5', 'BAG3', 'ATP2C1', 'PER2', 'MYCN', 'CASP8', 'RUNX1', 'GCM2', 'SF3B1', 'ERBB3', 'AKT2', 'SMAD2', 'SDHC', 'EP300', 'LRP5', 'BFSP2', 'GJB3', 'KCNE2', 'RASA1', 'SLC25A13', 'RECQL4', 'XPO1', 'TSC1', 'PDE8B', 'DYNC2H1', 'RPS26', 'KCNJ13', 'VHL', 'RPS19', 'HABP2', 'FGFR4', 'PIM1', 'RET', 'BRCA1', 'SLC22A5', 'RAD54L', 'NTRK1', 'SPAST', 'RB1', 'SLC19A2', 'GALK1', 'CIC', 'PHOX2B', 'MLH1', 'SDHB', 'SLC27A4', 'ITM2B', 'CARM1', 'AURKC', 'NDUFS6', 'PDGFRB', 'MTOR', 'SLC25A15', 'IDH1', 'CILP', 'HIST1H1C', 'PIK3R1', 'KCNQ4', 'NTRK3', 'MED12', 'CST3', 'DNM1L', 'WHSC1', 'NFKBIA', 'RXRA', 'JAK1', 'ZFPM2', 'ROS1', 'CREBBP', 'ABCC6', 'PLA2G6', 'EPHB2', 'GRM6', 'SLC7A9', 'FANCA', 'SMAD3', 'PTCH1', 'SMO', 'MCC', 'CRLF1', 'KLF4', 'APOL1', 'TMPRSS2', 'CTNNB1', 'EPHA2', 'TTK', 'TP53', 'MOCS2', 'POLE', 'WNT4', 'ERCC2', 'MPL', 'SCN4A', 'NEK8', 

In [68]:
u = UniProt()

In [69]:
u.debugLevel = "INFO"
u.timeout = 100   # some queries are long and requires much more time; default is 1000 seconds

In [70]:
gene_entry_dict = {}
class_dict = {}
for gene in all_genes:
    keyword = 'gene:%s+AND+organism:9606' %gene #to query database, with gene and organism 9606 is Homo Sapien (human)
    entry_name_tab = u.search(keyword, frmt='tab', limit=1, columns="entry name") 
    entry_name = [s.strip() for s in entry_name_tab.splitlines()][1] # gets the entry name = in second position in list
    gene_entry_dict[gene] = entry_name

In [71]:
gene_entries = list(gene_entry_dict.values())
len(gene_entries)

319

In [72]:
df = u.get_df(gene_entries) # searches in uniprot -> gets results back 
df

INFO:root:fetching information from uniprot for 318 entries
INFO:root:uniprot.get_df 1/3
INFO:root:uniprot.get_df 2/3
INFO:root:uniprot.get_df 3/3
INFO:root:uniprot.get_df 4/3


Unnamed: 0,Entry,Entry name,Gene names,Gene names (primary ),Gene names (synonym ),Gene names (ordered locus ),Gene names (ORF ),Organism,Organism ID,Protein names,...,Miscellaneous [CC],Keywords,Protein existence,Status,Sequence annotation (Features),Protein families,Version,Comments,Cross-reference (null),Pathway.1
0,P04637,P53_HUMAN,[TP53 P53],TP53,P53,,,Homo sapiens (Human),9606,Cellular tumor antigen p53 (Antigen NY-CO-13) ...,...,,"[3D-structure, Acetylation, Activator, Alterna...",Evidence at protein level,reviewed,,[P53 family],258,"[Alternative products (1), Caution (5), Cofact...",,
1,P36897,TGFR1_HUMAN,[TGFBR1 ALK5 SKR4],TGFBR1,ALK5 SKR4,,,Homo sapiens (Human),9606,TGF-beta receptor type-1 (TGFR-1) (EC 2.7.11.3...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, TKL Ser/Thr prote...",204,"[Alternative products (1), Catalytic activity ...",,
2,P46531,NOTC1_HUMAN,[NOTCH1 TAN1],NOTCH1,TAN1,,,Homo sapiens (Human),9606,Neurogenic locus notch homolog protein 1 (Notc...,...,,"[3D-structure, ANK repeat, Activator, Angiogen...",Evidence at protein level,reviewed,,[NOTCH family],212,"[Function (1), Involvement in disease (2), Pos...",,
3,O00429,DNM1L_HUMAN,[DNM1L DLP1 DRP1],DNM1L,DLP1 DRP1,,,Homo sapiens (Human),9606,Dynamin-1-like protein (EC 3.6.5.5) (Dnm1p/Vps...,...,,"[3D-structure, Acetylation, Alternative splici...",Evidence at protein level,reviewed,,"[TRAFAC class dynamin-like GTPase superfamily,...",161,"[Alternative products (1), Catalytic activity ...",,
4,P42336,PK3CA_HUMAN,[PIK3CA],PIK3CA,,,,Homo sapiens (Human),9606,"Phosphatidylinositol 4,5-bisphosphate 3-kinase...",...,MISCELLANEOUS: The avian sarcoma virus 16 geno...,"[3D-structure, ATP-binding, Angiogenesis, Comp...",Evidence at protein level,reviewed,,[PI3/PI4-kinase family],190,"[Catalytic activity (2), Domain (1), Function ...",,
5,P38936,CDN1A_HUMAN,[CDKN1A CAP20 CDKN1 CIP1 MDA6 PIC1 SDI1 WAF1],CDKN1A,CAP20 CDKN1 CIP1 MDA6 PIC1 SDI1 WAF1,,,Homo sapiens (Human),9606,Cyclin-dependent kinase inhibitor 1 (CDK-inter...,...,,"[3D-structure, Acetylation, Cell cycle, Comple...",Evidence at protein level,reviewed,,[CDI family],200,"[Domain (2), Function (1), Induction (1), Post...",,
6,P84022,SMAD3_HUMAN,[SMAD3 MADH3],SMAD3,MADH3,,,Homo sapiens (Human),9606,Mothers against decapentaplegic homolog 3 (MAD...,...,,"[3D-structure, ADP-ribosylation, Acetylation, ...",Evidence at protein level,reviewed,,[Dwarfin/SMAD family],164,"[Alternative products (1), Caution (2), Domain...",,
7,Q15831,STK11_HUMAN,[STK11 LKB1 PJS],STK11,LKB1 PJS,,,Homo sapiens (Human),9606,Serine/threonine-protein kinase STK11 (EC 2.7....,...,,"[3D-structure, ATP-binding, Acetylation, Alter...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, CAMK Ser/Thr prot...",194,"[Alternative products (1), Catalytic activity ...",,
8,Q15303,ERBB4_HUMAN,[ERBB4 HER4],ERBB4,HER4,,,Homo sapiens (Human),9606,Receptor tyrosine-protein kinase erbB-4 (EC 2....,...,,"[3D-structure, ATP-binding, Activator, Alterna...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Tyr protein kinas...",198,"[Alternative products (1), Catalytic activity ...",,
9,P51587,BRCA2_HUMAN,[BRCA2 FACD FANCD1],BRCA2,FACD FANCD1,,,Homo sapiens (Human),9606,Breast cancer type 2 susceptibility protein (F...,...,,"[3D-structure, Cell cycle, Complete proteome, ...",Evidence at protein level,reviewed,,[],198,"[Function (1), Involvement in disease (5), Pos...",,


In [73]:
df['Gene ontology (GO)'][0]

['cytoplasm [GO:0005737]',
 'cytosol [GO:0005829]',
 'endoplasmic reticulum [GO:0005783]',
 'intracellular [GO:0005622]',
 'mitochondrial matrix [GO:0005759]',
 'mitochondrion [GO:0005739]',
 'nuclear chromatin [GO:0000790]',
 'nuclear matrix [GO:0016363]',
 'nucleolus [GO:0005730]',
 'nucleoplasm [GO:0005654]',
 'nucleus [GO:0005634]',
 'PML body [GO:0016605]',
 'protein complex [GO:0043234]',
 'replication fork [GO:0005657]',
 'ATP binding [GO:0005524]',
 'chaperone binding [GO:0051087]',
 'chromatin binding [GO:0003682]',
 'copper ion binding [GO:0005507]',
 'core promoter sequence-specific DNA binding [GO:0001046]',
 'damaged DNA binding [GO:0003684]',
 'disordered domain specific binding [GO:0097718]',
 'DNA binding [GO:0003677]',
 'enzyme binding [GO:0019899]',
 'histone acetyltransferase binding [GO:0035035]',
 'histone deacetylase binding [GO:0042826]',
 'identical protein binding [GO:0042802]',
 "mRNA 3'-UTR binding [GO:0003730]",
 'p53 binding [GO:0002039]',
 'protease bindin

In [74]:
df_new = df[df['Gene ontology (GO)'].notnull()] # don't consider genes with no molecular function

In [75]:
GO_terms_dict = dict(zip(df_new['Entry name'], df_new['Gene ontology (GO)']))

In [76]:
GO_terms_dict

{'2AAA_HUMAN': ['chromosome, centromeric region [GO:0000775]',
  'cytosol [GO:0005829]',
  'dendrite [GO:0030425]',
  'extracellular exosome [GO:0070062]',
  'lateral plasma membrane [GO:0016328]',
  'membrane [GO:0016020]',
  'microtubule cytoskeleton [GO:0015630]',
  'mitochondrion [GO:0005739]',
  'nucleus [GO:0005634]',
  'protein phosphatase type 2A complex [GO:0000159]',
  'antigen binding [GO:0003823]',
  'protein heterodimerization activity [GO:0046982]',
  'protein phosphatase regulator activity [GO:0019888]',
  'protein serine/threonine phosphatase activity [GO:0004722]',
  'apoptotic process [GO:0006915]',
  'ceramide metabolic process [GO:0006672]',
  'chromosome segregation [GO:0007059]',
  'ciliary basal body-plasma membrane docking [GO:0097711]',
  'female meiotic nuclear division [GO:0007143]',
  'G2/M transition of mitotic cell cycle [GO:0000086]',
  'inactivation of MAPK activity [GO:0000188]',
  'meiotic sister chromatid cohesion, centromeric [GO:0051754]',
  'meioti

In [77]:
# Find most common GO terms to use as features
def flatten(l): # taken from https://stackoverflow.com/questions/33900770/most-frequent-values-in-a-dictionary
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, str): #replaced basestring with str for Python3
            for sub in flatten(el):
                yield sub
        else:
            yield el


In [78]:
All_GO_terms = list(set(flatten(GO_terms_dict.values())))
len(All_GO_terms)

4200

In [94]:
features = pd.read_csv('all_GO_terms.csv')

In [95]:
features = features.values

In [96]:
features = [item for sublist in features for item in sublist]

In [97]:
len(features)

3327

In [98]:
# initialize data with the features 
for feature in features:
    data_all[feature] = 0

In [99]:
# add 1 if the GO term is inside the gene_entry_dict for a particular gene
for i in data_all.index:
    gene = data_all.Gene[i]
    gene_entry = gene_entry_dict[gene]
    if gene_entry in GO_terms_dict:
        GO_terms = GO_terms_dict[gene_entry]
        features_inside = list(set(GO_terms).intersection(features))# get only features in the GO_terms that we need
        data_all.loc[i, features_inside] = 1

In [100]:
data_all.shape

(3544, 3333)

In [101]:
data_all

Unnamed: 0,Class,Gene,ID,Variation,Text,0,platelet-derived growth factor binding [GO:0048407],negative regulation of platelet activation [GO:0010544],retinal rod cell apoptotic process [GO:0097473],central nervous system morphogenesis [GO:0021551],...,male meiotic nuclear division [GO:0007140],regulation of intracellular estrogen receptor signaling pathway [GO:0033146],circadian rhythm [GO:0007623],protein polyubiquitination [GO:0000209],integral component of plasma membrane [GO:0005887],embryonic hindlimb morphogenesis [GO:0035116],peroxisome [GO:0005777],positive regulation of binding [GO:0051099],positive regulation of protein localization to plasma membrane [GO:1903078],positive regulation of skeletal muscle tissue development [GO:0048643]
0,2,CBL,1,W802*,Abstract Background Non-small cell lung canc...,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,2,CBL,2,Q249E,Abstract Background Non-small cell lung canc...,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,3,CBL,3,N454D,Recent evidence has demonstrated that acquired...,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,4,CBL,4,L399V,Oncogenic mutations in the monomeric Casitas B...,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,4,CBL,5,V391I,Oncogenic mutations in the monomeric Casitas B...,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,5,CBL,6,V430M,Oncogenic mutations in the monomeric Casitas B...,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
6,4,CBL,8,Y371H,Abstract Juvenile myelomonocytic leukemia (JM...,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7,4,CBL,9,C384R,Abstract Juvenile myelomonocytic leukemia (JM...,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
8,4,CBL,10,P395A,Oncogenic mutations in the monomeric Casitas B...,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
9,4,CBL,11,K382E,Noonan syndrome is an autosomal dominant conge...,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [30]:
# Save the 190 features into one csv file in case we will use it again
data_all.to_csv("ALL_GO_FUNCTIONS.csv",index=False)

In [40]:
# Do an SVD on the molecular functions to get a reduction to 25 features
svd = TruncatedSVD(n_components=25, n_iter=20, random_state=18)
feature_columns = data_all.iloc[:,4:] #starting from the 4th column we have our features
truncated_molecular = pd.DataFrame(svd.fit_transform(feature_columns.values))


In [41]:
# add truncated molecular functions to our data 
data_new = pd.concat((train, test), axis=0, ignore_index=True)
data_SVD = pd.concat((data_new, truncated_molecular), axis = 1)
data_SVD

Unnamed: 0,Class,Gene,ID,Variation,0,1,2,3,4,5,...,15,16,17,18,19,20,21,22,23,24
0,1.0,FAM58A,0,Truncating Mutations,0.007211,0.013929,-0.001500,-0.044340,0.018561,0.007570,...,-0.004674,-0.105525,-0.027981,-0.037623,-0.035569,0.018190,0.119998,-0.094944,0.029298,0.041407
1,2.0,CBL,1,W802*,0.347313,-0.122612,0.187927,0.065631,0.051512,0.393504,...,-0.772562,-0.114651,-0.175823,-0.341789,-0.242575,1.053799,0.034895,0.603474,1.267056,-0.939694
2,2.0,CBL,2,Q249E,0.347313,-0.122612,0.187927,0.065631,0.051512,0.393504,...,-0.772562,-0.114651,-0.175823,-0.341789,-0.242575,1.053799,0.034895,0.603474,1.267056,-0.939694
3,3.0,CBL,3,N454D,0.347313,-0.122612,0.187927,0.065631,0.051512,0.393504,...,-0.772562,-0.114651,-0.175823,-0.341789,-0.242575,1.053799,0.034895,0.603474,1.267056,-0.939694
4,4.0,CBL,4,L399V,0.347313,-0.122612,0.187927,0.065631,0.051512,0.393504,...,-0.772562,-0.114651,-0.175823,-0.341789,-0.242575,1.053799,0.034895,0.603474,1.267056,-0.939694
5,4.0,CBL,5,V391I,0.347313,-0.122612,0.187927,0.065631,0.051512,0.393504,...,-0.772562,-0.114651,-0.175823,-0.341789,-0.242575,1.053799,0.034895,0.603474,1.267056,-0.939694
6,5.0,CBL,6,V430M,0.347313,-0.122612,0.187927,0.065631,0.051512,0.393504,...,-0.772562,-0.114651,-0.175823,-0.341789,-0.242575,1.053799,0.034895,0.603474,1.267056,-0.939694
7,1.0,CBL,7,Deletion,0.347313,-0.122612,0.187927,0.065631,0.051512,0.393504,...,-0.772562,-0.114651,-0.175823,-0.341789,-0.242575,1.053799,0.034895,0.603474,1.267056,-0.939694
8,4.0,CBL,8,Y371H,0.347313,-0.122612,0.187927,0.065631,0.051512,0.393504,...,-0.772562,-0.114651,-0.175823,-0.341789,-0.242575,1.053799,0.034895,0.603474,1.267056,-0.939694
9,4.0,CBL,9,C384R,0.347313,-0.122612,0.187927,0.065631,0.051512,0.393504,...,-0.772562,-0.114651,-0.175823,-0.341789,-0.242575,1.053799,0.034895,0.603474,1.267056,-0.939694


In [44]:
print(svd.explained_variance_ratio_.sum())

0.816023021311


In [57]:
new_names = [] 
for i in range(25):
    new_names.append('molecular_SVD_'+str(i+1))

data_SVD.columns = data_SVD.columns[:4].tolist() + new_names

In [46]:
# Save the 25 svd's features into one file 
data_SVD.to_csv("molecular_bases/svd25_molecular_functions.csv",index=False)