# Get the uniprot bases ready for modelling


In [1]:
import numpy as np
import pandas as pd
import string
import os
from collections import Counter
from collections import defaultdict

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re
from bioservices import *
import collections
%pylab inline --no-import-all

Populating the interactive namespace from numpy and matplotlib


In [2]:
train = pd.read_csv('..//bases/training_variants')
test = pd.read_csv('..//bases/test_variants')

In [74]:
data_all = pd.concat((train, test), axis=0, ignore_index=True)

In [4]:
all_genes = set(data_all.Gene)
print(len(all_genes))
print(all_genes)

1507


{'SMN1', 'IDS', 'F7', 'AQP7', 'CNGB1', 'PDGFRB', 'IFT122', 'BARD1', 'CLCN5', 'PPP2R1B', 'PDE8B', 'CDKL5', 'PAX3', 'PYGM', 'FGF9', 'MCFD2', 'SETD2', 'XK', 'SLC25A12', 'PLOD3', 'PEX5', 'ADA', 'BUB1B', 'ABCC6', 'PHOX2B', 'RPS6KA3', 'KCNJ5', 'NOTCH2', 'NHP2', 'PITX1', 'JAK1', 'NLRP3', 'PEX12', 'TDP1', 'C2orf71', 'ZNF41', 'NDUFAF4', 'SNRNP200', 'PDSS1', 'GK', 'FBXW7', 'PAH', 'TREX1', 'SMARCA4', 'WAS', 'KRT74', 'NPHS1', 'MTRR', 'DLAT', 'YARS2', 'BLM', 'MAN2B1', 'HSD3B7', 'LRP6', 'CDKN1B', 'WHSC1', 'KMT2A', 'SCN1B', 'SDHB', 'LIPI', 'ETV6', 'CYB5R3', 'NDUFS2', 'NAGS', 'NOD2', 'AAAS', 'APOA5', 'IHH', 'BMP15', 'HFE', 'BMPR2', 'EFHC1', 'LPIN2', 'EFEMP1', 'DCTN1', 'CRELD1', 'CDK12', 'NKX2-5', 'LYST', 'SLC1A3', 'GLI1', 'SEMA3E', 'GALNT12', 'CPN1', 'CDK8', 'IDH1', 'BCS1L', 'KRT12', 'CARM1', 'PMS2', 'UROD', 'MGAT2', 'SCN9A', 'UBIAD1', 'AMT', 'PLA2G7', 'PNKD', 'CTH', 'ZMPSTE24', 'MINPP1', 'PEPD', 'OXCT1', 'ARL13B', 'SIAE', 'DLL3', 'ATG16L1', 'USH2A', 'VIM', 'TMEM43', 'NUP62', 'PLP1', 'SLC40A1', 'NODAL

In [5]:
u = UniProt()

In [6]:
u.debugLevel = "INFO"
u.timeout = 100   # some queries are long and requires much more time; default is 1000 seconds

In [7]:
gene_entry_dict = {}
class_dict = {}
for gene in all_genes:
    keyword = 'gene:%s+AND+organism:9606' %gene #to query database, with gene and organism 9606 is Homo Sapien (human)
    entry_name_tab = u.search(keyword, frmt='tab', limit=1, columns="entry name") 
    entry_name = [s.strip() for s in entry_name_tab.splitlines()][1] # gets the entry name = in second position in list
    gene_entry_dict[gene] = entry_name

In [8]:
gene_entries = list(gene_entry_dict.values())
len(gene_entries)

1507

In [9]:
df = u.get_df(gene_entries) # searches in uniprot -> gets results back 
df

INFO:root:fetching information from uniprot for 1499 entries
INFO:root:uniprot.get_df 1/14
INFO:root:uniprot.get_df 2/14
INFO:root:uniprot.get_df 3/14
INFO:root:uniprot.get_df 4/14
INFO:root:uniprot.get_df 5/14
INFO:root:uniprot.get_df 6/14
INFO:root:uniprot.get_df 7/14
INFO:root:uniprot.get_df 8/14
INFO:root:uniprot.get_df 9/14
INFO:root:uniprot.get_df 10/14
INFO:root:uniprot.get_df 11/14
INFO:root:uniprot.get_df 12/14
INFO:root:uniprot.get_df 13/14
INFO:root:uniprot.get_df 14/14
INFO:root:uniprot.get_df 15/14


Unnamed: 0,Entry,Entry name,Gene names,Gene names (primary ),Gene names (synonym ),Gene names (ordered locus ),Gene names (ORF ),Organism,Organism ID,Protein names,...,Miscellaneous [CC],Keywords,Protein existence,Status,Sequence annotation (Features),Protein families,Version,Comments,Cross-reference (null),Pathway.1
0,Q5S007,LRRK2_HUMAN,[LRRK2 PARK8],LRRK2,PARK8,,,Homo sapiens (Human),9606,Leucine-rich repeat serine/threonine-protein k...,...,,"[3D-structure, ATP-binding, Autophagy, Cell ju...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, TKL Ser/Thr prote...",147,"[Catalytic activity (1), Domain (2), Function ...",,
1,P08034,CXB1_HUMAN,[GJB1 CX32],GJB1,CX32,,,Homo sapiens (Human),9606,Gap junction beta-1 protein (Connexin-32) (Cx3...,...,,"[3D-structure, Cell junction, Cell membrane, C...",Evidence at protein level,reviewed,,"[Connexin family, Beta-type (group I) subfamily]",188,"[Function (1), Involvement in disease (2), Seq...",,
2,Q9H444,CHM4B_HUMAN,[CHMP4B C20orf178 SHAX1],CHMP4B,C20orf178 SHAX1,,,Homo sapiens (Human),9606,Charged multivesicular body protein 4b (Chroma...,...,MISCELLANEOUS: Its overexpression strongly inh...,"[3D-structure, Acetylation, Cataract, Coiled c...",Evidence at protein level,reviewed,,[SNF7 family],144,"[Domain (1), Function (2), Involvement in dise...",,
3,Q9P2D1,CHD7_HUMAN,[CHD7 KIAA1416],CHD7,KIAA1416,,,Homo sapiens (Human),9606,Chromodomain-helicase-DNA-binding protein 7 (C...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,[SNF2/RAD54 helicase family],164,"[Alternative products (1), Catalytic activity ...",,
4,P40692,MLH1_HUMAN,[MLH1 COCA2],MLH1,COCA2,,,Homo sapiens (Human),9606,DNA mismatch repair protein Mlh1 (MutL protein...,...,,"[3D-structure, ATP-binding, Acetylation, Alter...",Evidence at protein level,reviewed,,[DNA mismatch repair MutL/HexB family],200,"[Alternative products (1), Caution (1), Functi...",,
5,Q9Y6K1,DNM3A_HUMAN,[DNMT3A],DNMT3A,,,,Homo sapiens (Human),9606,DNA (cytosine-5)-methyltransferase 3A (Dnmt3a)...,...,,"[3D-structure, Alternative promoter usage, Alt...",Evidence at protein level,reviewed,,[Class I-like SAM-binding methyltransferase su...,160,"[Alternative products (1), Catalytic activity ...",,
6,Q9UBC3,DNM3B_HUMAN,[DNMT3B],DNMT3B,,,,Homo sapiens (Human),9606,DNA (cytosine-5)-methyltransferase 3B (Dnmt3b)...,...,,"[3D-structure, Activator, Alternative splicing...",Evidence at protein level,reviewed,,[Class I-like SAM-binding methyltransferase su...,177,"[Alternative products (1), Catalytic activity ...",,
7,P51159,RB27A_HUMAN,[RAB27A RAB27],RAB27A,RAB27,,,Homo sapiens (Human),9606,Ras-related protein Rab-27A (Rab-27) (GTP-bind...,...,,"[Acetylation, Alternative splicing, Complete p...",Evidence at protein level,reviewed,,"[Small GTPase superfamily, Rab family]",180,"[Alternative products (1), Caution (1), Functi...",,
8,O15537,XLRS1_HUMAN,[RS1 XLRS1],RS1,XLRS1,,,Homo sapiens (Human),9606,Retinoschisin (X-linked juvenile retinoschisis...,...,,"[3D-structure, Cell adhesion, Cell membrane, C...",Evidence at protein level,reviewed,,[],152,"[Developmental stage (1), Function (1), Involv...",,
9,P32004,L1CAM_HUMAN,[L1CAM CAML1 MIC5],L1CAM,CAML1 MIC5,,,Homo sapiens (Human),9606,Neural cell adhesion molecule L1 (N-CAM-L1) (N...,...,,"[Alternative splicing, Cell adhesion, Cell mem...",Evidence at protein level,reviewed,,"[Immunoglobulin superfamily, L1/neurofascin/Ng...",195,"[Alternative products (1), Function (1), Invol...",,


In [10]:
df_new = df[df['Gene ontology (molecular function)'].notnull()] # don't consider genes with no molecular function

In [11]:
GO_terms_dict = dict(zip(df_new['Entry name'], df_new['Gene ontology (molecular function)']))

In [12]:
GO_terms_dict

{'LRRK2_HUMAN': 'actin binding [GO:0003779]; ATP binding [GO:0005524]; beta-catenin destruction complex binding [GO:1904713]; clathrin binding [GO:0030276]; co-receptor binding [GO:0039706]; glycoprotein binding [GO:0001948]; GTPase activator activity [GO:0005096]; GTPase activity [GO:0003924]; GTP binding [GO:0005525]; GTP-dependent protein kinase activity [GO:0034211]; identical protein binding [GO:0042802]; ion channel binding [GO:0044325]; kinase activity [GO:0016301]; MAP kinase kinase activity [GO:0004708]; microtubule binding [GO:0008017]; peroxidase inhibitor activity [GO:0036479]; protein homodimerization activity [GO:0042803]; protein kinase A binding [GO:0051018]; protein kinase activity [GO:0004672]; protein serine/threonine kinase activity [GO:0004674]; receptor signaling complex scaffold activity [GO:0030159]; Rho GTPase binding [GO:0017048]; SNARE binding [GO:0000149]; syntaxin-1 binding [GO:0017075]; tubulin binding [GO:0015631]',
 'CXB1_HUMAN': 'gap junction channel ac

In [13]:
# Find most common GO terms to use as features
def flatten(l): # taken from https://stackoverflow.com/questions/33900770/most-frequent-values-in-a-dictionary
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, str): #replaced basestring with str for Python3
            for sub in flatten(el):
                yield sub
        else:
            yield el


In [14]:
All_GO_terms = list(flatten(GO_terms_dict.values()))
len(set(All_GO_terms))

1353

In [16]:
# loading the XGboost most important 190 features
feature_scores = np.load("features_ranking.npy")

In [26]:
features = []
for feature_score in feature_scores:
    feature = feature_score[0]
    features.append(feature)

In [61]:
len(features)

190

In [76]:
# initialize data with the features 
for feature in features:
    data_all[feature] = 0

In [77]:
for i in data_all.index:
    gene = data_all.Gene[i]
    gene_entry = gene_entry_dict[gene]
    if gene_entry in GO_terms_dict:
        GO_terms = GO_terms_dict[gene_entry]
        GO_terms = GO_terms.split('; ')# we have to split those again because seperated by ;
        features_inside = list(set(GO_terms).intersection(features))# get only features in the GO_terms that we need
        data_all.loc[i, features_inside] = 1

In [78]:
data_all.shape

(8989, 194)

In [79]:
data_all

Unnamed: 0,Class,Gene,ID,Variation,magnesium ion binding [GO:0000287],protein tyrosine kinase activity [GO:0004713],enzyme binding [GO:0019899],ATP binding [GO:0005524],kinase activity [GO:0016301],transcription corepressor activity [GO:0003714],...,epidermal growth factor receptor binding [GO:0005154],phospholipase binding [GO:0043274],patched binding [GO:0005113],integrin binding [GO:0005178],hepatocyte growth factor-activated receptor activity [GO:0005008],transcription cofactor binding [GO:0001221],core promoter sequence-specific DNA binding [GO:0001046],platelet-derived growth factor binding [GO:0048407],"phosphatidylinositol-3,4-bisphosphate binding [GO:0043325]",interleukin-12 receptor binding [GO:0005143]
0,1.0,FAM58A,0,Truncating Mutations,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,CBL,1,W802*,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,2.0,CBL,2,Q249E,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,3.0,CBL,3,N454D,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,4.0,CBL,4,L399V,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
5,4.0,CBL,5,V391I,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
6,5.0,CBL,6,V430M,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
7,1.0,CBL,7,Deletion,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
8,4.0,CBL,8,Y371H,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
9,4.0,CBL,9,C384R,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [80]:
genes_train = set(train.Gene)
len(genes_train)

264

In [81]:
#### Add the dummy variables for gene to the data
# first initialize with the 264 different genes
for gene in genes_train:
    data_all[gene] = 0

In [84]:
for i in data_all.index:
    gene = data_all.Gene[i]
    if gene in genes_train:
        data_all.loc[i, gene] = 1 

In [86]:
data_all

Unnamed: 0,Class,Gene,ID,Variation,magnesium ion binding [GO:0000287],protein tyrosine kinase activity [GO:0004713],enzyme binding [GO:0019899],ATP binding [GO:0005524],kinase activity [GO:0016301],transcription corepressor activity [GO:0003714],...,ACVR1,CCND3,RYBP,FLT3,CTLA4,BCOR,RAD51B,EP300,TSC2,PPP6C
0,1.0,FAM58A,0,Truncating Mutations,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2.0,CBL,1,W802*,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2.0,CBL,2,Q249E,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3.0,CBL,3,N454D,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4.0,CBL,4,L399V,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4.0,CBL,5,V391I,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,5.0,CBL,6,V430M,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1.0,CBL,7,Deletion,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,4.0,CBL,8,Y371H,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,4.0,CBL,9,C384R,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
new_train = data_all.iloc[:len(train)]
new_test = data_all.iloc[len(train):]