# Retrieve all the GO terms for gene from the Uniprot database based on the classes

In [37]:
import numpy as np
import pandas as pd
import string
import os
from collections import Counter
from collections import defaultdict
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RandomizedLasso
from sklearn.feature_selection import RFE, f_regression, SelectFromModel, RFECV, SelectKBest, chi2
from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import StratifiedKFold

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re
from bioservices import *
import collections
%pylab inline --no-import-all

Populating the interactive namespace from numpy and matplotlib


In [38]:
new_test=pd.read_csv('..//..//..//bases/new_test_variants.csv')
new_test_texts = pd.read_csv('..//..//..//bases/new_test_text.csv', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")
new_test_final=pd.merge(new_test,new_test_texts,how="left",on="ID")

In [39]:
leaks=pd.read_csv('..//..//..//bases/s1_add_train.csv')
leaks_1=pd.DataFrame([leaks["ID"],leaks.drop("ID",axis=1).idxmax(axis=1).map(lambda x: x.lstrip('class'))])
leaks_2=leaks_1.T
leaks_2.columns=["ID","Class"]

In [40]:
train = pd.read_csv('..//..//..//bases/training_variants')
test = pd.read_csv('..//..//..//bases/test_variants')

In [41]:
train_texts = pd.read_csv('..//..//..//bases/training_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")
test_texts = pd.read_csv('..//..//..//bases/test_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")

In [42]:
train = pd.merge(train, train_texts, how='left', on='ID')
test = pd.merge(test, test_texts, how='left', on='ID')

In [43]:
leaks_3=pd.merge(leaks_2,test[test.ID.isin(leaks_2.ID)])
leaks_final=pd.merge(leaks_3,test_texts[test_texts.ID.isin(leaks_3.ID)])

In [44]:
train_all = pd.concat([train,leaks_final]) #adding first stage

In [45]:
merge_match = new_test.merge(train_all, left_on=['Gene', 'Variation'], right_on = ['Gene', 'Variation'])
Index_leak = merge_match.ID_x - 1
new_test_index = [item for item in new_test_final.index if item not in list(Index_leak)]
test_no_leaks = new_test_final.iloc[new_test_index]
test_no_leaks

Unnamed: 0,ID,Gene,Variation,Text
2,3,WNT4,E216G,Mycosis fungoides and Sézary syndrome are prim...
3,4,SUCLA2,G118R,Regulated progression through the cell cycle ...
5,6,CHEK2,E239K,The nuclei that laboratories solution p53 KIT ...
6,7,CHST3,T141M,Myeloid differentiation 88 (MyD88) is the key ...
7,8,RNF6,G244D,Human ESCCs 2 occur frequently worldwide (1) ....
8,9,SPAST,C448Y,large were of activity growth this product tol...
10,11,SCN4A,V445M,Endometrial carcinoma is the most common gynec...
14,15,ERBB2,G746S,The protein-kinase family is the most frequent...
15,16,TP53,Y234S,Among the best-studied therapeutic targets in ...
16,17,RAB27A,A87P,"Introduction In recent years, a better unders..."


In [46]:
train_all['Substitutions_var'] = train_all.Variation.apply(lambda x: bool(re.search('^[A-Z]\\d+[A-Z*]$', x))*1)
new_train = train_all[train_all['Substitutions_var']==1]

In [47]:
train = new_train

In [48]:
# only use gene from train data -> contains the classes
all_genes = set(train.Gene)
print(len(all_genes))
print(all_genes)

186
{'BAP1', 'BTK', 'CCND1', 'MDM4', 'FGFR1', 'B2M', 'PIK3R1', 'MET', 'SRC', 'KLF4', 'NTRK3', 'TCF3', 'RIT1', 'BRAF', 'PMS2', 'CDH1', 'MYOD1', 'PRDM1', 'AKT2', 'CARM1', 'CARD11', 'TSC1', 'SMO', 'ERBB2', 'NTRK2', 'NRAS', 'ATM', 'BRCA2', 'NUP93', 'NTRK1', 'MAP2K4', 'KMT2C', 'SPOP', 'KDM5C', 'CDK4', 'POLE', 'MEF2B', 'KIT', 'CDKN1A', 'MYCN', 'AR', 'STK11', 'CDKN2A', 'SMARCA4', 'BRIP1', 'CTCF', 'MYD88', 'RASA1', 'PTPRD', 'NOTCH1', 'AGO2', 'U2AF1', 'NFKBIA', 'SF3B1', 'EZH2', 'KNSTRN', 'HNF1A', 'ERCC2', 'RAD50', 'CDK6', 'PTEN', 'RET', 'AKT3', 'MYC', 'FOXL2', 'FANCA', 'VHL', 'IGF1R', 'ETV6', 'MTOR', 'FGFR4', 'FOXA1', 'SRSF2', 'APC', 'PTCH1', 'CTNNB1', 'GNAS', 'RAF1', 'ACVR1', 'TET2', 'NF1', 'SMAD4', 'MAPK1', 'RUNX1', 'EGFR', 'RRAS2', 'RXRA', 'ERBB4', 'FOXO1', 'TSC2', 'FGFR3', 'CHEK2', 'SDHB', 'MAP2K1', 'SDHC', 'RB1', 'RAD54L', 'XPO1', 'SOX9', 'SETD2', 'DNMT3A', 'PMS1', 'NFE2L2', 'ESR1', 'H3F3A', 'PIM1', 'RBM10', 'PPP6C', 'PIK3CA', 'EPAS1', 'JAK1', 'GNAQ', 'DIS3', 'EP300', 'FAT1', 'RAC1', 'TGFB

In [49]:
u = UniProt()

In [50]:
res = u.search("ZAP70_HUMAN")
print(res)

Entry	Entry name	Status	Protein names	Gene names	Organism	Length
P43403	ZAP70_HUMAN	reviewed	Tyrosine-protein kinase ZAP-70 (EC 2.7.10.2) (70 kDa zeta-chain associated protein) (Syk-related tyrosine kinase)	ZAP70 SRK	Homo sapiens (Human)	619



In [51]:
u.debugLevel = "INFO"
u.timeout = 100   # some queries are long and requires much more time; default is 1000 seconds

In [52]:
# just an example of query
a = u.search('SLC16A1+AND+organism:9606', frmt='tab', limit=1,
               columns="entry name")

In [53]:
[s.strip() for s in a.splitlines()]

['Entry name', 'MOT1_HUMAN']

In [54]:
gene_entry_dict = {} # here we will keep the gene_entries together with their classes
class_dict = {}
for gene in all_genes:
    gene_classes = list(train.Class[train.Gene==gene])
    keyword = 'gene:%s+AND+organism:9606' %gene #to query database, with gene and organism 9606 is Homo Sapien (human)
    entry_name_tab = u.search(keyword, frmt='tab', limit=1, columns="entry name") 
    entry_name = [s.strip() for s in entry_name_tab.splitlines()][1] # gets the entry name from uniprot i.e. second position in tab
    gene_entry_dict[gene] = entry_name
    class_dict[entry_name] = gene_classes

In [55]:
gene_entry_dict

{'ABL1': 'ABL1_HUMAN',
 'ACVR1': 'ACVR1_HUMAN',
 'AGO2': 'AGO2_HUMAN',
 'AKT1': 'AKT1_HUMAN',
 'AKT2': 'AKT2_HUMAN',
 'AKT3': 'AKT3_HUMAN',
 'ALK': 'TGFR1_HUMAN',
 'APC': 'APC_HUMAN',
 'AR': 'ANDR_HUMAN',
 'ARAF': 'ARAF_HUMAN',
 'ATM': 'ATM_HUMAN',
 'AURKA': 'AURKA_HUMAN',
 'B2M': 'B2MG_HUMAN',
 'BAP1': 'RING2_HUMAN',
 'BRAF': 'BRAF_HUMAN',
 'BRCA1': 'BRCA1_HUMAN',
 'BRCA2': 'BRCA2_HUMAN',
 'BRIP1': 'FANCJ_HUMAN',
 'BTK': 'BTK_HUMAN',
 'CARD11': 'CAR11_HUMAN',
 'CARM1': 'CARM1_HUMAN',
 'CASP8': 'CASP8_HUMAN',
 'CBL': 'CBL_HUMAN',
 'CCND1': 'CCND1_HUMAN',
 'CCND3': 'CCND3_HUMAN',
 'CDH1': 'CADH1_HUMAN',
 'CDK12': 'CDK12_HUMAN',
 'CDK4': 'CDK4_HUMAN',
 'CDK6': 'CDK6_HUMAN',
 'CDKN1A': 'CDN1A_HUMAN',
 'CDKN1B': 'CDN1B_HUMAN',
 'CDKN2A': 'CDN2A_HUMAN',
 'CDKN2B': 'CDN2B_HUMAN',
 'CHEK2': 'CHK2_HUMAN',
 'CIC': 'CIC_HUMAN',
 'CREBBP': 'CBP_HUMAN',
 'CTCF': 'CTCF_HUMAN',
 'CTLA4': 'CTLA4_HUMAN',
 'CTNNB1': 'CTNB1_HUMAN',
 'DDR2': 'DDR2_HUMAN',
 'DICER1': 'DICER_HUMAN',
 'DIS3': 'RRP44_HUMAN',

In [56]:
gene_entries = list(gene_entry_dict.values())
len(gene_entries)

186

In [58]:
df = u.get_df(gene_entries)
df

INFO:root:fetching information from uniprot for 185 entries
INFO:root:uniprot.get_df 1/1
INFO:root:uniprot.get_df 2/1


Unnamed: 0,Entry,Entry name,Gene names,Gene names (primary ),Gene names (synonym ),Gene names (ordered locus ),Gene names (ORF ),Organism,Organism ID,Protein names,...,Miscellaneous [CC],Keywords,Protein existence,Status,Sequence annotation (Features),Protein families,Version,Comments,Cross-reference (null),Pathway.1
0,P27986,P85A_HUMAN,[PIK3R1 GRB1],PIK3R1,GRB1,,,Homo sapiens (Human),9606,Phosphatidylinositol 3-kinase regulatory subun...,...,,"[3D-structure, Acetylation, Alternative splici...",Evidence at protein level,reviewed,,[PI3K p85 subunit family],215,"[Alternative products (1), Caution (1), Domain...",,
1,P36897,TGFR1_HUMAN,[TGFBR1 ALK5 SKR4],TGFBR1,ALK5 SKR4,,,Homo sapiens (Human),9606,TGF-beta receptor type-1 (TGFR-1) (EC 2.7.11.3...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, TKL Ser/Thr prote...",204,"[Alternative products (1), Catalytic activity ...",,
2,P46531,NOTC1_HUMAN,[NOTCH1 TAN1],NOTCH1,TAN1,,,Homo sapiens (Human),9606,Neurogenic locus notch homolog protein 1 (Notc...,...,,"[3D-structure, ANK repeat, Activator, Angiogen...",Evidence at protein level,reviewed,,[NOTCH family],212,"[Function (1), Involvement in disease (2), Pos...",,
3,P21802,FGFR2_HUMAN,[FGFR2 BEK KGFR KSAM],FGFR2,BEK KGFR KSAM,,,Homo sapiens (Human),9606,Fibroblast growth factor receptor 2 (FGFR-2) (...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Tyr protein kinas...",229,"[Alternative products (1), Catalytic activity ...",,
4,P31749,AKT1_HUMAN,[AKT1 PKB RAC],AKT1,PKB RAC,,,Homo sapiens (Human),9606,RAC-alpha serine/threonine-protein kinase (EC ...,...,,"[3D-structure, ATP-binding, Acetylation, Alter...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, AGC Ser/Thr prote...",217,"[Alternative products (1), Catalytic activity ...",,
5,Q06124,PTN11_HUMAN,[PTPN11 PTP2C SHPTP2],PTPN11,PTP2C SHPTP2,,,Homo sapiens (Human),9606,Tyrosine-protein phosphatase non-receptor type...,...,,"[3D-structure, Acetylation, Alternative splici...",Evidence at protein level,reviewed,,"[Protein-tyrosine phosphatase family, Non-rece...",214,"[Alternative products (1), Catalytic activity ...",,
6,P11362,FGFR1_HUMAN,[FGFR1 BFGFR CEK FGFBR FLG FLT2 HBGFR],FGFR1,BFGFR CEK FGFBR FLG FLT2 HBGFR,,,Homo sapiens (Human),9606,Fibroblast growth factor receptor 1 (FGFR-1) (...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, Tyr protein kinas...",234,"[Alternative products (1), Catalytic activity ...",,
7,P42771,CDN2A_HUMAN,[CDKN2A CDKN2 MTS1],CDKN2A,CDKN2 MTS1,,,Homo sapiens (Human),9606,Cyclin-dependent kinase inhibitor 2A (Cyclin-d...,...,,"[3D-structure, ANK repeat, Acetylation, Altern...",Evidence at protein level,reviewed,,[CDKN2 cyclin-dependent kinase inhibitor family],198,"[Alternative products (1), Caution (2), Functi...",,
8,P12830,CADH1_HUMAN,[CDH1 CDHE UVO],CDH1,CDHE UVO,,,Homo sapiens (Human),9606,Cadherin-1 (CAM 120/80) (Epithelial cadherin) ...,...,,"[3D-structure, Alternative splicing, Calcium, ...",Evidence at protein level,reviewed,,[],220,"[Alternative products (1), Domain (1), Functio...",,
9,P37173,TGFR2_HUMAN,[TGFBR2],TGFBR2,,,,Homo sapiens (Human),9606,TGF-beta receptor type-2 (TGFR-2) (EC 2.7.11.3...,...,,"[3D-structure, ATP-binding, Alternative splici...",Evidence at protein level,reviewed,,"[Protein kinase superfamily, TKL Ser/Thr prote...",209,"[Alternative products (1), Catalytic activity ...",,


In [59]:
df['Gene ontology (GO)'][0]

['cell-cell junction [GO:0005911]',
 'cis-Golgi network [GO:0005801]',
 'cytoplasm [GO:0005737]',
 'cytosol [GO:0005829]',
 'membrane [GO:0016020]',
 'nucleus [GO:0005634]',
 'perinuclear endoplasmic reticulum membrane [GO:1990578]',
 'phosphatidylinositol 3-kinase complex [GO:0005942]',
 'phosphatidylinositol 3-kinase complex, class IA [GO:0005943]',
 'plasma membrane [GO:0005886]',
 'protein complex [GO:0043234]',
 '1-phosphatidylinositol-3-kinase activity [GO:0016303]',
 '1-phosphatidylinositol-3-kinase regulator activity [GO:0046935]',
 'ErbB-3 class receptor binding [GO:0043125]',
 'insulin binding [GO:0043559]',
 'insulin receptor binding [GO:0005158]',
 'insulin receptor substrate binding [GO:0043560]',
 'insulin-like growth factor receptor binding [GO:0005159]',
 'neurotrophin TRKA receptor binding [GO:0005168]',
 'phosphatidylinositol 3-kinase binding [GO:0043548]',
 'phosphatidylinositol 3-kinase regulator activity [GO:0035014]',
 'phosphatidylinositol 3-kinase regulatory sub

In [60]:
GO_terms_dict = dict(zip(df['Entry name'], df['Gene ontology (GO)']))

In [61]:
GO_terms_dict

{'2AAA_HUMAN': ['chromosome, centromeric region [GO:0000775]',
  'cytosol [GO:0005829]',
  'dendrite [GO:0030425]',
  'extracellular exosome [GO:0070062]',
  'lateral plasma membrane [GO:0016328]',
  'membrane [GO:0016020]',
  'microtubule cytoskeleton [GO:0015630]',
  'mitochondrion [GO:0005739]',
  'nucleus [GO:0005634]',
  'protein phosphatase type 2A complex [GO:0000159]',
  'antigen binding [GO:0003823]',
  'protein heterodimerization activity [GO:0046982]',
  'protein phosphatase regulator activity [GO:0019888]',
  'protein serine/threonine phosphatase activity [GO:0004722]',
  'apoptotic process [GO:0006915]',
  'ceramide metabolic process [GO:0006672]',
  'chromosome segregation [GO:0007059]',
  'ciliary basal body-plasma membrane docking [GO:0097711]',
  'female meiotic nuclear division [GO:0007143]',
  'G2/M transition of mitotic cell cycle [GO:0000086]',
  'inactivation of MAPK activity [GO:0000188]',
  'meiotic sister chromatid cohesion, centromeric [GO:0051754]',
  'meioti

In [62]:
# Find most common GO terms to use as features
def flatten(l): # taken from https://stackoverflow.com/questions/33900770/most-frequent-values-in-a-dictionary
    for el in l:
        if isinstance(el, collections.Iterable) and not isinstance(el, str): #replaced basestring with str for Python3
            for sub in flatten(el):
                yield sub
        else:
            yield el



In [63]:
All_GO_terms = list(set((flatten(GO_terms_dict.values())))) # we want list of the unique values (set) to use for modelling
len(All_GO_terms)

3327

In [None]:
# Initialization of the 3327 new features with 0's
for terms in All_GO_terms:
    train[terms] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [25]:
# looping through all classes and getting terms for each class
'''terms_per_class = defaultdict(list)
for entry, terms in GO_terms_dict.items():
    if entry in class_dict:
        gene_classes = class_dict[entry]
        for gene_class in gene_classes:
            terms_per_class[gene_class].extend(terms)
           
        
terms_per_class'''

'terms_per_class = defaultdict(list)\nfor entry, terms in GO_terms_dict.items():\n    if entry in class_dict:\n        gene_classes = class_dict[entry]\n        for gene_class in gene_classes:\n            terms_per_class[gene_class].extend(terms)\n           \n        \nterms_per_class'

In [24]:
# code if we want most commons
'''counter_dict = {}
for classes in terms_per_class:
    counter_dict[classes] = Counter(terms_per_class[classes]).most_common(50)'''


'counter_dict = {}\nfor classes in terms_per_class:\n    counter_dict[classes] = Counter(terms_per_class[classes]).most_common(50)'

In [20]:
# adds the molecular function GO terms to each gene in train data frame
for i in train.index:
    gene = train.Gene[i]
    gene_entry = gene_entry_dict[gene]
    if gene_entry in GO_terms_dict:
        GO_terms = GO_terms_dict[gene_entry]
        train.loc[i, GO_terms] = 1

train.shape

(3321, 3806)

In [21]:
train

Unnamed: 0,ID,Gene,Variation,Class,cardiac right atrium morphogenesis [GO:0003213],type 1 angiotensin receptor binding [GO:0031702],negative regulation of triglyceride biosynthetic process [GO:0010868],collecting duct development [GO:0072044],embryonic skeletal limb joint morphogenesis [GO:0036023],response to starvation [GO:0042594],...,catalytic step 2 spliceosome [GO:0071013],response to molecule of bacterial origin [GO:0002237],sulfur compound metabolic process [GO:0006790],defense response to virus [GO:0051607],evasion or tolerance of host defenses by virus [GO:0019049],cellular response to lipopolysaccharide [GO:0071222],protein C-terminus binding [GO:0008022],DNA-methyltransferase activity [GO:0009008],regulation of cell-cell adhesion [GO:0022407],VEGF-activated neuropilin signaling pathway [GO:0038190]
0,0,FAM58A,Truncating Mutations,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,CBL,W802*,2,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2,2,CBL,Q249E,2,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,3,CBL,N454D,3,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,4,CBL,L399V,4,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,5,CBL,V391I,4,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
6,6,CBL,V430M,5,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7,7,CBL,Deletion,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,8,CBL,Y371H,4,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
9,9,CBL,C384R,4,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [90]:
train.to_csv("new_train_uniprot.csv",index=False)

In [22]:
# fit the input X and output Y for the feature selection
X = train[All_GO_terms]
y = train['Class']
names = X.columns
ranks = {}

In [24]:
def rank_to_dict(ranks, names, order=1):
    minmax = MinMaxScaler()
    ranks = minmax.fit_transform(order*np.array([ranks]).T).T[0]
    ranks = map(lambda x: round(x, 2), ranks)
    return dict(zip(names, ranks ))

In [71]:
# Lasso model
lasso = Lasso(alpha=.002, random_state = 3).fit(X,y)
features_lasso = names[np.nonzero(lasso.coef_)]
features_lasso # 182 in total

Index(['negative regulation of glial cell proliferation [GO:0060253]',
       'cellular response to UV [GO:0034644]',
       'activation of protein kinase activity [GO:0032147]',
       'positive regulation of neuron apoptotic process [GO:0043525]',
       'positive regulation of endothelial cell proliferation [GO:0001938]',
       'regulation of interferon-gamma-mediated signaling pathway [GO:0060334]',
       'ATP binding [GO:0005524]', 'RISC-loading complex [GO:0070578]',
       'miRNA metabolic process [GO:0010586]',
       'bone mineralization [GO:0030282]',
       ...
       'B cell proliferation [GO:0042100]',
       'RNA polymerase II core promoter sequence-specific DNA binding [GO:0000979]',
       'methylcytosine dioxygenase activity [GO:0070579]',
       'double-strand break repair [GO:0006302]',
       'phosphotyrosine residue binding [GO:0001784]',
       'scaffold protein binding [GO:0097110]',
       'glyoxylate cycle [GO:0006097]', 'response to estradiol [GO:0032355]',


In [84]:
# L1-SVC model
lsvc = LinearSVC(C=0.02, penalty="l1", dual=False, random_state = 3).fit(X, y)
features_lsvc = names[np.nonzero(lsvc.coef_)[1]]
features_lsvc # 209 in total

Index(['embryonic organ development [GO:0048568]', 'ATP binding [GO:0005524]',
       'positive regulation of ERK1 and ERK2 cascade [GO:0070374]',
       'cytosol [GO:0005829]',
       'protein serine/threonine kinase activity [GO:0004674]',
       'plasma membrane [GO:0005886]', 'DNA binding [GO:0003677]',
       'protein complex assembly [GO:0006461]',
       'chromosome segregation [GO:0007059]',
       'positive regulation of gene expression [GO:0010628]',
       ...
       'cell cycle arrest [GO:0007050]', 'nucleus [GO:0005634]',
       'thymus development [GO:0048538]', 'plasma membrane [GO:0005886]',
       'ATP binding [GO:0005524]', 'cytosol [GO:0005829]',
       'plasma membrane [GO:0005886]',
       'positive regulation of transcription from RNA polymerase II promoter [GO:0045944]',
       'positive regulation of transcription, DNA-templated [GO:0045893]',
       'catalytic step 2 spliceosome [GO:0071013]'],
      dtype='object', length=209)

In [89]:
features_lsvc.intersection(features_lasso)

Index(['ATP binding [GO:0005524]', 'cytosol [GO:0005829]',
       'protein serine/threonine kinase activity [GO:0004674]',
       'plasma membrane [GO:0005886]', 'DNA binding [GO:0003677]',
       'chromosome segregation [GO:0007059]',
       'positive regulation of gene expression [GO:0010628]',
       'MAPK cascade [GO:0000165]',
       'regulation of cell proliferation [GO:0042127]',
       'viral process [GO:0016032]', 'beta-catenin binding [GO:0008013]',
       'covalent chromatin modification [GO:0016569]',
       'signal transduction [GO:0007165]',
       'G1/S transition of mitotic cell cycle [GO:0000082]',
       'protein complex [GO:0043234]', 'cell surface [GO:0009986]',
       'plasma membrane [GO:0005886]', 'apoptotic process [GO:0006915]',
       'nucleoplasm [GO:0005654]', 'signal transduction [GO:0007165]',
       'protein homodimerization activity [GO:0042803]',
       'protein deubiquitination [GO:0016579]', 'protein complex [GO:0043234]',
       'somatic stem cell po

In [52]:
forest = ExtraTreesClassifier(n_estimators=200,
                              random_state=6)
forest.fit(X, y)
model = SelectFromModel(forest, prefit=True)
X_new = model.transform(X)
X_new.shape # reduced to 822 features

(3321, 821)

In [113]:
feature_index

[2121,
 3550,
 1277,
 2158,
 2362,
 3741,
 1760,
 225,
 2516,
 2335,
 1346,
 2205,
 3426,
 3216,
 2695,
 3653,
 33,
 738,
 641,
 2152,
 3428,
 2847,
 1913,
 2295,
 714,
 1542,
 3068,
 1884,
 455,
 1762,
 479,
 266,
 3060,
 1846,
 805,
 283,
 477,
 2058,
 3143,
 2928,
 1128,
 644,
 1828,
 3196,
 832,
 1740,
 1188,
 3386,
 2654,
 861,
 888,
 2313,
 3571,
 873,
 2970,
 2542,
 3552,
 2740,
 3463,
 2765,
 3245,
 3532,
 3421,
 8,
 1989,
 2061,
 1413,
 2311,
 51,
 1799,
 2296,
 1305,
 509,
 6,
 808,
 2864,
 825,
 2736,
 1727,
 677,
 510,
 3044,
 2199,
 914,
 904,
 2150,
 3070,
 1794,
 3742,
 2920,
 2167,
 1670,
 2129,
 2997,
 922,
 175,
 3471,
 2370,
 2325,
 3491,
 335,
 717,
 233,
 3684,
 1143,
 1917,
 10,
 1026,
 3199,
 1547,
 3582,
 1235,
 1202,
 3617,
 630,
 3304,
 2072,
 1920,
 1001,
 2707,
 3165,
 1244,
 524,
 2553,
 900,
 3750,
 2020,
 342,
 1096,
 676,
 1414,
 907,
 2317,
 1134,
 3195,
 161,
 3052,
 1738,
 1079,
 1127,
 2835,
 2218,
 1174,
 3134,
 913,
 1054,
 3272,
 417,
 1891,
 3002

In [123]:
train[train.ix[:,1760]==1]

Unnamed: 0,ID,Gene,Variation,Class,cell-cell junction [GO:0005911],cis-Golgi network [GO:0005801],cytoplasm [GO:0005737],cytosol [GO:0005829],membrane [GO:0016020],nucleus [GO:0005634],...,nuclear export signal receptor activity [GO:0005049],nucleocytoplasmic transporter activity [GO:0005487],Ran GTPase binding [GO:0008536],transporter activity [GO:0005215],mRNA transport [GO:0051028],protein export from nucleus [GO:0006611],regulation of centrosome duplication [GO:0010824],ribosomal large subunit export from nucleus [GO:0000055],ribosomal small subunit export from nucleus [GO:0000056],ribosomal subunit export from nucleus [GO:0000054]
2960,2960,KIT,D816E,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2961,2961,KIT,K558N,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2962,2962,KIT,L576del,7,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2963,2963,KIT,D816F,7,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2964,2964,KIT,A829P,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2965,2965,KIT,D816N,7,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2966,2966,KIT,I563_L576del,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2967,2967,KIT,T670I,7,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2968,2968,KIT,E839K,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2969,2969,KIT,K642E,7,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# saving the train set together with all features from uniprot
train.to_csv("train_uniprot.csv",index=False)

In [58]:
# loading the XGboost most important 190 features
feature_scores = np.load("features_ranking.npy")

In [59]:
features = []
for feature_score in feature_scores:
    feature = feature_score[0]
    features.append(feature)

In [66]:
features

['magnesium ion binding [GO:0000287]',
 'protein tyrosine kinase activity [GO:0004713]',
 'enzyme binding [GO:0019899]',
 'ATP binding [GO:0005524]',
 'kinase activity [GO:0016301]',
 'transcription corepressor activity [GO:0003714]',
 'protein C-terminus binding [GO:0008022]',
 'H3 histone acetyltransferase activity [GO:0010484]',
 'damaged DNA binding [GO:0003684]',
 'protein kinase activity [GO:0004672]',
 'zinc ion binding [GO:0008270]',
 'ATPase activity [GO:0016887]',
 'tubulin binding [GO:0015631]',
 'protein homodimerization activity [GO:0042803]',
 'protein heterodimerization activity [GO:0046982]',
 'protein kinase binding [GO:0019901]',
 'protein serine/threonine kinase activity [GO:0004674]',
 'ubiquitin-protein transferase activity [GO:0004842]',
 'identical protein binding [GO:0042802]',
 'protein complex binding [GO:0032403]',
 'GTPase activity [GO:0003924]',
 'RNA polymerase II core promoter sequence-specific DNA binding [GO:0000979]',
 'transmembrane receptor protein t

In [91]:
# adding only the 190 most important features from XGboost + the dummy variables of gene
train_features = train[features]
train_original = pd.read_csv('..//bases/training_variants')
train_dummy = pd.get_dummies(train_original.Gene) 
train_new = pd.concat([train_original, train_features, train_dummy], axis=1)
train_new.shape

(3321, 458)

In [94]:
train_new


Unnamed: 0,ID,Gene,Variation,Class,magnesium ion binding [GO:0000287],protein tyrosine kinase activity [GO:0004713],enzyme binding [GO:0019899],ATP binding [GO:0005524],kinase activity [GO:0016301],transcription corepressor activity [GO:0003714],...,TSC1,TSC2,U2AF1,VEGFA,VHL,WHSC1,WHSC1L1,XPO1,XRCC2,YAP1
0,0,FAM58A,Truncating Mutations,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,CBL,W802*,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,CBL,Q249E,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,CBL,N454D,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,CBL,L399V,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,5,CBL,V391I,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,6,CBL,V430M,5,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,7,CBL,Deletion,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,8,CBL,Y371H,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,9,CBL,C384R,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# save train_new somewhere
