In [69]:
import numpy as np
import pandas as pd
import string
import os
from collections import Counter

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD

import re

# Creating base

In [7]:
train = pd.read_csv('..//bases/training_variants')
test = pd.read_csv('..//bases/test_variants')

In [8]:
train_texts = pd.read_csv('..//bases/training_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")
test_texts = pd.read_csv('..//bases/test_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")

In [9]:
new_test=pd.read_csv('..//bases/new_test_variants.csv')
new_test_texts = pd.read_csv('..//bases/new_test_text.csv', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")

In [10]:
train = pd.merge(train, train_texts, how='left', on='ID')
test = pd.merge(test, test_texts, how='left', on='ID')
new_test_final=pd.merge(new_test,new_test_texts,how="left",on="ID")
leaks=pd.read_csv('..//bases/s1_add_train.csv')
leaks_1=pd.DataFrame([leaks["ID"],leaks.drop("ID",axis=1).idxmax(axis=1).map(lambda x: x.lstrip('class'))])
leaks_2=leaks_1.T
leaks_2.columns=["ID","Class"]
leaks_3=pd.merge(leaks_2,test[test.ID.isin(leaks_2.ID)])
leaks_final=pd.merge(leaks_3,test_texts[test_texts.ID.isin(leaks_3.ID)])
train_final=pd.concat([train,leaks_final]) #adding first stage


In [13]:
del train,test,leaks,leaks_1,leaks_2,leaks_3

In [19]:
#Transform Gene Letter to their abbreviation in order to find them in the text
One_to_Three_AA = {'C': 'Cys', 'D': 'Asp', 'S': 'Ser', 'Q': 'Gln', 'K': 'Lys',
         'I': 'Ile', 'P': 'Pro', 'T': 'Thr', 'F': 'Phe', 'N': 'Asn', 
         'G': 'Gly', 'H': 'His', 'L': 'Leu', 'R': 'Arg', 'W': 'Trp', 
         'A': 'Ala', 'V': 'Val', 'E': 'Glu', 'Y': 'Tyr', 'M': 'Met'}
pattern = re.compile('|'.join(One_to_Three_AA.keys()))
##### Get variation types by using regex
def variation_regex(data, pattern): # if you want to not ignore cases, add extra argument to function
    Boolean = [not bool(re.search(pattern, i, re.IGNORECASE)) for i in data.Variation]
    data_no_regex = data[Boolean]  # 182 Fusions => 495 over 
    not_Boolean = [not i for i in Boolean]  
    data_regex = data[not_Boolean]
    
    return (data_regex, data_no_regex)

In [39]:
#### process the train and test set together
data_all = pd.concat((train_final, new_test_final), axis=0, ignore_index=True)
data_all_backup = data_all[:] ##### We keep backup because we want dummy variables of Gene & Text 
# TODO maybe also use Variation function of Gene from a database, and other suggestions. Also can use Count_sub as feature


In [40]:
data_all.shape

(4675, 5)

# Functions for pre-processing

In [41]:
def find_sub(data):

    ##### The normal case is around 2080 out of the 2644
    
    
    Boolean = [data.Variation[i] in data.Text[i] or #normal case
               data.Variation[i][:-1] in data.Text[i] or #case 1.
               pattern.sub(lambda x: One_to_Three_AA[x.group()], data.Variation[i][:-1]) # case2
               in data.Text[i]  for i in data.index] ## because new indexing we use 
    
    #TODO could also match insensitive as a next step for more info.
    #Shorter Boolean below = the normal version
    
    #Boolean = [trainSub.Variation[i] in trainSub.Text[i] #normal case
    #           for i in trainSub.ID] ## because new indexing we use ID
    #           
            
    sub_in_text = data[Boolean]
    not_Boolean = [not i for i in Boolean]  

    sub_not_in_text = data[not_Boolean]
#    sub_in_text['Count'] = [sub_in_text.Text[i].count(sub_in_text.Variation[i][:-1])
#                    +sub_in_text.Text[i].count(pattern.sub(lambda x: One_to_Three_AA[x.group()], sub_in_text.Variation[i][:-1]))
#                    for i in sub_in_text.index]
    
    return sub_in_text, sub_not_in_text
##### For subs that are not find in text: use regex to account for a different number
##### TODO: things you can further try - with AA name replacement, searching for the number only etc.
def find_sub_noText(data):
    Booleans = []
    for i in data.index:
        split_variation = re.split('(\d+)', data.Variation[i]) # split based on a number
        first_Amino = re.escape(split_variation[0]) #re.escpae uses variable as regex
        last_Amino = re.escape(split_variation[-1])
        #first_number = re.escape(split_variation[1][0])
        #new_regex = r"[^a-zA-Z0-9]" + first_Amino + first_number
        new_regex  = first_Amino + r"\d+" + last_Amino
        Boolean = bool(re.search(new_regex, data.Text[i]))
        Booleans.append(Boolean)
    
    sub_number_in_text = data[Booleans]
    not_Boolean = [not i for i in Booleans]  

    sub_again_no_text = data[not_Boolean]
    return sub_again_no_text, sub_number_in_text


In [42]:
#### Converts list of sentences into one string of sentences for each document => to use for tfidf etc.
def sentences_to_string(sentences_list):
    sentence_strings = []
    for sentences in sentences_list:
        sentence_string =  ' '.join(str(sentence) for sentence in sentences)
        sentence_strings.append(sentence_string)
    
    return sentence_strings ### This doesn't take such a long time to run


# Subtitutions (subs) processing of data set 

In [43]:
######### First find those that have the format of being a substitution in data
data_all['Substitutions_var'] = data_all.Variation.apply(lambda x: bool(re.search('^[A-Z]\\d+[A-Z*]$', x))*1) #multiplying by 1 converts True to 1, False to 0 => Maybe modify this later?
data_all['Stop_codon_var'] = data_all.Variation.apply(lambda x: bool(re.search('[*]', x))*1) #multiplying by 1 converts True to 1, False to 0
data_sub = data_all[data_all['Substitutions_var']==1] ### Now we know the index of where a substitution occurs - the data_sub

In [44]:
sub_in_text, sub_not_in_text = find_sub(data_sub)
sub_in_text_backup = sub_in_text[:] ## index gets changed by text_processing if we don't make a copy
##### INVESTIGATION: Why do some subs don't appear in Text?: Try to automize this and find out
### Substitutions can appear as SAME_PREFIX - Other number - SAME_SUFFIX

sub_again_no_Text, sub_noText = find_sub_noText(sub_not_in_text) # 108 such cases out of 411 = nice improvement
sub_noText_backup = sub_noText[:]

Working on variations who have the 2 letters right but not same numbers

In [45]:
#nltk.download("popular")

In [46]:
##############################################################################################################################
############## Non-subs preprocessing of data set #######################

#def find_mutation_type(row, pattern):  ##### TODO: make clearer by using a function instead of lambda
#    return bool(re.search('^fusion', row, re.IGNORECASE)) *1. Also for subs

####### Fusions : 'Fusions' ############
data_all['Fusion_var'] = data_all.Variation.apply(lambda x: bool(re.search('^fusion', x, re.IGNORECASE))*1) #multiplying by 1 converts True to 1, False to 0
new_fusion, new_data_all = variation_regex(data_all, '^fusion') 

###### Fusions: 'Gene-Gene fusion' ########
data_all['gene_fusion_var'] = new_data_all.Variation.apply(lambda x: bool(re.search('fusion', x, re.IGNORECASE))*1) 
_ , new_data_all = variation_regex(new_data_all, 'fusion') 
###### Notice that NaN introduced for places where splicing occured => replace after NaN with 0's when complete

####### Deletions: 'Deletions' ############
data_all['Deletion_var'] = new_data_all.Variation.apply(lambda x: bool(re.search('^del', x, re.IGNORECASE))*1) 
new_del, new_data_all = variation_regex(new_data_all, '^del') 

####### Deletions & Insertions wheteher together or seperately (doesn't make a big difference IMO)
data_all['del_or_ins_var'] = new_data_all.Variation.apply(lambda x: bool(re.search('del|ins', x, re.IGNORECASE))*1) 
# we could also later divide it into del, ins if we want to

###### Amplifications #########
data_all['Amplification_var'] = data_all.Variation.apply(lambda x: bool(re.search('ampl', x, re.IGNORECASE))*1) 

###### Truncations ########### Don't forget there are 'Truncating mutations' = 95 and '_trunc' = 4
data_all['Truncation_var'] = data_all.Variation.apply(lambda x: bool(re.search('trunc', x, re.IGNORECASE))*1) 

####### Exons #########
data_all['exon_var'] = data_all.Variation.apply(lambda x: bool(re.search('exon', x, re.IGNORECASE))*1) 

####### Frameshift mutations ########
data_all['frameshift_var'] = data_all.Variation.apply(lambda x: bool(re.search('fs', x, re.IGNORECASE))*1) 

####### Duplications ##############
data_all['dup_var'] = data_all.Variation.apply(lambda x: bool(re.search('dup', x, re.IGNORECASE))*1) 

data_all.fillna(0, inplace = True)



In [47]:
#TODO : #Sentence Tokenizer for non-subs that are not in text, AND
#subs that are not in text

In [48]:
#Not used
################ The dummy variables for Gene and Text ##################
## TODO: also use dummy for Text? There are 135 shared Genes and 142 shared Text between train and Leaks!  len(set(train.Text) & set(Leaks.Text))
#data_all_dummy = data_all_backup[['Gene', 'Text']] # drop those columns we don't need as dummy.
#X_dummy = pd.get_dummies(data_all_dummy) # converts categorical variables into dummy variable. From len set => 269 genes + 2090 texts
#X_dummy_train = X_dummy[:train.shape[0]]
#X_dummy_test = X_dummy[train.shape[0]:]
#dummy_names = X_dummy.columns.values #### To remember names if you want to check again what Gene or Text used
#X_dummy = X_dummy.values

In [49]:
###### Use the variation types 
#variation_types = data_all.drop(['ID', 'Gene', 'Class', 'Text', 'Variation'], axis =1)
#X_variation_train = variation_types[:train.shape[0]]
#X_variation_test = variation_types[train.shape[0]:]
#variation_names = variation_types.columns.values 

# Cleaning 

In [32]:
stop = set(stopwords.words('english'))
exclude = set('!"#$%&\'()*+:;<=>?@[\\]^_`{|}~0123456789') 
lemma = WordNetLemmatizer()
def clean(doc,lemmatiz=False):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free_0 = [re.sub(",|\.|/"," ",ch) for ch in stop_free]
    if lemmatiz:
        punc_free_lem="".join(ch for ch in punc_free_0 if ch not in exclude)
        normalized = " ".join(lemma.lemmatize(word) for word in punc_free_lem.split())
        return normalized
    else:
        punc_free = "".join(ch for ch in punc_free_0 if ch not in exclude)
        return punc_free

In [33]:
#No lemmatization for the moment, be careful not to lemmatize then w2vec
data_all.Text = [clean(doc) for doc in data_all.Text]  

In [50]:
for i in data_all.loc[1,"Text"].lower().split():
    print(i)

abstract
background
non-small
cell
lung
cancer
(nsclc)
is
a
heterogeneous
group
of
disorders
with
a
number
of
genetic
and
proteomic
alterations.
c-cbl
is
an
e3
ubiquitin
ligase
and
adaptor
molecule
important
in
normal
homeostasis
and
cancer.
we
determined
the
genetic
variations
of
c-cbl,
relationship
to
receptor
tyrosine
kinases
(egfr
and
met),
and
functionality
in
nsclc.
methods
and
findings
using
archival
formalin-fixed
paraffin
embedded
(ffpe)
extracted
genomic
dna,
we
show
that
c-cbl
mutations
occur
in
somatic
fashion
for
lung
cancers.
c-cbl
mutations
were
not
mutually
exclusive
of
met
or
egfr
mutations;
however
they
were
independent
of
p53
and
kras
mutations.
in
normal/tumor
pairwise
analysis,
there
was
significant
loss
of
heterozygosity
(loh)
for
the
c-cbl
locus
(22%,
n
=
8/37)
and
none
of
these
samples
revealed
any
mutation
in
the
remaining
copy
of
c-cbl.
the
c-cbl
loh
also
positively
correlated
with
egfr
and
met
mutations
observed
in
the
same
samples.
using
select
c-cbl
somatic

the
qloh
(allelic
ratio
for
the
tumor
peaks
divided
by
the
allelic
ratio
of
paired
normal
sample)
was
≤0.5
or
≥2.0
for
c-cbl
and
at
least
one
other
11q
marker
in
at
least
two
separate
experiments,
the
sample
was
considered
as
having
an
allelic
imbalance
and
interpreted
as
loh.
samples
were
evaluated
in
at
least
two
separate
experiments
and
samples
showing
prospective
loh
at
c-cbl
repeated
a
third
time
which
included
a
new
control
marker
at
the
bax
locus
(data
not
shown)
on
chromosome
19
to
verify
integrity
of
sample
dna.
transfection
of
c-cbl
constructs
the
a549
cell
line
was
transfected
using
the
fugene
hd
(roche,
nutley,
nj)
reagent
according
to
the
manufacturer's
instructions.
eight
µg
of
plasmid
dna,
containing
either
no
insert
(empty
vector),
wild-type
c-cbl,
s80n/h94y
c-cbl,
q249e
c-cbl
or
w802*
cbl
was
used
for
transfection
in
a
6-well
culture
plate.
cells
were
harvested
48
h
after
transfection
and
analyzed
for
expression.
c-cbl
knockdown
c-cbl
knockdown
was
performed
using
lent

the
results
are
depicted
in
figure
5b.
as
expected,
number
of
cells
increased
in
a
time
dependent
fashion
from
100
to
190%
relative
to
scrambled
shrna
as
control
in
a
span
of
48
h
(p
=
0.0002)
(figure
5b).
the
cell
cycle
phases
in
h358
cells
that
were
knocked
down
with
c-cbl
shrna
were
looked
at
and
compared
with
the
scrambled
shrna.
there
were
no
discernable
differences
between
these
two
constructs
in
the
different
phases
of
the
cell
cycle
(data
not
shown).
figure
5
figure
5
knockdown
of
c-cbl
using
an
shrna
increases
cell
proliferation.
go
to:
discussion
our
results
demonstrate
that
c-cbl
is
somatically
mutated
(or
has
loh)
in
lung
cancers,
and
can
significantly
contribute
to
enhanced
cell
viability
and
motility.
there
was
also
a
high
prevalence
of
loh
with
respect
to
c-cbl
in
lung
tumors
that
harbored
met
or
egfr
mutation.
in
the
present
study,
we
have
demonstrated
the
occurrence
of
c-cbl
mutations
in
lung
cancer
patients,
especially
with
different
ancestral
variations.
mutations
in

# some more features engineering

In [51]:
# Feature for the length of the text
data_all["Text_words"] = data_all["Text"].map(lambda x: len(str(x).split(" ")))

In [118]:
new_train = data_all.iloc[:len(train_final)]
new_test = data_all.iloc[len(train_final):]

In [119]:
svd = TruncatedSVD(n_components=25, n_iter=12, random_state=26)

one_hot_gene = pd.get_dummies(new_train['Gene'])
one_hot_gene_test=pd.get_dummies(new_test["Gene"])
one_hot_gene_test=fix_test_columns(one_hot_gene_test,one_hot_gene.columns)
truncated_one_hot_gene = svd.fit_transform(one_hot_gene.values)
truncated_one_hot_gene_for_test=svd.transform(one_hot_gene_test.values)

extra columns: {'EIF2B5', 'BAG3', 'WISP3', 'ADGRG1', 'BBS5', 'SF3B2', 'ZFPM2', 'B4GALT7', 'EPHA5', 'APOL1', 'ABCB11', 'SCN9A', 'GNE', 'DNAH5', 'ROCK1', 'NDUFS6', 'DPM1', 'KCNE2', 'MPDU1', 'LITAF', 'RP1', 'AURKC', 'PLA2G6', 'PDE8B', 'AP3B1', 'TRPM1', 'SIX3', 'RGS9', 'SUCLA2', 'IKBKAP', 'RAB27A', 'LRP6', 'SYT6', 'PHOX2B', 'CYP7B1', 'TNFRSF11A', 'SLC17A5', 'SLC7A7', 'SLC33A1', 'MOCS1', 'DNAI1', 'ABCC6', 'KERA', 'SLC22A4', 'CRB1', 'TRPC6', 'CRLF1', 'GALK1', 'BFSP2', 'SLC25A13', 'SLC25A15', 'EPHB2', 'TTK', 'LCT', 'STK19', 'EPHA2', 'WNT4', 'STK33', 'CRNKL1', 'MOCS2', 'GPHN', 'KISS1R', 'XRCC1', 'MYOT', 'SLC27A4', 'LRP4', 'CSF1R', 'RAD54B', 'SPAST', 'TGM5', 'CHST3', 'MCC', 'DYNC2H1', 'TP63', 'SLC25A12', 'FLNB', 'GCM2', 'PNPO', 'PTCH2', 'DNM1L', 'ALOX12B', 'SCN4A', 'ITM2B', 'TSHR', 'RECQL4', 'KCNJ13', 'BMPR1B', 'RPS19', 'DCC', 'OTOF', 'SLC6A5', 'SEPT9', 'AKAP9', 'STAT5B', 'KRIT1', 'RNF6', 'SLC19A2', 'ASS1', 'CILP', 'NDUFS3', 'NEK8', 'COX15', 'SLC7A9', 'KCNMB1', 'CST3', 'GJB3', 'LARGE1', 'SCO1',

In [120]:
genes_train=pd.DataFrame(truncated_one_hot_gene,columns=["tsvd_gene"+ str(x) for x in range(0,25)])
genes_test=pd.DataFrame(truncated_one_hot_gene_for_test,columns=["tsvd_gene"+ str(x) for x in range(0,25)])

In [121]:
new_test["index"]=range(0,len(new_test))
new_train_1=pd.merge(new_train.reset_index(),genes_train.reset_index()).drop("index",axis=1)
new_test_1=pd.merge(new_test,genes_test.reset_index()).drop("index",axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [122]:
new_train_1.to_csv("checkpoints_databases/nw_working_train.csv",index=False,encoding="utf8")
new_test_1.to_csv("checkpoints_databases/nw_working_test.csv",index=False,encoding="utf8")

In [116]:
new_train_1.shape

(3689, 42)

In [117]:
new_test_1.shape

(986, 42)