In [15]:
import numpy as np
import pandas as pd
import string
import os
from collections import Counter

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from segtok.segmenter import split_single, split_multi #seems to be the better choice than nltk despite lower speed

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
pd.options.display.max_colwidth = 50
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Creating base

In [3]:
train = pd.read_csv('..//bases/new_training_variants.csv')
test = pd.read_csv('..//bases/new_test_variants.csv')


In [4]:
train_texts = pd.read_csv('..//bases/new_training_text.csv')
test_texts = pd.read_csv('..//bases/new_test_text.csv', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")

In [5]:
train_all = pd.merge(train, train_texts, how='left', on='ID')
test_all = pd.merge(test, test_texts, how='left', on='ID')

In [6]:
#### process the train and test set together
data_all = pd.concat((train_all, test_all), axis=0, ignore_index=True)
data_all = data_all[['Class', 'Gene', 'ID', 'Variation', 'Text']] # just reordering
data_all_backup = data_all[:] ## We keep backup in case we need to use again
data_all

Unnamed: 0,Class,Gene,ID,Variation,Text
0,1.0,FAM58A,0,Truncating Mutations,Cyclin-dependent kinases (CDKs) regulate a var...
1,2.0,CBL,1,W802*,Abstract Background Non-small cell lung canc...
2,2.0,CBL,2,Q249E,Abstract Background Non-small cell lung canc...
3,3.0,CBL,3,N454D,Recent evidence has demonstrated that acquired...
4,4.0,CBL,4,L399V,Oncogenic mutations in the monomeric Casitas B...
5,4.0,CBL,5,V391I,Oncogenic mutations in the monomeric Casitas B...
6,5.0,CBL,6,V430M,Oncogenic mutations in the monomeric Casitas B...
7,1.0,CBL,7,Deletion,CBL is a negative regulator of activated recep...
8,4.0,CBL,8,Y371H,Abstract Juvenile myelomonocytic leukemia (JM...
9,4.0,CBL,9,C384R,Abstract Juvenile myelomonocytic leukemia (JM...


In [7]:
#Transform Amino Acid (AA) Letter to their three-letter abbreviation in order to find them in the text when they appear
One_to_Three_AA = {'C': 'Cys', 'D': 'Asp', 'S': 'Ser', 'Q': 'Gln', 'K': 'Lys',
         'I': 'Ile', 'P': 'Pro', 'T': 'Thr', 'F': 'Phe', 'N': 'Asn', 
         'G': 'Gly', 'H': 'His', 'L': 'Leu', 'R': 'Arg', 'W': 'Trp', 
         'A': 'Ala', 'V': 'Val', 'E': 'Glu', 'Y': 'Tyr', 'M': 'Met'}
pattern = re.compile('|'.join(One_to_Three_AA.keys()))

# Adding extra feature(s)

In [8]:
# Feature for the length of the text
data_all["Text_words"] = data_all["Text"].map(lambda x: len(str(x).split(" ")))

# Substitutions (subs)
## Functions for pre-processing of subs

In [9]:
# find_sub return the substituions that are in text and those that are not
def find_sub(data):    
    Boolean = [data.Variation[i][:-1] in data.Text[i] or #case 1.
               pattern.sub(lambda x: One_to_Three_AA[x.group()], data.Variation[i][:-1]) # case2
               in data.Text[i]  for i in data.index] ## because new indexing we use 
    
    sub_in_text = data[Boolean]
    not_Boolean = [not i for i in Boolean]  
    sub_not_in_text = data[not_Boolean]
    
    return sub_in_text, sub_not_in_text

# find_sub_numberChange searches for other number of a substitution i.e. G12V -> G_V because sometimes mistake in entry
# Is currently without One_to_three substitution or Variation[:-1] only the full variation
def find_sub_numberChange(data):
    Booleans = [] #will contain the different Booleans if found in text
    for i in data.index:
        split_variation = re.split('(\d+)', data.Variation[i]) # split based on a number
        first_Amino = re.escape(split_variation[0]) #re.escpae uses variable as regex
        last_Amino = re.escape(split_variation[-1])
        new_regex  = first_Amino + r"\d+" + last_Amino
        Boolean = bool(re.search(new_regex, data.Text[i]))
        Booleans.append(Boolean)
        
    sub_number_in_text = data[Booleans]
    not_Boolean = [not i for i in Booleans]  
    sub_number_no_text = data[not_Boolean]
    
    return sub_number_in_text, sub_number_no_text

# for substitutions that are still not found, use other keywords
def find_sub_pattern(data, pattern):    
    Boolean = [pattern in data.Text[i] for i in data.index] ## because new indexing we use 
    
    sub_in_text = data[Boolean]
    not_Boolean = [not i for i in Boolean]  
    sub_not_in_text = data[not_Boolean]
    
    return sub_in_text, sub_not_in_text


In [10]:
##### get_sentences_sub use a window to extract sentences where the subs appear. 
# If window_left & window_right = 0 => just taking the sentences with subs

def get_sentences_sub(data, splitted_sentences, window_left, window_right):
    data.index = range(len(data)) #makes sure that index is right
    sentences_with_sub = [[] for _ in range(len(data))]
    for i in range(len(splitted_sentences)):
        sentences = splitted_sentences[i]
        one_to_three_variation = pattern.sub(lambda x: One_to_Three_AA[x.group()], data.Variation[i][:-1])
        Variation = data.Variation[i][:-1]        
        for j in range(len(sentences)):                              
            if (Variation in sentences[j]) or (one_to_three_variation in sentences[j]):
                new_regex = r"[\S]*" + re.escape(Variation) + r"[\S]*" ###  r"[\S]*" because we look for Variation[:-1] not just Variation
                sentences[j] = re.sub(new_regex, 'placeholderMutation', sentences[j]) #case 1
                ### We add the space to ' placeholderMutation' because sometimes there are letters in front of it
                new_regex = re.escape(one_to_three_variation) + r"[\S]*"
                sentences[j] = re.sub(new_regex, ' placeholderMutation', sentences[j]) #case 2
                sentences_with_sub[i].extend(sentences[max(j-window_left,0) : min(j+1+window_right, len(sentences)-1)])               
    
    return sentences_with_sub

##### get_sentences_sub_number use a window to extract sentences where the subs appear that have different number i.e. G12V -> G_V

def get_sentences_sub_number(data, splitted_sentences, window_left, window_right):
    #position_sentences = [[] for _ in range(len(data))]  #### currently not used
    data.index = range(len(data))
    sentences_with_sub_number = [[] for _ in range(len(data))]
    for i in range(len(splitted_sentences)):
        sentences = splitted_sentences[i] 
        for j in range(len(sentences)):
            split_variation = re.split('(\d+)', data.Variation[i]) # split based on a number
            first_Amino = re.escape(split_variation[0]) #re.escpae uses variable as regex
            last_Amino = re.escape(split_variation[-1])
            new_regex  = first_Amino + r"\d+" + last_Amino
            Boolean = bool(re.search(new_regex, sentences[j]))            
            if Boolean:
                sentences[j] = re.sub(new_regex, ' placeholderMutation', sentences[j]) # Again replacing the sentence with placeholder
                sentences_with_sub_number[i].extend(sentences[max(j-window_left,0) : min(j+1+window_right,len(sentences)-1)])
    
    return sentences_with_sub_number

# for substitutions that are still not found, use other keywords

def get_sentences_pattern(data, splitted_sentences, pattern, window_left, window_right):
    data.index = range(len(data)) #makes sure that index is right
    sentences_with_sub = [[] for _ in range(len(data))]
    for i in range(len(splitted_sentences)):
        sentences = splitted_sentences[i] 
        for j in range(len(sentences)):                              
            if (pattern in sentences[j]):
                sentences_with_sub[i].extend(sentences[max(j-window_left,0) : min(j+1+window_right, len(sentences)-1)])               
    
    return sentences_with_sub

In [None]:
###### LENGTH of sentences <= 10 : we run split again with segtok instead of NLTK
###### LENGTH < 5: window of 2,2   5 <= LENGTH <= 10: window of 1,1 

def get_window_sub(data, splitted_sentences):
    

In [11]:
#### Converts list of sentences into one string of sentences for each document => to use for tfidf etc.
def sentences_to_string(sentences_list):
    sentence_strings = []
    for sentences in sentences_list:
        sentence_string =  ' '.join(str(sentence) for sentence in sentences)
        sentence_strings.append(sentence_string)
    
    return sentence_strings 

## Subs processing of the data set 

In [12]:
######### First find those that have the format of being a substitution in data
data_all['Substitutions_var'] = data_all.Variation.apply(lambda x: bool(re.search('^[A-Z]\\d+[A-Z*]$', x))*1) #multiplying by 1 converts True to 1, False to 0 => Maybe modify this later?
data_all['Stop_codon_var'] = data_all.Variation.apply(lambda x: bool(re.search('[*]', x))*1) #multiplying by 1 converts True to 1, False to 0
data_sub = data_all[data_all['Substitutions_var']==1] ### Now we know the index of where a substitution occurs - the data_sub
print("Length of total subs: %i" %len(data_sub)) # other ways to process it like finding the word 'mutation'

Length of total subs: 3820


#### Subs inside the text

In [13]:
## First consider the subs that appear in text
sub_text, sub_no_text = find_sub(data_sub) 

In [14]:
## use tokenizer to split into sentences of all the subs in text 
NLTK_sub = [sent_tokenize(sub_text.Text[i]) for i in sub_text.index] 

In [396]:
# extract window for the sub sentences where they appear
# !! Use [:] because it makes a copy and doesn't change anything to the original indexes
sub_sentences = get_sentences_sub(sub_text[:], NLTK_sub[:], window_left = 0, window_right = 0) # choosing for window 0 as default now
sub_sentences_string = sentences_to_string(sub_sentences)

In [345]:
# Replace text in data_all
data_all.Text.iloc[sub_text.index] = sub_sentences_string #iloc for indexing based on integers
data_all.Text.iloc[sub_text.index]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


1       Using select c-CBL somatic mutations such as S...
2       Using select c-CBL somatic mutations such as S...
3       Most of the changes were novel, although 4 cas...
4       Finally, the third group constituted mutations...
5       Finally, the third group constituted mutations...
6       The second group of mutants (M374V, placeholde...
8       We investigated the mechanism by which placeho...
9       We investigated the mechanism by which CBL-Y37...
10      Finally, the third group constituted mutations...
11      Purified PCR amplicons (CBLWT, CBLC381A, place...
12      Purified PCR amplicons (CBLWT, CBLC381A, CBLK3...
13      Purified PCR amplicons (CBLWT, placeholderMuta...
14      The second group of mutants (M374V, V430M, pla...
15      Purified PCR amplicons (CBLWT, CBLC381A, CBLK3...
17      When introduced into Lin- Sca1+ c-Kit+ (LSK) H...
18      The second group of mutants placeholderMutatio...
19      When introduced into Lin- Sca1+ c-Kit+ (LSK) H...
20      Using 

In [16]:
#### Now use a window for the smaller size sentences (with LENGTH <= 10 sentences)

NameError: name 'sub_sentences_string' is not defined

#### Subs with a different number inside the text


In [346]:
# now the subs that don't appear in text: one reason is different number in the substitution. f.e. G12V -> G13V 
sub_number_text, sub_number_no_text = find_sub_numberChange(sub_no_text) # 108 such cases out of 411 = nice improvement

In [347]:
## use tokenizer to split into sentences of subs that have different number in text 
NLTK_sub_number = [sent_tokenize(sub_number_text.Text[i]) for i in sub_number_text.index]

In [348]:
# extract window for the sub sentences where they appear
# !! Use [:] because it makes a copy and doesn't change anything to the original indexes
sub_number_sentences = get_sentences_sub_number(sub_number_text[:], NLTK_sub_number[:], window_left = 0, window_right = 0)
sub_number_string = sentences_to_string(sub_number_sentences)

In [349]:
data_all.Text.iloc[sub_number_text.index] = sub_number_string #iloc for indexing based on integers
data_all.Text.iloc[sub_number_text.index]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


44      His-tagged versions of the catalytic region of...
49      His-tagged versions of the catalytic region of...
61      His-tagged versions of the catalytic region of...
63      His-tagged versions of the catalytic region of...
66      His-tagged versions of the catalytic region of...
73      While both were missense mutations, in silico ...
208      placeholderMutation and T847I were also found...
213     Complex mutation  placeholderMutation and dele...
267     As a positive control, we introduced the well-...
275     And in the series of Pallis and colleagues (12...
306     Recurrent mutations in H3F3A, which encodes th...
307     Histone H3-K42me2 may regulate transcription  ...
308     Recurrent mutations in H3F3A, which encodes th...
521     Both combinations contained mutations of codon...
607     In the present study, we have focused on two d...
632     Such mutant amino acid specificity was also ap...
730     In contrast, ectopic expression of FGFR4  plac...
1081    Recent

#### Subs that are still not found

In [350]:
# subs that are still not found, look based on a pattern like the word 'mutat' in text
sub_pattern_text, sub_pattern_no_text = find_sub_pattern(sub_number_no_text, 'mutat') # 108 such cases out of 411 = nice improvement

In [352]:
sub_pattern_no_text # only 14 texts that still don't include sentences with the word 'mutat', of which three nulls

Unnamed: 0,Class,Gene,ID,Variation,Text,Substitutions_var,Stop_codon_var
140,5.0,EGFR,140,I491M,The accurate determination of perfluoroalkyl s...,1,0
145,2.0,EGFR,145,K467T,The accurate determination of perfluoroalkyl s...,1,0
259,2.0,EGFR,259,S464L,The accurate determination of perfluoroalkyl s...,1,0
416,4.0,TP53,416,P151S,The effects of chlorpromazine on various prope...,1,0
1109,1.0,FANCA,1109,S1088F,,1,0
1407,6.0,FGFR3,1407,K508M,,1,0
1613,4.0,VHL,1613,L158Q,The case of a 40-year-old woman with severe ed...,1,0
1936,7.0,CARD11,1936,G116S,Left ventricular (LV) remodeling is a signific...,1,0
1942,7.0,CARD11,1942,E127G,Left ventricular (LV) remodeling is a signific...,1,0
2302,7.0,JAK1,2302,R724H,Regulatory T (T reg) cells have a major role i...,1,0


In [353]:
## use tokenizer to split into sentences of subs that have keyword 'mutat'
NLTK_sub_pattern = [sent_tokenize(sub_pattern_text.Text[i]) for i in sub_pattern_text.index]

In [354]:
# !! Use [:] because it makes a copy and doesn't change anything to the original indexes
sub_pattern_sentences = get_sentences_pattern(sub_pattern_text[:], NLTK_sub_pattern[:], 'mutat', window_left = 0, window_right = 0)
sub_pattern_string = sentences_to_string(sub_pattern_sentences)

In [355]:
data_all.Text.iloc[sub_pattern_text.index] = sub_pattern_string #iloc for indexing based on integers
data_all.Text.iloc[sub_pattern_text.index]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


46      Mutations in CBL, encoding an E3 ubiquitin lig...
80      This D1 domain contains only two serine residu...
102     Identification of a high-risk disease-causing ...
104     In this study, we profiled 70 colorectal cance...
106     Identification of a high-risk disease-causing ...
108     Most Lynch Syndrome-associated CRCs are report...
111     Identification of a high-risk disease-causing ...
112     Germ-line MSH6 mutations, which are rare in HN...
136     In 2011, whole-exome sequencing studies showed...
151     We investigated the association between mutati...
155     Clinical data and epidermal growth factor rece...
161     Clinical data and epidermal growth factor rece...
168     Currently available methods of EGFR mutation d...
178     Those were analyzed for EGFR and k-ras (Kirste...
182     Purpose: Clinical reports about responsiveness...
185     ‘Classical' mutations in the EGFR tyrosine kin...
215     Purpose: Clinical features of epidermal growth...
222     Findin

# Non-substitutions (no subs)
## Functions for pre-processing of no_subs like fusions, deletions, truncations, etc.

In [358]:
## Gets different non-subs variation types by using regex
def variation_regex(data, pattern): 
    Boolean = [bool(re.search(pattern, i, re.IGNORECASE)) for i in data.Variation]
    data_regex = data[Boolean] 
    not_Boolean = [not i for i in Boolean]  
    data_no_regex = data[not_Boolean]
    
    return (data_regex, data_no_regex)

In [359]:
def custom_find(data):
    l=["mutat","variat","fusion","deleti","amplific","framesh",
    "truncat","fs","exon","dupl"]
    Boolean={}
    for word in l:
        Boolean[word] = [re.search(word,data.Text[i]) for i in data.index] ## because new indexing we use 
    Bool=[any(tup) for tup in zip(Boolean["mutat"],Boolean["variat"],Boolean["fusion"],
        Boolean["deleti"],Boolean["amplific"],Boolean["truncat"],
        Boolean["fs"],Boolean["framesh"],Boolean["exon"],Boolean["dupl"])]
    no_sub_in_text=data[Bool]
    no_Bool=[not i for i in Bool]
    no_sub_not_in_text=data[no_Bool]
    return no_sub_in_text,no_sub_not_in_text


In [360]:
def get_sentences_pattern(data, splitted_sentences, pattern, window_left, window_right):
    data.index = range(len(data)) #makes sure that index is right
    sentences_with_sub = [[] for _ in range(len(data))]
    for i in range(len(splitted_sentences)):
        sentences = splitted_sentences[i] 
        for j in range(len(sentences)):                              
            if (pattern in sentences[j]):
                sentences_with_sub[i].extend(sentences[max(j-window_left,0) : min(j+1+window_right, len(sentences)-1)])               
    
    return sentences_with_sub

In [361]:
def get_sentences_nosub(data, splitted_sentences, window_left, window_right):
    #position_sentences = [[] for _ in range(len(data))]  #### currently not used
    data.index = range(len(data))
    sentences_with_sub = [[] for _ in range(len(data))]

    for i in range(len(splitted_sentences)):
        sentences = splitted_sentences[i]
        l=["mutat","variat","fusion","deleti","amplific","framesh",
        "truncat","fs","exon","dupl"]     
        for j in range(len(sentences)):    
            for keyword in l:
                if keyword in sentences[j]:
                    sentences_with_sub[i].extend(sentences[max(j-window_left,0) : min(j+1+window_right, len(sentences)-1)])
    return sentences_with_sub

## No subs processing of the data set 

In [362]:
#### First find those that have the format of being a non-substitutions in the data

# Initialize some of the variables already (because we splice them, could be filled with NA's otherwise)
# Represents 783 from train and test set
data_all['gene_fusion_var'] = 0
data_all['Deletion_var'] = 0
data_all['del_or_ins_var'] = 0

####### Fusions : 'Fusions' ############
data_all['Fusion_var'] = data_all.Variation.apply(lambda x: bool(re.search('^fusion', x, re.IGNORECASE))*1) #multiplying by 1 converts True to 1, False to 0
_ , new_data_all = variation_regex(data_all, '^fusion')  #37 cases

###### Fusions: 'Gene-Gene fusion' ########
data_all['gene_fusion_var'].loc[new_data_all.index] = new_data_all.Variation.apply(lambda x: bool(re.search('fusion', x, re.IGNORECASE))*1)
_ , new_data_all = variation_regex(new_data_all, 'fusion') #160 cases

####### Deletions: 'Deletions' ############
data_all['Deletion_var'].loc[new_data_all.index] = new_data_all.Variation.apply(lambda x: bool(re.search('^del', x, re.IGNORECASE))*1)
_, new_data_all = variation_regex(new_data_all, '^del') # 88 cases

####### Deletions & Insertions wheteher together or seperately (doesn't make a big difference IMO)
data_all['del_or_ins_var'].loc[new_data_all.index] = new_data_all.Variation.apply(lambda x: bool(re.search('del|ins', x, re.IGNORECASE))*1)
# 196 cases

###### Amplifications #########
data_all['Amplification_var'] = data_all.Variation.apply(lambda x: bool(re.search('ampl', x, re.IGNORECASE))*1) # 79 cases

###### Truncations ########### Don't forget there are 'Truncating mutations'  '_trunc' 
data_all['Truncation_var'] = data_all.Variation.apply(lambda x: bool(re.search('trunc', x, re.IGNORECASE))*1) # 118 cases

####### Exons #########
data_all['exon_var'] = data_all.Variation.apply(lambda x: bool(re.search('exon', x, re.IGNORECASE))*1) 

####### Frameshift mutations ########
data_all['frameshift_var'] = data_all.Variation.apply(lambda x: bool(re.search('fs', x, re.IGNORECASE))*1) # 22 cases

####### Duplications ##############
data_all['dup_var'] = data_all.Variation.apply(lambda x: bool(re.search('dup', x, re.IGNORECASE))*1)

####### Hypermethylation ##########
data_all['overexpression_var'] = data_all.Variation.apply(lambda x: bool(re.search('Overexpression', x, re.IGNORECASE))*1) 

####### Splice ##########
data_all['splice_var'] = data_all.Variation.apply(lambda x: bool(re.search('splice', x, re.IGNORECASE))*1) 

####### Hypermethylation ##########
data_all['hypermethylation_var'] = data_all.Variation.apply(lambda x: bool(re.search('hypermethylation', x, re.IGNORECASE))*1)

####### DNA binding ##########
data_all['DNA_binding_var'] = data_all.Variation.apply(lambda x: bool(re.search('DNA', x, re.IGNORECASE))*1)

####### Null ##########
data_all['null_var'] = data_all.Variation.apply(lambda x: bool(re.search('null', x, re.IGNORECASE))*1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [363]:
# Remember there are still unassigned variations that we can assign to for the next run (21 in total)
all_variations = data_all.loc[:, "Substitutions_var":]
data_all[(all_variations.T == 0).all()]

Unnamed: 0,Class,Gene,ID,Variation,Text,Substitutions_var,Stop_codon_var,gene_fusion_var,Deletion_var,del_or_ins_var,...,Amplification_var,Truncation_var,exon_var,frameshift_var,dup_var,overexpression_var,splice_var,hypermethylation_var,DNA_binding_var,null_var
31,7.0,TERT,31,Promoter Mutations,Sequencing studies have identified many recurr...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
150,7.0,EGFR,150,EGFRvIII,Alterations ofthe EGFR gene occur frequently i...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
152,7.0,EGFR,152,EGFRvII,Abstract Glioblastomas with EGFR amplificatio...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
170,7.0,EGFR,170,EGFRvV,Abstract The epidermal growth factor recepto...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
211,7.0,EGFR,211,EGFR-KDD,Oncogenic EGFR mutations are found in 10-35% o...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
238,7.0,EGFR,238,EGFRvIV,Tumor cells often subvert normal regulatory me...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
350,1.0,CDH1,350,Epigenetic Silencing,"In diffuse gastric carcinoma, despite common E...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
354,4.0,EP300,354,R1627,The transcriptional coactivator p300/CBP (CREB...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
355,4.0,EP300,355,C1385,The transcriptional coactivator p300/CBP (CREB...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
631,4.0,FBXW7,631,Copy Number Loss,This study focused on a cell cycle regulatory ...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [366]:
# Preprocess all the nosubs together
nosub=data_all[data_all['Substitutions_var']==0]

In [367]:
nosub_text, nosub_no_text = custom_find(nosub)
nosub_no_text # only 4 not found, of which three nulls

Unnamed: 0,Class,Gene,ID,Variation,Text,Substitutions_var,Stop_codon_var,gene_fusion_var,Deletion_var,del_or_ins_var,...,Amplification_var,Truncation_var,exon_var,frameshift_var,dup_var,overexpression_var,splice_var,hypermethylation_var,DNA_binding_var,null_var
1277,1.0,ARID5B,1277,Truncating Mutations,,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1639,6.0,FLT1,1639,Amplification,,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1911,1.0,FOXP1,1911,Truncating Mutations,The transcriptional network of the androgen re...,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3426,2.0,AURKB,3426,Amplification,,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [368]:
# find the sentences containing the nosubs
NLTK_nosub = [sent_tokenize(nosub_text.Text[i]) for i in nosub_text.index]

In [188]:
nosub_sentences = get_sentences_nosub(nosub_text[:], NLTK_nosub[:], window_left = 0, window_right = 0) # Retrieves sentences where keyword is included
nosub_sentences_string = sentences_to_string(nosub_sentences)

In [369]:
### Replacement inside the Text of data_all
data_all.Text.iloc[nosub_text.index] = nosub_sentences_string #iloc for indexing based on integers
data_all.Text.iloc[nosub_text.index]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


0       Cyclin M, an orphan cyclin, is the product of ...
7       In this study, we determined the frequency of ...
16      Germline and somatic mutations that deregulate...
31      Sequencing studies have identified many recurr...
33      Studies with comparative genomic hybridization...
41      Here, we report that deletion of the miRNA pro...
43      Here, we report that deletion of the miRNA pro...
48      Experimental Design  Genome-wide DNA methylati...
70      Both, human SHQ1 and NAP57 (as an N-terminal f...
71      Polyclonal anti-cyclin D2 and anti-cyclin D3 a...
72      Here we report the association of three Mre11 ...
74      The Mre11 complex specifies ssDNA endonuclease...
75      Polyclonal anti-cyclin D2 and anti-cyclin D3 a...
86      A Sac I (-1 195)-BamHI (+ 79) fragmnent and a ...
87      A Sac I (-1 195)-BamHI (+ 79) fragmnent and a ...
88      To determine whether RYBP binding to MDM2 is n...
96      Mutations in the known high-risk genes, BRCA1,...
100     These 

In [189]:
#Not used
################ The dummy variables for Gene and Text ##################
## TODO: also use dummy for Text? There are 135 shared Genes and 142 shared Text between train and Leaks!  len(set(train.Text) & set(Leaks.Text))
#data_all_dummy = data_all_backup[['Gene', 'Text']] # drop those columns we don't need as dummy.
#X_dummy = pd.get_dummies(data_all_dummy) # converts categorical variables into dummy variable. From len set => 269 genes + 2090 texts
#X_dummy_train = X_dummy[:train.shape[0]]
#X_dummy_test = X_dummy[train.shape[0]:]
#dummy_names = X_dummy.columns.values #### To remember names if you want to check again what Gene or Text used
#X_dummy = X_dummy.values

# Cleaning 

In [190]:
stop = set(stopwords.words('english'))
stop.update(('et al', 'fig', 'figure', 'acknowledgement' )) #maybe add some more

exclude = set('!"#$%&\'()*+:;<=>?@[\\]^_`{|}~') 
lemma = WordNetLemmatizer()
def clean(doc,lemmatiz=False):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free_0 = [re.sub(",|\.|/"," ",ch) for ch in stop_free]
    if lemmatiz:
        punc_free_lem="".join(ch for ch in punc_free_0 if ch not in exclude)
        normalized = " ".join(lemma.lemmatize(word) for word in punc_free_lem.split())
        return normalized
    else:
        punc_free = "".join(ch for ch in punc_free_0 if ch not in exclude)
        return punc_free

In [191]:
#No lemmatization for the moment, be careful not to lemmatize then w2vec
data_all.Text = [clean(doc) for doc in data_all["Text"]]  
#data_all.Background = [clean(doc) for doc in data_all.Background]

# Adding extra feature(s)

In [247]:
# Feature for the length of the text
data_all["Text_words"] = data_all["Text"].map(lambda x: len(str(x).split(" ")))

In [269]:
new_train = data_all.iloc[:len(train_final)]
new_test = data_all.iloc[len(train_final):]

In [270]:
svd = TruncatedSVD(n_components=25, n_iter=12, random_state=26)

one_hot_gene = pd.get_dummies(new_train['Gene'])
one_hot_gene_test=pd.get_dummies(new_test["Gene"])
one_hot_gene_test=fix_test_columns(one_hot_gene_test,one_hot_gene.columns)
truncated_one_hot_gene = svd.fit_transform(one_hot_gene.values)
truncated_one_hot_gene_for_test=svd.transform(one_hot_gene_test.values)

In [271]:
genes_train=pd.DataFrame(truncated_one_hot_gene,columns=["tsvd_gene"+ str(x) for x in range(0,25)])
genes_test=pd.DataFrame(truncated_one_hot_gene_for_test,columns=["tsvd_gene"+ str(x) for x in range(0,25)])

In [272]:
new_test["index"]=range(0,len(new_test))
new_train_1=pd.merge(new_train.reset_index(),genes_train.reset_index()).drop("index",axis=1)
new_test_1=pd.merge(new_test,genes_test.reset_index()).drop("index",axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [273]:
new_train_1.to_csv("checkpoints_databases/w_working_train.csv",index=False,encoding="utf8")
new_test_1.to_csv("checkpoints_databases/w_working_test.csv",index=False,encoding="utf8")

In [265]:
new_test_1.shape

(986, 47)