In [13]:
import numpy as np
import pandas as pd
import string
import os
from collections import Counter

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from segtok.segmenter import split_single, split_multi #seems to be the better choice than nltk despite lower speed

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
pd.options.display.max_colwidth = 50
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Creating base

In [14]:
train = pd.read_csv('..//bases/training_variants')
test = pd.read_csv('..//bases/test_variants')

In [15]:
train_texts = pd.read_csv('..//bases/training_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")
test_texts = pd.read_csv('..//bases/test_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")

In [16]:
train = pd.merge(train, train_texts, how='left', on='ID')
test = pd.merge(test, test_texts, how='left', on='ID')

In [17]:
#Transform Amino Acid (AA) Letter to their three-letter abbreviation in order to find them in the text when they appear
One_to_Three_AA = {'C': 'Cys', 'D': 'Asp', 'S': 'Ser', 'Q': 'Gln', 'K': 'Lys',
         'I': 'Ile', 'P': 'Pro', 'T': 'Thr', 'F': 'Phe', 'N': 'Asn', 
         'G': 'Gly', 'H': 'His', 'L': 'Leu', 'R': 'Arg', 'W': 'Trp', 
         'A': 'Ala', 'V': 'Val', 'E': 'Glu', 'Y': 'Tyr', 'M': 'Met'}
pattern = re.compile('|'.join(One_to_Three_AA.keys()))

In [351]:
#### process the train and test set together
data_all = pd.concat((train, test), axis=0, ignore_index=True)
data_all_backup = data_all[:] ## We keep backup in case we need to use again


# Substitutions (subs)
## Functions for pre-processing of subs

In [33]:
# find_sub return the substituions that are in text and those that are not
def find_sub(data):    
    Boolean = [data.Variation[i][:-1] in data.Text[i] or #case 1.
               pattern.sub(lambda x: One_to_Three_AA[x.group()], data.Variation[i][:-1]) # case2
               in data.Text[i]  for i in data.index] ## because new indexing we use 
    
    sub_in_text = data[Boolean]
    not_Boolean = [not i for i in Boolean]  
    sub_not_in_text = data[not_Boolean]
    
    return sub_in_text, sub_not_in_text

# find_sub_numberChange searches for other number of a substitution i.e. G12V -> G_V because sometimes mistake in entry
# Is currently without One_to_three substitution or Variation[:-1] only the full variation
def find_sub_numberChange(data):
    Booleans = [] #will contain the different Booleans if found in text
    for i in data.index:
        split_variation = re.split('(\d+)', data.Variation[i]) # split based on a number
        first_Amino = re.escape(split_variation[0]) #re.escpae uses variable as regex
        last_Amino = re.escape(split_variation[-1])
        new_regex  = first_Amino + r"\d+" + last_Amino
        Boolean = bool(re.search(new_regex, data.Text[i]))
        Booleans.append(Boolean)
        
    sub_number_in_text = data[Booleans]
    not_Boolean = [not i for i in Booleans]  
    sub_number_no_text = data[not_Boolean]
    
    return sub_number_in_text, sub_number_no_text


In [30]:
##### get_sentences_sub use a window to extract sentences where the subs appear. 
# If window_left & window_right = 0 => just taking the sentences with subs

def get_sentences_sub(data, splitted_sentences, window_left, window_right):
    data.index = range(len(data)) #makes sure that index is right
    sentences_with_sub = [[] for _ in range(len(data))]
    for i in range(len(splitted_sentences)):
        sentences = splitted_sentences[i]
        one_to_three_variation = pattern.sub(lambda x: One_to_Three_AA[x.group()], data.Variation[i][:-1])
        Variation = data.Variation[i][:-1]        
        for j in range(len(sentences)):                              
            if (Variation in sentences[j]) or (one_to_three_variation in sentences[j]):
                new_regex = re.escape(Variation) + r"[\S]*" ###  r"[\S]*" because we look for Variation[:-1] not just Variation
                sentences[j] = re.sub(new_regex, ' placeholderMutation', sentences[j]) #case 1
                ### We add the space to ' placeholderMutation' because sometimes there are letters in front of it
                new_regex = re.escape(one_to_three_variation) + r"[\S]*"
                sentences[j] = re.sub(new_regex, ' placeholderMutation', sentences[j]) #case 2
                sentences_with_sub[i].extend(sentences[max(j-window_left,0) : min(j+1+window_right, len(sentences)-1)])               
    
    return sentences_with_sub

##### get_sentences_sub_number use a window to extract sentences where the subs appear that have different number i.e. G12V -> G_V

def get_sentences_sub_number(data, splitted_sentences, window_left, window_right):
    #position_sentences = [[] for _ in range(len(data))]  #### currently not used
    data.index = range(len(data))
    sentences_with_sub_number = [[] for _ in range(len(data))]
    for i in range(len(splitted_sentences)):
        sentences = splitted_sentences[i] 
        for j in range(len(sentences)):
            split_variation = re.split('(\d+)', data.Variation[i]) # split based on a number
            first_Amino = re.escape(split_variation[0]) #re.escpae uses variable as regex
            last_Amino = re.escape(split_variation[-1])
            new_regex  = first_Amino + r"\d+" + last_Amino
            Boolean = bool(re.search(new_regex, sentences[j]))            
            if Boolean:
                sentences[j] = re.sub(new_regex, ' placeholderMutation', sentences[j]) # Again replacing the sentence with placeholder
                sentences_with_sub_number[i].extend(sentences[max(j-window_left,0) : min(j+1+window_right,len(sentences)-1)])
    
    return sentences_with_sub_number


In [31]:
#### Converts list of sentences into one string of sentences for each document => to use for tfidf etc.
def sentences_to_string(sentences_list):
    sentence_strings = []
    for sentences in sentences_list:
        sentence_string =  ' '.join(str(sentence) for sentence in sentences)
        sentence_strings.append(sentence_string)
    
    return sentence_strings 


## Subs processing of the data set 

In [352]:
#### First find those that have the format of being a substitution in data
data_all['Substitutions_var'] = data_all.Variation.apply(lambda x: bool(re.search('^[A-Z]\\d+[A-Z*]$', x))*1) #multiplying by 1 converts True to 1, False to 0 => Maybe modify this later?
data_all['Stop_codon_var'] = data_all.Variation.apply(lambda x: bool(re.search('[*]', x))*1) #multiplying by 1 converts True to 1, False to 0
data_sub = data_all[data_all['Substitutions_var']==1] ### Now we know the index of where a substitution occurs - the data_sub

In [59]:
sub_text, sub_no_text = find_sub(data_sub) 
sub_text_backup = sub_text[:] ## index gets changed by text_processing if we don't make a copy

sub_number_text, sub_number_no_text = find_sub_numberChange(sub_no_text) # 108 such cases out of 411 = nice improvement
sub_number_backup = sub_number_text[:]


In [39]:
## use tokenizer to split into sentences of all the subs in text + extract sentences in window
NLTK_sub = [sent_tokenize(sub_text.Text[i]) for i in sub_text.index] 
sub_sentences = get_sentences_sub(sub_text, NLTK_sub, window_left = 0, window_right = 0) # choosing for window 0 as default now

In [47]:
## use tokenizer to split into sentences of subs that have different number in text + extract sentences in window
NLTK_sub_number = [split_single(sub_number_text.Text[i]) for i in sub_number_text.index]
sub_number_sentences = get_sentences_sub_number(sub_number_text, NLTK_sub_number, window_left = 0, window_right = 0)

In [60]:
print("Length of subs with text: %i" %len(sub_sentences))
print("Length of subs with other number in text: %i" %len(sub_number_sentences))
print("Length of subs still not found: %i" %len(sub_number_no_text)) # other ways to process it like finding the word 'mutation'

Length of subs with text: 7794
Length of subs with other number in text: 108
Length of subs still not found: 304


In [296]:
# sub_number_no_text.Text[sub_number_no_text.index[0]] #prints out one of the texts

# Non-substitutions (no subs)
## Functions for pre-processing of no_subs like fusions, deletions, truncations, etc.

In [101]:
## Gets different non-subs variation types by using regex
def variation_regex(data, pattern): 
    Boolean = [bool(re.search(pattern, i, re.IGNORECASE)) for i in data.Variation]
    data_regex = data[Boolean] 
    not_Boolean = [not i for i in Boolean]  
    data_no_regex = data[not_Boolean]
    
    return (data_regex, data_no_regex)

In [431]:
# find the nosub needed based on pattern
def find_nosub(data, pattern):    
    Boolean = [pattern in data.Text[i] for i in data.index] ## because new indexing we use 
    
    nosub_in_text = data[Boolean]
    not_Boolean = [not i for i in Boolean]  
    nosub_not_in_text = data[not_Boolean]
    
    return nosub_in_text, nosub_not_in_text


In [434]:
# find the fusion needed based on Gene and pattern
def find_nosub_gene(data, pattern):    
    Boolean = [(pattern in data.Text[i]) & (data.Gene[i] in data.Text[i]) for i in data.index] 
    
    nosub_gene_in_text = data[Boolean]
    not_Boolean = [not i for i in Boolean]  
    nosub_gene_not_in_text = data[not_Boolean]
    
    return nosub_gene_in_text, nosub_gene_not_in_text

In [492]:
# find the fusion needed based on Gene and pattern
def find_gene_fusion(data, pattern):
    Booleans = []
    for i in data.index:
        gene_fusion = data.Variation[i].split(' ')[0] # this forget about the word 'Fusion' inside the variation
        # split on '-' to search for the two genes seperately now
        gene_split = gene_fusion.split('-')
        first_gene = gene_split[0]
        if len(gene_split)>1:
            second_gene = gene_fusion.split('-')[1]
        else:
            second_gene = '' # will always find if there is no second gene
        
        Boolean = (pattern in data.Text[i]) & ((first_gene in data.Text[i]) & (second_gene in data.Text[i]))
        Booleans.append(Boolean)
    
    gene_fusion_in_text = data[Booleans]
    not_Boolean = [not i for i in Booleans]  
    gene_fusion_not_in_text = data[not_Boolean]
        
    return gene_fusion_in_text, gene_fusion_not_in_text

In [175]:
def get_sentences_nosub(data, splitted_sentences, pattern, window_left, window_right):
    data.index = range(len(data))
    sentences_nosub = [[] for _ in range(len(data))]
    
    for i in range(len(splitted_sentences)):
        sentences = splitted_sentences[i]   
        for j in range(len(sentences)):    
                if pattern in sentences[j]:
                    sentences_nosub[i].extend(sentences[max(j-window_left,0) : min(j+1+window_right, len(sentences)-1)])
                    
    return sentences_nosub  ### This might take a while because it's looping through all sentences


In [333]:
def get_sentences_fusion(data, splitted_sentences, pattern, window_left, window_right):
    data.index = range(len(data))
    sentences_fusion = [[] for _ in range(len(data))]
    
    for i in range(len(splitted_sentences)):
        sentences = splitted_sentences[i]   
        for j in range(len(sentences)):    
                if (pattern in sentences[j]) & (data.Gene[i] in sentences[j]):
                    sentences_fusion[i].extend(sentences[max(j-window_left,0) : min(j+1+window_right, len(sentences)-1)])
                    
    return sentences_fusion   ### This might take a while because it's looping through all sentences


In [399]:
def get_sentences_gene_fusion(data, splitted_sentences, pattern, window_left, window_right):
    data.index = range(len(data))
    sentences_gene_fusion = [[] for _ in range(len(data))]
    
    for i in range(len(splitted_sentences)):
        sentences = splitted_sentences[i]
        gene_fusion = data.Variation[i].split(' ')[0] # this forget about the word 'Fusion' inside the variation
        # split on '-' to have both genes
        gene_split = gene_fusion.split('-')
        first_gene = gene_split[0]
        if len(gene_split)>1:
            second_gene = gene_fusion.split('-')[1]
        else:
            second_gene = '' # will always find if there is no second gene
        for j in range(len(sentences)):    
                if (pattern in sentences[j]) & ((first_gene in sentences[j]) & (second_gene in sentences[j])):
                    sentences_gene_fusion[i].extend(sentences[max(j-window_left,0) : min(j+1+window_right, len(sentences)-1)])
                    
    return sentences_gene_fusion   ### This might take a while because it's looping through all sentences


## No subs processing of the data set 

In [371]:
#### First find those that have the format of being a non-substitutions in the data

# Initialize some of the variables already (because we splice them, could be filled with NA's otherwise)
data_all['gene_fusion_var'] = 0
data_all['Deletion_var'] = 0
data_all['del_or_ins_var'] = 0

####### Fusions : 'Fusions' ############
data_all['Fusion_var'] = data_all.Variation.apply(lambda x: bool(re.search('^fusion', x, re.IGNORECASE))*1) #multiplying by 1 converts True to 1, False to 0
_ , new_data_all = variation_regex(data_all, '^fusion') 

###### Fusions: 'Gene-Gene fusion' ########
data_all['gene_fusion_var'].loc[new_data_all.index] = new_data_all.Variation.apply(lambda x: bool(re.search('fusion', x, re.IGNORECASE))*1)
_ , new_data_all = variation_regex(new_data_all, 'fusion') 

####### Deletions: 'Deletions' ############
data_all['Deletion_var'].loc[new_data_all.index] = new_data_all.Variation.apply(lambda x: bool(re.search('^del', x, re.IGNORECASE))*1)
_, new_data_all = variation_regex(new_data_all, '^del') 

####### Deletions & Insertions wheteher together or seperately (doesn't make a big difference IMO)
data_all['del_or_ins_var'].loc[new_data_all.index] = new_data_all.Variation.apply(lambda x: bool(re.search('del|ins', x, re.IGNORECASE))*1)

###### Amplifications #########
data_all['Amplification_var'] = data_all.Variation.apply(lambda x: bool(re.search('ampl', x, re.IGNORECASE))*1) 

###### Truncations ########### Don't forget there are 'Truncating mutations' = 95 and '_trunc' = 4
data_all['Truncation_var'] = data_all.Variation.apply(lambda x: bool(re.search('trunc', x, re.IGNORECASE))*1) 

####### Exons #########
data_all['exon_var'] = data_all.Variation.apply(lambda x: bool(re.search('exon', x, re.IGNORECASE))*1) 

####### Frameshift mutations ########
data_all['frameshift_var'] = data_all.Variation.apply(lambda x: bool(re.search('fs', x, re.IGNORECASE))*1) 

####### Duplications ##############
data_all['dup_var'] = data_all.Variation.apply(lambda x: bool(re.search('dup', x, re.IGNORECASE))*1) 


all_variations = data_all.loc[:, "Substitutions_var":"dup_var"]
# all_variations[all_variations.T == 0] # deal with those 61 cases that are not assigned any label 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


#### Fusions: the normal variations

In [435]:
# Reduce text for all the fusions using the word 'fusion' in text
Fusions = data_all[(data_all['Fusion_var']==1)]
fusion_in_text, fusion_not_in_text = find_nosub_gene(Fusions, 'fusion') 

In [436]:
fusion_not_in_text # for 'Fusion' variation we find that gene is always in text

Unnamed: 0,Class,Gene,ID,Text,Variation,Substitutions_var,Stop_codon_var,gene_fusion_var,Deletion_var,del_or_ins_var,Fusion_var,Amplification_var,Truncation_var,exon_var,frameshift_var,dup_var


In [437]:
NLTK_fusion = [sent_tokenize(fusion_in_text.Text[i]) for i in fusion_in_text.index] 
sentences_fusion = get_sentences_fusion(fusion_in_text, NLTK_fusion, 'fusion', window_left=0, window_right=0)
# sentences_fusion did a good job reducing our text. only 3 cases left where sentences are 0, 1 and 2

#### Fusions: the gene-gene variations

In [489]:
gene_fusions = data_all[data_all['gene_fusion_var']==1]
gene_fusion_in_text, gene_fusion_not_in_text = find_gene_fusion(gene_fusions, 'fusion') 

In [490]:
len(gene_fusion_not_in_text) # 6 gene_fusions don't include 'fusion' and one of the gene names => deal later with them if we want

6

In [491]:
gene_fusion_not_in_text

Unnamed: 0,Class,Gene,ID,Text,Variation,Substitutions_var,Stop_codon_var,gene_fusion_var,Deletion_var,del_or_ins_var,Fusion_var,Amplification_var,Truncation_var,exon_var,frameshift_var,dup_var,Text_words
941,2.0,PDGFRB,941,Chronic myelomonocytic leukemia (CMML) is a my...,ATF7IP-PDGFRB Fusion,0,0,1,0,0,0,0,0,0,0,0,116
1057,2.0,EWSR1,1057,"As a result of chromosome translocations, the ...",EWSR1-FEV Fusion,0,0,1,0,0,0,0,0,0,0,0,207
1060,7.0,EWSR1,1060,Fibroblast growth factors (FGFs) expressed in...,EWSR1-ETV1 Fusion,0,0,1,0,0,0,0,0,0,0,0,6237
1065,2.0,EWSR1,1065,"Ewing’s sarcoma/PNET, a tumor of the bone and ...",EWSR1-ETV4 Fusion,0,0,1,0,0,0,0,0,0,0,0,1531
3223,7.0,NTRK1,3223,"In this study, we utilized retroviral transfer...",Delta-NTRK1 Fusion,0,0,1,0,0,0,0,0,0,0,0,6524
6183,,PDGFRB,2862,Chromosomal translocations involving the plate...,RABEP1-PDGFRB Fusion,0,0,1,0,0,0,0,0,0,0,0,5245


In [428]:
NLTK_gene_fusion = [sent_tokenize(gene_fusion_in_text.Text[i]) for i in gene_fusion_in_text.index] 
sentences_gene_fusion = get_sentences_gene_fusion(gene_fusion_in_text, NLTK_gene_fusion, 'fusion', window_left=0, window_right=0)
# sentences_gene_fusion from 154 sentences still has some low ones (14 smaller than 5)

#### Deletions: the normal variations

In [448]:
# Reduce text for all the Deletion Variation using the word 'deletion' in text
deletions = data_all[(data_all['Deletion_var']==1)]
deletion_in_text, deletion_not_in_text = find_nosub_gene(deletions, 'del') # only 5 left in deletion_not_in_text

In [449]:
deletion_not_in_text

Unnamed: 0,Class,Gene,ID,Text,Variation,Substitutions_var,Stop_codon_var,gene_fusion_var,Deletion_var,del_or_ins_var,Fusion_var,Amplification_var,Truncation_var,exon_var,frameshift_var,dup_var
539,4.0,SMAD2,539,Transforming growth factor- (TGF-) plays a com...,Deletion,0,0,0,1,0,0,0,0,0,0,0
1157,1.0,KMT2C,1157,Abstract Monomethylation of histone H3 on Lys...,Deletion,0,0,0,1,0,0,0,0,0,0,0
3201,1.0,RASA1,3201,Phospholipase C gamma 1 (PLC gamma 1) and p21r...,Deletion,0,0,0,1,0,0,0,0,0,0,0
3253,1.0,CASP8,3253,Evidence exists that alterations of the genes ...,Deletion,0,0,0,1,0,0,0,0,0,0,0
7153,,ASXL2,3832,Acute myeloid leukemia (AML) with t(8;21) (q22...,Deletion,0,0,0,1,0,0,0,0,0,0,0


In [438]:
NLTK_deletion = [sent_tokenize(deletion_in_text.Text[i]) for i in deletion_in_text.index] 
sentences_deletion = get_sentences_fusion(deletion_in_text, NLTK_deletion, 'del', window_left=0, window_right=0)
# still work to do

#### Deletions: the del_or_ins variations

In [462]:
del_or_ins = data_all[(data_all['del_or_ins_var']==1) & (data_all['exon_var']==0)]
for Variation in del_or_ins.Variation:
    print(Variation)

L747_T751delinsP
S752_I759del
E746_T751insIP
D770_N771insD
K745_A750del
D770_N771insNPG
E746_A750del
A859_L883delinsV
A750_E758del
V769_D770insGVV
A750_E758delinsP
L747_P753delinsS
H773insLGNP
H773_V774insH
L747_A750delinsP
L747_E749del
L747_T751del
L747_A750del
E746_T751delinsA
E709_T710delinsD
A763_Y764insFQEA
I744_K745delinsKIPVAI
E746_A750delinsQ
L747_P753del
S768_V769insVAS
E746_T751delinsVA
A767_V769del
T34_A289del
D770_N771insVDSVDNP
E746_S752delinsA
E746_S752delinsI
E161del
3' Deletion
534_536del
533_534del
DNA binding domain deletions
DNA binding domain insertions
G776delinsVC
M774_A775insAYVM
G776delinsLC
G776_V777insYVMA
C450_K451insMIEWMI
C456_N468del
I843_D846del
V561_I562insER
Y375_K455del
D842_M844del
D842_H845del
S566_E571delinsR
E311_K312del
I843del
C456_R481del
H845_N848delinsP
560_561insER
V544_L545insAVLVLLVIVIISLI
M1_E165DEL
V422del
385_418del
N198_F199delinsI
V128del
E1552del
P449_L455del
G106_R108del
T576del
I559_D560insDKRMNS
W237_Y242del
Q579_L581del
K459_S460d

In [460]:
del_or_ins

Unnamed: 0,Class,Gene,ID,Text,Variation,Substitutions_var,Stop_codon_var,gene_fusion_var,Deletion_var,del_or_ins_var,Fusion_var,Amplification_var,Truncation_var,exon_var,frameshift_var,dup_var
138,7.0,EGFR,138,Non–small-cell lung cancer is the leading caus...,L747_T751delinsP,0,0,0,0,1,0,0,0,0,0,0
139,2.0,EGFR,139,In contrast to other primary epidermal growth...,S752_I759del,0,0,0,0,1,0,0,0,0,0,0
146,7.0,EGFR,146,Somatic mutations introduced into the epiderma...,E746_T751insIP,0,0,0,0,1,0,0,0,0,0,0
147,7.0,EGFR,147,In contrast to other primary epidermal growth...,D770_N771insD,0,0,0,0,1,0,0,0,0,0,0
149,7.0,EGFR,149,Non–small-cell lung cancer is the leading caus...,K745_A750del,0,0,0,0,1,0,0,0,0,0,0
165,7.0,EGFR,165,In contrast to other primary epidermal growth...,D770_N771insNPG,0,0,0,0,1,0,0,0,0,0,0
166,7.0,EGFR,166,Abstract Malignant pleural effusions (MPEs) ...,E746_A750del,0,0,0,0,1,0,0,0,0,0,0
169,7.0,EGFR,169,Somatic mutations in the tyrosine kinase (TK) ...,Exon 19 deletion/insertion,0,0,0,0,1,0,0,0,1,0,0
171,2.0,EGFR,171,Somatic mutations introduced into the epiderma...,A859_L883delinsV,0,0,0,0,1,0,0,0,0,0,0
174,7.0,EGFR,174,Somatic mutations introduced into the epiderma...,A750_E758del,0,0,0,0,1,0,0,0,0,0,0


#### Truncations

In [455]:
truncations = data_all[(data_all['Truncation_var']==1)]
truncation_in_text, truncation_not_in_text = find_nosub_gene(truncations, 'trunc') # only 5 left in deletion_not_in_text

In [456]:
truncation_not_in_text

Unnamed: 0,Class,Gene,ID,Text,Variation,Substitutions_var,Stop_codon_var,gene_fusion_var,Deletion_var,del_or_ins_var,Fusion_var,Amplification_var,Truncation_var,exon_var,frameshift_var,dup_var
88,1.0,RYBP,88,The mouse double minute 2 (MDM2)–p53 interacti...,Truncating Mutations,0,0,0,0,0,0,0,1,0,0,0
313,1.0,ELF3,313,Previous studies have shown that the promoter ...,Truncating Mutations,0,0,0,0,0,0,0,1,0,0,0
532,1.0,TP53BP1,532,Introduction Inhibition of PARP1 induces synt...,Truncating Mutations,0,0,0,0,0,0,0,1,0,0,0
1015,1.0,TSC2,1015,SUMMARY mTORC1 promotes cell growth in respon...,Truncating Mutations,0,0,0,0,0,0,0,1,0,0,0
1080,1.0,ARID1A,1080,ARID1A (BAF250A) promotes the formation of SWI...,Truncating Mutations,0,0,0,0,0,0,0,1,0,0,0
1277,1.0,ARID5B,1277,,Truncating Mutations,0,0,0,0,0,0,0,1,0,0,0
1826,1.0,NCOR1,1826,Estrogen receptors (ERs) are normally expresse...,Truncating Mutations,0,0,0,0,0,0,0,1,0,0,0
1902,1.0,TET1,1902,"TET2 is a close relative of TET1, an enzyme th...",Truncating Mutations,0,0,0,0,0,0,0,1,0,0,0
1911,1.0,FOXP1,1911,The transcriptional network of the androgen re...,Truncating Mutations,0,0,0,0,0,0,0,1,0,0,0
2146,1.0,PTCH1,2146,Basal cell carcinoma (BCC) is the most common ...,Truncating Mutations,0,0,0,0,0,0,0,1,0,0,0


In [256]:
del_or_ins = data_all[(data_all['del_or_ins_var']==1)]
del_or_ins.Text[del_or_ins.index[2]]
deletions.Text[deletions.index[3]]


'Mre11, Rad50, and Nbs1 function in a protein complex that is central to the metabolism of chromosome breaks. Null mutants of each are inviable. We demonstrate here that hypomorphic Rad50 mutant mice (Rad50S/S mice) exhibited growth defects and cancer predisposition. Rad50S/S mice died with complete bone marrow depletion as a result of progressive hematopoietic stem cell failure. Similar attrition occurred in spermatogenic cells. In both contexts, attrition was substantially mitigated by p53 deficiency, whereas the tumor latency of p53−/− and p53+/− animals was reduced by Rad50S/S. Indices of genotoxic stress and chromosomal rearrangements were evident in Rad50S/S cultured cells, as well as in Rad50S/S and p53−/−Rad50S/S lymphomas, suggesting that the Rad50S/S phenotype was attributable to chromosomal instability. These outcomes were not associated with overt defects in the Mre11 complex\'s previously established double strand break repair and cell cycle checkpoint regulation functions

In [257]:
deletions

Unnamed: 0,Class,Gene,ID,Text,Variation,gene_fusion_var,Deletion_var,del_or_ins_var,Fusion_var,Amplification_var,Truncation_var,exon_var,frameshift_var,dup_var
7,1.0,CBL,7,CBL is a negative regulator of activated recep...,Deletion,0,1,0,0,0,0,0,0,0
43,1.0,DICER1,43,Mesenchymal cell populations contribute to mic...,Deletion,0,1,0,0,0,0,0,0,0
70,1.0,SHQ1,70,Assembly of H/ACA RNPs in yeast is aided by at...,Deletion,0,1,0,0,0,0,0,0,0
74,1.0,RAD50,74,"Mre11, Rad50, and Nbs1 function in a protein c...",Deletion,0,1,0,0,0,0,0,0,0
101,1.0,TGFBR2,101,Melanoma patients treated with oncogenic BRAF ...,Deletion,0,1,0,0,0,0,0,0,0
107,1.0,MSH6,107,A hypermutated subtype of advanced prostate ca...,Deletion,0,1,0,0,0,0,0,0,0
115,1.0,KMT2D,115,Kabuki syndrome (KS) is a multiple congenital ...,Deletion,0,1,0,0,0,0,0,0,0
119,1.0,PBRM1,119,Screening for tumor suppressor genes in breast...,Deletion,0,1,0,0,0,0,0,0,0
137,4.0,LATS2,137,Malignant mesothelioma (MM) is an aggressive n...,Deletion,0,1,0,0,0,0,0,0,0
286,1.0,ARID2,286,"In eukaryotic cells, DNA is packaged into chro...",Deletion,0,1,0,0,0,0,0,0,0


In [96]:
data_all.Text[data_all['gene_fusion_var']==1]

164     Oncogenic mutations in the epidermal growth fa...
268     Oncogenic mutations in the epidermal growth fa...
279     For 40% of pediatric T-ALL cases, underlying o...
280     For 40% of pediatric T-ALL cases, underlying o...
283     For 40% of pediatric T-ALL cases, underlying o...
289     NUT midline carcinoma (NMC) is a genetically d...
300      Abstract The discovery of recurrent gene fusi...
301     Recurrent gene fusions involving oncogenic ETS...
302     Although common in hematologic and mesenchymal...
319     Echinoderm microtubule associated protein like...
321     Oncogenic gene fusions involving the 3’ region...
322     Chromosomal rearrangements of the gene encodin...
326     We report the results of whole genome and tran...
327     Oncogenic gene fusions involving the 3’ region...
329     Chromosomal rearrangements of the gene encodin...
331     Echinoderm microtubule associated protein like...
332     Echinoderm microtubule associated protein like...
333     Inflam

In [34]:
NLTK_nosub = [sent_tokenize(no_sub_yes.Text[i]) for i in no_sub_yes.index]

In [29]:
no_sub_yes,no_sub_no=custom_find(no_sub)

In [92]:
data_all

Unnamed: 0,Class,Gene,ID,Text,Variation,Fusion_var,gene_fusion_var,Deletion_var,del_or_ins_var,Amplification_var,Truncation_var,exon_var,frameshift_var,dup_var
0,1.0,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations,0,0.0,0.0,0.0,0,1,0,0,0
1,2.0,CBL,1,Abstract Background Non-small cell lung canc...,W802*,0,0.0,0.0,0.0,0,0,0,0,0
2,2.0,CBL,2,Abstract Background Non-small cell lung canc...,Q249E,0,0.0,0.0,0.0,0,0,0,0,0
3,3.0,CBL,3,Recent evidence has demonstrated that acquired...,N454D,0,0.0,0.0,0.0,0,0,0,0,0
4,4.0,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V,0,0.0,0.0,0.0,0,0,0,0,0
5,4.0,CBL,5,Oncogenic mutations in the monomeric Casitas B...,V391I,0,0.0,0.0,0.0,0,0,0,0,0
6,5.0,CBL,6,Oncogenic mutations in the monomeric Casitas B...,V430M,0,0.0,0.0,0.0,0,0,0,0,0
7,1.0,CBL,7,CBL is a negative regulator of activated recep...,Deletion,0,0.0,1.0,,0,0,0,0,0
8,4.0,CBL,8,Abstract Juvenile myelomonocytic leukemia (JM...,Y371H,0,0.0,0.0,0.0,0,0,0,0,0
9,4.0,CBL,9,Abstract Juvenile myelomonocytic leukemia (JM...,C384R,0,0.0,0.0,0.0,0,0,0,0,0


In [37]:
NLTK_sub_again_notext = [sent_tokenize(sub_again_yes.Text[i]) for i in sub_again_yes.index]


In [35]:
nosub_sentences = get_sentences_nosub(no_sub_yes, NLTK_nosub, window_left = 0, window_right = 0) # Retrieves sentences where keyword is included
nosub_sentences = [sorted(set(sentences), key = sentences.index) for sentences in nosub_sentences] # only use unique 
nosub_background = [sorted(set(sentences), key = sentences.index) for sentences in nosub_background] # only use unique 



In [38]:
sub_again_sentences,sub_again_background = get_sentences_sub_again_notext(sub_again_yes, NLTK_sub_again_notext, window_left = 0, window_right = 0) # Retrieves sentences where keyword is included
sub_again_sentences = [sorted(set(sentences), key = sentences.index) for sentences in sub_again_sentences] # only use unique 
sub_again_background = [sorted(set(sentences), key = sentences.index) for sentences in sub_again_background] # only use unique 


In [39]:
NLTK_sub_noText = [sent_tokenize(sub_noText.Text[i]) for i in sub_noText.index]
sub_noText_sentences,sub_noText_background = get_sentences_sub_noText(sub_noText, NLTK_sub_noText, window_left = 0, window_right = 0) # Retrieves sentences where subsitution mutation is included
sub_noText_sentences = [sorted(set(sentences), key = sentences.index) for sentences in sub_noText_sentences] # only use unique sentences
sub_noText_background = [sorted(set(sentences), key = sentences.index) for sentences in sub_noText_background] # only use unique 


In [40]:
NLTK_sub = [sent_tokenize(sub_in_text.Text[i]) for i in sub_in_text.index] # takes a long time to run tokenizer => use pickle to save
sub_sentences,sub_background = get_sentences_sub(sub_in_text, NLTK_sub, window_left = 0, window_right = 0) 
# Retrieves sentences where subsitution mutation is included.
# window_left and window_right specify which sentences to keep at the left side or right side of the sub sentences.
# IMPORTANT: I used also placeholderMutation to replace the original sub mutations here
sub_sentences = [sorted(set(sentences), key = sentences.index) for sentences in sub_sentences]
sub_background = [sorted(set(sentences), key = sentences.index) for sentences in sub_background]


In [41]:
sub_sentences_string = sentences_to_string(sub_sentences)
sub_noText_string = sentences_to_string(sub_noText_sentences)
nosub_sentences_string=sentences_to_string(nosub_sentences)
sub_again_string=sentences_to_string(sub_again_sentences)

In [42]:
#Creating the background text (Text is now the frontline text)

In [110]:
data_all.loc[sub_in_text.index,"Background"]=sentences_to_string(sub_background)
data_all.loc[sub_noText.index,"Background"]=sentences_to_string(sub_noText_background)
data_all.loc[no_sub_yes.index,"Background"]=sentences_to_string(nosub_background)
data_all.loc[sub_again_yes.index,"Background"]=sentences_to_string(sub_again_background)
data_all.loc[sub_again_no.index,"Background"]="nbckgr"
data_all.loc[no_sub_no.index,"Background"]="nbckgr"

In [111]:
data_all.Background[data_all.Background==""]="nbckgr"
data_all.Background[data_all.Background.isnull()]="nbckgr"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [112]:
data_all.Text[sub_in_text.index] = sub_sentences_string
data_all.Text[sub_noText.index] = sub_noText_string
data_all.Text[no_sub_yes.index] = nosub_sentences_string
data_all.Text[sub_again_yes.index] = sub_again_string
data_all.Text[sub_again_no.index]="null"
data_all.Text[no_sub_no.index]="null"
data_all.Text[data_all.Text==""]="null"
data_all.Text[data_all.Text.isnull()]="null"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

Se

In [114]:
#TODO : #Sentence Tokenizer for non-subs that are not in text, AND
#subs that are not in text

In [115]:
#Not used
################ The dummy variables for Gene and Text ##################
## TODO: also use dummy for Text? There are 135 shared Genes and 142 shared Text between train and Leaks!  len(set(train.Text) & set(Leaks.Text))
#data_all_dummy = data_all_backup[['Gene', 'Text']] # drop those columns we don't need as dummy.
#X_dummy = pd.get_dummies(data_all_dummy) # converts categorical variables into dummy variable. From len set => 269 genes + 2090 texts
#X_dummy_train = X_dummy[:train.shape[0]]
#X_dummy_test = X_dummy[train.shape[0]:]
#dummy_names = X_dummy.columns.values #### To remember names if you want to check again what Gene or Text used
#X_dummy = X_dummy.values

In [116]:
###### Use the variation types 
#variation_types = data_all.drop(['ID', 'Gene', 'Class', 'Text', 'Variation'], axis =1)
#X_variation_train = variation_types[:train.shape[0]]
#X_variation_test = variation_types[train.shape[0]:]
#variation_names = variation_types.columns.values 

# Cleaning 

In [117]:
stop = set(stopwords.words('english')) + 
exclude = set('!"#$%&\'()*+:;<=>?@[\\]^_`{|}~') 
lemma = WordNetLemmatizer()
def clean(doc,lemmatiz=False):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free_0 = [re.sub(",|\.|/"," ",ch) for ch in stop_free]
    if lemmatiz:
        punc_free_lem="".join(ch for ch in punc_free_0 if ch not in exclude)
        normalized = " ".join(lemma.lemmatize(word) for word in punc_free_lem.split())
        return normalized
    else:
        punc_free = "".join(ch for ch in punc_free_0 if ch not in exclude)
        return punc_free

In [118]:
#No lemmatization for the moment, be careful not to lemmatize then w2vec
data_all.Text = [clean(doc) for doc in data_all.Text]  

In [120]:
train = data_all.iloc[:len(train)]
test = data_all.iloc[len(train):]

In [125]:
data_all=pd.concat((train, test), axis=0, ignore_index=True)

# Some more feature engineering

In [463]:
# Feature based on length of the text
data_all["Text_words"] = data_all["Text"].map(lambda x: len(str(x).split(" ")))

In [195]:
list(data_all.Text[data_all.Text_words<100])

['null',
 'null',
 'null',
 'The case of a 40-year-old woman with severe edema of the face and neck after the injection of a local dental anesthetic is presented. The reaction is attributed to the presence of sodium metabisulfite, and antioxidant, in the local anesthetic. Both the anesthetic and the sodium metabisulfite gave a delayed positive patch-test response.',
 'null',
 'null',
 'null']

In [465]:
train_new = data_all.iloc[:len(train)]
train_new

Unnamed: 0,Class,Gene,ID,Text,Variation,Substitutions_var,Stop_codon_var,gene_fusion_var,Deletion_var,del_or_ins_var,Fusion_var,Amplification_var,Truncation_var,exon_var,frameshift_var,dup_var,Text_words
0,1.0,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations,0,0,0,0,0,0,0,1,0,0,0,6105
1,2.0,CBL,1,Abstract Background Non-small cell lung canc...,W802*,1,1,0,0,0,0,0,0,0,0,0,5783
2,2.0,CBL,2,Abstract Background Non-small cell lung canc...,Q249E,1,0,0,0,0,0,0,0,0,0,0,5783
3,3.0,CBL,3,Recent evidence has demonstrated that acquired...,N454D,1,0,0,0,0,0,0,0,0,0,0,5625
4,4.0,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V,1,0,0,0,0,0,0,0,0,0,0,6248
5,4.0,CBL,5,Oncogenic mutations in the monomeric Casitas B...,V391I,1,0,0,0,0,0,0,0,0,0,0,6248
6,5.0,CBL,6,Oncogenic mutations in the monomeric Casitas B...,V430M,1,0,0,0,0,0,0,0,0,0,0,6248
7,1.0,CBL,7,CBL is a negative regulator of activated recep...,Deletion,0,0,0,1,0,0,0,0,0,0,0,14709
8,4.0,CBL,8,Abstract Juvenile myelomonocytic leukemia (JM...,Y371H,1,0,0,0,0,0,0,0,0,0,0,12009
9,4.0,CBL,9,Abstract Juvenile myelomonocytic leukemia (JM...,C384R,1,0,0,0,0,0,0,0,0,0,0,5761


In [143]:
train_2=pd.get_dummies(train_1,columns=["Gene"])

In [145]:
test_2=pd.get_dummies(test_1,columns=["Gene"])

In [146]:
#add and remove gene dummies on test set to match train

In [155]:
train_2.to_csv("checkpoints_databases/w_working_train.csv",index=False,encoding="utf8")
test_3.to_csv("checkpoints_databases/w_working_test.csv",index=False,encoding="utf8")

In [488]:
Gene_INPUT_DIM=25
svd = TruncatedSVD(n_components=125, n_iter=Gene_INPUT_DIM, random_state=12)



one_hot_gene = pd.get_dummies(train['Gene'])
truncated_one_hot_gene = svd.fit_transform(one_hot_gene.values)
print(svd.explained_variance_ratio_.sum())

0.921350579565


In [474]:
pd.DataFrame(truncated_one_hot_gene)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,1.931671e-24,-1.600352e-19,1.825182e-20,-7.212122e-22,1.511184e-21,3.909289e-24,-2.613487e-25,-1.948228e-28,4.273498e-28,-1.097003e-29,...,2.388333e-26,1.908868e-26,-1.092443e-25,-1.228134e-25,1.805306e-25,2.782531e-26,8.946231e-24,-2.819339e-23,-1.824502e-23,-1.427539e-23
1,6.598803e-21,-9.421152e-16,1.260674e-16,2.947655e-16,-1.344321e-15,-8.430348e-16,-4.814295e-16,1.744864e-13,2.077607e-13,-1.292588e-11,...,-1.976270e-08,6.557776e-09,-7.656762e-08,-6.647731e-07,3.739187e-07,4.182728e-07,1.039008e-05,-3.644007e-07,-7.565462e-06,-9.850639e-05
2,6.509848e-21,-9.544203e-16,-1.625777e-16,1.878908e-16,-1.394798e-15,-8.424694e-16,-4.808551e-16,1.744864e-13,2.077607e-13,-1.292588e-11,...,-1.976270e-08,6.557776e-09,-7.656762e-08,-6.647731e-07,3.739187e-07,4.182728e-07,1.039008e-05,-3.644007e-07,-7.565462e-06,-9.850639e-05
3,6.534243e-21,-9.626240e-16,-3.870358e-16,6.629769e-15,1.754405e-15,-8.755560e-16,-4.850829e-16,1.744864e-13,2.077607e-13,-1.292588e-11,...,-1.976270e-08,6.557776e-09,-7.656762e-08,-6.647731e-07,3.739187e-07,4.182728e-07,1.039008e-05,-3.644007e-07,-7.565462e-06,-9.850639e-05
4,6.510358e-21,-9.545605e-16,-1.666234e-16,3.084059e-16,-1.334077e-15,-8.455937e-16,-4.685449e-16,1.744864e-13,2.077607e-13,-1.292588e-11,...,-1.976270e-08,6.557776e-09,-7.656762e-08,-6.647731e-07,3.739187e-07,4.182728e-07,1.039008e-05,-3.644007e-07,-7.565462e-06,-9.850639e-05
5,6.510372e-21,-9.545717e-16,-1.668053e-16,3.113815e-16,-1.335866e-15,2.642914e-16,-5.166400e-16,1.744857e-13,2.077607e-13,-1.292588e-11,...,-1.976270e-08,6.557776e-09,-7.656762e-08,-6.647731e-07,3.739187e-07,4.182728e-07,1.039008e-05,-3.644007e-07,-7.565462e-06,-9.850639e-05
6,6.510358e-21,-9.545604e-16,-1.666299e-16,3.085683e-16,-1.333880e-15,-8.722547e-16,-2.650174e-16,1.744836e-13,2.077649e-13,-1.292588e-11,...,-1.976270e-08,6.557776e-09,-7.656762e-08,-6.647731e-07,3.739187e-07,4.182728e-07,1.039008e-05,-3.644007e-07,-7.565462e-06,-9.850639e-05
7,6.510357e-21,-9.545605e-16,-1.666224e-16,3.083764e-16,-1.334101e-15,-8.428125e-16,-4.812189e-16,1.762190e-13,2.077205e-13,-1.292604e-11,...,-1.976270e-08,6.557776e-09,-7.656762e-08,-6.647731e-07,3.739187e-07,4.182728e-07,1.039008e-05,-3.644007e-07,-7.565462e-06,-9.850639e-05
8,6.510357e-21,-9.545605e-16,-1.666224e-16,3.083766e-16,-1.334101e-15,-8.429561e-16,-4.812583e-16,1.738943e-13,2.078936e-13,-1.292626e-11,...,-1.976270e-08,6.557776e-09,-7.656762e-08,-6.647731e-07,3.739187e-07,4.182728e-07,1.039008e-05,-3.644007e-07,-7.565462e-06,-9.850639e-05
9,6.510357e-21,-9.545605e-16,-1.666224e-16,3.083765e-16,-1.334101e-15,-8.428841e-16,-4.813957e-16,1.745164e-13,2.077656e-13,-1.292648e-11,...,-1.976270e-08,6.557776e-09,-7.656762e-08,-6.647731e-07,3.739187e-07,4.182728e-07,1.039008e-05,-3.644007e-07,-7.565462e-06,-9.850639e-05
