In [7]:
import numpy as np
import pandas as pd
import string
import os
from collections import Counter

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import re

# Creating base

In [8]:
train = pd.read_csv('bases/training_variants')
test = pd.read_csv('bases/test_variants')

In [31]:
train_texts = pd.read_csv('bases/training_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")
test_texts = pd.read_csv('bases/test_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")

In [13]:
train = pd.merge(train, train_texts, how='left', on='ID')
test = pd.merge(test, test_texts, how='left', on='ID')

In [14]:
#Transform Gene Letter to their abbreviation in order to find them in the text
One_to_Three_AA = {'C': 'Cys', 'D': 'Asp', 'S': 'Ser', 'Q': 'Gln', 'K': 'Lys',
         'I': 'Ile', 'P': 'Pro', 'T': 'Thr', 'F': 'Phe', 'N': 'Asn', 
         'G': 'Gly', 'H': 'His', 'L': 'Leu', 'R': 'Arg', 'W': 'Trp', 
         'A': 'Ala', 'V': 'Val', 'E': 'Glu', 'Y': 'Tyr', 'M': 'Met'}
pattern = re.compile('|'.join(One_to_Three_AA.keys()))
##### Get variation types by using regex
def variation_regex(data, pattern): # if you want to not ignore cases, add extra argument to function
    Boolean = [not bool(re.search(pattern, i, re.IGNORECASE)) for i in data.Variation]
    data_no_regex = data[Boolean]  # 182 Fusions => 495 over 
    not_Boolean = [not i for i in Boolean]  
    data_regex = data[not_Boolean]
    
    return (data_regex, data_no_regex)

In [15]:
#### process the train and test set together
data_all = pd.concat((train, test), axis=0, ignore_index=True)
data_all_backup = data_all[:] ##### We keep backup because we want dummy variables of Gene & Text 
# TODO maybe also use Variation function of Gene from a database, and other suggestions. Also can use Count_sub as feature


# Functions for pre-processing

In [16]:
def find_sub(data):

    ##### The normal case is around 2080 out of the 2644
    
    
    Boolean = [data.Variation[i] in data.Text[i] or #normal case
               data.Variation[i][:-1] in data.Text[i] or #case 1.
               pattern.sub(lambda x: One_to_Three_AA[x.group()], data.Variation[i][:-1]) # case2
               in data.Text[i]  for i in data.index] ## because new indexing we use 
    
    #TODO could also match insensitive as a next step for more info.
    #Shorter Boolean below = the normal version
    
    #Boolean = [trainSub.Variation[i] in trainSub.Text[i] #normal case
    #           for i in trainSub.ID] ## because new indexing we use ID
    #           
            
    sub_in_text = data[Boolean]
    not_Boolean = [not i for i in Boolean]  

    sub_not_in_text = data[not_Boolean]
#    sub_in_text['Count'] = [sub_in_text.Text[i].count(sub_in_text.Variation[i][:-1])
#                    +sub_in_text.Text[i].count(pattern.sub(lambda x: One_to_Three_AA[x.group()], sub_in_text.Variation[i][:-1]))
#                    for i in sub_in_text.index]
    
    return sub_in_text, sub_not_in_text
##### For subs that are not find in text: use regex to account for a different number
##### TODO: things you can further try - with AA name replacement, searching for the number only etc.
def find_sub_noText(data):
    Booleans = []
    for i in data.index:
        split_variation = re.split('(\d+)', data.Variation[i]) # split based on a number
        first_Amino = re.escape(split_variation[0]) #re.escpae uses variable as regex
        last_Amino = re.escape(split_variation[-1])
        #first_number = re.escape(split_variation[1][0])
        #new_regex = r"[^a-zA-Z0-9]" + first_Amino + first_number
        new_regex  = first_Amino + r"\d+" + last_Amino
        Boolean = bool(re.search(new_regex, data.Text[i]))
        Booleans.append(Boolean)
    
    sub_number_in_text = data[Booleans]
    not_Boolean = [not i for i in Booleans]  

    sub_again_no_text = data[not_Boolean]
    return sub_again_no_text, sub_number_in_text


In [17]:
##### Next we use a window to extract sentences
def get_sentences_sub(data, splitted_sentences, window_left, window_right):
    #position_sentences = [[] for _ in range(len(data))]  #### currently not used
    data.index = range(len(data))
    sentences_with_sub = [[] for _ in range(len(data))]
    
    for i in range(len(splitted_sentences)):
        sentences = splitted_sentences[i]
        one_to_three_variation = pattern.sub(lambda x: One_to_Three_AA[x.group()], data.Variation[i][:-1])
        Variation = data.Variation[i][:-1]        
        for j in range(len(sentences)):                              
            if (Variation in sentences[j]) or (one_to_three_variation in sentences[j]):
                new_regex = re.escape(Variation) + r"[\S]*" ### Means no white space 0 or more
                sentences[j] = re.sub(new_regex, ' placeholderMutation', sentences[j]) #case 1
                new_regex = re.escape(one_to_three_variation) + r"[\S]*"
                sentences[j] = re.sub(new_regex, ' placeholderMutation', sentences[j]) #case 2
                sentences_with_sub[i].extend(sentences[max(j-window_left,0) : min(j+1+window_right, len(sentences)-1)])
                
                ### We add space to ' placeholderMutation' because sometimes there are letters in front of it
                # position_sentences[i].append(j) # not used for the moment

    return sentences_with_sub   ### This might take a while because it's looping through all sentences
def get_sentences_sub_noText(data, splitted_sentences, window_left, window_right):
    #position_sentences = [[] for _ in range(len(data))]  #### currently not used
    data.index = range(len(data))
    sentences_with_sub = [[] for _ in range(len(data))]
    
    for i in range(len(splitted_sentences)):
        sentences = splitted_sentences[i] 
        for j in range(len(sentences)):
            split_variation = re.split('(\d+)', data.Variation[i]) # split based on a number
            first_Amino = re.escape(split_variation[0]) #re.escpae uses variable as regex
            last_Amino = re.escape(split_variation[-1])
            new_regex  = first_Amino + r"\d+" + last_Amino
            
            #counter=len(re.findall(new_regex, sentences[j]))
            
            Boolean = bool(re.search(new_regex, sentences[j]))
            if Boolean:
                sentences[j] = re.sub(new_regex, ' placeholderMutation', sentences[j]) # Might help catch sy
                sentences_with_sub[i].extend(sentences[j-window_left : j+1+window_right])
                # position_sentences[i].append(j) # not used for the moment

    return sentences_with_sub   ### This might take a while because it's looping through all sentences


In [18]:
#### Converts list of sentences into one string of sentences for each document => to use for tfidf etc.
def sentences_to_string(sentences_list):
    sentence_strings = []
    for sentences in sentences_list:
        sentence_string =  ' '.join(str(sentence) for sentence in sentences)
        sentence_strings.append(sentence_string)
    
    return sentence_strings ### This doesn't take such a long time to run


# Subtitutions (subs) processing of data set 

In [19]:
######### First find those that have the format of being a substitution in data
data_all['Substitutions_var'] = data_all.Variation.apply(lambda x: bool(re.search('^[A-Z]\\d+[A-Z*]$', x))*1) #multiplying by 1 converts True to 1, False to 0 => Maybe modify this later?
data_all['Stop_codon_var'] = data_all.Variation.apply(lambda x: bool(re.search('[*]', x))*1) #multiplying by 1 converts True to 1, False to 0
data_sub = data_all[data_all['Substitutions_var']==1] ### Now we know the index of where a substitution occurs - the data_sub

In [20]:
sub_in_text, sub_not_in_text = find_sub(data_sub)
sub_in_text_backup = sub_in_text[:] ## index gets changed by text_processing if we don't make a copy
##### INVESTIGATION: Why do some subs don't appear in Text?: Try to automize this and find out
### Substitutions can appear as SAME_PREFIX - Other number - SAME_SUFFIX

sub_again_no_Text, sub_noText = find_sub_noText(sub_not_in_text) # 108 such cases out of 411 = nice improvement
sub_noText_backup = sub_noText[:]

Working on variations who have the 2 letters right but not same numbers

In [24]:
#nltk.download("popular")

In [21]:
NLTK_sub = [sent_tokenize(sub_in_text.Text[i]) for i in sub_in_text.index] # takes a long time to run tokenizer => use pickle to save
sub_sentences = get_sentences_sub(sub_in_text, NLTK_sub, window_left = 2, window_right = 2) 
# Retrieves sentences where subsitution mutation is included.
# window_left and window_right specify which sentences to keep at the left side or right side of the sub sentences.
# IMPORTANT: I used also placeholderMutation to replace the original sub mutations here
sub_sentences = [sorted(set(sentences), key = sentences.index) for sentences in sub_sentences] # only uses unique sentences


In [24]:
NLTK_sub_noText = [sent_tokenize(sub_noText.Text[i]) for i in sub_noText.index]
sub_noText_sentences = get_sentences_sub_noText(sub_noText, NLTK_sub_noText, window_left = 2, window_right = 2) # Retrieves sentences where subsitution mutation is included
sub_noText_sentences = [sorted(set(sentences), key = sentences.index) for sentences in sub_noText_sentences] # only use unique sentences


In [25]:
sub_sentences_string = sentences_to_string(sub_sentences)
sub_noText_string = sentences_to_string(sub_noText_sentences)

In [26]:
data_all.Text[sub_in_text_backup.index] = sub_sentences_string
data_all.Text[sub_noText_backup.index] = sub_noText_string

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [27]:
##############################################################################################################################
############## Non-subs preprocessing of data set #######################

#def find_mutation_type(row, pattern):  ##### TODO: make clearer by using a function instead of lambda
#    return bool(re.search('^fusion', row, re.IGNORECASE)) *1. Also for subs

####### Fusions : 'Fusions' ############
data_all['Fusion_var'] = data_all.Variation.apply(lambda x: bool(re.search('^fusion', x, re.IGNORECASE))*1) #multiplying by 1 converts True to 1, False to 0
new_fusion, new_data_all = variation_regex(data_all, '^fusion') 

###### Fusions: 'Gene-Gene fusion' ########
data_all['gene_fusion_var'] = new_data_all.Variation.apply(lambda x: bool(re.search('fusion', x, re.IGNORECASE))*1) 
_ , new_data_all = variation_regex(new_data_all, 'fusion') 
###### Notice that NaN introduced for places where splicing occured => replace after NaN with 0's when complete

####### Deletions: 'Deletions' ############
data_all['Deletion_var'] = new_data_all.Variation.apply(lambda x: bool(re.search('^del', x, re.IGNORECASE))*1) 
new_del, new_data_all = variation_regex(new_data_all, '^del') 

####### Deletions & Insertions wheteher together or seperately (doesn't make a big difference IMO)
data_all['del_or_ins_var'] = new_data_all.Variation.apply(lambda x: bool(re.search('del|ins', x, re.IGNORECASE))*1) 
# we could also later divide it into del, ins if we want to

###### Amplifications #########
data_all['Amplification_var'] = data_all.Variation.apply(lambda x: bool(re.search('ampl', x, re.IGNORECASE))*1) 

###### Truncations ########### Don't forget there are 'Truncating mutations' = 95 and '_trunc' = 4
data_all['Truncation_var'] = data_all.Variation.apply(lambda x: bool(re.search('trunc', x, re.IGNORECASE))*1) 

####### Exons #########
data_all['exon_var'] = data_all.Variation.apply(lambda x: bool(re.search('exon', x, re.IGNORECASE))*1) 

####### Frameshift mutations ########
data_all['frameshift_var'] = data_all.Variation.apply(lambda x: bool(re.search('fs', x, re.IGNORECASE))*1) 

####### Duplications ##############
data_all['dup_var'] = data_all.Variation.apply(lambda x: bool(re.search('dup', x, re.IGNORECASE))*1) 

data_all.fillna(0, inplace = True)



In [30]:
#TODO : #Sentence Tokenizer for non-subs that are not in text, AND
#subs that are not in text

In [31]:
#Not used
################ The dummy variables for Gene and Text ##################
## TODO: also use dummy for Text? There are 135 shared Genes and 142 shared Text between train and Leaks!  len(set(train.Text) & set(Leaks.Text))
#data_all_dummy = data_all_backup[['Gene', 'Text']] # drop those columns we don't need as dummy.
#X_dummy = pd.get_dummies(data_all_dummy) # converts categorical variables into dummy variable. From len set => 269 genes + 2090 texts
#X_dummy_train = X_dummy[:train.shape[0]]
#X_dummy_test = X_dummy[train.shape[0]:]
#dummy_names = X_dummy.columns.values #### To remember names if you want to check again what Gene or Text used
#X_dummy = X_dummy.values

In [32]:
###### Use the variation types 
#variation_types = data_all.drop(['ID', 'Gene', 'Class', 'Text', 'Variation'], axis =1)
#X_variation_train = variation_types[:train.shape[0]]
#X_variation_test = variation_types[train.shape[0]:]
#variation_names = variation_types.columns.values 

# Cleaning 

In [32]:
stop = set(stopwords.words('english'))
exclude = set('!"#$%&\'()*+:;<=>?@[\\]^_`{|}~0123456789') 
print(exclude)
lemma = WordNetLemmatizer()
def clean(doc,lemmatiz=False):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free_0 = [re.sub(",|\.|/"," ",ch) for ch in stop_free]
    if lemmatiz:
        punc_free_lem="".join(ch for ch in punc_free_0 if ch not in exclude)
        normalized = " ".join(lemma.lemmatize(word) for word in punc_free_lem.split())
        return normalized
    else:
        punc_free = "".join(ch for ch in punc_free_0 if ch not in exclude)
        return punc_free

{'~', ']', '_', '2', ')', '[', '4', '8', ';', '3', '<', '^', '(', '{', '|', '>', '7', "'", '}', '@', '`', '5', '"', ':', '1', '=', '$', '&', '9', '\\', '6', '!', '0', '*', '%', '#', '?', '+'}


In [33]:
#No lemmatization for the moment, be careful not to lemmatize then w2vec
data_all.Text = [clean(doc) for doc in data_all.Text]  

# some more features engineering

In [37]:
#Some counters on the gene and variation, here not using the 3letters abreviations
#that means if the text contains only the gene or variation but with the 
#3letters format, the variable will capture it
print(data_all)
data_all["Gene_Share"] = data_all_backup.apply(lambda r: sum([1 for w in r["Gene"].split(" ") if w in r["Text"].split(" ")]), axis=1)
data_all["Variation_Share"] = data_all_backup.apply(lambda r: sum([1 for w in r["Variation"].split(" ") if w in r["Text"].split(" ")]), axis=1)

data_all["Text_words"] = data_all["Text"].map(lambda x: len(str(x).split(" ")))

      Class     Gene    ID                                               Text  \
0       1.0   FAM58A     0  cyclin-dependent kinases cdks regulate variety...   
1       2.0      CBL     1  normal tumor pairwise analysis  significant lo...   
2       2.0      CBL     2  normal tumor pairwise analysis  significant lo...   
3       3.0      CBL     3  hrm analysis cbl exons   blast crisis cases n ...   
4       4.0      CBL     4  vm mutant borderline densitometry ratio    mut...   
5       4.0      CBL     5  vm mutant borderline densitometry ratio    mut...   
6       5.0      CBL     6  figure  figure  structures wild type green ke ...   
7       1.0      CBL     7  cbl negative regulator activated receptor tyro...   
8       4.0      CBL     8  subset jmml patients harbor cbl mutations asso...   
9       4.0      CBL     9  subset jmml patients harbor cbl mutations asso...   
10      4.0      CBL    10  vm mutant borderline densitometry ratio    mut...   
11      4.0      CBL    11  

In [36]:
figure_counter=["fig","figure"]
for fig in figure_counter: 
        data_all["Figure_counter"]=(data_all_backup["Text"].map(lambda x : str(x).count(fig)))

In [None]:
data_all["Text"]=[re.sub("fig|figure"," ",doc) for doc in data_all["Text"]]

In [38]:
train = data_all.iloc[:len(train)]
test = data_all.iloc[len(train):]

In [39]:
train.to_csv("bases/working_train.csv",index=False,encoding="utf8")
test.to_csv("bases/working_test.csv",index=False,encoding="utf8")