In [75]:
import numpy as np
import pandas as pd
import string
import os
from collections import Counter

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.decomposition import TruncatedSVD

import re

# Creating base

In [76]:
train = pd.read_csv('..//bases/training_variants')
test = pd.read_csv('..//bases/test_variants')

In [77]:
train_texts = pd.read_csv('..//bases/training_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")
test_texts = pd.read_csv('..//bases/test_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")

In [78]:
new_test=pd.read_csv('..//bases/new_test_variants.csv')
new_test_texts = pd.read_csv('..//bases/new_test_text.csv', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")

In [79]:
train = pd.merge(train, train_texts, how='left', on='ID')
test = pd.merge(test, test_texts, how='left', on='ID')
new_test_final=pd.merge(new_test,new_test_texts,how="left",on="ID")
leaks=pd.read_csv('..//bases/s1_add_train.csv')
leaks_1=pd.DataFrame([leaks["ID"],leaks.drop("ID",axis=1).idxmax(axis=1).map(lambda x: x.lstrip('class'))])
leaks_2=leaks_1.T
leaks_2.columns=["ID","Class"]
leaks_3=pd.merge(leaks_2,test[test.ID.isin(leaks_2.ID)])
leaks_final=pd.merge(leaks_3,test_texts[test_texts.ID.isin(leaks_3.ID)])
train_final=pd.concat([train,leaks_final]) #adding first stage


In [80]:
del train,test,leaks,leaks_1,leaks_2,leaks_3

In [81]:
#Transform Gene Letter to their abbreviation in order to find them in the text
One_to_Three_AA = {'C': 'Cys', 'D': 'Asp', 'S': 'Ser', 'Q': 'Gln', 'K': 'Lys',
         'I': 'Ile', 'P': 'Pro', 'T': 'Thr', 'F': 'Phe', 'N': 'Asn', 
         'G': 'Gly', 'H': 'His', 'L': 'Leu', 'R': 'Arg', 'W': 'Trp', 
         'A': 'Ala', 'V': 'Val', 'E': 'Glu', 'Y': 'Tyr', 'M': 'Met'}
pattern = re.compile('|'.join(One_to_Three_AA.keys()))
##### Get variation types by using regex
def variation_regex(data, pattern): # if you want to not ignore cases, add extra argument to function
    Boolean = [not bool(re.search(pattern, i, re.IGNORECASE)) for i in data.Variation]
    data_no_regex = data[Boolean]  # 182 Fusions => 495 over 
    not_Boolean = [not i for i in Boolean]  
    data_regex = data[not_Boolean]
    
    return (data_regex, data_no_regex)

In [82]:
#### process the train and test set together
data_all = pd.concat((train_final, new_test_final), axis=0, ignore_index=True)
data_all_backup = data_all[:] ##### We keep backup because we want dummy variables of Gene & Text 
# TODO maybe also use Variation function of Gene from a database, and other suggestions. Also can use Count_sub as feature


In [83]:
data_all.shape
data_all

Unnamed: 0,Class,Gene,ID,Text,Variation
0,1,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations
1,2,CBL,1,Abstract Background Non-small cell lung canc...,W802*
2,2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E
3,3,CBL,3,Recent evidence has demonstrated that acquired...,N454D
4,4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V
5,4,CBL,5,Oncogenic mutations in the monomeric Casitas B...,V391I
6,5,CBL,6,Oncogenic mutations in the monomeric Casitas B...,V430M
7,1,CBL,7,CBL is a negative regulator of activated recep...,Deletion
8,4,CBL,8,Abstract Juvenile myelomonocytic leukemia (JM...,Y371H
9,4,CBL,9,Abstract Juvenile myelomonocytic leukemia (JM...,C384R


# Functions for pre-processing

In [84]:
def find_sub(data):

    ##### The normal case is around 2080 out of the 2644
    
    
    Boolean = [data.Variation[i] in data.Text[i] or #normal case
               data.Variation[i][:-1] in data.Text[i] or #case 1.
               pattern.sub(lambda x: One_to_Three_AA[x.group()], data.Variation[i][:-1]) # case2
               in data.Text[i]  for i in data.index] ## because new indexing we use 
    
    #TODO could also match insensitive as a next step for more info.
    #Shorter Boolean below = the normal version
    
    #Boolean = [trainSub.Variation[i] in trainSub.Text[i] #normal case
    #           for i in trainSub.ID] ## because new indexing we use ID
    #           
            
    sub_in_text = data[Boolean]
    not_Boolean = [not i for i in Boolean]  

    sub_not_in_text = data[not_Boolean]
#    sub_in_text['Count'] = [sub_in_text.Text[i].count(sub_in_text.Variation[i][:-1])
#                    +sub_in_text.Text[i].count(pattern.sub(lambda x: One_to_Three_AA[x.group()], sub_in_text.Variation[i][:-1]))
#                    for i in sub_in_text.index]
    
    return sub_in_text, sub_not_in_text
##### For subs that are not find in text: use regex to account for a different number
##### TODO: things you can further try - with AA name replacement, searching for the number only etc.
def find_sub_noText(data):
    Booleans = []
    for i in data.index:
        split_variation = re.split('(\d+)', data.Variation[i]) # split based on a number
        first_Amino = re.escape(split_variation[0]) #re.escpae uses variable as regex
        last_Amino = re.escape(split_variation[-1])
        #first_number = re.escape(split_variation[1][0])
        #new_regex = r"[^a-zA-Z0-9]" + first_Amino + first_number
        new_regex  = first_Amino + r"\d+" + last_Amino
        Boolean = bool(re.search(new_regex, data.Text[i]))
        Booleans.append(Boolean)
    
    sub_number_in_text = data[Booleans]
    not_Boolean = [not i for i in Booleans]  

    sub_again_no_text = data[not_Boolean]
    return sub_again_no_text, sub_number_in_text


In [85]:
#### Converts list of sentences into one string of sentences for each document => to use for tfidf etc.
def sentences_to_string(sentences_list):
    sentence_strings = []
    for sentences in sentences_list:
        sentence_string =  ' '.join(str(sentence) for sentence in sentences)
        sentence_strings.append(sentence_string)
    
    return sentence_strings ### This doesn't take such a long time to run


# Subtitutions (subs) processing of data set 

In [86]:
######### First find those that have the format of being a substitution in data
data_all['Substitutions_var'] = data_all.Variation.apply(lambda x: bool(re.search('^[A-Z]\\d+[A-Z*]$', x))*1) #multiplying by 1 converts True to 1, False to 0 => Maybe modify this later?
data_all['Stop_codon_var'] = data_all.Variation.apply(lambda x: bool(re.search('[*]', x))*1) #multiplying by 1 converts True to 1, False to 0
data_sub = data_all[data_all['Substitutions_var']==1] ### Now we know the index of where a substitution occurs - the data_sub

In [87]:
sub_in_text, sub_not_in_text = find_sub(data_sub)
sub_in_text_backup = sub_in_text[:] ## index gets changed by text_processing if we don't make a copy
##### INVESTIGATION: Why do some subs don't appear in Text?: Try to automize this and find out
### Substitutions can appear as SAME_PREFIX - Other number - SAME_SUFFIX

sub_again_no_Text, sub_noText = find_sub_noText(sub_not_in_text) # 108 such cases out of 411 = nice improvement
sub_noText_backup = sub_noText[:]

Working on variations who have the 2 letters right but not same numbers

In [88]:
#nltk.download("popular")

In [89]:
#### First find those that have the format of being a non-substitutions in the data

# Initialize some of the variables already (because we splice them, could be filled with NA's otherwise)
# Represents 783 from train and test set
data_all['gene_fusion_var'] = 0
data_all['Deletion_var'] = 0
data_all['del_or_ins_var'] = 0

####### Fusions : 'Fusions' ############
data_all['Fusion_var'] = data_all.Variation.apply(lambda x: bool(re.search('^fusion', x, re.IGNORECASE))*1) #multiplying by 1 converts True to 1, False to 0
_ , new_data_all = variation_regex(data_all, '^fusion')  #37 cases

###### Fusions: 'Gene-Gene fusion' ########
data_all['gene_fusion_var'].loc[new_data_all.index] = new_data_all.Variation.apply(lambda x: bool(re.search('fusion', x, re.IGNORECASE))*1)
_ , new_data_all = variation_regex(new_data_all, 'fusion') #160 cases

####### Deletions: 'Deletions' ############
data_all['Deletion_var'].loc[new_data_all.index] = new_data_all.Variation.apply(lambda x: bool(re.search('^del', x, re.IGNORECASE))*1)
_, new_data_all = variation_regex(new_data_all, '^del') # 88 cases

####### Deletions & Insertions wheteher together or seperately (doesn't make a big difference IMO)
data_all['del_or_ins_var'].loc[new_data_all.index] = new_data_all.Variation.apply(lambda x: bool(re.search('del|ins', x, re.IGNORECASE))*1)
# 196 cases

###### Amplifications #########
data_all['Amplification_var'] = data_all.Variation.apply(lambda x: bool(re.search('ampl', x, re.IGNORECASE))*1) # 79 cases

###### Truncations ########### Don't forget there are 'Truncating mutations'  '_trunc' 
data_all['Truncation_var'] = data_all.Variation.apply(lambda x: bool(re.search('trunc', x, re.IGNORECASE))*1) # 118 cases

####### Exons #########
data_all['exon_var'] = data_all.Variation.apply(lambda x: bool(re.search('exon', x, re.IGNORECASE))*1) 

####### Frameshift mutations ########
data_all['frameshift_var'] = data_all.Variation.apply(lambda x: bool(re.search('fs', x, re.IGNORECASE))*1) # 22 cases

####### Duplications ##############
data_all['dup_var'] = data_all.Variation.apply(lambda x: bool(re.search('dup', x, re.IGNORECASE))*1)

####### Hypermethylation ##########
data_all['overexpression_var'] = data_all.Variation.apply(lambda x: bool(re.search('Overexpression', x, re.IGNORECASE))*1) 

####### Splice ##########
data_all['splice_var'] = data_all.Variation.apply(lambda x: bool(re.search('splice', x, re.IGNORECASE))*1) 

####### Hypermethylation ##########
data_all['hypermethylation_var'] = data_all.Variation.apply(lambda x: bool(re.search('hypermethylation', x, re.IGNORECASE))*1)

####### DNA binding ##########
data_all['DNA_binding_var'] = data_all.Variation.apply(lambda x: bool(re.search('DNA', x, re.IGNORECASE))*1)

####### Null ##########
data_all['null_var'] = data_all.Variation.apply(lambda x: bool(re.search('null', x, re.IGNORECASE))*1)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [90]:
#TODO : #Sentence Tokenizer for non-subs that are not in text, AND
#subs that are not in text

In [91]:
#Not used
################ The dummy variables for Gene and Text ##################
## TODO: also use dummy for Text? There are 135 shared Genes and 142 shared Text between train and Leaks!  len(set(train.Text) & set(Leaks.Text))
#data_all_dummy = data_all_backup[['Gene', 'Text']] # drop those columns we don't need as dummy.
#X_dummy = pd.get_dummies(data_all_dummy) # converts categorical variables into dummy variable. From len set => 269 genes + 2090 texts
#X_dummy_train = X_dummy[:train.shape[0]]
#X_dummy_test = X_dummy[train.shape[0]:]
#dummy_names = X_dummy.columns.values #### To remember names if you want to check again what Gene or Text used
#X_dummy = X_dummy.values

In [92]:
###### Use the variation types 
#variation_types = data_all.drop(['ID', 'Gene', 'Class', 'Text', 'Variation'], axis =1)
#X_variation_train = variation_types[:train.shape[0]]
#X_variation_test = variation_types[train.shape[0]:]
#variation_names = variation_types.columns.values 

# some more features engineering

In [93]:
# Feature for the length of the text
data_all["Text_words"] = data_all["Text"].map(lambda x: len(str(x).split(" ")))

In [94]:
new_train = data_all.iloc[:len(train_final)]
new_test = data_all.iloc[len(train_final):]

In [95]:
def add_missing_dummy_columns( d, columns ):
    missing_cols = set( columns ) - set( d.columns )
    for c in missing_cols:
        d[c] = 0
def fix_test_columns( d, columns ):  

    add_missing_dummy_columns( d, columns )

    # make sure we have all the columns we need
    assert( set( columns ) - set( d.columns ) == set())

    extra_cols = set( d.columns ) - set( columns )
    if extra_cols:
        print ("extra columns:", extra_cols)

    d = d[ columns ]
    return d

In [96]:
svd = TruncatedSVD(n_components=25, n_iter=12, random_state=26)

one_hot_gene = pd.get_dummies(new_train['Gene'])
one_hot_gene_test=pd.get_dummies(new_test["Gene"])
one_hot_gene_test=fix_test_columns(one_hot_gene_test,one_hot_gene.columns)
truncated_one_hot_gene = svd.fit_transform(one_hot_gene.values)
truncated_one_hot_gene_for_test=svd.transform(one_hot_gene_test.values)

extra columns: {'KISS1R', 'ABCC6', 'TRPC6', 'STAT5B', 'CLDN16', 'BFSP2', 'FLNB', 'DYNC2H1', 'CRNKL1', 'SEPT9', 'BBS5', 'BMPR1B', 'NDUFS3', 'DCC', 'MCC', 'SCN9A', 'GLE1', 'WNT4', 'IKBKAP', 'YARS', 'CHST3', 'EPHA2', 'CRLF1', 'EIF2B5', 'EPHB2', 'TTK', 'HABP2', 'NDUFS6', 'SLC4A4', 'SLC6A5', 'PLA2G6', 'RP1', 'EDAR', 'GCM2', 'SUCLA2', 'DNM1L', 'PDE8B', 'SLC7A7', 'MYOT', 'LCT', 'TNFRSF11A', 'TGM5', 'ABCB11', 'ASS1', 'TINF2', 'DNAH5', 'WISP3', 'KCNMB1', 'TSHR', 'RNF6', 'AGXT', 'SLC19A2', 'RECQL4', 'BCS1L', 'ZFPM2', 'LRP4', 'PNPO', 'RAB27A', 'XRCC1', 'SLC27A4', 'SF3B2', 'CST3', 'DNAI1', 'KLF11', 'GPHN', 'ATP2C1', 'SLC25A15', 'ALG10', 'AURKC', 'PTCH2', 'NEK8', 'BAG3', 'RPS26', 'RAD54B', 'SLC22A4', 'ROCK1', 'LRP5', 'MPDU1', 'SLC25A13', 'PRKRA', 'SCO1', 'ITM2B', 'SCN4A', 'SLC22A5', 'KCNE2', 'SYT6', 'SLC33A1', 'SPAST', 'MOCS2', 'KCNJ13', 'RGS9', 'CYP7B1', 'TP63', 'SLC25A12', 'CRB1', 'POLH', 'APOL1', 'GNE', 'LRP6', 'SLC17A5', 'RPS19', 'MOCS1', 'KRIT1', 'OTOF', 'ALOX12B', 'STK19', 'KCNQ4', 'STK33', '

In [97]:
genes_train=pd.DataFrame(truncated_one_hot_gene,columns=["tsvd_gene"+ str(x) for x in range(0,25)])
genes_test=pd.DataFrame(truncated_one_hot_gene_for_test,columns=["tsvd_gene"+ str(x) for x in range(0,25)])

In [98]:
new_test["index"]=range(0,len(new_test))
new_train_1=pd.merge(new_train.reset_index(),genes_train.reset_index()).drop("index",axis=1)
new_test_1=pd.merge(new_test,genes_test.reset_index()).drop("index",axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [99]:
new_train_1.to_csv("checkpoints_databases/nw_working_train_0.csv",index=False,encoding="utf8")
new_test_1.to_csv("checkpoints_databases/nw_working_test_0.csv",index=False,encoding="utf8")

In [100]:
new_train_1.shape

(3689, 47)

In [101]:
new_test_1.shape

(986, 47)

In [47]:
new_train_1

Unnamed: 0,Class,Gene,ID,Text,Variation,Substitutions_var,Stop_codon_var,Fusion_var,gene_fusion_var,Deletion_var,...,tsvd_gene15,tsvd_gene16,tsvd_gene17,tsvd_gene18,tsvd_gene19,tsvd_gene20,tsvd_gene21,tsvd_gene22,tsvd_gene23,tsvd_gene24
0,1,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations,0,0,0,0.0,0.0,...,-5.095974e-21,-1.369400e-21,-3.807105e-21,-1.190448e-20,-8.716408e-21,-1.818640e-20,-2.010513e-20,-4.441140e-20,-7.781386e-20,5.540191e-20
1,2,CBL,1,Abstract Background Non-small cell lung canc...,W802*,1,1,0,0.0,0.0,...,1.551605e-04,2.240308e-04,-1.994789e-04,-2.619992e-05,5.231767e-06,-8.621741e-04,1.631879e-04,3.211893e-03,4.296584e-03,3.149577e-03
2,2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E,1,0,0,0.0,0.0,...,1.551605e-04,2.240308e-04,-1.994789e-04,-2.619992e-05,5.231767e-06,-8.621741e-04,1.631879e-04,3.211893e-03,4.296584e-03,3.149577e-03
3,3,CBL,3,Recent evidence has demonstrated that acquired...,N454D,1,0,0,0.0,0.0,...,1.551605e-04,2.240308e-04,-1.994789e-04,-2.619992e-05,5.231767e-06,-8.621741e-04,1.631879e-04,3.211893e-03,4.296584e-03,3.149577e-03
4,4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V,1,0,0,0.0,0.0,...,1.551605e-04,2.240308e-04,-1.994789e-04,-2.619992e-05,5.231767e-06,-8.621741e-04,1.631879e-04,3.211893e-03,4.296584e-03,3.149577e-03
5,4,CBL,5,Oncogenic mutations in the monomeric Casitas B...,V391I,1,0,0,0.0,0.0,...,1.551605e-04,2.240308e-04,-1.994789e-04,-2.619992e-05,5.231767e-06,-8.621741e-04,1.631879e-04,3.211893e-03,4.296584e-03,3.149577e-03
6,5,CBL,6,Oncogenic mutations in the monomeric Casitas B...,V430M,1,0,0,0.0,0.0,...,1.551605e-04,2.240308e-04,-1.994789e-04,-2.619992e-05,5.231767e-06,-8.621741e-04,1.631879e-04,3.211893e-03,4.296584e-03,3.149577e-03
7,1,CBL,7,CBL is a negative regulator of activated recep...,Deletion,0,0,0,0.0,1.0,...,1.551605e-04,2.240308e-04,-1.994789e-04,-2.619992e-05,5.231767e-06,-8.621741e-04,1.631879e-04,3.211893e-03,4.296584e-03,3.149577e-03
8,4,CBL,8,Abstract Juvenile myelomonocytic leukemia (JM...,Y371H,1,0,0,0.0,0.0,...,1.551605e-04,2.240308e-04,-1.994789e-04,-2.619992e-05,5.231767e-06,-8.621741e-04,1.631879e-04,3.211893e-03,4.296584e-03,3.149577e-03
9,4,CBL,9,Abstract Juvenile myelomonocytic leukemia (JM...,C384R,1,0,0,0.0,0.0,...,1.551605e-04,2.240308e-04,-1.994789e-04,-2.619992e-05,5.231767e-06,-8.621741e-04,1.631879e-04,3.211893e-03,4.296584e-03,3.149577e-03
