In [127]:
import numpy as np
import pandas as pd
import string
import os
from collections import Counter
import copy

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
pd.options.display.max_colwidth = 50
pd.set_option('display.max_rows', 500)
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Creating base

In [128]:
new_test=pd.read_csv('..//bases/new_test_variants.csv')
new_test_texts = pd.read_csv('..//bases/new_test_text.csv', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")
new_test_final=pd.merge(new_test,new_test_texts,how="left",on="ID")

In [129]:
leaks=pd.read_csv('..//bases/s1_add_train.csv')
leaks_1=pd.DataFrame([leaks["ID"],leaks.drop("ID",axis=1).idxmax(axis=1).map(lambda x: x.lstrip('class'))])
leaks_2=leaks_1.T
leaks_2.columns=["ID","Class"]

In [130]:
train = pd.read_csv('..//bases/training_variants')
test = pd.read_csv('..//bases/test_variants')

In [131]:
train_texts = pd.read_csv('..//bases/training_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")
test_texts = pd.read_csv('..//bases/test_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"], encoding = "utf-8")

In [132]:
train = pd.merge(train, train_texts, how='left', on='ID')
test = pd.merge(test, test_texts, how='left', on='ID')

In [133]:
leaks_3=pd.merge(leaks_2,test[test.ID.isin(leaks_2.ID)])
leaks_final=pd.merge(leaks_3,test_texts[test_texts.ID.isin(leaks_3.ID)])

In [134]:
train_all = pd.concat([train,leaks_final]) #adding first stage

In [135]:
merge_match = new_test.merge(train_all, left_on=['Gene', 'Variation'], right_on = ['Gene', 'Variation'])
Index_leak = merge_match.ID_x - 1
new_test_index = [item for item in new_test_final.index if item not in list(Index_leak)]
test_no_leaks = new_test_final.iloc[new_test_index]
test_no_leaks

Unnamed: 0,ID,Gene,Variation,Text
2,3,WNT4,E216G,Mycosis fungoides and Sézary syndrome are prim...
3,4,SUCLA2,G118R,Regulated progression through the cell cycle ...
5,6,CHEK2,E239K,The nuclei that laboratories solution p53 KIT ...
6,7,CHST3,T141M,Myeloid differentiation 88 (MyD88) is the key ...
7,8,RNF6,G244D,Human ESCCs 2 occur frequently worldwide (1) ....
8,9,SPAST,C448Y,large were of activity growth this product tol...
10,11,SCN4A,V445M,Endometrial carcinoma is the most common gynec...
14,15,ERBB2,G746S,The protein-kinase family is the most frequent...
15,16,TP53,Y234S,Among the best-studied therapeutic targets in ...
16,17,RAB27A,A87P,"Introduction In recent years, a better unders..."


In [136]:
train_all['Substitutions_var'] = train_all.Variation.apply(lambda x: bool(re.search('^[A-Z]\\d+[A-Z*]$', x))*1)
new_train = train_all[train_all['Substitutions_var']==1]

In [137]:
#### process the train and test set together
data_all = pd.concat((new_train, test_no_leaks), axis=0, ignore_index=True)
data_all = data_all[['Class', 'Gene', 'ID', 'Variation', 'Text']] # just reordering
data_all_backup = data_all[:] ## We keep backup in case we need to use again
data_all

Unnamed: 0,Class,Gene,ID,Variation,Text
0,2,CBL,1,W802*,Abstract Background Non-small cell lung canc...
1,2,CBL,2,Q249E,Abstract Background Non-small cell lung canc...
2,3,CBL,3,N454D,Recent evidence has demonstrated that acquired...
3,4,CBL,4,L399V,Oncogenic mutations in the monomeric Casitas B...
4,4,CBL,5,V391I,Oncogenic mutations in the monomeric Casitas B...
5,5,CBL,6,V430M,Oncogenic mutations in the monomeric Casitas B...
6,4,CBL,8,Y371H,Abstract Juvenile myelomonocytic leukemia (JM...
7,4,CBL,9,C384R,Abstract Juvenile myelomonocytic leukemia (JM...
8,4,CBL,10,P395A,Oncogenic mutations in the monomeric Casitas B...
9,4,CBL,11,K382E,Noonan syndrome is an autosomal dominant conge...


In [138]:
data_all.iloc[3107,:]

Class                                                      NaN
Gene                                                    PIK3CA
ID                                                         281
Variation                                               T1025A
Text         Molecular genetic testing informs diagnosis, p...
Name: 3107, dtype: object

In [139]:
#Transform Amino Acid (AA) Letter to their three-letter abbreviation in order to find them in the text when they appear
One_to_Three_AA = {'C': 'Cys', 'D': 'Asp', 'S': 'Ser', 'Q': 'Gln', 'K': 'Lys',
         'I': 'Ile', 'P': 'Pro', 'T': 'Thr', 'F': 'Phe', 'N': 'Asn', 
         'G': 'Gly', 'H': 'His', 'L': 'Leu', 'R': 'Arg', 'W': 'Trp', 
         'A': 'Ala', 'V': 'Val', 'E': 'Glu', 'Y': 'Tyr', 'M': 'Met'}
pattern = re.compile('|'.join(One_to_Three_AA.keys()))

# Substitutions (subs)
## Functions for pre-processing of subs

In [140]:
# find_sub return the substituions that are in text and those that are not
def find_sub(data):    
    Boolean = [data.Variation[i][:-1] in data.Text[i] or #case 1.
               pattern.sub(lambda x: One_to_Three_AA[x.group()], data.Variation[i][:-1]) # case2
               in data.Text[i]  for i in data.index] ## because new indexing we use 
    
    sub_in_text = data[Boolean]
    not_Boolean = [not i for i in Boolean]  
    sub_not_in_text = data[not_Boolean]
    
    return sub_in_text, sub_not_in_text

# find_sub_numberChange searches for other number of a substitution i.e. G12V -> G_V because sometimes mistake in entry
# Is currently without One_to_three substitution or Variation[:-1] only the full variation
def find_sub_numberChange(data):
    Booleans = [] #will contain the different Booleans if found in text
    for i in data.index:
        split_variation = re.split('(\d+)', data.Variation[i]) # split based on a number
        first_Amino = re.escape(split_variation[0]) #re.escpae uses variable as regex
        last_Amino = re.escape(split_variation[-1])
        new_regex  = first_Amino + r"\d+" + last_Amino
        Boolean = bool(re.search(new_regex, data.Text[i]))
        Booleans.append(Boolean)
        
    sub_number_in_text = data[Booleans]
    not_Boolean = [not i for i in Booleans]  
    sub_number_no_text = data[not_Boolean]
    
    return sub_number_in_text, sub_number_no_text

# for substitutions that are still not found, use other keywords
def find_sub_pattern(data):    
    Boolean = [('mutat' in data.Text[i]) or ('variant' in data.Text[i]) or (data.Gene[i] in data.Text[i]) for i in data.index] ## because new indexing we use 
    
    sub_in_text = data[Boolean]
    not_Boolean = [not i for i in Boolean]  
    sub_not_in_text = data[not_Boolean]
    
    return sub_in_text, sub_not_in_text


In [141]:
##### get_sentences_sub use a window to extract sentences where the subs appear. 
# If window_left & window_right = 0 => just taking the sentences with subs

def get_sentences_sub(data, splitted_sentences):
    data.index = range(len(data)) #makes sure that index is right
    sentences_with_sub = [[] for _ in range(len(data))]
    for i in range(len(splitted_sentences)):
        sentences = splitted_sentences[i]
        one_to_three_variation = pattern.sub(lambda x: One_to_Three_AA[x.group()], data.Variation[i][:-1])
        Variation = data.Variation[i][:-1]    
        for j in range(len(sentences)):                              
            if (Variation in sentences[j]) or (one_to_three_variation in sentences[j]):
                new_regex = re.escape(Variation) + r"[\S]*" ###  r"[\S]*" because we look for Variation[:-1] not just Variation
                sentences[j] = re.sub(new_regex, ' placeholderMutation', sentences[j]) #case 1
                ### We add the space to ' placeholderMutation' because sometimes there are letters in front of it
                new_regex = re.escape(one_to_three_variation) + r"[\S]*"
                sentences[j] = re.sub(new_regex, ' placeholderMutation', sentences[j]) #case 2
                sentences_with_sub[i].extend(sentences[j:j+1])
                
    return sentences_with_sub

##### get_sentences_sub_number use a window to extract sentences where the subs appear that have different number i.e. G12V -> G_V

def get_sentences_sub_number(data, splitted_sentences):
    #position_sentences = [[] for _ in range(len(data))]  #### currently not used
    data.index = range(len(data))
    sentences_with_sub_number = [[] for _ in range(len(data))]
    for i in range(len(splitted_sentences)):
        sentences = splitted_sentences[i]
        split_variation = re.split('(\d+)', data.Variation[i]) # split based on a number
        first_Amino = re.escape(split_variation[0]) #re.escpae uses variable as regex
        last_Amino = re.escape(split_variation[-1])
        new_regex  = first_Amino + r"\d+" + last_Amino
        
        for j in range(len(sentences)):
            Boolean = bool(re.search(new_regex, sentences[j]))            
            if Boolean:
                sentences[j] = re.sub(new_regex, ' placeholderMutation', sentences[j]) # Again replacing the sentence with placeholder
                sentences_with_sub_number[i].extend(sentences[j:j+1])
    
    return sentences_with_sub_number

# for substitutions that are still not found, use other keywords
def get_sentences_pattern(data, splitted_sentences):
    data.index = range(len(data)) #makes sure that index is right
    sentences_with_sub = [[] for _ in range(len(data))]
    for i in range(len(splitted_sentences)):
        sentences = splitted_sentences[i]
        gene_name = data.Gene[i]
        for j in range(len(sentences)):                              
            if ('mutat' in sentences[j]) or ('variant' in sentences[j]) or (gene_name in sentences[j]):
                sentences[j] = re.sub(gene_name, ' placeholderGene', sentences[j]) # This time we replace for the gene because specific mutation not found
                sentences_with_sub[i].extend(sentences[j:j+1])               
    
    return sentences_with_sub

In [142]:
###### We use a window when sentences are too low. For 5 <= LENGTH <= 10: window of 1, for <= 5: window of 2 =, for 1 : window of 3
###### 

def get_window_sub(data, splitted_sentences, lengths):
    data.index = range(len(data)) #makes sure that index is right
    sentences_with_sub = [[] for _ in range(len(data))]
    for i in range(len(splitted_sentences)):
        sentences = splitted_sentences[i]
        length = lengths[i]
        if length == 1:
            window = 6
        elif length == 2:
            window = 3
        elif length >= 5:
            window=1
        else:
            window=2
        
        one_to_three_variation = pattern.sub(lambda x: One_to_Three_AA[x.group()], data.Variation[i][:-1])
        Variation = data.Variation[i][:-1] 
        all_sentences = []
        for j in range(len(sentences)):
            if (Variation in sentences[j]) or (one_to_three_variation in sentences[j]):
                new_regex = re.escape(Variation) + r"[\S]*" ###  r"[\S]*" because we look for Variation[:-1] not just Variation
                other_regex = re.escape(one_to_three_variation) + r"[\S]*"
                for sentence in sentences[max(j-window,0) : min(j+1+window, len(sentences)-1)]: # to account if start or end of text
                    sentence = re.sub(new_regex, ' placeholderMutation', sentence) #case 1
                    sentence = re.sub(other_regex, ' placeholderMutation', sentence) #case 2 after case 1
                    all_sentences.append(sentence)
        sentences_with_sub[i] = all_sentences
                    
    return sentences_with_sub

def get_window_sub_number(data, splitted_sentences, lengths):
    #position_sentences = [[] for _ in range(len(data))]  #### currently not used
    data.index = range(len(data))
    sentences_with_sub_number = [[] for _ in range(len(data))]
    for i in range(len(splitted_sentences)):
        sentences = splitted_sentences[i] 
        length = lengths[i]
        if length == 1:
            window = 6
        elif length == 2:
            window = 3
        elif length >= 5:
            window=1
        else:
            window=2
            
        split_variation = re.split('(\d+)', data.Variation[i]) # split based on a number
        first_Amino = re.escape(split_variation[0]) #re.escpae uses variable as regex
        last_Amino = re.escape(split_variation[-1])
        new_regex  = first_Amino + r"\d+" + last_Amino    
        
        all_sentences = []
        for j in range(len(sentences)):
            Boolean = bool(re.search(new_regex, sentences[j]))            
            if Boolean:
                for sentence in sentences[max(j-window,0) : min(j+1+window, len(sentences)-1)]:
                    sentence = re.sub(new_regex, ' placeholderMutation', sentence)
                    all_sentences.append(sentence)
        sentences_with_sub_number[i] = all_sentences
    
    return sentences_with_sub_number

def get_window_pattern(data, splitted_sentences, lengths):
    data.index = range(len(data)) #makes sure that index is right
    sentences_with_sub = [[] for _ in range(len(data))]
    for i in range(len(splitted_sentences)):
        sentences = splitted_sentences[i]
        length = lengths[i]
        if length == 1:
            window = 6
        elif length == 2:
            window = 3
        elif length >= 5:
            window=1
        else:
            window=2
        
        gene_name = data.Gene[i]
        all_sentences = []
        for j in range(len(sentences)):
            if ('mutat' in sentences[j]) or ('variant' in sentences[j]) or (gene_name in sentences[j]):
                for sentence in sentences[max(j-window,0) : min(j+1+window, len(sentences)-1)]: # to account if start or end of text
                    sentence = re.sub(gene_name, ' placeholderGene', sentence) #case 1
                    all_sentences.append(sentence)
        sentences_with_sub[i] = all_sentences
                    
    return sentences_with_sub




In [143]:
#### Converts list of sentences into one string of sentences for each document => to use for tfidf etc.
def sentences_to_string(sentences_list):
    sentence_strings = []
    for sentences in sentences_list:
        sentence_string =  ' '.join(str(sentence) for sentence in sentences)
        sentence_strings.append(sentence_string)
    
    return sentence_strings 

## Subs processing of the data set 

In [146]:
######### First find those that have the format of being a substitution in data
data_all['Substitutions_var'] = data_all.Variation.apply(lambda x: bool(re.search('^[A-Z]\\d+[A-Z*]$', x))*1) #multiplying by 1 converts True to 1, False to 0 => Maybe modify this later?
data_all['null'] = data_all.Variation.apply(lambda x: bool(re.search('null', x))*1)
data_all = data_all[(data_all['Substitutions_var']==1) | (data_all['null']==1) ] ### Now we know the index of where a substitution occurs - the data_sub
data_all = data_all.loc[:, 'Class':'Text']
data_sub = data_all
print("Length of total subs: %i" %len(data_sub)) # other ways to process it like finding the word 'mutation' 
data_all

Length of total subs: 3544


Unnamed: 0,Class,Gene,ID,Variation,Text
0,2,CBL,1,W802*,Abstract Background Non-small cell lung canc...
1,2,CBL,2,Q249E,Abstract Background Non-small cell lung canc...
2,3,CBL,3,N454D,Recent evidence has demonstrated that acquired...
3,4,CBL,4,L399V,Oncogenic mutations in the monomeric Casitas B...
4,4,CBL,5,V391I,Oncogenic mutations in the monomeric Casitas B...
5,5,CBL,6,V430M,Oncogenic mutations in the monomeric Casitas B...
6,4,CBL,8,Y371H,Abstract Juvenile myelomonocytic leukemia (JM...
7,4,CBL,9,C384R,Abstract Juvenile myelomonocytic leukemia (JM...
8,4,CBL,10,P395A,Oncogenic mutations in the monomeric Casitas B...
9,4,CBL,11,K382E,Noonan syndrome is an autosomal dominant conge...


### Subs inside the text

#### Subs inside the text: all sentences

In [147]:
## First consider the subs that appear in text
sub_text, sub_no_text = find_sub(data_sub) 

In [148]:
## use tokenizer to split into sentences of all the subs in text 
NLTK_sub = [sent_tokenize(sub_text.Text[i]) for i in sub_text.index] 

In [149]:
NLTK_window = copy.deepcopy(NLTK_sub) # a deep copy is necessary besauce something magical and strange happening without it
NLTK_copy = copy.deepcopy(NLTK_sub) # again for backup purposes

In [150]:
# extract window for the sub sentences where they appear
# !! Use [:] because it makes a copy and doesn't change anything to the original indexes
sub_sentences = get_sentences_sub(sub_text[:], NLTK_copy[:]) # choosing for window 0 as default now
sub_sentences_string = sentences_to_string(sub_sentences)

In [151]:
# Replace text in data_all
data_all.Text.loc[sub_text.index] = sub_sentences_string
data_all.Text.loc[sub_text.index]

0       Using select c-CBL somatic mutations such as S...
1       Using select c-CBL somatic mutations such as S...
2       Most of the changes were novel, although 4 cas...
3       Finally, the third group constituted mutations...
4       Finally, the third group constituted mutations...
5       The second group of mutants (M374V,  placehold...
6       We investigated the mechanism by which CBL- pl...
7       We investigated the mechanism by which CBL-Y37...
8       Finally, the third group constituted mutations...
9       Purified PCR amplicons (CBLWT, CBLC381A, CBL p...
10      Purified PCR amplicons (CBLWT, CBLC381A, CBLK3...
11      Purified PCR amplicons (CBLWT, CBL placeholder...
12      The second group of mutants (M374V, V430M,  pl...
13      Purified PCR amplicons (CBLWT, CBLC381A, CBLK3...
14      When introduced into Lin- Sca1+ c-Kit+ (LSK) H...
15      The second group of mutants ( placeholderMutat...
16      When introduced into Lin- Sca1+ c-Kit+ (LSK) H...
17      Using 

#### Subs inside the text: sentences with length <=10

In [152]:
######### We rerun for window. Length<2 : w = 3, Length >=6: w = 1, Length in between: w = 2
indexes = [index for index, sentences in enumerate(sub_sentences) if len(sentences) <= 10]
sentence_lengths = [len(sentences) for index, sentences in enumerate(sub_sentences) if len(sentences) <= 10]
NLTK_sub_window = [NLTK_window[i] for i in indexes]

In [153]:
new_index = sub_text.index[indexes]
sub_window = sub_text.loc[new_index] # gets the subs with the low length
sub_window #1524 cases 

Unnamed: 0,Class,Gene,ID,Variation,Text
2,3,CBL,3,N454D,Recent evidence has demonstrated that acquired...
3,4,CBL,4,L399V,Oncogenic mutations in the monomeric Casitas B...
4,4,CBL,5,V391I,Oncogenic mutations in the monomeric Casitas B...
5,5,CBL,6,V430M,Oncogenic mutations in the monomeric Casitas B...
8,4,CBL,10,P395A,Oncogenic mutations in the monomeric Casitas B...
12,5,CBL,14,P428L,Oncogenic mutations in the monomeric Casitas B...
14,4,CBL,17,Q367P,Acquired uniparental disomy (aUPD) is a common...
15,5,CBL,18,M374V,Oncogenic mutations in the monomeric Casitas B...
16,4,CBL,19,Y371S,Acquired uniparental disomy (aUPD) is a common...
18,4,CBL,21,C396R,Oncogenic mutations in the monomeric Casitas B...


In [154]:
sub_window_sentences = get_window_sub(sub_window[:], NLTK_sub_window[:], sentence_lengths) 
sub_window_sentences = [sorted(set(sentences), key = sentences.index) for sentences in sub_window_sentences] # removes duplicates
sub_window_string = sentences_to_string(sub_window_sentences)

In [155]:
sub_window_sentences[3] # only thing we can do about sentences like this one is look at the full text words now

['Figure 4 Figure 4 Structures of wild type (green) and K382E mutant (tan) are shown for nCBL and pCBL-E2-S states.',
 'Salt bridges are shown as dashed lines and in both cases mutation affects the stability by disrupting the salt bridge.',
 'The second group of mutants (M374V,  placeholderMutation P428L, Q249E and double mutant S80N/H94Y) maintained the CBL activity equivalent to or greater than wild-type CBL (relative densitometry of 80% or higher) (Fig.',
 '3B).',
 'Consistent with this, the levels of the activated EGFR were also decreased in these samples (Fig.',
 'Two of these CBL mutants resulted in more ubiquitination of EGFR compared to wild type CBL.',
 'For example, the Q249E mutant showed an increased ubiquitination of EGFR in 293T and HeLa cells while the M374V mutant resulted in higher ubiquitination levels in A549 and HeLa cells (Figure 3 and S6).',
 'Only  placeholderMutation mutant was on the borderline with the densitometry ratio of 0.8.',
 'All mutations from the seco

In [156]:
# Finally: replacing those texts in data_all
data_all.Text.loc[new_index] = sub_window_string
data_all.Text.loc[new_index]

2       View inlineView popup Table 2 Cases analyzed f...
3       For example, the Q249E mutant showed an increa...
4       For example, the Q249E mutant showed an increa...
5       Figure 4 Figure 4 Structures of wild type (gre...
8       For example, the Q249E mutant showed an increa...
12      Moreover, K382E mutation had a significant imp...
14      Transformed NIH3T3 cells showed PI3 kinase-dep...
15      Figure 4 Figure 4 Structures of wild type (gre...
16      Transformed NIH3T3 cells showed PI3 kinase-dep...
18      Figure 3 Figure 3 (A) EGFR ubiquitination. HEK...
19      For example, the Q249E mutant showed an increa...
20      Figure 6 Download figureOpen in new tabDownloa...
21      We fully sequenced the coding exons of these g...
22      Overall survival (A) and progression-free surv...
24      Both families contain compound heterozygotes. ...
26      Here we describe 2 families in which 2 TERT mu...
27      Both families contain compound heterozygotes. ...
28      Both f

### Subs with a different number inside the text

#### Subs with a different number inside the text: all sentences


In [157]:
# now the subs that don't appear in text: one reason is different number in the substitution. f.e. G12V -> G13V 
sub_number_text, sub_number_no_text = find_sub_numberChange(sub_no_text) # 131 cases with number change

In [158]:
## use tokenizer to split into sentences of subs that have different number in text 
NLTK_sub_number = [sent_tokenize(sub_number_text.Text[i]) for i in sub_number_text.index]
NLTK_sub_number_window = copy.deepcopy(NLTK_sub_number) # a deep copy is necessary besauce something magical and strange happening without it
NLTK_sub_number_copy = copy.deepcopy(NLTK_sub_number) # again for backup purposes

In [159]:
# extract window for the sub sentences where they appear
# !! Use [:] because it makes a copy and doesn't change anything to the original indexes
sub_number_sentences = get_sentences_sub_number(sub_number_text[:], NLTK_sub_number_copy[:])
sub_number_string = sentences_to_string(sub_number_sentences)

In [160]:
data_all.Text.loc[sub_number_text.index] = sub_number_string #iloc for indexing based on integers
data_all.Text.loc[sub_number_text.index]

37      His-tagged versions of the catalytic region of...
41      His-tagged versions of the catalytic region of...
53      His-tagged versions of the catalytic region of...
55      His-tagged versions of the catalytic region of...
58      His-tagged versions of the catalytic region of...
62      While both were missense mutations, in silico ...
150      placeholderMutation and T847I were also found...
153     Complex mutation  placeholderMutation and dele...
191     As a positive control, we introduced the well-...
198     And in the series of Pallis and colleagues (12...
211     Recurrent mutations in H3F3A, which encodes th...
212     Histone H3-K42me2 may regulate transcription  ...
213     Recurrent mutations in H3F3A, which encodes th...
394     Both combinations contained mutations of codon...
474     In the present study, we have focused on two d...
494     Such mutant amino acid specificity was also ap...
578     In contrast, ectopic expression of FGFR4  plac...
823     Recent

#### Subs with a different number inside the text: sentences with length <= 10

In [161]:
######### ######### We rerun for window. Length<2 : w = 3, Length >=6: w = 1, Length in between: w = 2
indexes = [index for index, sentences in enumerate(sub_number_sentences) if len(sentences) <= 10]
sentence_lengths = [len(sentences) for index, sentences in enumerate(sub_number_sentences) if len(sentences) <= 10]
NLTK_window = [NLTK_sub_number[i] for i in indexes]

In [162]:
new_index = sub_number_text.index[indexes]
sub_number_window = sub_number_text.loc[new_index] # gets the subs with the low length
sub_number_window #87 cases 

Unnamed: 0,Class,Gene,ID,Variation,Text
37,4.0,PTPRT,44,T1365M,"Tyrosine phosphorylation, regulated by protein..."
41,4.0,PTPRT,49,R1343L,"Tyrosine phosphorylation, regulated by protein..."
53,4.0,PTPRT,61,R1209W,"Tyrosine phosphorylation, regulated by protein..."
150,2.0,EGFR,208,V774A,Purpose: Clinical features of epidermal growth...
153,2.0,EGFR,213,V774M,Purpose: Clinical features of epidermal growth...
191,7.0,EGFR,267,G810S,Purpose: Epidermal growth factor receptor (EGF...
198,2.0,EGFR,275,L838P,Purpose: Clinical features of epidermal growth...
213,8.0,H3F3A,308,G35V,The DNA entry and exit points on the nucleosom...
394,4.0,TP53,521,E286K,The transcription factor and tumor suppressor ...
494,1.0,FBXW7,632,G423R,Background Melanoma is a heterogeneous tumor ...


In [163]:
sub_window_sentences = get_window_sub_number(sub_number_window[:], NLTK_window[:], sentence_lengths) 
sub_window_sentences = [sorted(set(sentences), key = sentences.index) for sentences in sub_window_sentences] # removes duplicates
sub_window_string = sentences_to_string(sub_window_sentences)
sub_window_sentences[24]

['Modeling of the pathogenic mutations was performed and analyzed using O [39].',
 'Atomic superimpositions were performed using program lsqkab [40] in CCP4 Suite [41] and structural representations were prepared using PyMol [42].',
 'Protein Expression and Purification The cDNA fragment encoding residues P459 to E769 of human FGFR2c (Accession code: NP_075259) was amplified by polymerase chain reaction and subcloned into pET bacterial expression vector with an NH2 terminal 6× His tag to aid in protein purification.',
 'Point mutations (M536I, M538I, I548V, N550H, N550K,  placeholderMutation, V565I, E566G, L618M, and K660E) were introduced using QuikChange Site-Directed Mutagenesis Kit (Stratagene, La Jolla, CA).',
 'The bacterial strain BL21 (DE3) cells were transformed with the expression constructs, and kinase expression was induced with 1 mM isopropyl-L-thio-BDgalactopyranoside overnight at the appropriate temperature.',
 'The cells were lysed, and the soluble kinase proteins were 

In [164]:
data_all.Text.loc[new_index] = sub_window_string
data_all.Text.loc[new_index]

37      Second, the prevalence of mutations in the cod...
41      Second, the prevalence of mutations in the cod...
53      Second, the prevalence of mutations in the cod...
150     Some other uncommon EGFR mutations, that is, t...
153     By searching the database (38), we found that ...
191     To test more directly whether the mutations EG...
198     However, case numbers were small in previous s...
213     Somatic mutations in the H3.3-ATRX-DAXX chroma...
394     1. The effect of suppressor amino acid changes...
494     Examining the spectrum of KRAS mutations, whic...
578     None of the somatic alleles of EPHA3, ERBB4, F...
825     Trimethylation of H3K27 is a mechanism for sup...
827     EZH2 is the catalytic subunit of the PRC2 comp...
830     1b have not previously been reported as statis...
831     1b have not previously been reported as statis...
862     Summary During the past decade, the treatment ...
865     The H1112 mutations were immediately adjacent ...
869     Mutati

In [165]:
data_all.loc[len(new_train):].Text[data_all.loc[len(new_train):].index[4]]

'We amplified all of the coding exons from 24 primary ESCC tumors and 16 tumor cell lines. We found three mutations in three ESCC primary tumors (Fig. 2 ⇓ ; Table 2 ⇓ ) and one mutation in a tumor cell line (Table 2) ⇓ . The amino acid changes in the mutations were R102K, A242T,  placeholderMutation and S623N. We also analyzed the blood DNAs from the three patients with mutations in the RNF6 gene, and all contained the wild-type allele (Fig. 2) ⇓ , thus indicating that the mutations in the ESCC primary tumors were acquired during tumorigenesis in somatic cells. RNF6 contains three domains: an arginine domain (amino acids 292–424), a poly-aspartic acid domain (amino acids 598–601), and a Zinc finger domain (amino acids 632–673). The mutation S623N, which is located near the Zinc finger domain, may affect its DNA-binding activity. Both A242T and  placeholderMutation may perturb the structure and function of the RNF6 protein. In the first case, the side chain of amino acid Thr is larger t

#### Subs that are still not found: all sentences

In [166]:
sub_number_no_text.shape # 365 left

(365, 5)

In [167]:
sub_number_no_text.loc[len(new_train):].shape # 62 in test

(62, 5)

In [168]:
texts_to_analyze = list(sub_number_no_text.loc[len(new_train):].Text)

In [169]:
len(set(texts_to_analyze)) # basically 31 texts the same!

31

In [170]:
texts_to_analyze[8]

"Ongoing cancer genome characterization studies continue to elucidate the spectrum of genomic abnormalities that drive many cancers, and in the clinical arena assessment of the driver genetic alterations in patients is playing an increasingly important diagnostic and/or prognostic role for many cancer types. However, the landscape of genomic abnormalities is still unknown for less common cancers, and the influence of specific genotypes on clinical behavior is often still unclear. To address some of these deficiencies, we developed Profile, a prospective cohort study to obtain genomic information on all patients at a large tertiary care medical center for cancer-related care. We enrolled patients with any cancer diagnosis, and, for each patient (unselected for cancer site or type) we applied mass spectrometric genotyping (OncoMap) of 471 common recurrent mutations in 41 cancer-related genes. We report the results of the first 5000 patients, of which 26% exhibited potentially actionable 

In [171]:
sub_number_no_text.loc[len(new_train):] # A lot of sentences starting with 'Among...'

Unnamed: 0,Class,Gene,ID,Variation,Text
2933,,TP53,16,Y234S,Among the best-studied therapeutic targets in ...
2937,,CSF1R,20,Y969H,The FMS gene encodes the functional cell surfa...
2944,,SYT6,31,A406T,The phosphatidylinositol-3-kinase (PI3K)/serin...
2949,,CSF1R,38,Y969F,Clinical characteristics.Adult-onset leukoence...
2962,,STK11,58,G163C,Germline mutation in serine/threonine kinase 1...
2979,,DCC,87,P1375H,DCC is a candidate tumor-suppressor gene encod...
2990,,ACVR1,104,R258M,Among the best-studied therapeutic targets in ...
3018,,STK11,144,R86G,Germline mutation in serine/threonine kinase 1...
3026,,AKT2,156,S302G,Ongoing cancer genome characterization studies...
3030,,MYC,161,P58L,Among the best-studied therapeutic targets in ...


In [172]:
# subs that are still not found, look based on a pattern like the word 'mutat', 'variat' or the gene in text
sub_pattern_text, sub_pattern_no_text = find_sub_pattern(sub_number_no_text) # 

In [173]:
sub_pattern_no_text # only 13 texts that still don't include the sentences of which three nulls and low text_words length, replace with nulls

Unnamed: 0,Class,Gene,ID,Variation,Text
108,5,EGFR,140,I491M,The accurate determination of perfluoroalkyl s...
111,2,EGFR,145,K467T,The accurate determination of perfluoroalkyl s...
185,2,EGFR,259,S464L,The accurate determination of perfluoroalkyl s...
292,4,TP53,416,P151S,The effects of chlorpromazine on various prope...
841,1,FANCA,1109,S1088F,
1078,6,FGFR3,1407,K508M,
1255,4,VHL,1613,L158Q,The case of a 40-year-old woman with severe ed...
1524,7,CARD11,1936,G116S,Left ventricular (LV) remodeling is a signific...
1528,7,CARD11,1942,E127G,Left ventricular (LV) remodeling is a signific...
1820,7,JAK1,2302,R724H,Regulatory T (T reg) cells have a major role i...


In [174]:
data_all.Text.loc[sub_pattern_no_text.index] = ''
data_all.Text.loc[sub_pattern_no_text.index]

108     
111     
185     
292     
841     
1078    
1255    
1524    
1528    
1820    
1827    
2220    
2427    
Name: Text, dtype: object

In [175]:
## use tokenizer to split into sentences of subs that have keywords or gene
NLTK_sub_pattern = [sent_tokenize(sub_pattern_text.Text[i]) for i in sub_pattern_text.index]

In [176]:
# !! Use [:] because it makes a copy and doesn't change anything to the original indexes
sub_pattern_sentences = get_sentences_pattern(sub_pattern_text[:], NLTK_sub_pattern[:])
sub_pattern_string = sentences_to_string(sub_pattern_sentences)

In [177]:
data_all.Text.iloc[sub_pattern_text.index] = sub_pattern_string #iloc for indexing based on integers
data_all.Text.iloc[sub_pattern_text.index]

39      Mutations in CBL, encoding an E3 ubiquitin lig...
67      This D1 domain contains only two serine residu...
83      Identification of a high-risk disease-causing ...
85      In this study, we profiled 70 colorectal cance...
87      Identification of a high-risk disease-causing ...
88      Most Lynch Syndrome-associated CRCs are report...
90      Identification of a high-risk disease-causing ...
91      Germ-line  placeholderGene mutations, which ar...
107     The  placeholderGene gene encodes subunit 1 of...
113     Purpose: Mutations in epidermal growth factor ...
116     Clinical data and epidermal growth factor rece...
122     Clinical data and epidermal growth factor rece...
126     Mutations in the epidermal growth factor recep...
131     In selected patients with advanced non-small c...
135     Purpose: Clinical reports about responsiveness...
137     ‘Classical' mutations in the  placeholderGene ...
154     Purpose: Clinical features of epidermal growth...
159     Introd

#### Subs that are still not found inside the text: sentences with length <= 10

In [178]:
######### ######### We rerun for window. Length<2 : w = 3, Length >=6: w = 1, Length in between: w = 2
indexes = [index for index, sentences in enumerate(sub_pattern_sentences) if len(sentences) <= 10]
sentence_lengths = [len(sentences) for index, sentences in enumerate(sub_pattern_sentences) if len(sentences) <=10]
NLTK_window = [NLTK_sub_pattern[i] for i in indexes]

In [179]:
sentence_lengths # 19 cases where we still look again, some of them from same text

[4, 4, 4, 4, 2, 2, 3, 5, 5, 5, 5, 3, 4, 5, 5, 5, 5, 5, 9]

In [180]:
new_index = sub_pattern_text.index[indexes]
sub_pattern_window = sub_pattern_text.loc[new_index] # gets the subs with the low length
sub_pattern_window #87 cases 

Unnamed: 0,Class,Gene,ID,Variation,Text
67,2.0,CCND3,80,I290A,The activities of cyclin D-dependent kinases s...
231,4.0,CDH1,344,A617T,E-cadherin is involved in the formation of cel...
233,4.0,CDH1,346,A634V,E-cadherin is involved in the formation of cel...
235,4.0,CDH1,348,T340A,E-cadherin is involved in the formation of cel...
450,1.0,SMAD4,581,R378A,The formation of protein complexes between pho...
461,1.0,SMAD4,593,D493A,The formation of protein complexes between pho...
505,4.0,CDKN2A,648,Q50*,"The p16 gene is located in chromosome 9p21, a ..."
543,4.0,CDKN2A,688,R79P,Cell division is controlled by a series of pos...
547,4.0,CDKN2A,692,G93W,Cell division is controlled by a series of pos...
548,4.0,CDKN2A,693,V118D,Cell division is controlled by a series of pos...


In [181]:
sub_window_sentences = get_window_pattern(sub_pattern_window[:], NLTK_window[:], sentence_lengths) 
sub_window_sentences = [sorted(set(sentences), key = sentences.index) for sentences in sub_window_sentences] # removes duplicates
sub_window_string = sentences_to_string(sub_window_sentences)

In [182]:
data_all.Text.loc[new_index] = sub_window_string
data_all.Text.loc[new_index]

67      GSK-3β is a major cyclin D1 kinase in lysates ...
231     E-cadherin is involved in the formation of cel...
233     E-cadherin is involved in the formation of cel...
235     E-cadherin is involved in the formation of cel...
450     The formation of protein complexes between pho...
461     The formation of protein complexes between pho...
505     The p16 gene is located in chromosome 9p21, a ...
543     4). Deletion or mutation of these CDK-inhibito...
547     4). Deletion or mutation of these CDK-inhibito...
548     4). Deletion or mutation of these CDK-inhibito...
664     In addition to its well-documented effects on ...
1226    HEREDITARY nonpolyposis colorectal cancer (HNP...
1997    However, cellular fluorescence evolves slowly ...
2363    Moreover, inactivation of the  placeholderGene...
2368    Moreover, inactivation of the  placeholderGene...
2370    Moreover, inactivation of the  placeholderGene...
2685    Hereditary nonpolyposis colon cancer (HN-PCC) ...
2814    4). De

# Extra investigation of test text

In [183]:
test_all = data_all.loc[len(new_train):]
all_text = list(test_all.Text)
test_all

Unnamed: 0,Class,Gene,ID,Variation,Text
2925,,WNT4,3,E216G,This unchanged corresponded consistent express...
2926,,SUCLA2,4,G118R,On placeholderMutation difference first expre...
2927,,CHEK2,6,E239K,"In 1996, two independent groups confirmed that..."
2928,,CHST3,7,T141M,We point S12 the the overexpression OSCST pla...
2929,,RNF6,8,G244D,We amplified all of the coding exons from 24 p...
2930,,SPAST,9,C448Y,Interestingly also indeed of novel and placeh...
2931,,SCN4A,11,V445M,Based of MATERIALS new concentrations placeho...
2932,,ERBB2,15,G746S,A total of 114 samples (41 wild-type and 73 mu...
2933,,TP53,16,Y234S,Among the best-studied therapeutic targets in ...
2934,,RAB27A,17,A87P,If cases used of 0.8 proof-of-principle with a...


In [184]:
positions_machine = []
all_together = []
for i in range(len(all_text)):
    text = all_text[i]
    if text.count('placeholderMutation') < 11:
        all_together.append((i, text.count('placeholderMutation')))
        positions_machine.append(i)

In [185]:
important_texts = [all_text[i] for i in positions_machine]
test_all.iloc[positions_machine]

Unnamed: 0,Class,Gene,ID,Variation,Text
2929,,RNF6,8,G244D,We amplified all of the coding exons from 24 p...
2932,,ERBB2,15,G746S,A total of 114 samples (41 wild-type and 73 mu...
2933,,TP53,16,Y234S,Among the best-studied therapeutic targets in ...
2937,,CSF1R,20,Y969H,This study reports on the frequency of point m...
2944,,SYT6,31,A406T,Each tumor encoded 18 shared and nine private ...
2949,,CSF1R,38,Y969F,Diagnosis/testing.The diagnosis is suspected i...
2952,,SMAD4,43,G386C,"3a). Examining the spectrum of KRAS mutations,..."
2954,,ERBB2,47,V762M,The ERBB2 mutations were detected in advanced ...
2956,,MET,50,M1268T,"Before these results were available, the patie..."
2959,,KRAS,54,Q61K,Although >90% of BRAF mutations in melanoma in...


In [186]:
all_together

[(4, 2),
 (7, 2),
 (8, 0),
 (12, 0),
 (19, 0),
 (24, 0),
 (27, 2),
 (29, 1),
 (31, 7),
 (34, 1),
 (37, 0),
 (46, 3),
 (48, 2),
 (54, 0),
 (65, 0),
 (67, 3),
 (83, 1),
 (85, 5),
 (93, 0),
 (101, 0),
 (105, 0),
 (108, 0),
 (120, 7),
 (123, 5),
 (125, 3),
 (128, 0),
 (143, 0),
 (147, 3),
 (148, 0),
 (160, 0),
 (164, 0),
 (166, 0),
 (179, 3),
 (182, 0),
 (188, 2),
 (210, 0),
 (225, 0),
 (234, 2),
 (252, 0),
 (255, 0),
 (264, 8),
 (266, 0),
 (267, 0),
 (270, 0),
 (272, 1),
 (273, 0),
 (282, 0),
 (285, 0),
 (286, 3),
 (291, 2),
 (294, 4),
 (295, 0),
 (296, 1),
 (297, 1),
 (300, 1),
 (302, 0),
 (304, 2),
 (306, 0),
 (309, 4),
 (324, 3),
 (326, 1),
 (329, 0),
 (332, 0),
 (333, 7),
 (336, 5),
 (344, 0),
 (346, 1),
 (350, 0),
 (352, 2),
 (360, 0),
 (373, 0),
 (375, 0),
 (378, 0),
 (379, 1),
 (393, 5),
 (395, 0),
 (396, 1),
 (402, 0),
 (409, 0),
 (415, 0),
 (420, 1),
 (432, 2),
 (444, 1),
 (446, 3),
 (449, 0),
 (450, 0),
 (454, 3),
 (455, 5),
 (459, 2),
 (465, 5),
 (479, 2),
 (480, 0),
 (483, 0),

In [187]:
test_important = test_all.iloc[positions_machine]
test_important

Unnamed: 0,Class,Gene,ID,Variation,Text
2929,,RNF6,8,G244D,We amplified all of the coding exons from 24 p...
2932,,ERBB2,15,G746S,A total of 114 samples (41 wild-type and 73 mu...
2933,,TP53,16,Y234S,Among the best-studied therapeutic targets in ...
2937,,CSF1R,20,Y969H,This study reports on the frequency of point m...
2944,,SYT6,31,A406T,Each tumor encoded 18 shared and nine private ...
2949,,CSF1R,38,Y969F,Diagnosis/testing.The diagnosis is suspected i...
2952,,SMAD4,43,G386C,"3a). Examining the spectrum of KRAS mutations,..."
2954,,ERBB2,47,V762M,The ERBB2 mutations were detected in advanced ...
2956,,MET,50,M1268T,"Before these results were available, the patie..."
2959,,KRAS,54,Q61K,Although >90% of BRAF mutations in melanoma in...


In [188]:
test_important.Text[test_important.index[20]]

'Among the best-studied therapeutic targets in human cancers are proteins encoded by genes with tumor-specific mutational hotspots, such as KRAS, NRAS, BRAF, KIT, and EGFR. The acquisition of somatic mutations is one of the major mechanisms responsible for the dysregulation of proliferation, invasion, and apoptosis that is required for oncogenesis. These computational approaches develop either gene-level statistical models that exploit different mutational patterns3–6 to identify significantly mutated genes or use weight-of-evidence-based methods1,7 that are heuristic and ratiometric in approach. However, emerging data indicate that different hotspot mutations in the same cancer gene can be functionally distinct in vitro and in vivo and display different clinical phenotypes and drug sensitivity8–11. To date, studies of hotspot mutations in cancer have been limited to within individual tumor types12–14 or have focused on individual cancer genes across tumor types15. A systematic populat

# Adding gene dummy and Full_text

In [189]:
shared_genes = list(set(train_all.Gene).intersection(set(test_all.Gene))) # the shared genes are those that appear in both
count_important = 0
for gene in test_important.Gene:
    if gene in shared_genes:
        count_important += 1

count_important # so 104 EXAMPLES in test where same gene appears as in the impo

104

In [190]:
len(shared_genes)

35

In [191]:
shared_genes

['PTEN',
 'XPO1',
 'IDH1',
 'BCL10',
 'TP53',
 'VHL',
 'MED12',
 'DNMT3A',
 'RAD50',
 'EGFR',
 'MET',
 'CHEK2',
 'CREBBP',
 'AKT2',
 'RAD54L',
 'FGFR2',
 'ERBB2',
 'PIK3CA',
 'CDKN2A',
 'BRAF',
 'MAP2K1',
 'RAC1',
 'ACVR1',
 'NRAS',
 'FGFR3',
 'STK11',
 'KRAS',
 'IDH2',
 'MYC',
 'NOTCH1',
 'CDK4',
 'PTPN11',
 'EPAS1',
 'CTNNB1',
 'SMAD4']

In [192]:
for gene in shared_genes:
    data_all[gene] = 0
data_all

Unnamed: 0,Class,Gene,ID,Variation,Text,PTEN,XPO1,IDH1,BCL10,TP53,...,STK11,KRAS,IDH2,MYC,NOTCH1,CDK4,PTPN11,EPAS1,CTNNB1,SMAD4
0,2,CBL,1,W802*,Using select c-CBL somatic mutations such as S...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,CBL,2,Q249E,Using select c-CBL somatic mutations such as S...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,CBL,3,N454D,View inlineView popup Table 2 Cases analyzed f...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,CBL,4,L399V,"For example, the Q249E mutant showed an increa...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,CBL,5,V391I,"For example, the Q249E mutant showed an increa...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,5,CBL,6,V430M,Figure 4 Figure 4 Structures of wild type (gre...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,4,CBL,8,Y371H,We investigated the mechanism by which CBL- pl...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,4,CBL,9,C384R,We investigated the mechanism by which CBL-Y37...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,4,CBL,10,P395A,"For example, the Q249E mutant showed an increa...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,4,CBL,11,K382E,"Purified PCR amplicons (CBLWT, CBLC381A, CBL p...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [193]:
for i in data_all.index:
    gene = data_all.Gene[i]
    if gene in shared_genes:
        data_all.loc[i, gene] = 1

data_all # it's not all zeros, it looks like it but genes like BRAF and EFGR have more 1's because they occur more

Unnamed: 0,Class,Gene,ID,Variation,Text,PTEN,XPO1,IDH1,BCL10,TP53,...,STK11,KRAS,IDH2,MYC,NOTCH1,CDK4,PTPN11,EPAS1,CTNNB1,SMAD4
0,2,CBL,1,W802*,Using select c-CBL somatic mutations such as S...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,CBL,2,Q249E,Using select c-CBL somatic mutations such as S...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,CBL,3,N454D,View inlineView popup Table 2 Cases analyzed f...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,CBL,4,L399V,"For example, the Q249E mutant showed an increa...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,CBL,5,V391I,"For example, the Q249E mutant showed an increa...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,5,CBL,6,V430M,Figure 4 Figure 4 Structures of wild type (gre...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,4,CBL,8,Y371H,We investigated the mechanism by which CBL- pl...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,4,CBL,9,C384R,We investigated the mechanism by which CBL-Y37...,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,4,CBL,10,P395A,"For example, the Q249E mutant showed an increa...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,4,CBL,11,K382E,"Purified PCR amplicons (CBLWT, CBLC381A, CBL p...",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [194]:
# We decided to go for 15 dummies for genes:
N_components = 15
svd = TruncatedSVD(n_components= N_components, n_iter=18, random_state=18)
data_text = data_all[['Class', 'Gene', 'ID', 'Variation','Text']] #5th is Text, just before the genes 
one_hot_gene = data_all[shared_genes]

truncated_one_hot_gene = svd.fit_transform(one_hot_gene.values)
print(svd.explained_variance_ratio_.sum())

0.846437054329


In [195]:
genes_df =pd.DataFrame(truncated_one_hot_gene,columns=["tsvd_gene"+ str(x) for x in range(N_components)])
genes_df

Unnamed: 0,tsvd_gene0,tsvd_gene1,tsvd_gene2,tsvd_gene3,tsvd_gene4,tsvd_gene5,tsvd_gene6,tsvd_gene7,tsvd_gene8,tsvd_gene9,tsvd_gene10,tsvd_gene11,tsvd_gene12,tsvd_gene13,tsvd_gene14
0,-4.687577e-24,-6.120573e-20,-2.015694e-20,-2.202339e-22,1.757299e-22,-5.107769e-24,4.393204e-25,1.168586e-25,1.300702e-26,-9.409370e-27,2.064121e-27,-2.000105e-27,-1.345011e-27,1.946342e-28,1.131490e-30
1,6.531017e-22,-1.105960e-18,7.176205e-17,6.392266e-19,-4.231921e-19,-2.200993e-20,-3.413029e-21,-6.911586e-23,7.166714e-24,-3.414389e-24,-8.294976e-25,4.138654e-24,-1.139305e-24,4.867798e-25,-3.041686e-27
2,-1.936228e-24,1.235412e-21,-9.429628e-20,1.236044e-17,-2.559256e-17,-1.224705e-18,4.534869e-21,-7.927475e-21,1.548052e-21,7.879409e-22,-1.680712e-22,3.254222e-22,-1.174251e-22,1.223404e-23,-2.468670e-26
3,-1.936252e-25,5.014914e-20,1.336022e-18,-3.148571e-15,-4.521623e-15,-3.777485e-16,-1.312519e-17,-2.967254e-18,-2.406352e-19,3.017615e-19,1.119697e-19,1.249843e-19,-1.130464e-20,1.574058e-20,-1.161984e-22
4,-5.494071e-26,9.687730e-23,-8.270054e-22,-2.321849e-19,1.723147e-18,-4.018228e-17,3.180975e-18,1.377191e-18,6.220148e-19,-1.946108e-19,-3.515165e-19,-2.582858e-19,5.891622e-21,-9.859626e-21,1.456015e-22
5,7.952901e-26,-1.557783e-22,4.261542e-21,1.230154e-18,9.799194e-19,-5.008013e-17,7.225287e-16,-2.504022e-17,-1.337896e-17,3.424846e-18,1.111619e-18,-5.907006e-20,7.160735e-19,-2.981643e-20,-2.359028e-22
6,2.322664e-27,1.012991e-24,4.924734e-23,1.256177e-20,5.618518e-20,1.144737e-18,-9.163914e-18,-1.171408e-16,-1.079938e-17,1.026808e-16,3.895431e-18,1.291907e-17,1.378850e-17,5.497033e-18,-4.194782e-20
7,5.928725e-27,-8.832844e-25,6.759233e-23,4.656283e-20,2.369380e-20,1.896838e-18,7.057712e-18,-1.788094e-16,9.163135e-16,-2.859316e-16,-4.262833e-16,-3.311566e-16,-7.823928e-17,7.478273e-17,-3.314364e-19
8,4.148274e-27,-3.627361e-24,3.518046e-23,7.118104e-22,-2.898922e-20,1.726778e-18,-4.692803e-18,1.710749e-17,6.879925e-16,6.765901e-16,3.948970e-16,5.810336e-16,-8.419593e-16,-3.675528e-16,2.853503e-18
9,-8.596720e-27,1.582616e-24,-1.525188e-22,-1.716718e-21,-1.965893e-19,-4.125585e-19,1.484085e-17,5.460050e-18,-1.341416e-15,-3.554197e-15,2.955287e-15,2.767873e-15,-4.564471e-16,-2.300869e-17,-2.570776e-20


In [208]:
data_final = pd.merge(data_text.reset_index(), genes_df.reset_index()).drop("index",axis=1)

In [209]:
data_final

Unnamed: 0,Class,Gene,ID,Variation,Text,tsvd_gene0,tsvd_gene1,tsvd_gene2,tsvd_gene3,tsvd_gene4,tsvd_gene5,tsvd_gene6,tsvd_gene7,tsvd_gene8,tsvd_gene9,tsvd_gene10,tsvd_gene11,tsvd_gene12,tsvd_gene13,tsvd_gene14
0,2,CBL,1,W802*,Using select c-CBL somatic mutations such as S...,-4.687577e-24,-6.120573e-20,-2.015694e-20,-2.202339e-22,1.757299e-22,-5.107769e-24,4.393204e-25,1.168586e-25,1.300702e-26,-9.409370e-27,2.064121e-27,-2.000105e-27,-1.345011e-27,1.946342e-28,1.131490e-30
1,2,CBL,2,Q249E,Using select c-CBL somatic mutations such as S...,6.531017e-22,-1.105960e-18,7.176205e-17,6.392266e-19,-4.231921e-19,-2.200993e-20,-3.413029e-21,-6.911586e-23,7.166714e-24,-3.414389e-24,-8.294976e-25,4.138654e-24,-1.139305e-24,4.867798e-25,-3.041686e-27
2,3,CBL,3,N454D,View inlineView popup Table 2 Cases analyzed f...,-1.936228e-24,1.235412e-21,-9.429628e-20,1.236044e-17,-2.559256e-17,-1.224705e-18,4.534869e-21,-7.927475e-21,1.548052e-21,7.879409e-22,-1.680712e-22,3.254222e-22,-1.174251e-22,1.223404e-23,-2.468670e-26
3,4,CBL,4,L399V,"For example, the Q249E mutant showed an increa...",-1.936252e-25,5.014914e-20,1.336022e-18,-3.148571e-15,-4.521623e-15,-3.777485e-16,-1.312519e-17,-2.967254e-18,-2.406352e-19,3.017615e-19,1.119697e-19,1.249843e-19,-1.130464e-20,1.574058e-20,-1.161984e-22
4,4,CBL,5,V391I,"For example, the Q249E mutant showed an increa...",-5.494071e-26,9.687730e-23,-8.270054e-22,-2.321849e-19,1.723147e-18,-4.018228e-17,3.180975e-18,1.377191e-18,6.220148e-19,-1.946108e-19,-3.515165e-19,-2.582858e-19,5.891622e-21,-9.859626e-21,1.456015e-22
5,5,CBL,6,V430M,Figure 4 Figure 4 Structures of wild type (gre...,7.952901e-26,-1.557783e-22,4.261542e-21,1.230154e-18,9.799194e-19,-5.008013e-17,7.225287e-16,-2.504022e-17,-1.337896e-17,3.424846e-18,1.111619e-18,-5.907006e-20,7.160735e-19,-2.981643e-20,-2.359028e-22
6,4,CBL,8,Y371H,We investigated the mechanism by which CBL- pl...,2.322664e-27,1.012991e-24,4.924734e-23,1.256177e-20,5.618518e-20,1.144737e-18,-9.163914e-18,-1.171408e-16,-1.079938e-17,1.026808e-16,3.895431e-18,1.291907e-17,1.378850e-17,5.497033e-18,-4.194782e-20
7,4,CBL,9,C384R,We investigated the mechanism by which CBL-Y37...,5.928725e-27,-8.832844e-25,6.759233e-23,4.656283e-20,2.369380e-20,1.896838e-18,7.057712e-18,-1.788094e-16,9.163135e-16,-2.859316e-16,-4.262833e-16,-3.311566e-16,-7.823928e-17,7.478273e-17,-3.314364e-19
8,4,CBL,10,P395A,"For example, the Q249E mutant showed an increa...",4.148274e-27,-3.627361e-24,3.518046e-23,7.118104e-22,-2.898922e-20,1.726778e-18,-4.692803e-18,1.710749e-17,6.879925e-16,6.765901e-16,3.948970e-16,5.810336e-16,-8.419593e-16,-3.675528e-16,2.853503e-18
9,4,CBL,11,K382E,"Purified PCR amplicons (CBLWT, CBLC381A, CBL p...",-8.596720e-27,1.582616e-24,-1.525188e-22,-1.716718e-21,-1.965893e-19,-4.125585e-19,1.484085e-17,5.460050e-18,-1.341416e-15,-3.554197e-15,2.955287e-15,2.767873e-15,-4.564471e-16,-2.300869e-17,-2.570776e-20


In [210]:
# Replace column name for text of window, and add new column of full text
data_final.insert(loc=4, column='Full_Text', value=data_all_backup.Text)
data_final.columns.values[5] = 'Window_Text'
data_final

Unnamed: 0,Class,Gene,ID,Variation,Full_Text,Window_Text,tsvd_gene0,tsvd_gene1,tsvd_gene2,tsvd_gene3,...,tsvd_gene5,tsvd_gene6,tsvd_gene7,tsvd_gene8,tsvd_gene9,tsvd_gene10,tsvd_gene11,tsvd_gene12,tsvd_gene13,tsvd_gene14
0,2,CBL,1,W802*,Abstract Background Non-small cell lung canc...,Using select c-CBL somatic mutations such as S...,-4.687577e-24,-6.120573e-20,-2.015694e-20,-2.202339e-22,...,-5.107769e-24,4.393204e-25,1.168586e-25,1.300702e-26,-9.409370e-27,2.064121e-27,-2.000105e-27,-1.345011e-27,1.946342e-28,1.131490e-30
1,2,CBL,2,Q249E,Abstract Background Non-small cell lung canc...,Using select c-CBL somatic mutations such as S...,6.531017e-22,-1.105960e-18,7.176205e-17,6.392266e-19,...,-2.200993e-20,-3.413029e-21,-6.911586e-23,7.166714e-24,-3.414389e-24,-8.294976e-25,4.138654e-24,-1.139305e-24,4.867798e-25,-3.041686e-27
2,3,CBL,3,N454D,Recent evidence has demonstrated that acquired...,View inlineView popup Table 2 Cases analyzed f...,-1.936228e-24,1.235412e-21,-9.429628e-20,1.236044e-17,...,-1.224705e-18,4.534869e-21,-7.927475e-21,1.548052e-21,7.879409e-22,-1.680712e-22,3.254222e-22,-1.174251e-22,1.223404e-23,-2.468670e-26
3,4,CBL,4,L399V,Oncogenic mutations in the monomeric Casitas B...,"For example, the Q249E mutant showed an increa...",-1.936252e-25,5.014914e-20,1.336022e-18,-3.148571e-15,...,-3.777485e-16,-1.312519e-17,-2.967254e-18,-2.406352e-19,3.017615e-19,1.119697e-19,1.249843e-19,-1.130464e-20,1.574058e-20,-1.161984e-22
4,4,CBL,5,V391I,Oncogenic mutations in the monomeric Casitas B...,"For example, the Q249E mutant showed an increa...",-5.494071e-26,9.687730e-23,-8.270054e-22,-2.321849e-19,...,-4.018228e-17,3.180975e-18,1.377191e-18,6.220148e-19,-1.946108e-19,-3.515165e-19,-2.582858e-19,5.891622e-21,-9.859626e-21,1.456015e-22
5,5,CBL,6,V430M,Oncogenic mutations in the monomeric Casitas B...,Figure 4 Figure 4 Structures of wild type (gre...,7.952901e-26,-1.557783e-22,4.261542e-21,1.230154e-18,...,-5.008013e-17,7.225287e-16,-2.504022e-17,-1.337896e-17,3.424846e-18,1.111619e-18,-5.907006e-20,7.160735e-19,-2.981643e-20,-2.359028e-22
6,4,CBL,8,Y371H,Abstract Juvenile myelomonocytic leukemia (JM...,We investigated the mechanism by which CBL- pl...,2.322664e-27,1.012991e-24,4.924734e-23,1.256177e-20,...,1.144737e-18,-9.163914e-18,-1.171408e-16,-1.079938e-17,1.026808e-16,3.895431e-18,1.291907e-17,1.378850e-17,5.497033e-18,-4.194782e-20
7,4,CBL,9,C384R,Abstract Juvenile myelomonocytic leukemia (JM...,We investigated the mechanism by which CBL-Y37...,5.928725e-27,-8.832844e-25,6.759233e-23,4.656283e-20,...,1.896838e-18,7.057712e-18,-1.788094e-16,9.163135e-16,-2.859316e-16,-4.262833e-16,-3.311566e-16,-7.823928e-17,7.478273e-17,-3.314364e-19
8,4,CBL,10,P395A,Oncogenic mutations in the monomeric Casitas B...,"For example, the Q249E mutant showed an increa...",4.148274e-27,-3.627361e-24,3.518046e-23,7.118104e-22,...,1.726778e-18,-4.692803e-18,1.710749e-17,6.879925e-16,6.765901e-16,3.948970e-16,5.810336e-16,-8.419593e-16,-3.675528e-16,2.853503e-18
9,4,CBL,11,K382E,Noonan syndrome is an autosomal dominant conge...,"Purified PCR amplicons (CBLWT, CBLC381A, CBL p...",-8.596720e-27,1.582616e-24,-1.525188e-22,-1.716718e-21,...,-4.125585e-19,1.484085e-17,5.460050e-18,-1.341416e-15,-3.554197e-15,2.955287e-15,2.767873e-15,-4.564471e-16,-2.300869e-17,-2.570776e-20


In [551]:
new_train_1 = data_final.iloc[:len(new_train)]
new_test_1 = data_final.iloc[len(new_train):]

In [553]:
new_test_1

Unnamed: 0,Class,Gene,ID,Variation,Window_Text,tsvd_gene0,tsvd_gene1,tsvd_gene2,tsvd_gene3,tsvd_gene4,tsvd_gene5,tsvd_gene6,tsvd_gene7,tsvd_gene8,tsvd_gene9,tsvd_gene10,tsvd_gene11,tsvd_gene12,tsvd_gene13,tsvd_gene14
2925,,WNT4,3,E216G,This unchanged corresponded consistent express...,-0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-0.000000e+00,-0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-0.000000e+00,0.000000e+00
2926,,SUCLA2,4,G118R,On placeholderMutation difference first expre...,-0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-0.000000e+00,-0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-0.000000e+00,0.000000e+00
2927,,CHEK2,6,E239K,"In 1996, two independent groups confirmed that...",-1.711680e-21,-1.038383e-18,-9.038207e-18,1.504303e-18,-7.811160e-19,-1.061855e-19,1.425506e-17,-8.953001e-19,-2.444479e-17,-4.234301e-17,6.104390e-18,6.016692e-16,5.515131e-18,3.307617e-17,-2.938149e-16
2928,,CHST3,7,T141M,We point S12 the the overexpression OSCST pla...,-0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-0.000000e+00,-0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-0.000000e+00,0.000000e+00
2929,,RNF6,8,G244D,The fourth exon required five overlapping PCR ...,-0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-0.000000e+00,-0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-0.000000e+00,0.000000e+00
2930,,SPAST,9,C448Y,Interestingly also indeed of novel and placeh...,-0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-0.000000e+00,-0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-0.000000e+00,0.000000e+00
2931,,SCN4A,11,V445M,Based of MATERIALS new concentrations placeho...,-0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-0.000000e+00,-0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-0.000000e+00,0.000000e+00
2932,,ERBB2,15,G746S,OncoMap was performed in triplicate across thr...,-3.230124e-22,4.651652e-19,6.359142e-18,-3.677714e-16,1.000000e+00,1.024989e-15,-2.572598e-16,-2.704591e-16,2.461913e-17,9.925581e-18,1.448375e-17,3.974184e-18,8.202620e-18,-4.961602e-18,-2.067585e-17
2933,,TP53,16,Y234S,Among the best-studied therapeutic targets in ...,1.000000e+00,-4.429340e-17,4.186066e-18,-1.445982e-21,1.431285e-22,3.018405e-22,-8.902819e-22,8.612156e-21,7.521920e-20,-5.393804e-20,-1.079923e-20,-3.159632e-19,-6.834865e-19,4.860392e-19,5.353019e-18
2934,,RAB27A,17,A87P,If cases used of 0.8 proof-of-principle with a...,-0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-0.000000e+00,-0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,-0.000000e+00,0.000000e+00


In [554]:
new_train_1.shape

(2925, 20)

In [555]:
new_test_1.shape

(619, 20)

In [148]:
new_train_1.to_csv("checkpoints_databases/new_working_train.csv",index=False,encoding="utf8")
new_test_1.to_csv("checkpoints_databases/new_working_test.csv",index=False,encoding="utf8")