This notebook prepares the data files so that they can be used for training the model. The training and test data frames are saved as pickles.

# Imports

In [2]:
from sklearn.utils import shuffle
import pandas as pd
import pickle
import sys
sys.path.append('/mnt/4_TB_HD/ramona/utils')

# Modeling the text

## Transform the Training Set into the Right Format


In [3]:
# Load the variants table with pandas' default CSV settings.
training_variants_df = pd.read_csv("../utils/data/training_variants")
# The text file uses the two-character token "||" between ID and text. `sep` is
# interpreted as a regular expression here, so the pipes must be escaped; the
# pattern is a raw string because "\|" in a normal string literal is an invalid
# escape sequence. The first line of the file is skipped and the column names
# are supplied explicitly.
training_text_df = pd.read_csv(
    "../utils/data/training_text",
    sep=r"\|\|",
    engine="python",
    skiprows=1,
    names=["ID", "Text"],
)
print("Training Variants".ljust(15), training_variants_df.shape)
print("Train Text".ljust(15), training_text_df.shape)

print(training_variants_df.head())
print(training_text_df.shape)
training_text_df.head()

Training Variants (3321, 4)
Train Text      (3321, 2)
   ID    Gene             Variation  Class
0   0  FAM58A  Truncating Mutations      1
1   1     CBL                 W802*      2
2   2     CBL                 Q249E      2
3   3     CBL                 N454D      3
4   4     CBL                 L399V      4
(3321, 2)


Unnamed: 0,ID,Text
0,0,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,Abstract Background Non-small cell lung canc...
2,2,Abstract Background Non-small cell lung canc...
3,3,Recent evidence has demonstrated that acquired...
4,4,Oncogenic mutations in the monomeric Casitas B...


In [4]:
def translate_classes(classe):
    """Translate a numeric class label into its textual meaning.

    Parameters
    ----------
    classe : int or any value accepted by ``int()``
        Class label; the known labels are 1 to 9.

    Returns
    -------
    str
        The textual meaning associated with the label.

    Raises
    ------
    ValueError
        If the label is not one of the known classes (instead of failing with
        an unbound local variable).
    """
    meanings = {
        1: 'Likely Loss-of-function',
        2: 'Likely Gain-of-function',
        3: 'Neutral',
        4: 'Loss-of-function',
        5: 'Likely Neutral',
        6: 'Inconclusive',
        7: 'Gain-of-function',
        8: 'Likely Switch-of-function',
        9: 'Switch-of-function',
    }
    label = int(classe)
    try:
        return meanings[label]
    except KeyError:
        raise ValueError(
            "Unknown class label %r; expected an integer from 1 to 9" % (classe,)
        ) from None




Merge the training text and variants into one data frame and drop the rows whose text has no content.

In [5]:
# Join variants and texts on their shared ID column.
training_merge_df = training_variants_df.merge(training_text_df, on="ID")
print(training_merge_df.shape)

# Number of whitespace-separated tokens per text. A missing text is NaN, which
# str() turns into the single token 'nan', so a count of 1 marks rows without
# usable text.
training_merge_df['Text_count'] = training_merge_df["Text"].apply(lambda x: len(str(x).split()))
print(training_merge_df.loc[training_merge_df['Text_count'] == 1])
# .copy() makes the filtered frame independent of the unfiltered one, so that
# the column assignments in later cells do not raise SettingWithCopyWarning.
training_merge_df = training_merge_df[training_merge_df['Text_count'] != 1].copy()

#training_merge_df['Class'] = training_merge_df["Class"].apply(lambda x: translate_classes(x) )
print(training_merge_df.shape)
#training_merge_df = shuffle(training_merge_df)
training_merge_df.head()



(3321, 5)
        ID    Gene             Variation  Class Text  Text_count
1109  1109   FANCA                S1088F      1  NaN           1
1277  1277  ARID5B  Truncating Mutations      1  NaN           1
1407  1407   FGFR3                 K508M      6  NaN           1
1639  1639    FLT1         Amplification      6  NaN           1
2755  2755    BRAF                 G596C      7  NaN           1
(3316, 6)


Unnamed: 0,ID,Gene,Variation,Class,Text,Text_count
0,0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...,6089
1,1,CBL,W802*,2,Abstract Background Non-small cell lung canc...,5756
2,2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...,5756
3,3,CBL,N454D,3,Recent evidence has demonstrated that acquired...,5572
4,4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...,6202


In [35]:
#training_merge_df.to_csv('training_data_presentation.csv')

In [20]:
# Pickle the Gene/Variation columns of the training frame under a
# training-specific name. 'test_variants_filtered' is the path the test-set
# section of this notebook writes to, so reusing it here would let the two
# dumps overwrite each other.
training_merge_df[['Gene', 'Variation']].to_pickle('train_variants_filtered')

There were 5 IDs that had no text.

In [21]:
# Swap every tab character in the training texts for a single space.
training_merge_df['Text'] = training_merge_df['Text'].map(
    lambda text: text.replace('\t', ' ')
)


Saving the text of the training_merge_df combined with the class as a txt file for evaluation

In [12]:
#training_merge_df[['Class','Text']].to_csv(r'training_text_with_label.txt', header=None, index=None, sep='\t')


In [23]:
def extract_text_sections(Text, Gene, Variation, ID):
    """Return the parts of ``Text`` that surround each mention of ``Variation``.

    Every occurrence of ``Variation`` (found with case-sensitive ``str.find``)
    contributes a window of characters before and after its start position.
    If ``Variation`` does not occur at all, the generic keyword ``'mutation'``
    is searched for instead. If that is absent too, the whole text is returned.

    Window size on each side of an occurrence:

    * 1 to 4 occurrences: ``2000 // number_of_occurrences`` characters
    * 5 or more occurrences: 1000 characters

    Windows are clamped to the bounds of the text. Where a window reaches back
    into text that an earlier window already contributed, only the part not
    yet covered is appended, so no passage appears twice.

    Parameters
    ----------
    Text : str
        Text to search in.
    Gene : str
        Not used by the extraction; kept so existing callers keep working.
    Variation : str
        Keyword to look for in ``Text``.
    ID : any
        Not used by the extraction; kept so existing callers keep working.

    Returns
    -------
    str
        The concatenated sections with tab characters replaced by spaces.
    """
    keyword = Variation
    if Text.find(keyword) == -1:
        # Fall back to a generic keyword when the exact variation is not mentioned.
        keyword = 'mutation'

    # Start offsets of all occurrences (searching resumes one character after
    # each hit, so overlapping occurrences are counted as well).
    positions = []
    pos = Text.find(keyword)
    while pos != -1:
        positions.append(pos)
        pos = Text.find(keyword, pos + 1)

    count = len(positions)
    if count == 0:
        return Text.replace('\t', ' ')

    # Exactly one window size applies per call: few mentions share a budget of
    # 2000 characters per side, many mentions get a fixed 1000 per side.
    half_window = 2000 // count if count < 5 else 1000

    text_length = len(Text)
    pieces = []
    covered_to = 0  # exclusive end of the text already copied into `pieces`
    for pos in positions:
        # Slice ends are exclusive, so clamping to len(Text) keeps the last character.
        end = min(pos + half_window, text_length)
        # Never start before the text that has already been emitted.
        start = max(pos - half_window, covered_to)
        pieces.append(Text[start:end])
        covered_to = end

    return ''.join(pieces).replace('\t', ' ')
   



In [26]:
# Run the section extraction row by row on lower-cased text, gene and variation.
# Row values are read by column label: integer keys such as row[0] on a
# label-indexed row rely on a positional fallback that pandas has deprecated.
training_merge_df['modified_text'] = training_merge_df[['Text', 'Gene', 'Variation', 'ID']].apply(
    lambda row: extract_text_sections(
        row['Text'].lower(),
        row['Gene'].lower(),
        row['Variation'].lower(),
        row['ID'],
    ),
    axis=1,
)

training_merge_df.head()

Unnamed: 0,ID,Gene,Variation,Class,Text,Text_count,modified_text
0,0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...,6089,cyclin-dependent kinases (cdks) regulate a var...
1,1,CBL,W802*,2,Abstract Background Non-small cell lung canc...,5756,ncer (nsclc) is a heterogeneous group of disor...
2,2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...,5756,ll lung cancer (nsclc) is a heterogeneous grou...
3,3,CBL,N454D,3,Recent evidence has demonstrated that acquired...,5572,alysis but failed to detect any further sequen...
4,4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...,6202,d) compared to either a549 or hek293t cells (m...


In [27]:
# Pickle the whole training frame (with every column it has at this point)
# to a path relative to the working directory.
training_merge_df.to_pickle('train_1000.sav')

In [8]:
# X_train is pickled by later cells but was not assigned in any cell of this
# notebook. It is taken from the extracted sections, mirroring how X_test is
# built for the test set.
# NOTE(review): confirm that the extracted sections (not the full 'Text'
# column) are the intended training input.
X_train = training_merge_df['modified_text']
# Labels: the Class column of the training frame.
Y_train = training_merge_df['Class']
Y_train.head()

0    1
1    2
2    2
3    3
4    4
Name: Class, dtype: int64

In [9]:
# Pickle X_train and Y_train to paths relative to the working directory.
X_train.to_pickle('X_train_full.sav')
Y_train.to_pickle('Y_train_full.sav')

## Transform the Test Set into the Right Format 

In [3]:
# Load the test files

test_variants_df = pd.read_csv("../utils/data/test_variants.csv")
# ID and text are separated by the token "||". `sep` is a regular expression,
# so the pipes are escaped, and the pattern is a raw string because "\|" in a
# normal string literal is an invalid escape sequence. The first line of the
# file is skipped and the column names are supplied explicitly.
test_text_df = pd.read_csv(
    "../utils/data/test_text.csv",
    sep=r"\|\|",
    engine="python",
    skiprows=1,
    names=["ID", "Text"],
)
solution = pd.read_csv("../utils/data/solution_filtered.csv")
print("Test Variant".ljust(15), test_variants_df.shape)
print("Test Text".ljust(15), test_text_df.shape)
print("Solution".ljust(15), solution.shape)
print(solution.head(5))

# Join variants and texts on their shared ID column.
test_merge_df = test_variants_df.merge(test_text_df, on="ID")
print('Test Merge'.ljust(15),test_merge_df.shape)
test_merge_df.head(5)




Test Variant    (5668, 3)
Test Text       (5668, 2)
Solution        (368, 10)
   ID  class1  class2  class3  class4  class5  class6  class7  class8  class9
0  12       1       0       0       0       0       0       0       0       0
1  19       0       1       0       0       0       0       0       0       0
2  21       0       1       0       0       0       0       0       0       0
3  55       0       0       0       1       0       0       0       0       0
4  64       0       0       0       1       0       0       0       0       0
Test Merge      (5668, 4)


Unnamed: 0,ID,Gene,Variation,Text
0,0,ACSL4,R570S,2. This mutation resulted in a myeloproliferat...
1,1,NAGLU,P521L,Abstract The Large Tumor Suppressor 1 (LATS1)...
2,2,PAH,L333F,Vascular endothelial growth factor receptor (V...
3,3,ING1,A148D,Inflammatory myofibroblastic tumor (IMT) is a ...
4,4,TMEM216,G77A,Abstract Retinoblastoma is a pediatric retina...


In [4]:
# Derive the test labels from the `solution` data frame.

# Relabel the nine indicator columns as the integers 1..9; the label of the
# column holding the row maximum is then the class number itself.
solution.columns = ['ID'] + list(range(1, 10))
Class_df = pd.DataFrame({
    'ID': solution['ID'],
    'Class': solution[list(range(1, 10))].idxmax(axis=1),
})

Class_df.head(5)  # preview of the labels taken from solution



Unnamed: 0,ID,Class
0,12,1
1,19,2
2,21,2
3,55,4
4,64,4


In [13]:
# Keep only the test rows that have a class in Class_df (inner join on ID).
test_classified_df = test_merge_df.merge(Class_df, on="ID")
print('Test classified'.ljust(15),test_classified_df.shape)

# Number of whitespace-separated tokens per text. A missing text is NaN, which
# str() turns into the single token 'nan', so a count of 1 marks rows without
# usable text.
test_classified_df['Text_count'] = test_classified_df["Text"].apply(lambda x: len(str(x).split()))
print(test_classified_df.loc[test_classified_df['Text_count'] == 1])
# .copy() makes the filtered frame independent of the unfiltered one, so that
# the column assignment in a later cell does not raise SettingWithCopyWarning.
test_classified_df = test_classified_df[test_classified_df['Text_count'] != 1].copy()

print('Test classified'.ljust(15),test_classified_df.shape)
test_classified_df.head(5)

Test classified (368, 5)
       ID   Gene      Variation Text  Class  Text_count
105  1623  AURKB  Amplification  NaN      2           1
Test classified (367, 6)


Unnamed: 0,ID,Gene,Variation,Text,Class,Text_count
0,12,TET2,Y1902A,TET proteins oxidize 5-methylcytosine (5mC) on...,1,7123
1,19,MTOR,D2512H,Genes encoding components of the PI3K-Akt-mTOR...,2,4283
2,21,KIT,D52N,Myeloproliferative disorders (MPD) constitute ...,2,2477
3,55,SPOP,F125V,"In the largest E3 ligase subfamily, Cul3 binds...",4,22727
4,64,KEAP1,C23Y,Keap1 is the substrate recognition module of a...,4,2854


In [14]:
# Run the section extraction row by row on lower-cased text, gene and variation.
# Row values are read by column label: integer keys such as row[0] on a
# label-indexed row rely on a positional fallback that pandas has deprecated.
test_classified_df['modified_text'] = test_classified_df[['Text', 'Gene', 'Variation', 'ID']].apply(
    lambda row: extract_text_sections(
        row['Text'].lower(),
        row['Gene'].lower(),
        row['Variation'].lower(),
        row['ID'],
    ),
    axis=1,
)


In [15]:
# Show only the first ten rows instead of rendering the whole frame.
test_classified_df.head(10)

Unnamed: 0,ID,Gene,Variation,Text,Class,Text_count,modified_text
0,12,TET2,Y1902A,TET proteins oxidize 5-methylcytosine (5mC) on...,1,7123,the binding of purified tet2 proteins to a fa...
1,19,MTOR,D2512H,Genes encoding components of the PI3K-Akt-mTOR...,2,4283,red somatic point mutations (supplementary tab...
2,21,KIT,D52N,Myeloproliferative disorders (MPD) constitute ...,2,2477,myeloproliferative disorders (mpd) constitute ...
3,55,SPOP,F125V,"In the largest E3 ligase subfamily, Cul3 binds...",4,22727,"re molecular signatures database (msigdb, cont..."
4,64,KEAP1,C23Y,Keap1 is the substrate recognition module of a...,4,2854,keap1 is the substrate recognition module of a...
5,70,CDKN2A,G35V,Inherited mutations affecting the INK4a/ARF lo...,4,12660,inherited mutations affecting the ink4a/arf lo...
6,89,IDH1,R132S,To assess the value of anti-isocitrate dehydro...,9,27549,09 was generally crisper with a better signal-...
7,93,KIT,N822K,c-KIT is a member of the type 3 subclass of tr...,7,34300,"us 10% fetal calf serum (fcs), and 4 mg cell l..."
8,111,ETV1,Amplification,E26 transformation-specific (ETS) transcriptio...,7,7082,to be fully explored. we have completed a com...
9,113,JAK1,F958C,Activating mutations in JAK1 and JAK2 have bee...,7,4878,pocket formed between the jh2 αc helix and an ...


In [9]:
# Pickle the Gene and Variation columns of test_classified_df to a path
# relative to the working directory.
test_classified_df[['Gene', 'Variation']].to_pickle('test_variants_filtered')

In [37]:
# X_test: the extracted text sections for every test row, computed from the
# lower-cased text, gene and variation. Row values are read by column label:
# integer keys such as row[0] on a label-indexed row rely on a positional
# fallback that pandas has deprecated.
X_test = test_classified_df[['Text', 'Gene', 'Variation', 'ID']].apply(
    lambda row: extract_text_sections(
        row['Text'].lower(),
        row['Gene'].lower(),
        row['Variation'].lower(),
        row['ID'],
    ),
    axis=1,
)
X_test.shape


(367,)

In [11]:
# Disabled alternative: take X_test from the unprocessed 'Text' column.
#X_test=test_classified_df['Text']
# Y_test is the Class column of test_classified_df.
Y_test = test_classified_df['Class']

In [16]:
# Pickle the whole test frame (with every column it has at this point) to a
# path relative to the working directory.
test_classified_df.to_pickle('test_1000.sav')
# Disabled: separate dump of the labels only.
#Y_test.to_pickle('Y_test_1000.sav')

# Saving the training and test set

In [None]:
# Store the four train/test objects temporarily as pickles under /tmp
for dataset, tmp_path in (
    (X_train, '/tmp/X_train_1000.sav'),
    (Y_train, '/tmp/Y_train_1000.sav'),
    (X_test, '/tmp/X_test_1000.sav'),
    (Y_test, '/tmp/Y_test_1000.sav'),
):
    dataset.to_pickle(tmp_path)

In [None]:
# Maps the path of each temporary pickle under /tmp to a short name for it.
pickled_df = {
    '/tmp/{}.sav'.format(dataset): dataset
    for dataset in ('X_train_1000', 'Y_train_1000', 'X_test_1000', 'Y_test_1000')
}