# Imports

In [2]:
import datetime
import lightgbm as lgb
from sklearn import *
import sklearn
import pandas as pd
import numpy as np
import pickle
import sys
sys.path.append('../utils/')
from Read_data import read_data
from evaluation import evaluation_class
import ourPreprocessor
import csv


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Load Training data (included stage1 test data set) 

In [5]:
train = pd.read_csv("../utils/data/training_variants")
trainx = pd.read_csv('../utils/data/training_text', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])
train_merge_df = pd.merge(train, trainx, how='left', on='ID')
train_merge_df = train_merge_df.loc[~train_merge_df.Text.isnull()]
print(train_merge_df.shape)
train_merge_df.head()

(3316, 5)


Unnamed: 0,ID,Gene,Variation,Class,Text
0,0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


In [6]:
#load stage1 test files
test = pd.read_csv("../utils/data/test_variants.csv")
testx = pd.read_csv("../utils/data/test_text.csv", sep="\|\|", engine="python", skiprows=1, names=["ID", "Text"])
solution = pd.read_csv("../utils/data/solution_filtered.csv")
test_merge_df = test.merge(testx,left_on="ID",right_on="ID")

#add the classification to the test_merge
solution.columns = ['ID', 1, 2, 3, 4, 5, 6,
       7, 8, 9]
Class_df = pd.DataFrame()
Class_df['ID'] = solution['ID']
Class_df['Class'] =  solution[[1, 2, 3, 4, 5, 6,
       7, 8, 9]].idxmax(axis=1)
test_classified_df = test_merge_df.merge(Class_df,left_on="ID",right_on="ID")
test_classified_df = test_classified_df.loc[~test_classified_df.Text.isnull()] #delete samples with NaN as text
train = train_merge_df.append(test_classified_df)
print(train.shape)
train.head()


(3683, 5)


Unnamed: 0,Class,Gene,ID,Text,Variation
0,1,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations
1,2,CBL,1,Abstract Background Non-small cell lung canc...,W802*
2,2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E
3,3,CBL,3,Recent evidence has demonstrated that acquired...,N454D
4,4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V


In [7]:
y_train = train['Class'].values

## Load test data (stage2 test) 


In [8]:
stage2_testx = pd.read_csv('../utils/stage2_data/stage2_test_text.csv', sep="\|\|", engine='python', header=None, skiprows=1, names=["ID","Text"])
stage2_test = pd.read_csv('/mnt/4_TB_HD/ramona/utils/stage2_data/stage2_test_variants.csv')
stage2_solution = pd.read_csv('../utils/stage2_data/stage_2_private_solution.csv')
stage2_merge_df= pd.merge(stage2_test, stage2_testx, how='left', on='ID').fillna('')

#transform classification and take only those stage 2 samples with solution (125 out of 986)
stage2_solution.columns = ['ID', 1, 2, 3, 4, 5, 6,
       7, 8, 9]
stage2_Class_df = pd.DataFrame()
stage2_Class_df['ID'] = stage2_solution['ID']
stage2_Class_df['Class'] = stage2_solution[[1, 2, 3, 4, 5, 6,
       7, 8, 9]].idxmax(axis=1)

stage2_classified_df = stage2_merge_df.merge(stage2_Class_df,left_on="ID",right_on="ID")
y_test = stage2_classified_df['Class'].values
stage2 = stage2_classified_df.drop(['Class'], axis=1)
print(stage2.shape)
stage2.head()


(125, 4)


Unnamed: 0,ID,Gene,Variation,Text
0,8,RNF6,G244D,Human ESCCs 2 occur frequently worldwide (1) ....
1,15,ERBB2,G746S,The protein-kinase family is the most frequent...
2,16,TP53,Y234S,Among the best-studied therapeutic targets in ...
3,18,EGFR,P546S,Head and neck squamous cell carcinoma (HNSCC) ...
4,19,ERBB2,G279E,Functional characterization of cancer-associat...


In [9]:
#Classification for test and train
y = np.append(y_train, y_test)

### Preprocessing the data set 

In [5]:
df_all = pd.concat((train, stage2), axis=0, ignore_index=True)
df_all['Gene_Share'] = df_all.apply(lambda r: sum([1 for w in r['Gene'].split(' ') if w in r['Text'].split(' ')]), axis=1)
df_all['Variation_Share'] = df_all.apply(lambda r: sum([1 for w in r['Variation'].split(' ') if w in r['Text'].split(' ')]), axis=1)

gen_var_lst = sorted(list(train.Gene.unique()) + list(train.Variation.unique()))
print(len(gen_var_lst))
gen_var_lst = [x for x in gen_var_lst if len(x.split(' '))==1]
print(len(gen_var_lst))
i_ = 0

for c in df_all.columns:
    if df_all[c].dtype == 'object':
        if c in ['Gene','Variation']:
            lbl = preprocessing.LabelEncoder()
            df_all[c+'_lbl_enc'] = lbl.fit_transform(df_all[c].values)  
            df_all[c+'_len'] = df_all[c].map(lambda x: len(str(x)))
            df_all[c+'_words'] = df_all[c].map(lambda x: len(str(x).split(' ')))
        elif c != 'Text':
            lbl = preprocessing.LabelEncoder()
            df_all[c] = lbl.fit_transform(df_all[c].values)
        if c=='Text': 
            df_all[c+'_len'] = df_all[c].map(lambda x: len(str(x)))
            df_all[c+'_words'] = df_all[c].map(lambda x: len(str(x).split(' '))) 

class cust_regression_vals(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        x = x.drop(['Gene', 'Variation','ID','Text'],axis=1).values
        return x

class cust_txt_col(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    def __init__(self, key):
        self.key = key
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        return x[self.key].apply(str)

3573
3392


## Create new train and text set 

In [6]:
# determine length of train set
length_of_train = len(train)

# shuffle data set
df_all['Class'] = y
df_all = df_all.sample(frac=1)
y_train_shuffled = df_all['Class'].iloc[:length_of_train]
y_test_shuffled = df_all['Class'].iloc[length_of_train:]
df_all = df_all.drop(['Class'], axis=1)
    
train = df_all.iloc[:length_of_train]; print(train.shape)  
test = df_all.iloc[slength_of_train:]; print(test.shape)

size = 3808-length_of_train

(3683, 14)
(125, 14)


## Preprocessing and Vectorizing

In [7]:
print('Pipeline...')
fp = pipeline.Pipeline([
    ('union', pipeline.FeatureUnion(
        n_jobs = 1,
        transformer_list = [
            ('standard', cust_regression_vals()),
            ('pi1', pipeline.Pipeline([('Gene', cust_txt_col('Gene')), ('count_Gene', feature_extraction.text.CountVectorizer(analyzer=u'char', ngram_range=(1, 8))), ('tsvd1', decomposition.TruncatedSVD(n_components=25, n_iter=25, random_state=12))])),
            ('pi2', pipeline.Pipeline([('Variation', cust_txt_col('Variation')), ('count_Variation', feature_extraction.text.CountVectorizer(analyzer=u'char', ngram_range=(1, 8))), ('tsvd2', decomposition.TruncatedSVD(n_components=25, n_iter=25, random_state=12))])),

            ('pi3', pipeline.Pipeline([('Text', cust_txt_col('Text')), ('tfidf_Text', feature_extraction.text.TfidfVectorizer(preprocessor=ourPreprocessor.myPreprocessor, tokenizer=ourPreprocessor.myTokenizer, ngram_range=(1,3), stop_words=ourPreprocessor.stopWords)), ('tsvd3', decomposition.TruncatedSVD(n_components=300, n_iter=25, random_state=12))]))
        ])
    )])

Pipeline...


In [8]:
train_transformed = fp.fit_transform(train); print(train.shape)
test_transformed = fp.transform(test); print(test.shape)

(3683, 14)
(125, 14)


# Saving X and Y

In [9]:
time = datetime.datetime.now().strftime("%y-%m-%d-%H-%M")

In [10]:
time

'18-05-30-13-08'

In [11]:
def save_pipeline( model=None, time=None):
    pickle.dump(fp, open('/mnt/4_TB_HD/ramona/utils/textPipelines/featurepipeline_'+time+'.pkl', 'wb'))
    
def save_pipelineinfo( model = '' , trainpickle = '', testpickle= '', Y_train='', Y_test='', balance = '', genecolumns='', shuffled = '', svd_name = '', pipeline_date =''):
    filename = '/mnt/4_TB_HD/ramona/deliver/pipelineinfo2.csv'
    with open(filename,'a') as f:
        f = csv.writer(f)
        f.writerow([ model, trainpickle, testpickle, Y_train, Y_test, 'full' ,balance,shuffled, genecolumns, svd_name, pipeline_date, 'testset size is'+ size  ])   


In [12]:
path = '/mnt/4_TB_HD/ramona/utils/features/'
filename1 = 'Xtrain_' + time
filename2 = 'Xtest_' + time
filename3 =  'Ytrain_' + time 
filename4 =  'Ytest_' + time 
with open(path + filename1, 'wb') as f1:
    pickle.dump(train_transformed, f1)
with open(path + filename2, 'wb') as f2:
    pickle.dump(test_transformed ,f2)
with open(path + filename3, 'wb') as f2:
    pickle.dump(y_train_shuffled.values ,f2)
with open(path + filename4, 'wb') as f2:
    pickle.dump(y_test_shuffled.values ,f2)


filename5 = 'train_before_pipeline' + time
filename6 = 'test_before_pipeline' + time
with open(path + filename5, 'wb') as f1:
    pickle.dump(train, f1)
with open(path + filename6, 'wb') as f2:
    pickle.dump(test ,f2)


    


In [13]:

save_pipelineinfo([fp.named_steps['union'].transformer_list[0], fp.named_steps['union'].transformer_list[1][1].named_steps, fp.named_steps['union'].transformer_list[2][1].named_steps, fp.named_steps['union'].transformer_list[3][1].named_steps], filename1, filename2, filename3, filename4, shuffled = True,  balance = '', pipeline_date=time)




TypeError: must be str, not int

In [None]:
save_pipeline(fp, time)

In [None]:
clf = pickle.load(open('/mnt/4_TB_HD/ramona/utils/textPipelines/featurepipeline_'+time+'.pkl', 'rb') )

In [None]:
clf.named_steps['union'].transformer_list[3][1].named_steps