# Imports

In [2]:
import csv
import datetime
import math
import pickle
import sys

# Star import kept: later cells rely on the sklearn submodule names it brings in
# (preprocessing, pipeline, decomposition, feature_extraction).
from sklearn import *
import lightgbm as lgb
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import normalize

# Local helper modules live in ../utils/, which must be on sys.path first.
sys.path.append('../utils/')
from Read_data import read_data
from evaluation import evaluation_class
import ourPreprocessor


  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Load training data (including the stage 1 test set)

In [5]:
# Variant metadata for the training set.
train = pd.read_csv("../utils/data/training_variants")
# The text file separates its two fields with a literal "||". A multi-character
# separator is interpreted as a regex, so the pipes are escaped (raw string, to
# avoid invalid-escape warnings) and the python parsing engine is required.
trainx = pd.read_csv(
    '../utils/data/training_text',
    sep=r"\|\|",
    engine='python',
    header=None,
    skiprows=1,
    names=["ID", "Text"],
)
train_merge_df = pd.merge(train, trainx, how='left', on='ID')
# Keep only the samples that actually have a text.
train_merge_df = train_merge_df.loc[train_merge_df.Text.notnull()]
print(train_merge_df.shape)
train_merge_df.head()

(3316, 5)


Unnamed: 0,ID,Gene,Variation,Class,Text
0,0,FAM58A,Truncating Mutations,1,Cyclin-dependent kinases (CDKs) regulate a var...
1,1,CBL,W802*,2,Abstract Background Non-small cell lung canc...
2,2,CBL,Q249E,2,Abstract Background Non-small cell lung canc...
3,3,CBL,N454D,3,Recent evidence has demonstrated that acquired...
4,4,CBL,L399V,4,Oncogenic mutations in the monomeric Casitas B...


In [6]:
# Load the stage 1 test files.
test = pd.read_csv("../utils/data/test_variants.csv")
testx = pd.read_csv(
    "../utils/data/test_text.csv",
    sep=r"\|\|",  # literal "||" separator, written as a raw-string regex
    engine="python",
    skiprows=1,
    names=["ID", "Text"],
)
solution = pd.read_csv("../utils/data/solution_filtered.csv")
test_merge_df = test.merge(testx, on="ID")

# Add the classification to the merged test frame: the solution file has one
# column per class, and the column holding the row maximum is taken as the label.
class_labels = [1, 2, 3, 4, 5, 6, 7, 8, 9]
solution.columns = ['ID'] + class_labels
Class_df = pd.DataFrame()
Class_df['ID'] = solution['ID']
Class_df['Class'] = solution[class_labels].idxmax(axis=1)
test_classified_df = test_merge_df.merge(Class_df, on="ID")
# Delete samples with NaN as text.
test_classified_df = test_classified_df.loc[test_classified_df.Text.notnull()]

# DataFrame.append no longer exists in pandas >= 2.0. The two frames hold the
# same columns in a different order; sorting the columns afterwards reproduces
# the alphabetical layout that append used to yield, on every pandas version.
train = pd.concat([train_merge_df, test_classified_df]).sort_index(axis=1)
print(train.shape)
train.head()


(3683, 5)


Unnamed: 0,Class,Gene,ID,Text,Variation
0,1,FAM58A,0,Cyclin-dependent kinases (CDKs) regulate a var...,Truncating Mutations
1,2,CBL,1,Abstract Background Non-small cell lung canc...,W802*
2,2,CBL,2,Abstract Background Non-small cell lung canc...,Q249E
3,3,CBL,3,Recent evidence has demonstrated that acquired...,N454D
4,4,CBL,4,Oncogenic mutations in the monomeric Casitas B...,L399V


In [7]:
# Split the target off the combined training frame: labels go to y_train,
# the remaining columns stay in train.
y_train = train.Class.values
train = train.drop('Class', axis=1)

## Load test data (stage 2 test set)


In [34]:
# Stage 2 texts; fields are separated by a literal "||" (regex separator, raw string).
stage2_testx = pd.read_csv(
    '../utils/stage2_data/stage2_test_text.csv',
    sep=r"\|\|",
    engine='python',
    header=None,
    skiprows=1,
    names=["ID", "Text"],
)
# NOTE(review): machine-specific absolute path, unlike the relative paths used
# for the other stage 2 files — confirm whether '../utils/stage2_data/' is the
# same directory before changing it.
stage2_test = pd.read_csv('/mnt/4_TB_HD/ramona/utils/stage2_data/stage2_test_variants.csv')
stage2_solution = pd.read_csv('../utils/stage2_data/stage_2_private_solution.csv')

# Attach the text to every variant; variants without text get an empty string.
stage2_merge_df = pd.merge(stage2_test, stage2_testx, how='left', on='ID').fillna('')

# Same content under the name used further down. The merge used to be computed
# twice; an independent copy keeps the two names decoupled as before.
stage2 = stage2_merge_df.copy()
stage2.head()

Unnamed: 0,ID,Gene,Variation,Text
0,1,CHEK2,H371Y,The incidence of breast cancer is increasing i...
1,2,AXIN2,Truncating Mutations,An unselected series of 310 colorectal carcino...
2,3,WNT4,E216G,Mycosis fungoides and Sézary syndrome are prim...
3,4,SUCLA2,G118R,Regulated progression through the cell cycle ...
4,5,BRAF,T599insTT,Pilocytic astrocytoma (PA) is emerging as a tu...


### Preprocessing the data set 

In [6]:
def TransDict_from_list(groups):
    '''
    Given a list of letter groups, returns a dict mapping each letter to a
    single representative letter of its group - for use in translation.

    The representative is the alphabetically first letter of the group.
    Empty groups contribute nothing. If a letter occurs in several groups,
    the last group wins.

    >>> TransDict_from_list(["C", "FYW"])
    {'C': 'C', 'F': 'F', 'W': 'F', 'Y': 'F'}
    >>> alex6 = ["C", "G", "P", "FYW", "AVILM", "STNQRHKDE"]
    >>> TransDict_from_list(alex6)['T']
    'D'
    '''
    result = {}
    for group in groups:
        g_members = sorted(group)  # alphabetically sorted list of the group's letters
        if not g_members:
            continue  # nothing to map for an empty group
        representative = str(g_members[0])  # group's first letter stands for the group
        for c in g_members:
            result[c] = representative
    return result

In [7]:
def finddistance(AA1 = None, AA2=None):
    """Look up the distance between two amino-acid letters in ``ph_distances``.

    The table is read as ``ph_distances.loc[row_letter][column_letter]``. If the
    requested orientation holds no value, the swapped orientation is tried once.
    (Presumably the table is triangular, so only one orientation is filled —
    TODO confirm against physiochem.csv.)

    Parameters
    ----------
    AA1, AA2 : str or NaN
        One-letter amino-acid codes. Anything that is not a key of the
        notebook-level ``pc5`` mapping yields NaN.

    Returns
    -------
    float
        The tabulated distance, or NaN when no value can be found.

    Notes
    -----
    Earlier versions recursed with swapped arguments until a value was found,
    which never terminated when neither orientation had one (for example
    ``finddistance('S', 'S')``), and crashed in ``math.isnan`` when the column
    was absent. Both cases now return NaN. Every input that previously produced
    a result still produces the same result.
    """
    missing = float('nan')

    def _lookup(row_aa, col_aa):
        # 'W' is never used as a row and 'S' never as a column (same special
        # cases as before); a row that is absent from the table has no value.
        if row_aa == 'W' or col_aa == 'S' or row_aa not in ph_distances.index:
            return missing
        value = ph_distances.loc[row_aa].get(col_aa)
        return missing if value is None else value

    # NOTE(review): a leading 'W' returns NaN without trying the swapped
    # orientation, while a trailing 'W' is looked up normally. Kept as is to
    # leave the existing feature values unchanged — confirm this asymmetry is intended.
    if AA1 not in pc5 or AA2 not in pc5 or AA1 == 'W':
        return missing

    dist = _lookup(AA1, AA2)
    if math.isnan(dist):
        # Not found: switch the order and search once more.
        dist = _lookup(AA2, AA1)
    return dist

In [22]:
# Stack the training rows and the stage 2 rows so that every feature below is
# computed in one pass over both; they are split apart again by position later.
df_all = pd.concat((train, stage2), axis=0, ignore_index=True)


def _shared_word_count(phrase, text):
    """Count the space-separated words of ``phrase`` that also occur as words of ``text``."""
    text_words = set(text.split(' '))  # split once per row instead of once per word
    return sum(1 for w in phrase.split(' ') if w in text_words)


df_all['Gene_Share'] = df_all.apply(lambda r: _shared_word_count(r['Gene'], r['Text']), axis=1)
df_all['Variation_Share'] = df_all.apply(lambda r: _shared_word_count(r['Variation'], r['Text']), axis=1)


# Distinct gene and variation names of the training rows; the two printed
# numbers are the count before and after dropping multi-word names.
gen_var_lst = sorted(list(train.Gene.unique()) + list(train.Variation.unique()))
print(len(gen_var_lst))
gen_var_lst = [x for x in gen_var_lst if len(x.split(' '))==1]
print(len(gen_var_lst))
i_ = 0


# A "simple" variation is <letter><1-7 digits><letter>, e.g. Q249E (case-insensitive match).
df_all["simple_variation_pattern"] = df_all.Variation.str.contains(r'^[A-Z]\d{1,7}[A-Z]$',case=False)
df_all['location_number'] = df_all.Variation.str.extract(r'(\d+)', expand=True)
AA_VALID = 'ACDEFGHIKLMNPQRSTVWY'
# First / last letter of the variation, kept only when it is one of the letters in AA_VALID.
df_all['variant_letter_first'] = df_all.apply(lambda row: row.Variation[0] if row.Variation[0] in (AA_VALID) else np.nan,axis=1)
df_all['variant_letter_last'] = df_all.apply(lambda row: row.Variation.split()[0][-1] if (row.Variation.split()[0][-1] in (AA_VALID)) else np.nan,axis=1)
# Letters are only meaningful for simple variations; blank them everywhere else.
df_all.loc[df_all.simple_variation_pattern==False,['variant_letter_last',"variant_letter_first"]] = np.nan
ofer8=TransDict_from_list(["C", "G", "P", "FYW", "AVILM", "RKH", "DE", "STNQ"])
sdm12 =TransDict_from_list(["A", "D", "KER", "N",  "TSQ", "YF", "LIVM", "C", "W", "H", "G", "P"] )
pc5 = {"I": "A", # Aliphatic
             "V": "A",         "L": "A",
             "F": "R", # Aromatic
             "Y": "R",         "W": "R",         "H": "R",
             "K": "C", # Charged
             "R": "C",         "D": "C",         "E": "C",
             "G": "T", # Tiny
             "A": "T",         "C": "T",         "S": "T",
             "T": "D", # Diverse
             "M": "D",         "Q": "D",         "N": "D",
             "P": "D"}
# Group-equivalence features, currently disabled:
#df_all['AAGroup_ofer8_letter_first'] = df_all["variant_letter_first"].map(ofer8)
#df_all['AAGroup_ofer8_letter_last'] = df_all["variant_letter_last"].map(ofer8)
#df_all['AAGroup_ofer8_equiv'] = df_all['AAGroup_ofer8_letter_first'] == df_all['AAGroup_ofer8_letter_last']
#df_all['AAGroup_m12_equiv'] = df_all['variant_letter_last'].map(sdm12) == df_all['variant_letter_first'].map(sdm12)
#df_all['AAGroup_p5_equiv'] = df_all['variant_letter_last'].map(pc5) == df_all['variant_letter_first'].map(pc5)



# Distance table indexed by the letters in its first (unnamed) CSV column.
ph_distances = pd.read_csv("../utils/physiochem.csv", sep=';')
ph_distances = ph_distances.set_index('Unnamed: 0')
# .iloc makes the positional access explicit; plain x[0] on a label-indexed
# row is deprecated in recent pandas.
PC_distance= df_all[['variant_letter_first', 'variant_letter_last']].apply(lambda x: finddistance(x.iloc[0], x.iloc[1]), axis=1)
# Rows without a distance get the mean of the rows that have one.
meanv=np.mean(PC_distance)
PC_distance[np.isnan(PC_distance)]=meanv
# Normalise the column as a whole (axis=0); ravel() hands pandas a flat column.
df_all['PC_distance'] = normalize(PC_distance.values.reshape(-1, 1), axis=0).ravel()
df_all = df_all.drop(["simple_variation_pattern",'location_number', 'variant_letter_first','variant_letter_last' ], axis=1,)
# NOTE(review): this casts EVERY column to str, including PC_distance and the
# two *_Share counts; the loop below then label-encodes ID and the share
# columns as strings and leaves PC_distance as text — confirm this is intended.
df_all = df_all.astype(str)


for c in df_all.columns:
    if df_all[c].dtype == 'object':
        if c in ['Gene','Variation']:
            # Keep the raw column and add an integer code, its length and its word count.
            lbl = preprocessing.LabelEncoder()
            df_all[c+'_lbl_enc'] = lbl.fit_transform(df_all[c].values)
            df_all[c+'_len'] = df_all[c].map(lambda x: len(str(x)))
            df_all[c+'_words'] = df_all[c].map(lambda x: len(str(x).split(' ')))
        elif c != 'Text' and c != 'PC_distance':
            # Every other column except Text and PC_distance is replaced by its label code.
            lbl = preprocessing.LabelEncoder()
            df_all[c] = lbl.fit_transform(df_all[c].values)
        if c=='Text':
            df_all[c+'_len'] = df_all[c].map(lambda x: len(str(x)))
            df_all[c+'_words'] = df_all[c].map(lambda x: len(str(x).split(' ')))





class cust_regression_vals(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    """Stateless transformer that forwards every column except the ID and the raw text-like columns."""

    def fit(self, x, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, x):
        """Return the values of ``x`` without its 'Gene', 'Variation', 'ID' and 'Text' columns."""
        excluded = ['Gene', 'Variation', 'ID', 'Text']
        remaining = x.drop(excluded, axis=1)
        return remaining.values

class cust_txt_col(sklearn.base.BaseEstimator, sklearn.base.TransformerMixin):
    """Stateless transformer that selects a single column and renders each entry as ``str``."""

    def __init__(self, key):
        # Label of the column to select in transform().
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, x):
        """Return column ``self.key`` of ``x`` with ``str`` applied to every entry."""
        selected = x[self.key]
        return selected.apply(str)

3573
3392


In [23]:
# Undo the stacking by position: the first len(train) rows of df_all are the
# training samples, everything after them is the stage 2 test set.
n_train_rows = len(train)
train = df_all.iloc[:n_train_rows]
print(train.shape)
test = df_all.iloc[n_train_rows:]
print(test.shape)

(3683, 15)
(986, 15)


In [24]:
print('Pipeline...')

# Gene name -> character n-gram counts (1 to 8 characters) -> 25 SVD components.
gene_branch = pipeline.Pipeline([
    ('Gene', cust_txt_col('Gene')),
    ('count_Gene', feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(1, 8))),
    ('tsvd1', decomposition.TruncatedSVD(n_components=25, n_iter=25, random_state=12)),
])

# Variation name -> character n-gram counts (1 to 8 characters) -> 25 SVD components.
variation_branch = pipeline.Pipeline([
    ('Variation', cust_txt_col('Variation')),
    ('count_Variation', feature_extraction.text.CountVectorizer(analyzer='char', ngram_range=(1, 8))),
    ('tsvd2', decomposition.TruncatedSVD(n_components=25, n_iter=25, random_state=12)),
])

# Full text -> TF-IDF over 1- to 3-grams, using the project's own preprocessor,
# tokenizer and stop-word list -> 50 SVD components.
text_branch = pipeline.Pipeline([
    ('Text', cust_txt_col('Text')),
    ('tfidf_Text', feature_extraction.text.TfidfVectorizer(
        preprocessor=ourPreprocessor.myPreprocessor,
        tokenizer=ourPreprocessor.tokenizeratleastthree,
        ngram_range=(1, 3),
        stop_words=ourPreprocessor.stopWords,
        min_df=1,
    )),
    ('tsvd3', decomposition.TruncatedSVD(n_components=50, n_iter=50, random_state=12)),
])

# All branches are evaluated on the same frame and concatenated column-wise.
fp = pipeline.Pipeline([
    ('union', pipeline.FeatureUnion(
        transformer_list=[
            ('standard', cust_regression_vals()),
            ('pi1', gene_branch),
            ('pi2', variation_branch),
            ('pi3', text_branch),
        ],
        n_jobs=1,
    )),
])

Pipeline...


In [25]:
%%time
# Fit the feature pipeline on the training rows and transform them in one step.
train_transformed = fp.fit_transform(train)
print(train_transformed.shape)
# Reuse the already-fitted pipeline for the test rows (transform only, no refit).
test_transformed = fp.transform(test)
print(test_transformed.shape)

(3683, 111)
(986, 111)
CPU times: user 1h 13min 46s, sys: 16min 37s, total: 1h 30min 23s
Wall time: 47min 21s


# Saving the feature matrices (X) and labels (Y)

In [26]:
# Timestamp (yy-mm-dd-HH-MM) used as a suffix for every file written below.
time = format(datetime.datetime.now(), "%y-%m-%d-%H-%M")

In [27]:
# Display the timestamp string that is appended to the output file names below.
time

'18-05-31-12-21'

In [28]:
def save_pipeline( model=None, time=None):
    """Pickle a feature pipeline under a timestamped file name.

    Parameters
    ----------
    model : object, optional
        The pipeline to persist. When omitted, the notebook-level ``fp`` is
        written — which is what this function used to do regardless of the
        argument it was given.
    time : str
        Suffix of the output file name (``featurepipeline_<time>.pkl``).
    """
    if model is None:
        model = fp
    target = '/mnt/4_TB_HD/ramona/utils/textPipelines/featurepipeline_'+time+'.pkl'
    # Context manager so the file is flushed and closed even if pickling fails.
    with open(target, 'wb') as handle:
        pickle.dump(model, handle)
    
def save_pipelineinfo( model = '' , trainpickle = '', testpickle= '', Y_train='', Y_test='', balance = '', genecolumns='', shuffled = '', svd_name = '', pipeline_date =''):
    """Append one row describing a pipeline run to the shared bookkeeping CSV.

    The row is written in this column order: model, trainpickle, testpickle,
    Y_train, Y_test, the constant ``'full'``, balance, shuffled, genecolumns,
    svd_name, pipeline_date. Note that this differs from the parameter order.

    All parameters are written as given; each defaults to an empty string.
    """
    filename = '/mnt/4_TB_HD/ramona/deliver/pipelineinfo2.csv'
    # newline='' is what the csv module asks for; without it, platforms that
    # translate line endings produce blank lines between rows.
    with open(filename, 'a', newline='') as f:
        writer = csv.writer(f)
        writer.writerow([ model, trainpickle, testpickle, Y_train, Y_test, 'full' ,balance,shuffled, genecolumns, svd_name, pipeline_date ])


In [29]:
# Output directory and timestamped file names for this run's artifacts.
path = '/mnt/4_TB_HD/ramona/utils/features/'
filename1 = 'Xtrain_' + time
filename2 = 'Xtest_' + time
filename3 =  'Ytrain_' + time
filename4 =  'Ytest_' + time
filename5 = 'train_before_pipeline' + time
filename6 = 'test_before_pipeline' + time

# Class labels of the stage 2 test set.
Y_test_stage2_class = pd.read_pickle('../utils/stage2_data/stage2test_classes.sav')['Class'].values

# The training labels live in y_train; the previous `y` is not defined anywhere
# in this notebook and only worked on leftover kernel state.
artifacts = [
    (filename1, train_transformed),     # transformed training features
    (filename2, test_transformed),      # transformed test features
    (filename3, y_train),               # training labels
    (filename4, Y_test_stage2_class),   # test labels
    (filename5, train),                 # training frame as fed to the pipeline
    (filename6, test),                  # test frame as fed to the pipeline
]
for artifact_name, artifact in artifacts:
    with open(path + artifact_name, 'wb') as handle:
        pickle.dump(artifact, handle)



In [30]:

# Record this run in the bookkeeping CSV: a description of the feature union
# plus the names of the four feature/label files written above.
union_entries = fp.named_steps['union'].transformer_list
# The first entry is logged as the whole (name, transformer) pair; the three
# sub-pipelines after it are logged by their named steps.
pipeline_summary = [union_entries[0]] + [branch.named_steps for _, branch in union_entries[1:4]]
save_pipelineinfo(
    pipeline_summary,
    filename1,
    filename2,
    filename3,
    filename4,
    shuffled=False,
    balance='',
    pipeline_date=time,
)






In [31]:
# Persist the fitted feature pipeline under this run's timestamp.
save_pipeline(model=fp, time=time)

In [32]:
# Read the pickled feature pipeline back from disk.
# NOTE: pickle.load can execute arbitrary code contained in the file; only load
# pickles that were written by this notebook.
with open('/mnt/4_TB_HD/ramona/utils/textPipelines/featurepipeline_'+time+'.pkl', 'rb') as handle:
    clf = pickle.load(handle)

In [33]:
# Show the steps of the 'pi1' (Gene) sub-pipeline of the in-memory pipeline.
dict(fp.named_steps['union'].transformer_list)['pi1'].named_steps

{'Gene': cust_txt_col(key='Gene'),
 'count_Gene': CountVectorizer(analyzer='char', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 8), preprocessor=None, stop_words=None,
         strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
         tokenizer=None, vocabulary=None),
 'tsvd1': TruncatedSVD(algorithm='randomized', n_components=25, n_iter=25,
        random_state=12, tol=0.0)}