In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from gensim import utils
import os
import nltk
import scipy.sparse as ssp

Using Theano backend.


In [2]:
train=pd.read_csv("bases/working_train.csv",encoding="utf8")
test=pd.read_csv("bases/working_test.csv",encoding="utf8")

# Here we will add features from word2vec retrained then get the mean for the sentences

In [45]:
class MySentences(object):
    """MySentences is a generator to produce a list of tokenized sentences 
    
    Takes a list of numpy arrays containing documents.
    
    Args:
        arrays: List of arrays, where each element in the array contains a document.
    """
    def __init__(self, *arrays):
        self.arrays = arrays
 
    def __iter__(self):
        for array in self.arrays:
            for document in array:
                for sent in nltk.sent_tokenize(document):
                    yield nltk.word_tokenize(sent)

def get_word2vec(sentences, location):
    """Returns trained word2vec
    
    Args:
        sentences: iterator for sentences
        
        location (str): Path to save/load word2vec
    """
    if os.path.exists(location):
        print('Found {}'.format(location))
        model = gensim.models.Word2Vec.load(location)
        return model
    
    print('{} not found. training model'.format(location))
    model = gensim.models.Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
    print('Model done training. Saving to disk')
    model.save(location)
    return model

In [46]:
#It's important to remove duplicated spaces for word2vec learning !
train["Text"]=[" ".join(doc.split()) for doc in train["Text"].values]
test["Text"]=[" ".join(doc.split()) for doc in test["Text"].values]

In [None]:
w2vec = get_word2vec(
    MySentences(
        train["Text"].values,test["Text"].values),"localisation"
)

localisation not found. training model


In [16]:
class MyTokenizer:
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        transformed_X = []
        for document in X:
            tokenized_doc = []
            for sent in nltk.sent_tokenize(document):
                tokenized_doc += nltk.word_tokenize(sent)
            transformed_X.append(np.array(tokenized_doc))
        return np.array(transformed_X)
    
    def fit_transform(self, X, y=None):
        return self.transform(X)

class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(word2vec.wv.syn0[0])

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = MyTokenizer().fit_transform(X)
        
        return np.array([
            np.mean([self.word2vec.wv[w] for w in words if w in self.word2vec.wv]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
    
    def fit_transform(self, X, y=None):
        return self.transform(X)

In [17]:
mean_embedding_vectorizer = MeanEmbeddingVectorizer(w2vec)
mean_embedded_train = mean_embedding_vectorizer.fit_transform(train['Text'])
mean_embedded_test = mean_embedding_vectorizer.fit_transform(test['Text'])

In [18]:
df_embed_tr=pd.DataFrame(mean_embedded_train)
df_embed_te=pd.DataFrame(mean_embedded_test)

In [19]:
df_embedding_tr=df_embed_tr.reset_index()
df_embedding_te=df_embed_te.reset_index()
df_embedding_tr=df_embedding_tr.rename(columns={"index":"ID"})
df_embedding_te=df_embedding_te.rename(columns={"index":"ID"})

In [5]:
train_cl=train.drop(["Variation","Text","Gene","Class"],axis=1)
test_cl=test.drop(["Text","Class","Variation","Gene"],axis=1)

In [39]:
train_w2v=pd.merge(train,df_embedding_tr,on="ID")
test_w2v=pd.merge(test,df_embedding_te,on="ID")

In [56]:
train_w2v.to_csv("bases/working_train_w2v.csv",index=False,encoding="utf8")
test_w2v.to_csv("bases/working_test_w2v.csv",index=False,encoding="utf8")

# Now TFIDF

In [55]:

tfidf = TfidfVectorizer(
        min_df=10, max_features=10000, strip_accents=None, lowercase = False,
        analyzer='word', token_pattern=r'\w+', ngram_range=(1,3), use_idf=True,
        smooth_idf=True, sublinear_tf=True
        ).fit(train["Text"])

X_train_text = tfidf.transform(train["Text"])
X_test_text = tfidf.transform(test["Text"])

In [59]:
train

Unnamed: 0,Class,Gene,ID,Text,Variation,Substitutions_var,Stop_codon_var,Fusion_var,gene_fusion_var,Deletion_var,del_or_ins_var,Amplification_var,Truncation_var,exon_var,frameshift_var,dup_var,Gene_Share,Variation_Share,Text_words,Figure_counter
0,1.0,FAM58A,0,cyclin-dependent kinases cdks regulate variety...,Truncating Mutations,0,0,0,0.0,0.0,0.0,0,1,0,0,0,1,1,4871,0
1,2.0,CBL,1,normal tumor pairwise analysis significant lo...,W802*,1,1,0,0.0,0.0,0.0,0,0,0,0,0,1,1,1155,0
2,2.0,CBL,2,normal tumor pairwise analysis significant lo...,Q249E,1,0,0,0.0,0.0,0.0,0,0,0,0,0,1,1,1153,0
3,3.0,CBL,3,hrm analysis cbl exons blast crisis cases n ...,N454D,1,0,0,0.0,0.0,0.0,0,0,0,0,0,1,1,645,8
4,4.0,CBL,4,vm mutant borderline densitometry ratio mut...,L399V,1,0,0,0.0,0.0,0.0,0,0,0,0,0,1,1,113,0
5,4.0,CBL,5,vm mutant borderline densitometry ratio mut...,V391I,1,0,0,0.0,0.0,0.0,0,0,0,0,0,1,1,113,0
6,5.0,CBL,6,ure ure structures wild type green ke muta...,V430M,1,0,0,0.0,0.0,0.0,0,0,0,0,0,1,1,180,0
7,1.0,CBL,7,cbl negative regulator activated receptor tyro...,Deletion,0,0,0,0.0,1.0,0.0,0,0,0,0,0,1,0,11966,0
8,4.0,CBL,8,subset jmml patients harbor cbl mutations asso...,Y371H,1,0,0,0.0,0.0,0.0,0,0,0,0,0,1,1,3572,0
9,4.0,CBL,9,subset jmml patients harbor cbl mutations asso...,C384R,1,0,0,0.0,0.0,0.0,0,0,0,0,0,1,1,2889,0


In [69]:
X_train=ssp.hstack((train_cl.drop("ID",axis=1), X_train_text),format="csc")
X_test=ssp.hstack((test_cl.drop("ID",axis=1),X_test_text),format="csc")
print(X_train.shape,X_test.shape)

(3321, 10015) (5668, 10015)


In [77]:
ssp.save_npz("bases/working_train_tfidf.npz",X_train)
ssp.save_npz("bases/working_test_tfidf.npz",X_test)

# Now Doc2Vec

In [3]:
def constructLabeledSentences(data):
    sentences=[]
    for index, row in data.iteritems():
        sentences.append(LabeledSentence(utils.to_unicode(row).split(), ['Text' + '_%s' % str(index)]))
    return sentences

train_sentences = constructLabeledSentences(train['Text'])
test_sentences = constructLabeledSentences(test['Text'])

Text_INPUT_DIM=350

d2v_train = Doc2Vec(min_count=1, window=10, size=Text_INPUT_DIM, sample=1e-4, negative=5, workers=-1, iter=5,seed=26)
d2v_train.build_vocab(train_sentences)
d2v_train.train(train_sentences, total_examples=d2v_train.corpus_count, epochs=d2v_train.iter)

d2v_test = Doc2Vec(min_count=1, window=10, size=Text_INPUT_DIM, sample=1e-4, negative=5, workers=-1, iter=5,seed=26)
d2v_test.build_vocab(test_sentences)
d2v_test.train(test_sentences, total_examples=d2v_test.corpus_count, epochs=d2v_test.iter)

0

In [8]:
d2v_train_arrays = np.zeros((len(train), Text_INPUT_DIM))
d2v_test_arrays = np.zeros((len(test), Text_INPUT_DIM))

for i in range(len(train)):
    d2v_train_arrays[i] = d2v_train.docvecs['Text_'+str(i)]

for i in range(len(test)):
    d2v_test_arrays[i] = d2v_test.docvecs['Text_'+str(i)]

X_train=ssp.hstack((train_cl.drop("ID",axis=1), d2v_train_arrays),format="csc")
X_test=ssp.hstack((test_cl.drop("ID",axis=1), d2v_test_arrays),format="csc")

print(X_train.shape,X_test.shape)

ssp.save_npz("bases/working_train_d2v.npz",X_train)
ssp.save_npz("bases/working_test_d2v.npz",X_test)

(3321, 365) (5668, 365)
