In [4]:
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from gensim import utils
import os
import nltk
import scipy.sparse as ssp

In [5]:
train=pd.read_csv("checkpoints_databases/w_working_train.csv",encoding="utf8")
test=pd.read_csv("checkpoints_databases/w_working_test.csv",encoding="utf8")

# Here we will add features from word2vec retrained then get the mean for the sentences

In [6]:
class MySentences(object):
    """MySentences is a generator to produce a list of tokenized sentences 
    
    Takes a list of numpy arrays containing documents.
    
    Args:
        arrays: List of arrays, where each element in the array contains a document.
    """
    def __init__(self, *arrays):
        self.arrays = arrays
 
    def __iter__(self):
        for array in self.arrays:
            for document in array:
                for sent in nltk.sent_tokenize(document):
                    yield nltk.word_tokenize(sent)

def get_word2vec(sentences, location):
    """Returns trained word2vec
    
    Args:
        sentences: iterator for sentences
        
        location (str): Path to save/load word2vec
    """
    if os.path.exists(location):
        print('Found {}'.format(location))
        model = gensim.models.Word2Vec.load(location)
        return model
    
    print('{} not found. training model'.format(location))
    model = gensim.models.Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
    print('Model done training. Saving to disk')
    model.save(location)
    return model

In [7]:
#It's important to remove duplicated spaces for word2vec learning !
train["Text"]=[" ".join(doc.split()) for doc in train["Text"].values]
test["Text"]=[" ".join(doc.split()) for doc in test["Text"].values]

AttributeError: 'float' object has no attribute 'split'

In [8]:
test[test["Text"].isnull()==True]

Unnamed: 0,Class,ID,Text,Variation,Substitutions_var,Stop_codon_var,Background,Fusion_var,gene_fusion_var,Deletion_var,...,Gene_ZFP57,Gene_ZFPM2,Gene_ZFYVE27,Gene_ZIC3,Gene_ZMPSTE24,Gene_ZNF365,Gene_ZNF41,Gene_ZNF513,Gene_ZNF592,Gene_ZNF81
3889,,3889,,M1628T,1,0,genetic abnormality frequently identified glio...,0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
w2vec = get_word2vec(
    MySentences(
        train["Text"].values,test["Text"].values),"w2v_features"
)

In [8]:
class MyTokenizer:
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        transformed_X = []
        for document in X:
            tokenized_doc = []
            for sent in nltk.sent_tokenize(document):
                tokenized_doc += nltk.word_tokenize(sent)
            transformed_X.append(np.array(tokenized_doc))
        return np.array(transformed_X)
    
    def fit_transform(self, X, y=None):
        return self.transform(X)

class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(word2vec.wv.syn0[0])

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = MyTokenizer().fit_transform(X)
        
        return np.array([
            np.mean([self.word2vec.wv[w] for w in words if w in self.word2vec.wv]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
    
    def fit_transform(self, X, y=None):
        return self.transform(X)

In [9]:
mean_embedding_vectorizer = MeanEmbeddingVectorizer(w2vec)
mean_embedded_train = mean_embedding_vectorizer.fit_transform(train['Text'])
mean_embedded_test = mean_embedding_vectorizer.fit_transform(test['Text'])

In [10]:
df_embed_tr=pd.DataFrame(mean_embedded_train)
df_embed_te=pd.DataFrame(mean_embedded_test)

In [11]:
df_embedding_tr=df_embed_tr.reset_index()
df_embedding_te=df_embed_te.reset_index()
df_embedding_tr=df_embedding_tr.rename(columns={"index":"ID"})
df_embedding_te=df_embedding_te.rename(columns={"index":"ID"})

In [14]:
train_cl=train.drop(["Variation","Text","Class"],axis=1)
test_cl=test.drop(["Text","Class","Variation"],axis=1)

In [15]:
feat_stack_train.to_csv("w_meta_features/meta_train_l1l2.csv")
feat_stack_test.to_csv("w_meta_features/meta_test_l1l2.csv")

In [13]:
train_w2v=pd.merge(train_cl,df_embedding_tr,on="ID")
test_w2v=pd.merge(test_cl,df_embedding_te,on="ID")

In [14]:
np_w2v_train=np.array(train_w2v.drop("ID",axis=1))
np_w2v_test=np.array(test_w2v.drop("ID",axis=1))
ssp_w2v_train=ssp.csc_matrix(np_w2v_train)
ssp_w2v_test=ssp.csc_matrix(np_w2v_test)

In [15]:
ssp.save_npz("checkpoints_databases/w_working_train_w2v.npz",ssp_w2v_train)
ssp.save_npz("checkpoints_databases/w_working_test_w2v.npz",ssp_w2v_test)

# Now TFIDF

In [16]:

tfidf = TfidfVectorizer(
        min_df=10, max_features=10000, strip_accents=None, lowercase = False,
        analyzer='word', token_pattern=r'\w+', ngram_range=(1,3), use_idf=True,
        smooth_idf=True, sublinear_tf=True
        ).fit(train["Text"])

X_train_text = tfidf.transform(train["Text"])
X_test_text = tfidf.transform(test["Text"])

In [17]:
X_train=ssp.hstack((train_cl.drop("ID",axis=1), X_train_text),format="csc")
X_test=ssp.hstack((test_cl.drop("ID",axis=1),X_test_text),format="csc")
print(X_train.shape,X_test.shape)

(3321, 10015) (5668, 10015)


In [18]:
ssp.save_npz("checkpoints_databases/w_working_train_tfidf.npz",X_train)
ssp.save_npz("checkpoints_databases/w_working_test_tfidf.npz",X_test)

# Now Doc2Vec

In [19]:
def constructLabeledSentences(data):
    sentences=[]
    for index, row in data.iteritems():
        sentences.append(LabeledSentence(utils.to_unicode(row).split(), ['Text' + '_%s' % str(index)]))
    return sentences

train_sentences = constructLabeledSentences(train['Text'])
test_sentences = constructLabeledSentences(test['Text'])

Text_INPUT_DIM=350

d2v_train = Doc2Vec(min_count=1, window=10, size=Text_INPUT_DIM, sample=1e-4, negative=5, workers=-1, iter=5,seed=26)
d2v_train.build_vocab(train_sentences)
d2v_train.train(train_sentences, total_examples=d2v_train.corpus_count, epochs=d2v_train.iter)

d2v_test = Doc2Vec(min_count=1, window=10, size=Text_INPUT_DIM, sample=1e-4, negative=5, workers=-1, iter=5,seed=26)
d2v_test.build_vocab(test_sentences)
d2v_test.train(test_sentences, total_examples=d2v_test.corpus_count, epochs=d2v_test.iter)

0

In [20]:
d2v_train_arrays = np.zeros((len(train), Text_INPUT_DIM))
d2v_test_arrays = np.zeros((len(test), Text_INPUT_DIM))

for i in range(len(train)):
    d2v_train_arrays[i] = d2v_train.docvecs['Text_'+str(i)]

for i in range(len(test)):
    d2v_test_arrays[i] = d2v_test.docvecs['Text_'+str(i)]

X_train=ssp.hstack((train_cl.drop("ID",axis=1), d2v_train_arrays),format="csc")
X_test=ssp.hstack((test_cl.drop("ID",axis=1), d2v_test_arrays),format="csc")

print(X_train.shape,X_test.shape)

ssp.save_npz("checkpoints_databases/w_working_train_d2v.npz",X_train)
ssp.save_npz("checkpoints_databases/w_working_test_d2v.npz",X_test)

(3321, 365) (5668, 365)
