In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import gensim
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from gensim import utils
import os
import nltk
import scipy.sparse as ssp

Using TensorFlow backend.


In [2]:
# Load the working train/test checkpoints produced by an earlier stage.
train, test = (
    pd.read_csv(csv_path, encoding="utf8")
    for csv_path in (
        "checkpoints_databases/w_working_train.csv",
        "checkpoints_databases/w_working_test.csv",
    )
)

In [16]:
# Strip the raw text, the label and the categorical gene/variation columns so
# that only the remaining (meta-feature) columns are kept, then checkpoint them.
non_feature_columns = ["Variation", "Text", "Class", "Gene"]
train_cl = train.drop(non_feature_columns, axis=1)
test_cl = test.drop(non_feature_columns, axis=1)

# NOTE(review): the DataFrame index is written as an extra unnamed column here;
# confirm that whatever reads these files expects it.
train_cl.to_csv("w_meta_features/meta_train_l1l2.csv")
test_cl.to_csv("w_meta_features/meta_test_l1l2.csv")

# Word2vec features: retrain word2vec on the corpus, then average the word vectors of each document

In [34]:
class MySentences(object):
    """Re-iterable stream of tokenized sentences drawn from document collections.

    Every pass over the object walks each document of each collection in
    order, splits it into sentences with ``nltk.sent_tokenize`` and yields
    every sentence as the token list returned by ``nltk.word_tokenize``.
    Nothing is cached: tokenization is redone on each new iteration.

    Args:
        *arrays: One or more iterables whose elements are documents (strings).
    """

    def __init__(self, *arrays):
        self.arrays = arrays

    def __iter__(self):
        for collection in self.arrays:
            for text in collection:
                # Lazily map each sentence to its token list.
                yield from map(nltk.word_tokenize, nltk.sent_tokenize(text))

def get_word2vec(sentences, location,size):
    """Load a word2vec model from ``location`` or train one and save it there.

    Args:
        sentences: Iterable of tokenized sentences passed to
            ``gensim.models.Word2Vec`` when a model has to be trained.
            Not used when a saved model already exists.
        location (str): Path used both to look for a saved model and to
            save a freshly trained one.
        size: Forwarded as the ``size`` argument of ``Word2Vec``.

    Returns:
        The loaded or newly trained model.

    Note:
        An existing file at ``location`` is returned as-is; it is not
        checked against ``size``.
    """
    if not os.path.exists(location):
        print('{} not found. training model'.format(location))
        trained = gensim.models.Word2Vec(sentences, size=size, window=5, min_count=5, workers=4)
        print('Model done training. Saving to disk')
        trained.save(location)
        return trained

    print('Found {}'.format(location))
    return gensim.models.Word2Vec.load(location)

In [7]:
# Collapse every run of whitespace into a single space (and trim the ends):
# duplicated spaces must be removed before the text is used for word2vec learning.
train["Text"] = train["Text"].map(lambda doc: " ".join(doc.split()))
test["Text"] = test["Text"].map(lambda doc: " ".join(doc.split()))

In [9]:
# Train (or load from disk) one word2vec model per embedding size, using the
# sentences of both the train and the test text.
number_w2v = [100, 200, 300]

# MySentences keeps no iteration state, so one instance can feed every model.
corpus_sentences = MySentences(train["Text"].values, test["Text"].values)

w2v = {}
for size in number_w2v:
    model_key = "w2v_" + str(size)
    model_path = "w2v_features" + str(size)
    w2v[model_key] = get_word2vec(corpus_sentences, model_path, size)

w2v_features100 not found. training model
Model done training. Saving to disk
w2v_features200 not found. training model
Model done training. Saving to disk
w2v_features300 not found. training model
Model done training. Saving to disk


In [10]:
class MyTokenizer:
    """Stateless transformer that turns raw documents into flat token arrays.

    Exposes ``fit`` / ``transform`` / ``fit_transform`` so it can be used like
    the other vectorizers in this notebook; ``fit`` learns nothing.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op fit; returns ``self``."""
        return self

    def transform(self, X):
        """Tokenize every document in ``X``.

        Each document is split into sentences with ``nltk.sent_tokenize`` and
        every sentence into tokens with ``nltk.word_tokenize``; the tokens of
        all sentences are concatenated into one array per document.

        Args:
            X: Iterable of documents (strings).

        Returns:
            A 1-D ``numpy`` array of dtype ``object`` with one entry per
            document, each entry being the array of that document's tokens.
        """
        token_arrays = []
        for document in X:
            tokenized_doc = []
            for sent in nltk.sent_tokenize(document):
                tokenized_doc += nltk.word_tokenize(sent)
            token_arrays.append(np.array(tokenized_doc))

        # Documents have different token counts, so the result is ragged.
        # np.array(list_of_arrays) raises ValueError on ragged input in recent
        # NumPy and silently yields a 2-D array when the lengths happen to
        # match; filling a pre-allocated object array gives the same
        # one-entry-per-document shape in every case.
        transformed_X = np.empty(len(token_arrays), dtype=object)
        for position, tokens in enumerate(token_arrays):
            transformed_X[position] = tokens
        return transformed_X

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform(X)`` since there is nothing to fit."""
        return self.transform(X)

class MeanEmbeddingVectorizer(object):
    """Represent each document by the mean of its in-vocabulary word vectors.

    Args:
        word2vec: Trained model exposing ``wv`` (membership test, lookup by
            word, and the ``syn0`` vector matrix).
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Dimensionality of the embeddings; used to build the all-zero
        # fallback vector for documents with no known word.
        self.dim = len(word2vec.wv.syn0[0])

    def fit(self, X, y=None):
        """No-op fit; returns ``self``."""
        return self

    def transform(self, X):
        """Tokenize ``X`` with ``MyTokenizer`` and average the word vectors.

        Returns:
            Array with one row per document. A document without any word
            known to the model is mapped to a vector of ``self.dim`` zeros.
        """
        token_lists = MyTokenizer().fit_transform(X)

        rows = []
        for tokens in token_lists:
            known_vectors = [
                self.word2vec.wv[token]
                for token in tokens
                if token in self.word2vec.wv
            ]
            if not known_vectors:
                known_vectors = [np.zeros(self.dim)]
            rows.append(np.mean(known_vectors, axis=0))
        return np.array(rows)

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform(X)`` since there is nothing to fit."""
        return self.transform(X)

In [19]:
# Build, for every word2vec model, the mean-embedding features of train and
# test, join them to the meta features and checkpoint the result as a sparse
# matrix. All intermediate results are kept in dictionaries keyed by model name.
mean_embedding_vectorizer = {}
mean_embedded_train, mean_embedded_test = {}, {}
df_embed_tr, df_embed_te = {}, {}
df_embedding_tr, df_embedding_te = {}, {}
train_w2v, test_w2v = {}, {}
np_w2v_train, np_w2v_test = {}, {}
ssp_w2v_train, ssp_w2v_test = {}, {}

for name in w2v:
    # 1. Average the word vectors of every document.
    mean_embedding_vectorizer[name] = MeanEmbeddingVectorizer(w2v[name])
    mean_embedded_train[name] = mean_embedding_vectorizer[name].fit_transform(train['Text'])
    mean_embedded_test[name] = mean_embedding_vectorizer[name].fit_transform(test['Text'])

    # 2. Wrap the arrays in DataFrames (one column per embedding dimension).
    df_embed_tr[name] = pd.DataFrame(mean_embedded_train[name])
    df_embed_te[name] = pd.DataFrame(mean_embedded_test[name])

    # 3. Expose the row position as an "ID" column to join on.
    df_embedding_tr[name] = df_embed_tr[name].reset_index().rename(columns={"index": "ID"})
    df_embedding_te[name] = df_embed_te[name].reset_index().rename(columns={"index": "ID"})

    # 4. Join with the meta features.
    # NOTE(review): this assumes the "ID" column of train_cl / test_cl equals
    # the 0-based row position; rows with any other ID would be dropped by
    # the inner merge. Confirm against the input files.
    train_w2v[name] = pd.merge(train_cl, df_embedding_tr[name], on="ID")
    test_w2v[name] = pd.merge(test_cl, df_embedding_te[name], on="ID")

    # 5. Drop the join key and convert to CSC sparse matrices.
    np_w2v_train[name] = np.array(train_w2v[name].drop("ID", axis=1))
    np_w2v_test[name] = np.array(test_w2v[name].drop("ID", axis=1))
    ssp_w2v_train[name] = ssp.csc_matrix(np_w2v_train[name])
    ssp_w2v_test[name] = ssp.csc_matrix(np_w2v_test[name])

    # 6. Checkpoint to disk.
    ssp.save_npz("checkpoints_databases/w_working_train_w2v_"+name+".npz", ssp_w2v_train[name])
    ssp.save_npz("checkpoints_databases/w_working_test_w2v_"+name+".npz", ssp_w2v_test[name])

In [33]:
# Preview only the first rows of the 200-dimension mean embeddings instead of
# rendering the whole frame in the notebook output.
df_embed_tr["w2v_200"].head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,-0.057405,0.197566,0.527322,0.130546,0.293730,0.164782,0.048901,-0.338855,0.234756,0.312049,...,-0.398163,0.317923,0.039845,0.201423,0.119841,0.257390,-0.462988,0.176000,-0.027966,0.653740
1,-0.126027,0.266798,0.449233,0.390345,0.313293,0.051888,0.167819,-0.052928,0.000801,0.043154,...,-0.169994,0.652684,0.081891,-0.218189,0.112043,0.695898,-0.164660,0.022291,-0.836819,0.787693
2,-0.117264,0.264506,0.452685,0.388457,0.314592,0.043760,0.160930,-0.062269,-0.003007,0.047831,...,-0.172663,0.654870,0.081686,-0.208102,0.107386,0.703997,-0.167303,0.023559,-0.828714,0.783131
3,-0.236825,0.270087,0.835019,0.604167,0.183494,-0.004426,0.378956,0.670836,0.672299,0.104886,...,-0.523348,0.380097,-0.415965,-0.596954,0.267313,1.296499,-0.167477,0.030287,-0.277649,1.102420
4,0.012049,0.051558,0.317926,0.376385,0.357085,-0.328691,0.067617,0.504810,0.673324,0.298689,...,-0.613144,0.372948,-0.542382,-0.156690,0.648330,1.191289,-0.304821,0.154658,-0.483035,0.584619
5,0.012119,0.049389,0.314572,0.373265,0.355185,-0.335367,0.065497,0.508105,0.673907,0.306013,...,-0.610415,0.365746,-0.549868,-0.154889,0.645622,1.195627,-0.312431,0.161748,-0.489876,0.581025
6,-0.023987,-0.117596,0.141155,0.237709,0.475287,-0.199513,-0.117329,0.089067,0.545177,0.161861,...,-0.416091,0.508044,-0.482833,-0.202231,0.213532,1.136387,-0.203656,0.374505,-1.109121,0.870974
7,-0.269782,0.033860,0.338151,0.398014,0.487732,0.077372,0.167809,0.123897,0.263821,-0.045762,...,-0.286819,0.255130,0.013121,-0.074305,0.160226,0.397054,-0.307087,-0.059110,-0.299057,0.884539
8,0.034730,0.491386,0.248449,0.014238,0.370917,0.309910,-0.025118,0.045255,0.707751,0.805179,...,-0.590055,0.350958,-0.230090,0.127338,0.318473,0.947984,-0.696691,0.225549,-0.363246,0.771461
9,0.049545,0.540370,0.253687,0.016476,0.414097,0.302729,-0.128461,-0.003493,0.682857,0.798955,...,-0.529122,0.319444,-0.195476,0.163914,0.289975,0.913162,-0.759063,0.252478,-0.390039,0.750178


# Now TF-IDF + truncated SVD (100, 200 and 300 components)

In [20]:

# Word-level TF-IDF over 1- to 3-grams. The vocabulary and IDF weights are
# learned from the training text only and then applied to both splits.
tfidf = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w+',
    ngram_range=(1,3),
    lowercase=False,
    strip_accents=None,
    min_df=10,
    max_features=10000,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
tfidf.fit(train["Text"])

X_train_text = tfidf.transform(train["Text"])
X_test_text = tfidf.transform(test["Text"])

In [21]:
# Reduce the TF-IDF matrix with truncated SVD at several ranks. Each SVD is
# fitted on the training matrix and then applied to the test matrix.
tsvd_train = {}
tsvd_test = {}
list_comp = [100, 200, 300]
dic_svd = {}

# Per-rank frames are collected and concatenated once at the end. Iterating
# over list_comp (not over a dict) fixes the column order, which the
# following cell relies on, and avoids growing a DataFrame column by column.
train_parts = []
test_parts = []
for comp in list_comp:
    svd_key = str(comp)
    dic_svd[svd_key] = TruncatedSVD(n_components=comp, n_iter=25, random_state=26)
    tsvd_train[svd_key] = dic_svd[svd_key].fit_transform(X_train_text)
    tsvd_test[svd_key] = dic_svd[svd_key].transform(X_test_text)

    # Column names follow the pattern tsvd_<rank>_<component index>.
    component_names = ['tsvd_' + svd_key + "_" + str(i) for i in range(comp)]
    train_parts.append(pd.DataFrame(tsvd_train[svd_key], columns=component_names))
    test_parts.append(pd.DataFrame(tsvd_test[svd_key], columns=component_names))

X_train = pd.concat(train_parts, axis=1)
X_test = pd.concat(test_parts, axis=1)

In [22]:
def _svd_columns(frame, n_components):
    """Return the columns of ``frame`` named ``tsvd_<n_components>_<i>``.

    Selecting by name rather than by position keeps each block correct
    whatever order the SVD columns were inserted in.
    """
    prefix = "tsvd_" + str(n_components) + "_"
    return frame.loc[:, [col for col in frame.columns if col.startswith(prefix)]]


# One feature block per SVD rank.
X_train_100 = _svd_columns(X_train, 100)
X_train_200 = _svd_columns(X_train, 200)
X_train_300 = _svd_columns(X_train, 300)
X_test_100 = _svd_columns(X_test, 100)
X_test_200 = _svd_columns(X_test, 200)
X_test_300 = _svd_columns(X_test, 300)

# Stack the meta features (without the ID column) next to each SVD block and
# checkpoint the result as a CSC sparse matrix.
dic_train = {}
dic_test = {}
svd_blocks = [
    ("tsvd_100", X_train_100, X_test_100),
    ("tsvd_200", X_train_200, X_test_200),
    ("tsvd_300", X_train_300, X_test_300),
]
for name, train_block, test_block in svd_blocks:
    dic_train[name] = ssp.hstack((train_cl.drop("ID", axis=1), train_block), format="csc")
    dic_test[name] = ssp.hstack((test_cl.drop("ID", axis=1), test_block), format="csc")
for name in dic_train:
    ssp.save_npz("checkpoints_databases/w_working_train_tfidf_"+name+".npz", dic_train[name])
    ssp.save_npz("checkpoints_databases/w_working_test_tfidf_"+name+".npz", dic_test[name])

# Now Doc2Vec

In [23]:
def constructLabeledSentences(data):
    """Wrap every document of a Series in a gensim ``LabeledSentence``.

    Args:
        data: pandas Series of documents. Each value is converted with
            ``utils.to_unicode`` and split on whitespace to obtain the words.

    Returns:
        list: One ``LabeledSentence`` per entry, tagged ``'Text_<index>'``
        where ``<index>`` is the entry's label in the Series index.
    """
    sentences = []
    # Series.items() is used instead of iteritems(), which pandas 2.0 removed.
    for index, row in data.items():
        sentences.append(LabeledSentence(utils.to_unicode(row).split(), ['Text' + '_%s' % str(index)]))
    return sentences

train_sentences = constructLabeledSentences(train['Text'])
test_sentences = constructLabeledSentences(test['Text'])

Text_dim=[100,200,300]

# gensim has no "-1 means all cores" convention: with a non-positive worker
# count no training thread is started and the vectors stay at their initial
# values. Use an explicit positive count instead.
# Note: gensim documents that exactly reproducible runs additionally require
# workers=1, even when a seed is given.
d2v_workers = max(1, os.cpu_count() or 1)


def _train_doc2vec(sentences, size):
    """Build the vocabulary from ``sentences`` and train a Doc2Vec model of dimension ``size``."""
    model = Doc2Vec(min_count=1, window=10, size=size, sample=1e-4, negative=5,
                    workers=d2v_workers, iter=5, seed=26)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    return model


# NOTE(review): train and test documents are embedded by independently
# trained models, so their vector coordinates are not aligned with each
# other. Confirm this is intended before feeding both to one classifier.
d2v_train={}
d2v_test={}
for size in Text_dim:
    d2v_train["d2v_"+str(size)] = _train_doc2vec(train_sentences, size)
for size in Text_dim:
    d2v_test["d2v_"+str(size)] = _train_doc2vec(test_sentences, size)

In [24]:
# Copy the learned document vectors into dense (n_documents, size) arrays,
# one pair of arrays per Doc2Vec dimension.
d2v_train_arrays={}
d2v_test_arrays={}
for size in Text_dim:
    name = "d2v_"+str(size)
    d2v_train_arrays[name] = np.zeros((len(train), size))
    d2v_test_arrays[name] = np.zeros((len(test), size))

    # Train and test are filled in separate loops: zipping the two ranges
    # together stops at the shorter split and leaves the remaining rows of
    # the longer one as zeros.
    # Tags were built as 'Text_<index label>', so the index labels of each
    # frame are used to look the vectors up.
    for row, doc_index in enumerate(train.index):
        d2v_train_arrays[name][row] = d2v_train[name].docvecs['Text_'+str(doc_index)]
    for row, doc_index in enumerate(test.index):
        d2v_test_arrays[name][row] = d2v_test[name].docvecs['Text_'+str(doc_index)]
    

In [25]:
# Stack the meta features (without the ID column) next to the Doc2Vec vectors
# of each dimension and checkpoint the result as a CSC sparse matrix.
X_train={}
X_test={}
for name in d2v_train:
    X_train[name]=ssp.hstack((train_cl.drop("ID",axis=1), d2v_train_arrays[name]),format="csc")
    X_test[name]=ssp.hstack((test_cl.drop("ID",axis=1), d2v_test_arrays[name]),format="csc")
    print(X_train[name].shape,X_test[name].shape)
for name in d2v_train:
    # The model name is part of the file name, as for the w2v and tfidf
    # checkpoints; without it every dimension overwrote the same file.
    ssp.save_npz("checkpoints_databases/w_working_train_d2v_"+name+".npz",X_train[name])
    ssp.save_npz("checkpoints_databases/w_working_test_d2v_"+name+".npz",X_test[name])


(3689, 142) (986, 142)
(3689, 242) (986, 242)
(3689, 342) (986, 342)
