In [118]:
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import gensim
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from gensim import utils
import os
import nltk
import scipy.sparse as ssp

In [119]:
# Load the working train/test checkpoints (UTF-8 encoded CSV files).
train = pd.read_csv(
    "checkpoints_databases/w_working_train.csv",
    encoding="utf8",
)
test = pd.read_csv(
    "checkpoints_databases/w_working_test.csv",
    encoding="utf8",
)

In [120]:
# Drop the Variation, Text, Class and Gene columns; every remaining column is
# reused below as the shared feature block concatenated with each embedding.
_non_feature_cols = ["Variation", "Text", "Class", "Gene"]
train_cl = train.drop(_non_feature_cols, axis=1)
test_cl = test.drop(_non_feature_cols, axis=1)

In [121]:
# Replace the stored "ID" column of the training frame by the row position:
# reset_index() materialises the index as a leading "index" column, the old
# "ID" is dropped, and "index" is renamed to "ID".
train_cl = (
    train_cl
    .reset_index()
    .drop("ID", axis=1)
    .rename(columns={"index": "ID"})
)

In [122]:
# Persist the shared feature blocks.
# NOTE: the DataFrame index is written too (no index=False here, unlike the
# other checkpoints saved in this notebook).
for _frame, _path in (
    (train_cl, "w_meta_features/meta_train_l1l2.csv"),
    (test_cl, "w_meta_features/meta_test_l1l2.csv"),
):
    _frame.to_csv(_path)

# Here we add features from a retrained word2vec model, averaging the word vectors of each document

In [123]:
class MySentences(object):
    """Re-iterable stream of tokenized sentences.

    Every document of every supplied array is split into sentences with
    ``nltk.sent_tokenize`` and each sentence is yielded as the token list
    returned by ``nltk.word_tokenize``. A new pass over the data starts each
    time the object is iterated.

    Args:
        *arrays: One or more iterables whose elements are documents.
    """
    def __init__(self, *arrays):
        self.arrays = arrays

    def __iter__(self):
        # Walk the arrays in the order given, one document at a time, without
        # materialising the whole corpus in memory.
        documents = (doc for batch in self.arrays for doc in batch)
        for doc in documents:
            yield from map(nltk.word_tokenize, nltk.sent_tokenize(doc))

def get_word2vec(sentences, location, size):
    """Load a cached word2vec model from disk, or train one and cache it.

    Args:
        sentences: iterable of tokenized sentences, only consumed when no
            model file exists at ``location``.

        location (str): Path to save/load word2vec

        size: value forwarded as the ``size`` argument of
            ``gensim.models.Word2Vec`` when a new model is trained.

    Returns:
        The model loaded from ``location`` if that path exists, otherwise the
        newly trained model (which is saved to ``location`` first).
    """
    if not os.path.exists(location):
        print('{} not found. training model'.format(location))
        trained = gensim.models.Word2Vec(
            sentences, size=size, window=5, min_count=5, workers=4
        )
        print('Model done training. Saving to disk')
        trained.save(location)
        return trained

    print('Found {}'.format(location))
    return gensim.models.Word2Vec.load(location)

In [124]:
# Duplicated spaces must be removed before word2vec training: every run of
# whitespace in a document is collapsed to a single space.
def _normalize_spaces(documents):
    """Return the documents with each whitespace run replaced by one space."""
    return [" ".join(text.split()) for text in documents]


train["Text"] = _normalize_spaces(train["Text"].values)
test["Text"] = _normalize_spaces(test["Text"].values)

In [None]:
# One word2vec model per embedding size, keyed "w2v_<size>". Each model is
# loaded from "w2v_features<size>" when that file exists, otherwise it is
# trained on the training texts and saved there (see get_word2vec).
number_w2v = [100, 200, 300]
w2v = {}
for size in number_w2v:
    model_key = "w2v_"+str(size)
    model_path = "w2v_features"+str(size)
    corpus = MySentences(train["Text"].values)
    w2v[model_key] = get_word2vec(corpus, model_path, size)

w2v_features100 not found. training model


In [38]:
class MyTokenizer:
    """Stateless transformer turning raw documents into arrays of tokens.

    Each document is split into sentences with ``nltk.sent_tokenize``; the
    tokens returned by ``nltk.word_tokenize`` for every sentence are
    concatenated into a single token array per document.
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Tokenize every document of ``X``.

        Args:
            X: iterable of documents.

        Returns:
            A 1-D ``numpy`` array of dtype ``object`` with one entry per
            document, each entry being a ``numpy`` array of that document's
            tokens.
        """
        tokenized_docs = []
        for document in X:
            tokens = []
            for sent in nltk.sent_tokenize(document):
                tokens.extend(nltk.word_tokenize(sent))
            tokenized_docs.append(np.array(tokens))

        # Documents have different token counts, so the result is ragged.
        # np.array(list_of_ragged_arrays) raises ValueError on recent NumPy
        # releases (and would silently build a 2-D array when all lengths
        # happen to match), so an object array is filled element by element.
        result = np.empty(len(tokenized_docs), dtype=object)
        for position, doc_tokens in enumerate(tokenized_docs):
            result[position] = doc_tokens
        return result

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform(X)`` since ``fit`` is a no-op."""
        return self.transform(X)

class MeanEmbeddingVectorizer(object):
    """Represent each document by the mean of its known word vectors.

    Args:
        word2vec: trained model exposing its keyed vectors as ``.wv``
            (supports ``word in wv`` and ``wv[word]``).
    """
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        keyed_vectors = word2vec.wv
        dim = getattr(keyed_vectors, "vector_size", None)
        if not dim:
            # Legacy fallback: read the width of the raw embedding matrix.
            # `syn0` is deprecated and absent from recent gensim releases,
            # so it is only consulted when `vector_size` is unavailable.
            dim = len(keyed_vectors.syn0[0])
        self.dim = int(dim)

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return one mean-embedding row per document of ``X``.

        Documents are tokenized with ``MyTokenizer``; tokens missing from the
        vocabulary are ignored. A document without any known token maps to a
        zero vector of length ``self.dim``.

        Returns:
            ``numpy`` array of shape ``(n_documents, self.dim)``.
        """
        keyed_vectors = self.word2vec.wv
        rows = []
        for words in MyTokenizer().fit_transform(X):
            known = [keyed_vectors[w] for w in words if w in keyed_vectors]
            if known:
                rows.append(np.mean(known, axis=0))
            else:
                rows.append(np.zeros(self.dim))
        if not rows:
            # Keep the column dimension even when there is no document.
            return np.zeros((0, self.dim))
        return np.array(rows)

    def fit_transform(self, X, y=None):
        """Equivalent to ``transform(X)`` since ``fit`` is a no-op."""
        return self.transform(X)

In [93]:
# Build, for every word2vec model, the mean-embedding features of the train
# and test texts and append them to the shared feature block.
#
# Fix: the embedding frames used to be concatenated *with* an extra positional
# "ID" column although train_cl/test_cl already carry an "ID" column, which
# produced two columns named "ID" in the saved checkpoints (and, on the test
# side, a second ID that disagrees with the real one). The features are now
# concatenated without that redundant column, consistent with the TF-IDF and
# Doc2Vec feature sets built later in this notebook.
mean_embedding_vectorizer = {}
mean_embedded_train = {}
mean_embedded_test = {}
df_embed_tr = {}
df_embed_te = {}
df_embedding_tr = {}
df_embedding_te = {}
train_w2v = {}
test_w2v = {}
for name, model in w2v.items():
    vectorizer = MeanEmbeddingVectorizer(model)
    mean_embedding_vectorizer[name] = vectorizer
    mean_embedded_train[name] = vectorizer.fit_transform(train['Text'])
    mean_embedded_test[name] = vectorizer.fit_transform(test['Text'])

    df_embed_tr[name] = pd.DataFrame(mean_embedded_train[name])
    df_embed_te[name] = pd.DataFrame(mean_embedded_test[name])

    # Variants carrying the row position as "ID"; kept under their original
    # names for any later cell, but no longer used for the checkpoints.
    df_embedding_tr[name] = df_embed_tr[name].reset_index().rename(columns={"index": "ID"})
    df_embedding_te[name] = df_embed_te[name].reset_index().rename(columns={"index": "ID"})

    # Column-wise concatenation aligns on the index, so both sides must
    # describe the same rows.
    assert len(df_embed_tr[name]) == len(train_cl)
    assert len(df_embed_te[name]) == len(test_cl)
    train_w2v[name] = pd.concat((train_cl, df_embed_tr[name]), axis=1)
    test_w2v[name] = pd.concat((test_cl, df_embed_te[name]), axis=1)


In [117]:
# Write one train and one test checkpoint per word2vec model (index omitted).
for name in w2v:
    train_path = "checkpoints_databases/w_working_train_"+name+".csv"
    test_path = "checkpoints_databases/w_working_test_"+name+".csv"
    train_w2v[name].to_csv(train_path, index=False)
    test_w2v[name].to_csv(test_path, index=False)

# Now TF-IDF + truncated SVD (100, 200 and 300 components)

In [20]:

# TF-IDF over word 1- to 3-grams: case is preserved, accents are kept, terms
# must occur in at least 10 documents and the vocabulary is capped at 10000
# features; term frequencies are sub-linearly scaled and idf is smoothed.
tfidf = TfidfVectorizer(
    min_df=10,
    max_features=10000,
    strip_accents=None,
    lowercase=False,
    analyzer='word',
    token_pattern=r'\w+',
    ngram_range=(1, 3),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
# The vocabulary and idf weights are learned on the training texts only.
tfidf.fit(train["Text"])

X_train_text = tfidf.transform(train["Text"])
X_test_text = tfidf.transform(test["Text"])

In [102]:
# Project the TF-IDF matrices with truncated SVD for several component counts.
# Each SVD is fitted on the training matrix and then applied to the test one.
tsvd_train = {}
tsvd_test = {}
list_comp = [100, 200, 300]
dic_svd = {}
for comp in list_comp:
    key = str(comp)
    dic_svd[key] = TruncatedSVD(n_components=comp, n_iter=25, random_state=26)
    tsvd_train[key] = dic_svd[key].fit_transform(X_train_text)
    tsvd_test[key] = dic_svd[key].transform(X_test_text)


def _tsvd_frame(projections, components):
    """Assemble the projections into one frame, ordered as ``components``.

    Columns are named ``tsvd_<n_components>_<i>``. Every block is created in
    one shot and concatenated once, instead of inserting hundreds of columns
    one by one into a growing DataFrame (slow, and reported by pandas as a
    fragmented frame). Ordering follows ``components`` rather than dict
    iteration order, so positional consumers see a stable layout.
    """
    blocks = []
    for comp in components:
        key = str(comp)
        names = ['tsvd_' + key + "_" + str(i) for i in range(comp)]
        blocks.append(pd.DataFrame(projections[key], columns=names))
    if not blocks:
        return pd.DataFrame()
    return pd.concat(blocks, axis=1)


X_train = _tsvd_frame(tsvd_train, list_comp)
X_test = _tsvd_frame(tsvd_test, list_comp)

In [104]:
def _tsvd_block(frame, comp):
    """Return the ``tsvd_<comp>_*`` columns of ``frame``.

    Selecting by column name instead of hard-coded positional offsets keeps
    the split correct whatever the column order of ``frame`` is; the assertion
    fails loudly if the expected number of components is not present.
    """
    prefix = "tsvd_" + str(comp) + "_"
    selected = [col for col in frame.columns if str(col).startswith(prefix)]
    assert len(selected) == comp, "expected %d columns starting with %r" % (comp, prefix)
    return frame[selected]


X_train_100 = _tsvd_block(X_train, 100)
X_train_200 = _tsvd_block(X_train, 200)
X_train_300 = _tsvd_block(X_train, 300)
X_test_100 = _tsvd_block(X_test, 100)
X_test_200 = _tsvd_block(X_test, 200)
X_test_300 = _tsvd_block(X_test, 300)

# One feature set per component count: shared feature block + SVD components.
dic_train = {}
dic_test = {}
for comp, train_block, test_block in (
    (100, X_train_100, X_test_100),
    (200, X_train_200, X_test_200),
    (300, X_train_300, X_test_300),
):
    name = "tsvd_" + str(comp)
    dic_train[name] = pd.concat((train_cl, train_block), axis=1)
    dic_test[name] = pd.concat((test_cl, test_block), axis=1)

In [108]:
# Write one train and one test checkpoint per SVD size (index omitted).
for name in dic_train:
    train_path = "checkpoints_databases/w_working_train_tfidf_"+name+".csv"
    test_path = "checkpoints_databases/w_working_test_tfidf_"+name+".csv"
    dic_train[name].to_csv(train_path, index=False)
    dic_test[name].to_csv(test_path, index=False)

# Now Doc2Vec

In [23]:
def constructLabeledSentences(data):
    """Wrap every text of a Series into a tagged document for Doc2Vec.

    Args:
        data: pandas Series of texts.

    Returns:
        list with one ``LabeledSentence`` per entry, whose words are the
        whitespace-split text and whose single tag is ``'Text_<index label>'``.
    """
    # Series.items() replaces Series.iteritems(), which newer pandas releases
    # no longer provide; both yield (index label, value) pairs.
    return [
        LabeledSentence(utils.to_unicode(text).split(), ['Text' + '_%s' % str(index)])
        for index, text in data.items()
    ]


def _train_doc2vec(sentences, size, workers=4):
    """Build the vocabulary and train one Doc2Vec model of dimension ``size``.

    ``workers`` must be a positive thread count: gensim starts one training
    thread per worker, so the previous ``workers=-1`` started none and left
    the vectors untrained. The default matches the word2vec training of this
    notebook; use ``workers=1`` if run-to-run identical vectors are required
    (see the gensim notes on ``seed``).
    """
    model = Doc2Vec(min_count=1, window=10, size=size, sample=1e-4, negative=5,
                    workers=workers, iter=5, seed=26)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    return model


train_sentences = constructLabeledSentences(train['Text'])
test_sentences = constructLabeledSentences(test['Text'])

# NOTE(review): train and test vectors come from two independently trained
# models, so their coordinates are not guaranteed to be comparable. Consider
# training a single model and inferring the test vectors - TODO confirm.
Text_dim = [100, 200, 300]
d2v_train = {}
d2v_test = {}
for size in Text_dim:
    d2v_train["d2v_"+str(size)] = _train_doc2vec(train_sentences, size)
for size in Text_dim:
    d2v_test["d2v_"+str(size)] = _train_doc2vec(test_sentences, size)

In [24]:
def _docvec_matrix(model, n_docs, dim):
    """Stack the vectors tagged 'Text_0' .. 'Text_<n_docs-1>' of ``model``.

    Returns a float array of shape ``(n_docs, dim)``.
    """
    matrix = np.zeros((n_docs, dim))
    for row in range(n_docs):
        matrix[row] = model.docvecs['Text_' + str(row)]
    return matrix


# Fix: train and test were previously filled inside a single
# zip(range(len(train)), range(len(test))) loop, which stops at the shorter of
# the two lengths and left every remaining row of the longer set at zero.
# Each matrix is now filled over its own full length.
d2v_train_arrays = {}
d2v_test_arrays = {}
for dim in (100, 200, 300):
    key = "d2v_" + str(dim)
    d2v_train_arrays[key] = _docvec_matrix(d2v_train[key], len(train), dim)
    d2v_test_arrays[key] = _docvec_matrix(d2v_test[key], len(test), dim)
    

In [109]:
# Append the Doc2Vec vectors to the shared feature block, report the resulting
# shapes, then write one train/test checkpoint per model (index omitted).
X_train_d2v = {}
X_test_d2v = {}
for name in d2v_train:
    train_vectors = pd.DataFrame(d2v_train_arrays[name])
    test_vectors = pd.DataFrame(d2v_test_arrays[name])
    X_train_d2v[name] = pd.concat((train_cl, train_vectors), axis=1)
    X_test_d2v[name] = pd.concat((test_cl, test_vectors), axis=1)
    print(X_train_d2v[name].shape,X_test_d2v[name].shape)
for name in X_train_d2v:
    train_path = "checkpoints_databases/w_working_train_"+name+".csv"
    test_path = "checkpoints_databases/w_working_test_"+name+".csv"
    X_train_d2v[name].to_csv(train_path, index=False)
    X_test_d2v[name].to_csv(test_path, index=False)


(3689, 143) (986, 143)
(3689, 243) (986, 243)
(3689, 343) (986, 343)


In [115]:
# Preview only the first rows instead of rendering the whole frame.
test_w2v["w2v_100"].head(10)

Unnamed: 0,ID,Substitutions_var,Stop_codon_var,gene_fusion_var,Deletion_var,del_or_ins_var,Fusion_var,Amplification_var,Truncation_var,exon_var,...,90,91,92,93,94,95,96,97,98,99
0,1,1,0,0,0,0,0,0,0,0,...,-0.449496,0.587597,-0.196433,-0.118292,-0.275072,0.373901,-0.386638,0.200778,0.222391,1.027323
1,2,0,0,0,0,0,0,0,1,0,...,-0.273759,0.488260,-0.171580,-0.154771,-0.155976,0.564686,-0.454898,0.164387,-0.488425,0.812096
2,3,1,0,0,0,0,0,0,0,0,...,-0.440092,0.319991,-0.197175,0.003191,-0.187051,0.438474,-0.500932,0.103333,-0.006287,0.921059
3,4,1,0,0,0,0,0,0,0,0,...,-0.566799,0.332304,-0.443873,-0.170687,0.019462,0.481697,-0.380977,0.114232,0.000228,0.917906
4,5,0,0,0,0,1,0,0,0,0,...,-0.427703,0.324021,-0.114971,0.277837,-0.165568,0.323747,-0.655890,0.203359,0.229903,1.033422
5,6,1,0,0,0,0,0,0,0,0,...,-0.383829,0.433802,-0.129014,-0.121596,0.069659,0.521747,-0.394767,0.169891,-0.260955,0.785261
6,7,1,0,0,0,0,0,0,0,0,...,-0.788942,1.109889,-0.760017,-0.628782,0.316051,1.391335,0.033703,0.335547,0.616032,1.079194
7,8,1,0,0,0,0,0,0,0,0,...,-0.369160,0.446673,-0.181225,0.115796,0.058878,0.552782,-0.462485,0.258370,-0.155506,0.748431
8,9,1,0,0,0,0,0,0,0,0,...,-0.220771,0.743434,-0.516956,-0.465282,0.539460,0.915681,-0.112725,-0.095157,0.323692,0.318403
9,10,1,0,0,0,0,0,0,0,0,...,-0.372665,0.483980,-0.248830,-0.038577,-0.062850,0.699067,-0.537655,0.193569,-0.142377,0.757672
