In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import gensim
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from gensim import utils
import os
import nltk
import scipy.sparse as ssp
from nltk.tokenize import sent_tokenize, word_tokenize

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
from nltk import pos_tag

Using TensorFlow backend.


In [29]:
train=pd.read_csv("checkpoints_databases/new_working_train.csv",encoding="utf8")
test=pd.read_csv("checkpoints_databases/new_working_test.csv",encoding="utf8")

In [31]:
#Stock as input features for meta model
train_cl=train.drop(["Variation","Full_Text","Window_Text","Class","Gene","Text_words"],axis=1)
test_cl=test.drop(["Text","Class","Full_Text","Window_Text","Variation","Gene","Text_words"],axis=1)


In [33]:
train_cl.to_csv("w_meta_features/meta_train_l2.csv",index=False)
test_cl.to_csv("w_meta_features/meta_test_l2.csv",index=False)

In [34]:
data_all=pd.concat((train,test)).reset_index(drop=True)

In [36]:
stop = set(stopwords.words('english'))
exclude = set('.,!"#$%&\'()*+:;<=>?@[\\]^_`{|}')
lemma = WordNetLemmatizer()
def clean(doc,lemmatiz=False):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free_0 =[re.sub(",|\.|/"," ",ch) for ch in stop_free]
    punc_free_1= re.sub(r'\s\d+\s',' ',"".join(punc_free_0))
    if lemmatiz==True:
        lem=[]
        punc_free_lem="".join(ch for ch in punc_free_0 if ch not in exclude)
        for word,tag in pos_tag(word_tokenize(punc_free_lem)):
            wntag=tag[0].lower()
            wntag=wntag if wntag in ["a","r","n","v"] else None
            if not wntag:
                lem.append(word)
            else:
                lem.append(lemma.lemmatize(word,wntag))
        normalized=" ".join(word for word in lem)
        return normalized


In [37]:
data_all.Text = [clean(doc, lemmatiz=True) for doc in data_all.Text]  

In [9]:
train = data_all.iloc[:len(train)]
test = data_all.iloc[len(train):]

In [13]:
for i in range(len(data_all)):
    if (len(data_all.Text[i])<300):
        print (i)

231
233
235
450
461
841
1078
1255
2166
2220
2484
2979
3107


# Here we will add features from word2vec retrained then get the mean for the sentences

In [32]:
class MySentences(object):
    """MySentences is a generator to produce a list of tokenized sentences 
    
    Takes a list of numpy arrays containing documents.
    
    Args:
        arrays: List of arrays, where each element in the array contains a document.
    """
    def __init__(self, *arrays):
        self.arrays = arrays
 
    def __iter__(self):
        for array in self.arrays:
            for document in array:
                for sent in nltk.sent_tokenize(document):
                    yield nltk.word_tokenize(sent)

def get_word2vec(sentences, location,size):
    """Returns trained word2vec
    
    Args:
        sentences: iterator for sentences
        
        location (str): Path to save/load word2vec
    """
    if os.path.exists(location):
        print('Found {}'.format(location))
        model = gensim.models.Word2Vec.load(location)
        return model
    
    print('{} not found. training model'.format(location))
    model = gensim.models.Word2Vec(sentences, size=size, window=5, min_count=5, workers=4)
    print('Model done training. Saving to disk')
    model.save(location)
    return model

In [33]:
#It's important to remove duplicated spaces for word2vec learning !
train["Text"]=[" ".join(doc.split()) for doc in train["Text"].values]
test["Text"]=[" ".join(doc.split()) for doc in test["Text"].values]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [34]:
number_w2v=[100,200,300]
w2v={}
for size in number_w2v:
    w2v["w2v_"+str(size)] = get_word2vec(
        MySentences(
            train["Text"].values),"new_w2v_features"+str(size),size
    )

new_w2v_features100 not found. training model
Model done training. Saving to disk
new_w2v_features200 not found. training model
Model done training. Saving to disk
new_w2v_features300 not found. training model
Model done training. Saving to disk


In [39]:
class MyTokenizer:
    def __init__(self):
        pass
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        transformed_X = []
        for document in X:
            tokenized_doc = []
            for sent in nltk.sent_tokenize(document):
                tokenized_doc += nltk.word_tokenize(sent)
            transformed_X.append(np.array(tokenized_doc))
        return np.array(transformed_X)
    
    def fit_transform(self, X, y=None):
        return self.transform(X)

class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = len(word2vec.wv.syn0[0])

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = MyTokenizer().fit_transform(X)
        
        return np.array([
            np.mean([self.word2vec.wv[w] for w in words if w in self.word2vec.wv]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])
    
    def fit_transform(self, X, y=None):
        return self.transform(X)

In [40]:
mean_embedding_vectorizer={}
mean_embedded_train={}
mean_embedded_test={}
for name in w2v:
    mean_embedding_vectorizer[name] = MeanEmbeddingVectorizer(w2v[name])
    mean_embedded_train[name] = mean_embedding_vectorizer[name].fit_transform(train['Text'])
    mean_embedded_test[name] = mean_embedding_vectorizer[name].fit_transform(test['Text'])
df_embed_tr={}
df_embed_te={}
for name in w2v:
    df_embed_tr[name]=pd.DataFrame(mean_embedded_train[name])
    df_embed_te[name]=pd.DataFrame(mean_embedded_test[name])
train_w2v={}
test_w2v={}
for name in w2v:
    train_w2v[name]=df_embed_tr[name]
    test_w2v[name]=df_embed_te[name]


In [41]:
#not sure if ID is on the column, if not just suppr the drop
for name in w2v:
    train_w2v[name].drop("ID",axis=1).to_csv("checkpoints_databases/new_working_train_"+name+".csv",index=False)
    test_w2v[name].drop("ID",axis=1).to_csv("checkpoints_databases/new_working_test_"+name+".csv",index=False)

# Now TFIDF +300tsvd

In [42]:
tfidf = TfidfVectorizer(
        min_df=3, max_features=8000, strip_accents=None, lowercase = False,
        analyzer='word', token_pattern=r'\w+', ngram_range=(1,3), use_idf=True,
        smooth_idf=True, sublinear_tf=True
        ).fit(train["Text"])

X_train_text = tfidf.transform(train["Text"])
X_test_text = tfidf.transform(test["Text"])

In [51]:
tfidf_names =tfidf.get_feature_names()
tfidf_names

['a',
 'acid',
 'activate',
 'activation',
 'activity',
 'al',
 'alk',
 'also',
 'amplification',
 'analysis',
 'associate',
 'b',
 'bind',
 'braf',
 'brca',
 'c',
 'cancer',
 'case',
 'cell',
 'cell line',
 'clinical',
 'compare',
 'd',
 'data',
 'deletion',
 'detect',
 'different',
 'dna',
 'domain',
 'e',
 'effect',
 'egfr',
 'egfr mutation',
 'et',
 'et al',
 'exon',
 'express',
 'expression',
 'f',
 'fgfr',
 'fig',
 'figure',
 'find',
 'flt',
 'function',
 'fusion',
 'g',
 'gefitinib',
 'gene',
 'human',
 'identify',
 'include',
 'increase',
 'inhibitor',
 'insertion',
 'kinase',
 'kit',
 'level',
 'line',
 'lung',
 'may',
 'missense',
 'mutant',
 'mutation',
 'n',
 'number',
 'observe',
 'one',
 'p',
 'patient',
 'placeholdermutation',
 'previously',
 'protein',
 'receptor',
 'region',
 'report',
 'residue',
 'resistance',
 'response',
 'result',
 's',
 'sample',
 'sequence',
 'show',
 'site',
 'study',
 'substitution',
 'suggest',
 't',
 'table',
 'three',
 'tumor',
 'two',
 'ty

In [43]:
tsvd_train= {}
tsvd_test={}
list_comp=[100,200,300]
dic_svd={}
for comp in list_comp:
    dic_svd[str(comp)]=TruncatedSVD(n_components=comp,n_iter=25,random_state=26)
for svd in dic_svd:
    tsvd_train[svd]=dic_svd[svd].fit_transform(X_train_text)
    tsvd_test[svd]=dic_svd[svd].transform(X_test_text)
X_train=pd.DataFrame()
X_test=pd.DataFrame()
for n in dic_svd:
    for i in range(int(n)):
        X_train['tsvd_' +str(n)+"_"+str(i)] = tsvd_train[n][:, i]
        X_test['tsvd_' +str(n)+"_"+str(i)] = tsvd_test[n][:, i]

In [44]:
X_train_100=X_train.iloc[:,:100]
X_train_200=X_train.iloc[:,100:300]
X_train_300=X_train.iloc[:,300:600]
X_test_100=X_test.iloc[:,:100]
X_test_200=X_test.iloc[:,100:300]
X_test_300=X_test.iloc[:,300:600]
dic_train={}
dic_test={}
dic_train["tsvd_100"]=X_train_100
dic_test["tsvd_100"]=X_test_100
dic_train["tsvd_200"]=X_train_200
dic_test["tsvd_200"]=X_test_200
dic_train["tsvd_300"]=X_train_300
dic_test["tsvd_300"]=X_test_300

In [45]:
for name in dic_train:
    dic_train[name].to_csv("checkpoints_databases/new_working_train_tfidf_"+name+".csv",index=False)
    dic_test[name].to_csv("checkpoints_databases/new_working_test_tfidf_"+name+".csv",index=False)

# Now w2v bio

In [46]:
from gensim.models import KeyedVectors

In [47]:
w2v_bio = KeyedVectors.load_word2vec_format("../bases/PMC-w2v.bin",binary=True)

In [48]:
me_vec={}
me_train={}
me_test={}
me_vec = MeanEmbeddingVectorizer(w2v_bio)
me_train = me_vec.fit_transform(train['Text'])
me_test = me_vec.fit_transform(test['Text'])
df_bio_tr={}
df_bio_te={}
df_bio_tr=pd.DataFrame(me_train)
df_bio_te=pd.DataFrame(me_test)

df_tr=df_bio_tr.reset_index()
df_te=df_bio_te.reset_index()
df_tr=df_tr.rename(columns={"index":"ID"})
df_te=df_te.rename(columns={"index":"ID"})

train_w2v_bio=df_tr
test_w2v_bio=df_te

In [49]:
train_w2v_bio.drop("ID",axis=1).to_csv("checkpoints_databases/new_working_train_bio.csv",index=False)
test_w2v_bio.drop("ID",axis=1).to_csv("checkpoints_databases/new_working_test_bio.csv",index=False)