In [35]:
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
import os
import nltk
import scipy.sparse as ssp

In [53]:
# Load the working train / test tables from UTF-8 encoded CSV files.
train = pd.read_csv("bases/working_train.csv", encoding="utf8")
test = pd.read_csv("bases/working_test.csv", encoding="utf8")

# Here we add features from a retrained word2vec model, then take the mean of the word vectors of each document

In [13]:
class MySentences(object):
    """Re-iterable stream of tokenized sentences drawn from document collections.

    Each pass over the object walks every collection again from the start:
    every document is split into sentences with ``nltk.sent_tokenize`` and
    every sentence is yielded as the token list from ``nltk.word_tokenize``.

    Args:
        arrays: One or more iterables whose elements are documents.
    """
    def __init__(self, *arrays):
        self.arrays = arrays

    def __iter__(self):
        for collection in self.arrays:
            for text in collection:
                # One token list per sentence, in document order.
                yield from (
                    nltk.word_tokenize(sentence)
                    for sentence in nltk.sent_tokenize(text)
                )

def get_word2vec(sentences, location, size=100, window=5, min_count=5, workers=4):
    """Load a cached word2vec model from ``location`` or train and save a new one.

    When ``location`` already exists the stored model is returned as-is:
    ``sentences`` and the hyper-parameters below are ignored in that case.

    Args:
        sentences: Iterable of tokenized sentences (lists of str). It is handed
            straight to ``gensim.models.Word2Vec``.

        location (str): Path to save/load word2vec

        size (int): Dimensionality of the word vectors.

        window (int): ``window`` argument forwarded to ``Word2Vec``.

        min_count (int): ``min_count`` argument forwarded to ``Word2Vec``.

        workers (int): ``workers`` argument forwarded to ``Word2Vec``.

    Returns:
        The loaded or freshly trained ``gensim.models.Word2Vec`` model.
    """
    if os.path.exists(location):
        print('Found {}'.format(location))
        # NOTE: Word2Vec.load() unpickles the file; only load models you trust.
        return gensim.models.Word2Vec.load(location)

    print('{} not found. training model'.format(location))
    # gensim 4 renamed the ``size`` keyword to ``vector_size``; pass whichever
    # name the installed version understands.
    major_version = int(gensim.__version__.split('.')[0])
    dim_keyword = 'vector_size' if major_version >= 4 else 'size'
    model = gensim.models.Word2Vec(
        sentences,
        window=window,
        min_count=min_count,
        workers=workers,
        **{dim_keyword: size}
    )
    print('Model done training. Saving to disk')
    model.save(location)
    return model

In [14]:
# It's important to remove duplicated spaces for word2vec learning!
# Every run of whitespace is collapsed to a single space. Missing texts (NaN)
# are turned into empty strings first, because NaN has no .split() method.
train["Text"] = train["Text"].fillna("").astype(str).str.split().str.join(" ")
test["Text"] = test["Text"].fillna("").astype(str).str.split().str.join(" ")

In [15]:
# Sentences come from both the train and the test texts; the model is read
# from "localisation" when that path exists and trained otherwise.
corpus_sentences = MySentences(train["Text"].values, test["Text"].values)
w2vec = get_word2vec(corpus_sentences, "localisation")

Found localisation


In [16]:
class MyTokenizer:
    """Stateless scikit-learn style transformer turning documents into word tokens.

    Each document is split into sentences with ``nltk.sent_tokenize``; the
    tokens that ``nltk.word_tokenize`` returns for every sentence are
    concatenated into one flat token array per document.
    """
    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` for pipeline compatibility."""
        return self

    def transform(self, X):
        """Tokenize every document of ``X``.

        Args:
            X: Iterable of documents (str).

        Returns:
            A 1-D ``numpy`` object array with one entry per document; each
            entry is a ``numpy`` array holding that document's tokens.
        """
        token_arrays = [
            np.array([
                token
                for sent in nltk.sent_tokenize(document)
                for token in nltk.word_tokenize(sent)
            ])
            for document in X
        ]
        # Documents have different token counts, so np.array(token_arrays)
        # would be ragged (an error on recent NumPy) or, when all lengths
        # happen to match, a 2-D array. Filling a pre-allocated object array
        # always gives one entry per document.
        result = np.empty(len(token_arrays), dtype=object)
        for position, tokens in enumerate(token_arrays):
            result[position] = tokens
        return result

    def fit_transform(self, X, y=None):
        """Same as ``transform``; ``y`` is ignored."""
        return self.transform(X)

class MeanEmbeddingVectorizer(object):
    """Represent each document by the mean of its word2vec word vectors.

    Documents are tokenized with ``MyTokenizer``. Tokens missing from the
    model's vocabulary are skipped; a document with no known token maps to a
    zero vector of the embedding dimensionality.

    Args:
        word2vec: Trained model exposing its keyed vectors as ``word2vec.wv``.
    """
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # if a text is empty we should return a vector of zeros
        # with the same dimensionality as all the other vectors
        self.dim = self._embedding_dim(word2vec.wv)

    @staticmethod
    def _embedding_dim(keyed_vectors):
        """Return the vector dimensionality across gensim versions."""
        dim = getattr(keyed_vectors, "vector_size", None)
        if dim:
            return int(dim)
        # Fallbacks when ``vector_size`` is absent or unset: newer releases
        # expose the matrix as ``vectors``, older ones as ``syn0``.
        vectors = getattr(keyed_vectors, "vectors", None)
        if vectors is None:
            vectors = keyed_vectors.syn0
        return len(vectors[0])

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` for pipeline compatibility."""
        return self

    def transform(self, X):
        """Embed every document of ``X``.

        Args:
            X: Iterable of documents (str).

        Returns:
            ``numpy`` array of shape ``(number of documents, self.dim)``.
        """
        keyed_vectors = self.word2vec.wv
        empty_doc_vector = np.zeros(self.dim)
        embedded = []
        for words in MyTokenizer().fit_transform(X):
            known = [keyed_vectors[w] for w in words if w in keyed_vectors]
            embedded.append(np.mean(known, axis=0) if known else empty_doc_vector)
        # The reshape only matters for empty input, which would otherwise
        # come back with shape (0,) instead of (0, dim).
        return np.array(embedded).reshape(len(embedded), self.dim)

    def fit_transform(self, X, y=None):
        """Same as ``transform``; ``y`` is ignored."""
        return self.transform(X)

In [17]:
# Embed every train / test document as the mean of its word2vec vectors.
mean_embedding_vectorizer = MeanEmbeddingVectorizer(w2vec)
# fit_transform() of this vectorizer simply delegates to transform().
mean_embedded_train = mean_embedding_vectorizer.transform(train['Text'])
mean_embedded_test = mean_embedding_vectorizer.transform(test['Text'])

In [18]:
# Wrap the embedding matrices in DataFrames (one row per document).
df_embed_tr = pd.DataFrame(data=mean_embedded_train)
df_embed_te = pd.DataFrame(data=mean_embedded_test)

In [19]:
# Attach the document IDs as the first column so the embeddings can be merged
# back on "ID". Row i of each embedding frame was computed from row i of
# train / test, so the IDs are taken from those frames instead of assuming
# that every ID equals its row position.
df_embedding_tr = df_embed_tr.copy()
df_embedding_tr.insert(0, "ID", train["ID"].values)
df_embedding_te = df_embed_te.copy()
df_embedding_te.insert(0, "ID", test["ID"].values)

In [54]:
# Copies of the tables without the Variation, Text, Gene and Class columns.
raw_columns = ["Variation", "Text", "Gene", "Class"]
train_cl = train.drop(raw_columns, axis=1)
test_cl = test.drop(raw_columns, axis=1)

In [39]:
# Join the embedding columns onto the original tables through the shared "ID" key.
train_w2v = train.merge(df_embedding_tr, on="ID")
test_w2v = test.merge(df_embedding_te, on="ID")

In [56]:
# Persist the tables enriched with the word2vec features.
for frame, path in (
    (train_w2v, "bases/working_train_w2v.csv"),
    (test_w2v, "bases/working_test_w2v.csv"),
):
    frame.to_csv(path, index=False, encoding="utf8")

# Now TF-IDF

In [None]:

# Word-level TF-IDF over 1- to 3-grams; the vocabulary is learned on the
# training texts only and then applied to both splits.
tfidf = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w+',
    ngram_range=(1, 3),
    lowercase=False,
    strip_accents=None,
    min_df=10,
    max_features=10000,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
tfidf.fit(train["Text"])

X_train_text = tfidf.transform(train["Text"])
X_test_text = tfidf.transform(test["Text"])

In [None]:
# Stack the tabular columns next to the TF-IDF matrix.
# train / test still contain the raw Text column and the Class label, so
# stacking them directly would fail on the string data and would put the
# target among the features. train_cl / test_cl have those columns dropped.
# NOTE(review): train_cl still carries the ID column - confirm it is wanted
# as a model feature.
X_train = ssp.hstack(
    (ssp.csr_matrix(train_cl.values.astype(float)), X_train_text), format="csc"
)
X_test = ssp.hstack(
    (ssp.csr_matrix(test_cl.values.astype(float)), X_test_text), format="csc"
)
print(X_train.shape, X_test.shape)

# Then fastText