In [7]:
import numpy as np
import pandas as pd
import pickle
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from gensim import utils
import os
import nltk
import scipy.sparse as ssp
from sklearn.decomposition import TruncatedSVD

In [8]:
# Read the working train and test tables (UTF-8 encoded CSV checkpoints).
train, test = (
    pd.read_csv(csv_path, encoding="utf8")
    for csv_path in (
        "checkpoints_databases/nw_working_train.csv",
        "checkpoints_databases/nw_working_test.csv",
    )
)

In [9]:
# Meta-feature frames: everything in the working tables except the
# "Variation", "Text" and "Class" columns. Both frames are also written to CSV.
train_cl = train.drop(labels=["Variation", "Text", "Class"], axis=1)
test_cl = test.drop(labels=["Text", "Class", "Variation"], axis=1)

train_cl.to_csv(path_or_buf="nw_meta_features/meta_train_l1l2.csv")
test_cl.to_csv(path_or_buf="nw_meta_features/meta_test_l1l2.csv")

# Next, add word2vec features: train (or load) word2vec models on the training text, then represent each document by the mean of its word vectors

In [33]:
class MySentences(object):
    """Re-iterable stream of tokenized sentences.

    Each iteration walks every document of every collection passed to the
    constructor, splits the document into sentences with
    ``nltk.sent_tokenize`` and yields each sentence as the list of tokens
    produced by ``nltk.word_tokenize``.

    Args:
        *arrays: One or more iterables of documents (strings).
    """

    def __init__(self, *arrays):
        # Stored as the tuple of collections exactly as received.
        self.arrays = arrays

    def __iter__(self):
        for collection in self.arrays:
            for text in collection:
                sentence_list = nltk.sent_tokenize(text)
                for sentence in sentence_list:
                    yield nltk.word_tokenize(sentence)

def get_word2vec(sentences, location,size):
    """Load a word2vec model from disk, or train and save one if absent.

    Args:
        sentences: Iterable of tokenized sentences used for training.
        location (str): Path the model is loaded from / saved to.
        size: Passed to ``gensim.models.Word2Vec`` as ``size`` when training.
            It is not consulted when a model already exists at ``location``.

    Returns:
        The loaded or newly trained ``gensim.models.Word2Vec`` model.
    """
    if not os.path.exists(location):
        print('{} not found. training model'.format(location))
        trained = gensim.models.Word2Vec(
            sentences,
            size=size,
            window=5,
            min_count=5,
            workers=4,
        )
        print('Model done training. Saving to disk')
        trained.save(location)
        return trained

    # A model is already stored at this path: reuse it instead of retraining.
    print('Found {}'.format(location))
    return gensim.models.Word2Vec.load(location)

In [11]:
# Collapse every run of whitespace into a single space (and strip the ends).
# The author notes that removing duplicated spaces matters for word2vec learning.
for frame in (train, test):
    frame["Text"] = [" ".join(raw_text.split()) for raw_text in frame["Text"].values]

In [34]:
# One word2vec model per embedding size, keyed "w2v_<size>". Each model is
# loaded from "w2v_features<size>" when that file exists, otherwise trained
# on the training text and saved there.
number_w2v = [100, 200, 300]
w2v = {}
for size in number_w2v:
    model_key = "w2v_" + str(size)
    cache_path = "w2v_features" + str(size)
    w2v[model_key] = get_word2vec(MySentences(train["Text"].values), cache_path, size)

Found w2v_features100
Found w2v_features200
Found w2v_features300


In [13]:
class MyTokenizer:
    """Stateless transformer that turns raw documents into token arrays.

    Each document is split into sentences with ``nltk.sent_tokenize``; the
    sentences are tokenized with ``nltk.word_tokenize`` and the tokens of all
    sentences are concatenated into one array per document.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """No-op: there is nothing to learn. Returns ``self``."""
        return self

    def transform(self, X):
        """Tokenize every document in ``X``.

        Args:
            X: Iterable of documents (strings).

        Returns:
            A 1-D ``numpy`` object array with one entry per document; each
            entry is a ``numpy`` array holding that document's tokens.
        """
        tokenized_docs = []
        for document in X:
            tokens = []
            for sent in nltk.sent_tokenize(document):
                tokens.extend(nltk.word_tokenize(sent))
            tokenized_docs.append(np.array(tokens))

        # Documents have different token counts, so the container must be an
        # explicit object array: np.array(list_of_ragged_arrays) raises on
        # recent NumPy and would silently become a 2-D string array whenever
        # all documents happen to share the same length.
        result = np.empty(len(tokenized_docs), dtype=object)
        for position, tokens in enumerate(tokenized_docs):
            result[position] = tokens
        return result

    def fit_transform(self, X, y=None):
        """Same as :meth:`transform`; ``y`` is ignored."""
        return self.transform(X)

class MeanEmbeddingVectorizer(object):
    """Represent each document by the mean of its word2vec word vectors.

    Args:
        word2vec: Trained model exposing its vectors through ``.wv``
            (``wv[word]``, ``word in wv`` and the ``wv.syn0`` matrix).
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Embedding width, read off the first row of the weight matrix. It
        # sizes the all-zero vector used for documents without known words.
        first_row = word2vec.wv.syn0[0]
        self.dim = len(first_row)

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Tokenize ``X`` with ``MyTokenizer`` and average known word vectors.

        Returns:
            ``numpy`` array with one row per document. A document with no
            token present in the model gets a row of zeros.
        """
        vectors = self.word2vec.wv
        doc_means = []
        for tokens in MyTokenizer().fit_transform(X):
            known = [vectors[token] for token in tokens if token in vectors]
            if not known:
                known = [np.zeros(self.dim)]
            doc_means.append(np.mean(known, axis=0))
        return np.array(doc_means)

    def fit_transform(self, X, y=None):
        """Same as :meth:`transform`; ``y`` is ignored."""
        return self.transform(X)

In [37]:
# Build mean-word2vec document features for every trained embedding size,
# join them to the meta-features on "ID", and checkpoint the result as sparse
# matrices (one train file and one test file per model).
#
# Every intermediate is kept in a dict keyed by the model name so it can be
# inspected afterwards.
mean_embedding_vectorizer = {}
mean_embedded_train = {}
mean_embedded_test = {}
df_embed_tr = {}
df_embed_te = {}
df_embedding_tr = {}
df_embedding_te = {}
train_w2v = {}
test_w2v = {}
np_w2v_train = {}
np_w2v_test = {}
ssp_w2v_train = {}
ssp_w2v_test = {}

for name in w2v:
    # The vectorizer's fit is a no-op, so the test split is only transformed.
    mean_embedding_vectorizer[name] = MeanEmbeddingVectorizer(w2v[name])
    mean_embedded_train[name] = mean_embedding_vectorizer[name].fit_transform(train['Text'])
    mean_embedded_test[name] = mean_embedding_vectorizer[name].transform(test['Text'])

    df_embed_tr[name] = pd.DataFrame(mean_embedded_train[name])
    df_embed_te[name] = pd.DataFrame(mean_embedded_test[name])

    # Expose the positional row number as an "ID" column to merge on.
    # NOTE(review): this assumes the "ID" column of train_cl/test_cl equals the
    # 0-based row position of each document; confirm against the CSVs.
    df_embedding_tr[name] = df_embed_tr[name].reset_index().rename(columns={"index": "ID"})
    df_embedding_te[name] = df_embed_te[name].reset_index().rename(columns={"index": "ID"})

    train_w2v[name] = pd.merge(train_cl, df_embedding_tr[name], on="ID")
    test_w2v[name] = pd.merge(test_cl, df_embedding_te[name], on="ID")

    # "ID" is only a join key, not a feature.
    np_w2v_train[name] = np.array(train_w2v[name].drop("ID", axis=1))
    np_w2v_test[name] = np.array(test_w2v[name].drop("ID", axis=1))
    ssp_w2v_train[name] = ssp.csc_matrix(np_w2v_train[name])
    ssp_w2v_test[name] = ssp.csc_matrix(np_w2v_test[name])

    ssp.save_npz("checkpoints_databases/nw_working_train_w2v_"+name+".npz", ssp_w2v_train[name])
    ssp.save_npz("checkpoints_databases/nw_working_test_w2v_"+name+".npz", ssp_w2v_test[name])

# Now TF-IDF + truncated SVD (100, 200 and 300 components)

In [3]:
# Word-level TF-IDF over 1- to 3-grams. The vectorizer is fitted on the
# training text only and then applied to both splits.
tfidf = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w+',
    ngram_range=(1,3),
    lowercase=False,
    strip_accents=None,
    min_df=10,
    max_features=10000,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
tfidf.fit(train["Text"])

X_train_text = tfidf.transform(train["Text"])
X_test_text = tfidf.transform(test["Text"])

In [6]:
# Truncated SVD of the TF-IDF matrix at several ranks. Each SVD is fitted on
# the training matrix and then applied to the test matrix.
tsvd_train = {}
tsvd_test = {}
list_comp = [100, 200, 300]
dic_svd = {}
for comp in list_comp:
    key = str(comp)
    dic_svd[key] = TruncatedSVD(n_components=comp, n_iter=25, random_state=26)
    tsvd_train[key] = dic_svd[key].fit_transform(X_train_text)
    tsvd_test[key] = dic_svd[key].transform(X_test_text)


def _tsvd_frame(projections):
    """Concatenate the per-rank projections into one DataFrame.

    Columns are named ``tsvd_<rank>_<component>`` and the rank groups appear
    in the order of ``list_comp``. The order is taken from the list rather
    than from dict iteration so that it is deterministic on every Python
    version. The frame is built with a single ``concat`` instead of inserting
    600 columns one at a time.
    """
    frames = []
    for comp in list_comp:
        key = str(comp)
        columns = ['tsvd_' + key + "_" + str(i) for i in range(comp)]
        frames.append(pd.DataFrame(projections[key], columns=columns))
    return pd.concat(frames, axis=1)


X_train = _tsvd_frame(tsvd_train)
X_test = _tsvd_frame(tsvd_test)

In [None]:
def _tsvd_block(frame, n_components):
    """Return the columns of ``frame`` named ``tsvd_<n_components>_*``.

    Selecting by name rather than by position keeps the split correct
    whatever order the SVD column groups were added in.
    """
    prefix = "tsvd_" + str(n_components) + "_"
    return frame[[col for col in frame.columns if col.startswith(prefix)]]


def _stack_with_meta(meta, features):
    """Horizontally stack the meta-features (minus "ID") with ``features``.

    Both parts are converted to sparse matrices before stacking, so that
    ``scipy.sparse.hstack`` receives sparse blocks and not bare DataFrames.

    Returns:
        A CSC sparse matrix: meta-feature columns first, then ``features``.
    """
    meta_part = ssp.csc_matrix(np.asarray(meta.drop("ID", axis=1), dtype=float))
    feature_part = ssp.csc_matrix(np.asarray(features, dtype=float))
    return ssp.hstack((meta_part, feature_part), format="csc")


X_train_100 = _tsvd_block(X_train, 100)
X_train_200 = _tsvd_block(X_train, 200)
X_train_300 = _tsvd_block(X_train, 300)
X_test_100 = _tsvd_block(X_test, 100)
X_test_200 = _tsvd_block(X_test, 200)
X_test_300 = _tsvd_block(X_test, 300)

dic_train = {}
dic_test = {}
for name, train_part, test_part in (
    ("tsvd_100", X_train_100, X_test_100),
    ("tsvd_200", X_train_200, X_test_200),
    ("tsvd_300", X_train_300, X_test_300),
):
    dic_train[name] = _stack_with_meta(train_cl, train_part)
    dic_test[name] = _stack_with_meta(test_cl, test_part)

# One train file and one test file per SVD rank.
for name in dic_train:
    ssp.save_npz("checkpoints_databases/nw_working_train_tfidf_"+name+".npz", dic_train[name])
    ssp.save_npz("checkpoints_databases/nw_working_test_tfidf_"+name+".npz", dic_test[name])

# Now Doc2Vec

In [None]:
def constructLabeledSentences(data):
    """Wrap every document of ``data`` in a ``LabeledSentence``.

    Args:
        data: pandas Series of documents. Each value is converted with
            ``utils.to_unicode`` and split on whitespace.

    Returns:
        list: One ``LabeledSentence`` per entry, tagged ``'Text_<index>'``
        where ``<index>`` is the entry's index label in ``data``.
    """
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    return [
        LabeledSentence(utils.to_unicode(text).split(), ['Text' + '_%s' % str(index)])
        for index, text in data.items()
    ]

train_sentences = constructLabeledSentences(train['Text'])
test_sentences = constructLabeledSentences(test['Text'])

Text_dim = [100, 200, 300]


def _train_doc2vec(sentences, size):
    """Build the vocabulary on ``sentences`` and train one Doc2Vec model.

    Args:
        sentences: List of tagged documents (see ``constructLabeledSentences``).
        size: Dimensionality of the document vectors.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    # workers must be a positive thread count: gensim does not treat -1 as
    # "use all cores". 4 matches the word2vec training in this notebook.
    # Multi-threaded training is not bit-for-bit reproducible even with a
    # seed; use workers=1 if exact reproducibility is required.
    model = Doc2Vec(min_count=1, window=10, size=size, sample=1e-4, negative=5,
                    workers=4, iter=5, seed=26)
    model.build_vocab(sentences)
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)
    return model


# One model per vector size and per split, keyed "d2v_<size>".
# NOTE(review): the test documents get their own, independently trained model,
# so train and test vectors do not share an embedding space. Consider training
# once and using infer_vector (or a joint corpus) for the test split.
d2v_train = {}
d2v_test = {}
for size in Text_dim:
    d2v_train["d2v_"+str(size)] = _train_doc2vec(train_sentences, size)
    d2v_test["d2v_"+str(size)] = _train_doc2vec(test_sentences, size)

In [None]:
def _collect_docvecs(model, index, size):
    """Copy the document vectors of ``model`` into a dense array.

    Args:
        model: Trained Doc2Vec model whose documents are tagged
            ``'Text_<index label>'`` (the scheme used by
            ``constructLabeledSentences`` in this notebook).
        index: Index labels of the documents, in row order.
        size: Width of the document vectors.

    Returns:
        ``numpy`` array of shape ``(len(index), size)``; row ``r`` holds the
        vector of the document labelled ``index[r]``.
    """
    vectors = np.zeros((len(index), size))
    for row, label in enumerate(index):
        vectors[row] = model.docvecs['Text_' + str(label)]
    return vectors


# Each split is filled over its own full length. Iterating both splits with a
# single zip() would stop at the shorter one and leave the rest as zeros.
d2v_train_arrays = {}
d2v_test_arrays = {}
for size in (100, 200, 300):
    key = "d2v_" + str(size)
    d2v_train_arrays[key] = _collect_docvecs(d2v_train[key], train.index, size)
    d2v_test_arrays[key] = _collect_docvecs(d2v_test[key], test.index, size)
    


In [None]:
# d2v_train_arrays / d2v_test_arrays are dicts keyed "d2v_<size>", so each size
# is stacked with the meta-features and saved to its own file, as is done for
# the word2vec and TF-IDF/SVD features. The results are stored in dedicated
# dicts so X_train / X_test are not overwritten.
meta_train_sparse = ssp.csc_matrix(np.asarray(train_cl.drop("ID", axis=1), dtype=float))
meta_test_sparse = ssp.csc_matrix(np.asarray(test_cl.drop("ID", axis=1), dtype=float))

d2v_train_sparse = {}
d2v_test_sparse = {}
for name in d2v_train_arrays:
    # Blocks are converted to sparse before stacking so that hstack receives
    # sparse matrices only.
    d2v_train_sparse[name] = ssp.hstack(
        (meta_train_sparse, ssp.csc_matrix(d2v_train_arrays[name])), format="csc")
    d2v_test_sparse[name] = ssp.hstack(
        (meta_test_sparse, ssp.csc_matrix(d2v_test_arrays[name])), format="csc")

    print(name, d2v_train_sparse[name].shape, d2v_test_sparse[name].shape)

    ssp.save_npz("checkpoints_databases/nw_working_train_" + name + ".npz", d2v_train_sparse[name])
    ssp.save_npz("checkpoints_databases/nw_working_test_" + name + ".npz", d2v_test_sparse[name])
