
# Advanced ML: Working with an embedding vectorizer


#### 1. install embedding vectorizer:
pip install embeddingvectorizer

https://github.com/ccs-amsterdam/embeddingvectorizer

#### 2. download the dataset:
https://surfdrive.surf.nl/files/index.php/s/HKR33cTie8NT6Zh
    
#### 3. Download a pre-trained embedding model:
https://surfdrive.surf.nl/files/index.php/s/5DVO9b2XdNTxfZQ


In [91]:
import embeddingvectorizer
import gensim
from sklearn.linear_model import LogisticRegressionCV
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

In [None]:
df = pd.read_pickle('dataset_vermeer.pkl')
X_train, X_test, y_train, y_test = train_test_split (df['text'], df['topic'], test_size = 0.2 , random_state=42)


In [71]:
#embedding_mdl
model = gensim.models.Word2Vec.load('../tmpanne/fullsample/w2v_model_nr_7_window_10_size_300_negsample_15')


In [66]:
# get embedding model in right format (vector array for each dictionary word)
embedding_mdl = dict(zip(model.wv.index2word, model.wv.syn0))

# count vectorizer
embedding_vect_count = embeddingvectorizer.EmbeddingCountVectorizer(embedding_mdl, 'mean')

# tfidf
embedding_vect_tfidf = embeddingvectorizer.EmbeddingTfidfVectorizer(embedding_mdl, 'mean')


  """Entry point for launching an IPython kernel.


In [73]:
# fit and predict
clf = LogisticRegressionCV()
pipe = make_pipeline(embedding_vect_count, clf)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)


# Example with Keras

In [2]:
import tensorflow as tf


In [24]:
import numpy as np
from tqdm import tqdm
import pandas as pd
import pickle
import numpy as np

from gensim.models.keyedvectors import KeyedVectors
from gensim.models import Word2Vec

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Embedding, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder


In [None]:
model = Word2Vec.load("AEM_small_300")
model.wv.save_word2vec_format('AEM300.txt', binary=False)
AEM = 'AEM300.txt'


In [33]:
def encodeY(Y):
    '''create one-hot (dummies) for output, encode class values as integers
    '''
    encoder = LabelEncoder()
    encoder.fit(Y)
    encoded_Y = encoder.transform(Y)
    dummy_y = tf.keras.utils.to_categorical(encoded_Y)
    return dummy_y

In [90]:
X_train, X_test, y_train, y_test = train_test_split([t.translate(str.maketrans('', '', string.punctuation)) for t in df['text']], encodeY(df['topic'].map(int)), test_size = 0.2)

In [1]:
# create the tokenizer
tokenizer = Tokenizer()
# fit the tokenizer on the documents
tokenizer.fit_on_texts(X_train)


NameError: name 'Tokenizer' is not defined

In [36]:
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(X_train)
# pad sequences
max_length = max([len(s.split()) for s in X_train])
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
# define training labels
#ytrain = array([0 for _ in range(900)] + [1 for _ in range(900)])

In [37]:
encoded_docs = tokenizer.texts_to_sequences(X_test)
# pad sequences
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [21]:
#Xtrain[0]

In [40]:
embeddings_index = {}
with open(AEM) as f:
    numberofwordvectors, dimensions = [int(e) for e in next(f).split()]
    for line in tqdm(f):
        values = line.split()
        embeddings_index[values[0]] = np.asarray(values[1:], dtype='float32')
       # word = values[0]
       # coefs = np.asarray(values[1:], dtype='float32')
      #  embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))
print('Should be {} vectors with {} dimensions'.format(numberofwordvectors, dimensions))

1055100it [02:00, 8720.43it/s]

Found 1055100 word vectors.
Should be 1055100 vectors with 300 dimensions





In [82]:
# create a weight matrix for the Embedding layer from a loaded embedding
def get_weight_matrix(embedding, vocab):
    words_not_found = 0 
    total_words = 0
    debug = []
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = np.zeros((vocab_size, 300))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in tqdm(vocab.items()):
        e = embedding.get(word, None)
        if e is not None:   # if we do not find the word, we do not want to replace anything but leave the zero's
            weight_matrix[i] = e
            total_words+=1
        else:
            words_not_found+=1
            debug.append(word)
    print('Weight matrix created. For {} out of {} words, we did not have any embedding.'.format(words_not_found, total_words))
    return debug, weight_matrix

In [83]:
missingwords, embedding_vectors = get_weight_matrix(embeddings_index, tokenizer.word_index)

100%|██████████| 98032/98032 [00:01<00:00, 83007.29it/s]

Weight matrix created. For 24853 out of 73179 words, we did not have any embedding.





In [84]:
embedding_vectors

array([[ 0.     ,  0.     , ...,  0.     ,  0.     ],
       [ 4.85867, -2.41003, ...,  1.81039,  2.77538],
       ...,
       [ 0.     ,  0.     , ...,  0.     ,  0.     ],
       [ 0.     ,  0.     , ...,  0.     ,  0.     ]])

In [85]:
len(embedding_vectors), len(Xtrain)

(98033, 2788)

In [86]:
embedding_layer = Embedding(len(tokenizer.word_index)+1, 300, weights=[embedding_vectors], input_length=max_length, trainable=False)

W0302 13:22:27.362712 140085305067264 deprecation.py:506] From /usr/local/lib/python3.5/dist-packages/tensorflow/python/keras/initializers.py:119: calling RandomUniform.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [87]:
# define model
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(4, activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

W0302 13:22:31.918565 140085305067264 deprecation.py:506] From /usr/local/lib/python3.5/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 51138, 300)        29409900  
_________________________________________________________________
conv1d (Conv1D)              (None, 51134, 128)        192128    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 25567, 128)        0         
_________________________________________________________________
flatten (Flatten)            (None, 3272576)           0         
_________________________________________________________________
dense (Dense)                (None, 4)                 13090308  
Total params: 42,692,336
Trainable params: 13,282,436
Non-trainable params: 29,409,900
_________________________________________________________________
None


In [88]:
# alternatief model

numberoflabels = 4
model = Sequential()
model.add(embedding_layer)
model.add(Conv1D(128, 4, activation='relu'))
model.add(MaxPooling1D(4))
model.add(MaxPooling1D(4))
model.add(Flatten())
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=numberoflabels, activation='softmax'))   # voor twee categorien sigmoid, voor 1 tanh
 
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 51138, 300)        29409900  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 51135, 128)        153728    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 12783, 128)        0         
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 3195, 128)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 408960)            0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                26173504  
_________________________________________________________________
dense_2 (Dense)              (None, 4)                

In [None]:
VALIDATION_SIZE=200

model.fit(Xtrain[:-VALIDATION_SIZE], y_train[:-VALIDATION_SIZE], 
          epochs=3, verbose=True,
          validation_data=(Xtrain[-VALIDATION_SIZE:], y_train[-VALIDATION_SIZE:]))

Train on 2588 samples, validate on 200 samples
Epoch 1/3
 224/2588 [=>............................] - ETA: 12:29 - loss: 27.8686 - acc: 0.4732

In [23]:
loss, acc = model.evaluate(Xtest, y_test, verbose=True)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 77.793694


In [None]:
# compile network
# fit network
model.fit(Xtrain, y_train, epochs=3, verbose=True)
# evaluate
loss, acc = model.evaluate(Xtest, y_test, verbose=0)
print('Test Accuracy: %f' % (acc*100))


In [None]:
# embeddings_index['man'] - embeddings_index['vrouw']