In [1]:
import pandas as pd
import joblib

from nltk import word_tokenize
import gensim
from keras.preprocessing.text import one_hot
from keras.utils import pad_sequences
from keras.layers import Embedding
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten

**Loading the tokens**

In [2]:
tweets_train_tokenized = pd.read_csv('csvs/tweets_train_tokens.csv', index_col=False)
tweets_train_tokenized_message = pd.Series(tweets_train_tokenized.message)
# Converting Panda series into Unicode datatype as required by vectorizers
tweets = tweets_train_tokenized_message.astype('U').values
tweets

array(['arirang simply kpop kim hyung jun cross ha yeong playback',
       'read politico article donald trump running mate tom brady list likely choice',
       'type bazura project google image image photo dad glenn moustache whatthe',
       ..., 'bring dunkin iced coffee tomorrow hero',
       'currently holiday portugal come home tomorrow poland tuesday holocaust memorial trip',
       'ladykiller saturday aternoon'], dtype=object)

#### **C. Word Embeddings**

Word embeddings are a type of word representation that allows words with similar meaning to have a similar representation.

It is an improvement over more the traditional bag-of-word model encoding schemes where large sparse vectors were used to represent each word or to score each word within a vector to represent an entire vocabulary. These representations were sparse because the vocabularies were vast and a given word or document would be represented by a large vector comprised mostly of zero values.

Key to the approach is the idea of using a **dense distributed representation** for each word.

In [3]:
# import image module
from IPython.display import Image

# get the image
Image(url="pictures/word-embedding-1.png", width=700, height=450)

In [4]:
# import image module
from IPython.display import Image

# get the image
Image(url="pictures/word-embedding-2.png", width=700, height=600)

**c.1. Creating Word embeddings using Keras**

In [5]:
# We tokenize each sentence to get every word
from nltk import word_tokenize
words=[word_tokenize(sentence) for sentence in tweets]
words

[['arirang',
  'simply',
  'kpop',
  'kim',
  'hyung',
  'jun',
  'cross',
  'ha',
  'yeong',
  'playback'],
 ['read',
  'politico',
  'article',
  'donald',
  'trump',
  'running',
  'mate',
  'tom',
  'brady',
  'list',
  'likely',
  'choice'],
 ['type',
  'bazura',
  'project',
  'google',
  'image',
  'image',
  'photo',
  'dad',
  'glenn',
  'moustache',
  'whatthe'],
 ['fast',
  'lerner',
  'subpoena',
  'tech',
  'guy',
  'work',
  'hillary',
  'private',
  'server',
  'plead',
  'sound',
  'familiar'],
 ['sony',
  'reward',
  'app',
  'like',
  'lot',
  'female',
  'singer',
  'non',
  'retro',
  'sale',
  'no',
  'info'],
 ['watch',
  'brooklyn',
  'nets',
  'new',
  'york',
  'knick',
  'tonight',
  'postpone',
  'knick',
  'butt',
  'fuck',
  'miami',
  'tomorrow'],
 ['guy', 'open', 'gate', 'naruto', 'save', 'ass', 'goat'],
 ['triple',
  'h',
  'never',
  'ric',
  'flair',
  'bitch',
  'sunday',
  'no',
  'pressure',
  'rollin',
  'look',
  'look',
  'hhh',
  'raw'],
 ['join

In [6]:
# First, we need to determing the vocabulary size of our corpus
# We can estimate it to be much larger than actual to reduce the probability of collisions from the hash function.
# We need to have a clue of how many are they using Word2Vec
w2v_model = gensim.models.Word2Vec(words,window=5,vector_size=100, min_count=1)

# model.wv.index_to_key provides us a list of vocabularies
wv_words = w2v_model.wv.index_to_key
len(wv_words)

35776

In [7]:
# We can estimate it to be around 36000
vocab_size = 36000

In [8]:
# Second, we need to create a onehot-representation for each vocabulary word
# The vocab_size is used to assign a specific index to each word, so the same word will have the same index
onehot_repr = [one_hot(word, vocab_size) for word in tweets]
onehot_repr

[[34022, 13415, 25650, 2777, 17095, 15230, 30631, 17220, 6963, 3518],
 [19291,
  13706,
  10502,
  15899,
  15657,
  30308,
  6074,
  23259,
  14253,
  21087,
  33330,
  24450],
 [13788, 32266, 35165, 27934, 11083, 11083, 22017, 15412, 867, 1354, 32762],
 [32660,
  16450,
  17246,
  35347,
  32929,
  35082,
  12602,
  35839,
  16918,
  35744,
  27949,
  35400],
 [21251,
  23999,
  6367,
  10664,
  31771,
  1248,
  28074,
  32929,
  27969,
  5814,
  5764,
  2660],
 [15613,
  23050,
  9429,
  3346,
  6469,
  12524,
  8077,
  9905,
  12524,
  24759,
  17878,
  11892,
  6669],
 [32929, 9548, 14516, 5882, 21760, 9495, 2754],
 [18358,
  14662,
  12997,
  12567,
  33773,
  15787,
  19480,
  5764,
  2556,
  10671,
  12486,
  12486,
  6093,
  13410],
 [34286,
  23282,
  17392,
  15822,
  7291,
  16685,
  9548,
  6709,
  6669,
  5263,
  31326,
  26173,
  21798,
  25530],
 [867, 33220, 4304, 11154, 9464, 20242, 19480],
 [3174, 7516, 5385, 6669, 10157, 14014, 7516, 14282, 10265],
 [10464,
  21282,

In [9]:
# Example
tweets[0]

'arirang simply kpop kim hyung jun cross ha yeong playback'

In [10]:
# Onehot representation of tweets[0]
onehot_repr[0]

[34022, 13415, 25650, 2777, 17095, 15230, 30631, 17220, 6963, 3518]

In [11]:
# Next, we need to make our sentences to be of the same length
# To do this, we need to know the sentence with the most words in it
from nltk import word_tokenize
max_len = 0
for i in range(len(tweets)):
    word_tokens = word_tokenize(tweets[i])
    length = len(word_tokens)
    if length > max_len:
        max_len = length
max_len

27

In [12]:
import numpy as np
# It's good to know the average number of tokens per tweet
# So we can have an idea what a good window size is for our tweets
from nltk import word_tokenize
tweets_length = []
for i in range(len(tweets)):
    word_tokens = word_tokenize(tweets[i])
    length = len(word_tokens)
    tweets_length.append(length)
tweets_ave = np.mean(np.array(tweets_length))
tweets_ave

9.503210870659286

In [13]:
# We can use pad_sequences, and it will add zeros to each sentence with less than 27 words
# We can choose if it will be added at the beginning or at the end
# using the `padding` parameter which takes `pre` or `post`
padded_tweets = pad_sequences(onehot_repr, padding='pre',maxlen=max_len)
padded_tweets

array([[    0,     0,     0, ..., 17220,  6963,  3518],
       [    0,     0,     0, ..., 21087, 33330, 24450],
       [    0,     0,     0, ...,   867,  1354, 32762],
       ...,
       [    0,     0,     0, ...,  3339,  6669, 22932],
       [    0,     0,     0, ..., 34137, 25977, 30586],
       [    0,     0,     0, ...,  3883, 27273, 11962]])

Keras offers an **Embedding** layer that can be used for neural networks on text data.

It requires that the input data be **integer encoded**, so that each word is represented by a unique integer.

The Embedding layer is initialized with random weights and will learn an embedding for all of the words in the training dataset.

It is a flexible layer that can be used in a variety of ways, such as:

* It can be used alone to learn a word embedding that can be saved and used in another model later.
* It can be used as part of a deep learning model where the embedding is learned along with the model itself.
* It can be used to load a pre-trained word embedding model, a type of transfer learning.

The Embedding layer is defined as the first hidden layer of a network. It must specify 3 arguments:

1. **input_dim**: This is the size of the vocabulary in the text data. For example, if your data is integer encoded to values between 0-10, then the size of the vocabulary would be 11 words.
2. **output_dim**: This is the size of the vector space in which words will be embedded. It defines the size of the output vectors from this layer for each word. For example, it could be 32 or 100 or even larger. Test different values for your problem.
3. **input_length**: This is the length of input sequences, as you would define for any input layer of a Keras model. For example, if all of your input documents are comprised of 1000 words, this would be 1000.

In [14]:
# How many features do we want to describe each word?
# We can set this using `dim`, we can experiment with this

# We just need the output of the embedding layer, because it contains the weights that we want
# We need to flatten it, because each word will has its own vector which will create a 2d-matrix for each tweet
# We just need a 1-dimensional representation for each tweet
dim = 10
keras_model = Sequential()
keras_model.add(Embedding(vocab_size, dim, input_length=max_len))
keras_model.add(Flatten())
keras_model.compile('rmsprop',loss="categorical_crossentropy", metrics=['accuracy'])
keras_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 27, 10)            360000    
                                                                 
 flatten (Flatten)           (None, 270)               0         
                                                                 
Total params: 360,000
Trainable params: 360,000
Non-trainable params: 0
_________________________________________________________________


In [15]:
keras_model_tweets = keras_model.predict(padded_tweets)
keras_model_tweets



array([[ 0.02011312, -0.00384263, -0.03569106, ...,  0.02549319,
         0.03361343, -0.00767218],
       [ 0.02011312, -0.00384263, -0.03569106, ...,  0.0429675 ,
         0.03223759, -0.04315708],
       [ 0.02011312, -0.00384263, -0.03569106, ..., -0.0283402 ,
        -0.02560207, -0.009534  ],
       ...,
       [ 0.02011312, -0.00384263, -0.03569106, ..., -0.01660419,
        -0.03300374,  0.00409142],
       [ 0.02011312, -0.00384263, -0.03569106, ...,  0.03942401,
        -0.04004568, -0.00603427],
       [ 0.02011312, -0.00384263, -0.03569106, ...,  0.0034272 ,
        -0.02500931,  0.04393745]], dtype=float32)

In [16]:
keras_model_tweets.shape

(49675, 270)

In [17]:
import joblib
# Save the keras model to disk
keras_model_file = 'vectors/keras_model.sav'
joblib.dump(keras_model, keras_model_file)

Keras weights file (<HDF5 file "variables.h5" (mode r+)>) saving:
...layers\embedding
......vars
.........0
...layers\flatten
......vars
...optimizer
......vars
.........0
...vars
Keras model archive saving:
File Name                                             Modified             Size
config.json                                    2023-01-20 18:35:09         1083
metadata.json                                  2023-01-20 18:35:09           64
variables.h5                                   2023-01-20 18:35:09      1450288


['vectors/keras_model.sav']

In [18]:
import joblib
# Save the tfidvectorizer to disk
keras_model_tweets_file = 'vectors/keras_model_tweets.sav'
joblib.dump(keras_model_tweets, keras_model_tweets_file)

['vectors/keras_model_tweets.sav']

**c.2. Word2Vec**

Word2Vec is the combination of two techniques:
a. Continuous Bag of Words
b. Skipgram

Both of these are shallow neural networks (1 hidden layer) which learn the weights, which then act as word vector representations when multiplied to the onehot-encoded versions of the target vocabulary.

Predicting a target word (CBOW), and predicting the context/surrounding words given a word (Skipgram) are just **"fake problems"** we are trying to solve. What we are interested in is the **side effect**, which is the **word embedding vector**, or the weights generated by the neural network in solving that fake problems.

Word2Vec is not a true *unsupervised learning* technique (since there is some sort of error backpropagation taking place through correct and incorrect predictions), they are a **self-supervised technique**, a specific instance of *supervised learning* where the targets are generated from the input data. In order to get self-supervised models to learn interesting features, you have to come up with an interesting synthetic target and loss function.

*Self-supervised* methods rely heavily on the **The Distributional Hypothesis** derived from the *semantic theory*, and the basis for **statistical semantics**. The hypothesis states that:

    - Words that are used and occur in the same contexts tend to convey similar meanings.

The crux of a self-supervised model is that word representations learned while learning to predict the context of a word from the word itself (or vice versa) represent a vector space capturing **deep semantic and syntactic concepts and phenomena.** Meaning, learning from the context of a word can teach us about both its meaning and its syntactic role.




In [19]:
# import image module
from IPython.display import Image

# get the image
Image(url="pictures/word2vec-1.png", width=700, height=700)

In [20]:
# import image module
from IPython.display import Image

# get the image
Image(url="pictures/word2vec-2.png", width=700, height=500)

In [21]:
# import image module
from IPython.display import Image

# get the image
Image(url="pictures/word2vec-3.png", width=700, height=550)

With CBOW, the word embedding is **between the hidden layer and the output layer**, while with Skipgram, the word embedding is **between the input layer and the hidden layer**.

In [22]:
import gensim
### Lets train Word2vec from scratch
# min_count ignores all words with frequency less than min_count
# window size is determined by `window`
# the number of dimensions for each word vector is given by `vector_size`
vector_size = 10
window = 9
w2v_model=gensim.models.Word2Vec(words,window=window,vector_size=vector_size, min_count=1)
word2vec_model_file = './vectors/' +'word2vec_' + str(window) + str(vector_size) + '.model'
w2v_model.save(word2vec_model_file)

In [23]:
# All word vectors need to be averaged with respect to each sentences
# We should be getting the final vector for each sentence
import numpy as np
def avg_word2vec(doc):
    # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)
    
    return np.mean([w2v_model.wv[word] for word in doc if word in w2v_model.wv.index_to_key],axis=0)
        

In [24]:
from tqdm import tqdm

#apply for the entire sentences
X=[]
for i in tqdm(range(len(words))):
    X.append(avg_word2vec(words[i]))

100%|██████████| 49675/49675 [00:36<00:00, 1344.51it/s]


In [25]:
word2vec_tweets=np.array(X)
word2vec_tweets

array([[ 0.43820262, -0.4754556 ,  0.11711419, ...,  0.8534837 ,
        -1.470024  , -0.72018313],
       [ 0.8019137 , -1.0742214 ,  0.6273    , ...,  0.96964335,
        -2.5701778 , -1.698925  ],
       [ 0.7749731 , -0.4721202 ,  0.4301789 , ...,  1.0060902 ,
        -1.4998134 , -0.62398565],
       ...,
       [ 1.2551143 , -0.6130551 ,  0.83693   , ...,  1.9660778 ,
        -1.5066313 , -1.1546358 ],
       [ 1.0965337 , -0.71529126,  0.89206046, ...,  1.8730817 ,
        -1.32547   , -0.6296154 ],
       [ 0.16459951, -0.8970738 ,  0.19940762, ...,  2.136376  ,
        -0.63419354,  0.01595367]], dtype=float32)

In [26]:
word2vec_tweets.shape

(49675, 10)

In [27]:
import joblib
# Save the tfidvectorizer to disk
word2vec_tweets_file = 'vectors/word2vec_tweets.sav'
joblib.dump(word2vec_tweets, word2vec_tweets_file)

['vectors/word2vec_tweets.sav']

**c.3. fastText**

FastText splits out words using n-gram characters. Contrary to other popular models that learn word representations by assigning a distinct vector to each word, FastText is based on the skipgram model, where each word is represented as a bag of character n-grams. A vector representation is associated to each character n-gram; words being represented as the sum of these representations.

This approach is a significant improvement over word2vec and GloVe for two reasons:

* The ability to infer **out-of-vocabulary** words. Example, ‘England’ is related to ‘Netherlands’ because of land present in both as ‘lan’ and ‘and’.

* The robustness to spelling mistakes and typos.

Word2Vec is trained using **words**. while;

fastText is trained using **character n-gram**. Example: *capable n_gram=3* ==> *cap*, *apa*, *pab*, *abl*, *ble*

fastText is also often the **first choice** when you want to train **custom word embeddings** for your *domain*.

fastText is a **technique** similar to word2vec as well as a **library**. It is also lightweight and faster than others, but with performance metrics comparable to fancy models.


In [28]:
from gensim.models import FastText
window = 9
vector_size = 10
ft_model = FastText(window=window, min_count=2, sentences=words, vector_size=vector_size)
ft_model_file = './vectors/' +'fasttext_' + str(window) + str(vector_size) + '.model'
ft_model.save(ft_model_file)

In [29]:
# All word vectors need to be averaged with respect to each sentences
# We should be getting the final vector for each sentence
import numpy as np
def avg_fasttext(doc):
    # remove out-of-vocabulary words
    #sent = [word for word in doc if word in model.wv.index_to_key]
    #print(sent)
    
    return np.mean([ft_model.wv[word] for word in doc],axis=0)
        

In [30]:
from tqdm import tqdm

#apply for the entire sentences
X_fast=[]
for i in tqdm(range(len(words))):
    X_fast.append(avg_fasttext(words[i]))

100%|██████████| 49675/49675 [00:02<00:00, 21143.10it/s]


In [31]:
ft_tweets = np.array(X_fast)
ft_tweets

array([[ 1.7083393 ,  0.16257861, -0.3256186 , ...,  1.6048752 ,
        -0.6958239 , -0.861176  ],
       [ 1.4671383 ,  0.27155647, -2.1007617 , ...,  2.8497617 ,
        -0.8527399 , -0.83536667],
       [ 1.7470719 ,  0.10171478, -0.4181965 , ...,  1.5183479 ,
        -0.21019751, -0.65540004],
       ...,
       [ 1.5684685 , -0.18233915, -0.61222434, ...,  1.6030623 ,
        -1.0378721 , -1.6156934 ],
       [ 1.7817134 ,  0.7445152 , -0.77702844, ...,  2.0351388 ,
        -0.6231385 , -1.0140816 ],
       [ 1.2703185 ,  1.3933463 , -0.4189272 , ...,  2.0525815 ,
        -0.8890051 , -1.0568657 ]], dtype=float32)

In [32]:
ft_tweets.shape

(49675, 10)

In [33]:
import joblib
# Save the tfidvectorizer to disk
ft_tweets_file = 'vectors/fasttext_tweets.sav'
joblib.dump(ft_tweets,ft_tweets_file)

['vectors/fasttext_tweets.sav']

#### **End. Thank you!**