# Word2Vec - Tensorflow, Skip-Gram, NCS Loss

In [0]:
##imports
import numpy as np
import pandas as pd
import gensim
from bs4 import BeautifulSoup
import re
import nltk
##
import tensorflow as tf
from tensorflow.keras.layers import Dense, Embedding, Input, Dot, Reshape
from tensorflow.keras.models import Model
import random as rn

In [0]:
##getting data from https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews/kernels?sortBy=voteCount&group=everyone&pageSize=20&datasetId=134715
data_imdb = pd.read_csv(r'/content/drive/My Drive/IMDB Dataset.csv')

In [4]:
#info
data_imdb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [0]:
def clean_text(text):
    '''Cleans the text data'''
    ##remove html tags
    text = BeautifulSoup(text, "html.parser").get_text()
    return text.lower()

In [0]:
data_imdb['review'] = data_imdb['review'].apply(clean_text)

In [7]:
data_imdb.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [8]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
##getting sentence wise data
list_sents = [nltk.word_tokenize(sent) for sent_tok in data_imdb.review for sent in nltk.sent_tokenize(sent_tok)]

In [0]:
##to use tf.keras.preprocessing.sequence.skipgrams, we have to encode our sentence to numbers. so used Tokenizer class
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(list_sents)
seq_texts = tokenizer.texts_to_sequences(list_sents) ##list of list+

In [0]:
class word2vecNCS(Model):
    
    def __init__(self, vocab_size, embed_size, num_sampled, **kwargs):
        '''NCS Word2Vec
        vocab_size: Size of vocabulary you have
        embed_size: Embedding size needed
        num_sampled: No of negative sampled to generate'''
        super(word2vecNCS, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.num_sampled = num_sampled
        ##embedding layer
        self.embed_layer = Embedding(input_dim=vocab_size, output_dim=embed_size,embeddings_initializer=tf.keras.initializers.RandomUniform(seed=32))
        ##reshing layer
        self.reshape_layer = Reshape((self.embed_size,))
    def build(self, input_shape):
        ##weights needed for nce loss
        self.nce_weight = self.add_weight(shape=(self.vocab_size, self.embed_size),
                             initializer=tf.keras.initializers.TruncatedNormal(mean=0, stddev= (1/self.embed_size**0.5)),
                             trainable=True, name="nce_weight")
        #biases needed nce loss
        self.nce_bias = self.add_weight(shape=(self.vocab_size), initializer="zeros", trainable=True, name="nce_bias")

    def call(self, input_center_word, input_context_word):
        '''
        input_center_word: center word
        input_context_word: context word''' 
        ##giving center word and getting the embedding
        embedd_out = self.embed_layer(input_center_word)
        ##rehaping 
        embedd_out = self.reshape_layer(embedd_out)
        ##calculating nce loss
        nce_loss = tf.reduce_sum(tf.nn.nce_loss(weights=self.nce_weight, 
                                  biases=self.nce_bias, 
                                  labels=input_context_word, 
                                  inputs=embedd_out, 
                                  num_sampled=self.num_sampled, 
                                  num_classes=self.vocab_size))
        return nce_loss

In [0]:
def generate_sgns():
    for seq in seq_texts:
        generated_samples, labels = tf.keras.preprocessing.sequence.skipgrams(sequence=seq, 
                                                                      vocabulary_size=len(tokenizer.word_index)+1, 
                                                                      window_size=2, negative_samples=0)
        length_samples = len(generated_samples)
        for i in range(length_samples):
            yield [generated_samples[i][0]], [generated_samples[i][1]]

In [0]:
##creating the tf dataset
tfdataset_gen = tf.data.Dataset.from_generator(generate_sgns, output_types=(tf.int64, tf.int64))
tfdataset_gen = tfdataset_gen.repeat().batch(1024).prefetch(tf.data.experimental.AUTOTUNE)

In [22]:
##training

##optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)

sgncs_w2v = word2vecNCS(len(tokenizer.word_index)+1, 100, 32)

##train step function to train
@tf.function
def train_step(input_center, input_context):
    with tf.GradientTape() as tape:
        #forward propagation
        loss = sgncs_w2v(input_center, input_context)

    #getting gradients
    gradients = tape.gradient(loss, sgncs_w2v.trainable_variables)
    #applying gradients
    optimizer.apply_gradients(zip(gradients, sgncs_w2v.trainable_variables))

    return loss, gradients

##number of epochs
no_iterations=10000

##metrics # Even if you use .fit method, it alsocalculates batchwise loss/metric and aggregates those.  
train_loss = tf.keras.metrics.Mean(name='train_loss')

#tensorboard file writers
wtrain = tf.summary.create_file_writer(logdir='/content/drive/My Drive/word2vec_logs/logs/w2vncs/train')

##check point to save
checkpoint_path = "/content/drive/My Drive/word2vec_logs/checkpoints/w2vNCS/train"
ckpt = tf.train.Checkpoint(optimizer=optimizer, model=sgncs_w2v)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=3)


counter = 0
#training anf validating
for in_center, in_context in tfdataset_gen:
    #train step
    loss_, gradients = train_step(in_center, in_context)
    #adding loss to train loss
    train_loss(loss_)

    counter = counter + 1
         
    ##tensorboard 
    with tf.name_scope('per_step_training'):
        with wtrain.as_default():
            tf.summary.scalar("batch_loss", loss_, step=counter)
    with tf.name_scope("per_batch_gradients"):
        with wtrain.as_default():
            for i in range(len(sgncs_w2v.trainable_variables)):
                name_temp = sgncs_w2v.trainable_variables[i].name
                tf.summary.histogram(name_temp, gradients[i], step=counter)
    
    if counter%100 == 0:
        #printing
        template = '''Done {} iterations, Loss: {:0.6f}'''
    
        print(template.format(counter, train_loss.result()))

        if counter%200 == 0:
            ckpt_save_path  = ckpt_manager.save()
            print ('Saving checkpoint for iteration {} at {}'.format(counter+1, ckpt_save_path))
        
        train_loss.reset_states()
    if counter > no_iterations :
        break

Done 100 iterations, Loss: 178544.187500
Done 200 iterations, Loss: 175835.203125
Saving checkpoint for iteration 201 at /content/drive/My Drive/word2vec/checkpoints/w2vNCS/train/ckpt-1
Done 300 iterations, Loss: 169735.578125
Done 400 iterations, Loss: 164666.984375
Saving checkpoint for iteration 401 at /content/drive/My Drive/word2vec/checkpoints/w2vNCS/train/ckpt-2
Done 500 iterations, Loss: 162460.453125
Done 600 iterations, Loss: 159760.906250
Saving checkpoint for iteration 601 at /content/drive/My Drive/word2vec/checkpoints/w2vNCS/train/ckpt-3
Done 700 iterations, Loss: 151500.765625
Done 800 iterations, Loss: 150536.187500
Saving checkpoint for iteration 801 at /content/drive/My Drive/word2vec/checkpoints/w2vNCS/train/ckpt-4
Done 900 iterations, Loss: 149034.468750
Done 1000 iterations, Loss: 150517.093750
Saving checkpoint for iteration 1001 at /content/drive/My Drive/word2vec/checkpoints/w2vNCS/train/ckpt-5
Done 1100 iterations, Loss: 142637.484375
Done 1200 iterations, Loss

KeyboardInterrupt: ignored

In [23]:
##you can load from below checkpoints
ckpt_manager.checkpoints

['/content/drive/My Drive/word2vec/checkpoints/w2vNCS/train/ckpt-8',
 '/content/drive/My Drive/word2vec/checkpoints/w2vNCS/train/ckpt-9',
 '/content/drive/My Drive/word2vec/checkpoints/w2vNCS/train/ckpt-10']

In [33]:
sgncs_w2v.get_weights()[0]

array([[ 4.3952752e-02, -3.9126575e-02,  4.7340643e-02, ...,
        -1.5561890e-02, -8.4457621e-03, -3.8311258e-03],
       [ 1.2673739e-02,  2.6604220e-01, -1.2200137e-01, ...,
        -2.6649845e-01,  4.3133581e-01,  1.6652634e+00],
       [ 5.0035398e-02,  2.2489849e-01,  1.3173567e-02, ...,
         5.7263959e-02,  2.8345990e-01,  1.7488687e+00],
       ...,
       [ 8.9006796e-03, -1.7985929e-02,  2.8799288e-03, ...,
         7.4115507e-03, -1.2984991e-02,  1.3159607e-02],
       [ 1.2338273e-03,  1.1289597e-02, -1.5265692e-02, ...,
        -1.8576480e-02,  1.1550784e-02, -1.6645409e-02],
       [-3.9734341e-02,  2.7531121e-02, -3.8868021e-02, ...,
        -3.4919988e-02,  2.7181890e-02,  2.0708870e-02]], dtype=float32)

In [0]:
#3getting the word vectors
word_vectors = sgncs_w2v.get_weights()[0]
##convering into dict
word_vectors_dict = {}
for word, number in tokenizer.word_index.items():
    word_vectors_dict[word] = word_vectors[number]

In [35]:
##word vector for movies
word_vectors_dict['movies']

array([-0.02759881,  0.375678  ,  0.00394442,  0.03537954, -0.3508165 ,
       -0.3843107 , -1.902659  , -0.03924064,  0.03567823,  0.22394422,
        1.3254056 , -0.2738524 , -0.10814386, -0.16480844,  1.5734187 ,
        0.24689345,  0.1799947 , -0.31367883, -0.03789733, -0.2153853 ,
        0.23517315, -0.43801263, -1.3624109 , -1.4466581 ,  1.2364206 ,
        0.16642871, -0.05696939, -1.5534589 ,  0.08557777,  0.19669291,
       -0.37249157, -0.21300569, -0.03838867,  0.08273836, -0.10653205,
        0.06428144, -0.15504088, -0.17317666,  0.17255655,  0.2918513 ,
        1.3766997 ,  0.40330094,  0.46494156,  0.80065376, -0.8771551 ,
       -0.04986344, -0.8916342 ,  0.03179355,  1.5216345 ,  0.18060522,
        0.06709374, -0.0697518 , -0.11124301,  0.18427111, -1.3699241 ,
        0.10965209, -1.0532825 ,  0.37451154, -0.08506859, -0.0325211 ,
        0.298475  ,  0.15674552, -0.06431884, -0.260983  , -0.02124155,
        0.07537135, -0.42176956, -0.22111684,  1.740518  , -0.26

In [0]:
import warnings
warnings.filterwarnings("ignore")

In [0]:
##https://stackoverflow.com/a/54809609/10899915
def save_word2vec_format_dict(fname, vocab, vectors, binary=True, total_vec=None):
  """Store the input-hidden weight matrix in the same format used by the original
    C word2vec-tool, for compatibility.

    Parameters
    ----------
    fname : str
        The file path used to save the vectors in.
    vocab : dict
        The vocabulary of words.
    vectors : numpy.array
        The vectors to be stored.
    binary : bool, optional
        If True, the data wil be saved in binary word2vec format, else it will be saved in plain text.
    total_vec : int, optional
        Explicitly specify total number of vectors
        (in case word vectors are appended with document vectors afterwards).
    """
  if not (vocab or vectors):
      raise RuntimeError("no input")
  if total_vec is None:
      total_vec = len(vocab)
  vector_size = vectors.shape[1]
  assert (len(vocab), vector_size) == vectors.shape
  with utils.smart_open(fname, 'wb') as fout:
      print(total_vec, vector_size)
      fout.write(utils.to_utf8("%s %s\n" % (total_vec, vector_size)))
      # store in sorted order: most frequent words at the top
      for word, row in vocab.items():
          if binary:
              row = row.astype(REAL)
              fout.write(utils.to_utf8(word) + b" " + row.tostring())
          else:
              fout.write(utils.to_utf8("%s %s\n" % (word, ' '.join(repr(val) for val in row))))

In [38]:
##saving the file
from numpy import zeros, dtype, float32 as REAL, ascontiguousarray, fromstring
from gensim import utils

model_gensim = gensim.models.keyedvectors.Word2VecKeyedVectors(vector_size=50)
model_gensim.vocab = word_vectors_dict
model_gensim.vectors = np.array(list(word_vectors_dict.values()))
save_word2vec_format_dict(binary=True, fname='w2vncs.bin', total_vec=len(word_vectors_dict), vocab=model_gensim.vocab, vectors=model_gensim.vectors)

206705 100


In [0]:
##loading model
model_gensim = gensim.models.keyedvectors.Word2VecKeyedVectors.load_word2vec_format('w2vncs.bin', binary=True)

In [40]:
##word vectors
model_gensim.wv['movies']

array([-0.02759881,  0.375678  ,  0.00394442,  0.03537954, -0.3508165 ,
       -0.3843107 , -1.902659  , -0.03924064,  0.03567823,  0.22394422,
        1.3254056 , -0.2738524 , -0.10814386, -0.16480844,  1.5734187 ,
        0.24689345,  0.1799947 , -0.31367883, -0.03789733, -0.2153853 ,
        0.23517315, -0.43801263, -1.3624109 , -1.4466581 ,  1.2364206 ,
        0.16642871, -0.05696939, -1.5534589 ,  0.08557777,  0.19669291,
       -0.37249157, -0.21300569, -0.03838867,  0.08273836, -0.10653205,
        0.06428144, -0.15504088, -0.17317666,  0.17255655,  0.2918513 ,
        1.3766997 ,  0.40330094,  0.46494156,  0.80065376, -0.8771551 ,
       -0.04986344, -0.8916342 ,  0.03179355,  1.5216345 ,  0.18060522,
        0.06709374, -0.0697518 , -0.11124301,  0.18427111, -1.3699241 ,
        0.10965209, -1.0532825 ,  0.37451154, -0.08506859, -0.0325211 ,
        0.298475  ,  0.15674552, -0.06431884, -0.260983  , -0.02124155,
        0.07537135, -0.42176956, -0.22111684,  1.740518  , -0.26

In [54]:
model_gensim.wv.most_similar(positive='movie')

[('on', 0.9762333631515503),
 ('one', 0.9754350185394287),
 ('is', 0.9748979806900024),
 ('about', 0.9745714664459229),
 ('for', 0.9742895364761353),
 ('like', 0.9740601778030396),
 ('film', 0.9738096594810486),
 ('with', 0.9723396301269531),
 ('in', 0.9720105528831482),
 ('so', 0.9718752503395081)]