**Implement Word2vec with skipgrams using negative sampling on selected Questions** 

In [None]:
from collections import Counter
import pandas as pd
import numpy as np
from datetime import datetime
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tqdm.notebook import tqdm
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import TextVectorization, Dense,Input,Activation,Embedding, Dot, Flatten, Dropout
from tensorflow.keras.models import Model
import tensorflow.keras.initializers
from sklearn.metrics import f1_score
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.losses import BinaryCrossentropy
import h5py
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping
import io

In [None]:
#get the preprocessed selected questions data
#NOTE(review): read_pickle can execute arbitrary code while unpickling, so only load files from a trusted source
pre_processed_data = pd.read_pickle("/content/drive/MyDrive/StackOverflow_CaseStudy/Preprocessed_selected_data.pkl")

In [None]:
#dimensions of the loaded data as (rows, columns)
pre_processed_data.shape

(296099, 8)

**Check the number of words for each question to decide the maximum length**

In [None]:
#number of whitespace-separated tokens in each question's text
word_count = [len(str(text).split()) for text in pre_processed_data['Ques_Text']]

In [None]:
#sort once, outside the loop: the sorted order does not change between iterations
var = sorted(word_count)
for i in range(90,100):
    #element sitting at the i-th percentile position of the sorted word counts
    print("{} percentile value is {}".format(i,var[int(len(var)*(float(i)/100))]))
#the largest word count is the 100th percentile
print ("100 percentile value is ",var[-1])

90 percentile value is 195
91 percentile value is 203
92 percentile value is 211
93 percentile value is 222
94 percentile value is 233
95 percentile value is 247
96 percentile value is 265
97 percentile value is 290
98 percentile value is 326
99 percentile value is 398
100 percentile value is  10245


We will keep the max length as 200 since it can cover more than 90% of the questions

In [None]:
#Word frequency across the corpus: join every question into one string,
#split it on whitespace and count the occurrences of each token
word_freq_corpus = (
    pd.Series(' '.join(pre_processed_data['Ques_Text']).split())
    .value_counts()
)
word_freq_corpus

not                   738796
code                  329440
use                   320126
using                 274556
like                  255820
                       ...  
dftwiki                    1
csc231                     1
imagicon                   1
compile-time-error         1
datecal                    1
Length: 486804, dtype: int64

In [None]:
#We will consider the words occurring more than 10 times in the corpus
vocab_size = int((word_freq_corpus > 10).sum())
vocab_size

40278

In [None]:
#hard-coded vocabulary size; equals the value displayed by the frequency-threshold cell above
#NOTE(review): presumably kept so the notebook can resume without recomputing word frequencies - keep in sync if the data or the threshold changes
vocab_size=40278

**Prepare the Question text dataset**

In [None]:
#wrap the raw question strings in a tf.data dataset, one element per question
question_text = tf.data.Dataset.from_tensor_slices(pre_processed_data['Ques_Text'].tolist())


In [None]:
#every question is encoded as exactly this many token ids
max_ques_text_length = 200
#standardize=None: no extra text standardization is applied by the layer;
#the vocabulary is capped at vocab_size entries and the output is integer ids
vectorize_layer = TextVectorization(
    max_tokens=vocab_size,
    standardize=None,
    output_mode='int',
    output_sequence_length=max_ques_text_length,
)
#build the vocabulary from the corpus, fed in batches of 1024 questions
vectorize_layer.adapt(question_text.batch(1024))


In [None]:
#index -> token lookup: position i holds the token that is encoded as integer i
inverse_vocab = vectorize_layer.get_vocabulary()

In [None]:
# Vectorize all the questions in question_text.
# Note: this rebinds `question_text` from raw strings to token-id sequences,
# so the cell is meant to run once after the dataset has been created.
question_text = (
    question_text
    .batch(1024)
    .prefetch(tf.data.AUTOTUNE)
    .map(vectorize_layer)
    .unbatch()
)

In [None]:
#materialize the vectorized dataset as an in-memory list of NumPy arrays, one per question
question_sequences = list(question_text.as_numpy_iterator())
print(len(question_sequences))

296099


In [None]:
#sanity check: show the first five encoded questions next to their decoded tokens
for seq in question_sequences[:5]:
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

[  245  2543   575    59   789   396   583   431    53   278    95  4298
  1467 12542  1633   583   644   236    19  3063   149   445  1252   242
  1633    12  1089   251    19   111   583    29   227  5160     1     7
     6    12  1633   197     3    95  1252   539    31     4    30    54
  1633    30    95   382    47   889    93  2834   709  1858  1240   106
     7  1758   709   503  1633   273  3462  1889    94  1078    12   197
    15  1427   213   479    53   776  4137    94   120   149    59   479
 15241     1    53   149    78     3    84   133   635  3991  1807   149
  9022    47    70   276     7   484   479  1052   883  2543 16726  7076
  5890  7076 16726   333  1600  1225  4221  7076   102  1946  7076   149
   330   536    88   347   403  1054  5890  2543   479  3721    63  3220
  5890   482    59   333  3059   154    36   307  3059    56     1  4224
   635   102     8   986   854   482    59    70  2786  3744    42  3123
  3868   565  4221  3123    47   154  6864    84  1

**Generate training examples (positive and negative skipgrams) from question text**

For training Word2Vec with skip-grams we need to generate positive and negative samples of (target word, context word) pairs, each with a label (1 or 0). For a positive sample, the target and context words occur together in the corpus (within the chosen window), while for a negative sample the context word is randomly generated, i.e. a word that in all likelihood has not occurred together with the corresponding target word in the corpus. This way the multi-class classification problem is converted into a binary classification problem.

In [None]:
def generate_training_data(sequences, window_size, vocab_size, seed):
  """Generate skip-gram training rows with negative sampling.

  Uses the helpers in tf.keras.preprocessing.sequence to turn int-encoded
  sentences into (target, context, label) rows.

  Args:
    sequences: iterable of int-encoded sentences (sequences of token ids).
    window_size: skip-gram window size passed to `skipgrams`.
    vocab_size: vocabulary size used for the sampling table and for drawing
      negative context words.
    seed: seed for Python's `random` module, which the Keras skip-gram helper
      draws from; `None` leaves the global random state untouched.

  Returns:
    Integer NumPy array of shape (n, 3) whose columns are target word id,
    context word id and label (1 = positive pair, 0 = negative pair).
    The array has shape (0, 3) when no pair is produced.
  """
  import random  # local import: only needed here to seed the sampler

  # Seed once up front so a full run is reproducible. The seed is not passed
  # to every `skipgrams` call because that would reset the generator to the
  # same state after each sentence.
  if seed is not None:
    random.seed(seed)

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Per-sentence blocks are collected here and concatenated once at the end;
  # stacking onto a growing array inside the loop re-copies it every time.
  chunks = []

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm(sequences):
    # negative_samples=2.0 requests twice as many negative as positive pairs.
    pairs, label_seq = tf.keras.preprocessing.sequence.skipgrams(sequence,vocabulary_size=vocab_size,
          sampling_table=sampling_table, window_size=window_size, negative_samples=2.0, shuffle=True)

    # A sentence can yield no pairs at all; it contributes no rows.
    if len(pairs) == 0:
      continue

    chunks.append(np.column_stack((np.array(pairs, dtype=int), np.array(label_seq, dtype=int))))

  if not chunks:
    return np.empty((0,3), dtype=int)
  return np.concatenate(chunks, axis=0)

In [None]:
#performing this operation in batches to avoid running out of memory
no_of_batches=4
#ceiling division: with floor division the sequences left over after the last
#full batch would never be processed
batch_size = -(-len(question_sequences) // no_of_batches)
path = "/content/drive/MyDrive/StackOverflow_CaseStudy/DataFiles/"

for i in range(0,no_of_batches):
  #slicing past the end of the list is safe, so the last batch is simply shorter
  data = generate_training_data(sequences=question_sequences[i*batch_size : (i+1)*batch_size],
                                window_size=2, vocab_size=vocab_size, seed=10)
  #save the (target word, context word) pair and labels generated to a hdf5 file;
  #`with` closes the file even if writing the dataset fails
  with h5py.File(path+str(i)+'_data.hdf5', 'w') as hf:
    hf.create_dataset('dataset_1', data=data)


  0%|          | 0/74024 [00:00<?, ?it/s]

In [None]:
#directory holding the per-batch hdf5 files
#NOTE(review): same value as in the generation cell; presumably repeated so the loading cells can run without re-running generation
path = "/content/drive/MyDrive/StackOverflow_CaseStudy/DataFiles/"

In [None]:
#get the data generated for all the batches: open the four per-batch files
#(0-3) and take a handle to the dataset stored in each; the files are left open
batch_files = [h5py.File(path+str(idx)+'_data.hdf5', 'r') for idx in range(4)]
data1, data2, data3, data4 = [batch_file.get('dataset_1') for batch_file in batch_files]
#`hf` ends up bound to the last opened file
hf = batch_files[-1]

In [None]:
#(rows, columns) of each batch's dataset
for batch_data in (data1, data2, data3, data4):
  print(batch_data.shape)

(21448311, 3)
(18481056, 3)
(17546679, 3)
(17182413, 3)


In [None]:
#combine the data generated for all the batches: read each dataset into a
#NumPy array and join them row-wise
data = np.concatenate([np.array(batch) for batch in (data1, data2, data3, data4)], axis=0)
data.shape

(74658459, 3)

In [None]:
#split the columns: target word id, context word id, 0/1 label
targets, contexts, labels = data[:, 0], data[:, 1], data[:, 2]

In [None]:
#the three columns must all have the same length
for column in (targets, contexts, labels):
  print(column.shape)

(74658459,)
(74658459,)
(74658459,)


**Create a simple Deep Learning model to train on the above data to generate word vectors**

In [None]:
#size of each learned word vector (output_dim of the embedding layers)
embedding_dim = 128
#mini-batch size used when fitting the model
BATCH_SIZE = 1024

In [None]:
#input and embedding layer for target words
target_word = Input(shape=(1,))
#weights of this layer will be the word embeddings; it is named explicitly
#because the export step retrieves it with get_layer('embedding')
target_emb = tf.keras.layers.Embedding(input_dim=vocab_size+1, output_dim=embedding_dim, input_length=1,
                                       name='embedding')(target_word)

#input and embedding layer for context words
context_word = Input(shape=(1,))
context_emb = tf.keras.layers.Embedding(input_dim=vocab_size+1, output_dim=embedding_dim, input_length=1,
                                        name='context_embedding')(context_word)

#dot product of the two embedding vectors is used as their (unnormalised) similarity;
#it equals cosine similarity only for unit-length vectors.
#Both inputs have shape (None, 1, embedding_dim), so the contraction must run over
#axis 2 (the embedding axis) to give one score per pair: shape (None, 1, 1).
#axes=1 would contract the length-1 axis instead and produce a
#(None, embedding_dim, embedding_dim) outer product.
dot = tf.keras.layers.Dot(axes=2)([target_emb, context_emb])

#small fully connected head on top of the similarity score
dense1 = tf.keras.layers.Dense(64, activation=tf.nn.relu)(dot)
drp1 = tf.keras.layers.Dropout(0.2)(dense1)

dense2 = tf.keras.layers.Dense(32, activation=tf.nn.relu)(drp1)
drp2 = tf.keras.layers.Dropout(0.2)(dense2)

#drop the length-1 axis before the output layer
flt = tf.keras.layers.Flatten()(drp2)
#sigmoid output: probability that (target, context) is a positive pair
output = tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)(flt)

model =Model(inputs=[target_word, context_word], outputs=output)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1, 128)       5155712     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 1, 128)       5155712     ['input_2[0][0]']                
                                                                                              

In [None]:
#compile the model with the Adam optimiser, binary cross-entropy loss and accuracy as metric
model.compile(optimizer='adam', loss = BinaryCrossentropy(), metrics=['accuracy'])

#define Tensorboard callback to log the losses and to generate loss and accuracy curve later
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

#callback to stop the training once validation accuracy has not improved by more than
#min_delta (0.01) for `patience` (2) epochs
#NOTE(review): with min_delta=0.01, gains below one percentage point count as "no improvement",
#so training can stop while val_accuracy is still rising slowly - confirm this is intended
earlystop = EarlyStopping(monitor='val_accuracy', min_delta=0.01, patience=2, verbose=1)

#checkpoint file name pattern: epoch number and validation accuracy are filled in for each save
filepath="/content/drive/MyDrive/StackOverflow_CaseStudy/Saved_Model/weights-{epoch:02d}-{val_accuracy:.4f}.hdf5"
#callback to save the model after an epoch only if validation accuracy improved on the best value so far
checkpoint = ModelCheckpoint(filepath=filepath, monitor='val_accuracy',  verbose=1, save_best_only=True, mode='auto')

#callbacks passed to model.fit
callbacks = [tensorboard_callback, earlystop, checkpoint]

In [None]:
#train the Word2Vec model for at most 10 epochs, holding out 15% of the samples for validation
#(Keras takes the validation split from the end of the input arrays, before any shuffling)
model.fit([targets, contexts], labels, batch_size=BATCH_SIZE, epochs=10, validation_split=.15, callbacks=callbacks)

Epoch 1/10
Epoch 1: val_accuracy improved from -inf to 0.91747, saving model to /content/drive/MyDrive/StackOverflow_CaseStudy/Saved_Model/weights-01-0.9175.hdf5
Epoch 2/10
Epoch 2: val_accuracy improved from 0.91747 to 0.91974, saving model to /content/drive/MyDrive/StackOverflow_CaseStudy/Saved_Model/weights-02-0.9197.hdf5
Epoch 3/10
Epoch 3: val_accuracy improved from 0.91974 to 0.92045, saving model to /content/drive/MyDrive/StackOverflow_CaseStudy/Saved_Model/weights-03-0.9205.hdf5
Epoch 3: early stopping


<keras.callbacks.History at 0x7fd893c1d6d0>

In [None]:
#get the model with best performance
#NOTE(review): the checkpoint file name is hard-coded to the epoch-3 file of one particular run; update it after retraining
saved_model = tf.keras.models.load_model('/content/drive/MyDrive/StackOverflow_CaseStudy/Saved_Model/weights-03-0.9205.hdf5')
#get the weights of the embedding layer, these are the word vectors for our vocabulary
weights = saved_model.get_layer('embedding').get_weights()[0]
#token list of the vectorization layer; the export cells pair vocab[i] with weights[i]
vocab = vectorize_layer.get_vocabulary()

In [None]:
#write vocab-vector dictionary to a file, one "<word> <v1> <v2> ..." line per token
#`with` closes the file even if a write fails part-way through
with io.open('/content/drive/MyDrive/StackOverflow_CaseStudy/Saved_Model/word_vectors.txt', 'w', encoding='utf-8') as out_v:
  for index, word in enumerate(vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write(word+' '+' '.join([str(x) for x in vec]) + "\n")

In [None]:
#below files can be used to analyse created word vectors in Tensorflow's Embedding projector:
#vectors.tsv holds one tab-separated vector per line, metadata.tsv the matching word per line
#`with` closes both files even if a write fails part-way through
with io.open('/content/drive/MyDrive/StackOverflow_CaseStudy/Saved_Model/vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('/content/drive/MyDrive/StackOverflow_CaseStudy/Saved_Model/metadata.tsv', 'w', encoding='utf-8') as out_m:
  for index, word in enumerate(vocab):
    if index == 0:
      continue  # skip 0, it's padding.
    vec = weights[index]
    #line k of vectors.tsv and line k of metadata.tsv describe the same word
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")