In [5]:
import numpy as np
import pandas as pd

from gensim.models import Word2Vec
from gensim.models import KeyedVectors

import tensorflow as tf
import string
import tqdm
import re
import io

#### Train word2vec model on your own corpus

#### split to word level but save as sentences. 2d array

In [18]:
sentences = ['This chapter is about exploring different methods to classify toxic comments for multi label classification',
           'This chapter is about exploring different methods to classify toxic comments for multi label classification'] 


In [23]:
word_sents = []
for each in sentences:
    temp = []
    temp = each.split(' ')
    word_sents.append(temp)

In [24]:
model = Word2Vec(min_count=1, vector_size = 10)
model.build_vocab(word_sents)
model.train(word_sents, total_examples = len(sentences), epochs=3)

(7, 90)

#### The trained word vectors are stored in a KeyedVectors instance, as model.wv

In [25]:
word_vectors = model.wv
word_vectors.save("word2vec.wordvectors")

In [26]:
wv = KeyedVectors.load("word2vec.wordvectors", mmap='r')

In [27]:
vector = wv['chapter']
vector.shape

(10,)

#### Word2vec from scratch
#### Help has been taken from https://www.tensorflow.org/tutorials/text/word2vec

In [46]:
AUTOTUNE = tf.data.AUTOTUNE

In [47]:
sentence = 'This chapter is about exploring different methods to classify toxic comments for multi label classification' 


In [48]:
tokens = list(sentence.split(' '))
tokens

['This',
 'chapter',
 'is',
 'about',
 'exploring',
 'different',
 'methods',
 'to',
 'classify',
 'toxic',
 'comments',
 'for',
 'multi',
 'label',
 'classification']

In [49]:
vocab, index = {}, 1
vocab['<pad>'] = 0
for token in tokens:
    if token not in vocab:
        vocab[token] = index
        index += 1
vocab_size = len(vocab)
vocab

{'<pad>': 0,
 'This': 1,
 'chapter': 2,
 'is': 3,
 'about': 4,
 'exploring': 5,
 'different': 6,
 'methods': 7,
 'to': 8,
 'classify': 9,
 'toxic': 10,
 'comments': 11,
 'for': 12,
 'multi': 13,
 'label': 14,
 'classification': 15}

In [50]:
inverse_vocab = {index: token for token, index in vocab.items()}
inverse_vocab

{0: '<pad>',
 1: 'This',
 2: 'chapter',
 3: 'is',
 4: 'about',
 5: 'exploring',
 6: 'different',
 7: 'methods',
 8: 'to',
 9: 'classify',
 10: 'toxic',
 11: 'comments',
 12: 'for',
 13: 'multi',
 14: 'label',
 15: 'classification'}

In [51]:
example_sentence = [vocab[word] for word in tokens]
example_sentence

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

In [52]:
window_size = 3
positive_skip_gram, _ = tf.keras.preprocessing.sequence.skipgrams(
    example_sentence,
    shuffle = False,
    vocabulary_size = vocab_size,
    window_size = window_size,
    negative_samples = 0.0
)
len(positive_skip_gram), positive_skip_gram

(78,
 [[1, 2],
  [1, 3],
  [1, 4],
  [2, 1],
  [2, 3],
  [2, 4],
  [2, 5],
  [3, 1],
  [3, 2],
  [3, 4],
  [3, 5],
  [3, 6],
  [4, 1],
  [4, 2],
  [4, 3],
  [4, 5],
  [4, 6],
  [4, 7],
  [5, 2],
  [5, 3],
  [5, 4],
  [5, 6],
  [5, 7],
  [5, 8],
  [6, 3],
  [6, 4],
  [6, 5],
  [6, 7],
  [6, 8],
  [6, 9],
  [7, 4],
  [7, 5],
  [7, 6],
  [7, 8],
  [7, 9],
  [7, 10],
  [8, 5],
  [8, 6],
  [8, 7],
  [8, 9],
  [8, 10],
  [8, 11],
  [9, 6],
  [9, 7],
  [9, 8],
  [9, 10],
  [9, 11],
  [9, 12],
  [10, 7],
  [10, 8],
  [10, 9],
  [10, 11],
  [10, 12],
  [10, 13],
  [11, 8],
  [11, 9],
  [11, 10],
  [11, 12],
  [11, 13],
  [11, 14],
  [12, 9],
  [12, 10],
  [12, 11],
  [12, 13],
  [12, 14],
  [12, 15],
  [13, 10],
  [13, 11],
  [13, 12],
  [13, 14],
  [13, 15],
  [14, 11],
  [14, 12],
  [14, 13],
  [14, 15],
  [15, 12],
  [15, 13],
  [15, 14]])

In [53]:
target_word, context_word = positive_skip_gram[0]
print(inverse_vocab[target_word], inverse_vocab[context_word])
num_ns = 4
context_class = tf.reshape(tf.constant(context_word, dtype = 'int64'), (1,1))
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes = context_class,
    num_true = 1,
    num_sampled = num_ns,
    unique = True,
    range_max = vocab_size,
    name = 'negative_sampling'
)
negative_sampling_candidates, [inverse_vocab[index.numpy()] for index in negative_sampling_candidates]


This chapter


(<tf.Tensor: shape=(4,), dtype=int64, numpy=array([ 1, 10, 11,  2])>,
 ['This', 'toxic', 'comments', 'chapter'])

In [54]:
squeezed_context_class = tf.squeeze(context_class, 1)
context = tf.concat([squeezed_context_class, negative_sampling_candidates], 0)
label = tf.constant([1] + [0] * num_ns, dtype = 'int64')
target = target_word

In [55]:
target, context, label

(1,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([ 2,  1, 10, 11,  2])>,
 <tf.Tensor: shape=(5,), dtype=int64, numpy=array([1, 0, 0, 0, 0])>)

In [56]:
sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(size=10)
print(sampling_table)

[0.00315225 0.00315225 0.00547597 0.00741556 0.00912817 0.01068435
 0.01212381 0.01347162 0.01474487 0.0159558 ]


#### On your dataset

In [44]:
train_file = pd.read_csv('clean.csv')

#### It’s always fun to understand how a model works from scratch. Here we will build word2vec from scratch using tensorflow. First we will convert the dataframe to tensorflow dataset. TextVectorization layer of tensorflow maps text features to integer sequences. We will get vocab and total number of sentences using tensorflow dataset and vectorization. 
#### Also we get the vocabulary and inverse vocabulary which is very important. Next we want to form target and context words using skipgrams. We will only form positive pairs. For negative samples we will use log_uniform_candidate_sampler. This samples words at random from vocabulary. At the end we will concat positive and negative words to form context. 

In [45]:
text_ds = tf.data.Dataset.from_tensor_slices(train_file['comment_text'])
                                             

In [7]:
sentence_length = 1250
vocab_size = 168658

vectorize_layer = tf.keras.layers.TextVectorization(
    max_tokens = vocab_size,
    output_mode = 'int',
    output_sequence_length = sentence_length
)
vectorize_layer.adapt(text_ds.batch(1024))

2023-04-25 13:53:35.896927: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


In [8]:
inverse_vocab = vectorize_layer.get_vocabulary()
inverse_vocab[:20]

['',
 '[UNK]',
 'article',
 'wikipedia',
 'page',
 'talk',
 'please',
 'would',
 'one',
 'like',
 'see',
 'also',
 'think',
 'know',
 'people',
 'edit',
 'articles',
 'use',
 'time',
 'may']

In [9]:
text_vector_ds = text_ds.batch(1024).map(vectorize_layer).unbatch()


In [10]:
sequences = list(text_vector_ds.as_numpy_iterator())
len(sequences)

159513

In [11]:
for seq in sequences[:3]:
    print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

[533  47  49 ...   0   0   0] => ['explanation', 'edits', 'made', 'username', 'hardcore', 'metallica', 'fan', 'reverted', 'vandalisms', 'closure', 'gas', 'voted', 'new', 'york', 'dolls', 'fac', 'please', 'remove', 'template', 'talk', 'page', 'since', 'retired', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''

In [14]:
def generate_training_data(sequences, window_size, num_ns, vocab_size):
    targets, contexts, labels = [], [], []
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)
    
    for sequence in tqdm.tqdm(sequences):
        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size = vocab_size,
            sampling_table = sampling_table,
            window_size = window_size,
            negative_samples = 0
        )
        for target_word, context_word in positive_skip_grams:
            context_class = tf.expand_dims(
                tf.constant([context_word], dtype = 'int64'), 1
            )
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes = context_class,
                num_true = 1,
                num_sampled = num_ns,
                unique = True,
                range_max = vocab_size,
                name = 'negative_sampling'
            )
            context = tf.concat([tf.squeeze(context_class, 1),negative_sampling_candidates], 0)
            label = tf.constant([1] + [0] * num_ns, dtype = 'int64')
            
            targets.append(target_word)
            contexts.append(context)
            labels.append(label)
            
    return targets, contexts, labels
    
    

In [None]:
targets, contexts, labels = generate_training_data(
    sequences = sequences,
    window_size = 2,
    num_ns = 4,
    vocab_size = vocab_size 
)

  7%|██▎                               | 11034/159513 [08:36<2:03:04, 20.11it/s]

In [None]:
targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

targets.shape, contexts.shape, labels.shape

In [None]:
batch_size = 1024
buffer_size = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(buffer_size).batch(batch_size, drop_remainder = True)
dataset

In [None]:
dataset = dataset.cache().prefetch(buffer_size = AUTOTUNE)
dataset

#### Now build the word2vec model. Initialize the word and target embedding matrix in the init(). The dot product takes place in call(). Target embedding: which looks up the embedding of a word when it appears as a target word  Context embedding: which looks up the embedding of a word when it appears as a context word.

In [None]:
class Word2Vec(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.target_embedding = tf.keras.layers.Embedding(vocab_size,
                                                         embedding_dim,
                                                         input_length = 1,
                                                         name = 'w2v_embedding')
        self.context_embedding = tf.keras.layers.Embedding(vocab_size, 
                                                          embedding_dim,
                                                          input_length = num_ns + 1)
        
    def call(self, pair):
        target, context = pair
        if len(target.shape) == 2:
            target = tf.squeeze(target, axis = 1)
        word_emb = self.target_embedding(target)
        context_emb = self.context_embedding(context)
        dots = tf.einsum('be,bce->bc', word_emb, context_emb)
        return dots
        

In [None]:
def custom_loss(x_logit, y_true):
    return tf.nn.sigmoid_cross_entropy_with_logits(logits = x_logit, labels = y_true)


In [None]:
embedding_dim = 100
word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer = 'adam',
                loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                metrics = ['accuracy'])


In [None]:
word2vec.fit(dataset, epochs = 2)

In [None]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [None]:
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')
for index, word in enumerate(vocab):
    if index == 0:
        continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")
out_v.close()
out_m.close()
