In [1]:
import numpy as np
import pandas as pd
import random
import jieba
import os
from utils import DATA_PROCESSED_DIR, DATA_W2V_VECTOR_PATH, DATA_W2V_META_PATH, DATA_TEXT_SEQUENCES_PATH, DATA_LABELS_PATH
import tensorflow as tf
from tensorflow.keras import layers
from utils import CustomIterator
import tqdm
import math
import pickle

from typing import List

In [2]:
%load_ext tensorboard

In [3]:
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

In [4]:
# raw_dataset = pd.read_csv('../data/preset/minibatch.csv')
raw_dataset = pd.read_csv('../data/comment.csv')

raw_dataset.head()

Unnamed: 0,movie_id,user_id,rate,time,content
0,26825482,寒狐传媒,4,2022-02-24 08:15:44,标准美国科幻大片。参考影片《世界末日 Armageddon (1998)》
1,26825482,BURKARD,3,2022-02-06 22:00:29,降智😅 地震把一屋子领导砸死了我属实没想到
2,26825482,Hachiko,3,2022-03-29 17:49:46,顶着叛国罪的风险无比坚信前妻，这种还会离婚？
3,26825482,十一子,3,2022-02-06 07:40:39,月亮果然还是家园，不过这下是真的变成人类共识了。所有的惊奇点、感动点和转折点都在可预计范围内...
4,26825482,VincentP,3,2022-01-27 10:33:21,我的天，剧情真的好阴谋论，但是还算是能自圆其说。埃默里奇真的20年如一日的毁地球，还是他之前...


In [5]:
dataset = raw_dataset.dropna(axis=0).reset_index()

print(f'length: {len(dataset)}')
dataset.head()

length: 21942


Unnamed: 0,index,movie_id,user_id,rate,time,content
0,0,26825482,寒狐传媒,4,2022-02-24 08:15:44,标准美国科幻大片。参考影片《世界末日 Armageddon (1998)》
1,1,26825482,BURKARD,3,2022-02-06 22:00:29,降智😅 地震把一屋子领导砸死了我属实没想到
2,2,26825482,Hachiko,3,2022-03-29 17:49:46,顶着叛国罪的风险无比坚信前妻，这种还会离婚？
3,3,26825482,十一子,3,2022-02-06 07:40:39,月亮果然还是家园，不过这下是真的变成人类共识了。所有的惊奇点、感动点和转折点都在可预计范围内...
4,4,26825482,VincentP,3,2022-01-27 10:33:21,我的天，剧情真的好阴谋论，但是还算是能自圆其说。埃默里奇真的20年如一日的毁地球，还是他之前...


In [6]:
def line2words(line: str) -> List[str]:
    return list(jieba.cut(line))

dataset['words_count'] = dataset['content'].map((lambda x: len(line2words(x))))

EXPECT_SENTENSE_LENGTH = math.ceil(dataset['words_count'].quantile(0.75))
print(EXPECT_SENTENSE_LENGTH)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Dumping model to file cache /tmp/jieba.cache
Dump cache file failed.
Traceback (most recent call last):
  File "/data/user18302289/anaconda3/envs/news/lib/python3.7/site-packages/jieba/__init__.py", line 154, in initialize
    _replace_file(fpath, cache_file)
PermissionError: [Errno 1] Operation not permitted: '/tmp/tmpr3alipwh' -> '/tmp/jieba.cache'
Loading model cost 0.747 seconds.
Prefix dict has been built successfully.


68


In [7]:
with open(DATA_LABELS_PATH, 'wb') as f:
    pickle.dump(dataset['rate'].values, f)

In [8]:
vocab, index = {}, 1
vocab['<pad>'] = 0

for words in CustomIterator(dataset['content'], line2words):
    for word in words:
        if word not in vocab:
            vocab[word] = index
            index += 1

vocab_size = len(vocab)
inverse_vocab = {index: token for token, index in vocab.items()}
print(len(vocab))

66909


In [9]:
def words2sequence(words: List[str], vocab: dict) -> List[int]:
    return [vocab[word] for word in words]

example_sequence = words2sequence(line2words(dataset.iloc[0]['content']), vocab)
print(example_sequence)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 10, 12, 13, 14, 15]


In [10]:
window_size = 3
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence,
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=0)

print(len(positive_skip_grams))

84


In [11]:
for target, context in positive_skip_grams[:5]:
  print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")

(14, 13): (), 1998)
(4, 6): (大片, 参考)
(8, 9): (《, 世界末日)
(7, 10): (影片,  )
(11, 8): (Armageddon, 《)


In [12]:
# Get target and context words for one positive skip-gram.
target_word, context_word = positive_skip_grams[0]

# Set the number of negative samples per positive context.
num_ns = 4

context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  # class that should be sampled as 'positive'
    num_true=1,  # each positive skip-gram has 1 positive context class
    num_sampled=num_ns,  # number of negative context words to sample
    unique=True,  # all the negative samples should be unique
    range_max=vocab_size,  # pick index of the samples from [0, vocab_size]
    seed=SEED,  # seed for reproducibility
    name="negative_sampling"  # name of this operation
)
print(negative_sampling_candidates)
print([inverse_vocab[index.numpy()] for index in negative_sampling_candidates][:5])

tf.Tensor([ 960   55 7892  225], shape=(4,), dtype=int64)
['全然', '、', '念白', '略微']


2022-04-11 19:07:01.735852: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-11 19:07:03.990946: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 9249 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080 Ti, pci bus id: 0000:18:00.0, compute capability: 8.6
2022-04-11 19:07:03.991856: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 1628 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 3080 Ti, pci bus id: 0000:3b:00.0, compute capability: 8.6
2022-04-11 19:07:03.992619: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/repli

In [13]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      negative_sampling_candidates = tf.expand_dims(
          negative_sampling_candidates, 1)

      context = tf.concat([context_class, negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [14]:
TEXT_DATASET_PATH = os.path.join(DATA_PROCESSED_DIR, 'text_ds.tsv')

with open(DATA_TEXT_SEQUENCES_PATH, 'wb') as f:
  pickle.dump([line2words(line) for line in CustomIterator(dataset['content'])], f)

with open(TEXT_DATASET_PATH, 'w') as f:
    for line in CustomIterator(dataset['content']):
        print('\t'.join(line2words(line)), file=f)

text_ds = tf.data.TextLineDataset(TEXT_DATASET_PATH)

In [15]:
# Now, create a custom standardization function to lowercase the text and
# remove punctuation.
# def custom_standardization(input_data):
#   return line2words(input_data)


# Define the vocabulary size and the number of words in a sequence.
vocab_size = 4096
# sequence_length = 10
sequence_length = EXPECT_SENTENSE_LENGTH

# Use the `TextVectorization` layer to normalize, split, and map strings to
# integers. Set the `output_sequence_length` length to pad all samples to the
# same length.
vectorize_layer = layers.TextVectorization(
    # standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

In [16]:
vectorize_layer.adapt(text_ds.batch(1024))

In [17]:
# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

['', '[UNK]', '，', '的', '。', '了', '是', '在', '也', '和', '我', '都', '电影', '、', '不', '有', '就', '很', '人', '！']


In [18]:
# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

In [19]:
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

for seq in sequences[:2]:
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

21943
[1274  256  519 1248    4 3910  123   21    1    1    1   22    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0] => ['标准', '美国', '科幻', '大片', '。', '参考', '影片', '《', '[UNK]', '[UNK]', '[UNK]', '》', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
[3954 1991    1   62  177    1    1    1    5   10 2624  380    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0   

In [20]:
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=SEED)

targets = np.array(targets)
contexts = np.array(contexts)[:,:,0]
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

100%|██████████| 21943/21943 [01:58<00:00, 185.76it/s]




targets.shape: (317080,)
contexts.shape: (317080, 5)
labels.shape: (317080, 5)


## 训练 word2vec

In [21]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000

dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
# dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)

# print(dataset)

In [22]:
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<PrefetchDataset element_spec=((TensorSpec(shape=(None,), dtype=tf.int64, name=None), TensorSpec(shape=(None, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(None, 5), dtype=tf.int64, name=None))>


In [23]:
class Word2Vec(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim):
    super(Word2Vec, self).__init__()
    self.target_embedding = layers.Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=1,
                                      name="w2v_embedding")
    self.context_embedding = layers.Embedding(vocab_size,
                                       embedding_dim,
                                       input_length=num_ns+1)

  def call(self, pair):
    target, context = pair
    # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
    # context: (batch, context)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch, context, embed)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    # dots: (batch, context)
    return dots

In [24]:
embedding_dim = 256

word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'], run_eagerly=True)

In [25]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [26]:
dataset

<PrefetchDataset element_spec=((TensorSpec(shape=(None,), dtype=tf.int64, name=None), TensorSpec(shape=(None, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(None, 5), dtype=tf.int64, name=None))>

In [27]:
word2vec.fit(dataset, epochs=100, callbacks=[tensorboard_callback])
# word2vec.fit(dataset, epochs=20)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x7f81dc18d7d0>

In [28]:
word2vec.summary()

Model: "word2_vec"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 w2v_embedding (Embedding)   multiple                  1048576   
                                                                 
 embedding (Embedding)       multiple                  1048576   
                                                                 
Total params: 2,097,152
Trainable params: 2,097,152
Non-trainable params: 0
_________________________________________________________________


In [29]:
#docs_infra: no_execute
# %tensorboard --logdir logs

In [30]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [31]:
vf = open(DATA_W2V_VECTOR_PATH, 'w')
mf = open(DATA_W2V_META_PATH, 'w')


for index, word in enumerate(vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = weights[index]
  vf.write('\t'.join([str(x) for x in vec]) + "\n")
  mf.write(word + "\n")
  
mf.close()
vf.close()