In [1]:
import numpy as np
import pandas as pd
import random
import jieba
import os
from utils import DATA_PROCESSED_DIR, DATA_W2V_VECTOR_PATH, DATA_W2V_META_PATH
import tensorflow as tf
from tensorflow.keras import layers
import tqdm
import math

from typing import List

In [2]:
%load_ext tensorboard

In [3]:
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

In [4]:
raw_dataset = pd.read_csv('../data/preset/minibatch.csv')

raw_dataset.head()

Unnamed: 0,movie_id,user_id,rate,time,content
0,26825482,莫挨脑子,1.0,2022-03-25 20:24:25,看到影片开头 华谊兄弟几个字的时候 我就心里有数了……果然…..
1,26825482,壹安²,4.0,2022-03-26 12:35:51,一定要看IMAX一定要看IMAX一定要看IMAX！画面和特效远超预期他M的，最后视觉大高潮我...
2,26825482,ZeonGin Sou,1.0,2022-03-26 23:43:12,大概是主创团队致力于讲一个自嗨至极的阴谋论，导致全片的特效竟然没有服务于剧情和刺激，这对于一...
3,26825482,夏小時,2.0,2022-02-03 17:43:26,儘管降低了心理預期，但是還是沒料到這災難片能拍成史詩級災難。能感覺導演應當有致敬駭客帝國的意...
4,26825482,人民南路壹号,2.0,2022-02-13 12:22:24,加上最后一段的灾难特效算它勉强及格吧，在导演本人的作品里应当排在倒数的，很多逻辑漏洞和一言难...


In [5]:
dataset = raw_dataset.dropna(axis=0).reset_index()

print(f'length: {len(dataset)}')
dataset.head()

length: 19


Unnamed: 0,index,movie_id,user_id,rate,time,content
0,0,26825482,莫挨脑子,1.0,2022-03-25 20:24:25,看到影片开头 华谊兄弟几个字的时候 我就心里有数了……果然…..
1,1,26825482,壹安²,4.0,2022-03-26 12:35:51,一定要看IMAX一定要看IMAX一定要看IMAX！画面和特效远超预期他M的，最后视觉大高潮我...
2,2,26825482,ZeonGin Sou,1.0,2022-03-26 23:43:12,大概是主创团队致力于讲一个自嗨至极的阴谋论，导致全片的特效竟然没有服务于剧情和刺激，这对于一...
3,3,26825482,夏小時,2.0,2022-02-03 17:43:26,儘管降低了心理預期，但是還是沒料到這災難片能拍成史詩級災難。能感覺導演應當有致敬駭客帝國的意...
4,4,26825482,人民南路壹号,2.0,2022-02-13 12:22:24,加上最后一段的灾难特效算它勉强及格吧，在导演本人的作品里应当排在倒数的，很多逻辑漏洞和一言难...


In [6]:
def line2words(line: str) -> List[str]:
    return list(jieba.cut(line))

dataset['words_count'] = dataset['content'].map((lambda x: len(line2words(x))))

EXPECT_SENTENSE_LENGTH = math.ceil(dataset['words_count'].quantile(0.75))
print(EXPECT_SENTENSE_LENGTH)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Dumping model to file cache /tmp/jieba.cache
Dump cache file failed.
Traceback (most recent call last):
  File "/data/user18302289/anaconda3/envs/news/lib/python3.7/site-packages/jieba/__init__.py", line 154, in initialize
    _replace_file(fpath, cache_file)
PermissionError: [Errno 1] Operation not permitted: '/tmp/tmphqwh5vn2' -> '/tmp/jieba.cache'
Loading model cost 0.781 seconds.
Prefix dict has been built successfully.


58


In [7]:
val_size = 0.25

idx = list(range(len(dataset)))

random.shuffle(idx)

dataset_train_idx, dataset_val_idx = idx[int(val_size * len(idx)):], idx[:int(val_size * len(idx))]

dataset_train, dataset_val = dataset.iloc[dataset_train_idx], dataset.iloc[dataset_val_idx]

print(f'length: {len(dataset_train)}')
dataset_train.head()

length: 15


Unnamed: 0,index,movie_id,user_id,rate,time,content,words_count
9,9,26825482,亵渎电影,1.0,2022-03-30 17:04:17,编剧写这种东西，本意是在写一个粗制滥造的网大吧，不仅时时刻刻在侮辱观众的智商，还侮辱了外星智...,47
13,13,26825482,Henrique Asano,3.0,2022-03-25 22:06:30,脑洞大开，硬塞中国元素是很过分了，当然还是适合银幕看看的无脑爽片,19
8,8,26825482,胖球,4.0,2022-03-29 23:14:15,这不还行吗？剧情不算特别无脑，场面确实非常惊艳，就足够了～,19
12,12,26825482,Keith Lee,3.0,2022-03-25 21:17:34,时间过得真快，不敢相信看罗兰艾默里奇上一部《Midway》已经是两年前的事情了。这次罗兰艾默...,113
3,3,26825482,夏小時,2.0,2022-02-03 17:43:26,儘管降低了心理預期，但是還是沒料到這災難片能拍成史詩級災難。能感覺導演應當有致敬駭客帝國的意...,137


In [8]:
class CustomIterator:
    def __init__(self, dataset, operator = lambda x: x):
        self.dataset = dataset
        self.operator = operator
    
    def __iter__(self):
        for line in self.dataset:
            yield self.operator(line)
    
train_contents = CustomIterator(dataset['content'], line2words)

In [9]:
vocab, index = {}, 1
vocab['<pad>'] = 0

for words in CustomIterator(dataset['content'], line2words):
    for word in words:
        if word not in vocab:
            vocab[word] = index
            index += 1

vocab_size = len(vocab)
inverse_vocab = {index: token for token, index in vocab.items()}
print(vocab)

{'<pad>': 0, '看到': 1, '影片': 2, '开头': 3, ' ': 4, '华谊': 5, '兄弟': 6, '几个': 7, '字': 8, '的': 9, '时候': 10, '我': 11, '就': 12, '心里有数': 13, '了': 14, '…': 15, '果然': 16, '..': 17, '一定': 18, '要': 19, '看': 20, 'IMAX': 21, '！': 22, '画面': 23, '和': 24, '特效': 25, '远超': 26, '预期': 27, '他': 28, 'M': 29, '，': 30, '最后': 31, '视觉': 32, '大': 33, '高潮': 34, '整个': 35, '喊': 36, '有': 37, '十遍': 38, '卧槽': 39, '吧': 40, '期待': 41, '更': 42, '多': 43, '科学': 44, '学说': 45, '未': 46, '被': 47, '证实': 48, '理论': 49, '搬': 50, '上银幕': 51, '骂': 52, '人': 53, '奥斯卡': 54, '颁奖': 55, '马上': 56, '在家': 57, '看看': 58, '资源': 59, '不好': 60, '吗': 61, '？': 62, '大概': 63, '是': 64, '主创': 65, '团队': 66, '致力于': 67, '讲': 68, '一个': 69, '自': 70, '嗨': 71, '至极': 72, '阴谋论': 73, '导致': 74, '全片': 75, '竟然': 76, '没有': 77, '服务': 78, '于': 79, '剧情': 80, '刺激': 81, '这': 82, '对于': 83, '一部': 84, '灾难片': 85, '来说': 86, '就是': 87, '一场': 88, '切切实实': 89, '灾难': 90, '。': 91, '相比': 92, '起': 93, '《': 94, '2012': 95, '》': 96, '人家': 97, '可是': 98, '跟': 99, '各种': 100, '形式': 101, '追逐': 102

In [10]:
def words2sequence(words: List[str], vocab: dict) -> List[int]:
    return [vocab[word] for word in words]

example_sequence = words2sequence(line2words(dataset.iloc[0]['content']), vocab)
print(example_sequence)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 4, 11, 12, 13, 14, 15, 15, 16, 15, 17]


In [11]:
window_size = 3
positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
      example_sequence,
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=0)

print(len(positive_skip_grams))

108


In [12]:
for target, context in positive_skip_grams[:5]:
  print(f"({target}, {context}): ({inverse_vocab[target]}, {inverse_vocab[context]})")

(5, 2): (华谊, 影片)
(15, 14): (…, 了)
(1, 4): (看到,  )
(3, 5): (开头, 华谊)
(7, 8): (几个, 字)


In [13]:
# Get target and context words for one positive skip-gram.
target_word, context_word = positive_skip_grams[0]

# Set the number of negative samples per positive context.
num_ns = 4

context_class = tf.reshape(tf.constant(context_word, dtype="int64"), (1, 1))
negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
    true_classes=context_class,  # class that should be sampled as 'positive'
    num_true=1,  # each positive skip-gram has 1 positive context class
    num_sampled=num_ns,  # number of negative context words to sample
    unique=True,  # all the negative samples should be unique
    range_max=vocab_size,  # pick index of the samples from [0, vocab_size]
    seed=SEED,  # seed for reproducibility
    name="negative_sampling"  # name of this operation
)
print(negative_sampling_candidates)
print([inverse_vocab[index.numpy()] for index in negative_sampling_candidates])

tf.Tensor([ 44   8 147  19], shape=(4,), dtype=int64)
['科学', '字', '除了', '要']


2022-04-06 20:01:59.345089: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-06 20:02:01.657506: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5459 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 3080 Ti, pci bus id: 0000:18:00.0, compute capability: 8.6
2022-04-06 20:02:01.658582: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 8550 MB memory:  -> device: 1, name: NVIDIA GeForce RTX 3080 Ti, pci bus id: 0000:3b:00.0, compute capability: 8.6
2022-04-06 20:02:01.659327: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/repli

In [14]:
# Generates skip-gram pairs with negative sampling for a list of sequences
# (int-encoded sentences) based on window size, number of negative samples
# and vocabulary size.
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
  # Elements of each training example are appended to these lists.
  targets, contexts, labels = [], [], []

  # Build the sampling table for `vocab_size` tokens.
  sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

  # Iterate over all sequences (sentences) in the dataset.
  for sequence in tqdm.tqdm(sequences):

    # Generate positive skip-gram pairs for a sequence (sentence).
    positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
          sequence,
          vocabulary_size=vocab_size,
          sampling_table=sampling_table,
          window_size=window_size,
          negative_samples=0)

    # Iterate over each positive skip-gram pair to produce training examples
    # with a positive context word and negative samples.
    for target_word, context_word in positive_skip_grams:
      context_class = tf.expand_dims(
          tf.constant([context_word], dtype="int64"), 1)
      negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
          true_classes=context_class,
          num_true=1,
          num_sampled=num_ns,
          unique=True,
          range_max=vocab_size,
          seed=seed,
          name="negative_sampling")

      # Build context and label vectors (for one target word)
      negative_sampling_candidates = tf.expand_dims(
          negative_sampling_candidates, 1)

      context = tf.concat([context_class, negative_sampling_candidates], 0)
      label = tf.constant([1] + [0]*num_ns, dtype="int64")

      # Append each element from the training example to global lists.
      targets.append(target_word)
      contexts.append(context)
      labels.append(label)

  return targets, contexts, labels

In [15]:
TEXT_DATASET_PATH = os.path.join(DATA_PROCESSED_DIR, 'text_ds.txt')

with open(TEXT_DATASET_PATH, 'w') as f:
    for line in CustomIterator(dataset['content']):
        print(' '.join(line2words(line)), file=f)

text_ds = tf.data.TextLineDataset(TEXT_DATASET_PATH)

In [16]:
# Now, create a custom standardization function to lowercase the text and
# remove punctuation.
# def custom_standardization(input_data):
#   return line2words(input_data)


# Define the vocabulary size and the number of words in a sequence.
vocab_size = 4096
# sequence_length = 10
sequence_length = EXPECT_SENTENSE_LENGTH

# Use the `TextVectorization` layer to normalize, split, and map strings to
# integers. Set the `output_sequence_length` length to pad all samples to the
# same length.
vectorize_layer = layers.TextVectorization(
    # standardize=custom_standardization,
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

In [17]:
vectorize_layer.adapt(text_ds.batch(1024))

In [18]:
# Save the created vocabulary for reference.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[:20])

['', '[UNK]', '，', '的', '了', '。', '在', '是', '就', '和', '！', '看', '特效', '最后', '…', '？', '都', '还', '电影', '灾难片']


In [19]:
# Vectorize the data in text_ds.
text_vector_ds = text_ds.batch(1024).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

In [20]:
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

for seq in sequences[:5]:
  print(f"{seq} => {[inverse_vocab[i] for i in seq]}")

19
[ 60 306 309  22 410 402 327   3 264  35   8 300   4  14  14 243  14   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0] => ['看到', '影片', '开头', '华谊', '兄弟', '几个', '字', '的', '时候', '我', '就', '心里有数', '了', '…', '…', '果然', '…', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']
[ 41  29  11  44  41  29  11  44  41  29  11  44  10 203   9  12 126 103
 428 473   3   2  13 155  40  98  35  34 357   4 253 381 377  21  10 250
 256  74   3 189 326   9 249  57 149   3 209 280 452  10  13  14 100   3
 431  14 332 104] => ['一定', '要', '看', 'imax', '一定', '要', '看', 'imax', '一定', '要', '看', 'imax', '！', '画面', '和', '特效', '远超', '预期', '他', 'm', '的', '，', '最后', '视觉', '大', '高潮', '我', '整个', '喊', '了', '有', '十遍', '卧槽', '吧', '！', '期待', '更', '多', '的', '科学', '学说', '和', '未', '被', '证实', '的',

In [21]:
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=vocab_size,
    seed=SEED)

targets = np.array(targets)
contexts = np.array(contexts)[:,:,0]
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

100%|██████████| 19/19 [00:00<00:00, 234.92it/s]



targets.shape: (184,)
contexts.shape: (184, 5)
labels.shape: (184, 5)





## 训练 word2vec

In [22]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000

dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
# dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)

print(dataset)

<PrefetchDataset element_spec=((TensorSpec(shape=(None,), dtype=tf.int64, name=None), TensorSpec(shape=(None, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(None, 5), dtype=tf.int64, name=None))>


In [23]:
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<PrefetchDataset element_spec=((TensorSpec(shape=(None,), dtype=tf.int64, name=None), TensorSpec(shape=(None, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(None, 5), dtype=tf.int64, name=None))>


In [24]:
class Word2Vec(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim):
    super(Word2Vec, self).__init__()
    self.target_embedding = layers.Embedding(vocab_size,
                                      embedding_dim,
                                      input_length=1,
                                      name="w2v_embedding")
    self.context_embedding = layers.Embedding(vocab_size,
                                       embedding_dim,
                                       input_length=num_ns+1)

  def call(self, pair):
    target, context = pair
    # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
    # context: (batch, context)
    if len(target.shape) == 2:
      target = tf.squeeze(target, axis=1)
    # target: (batch,)
    word_emb = self.target_embedding(target)
    # word_emb: (batch, embed)
    context_emb = self.context_embedding(context)
    # context_emb: (batch, context, embed)
    dots = tf.einsum('be,bce->bc', word_emb, context_emb)
    # dots: (batch, context)
    return dots

In [25]:
embedding_dim = 128

word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'], run_eagerly=True)

In [26]:
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [27]:
dataset

<PrefetchDataset element_spec=((TensorSpec(shape=(None,), dtype=tf.int64, name=None), TensorSpec(shape=(None, 5), dtype=tf.int64, name=None)), TensorSpec(shape=(None, 5), dtype=tf.int64, name=None))>

In [28]:
word2vec.fit(dataset, epochs=200, callbacks=[tensorboard_callback])
# word2vec.fit(dataset, epochs=20)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x7f701c1022d0>

In [29]:
word2vec.summary()

Model: "word2_vec"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 w2v_embedding (Embedding)   multiple                  524288    
                                                                 
 embedding (Embedding)       multiple                  524288    
                                                                 
Total params: 1,048,576
Trainable params: 1,048,576
Non-trainable params: 0
_________________________________________________________________


In [30]:
#docs_infra: no_execute
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 1766728), started 6:19:32 ago. (Use '!kill 1766728' to kill it.)

In [31]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [32]:
vf = open(DATA_W2V_VECTOR_PATH, 'w')
mf = open(DATA_W2V_META_PATH, 'w')


for index, word in enumerate(vocab):
  if index == 0:
    continue  # skip 0, it's padding.
  vec = weights[index]
  vf.write('\t'.join([str(x) for x in vec]) + "\n")
  mf.write(word + "\n")
  
mf.close()
vf.close()