# 数据加载

In [1]:
# Number of word positions kept per text; reused further down as
# MAX_SEQUENCE_LENGTH, the `maxlen` given to pad_sequences.
CONTENT_WORDS = 120

import json
import os

import numpy as np

In [2]:
def _read_json(path):
    """Parse the UTF-8 encoded JSON file at `path` and return the decoded object."""
    with open(path, 'r', encoding='utf-8') as handle:
        return json.load(handle)


# Load both dataset splits; the trailing expression displays their sizes.
train = _read_json('../dataset/train.json')
test = _read_json('../dataset/test.json')

len(train), len(test)

(32193, 1613)

In [4]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [5]:
# Build a {token: vector} lookup from the pretrained word-vector text file.
# Each data line is "<token> <v1> <v2> ..."; the values are parsed as float32
# (dtype code 'f').
embeddings_index = {}
# The encoding is given explicitly (as for the JSON files above) so that
# decoding the Chinese tokens does not depend on the platform's default locale.
with open('../../../word2vec/sgns.weibo.bigram-char', encoding='utf-8') as f:
    # The first line is a header; it is echoed for reference and not parsed.
    print(f.readline().strip())

    # Stream the remaining lines instead of materialising the whole file.
    for line in f:
        line = line.strip()
        if not line:
            # A blank line carries no token/vector pair; skipping it avoids a
            # ValueError from the two-way unpacking below.
            continue
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

195197 300
Found 195197 word vectors.


# Text

In [6]:
# Length of every padded index sequence (passed as `maxlen` to pad_sequences).
MAX_SEQUENCE_LENGTH = CONTENT_WORDS
# Vocabulary cap: given to the Tokenizer as `num_words` and used to bound the
# number of rows in the embedding matrix.
MAX_NUM_WORDS = 6000
# Width of each embedding row; matches the 300 in the header line printed
# when the word-vector file is loaded.
EMBEDDING_DIM = 300

In [7]:
# Gather the `content_words` string of every sample: all training samples
# first, then all test samples.
texts = [sample['content_words'] for split in (train, test) for sample in split]

len(texts)

33806

In [8]:
# Fit the vocabulary over every collected text (train and test together).
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Encode each text as a list of word indices, then pad the lists into one
# 2D integer array with MAX_SEQUENCE_LENGTH columns.
sequences = tokenizer.texts_to_sequences(texts)
we = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

Found 76901 unique tokens.


In [9]:
# Peek at the first five (token, index) pairs of the fitted vocabulary.
list(word_index.items())[:5]

[('，', 1), ('的', 2), ('。', 3), ('了', 4), ('、', 5)]

In [10]:
# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [11]:
# First (token, index) entry of the vocabulary.
list(word_index.items())[0]

('，', 1)

In [13]:
# Shapes of the embedding matrix and of the padded sequence array.
embedding_matrix.shape, we.shape

((6000, 300), (33806, 120))

In [14]:
# Padded index sequence of the first text (the first training sample, since
# `texts` lists the training samples first).
we[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0, 2115,  420,  148,   75,  649,   15, 2392,
       2634,  228,  110,   14,    2, 1129,   10, 1168, 1943,   10],
      dtype=int32)

In [15]:
# The `content_words` string of the first training sample, for comparison
# with the encoded row we[0].
train[0]['content_words']

'回复 新浪 网友 对 博文 【 国家文物局 限制 鉴宝 节目 现场 估价 转 】 的 评论 ： ; ; 查看 原文 ：'

In [17]:
# Vocabulary indices of four individual tokens, to cross-check against the
# non-zero entries of we[0].
word_index['回复'], word_index['新浪'], word_index['网友'], word_index['对']

(2115, 420, 148, 75)

In [19]:
# embedding_matrix[2115] == embeddings_index['回复']

In [21]:
# Sample counts: train, test, and their sum (the number of texts encoded).
len(train), len(test), len(train) + len(test)

(32193, 1613, 33806)

In [22]:
# Shape of the padded sequence array covering train and test together.
we.shape

(33806, 120)

In [23]:
# Split the padded sequences back into the two datasets. `texts` was built
# with the training samples first, so the first len(train) rows of `we` are
# the training split and the remaining rows are the test split.
# The earlier np.zeros pre-allocations were removed: both names were rebound
# to these slices right away, so the zero arrays were never used.
train_we = we[:len(train)]
test_we = we[len(train):]

# Guard against `we` having been built from a different train/test pair
# (e.g. stale kernel state after re-running cells out of order).
assert len(test_we) == len(test), 'rows of we do not match len(train) + len(test)'

In [24]:
# Shapes of the train and test index arrays after the split.
train_we.shape, test_we.shape

((32193, 120), (1613, 120))

In [30]:
# Sanity check: the first test row is the row of `we` that immediately
# follows the training rows.
(test_we[0] == we[len(train)]).all()

True

In [31]:
# Persist the embedding matrix and both index arrays as .npy files. Each file
# name embeds the array's shape tuple as rendered by str.format.
# np.save does not create missing directories, so make sure ./data exists
# first; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs('./data', exist_ok=True)
np.save('./data/we_embedding_matrix_{}.npy'.format(embedding_matrix.shape), embedding_matrix)
np.save('./data/train_we_{}.npy'.format(train_we.shape), train_we)
np.save('./data/test_we_{}.npy'.format(test_we.shape), test_we)