# 文本数据加载

In [2]:
import json
import numpy as np

# Load the preprocessed event-extraction samples.
# The encoding is pinned to UTF-8 because the records contain Chinese text and
# the platform default encoding is not guaranteed to be UTF-8.
with open('./preprocess/dataset.json', 'r', encoding='utf-8') as f:
    dataset = json.load(f)

# Number of samples loaded.
len(dataset)

1665

In [3]:
# Fixed per-sentence length: used as the width of the label array and as the
# `maxlen` passed to pad_sequences.
MAX_SEQUENCE_LENGTH = 85
# Number of event classes; matches the seven entries of event2index.
EVENT_TYPE = 7

# Label: (1665, 85, 7)

In [1]:
from keras.utils import to_categorical

Using TensorFlow backend.


In [4]:
# Tally how many triggers each event type has across the whole dataset.
event_type = {}
for sample in dataset:
    for trig in sample['triggers']:
        name = trig['event']
        event_type[name] = event_type.get(name, 0) + 1

# Alphabetically ordered event names; a name's rank in this list is its class id.
events = sorted(event_type)
event2index = {name: idx for idx, name in enumerate(events)}
event2index

{'action': 0,
 'emergency': 1,
 'movement': 2,
 'operation': 3,
 'perception': 4,
 'stateChange': 5,
 'statement': 6}

In [5]:
# Inspect the raw structure of the first sample.
dataset[0]

{'sentence': '2014年1月7日 广州番禺市桥街兴泰路 商铺 火灾 ， 从化女子 烧死 ！',
 'sentence_words': '2014 年 1 月 7 日 广州 番禺市 桥街 兴泰路 商铺 火灾 ， 从化 女子 烧死 ！',
 'triggers': [{'event': 'emergency',
   'event_arguments': '商铺',
   'event_trigger': '火灾'}]}

## 获取 Event Trigger 与 Event Argument 的位置

In [12]:
# One integer class id per token position; every slot starts at 0.
# NOTE(review): 0 is also event2index['action'], so untouched positions are
# indistinguishable from 'action' triggers -- confirm this is intended.
label = np.zeros((len(dataset), MAX_SEQUENCE_LENGTH))

for row, sample in enumerate(dataset):
    tokens = sample['sentence_words'].split()

    for trig in sample['triggers']:
        # list.index returns the first occurrence only, so a trigger word that
        # appears more than once always resolves to its earliest position.
        trig_pos = tokens.index(trig['event_trigger'])
        event_id = event2index[trig['event']]
        label[row, trig_pos] = event_id

        # Cache the resolved ids on the record itself (mutates dataset in place).
        trig['index_event'] = event_id
        trig['index_event_trigger'] = trig_pos

        # Not every trigger carries an argument; record its position when present.
        if 'event_arguments' in trig:
            trig['index_event_arguments'] = tokens.index(trig['event_arguments'])

In [14]:
# One-hot encode the class ids. num_classes is pinned to EVENT_TYPE so the size
# of the last axis does not depend on the largest class id that happens to
# occur in `label`.
label = to_categorical(label, num_classes=EVENT_TYPE)
label.shape

(1665, 85, 7)

In [20]:
# Re-inspect the first sample: the label-building loop added the index_* fields
# to each trigger.
dataset[0]

{'sentence': '2014年1月7日 广州番禺市桥街兴泰路 商铺 火灾 ， 从化女子 烧死 ！',
 'sentence_words': '2014 年 1 月 7 日 广州 番禺市 桥街 兴泰路 商铺 火灾 ， 从化 女子 烧死 ！',
 'triggers': [{'event': 'emergency',
   'event_arguments': '商铺',
   'event_trigger': '火灾',
   'index_event': 1,
   'index_event_arguments': 10,
   'index_event_trigger': 11}]}

In [22]:
# One-hot row of sample 0 at token position 11, the position stored as
# index_event_trigger for its trigger.
label[0][11]

array([ 0.,  1.,  0.,  0.,  0.,  0.,  0.])

In [10]:
# Persist the one-hot labels; the array shape is embedded in the file name.
label_file = './label_{}.npy'.format(label.shape)
np.save(label_file, label)

In [23]:
# Write the samples (including the index_* fields added to each trigger) back to disk.
# ensure_ascii=False emits the Chinese text verbatim, so the file encoding is
# pinned to UTF-8 instead of relying on the platform default.
# NOTE(review): this overwrites the same file the notebook loads as its input.
with open('./preprocess/dataset.json', 'w', encoding='utf-8') as f:
    json.dump(dataset, f, ensure_ascii=False, sort_keys=True, indent=4)

# Word2Vec

In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [4]:
# Load the pretrained word vectors into a {word: float32 vector} dict.
embeddings_index = {}
# The vector file holds Chinese tokens, so it is decoded explicitly
# (assumes the file is UTF-8 encoded -- TODO confirm).
with open('../../../../../../实验室/word2vec/sgns.weibo.bigram-char', encoding='utf-8') as f:
    # The first line is a header, not a vector; print it for reference.
    print(f.readline().strip())

    # Stream line by line instead of holding the whole file in memory.
    for line in f:
        line = line.strip()
        if not line:
            continue  # tolerate blank lines instead of failing on the unpack below
        # First field is the word, the remainder are its space-separated coefficients.
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, 'f', sep=' ')

print('Found %s word vectors.' % len(embeddings_index))

195197 300
Found 195197 word vectors.


In [6]:
# Vocabulary cap passed to the Tokenizer and used to bound the embedding matrix rows.
# NOTE(review): the tokenizer reports 6476 unique tokens, which exceeds this cap --
# confirm that leaving the rarest words out is intended.
MAX_NUM_WORDS = 6000
# Width of each embedding row; agrees with the "300" printed from the vector file header.
EMBEDDING_DIM = 300

In [7]:
# Gather the whitespace-separated token string of every sample.
texts = [sample['sentence_words'] for sample in dataset]

len(texts)

1665

In [8]:
# Token string of the first sample.
texts[0]

'2014 年 1 月 7 日 广州 番禺市 桥街 兴泰路 商铺 火灾 ， 从化 女子 烧死 ！'

In [9]:
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

we = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

Found 6476 unique tokens.


In [10]:
# Peek at a few (word, id) pairs of the vocabulary mapping.
list(word_index.items())[:5]

[('双语', 3724), ('承包', 5133), ('不远处', 5252), ('党和政府', 2840), ('情报', 3975)]

In [11]:
# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [12]:
# First (word, id) pair of the vocabulary mapping.
list(word_index.items())[0]

('双语', 3724)

In [13]:
# Shapes of the embedding matrix and of the padded id sequences.
embedding_matrix.shape, we.shape

((6000, 300), (1665, 85))

In [14]:
# Padded id sequence of the first sample.
we[0]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,  567,  133,   38,    9,   66,    5,  805, 2996, 2997,
       2998, 1140,   28,    1, 2999,  628, 3000, 1440], dtype=int32)

In [15]:
# Token string of the first sample, for comparison with its id sequence.
dataset[0]['sentence_words']

'2014 年 1 月 7 日 广州 番禺市 桥街 兴泰路 商铺 火灾 ， 从化 女子 烧死 ！'

In [16]:
# Spot-check the ids assigned to a few tokens of the first sentence.
word_index['2014'], word_index['年'], word_index['1'], word_index['月'], word_index['烧死']

(567, 133, 38, 9, 3000)

In [18]:
# Disabled manual check: 3000 is the id of '烧死' in word_index, so this row is
# expected to equal its pretrained vector (assuming the word is present in
# embeddings_index -- TODO confirm).
# embedding_matrix[3000] == embeddings_index['烧死']

In [19]:
# Shape of the padded id sequences.
we.shape

(1665, 85)

In [20]:
# Save the embedding matrix and the padded id sequences; each file name embeds
# the shape of the array it stores.
matrix_file = './we_embedding_matrix_{}.npy'.format(embedding_matrix.shape)
sequence_file = './we_{}.npy'.format(we.shape)
np.save(matrix_file, embedding_matrix)
np.save(sequence_file, we)

# Position: (1665, 85, 85, 2)