In [1]:
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
import gensim

train_data = './data/ctb5.1-pos/train.tsv'
test_data = './data/ctb5.1-pos/test.tsv'

## 数据读取

In [2]:
def get_data(file_path):
    data = pd.read_csv(file_path, sep='\t', skip_blank_lines=False, header=None)
    # 取出文本部分
    content = data[0]
    # 取出标签部分
    label = data[1]
    
    return content, label

In [3]:
# 读取数据集
X_train, y_train = get_data(train_data)
X_test, y_test = get_data(test_data)

FileNotFoundError: File b'./data/ctb5.1-pos/train.tsv' does not exist

## 数据预处理与格式转化

In [4]:
# 构建标签字典
labels = y_train.tolist() + y_test.tolist()
labels_types = list(set(labels))
print(labels_types)

labels_dict = {}
labels_index = {"padded_label" : 0}

for index in range(len(labels_types)):
    label = labels_types[index]
    labels_dict.update({label: labels.count(label)})
    labels_index.update({label: index+1})

np.save('y_labels_index.npy', labels_index) 
print(len(labels), len(labels_dict), labels_dict)

[nan, 'LC', 'FW', 'PN', 'VV', 'X', 'IJ', 'P', 'SB', 'OD', 'MSP', 'VP', 'CC', 'PU', 'DEC', 'NR', 'ETC', 'VC', 'CD', 'DT', 'DER', 'AD', 'VE', 'CS', 'BA', 'AS', 'DEG', 'NT', 'SP', 'VA', 'DEV', 'JJ', 'NN', 'LB', 'NP', 'M']
520125 36 {nan: 18426, 'LC': 7782, 'FW': 33, 'PN': 6644, 'VV': 69858, 'X': 6, 'IJ': 12, 'P': 17606, 'SB': 455, 'OD': 1675, 'MSP': 1336, 'VP': 1, 'CC': 7355, 'PU': 76753, 'DEC': 12510, 'NR': 30570, 'ETC': 1303, 'VC': 5404, 'CD': 16182, 'DT': 5986, 'DER': 258, 'AD': 36430, 'VE': 3005, 'CS': 892, 'BA': 755, 'AS': 4118, 'DEG': 12337, 'NT': 9659, 'SP': 468, 'VA': 7755, 'DEV': 634, 'JJ': 13234, 'NN': 136643, 'LB': 245, 'NP': 5, 'M': 13790}


In [5]:
# 构建的Y标签字典
print(labels_index)

{'padded_label': 0, nan: 1, 'LC': 2, 'FW': 3, 'PN': 4, 'VV': 5, 'X': 6, 'IJ': 7, 'P': 8, 'SB': 9, 'OD': 10, 'MSP': 11, 'VP': 12, 'CC': 13, 'PU': 14, 'DEC': 15, 'NR': 16, 'ETC': 17, 'VC': 18, 'CD': 19, 'DT': 20, 'DER': 21, 'AD': 22, 'VE': 23, 'CS': 24, 'BA': 25, 'AS': 26, 'DEG': 27, 'NT': 28, 'SP': 29, 'VA': 30, 'DEV': 31, 'JJ': 32, 'NN': 33, 'LB': 34, 'NP': 35, 'M': 36}


In [6]:
# 按句对X、y进行拆分
def split_corpus_by_sentence(content):
    cleaned_sentence = []
    split_label = content.isnull()
    last_split_index = 0
    index = 0
    while index < len(content):
        current_word = content[index]
        if split_label[index] == True and len(cleaned_sentence) == 0:
            cleaned_sentence.append(np.array(content[last_split_index:index]))
            last_split_index = index + 1
            index += 1
        elif split_label[index] == True  and len(cleaned_sentence) > 0:
            cleaned_sentence.append(np.array(content[last_split_index:index]))
            last_split_index = index + 1
            index += 1
        else:
            index += 1
    return cleaned_sentence

X_train_sent_split = split_corpus_by_sentence(X_train)
y_train_sent_split = split_corpus_by_sentence(y_train)
X_test_sent_split = split_corpus_by_sentence(X_test)
y_test_sent_split = split_corpus_by_sentence(y_test)

print('以句子进行拆分后的句子为：\n', X_train_sent_split[:5])
print('以句子进行拆分后的句子所对应的词性为：\n', y_train_sent_split[:5])

以句子进行拆分后的句子为：
 [array(['上海', '浦东', '开发', '与', '法制', '建设', '同步'], dtype=object), array(['新华社', '上海', '二月', '十日', '电', '（', '记者', '谢金虎', '、', '张持坚', '）'],
      dtype=object), array(['上海', '浦东', '近年', '来', '颁布', '实行', '了', '涉及', '经济', '、', '贸易', '、',
       '建设', '、', '规划', '、', '科技', '、', '文教', '等', '领域', '的', '七十一', '件',
       '法规性', '文件', '，', '确保', '了', '浦东', '开发', '的', '有序', '进行', '。'],
      dtype=object), array(['浦东', '开发', '开放', '是', '一', '项', '振兴', '上海', '，', '建设', '现代化',
       '经济', '、', '贸易', '、', '金融', '中心', '的', '跨世纪', '工程', '，', '因此',
       '大量', '出现', '的', '是', '以前', '不', '曾', '遇到', '过', '的', '新', '情况',
       '、', '新', '问题', '。'], dtype=object), array(['对', '此', '，', '浦东', '不', '是', '简单', '的', '采取', '“', '干', '一', '段',
       '时间', '，', '等', '积累', '了', '经验', '以后', '再', '制定', '法规', '条例', '”',
       '的', '做法', '，', '而', '是', '借鉴', '发达', '国家', '和', '深圳', '等', '特区',
       '的', '经验', '教训', '，', '聘请', '国内外', '有关', '专家', '学者', '，', '积极',
       '、', '及时', '地', '制定', '和', '推

In [7]:
def transfer_label_category_index(origin_labels, labels_types):
    transfered_label = []
    for sentence_labels in origin_labels:
        labels_format_index = [labels_types.index(label) for label in sentence_labels]  # 将标签依据字典转化为序号
        transfered_label.append(labels_format_index)
    return transfered_label

y_train_index = transfer_label_category_index(y_train_sent_split, labels_types)
y_test_index = transfer_label_category_index(y_test_sent_split, labels_types)

print(y_train_index[:5])

[[15, 15, 32, 12, 32, 32, 4], [32, 15, 27, 27, 32, 13, 32, 15, 13, 15, 13], [15, 15, 27, 1, 4, 4, 25, 4, 32, 13, 32, 13, 32, 13, 32, 13, 32, 13, 32, 16, 32, 14, 18, 35, 32, 32, 13, 4, 25, 15, 32, 26, 31, 32, 13], [15, 32, 32, 17, 18, 35, 4, 15, 13, 4, 32, 32, 13, 32, 13, 32, 32, 14, 31, 32, 13, 21, 21, 4, 14, 17, 27, 21, 21, 4, 25, 14, 31, 32, 13, 31, 32, 13], [7, 3, 13, 15, 21, 17, 29, 30, 4, 13, 4, 18, 35, 32, 13, 7, 4, 25, 32, 1, 21, 4, 32, 32, 13, 14, 32, 13, 12, 17, 4, 31, 32, 12, 15, 16, 32, 26, 32, 32, 13, 4, 32, 31, 32, 32, 13, 21, 13, 21, 30, 4, 12, 4, 32, 32, 13, 4, 19, 32, 32, 21, 4, 21, 8, 4, 32, 32, 13]]


In [8]:
MAX_SEQUENCE_LENGTH = 100

# 标签格式转化
# 构建对应（标签样本数，句子长度，标签类别数）形状的张量，值全为0
y_train_index_padded = np.zeros((len(y_train_index), MAX_SEQUENCE_LENGTH, len(labels_types)+1), dtype='float', order='C')
y_test_index_padded = np.zeros((len(y_test_index), MAX_SEQUENCE_LENGTH, len(labels_types)+1), dtype='float', order='C')

# 填充张量
for sentence_labels_index in range(len(y_train_index)):
    for label_index in range(len(y_train_index[sentence_labels_index])):
        if label_index < MAX_SEQUENCE_LENGTH:
            y_train_index_padded[sentence_labels_index, label_index, y_train_index[sentence_labels_index][label_index]+1] = 1
    
    if len(y_train_index[sentence_labels_index]) < MAX_SEQUENCE_LENGTH:
        for label_index in range(len(y_train_index[sentence_labels_index]), MAX_SEQUENCE_LENGTH):
            y_train_index_padded[sentence_labels_index, label_index, 0] = 1

# 优化：若为填充的标签，则将其预测为第一位为1

for sentence_labels_index in range(len(y_test_index)):
    for label_index in range(len(y_test_index[sentence_labels_index])):
        if label_index < MAX_SEQUENCE_LENGTH:
            y_test_index_padded[sentence_labels_index, label_index, y_test_index[sentence_labels_index][label_index]+1] = 1
    
    if len(y_test_index[sentence_labels_index]) < MAX_SEQUENCE_LENGTH:
        for label_index in range(len(y_test_index[sentence_labels_index]), MAX_SEQUENCE_LENGTH):
            y_test_index_padded[sentence_labels_index, label_index, 0] = 1

print(y_train_index_padded[:1])

[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]]


## word2vec模型导入

预训练的word2vec模型采用前人用中文维基百科训练好的模型，请各位同学进入链接下载，并放到相对本脚本 同一级文件夹data 的目录下 并解压。

[word2vec模型链接](https://github.com/Embedding/Chinese-Word-Vectors)

In [9]:
## 1 导入 预训练的词向量
myPath = './data/sgns.wiki.word' # 本地词向量的地址
Word2VecModel = gensim.models.KeyedVectors.load_word2vec_format(myPath).wv # 读取词向量，以二进制读取

vector = Word2VecModel.wv['空间']  # 词语的向量，是numpy格式

  This is separate from the ipykernel package so we can avoid doing imports until
  """


In [10]:
print(type(Word2VecModel.wv)) # 结果为：Word2VecKeyedVectors

for i,j in Word2VecModel.wv.vocab.items():
    print(i) # 此时 i 代表每个单词
    print(j) # j 代表封装了 词频 等信息的 gensim“Vocab”对象，例子：Vocab(count:1481, index:38, sample_int:3701260191)
    break

<class 'gensim.models.keyedvectors.Word2VecKeyedVectors'>
，
Vocab(count:352217, index:0)


  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
## 2 构造包含所有词语的 list，以及初始化 “词语-序号”字典 和 “词向量”矩阵
vocab_list = [word for word, Vocab in Word2VecModel.wv.vocab.items()]# 存储 所有的 词语

word_index = {" ": 0}# 初始化 `[word : token]` ，后期 tokenize 语料库就是用该词典。
word_vector = {} # 初始化`[word : vector]`字典

# 初始化存储所有向量的大矩阵，留意其中多一位（首行），词向量全为 0，用于 padding补零。
# 行数 为 所有单词数+1 比如 10000+1 ； 列数为 词向量“维度”比如100。
embeddings_matrix = np.zeros((len(vocab_list) + 1, Word2VecModel.vector_size))

  


In [12]:
## 3 填充 上述 的字典 和 大矩阵
for i in range(len(vocab_list)):
    # print(i)
    word = vocab_list[i]  # 每个词语
    word_index[word] = i + 1 # 词语：序号
    word_vector[word] = Word2VecModel.wv[word] # 词语：词向量
    embeddings_matrix[i + 1] = Word2VecModel.wv[word]  # 词向量矩阵

np.save('x_word_index.npy', word_index)

  
  import sys


In [13]:
from keras.preprocessing import sequence
# 序号化 文本，tokenizer句子，并返回每个句子所对应的词语索引

# 由于将词语转化为索引的word_index需要与词向量模型对齐，故在导入词向量模型后再将X进行处理
def tokenizer(texts, word_index):
    data = []
    for sentence in texts:
        new_sentence = []
        for word in sentence:
            try:
                new_sentence.append(word_index[word])  # 把文本中的 词语转化为index
            except:
                new_sentence.append(0)
            
        data.append(new_sentence)
    # 使用kears的内置函数padding对齐句子,好处是输出numpy数组，不用自己转化了
    data = sequence.pad_sequences(data, maxlen = MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
    
    return data

X_train_tokenized = tokenizer(X_train_sent_split, word_index)
X_test_tokenized = tokenizer(X_test_sent_split, word_index)

print(X_train_tokenized[:1])

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


[[  347 16980   507    10 15537   603  4380     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]]


## 标引网络构建及训练评估

In [14]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
import keras
from keras import optimizers

EMBEDDING_DIM = 300 #词向量维度

model = Sequential()
model.add(Embedding(input_dim = len(embeddings_matrix), # 字典长度
                    output_dim = EMBEDDING_DIM, # 词向量 长度（300）
                    weights=[embeddings_matrix], # 重点：预训练的词向量系数
                    input_length=MAX_SEQUENCE_LENGTH, # 每句话的 最大长度（必须padding） 
                    trainable=False # 是否在 训练的过程中 更新词向量
                   ))
# input shape (Batch_size, Time_step, Input_Sizes)
model.add(LSTM(128, input_shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM), activation='tanh', return_sequences=True))
model.add(Dropout(0.5))
model.add(Dense(64, input_shape=(MAX_SEQUENCE_LENGTH, 128), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(labels_types)+1, input_shape=(MAX_SEQUENCE_LENGTH, 64), activation='softmax'))

adam = optimizers.Adam(lr=0.01, decay=1e-6)
model.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['accuracy'])

Instructions for updating:
Colocations handled automatically by placer.


In [15]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          105665400 
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 128)          219648    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 128)          0         
_________________________________________________________________
dense_1 (Dense)              (None, 100, 64)           8256      
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 64)           0         
_________________________________________________________________
dense_2 (Dense)              (None, 100, 37)           2405      
Total params: 105,895,709
Trainable params: 230,309
Non-trainable params: 105,665,400
__________________________________

In [16]:
print(X_train_tokenized.shape, y_train_index_padded.shape)

(18078, 100) (18078, 100, 37)


In [17]:
model.fit(X_train_tokenized, y_train_index_padded,
          epochs=20,
          batch_size=128,
          verbose=1
         )
print('evaluation!')
score = model.evaluate(X_test_tokenized, y_test_index_padded, batch_size=128)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
evaluation!


In [18]:
model.save('cn_pos_tag.h5')

In [19]:
print('训练的模型经在测试集上验证获得的loss和accuracy为：')
print(score)

训练的模型经在测试集上验证获得的loss和accuracy为：
[0.054519426377340294, 0.982557475566864]


In [20]:
print(model.predict(X_test_tokenized[:1]))

[[[1.7863530e-06 1.3260282e-08 5.1007342e-01 ... 3.3138915e-07
   5.6417657e-09 7.0415095e-05]
  [1.5724202e-08 4.0942895e-13 2.9143064e-06 ... 1.8065069e-11
   7.4609210e-14 3.3791670e-07]
  [8.4986749e-21 3.6002287e-20 2.4648545e-13 ... 4.1570019e-09
   1.7601839e-18 1.6017105e-09]
  ...
  [1.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
   0.0000000e+00 0.0000000e+00]
  [1.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
   0.0000000e+00 0.0000000e+00]
  [1.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
   0.0000000e+00 0.0000000e+00]]]


In [21]:
print(y_test_index_padded[:1])

[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]]


In [22]:
print(model.predict(X_test_tokenized[2:3]))

[[[5.5427216e-16 5.9282714e-28 1.4246143e-22 ... 6.5725498e-34
   5.4068325e-31 1.9028390e-18]
  [1.3030440e-05 6.9722041e-08 3.1839829e-02 ... 6.0993862e-08
   1.3876459e-08 1.2540212e-04]
  [5.0733701e-10 1.3419541e-15 6.1689619e-08 ... 1.1741996e-13
   2.0201592e-16 2.7416769e-08]
  ...
  [1.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
   0.0000000e+00 0.0000000e+00]
  [1.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
   0.0000000e+00 0.0000000e+00]
  [1.0000000e+00 0.0000000e+00 0.0000000e+00 ... 0.0000000e+00
   0.0000000e+00 0.0000000e+00]]]


In [23]:
print(y_test_index_padded[2:3])

[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]]


In [24]:
from keras.layers import GRU, SimpleRNN

model_RNN = Sequential()
model_RNN.add(Embedding(input_dim = len(embeddings_matrix), # 字典长度
                    output_dim = EMBEDDING_DIM, # 词向量 长度（300）
                    weights=[embeddings_matrix], # 重点：预训练的词向量系数
                    input_length=MAX_SEQUENCE_LENGTH, # 每句话的 最大长度（必须padding） 
                    trainable=False # 是否在 训练的过程中 更新词向量
                   ))
# input shape (Batch_size, Time_step, Input_Sizes)
model_RNN.add(SimpleRNN(128, input_shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM), activation='tanh', return_sequences=True))
model_RNN.add(Dropout(0.5))
model_RNN.add(Dense(64, input_shape=(MAX_SEQUENCE_LENGTH, 128), activation='relu'))
model_RNN.add(Dropout(0.5))
model_RNN.add(Dense(len(labels_types)+1, input_shape=(MAX_SEQUENCE_LENGTH, 64), activation='softmax'))

adam = optimizers.Adam(lr=0.01, decay=1e-6)
model_RNN.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['accuracy'])

model_RNN.fit(X_train_tokenized, y_train_index_padded,
          epochs=2,
          batch_size=128,
          verbose=1
         )

Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x7fcd7af3c4d0>

In [25]:
model_GRU = Sequential()
model_GRU.add(Embedding(input_dim = len(embeddings_matrix), # 字典长度
                    output_dim = EMBEDDING_DIM, # 词向量 长度（300）
                    weights=[embeddings_matrix], # 重点：预训练的词向量系数
                    input_length=MAX_SEQUENCE_LENGTH, # 每句话的 最大长度（必须padding） 
                    trainable=False # 是否在 训练的过程中 更新词向量
                   ))
# input shape (Batch_size, Time_step, Input_Sizes)
model_GRU.add(GRU(128, input_shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM), activation='tanh', return_sequences=True))
model_GRU.add(Dropout(0.5))
model_GRU.add(Dense(64, input_shape=(MAX_SEQUENCE_LENGTH, 128), activation='relu'))
model_GRU.add(Dropout(0.5))
model_GRU.add(Dense(len(labels_types)+1, input_shape=(MAX_SEQUENCE_LENGTH, 64), activation='softmax'))

adam = optimizers.Adam(lr=0.01, decay=1e-6)
model_GRU.compile(loss='categorical_crossentropy',
              optimizer=adam,
              metrics=['accuracy'])

model_GRU.fit(X_train_tokenized, y_train_index_padded,
          epochs=2,
          batch_size=128,
          verbose=1
         )

Epoch 1/2
Epoch 2/2


<keras.callbacks.callbacks.History at 0x7fcc12fa6690>