In [1]:
from gensim.models import Word2Vec
import pandas as pd
import numpy as np
import gensim

train_data = './data/ctb5.1-pos/train.tsv'
test_data = './data/ctb5.1-pos/test.tsv'

## 数据读取

In [2]:
def get_data(file_path):
    data = pd.read_csv(file_path, sep='\t', skip_blank_lines=False, header=None)
    # 取出文本部分
    content = data[0]
    # 取出标签部分
    label = data[1]
    
    return content, label

In [3]:
# 读取数据集
X_train, y_train = get_data(train_data)
X_test, y_test = get_data(test_data)

## 数据预处理与格式转化

In [4]:
# 构建标签字典
labels = y_train.tolist() + y_test.tolist()
labels_types = list(set(labels))

labels_dict = {}

for label in labels_types:
    labels_dict.update({label: labels.count(label)})
print(len(labels), len(labels_dict), labels_dict)

520125 36 {nan: 18426, 'DER': 258, 'NP': 5, 'JJ': 13234, 'CS': 892, 'IJ': 12, 'FW': 33, 'X': 6, 'SP': 468, 'AD': 36430, 'VA': 7755, 'DEG': 12337, 'VV': 69858, 'VC': 5404, 'CC': 7355, 'PU': 76753, 'NN': 136643, 'BA': 755, 'VP': 1, 'OD': 1675, 'P': 17606, 'DEC': 12510, 'MSP': 1336, 'LC': 7782, 'DEV': 634, 'M': 13790, 'NT': 9659, 'CD': 16182, 'DT': 5986, 'VE': 3005, 'ETC': 1303, 'SB': 455, 'PN': 6644, 'LB': 245, 'AS': 4118, 'NR': 30570}


In [5]:
# 按句对X、y进行拆分
def split_corpus_by_sentence(content):
    cleaned_sentence = []
    split_label = content.isnull()
    last_split_index = 0
    index = 0
    while index < len(content):
        current_word = content[index]
        if split_label[index] == True and len(cleaned_sentence) == 0:
            cleaned_sentence.append(np.array(content[last_split_index:index]))
            last_split_index = index + 1
            index += 1
        elif split_label[index] == True  and len(cleaned_sentence) > 0:
            cleaned_sentence.append(np.array(content[last_split_index:index]))
            last_split_index = index + 1
            index += 1
        else:
            index += 1
    return cleaned_sentence

X_train_sent_split = split_corpus_by_sentence(X_train)
y_train_sent_split = split_corpus_by_sentence(y_train)
X_test_sent_split = split_corpus_by_sentence(X_test)
y_test_sent_split = split_corpus_by_sentence(y_test)

print('以句子进行拆分后的句子为：\n', X_train_sent_split[:5])
print('以句子进行拆分后的句子所对应的词性为：\n', y_train_sent_split[:5])

以句子进行拆分后的句子为：
 [array(['上海', '浦东', '开发', '与', '法制', '建设', '同步'], dtype=object), array(['新华社', '上海', '二月', '十日', '电', '（', '记者', '谢金虎', '、', '张持坚', '）'],
      dtype=object), array(['上海', '浦东', '近年', '来', '颁布', '实行', '了', '涉及', '经济', '、', '贸易', '、',
       '建设', '、', '规划', '、', '科技', '、', '文教', '等', '领域', '的', '七十一', '件',
       '法规性', '文件', '，', '确保', '了', '浦东', '开发', '的', '有序', '进行', '。'],
      dtype=object), array(['浦东', '开发', '开放', '是', '一', '项', '振兴', '上海', '，', '建设', '现代化',
       '经济', '、', '贸易', '、', '金融', '中心', '的', '跨世纪', '工程', '，', '因此',
       '大量', '出现', '的', '是', '以前', '不', '曾', '遇到', '过', '的', '新', '情况',
       '、', '新', '问题', '。'], dtype=object), array(['对', '此', '，', '浦东', '不', '是', '简单', '的', '采取', '“', '干', '一', '段',
       '时间', '，', '等', '积累', '了', '经验', '以后', '再', '制定', '法规', '条例', '”',
       '的', '做法', '，', '而', '是', '借鉴', '发达', '国家', '和', '深圳', '等', '特区',
       '的', '经验', '教训', '，', '聘请', '国内外', '有关', '专家', '学者', '，', '积极',
       '、', '及时', '地', '制定', '和', '推

In [6]:
# 将标签文本转换为索引（根据之前构建的标签字典），再转换为one-hot向量
def transfer_label_category_index(origin_labels, labels_types):
    transfered_label = []
    for sentence_labels in origin_labels:
        labels_format_index = [labels_types.index(label) for label in sentence_labels]  # 将标签依据字典转化为序号
        transfered_label.append(labels_format_index)
    return transfered_label

y_train_index = transfer_label_category_index(y_train_sent_split, labels_types)
y_test_index = transfer_label_category_index(y_test_sent_split, labels_types)

print(y_train_index[:5])

[[35, 35, 16, 14, 16, 16, 12], [16, 35, 26, 26, 16, 15, 16, 35, 15, 35, 15], [35, 35, 26, 23, 12, 12, 34, 12, 16, 15, 16, 15, 16, 15, 16, 15, 16, 15, 16, 30, 16, 21, 27, 25, 16, 16, 15, 12, 34, 35, 16, 11, 3, 16, 15], [35, 16, 16, 13, 27, 25, 12, 35, 15, 12, 16, 16, 15, 16, 15, 16, 16, 21, 3, 16, 15, 9, 9, 12, 21, 13, 26, 9, 9, 12, 34, 21, 3, 16, 15, 3, 16, 15], [20, 32, 15, 35, 9, 13, 10, 24, 12, 15, 12, 27, 25, 16, 15, 20, 12, 34, 16, 23, 9, 12, 16, 16, 15, 21, 16, 15, 14, 13, 12, 3, 16, 14, 35, 30, 16, 11, 16, 16, 15, 12, 16, 3, 16, 16, 15, 9, 15, 9, 24, 12, 14, 12, 16, 16, 15, 12, 28, 16, 16, 9, 12, 9, 31, 12, 16, 16, 15]]


In [7]:
# 设置句子最大长度
MAX_SEQUENCE_LENGTH = 100 

# 标签格式转化
# 构建对应（标签样本数，句子长度，标签类别数）形状的张量，值全为0
y_train_index_padded = np.zeros((len(y_train_index), MAX_SEQUENCE_LENGTH, len(labels_types)+1), dtype='float', order='C')
y_test_index_padded = np.zeros((len(y_test_index), MAX_SEQUENCE_LENGTH, len(labels_types)+1), dtype='float', order='C')

# 填充张量
for sentence_labels_index in range(len(y_train_index)):
    for label_index in range(len(y_train_index[sentence_labels_index])):
        if label_index < MAX_SEQUENCE_LENGTH:
            y_train_index_padded[sentence_labels_index, label_index, y_train_index[sentence_labels_index][label_index]+1] = 1

for sentence_labels_index in range(len(y_test_index)):
    for label_index in range(len(y_test_index[sentence_labels_index])):
        if label_index < MAX_SEQUENCE_LENGTH:
            y_test_index_padded[sentence_labels_index, label_index, y_test_index[sentence_labels_index][label_index]+1] = 1

print(y_train_index_padded[:1])

[[[0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]


## word2vec模型导入

预训练的word2vec模型采用前人用中文维基百科训练好的模型，请各位同学进入链接下载，并放到相对本脚本 同一级文件夹data 的目录下 并解压。

[word2vec模型链接](https://github.com/Embedding/Chinese-Word-Vectors)

In [8]:
## 1 导入 预训练的词向量
myPath = './data/sgns.wiki.word' # 本地词向量的地址
Word2VecModel = gensim.models.KeyedVectors.load_word2vec_format(myPath).wv # 读取词向量，以二进制读取

vector = Word2VecModel.wv['空间']  # 词语的向量，是numpy格式

  This is separate from the ipykernel package so we can avoid doing imports until
  """


In [9]:
print(type(Word2VecModel.wv)) # 结果为：Word2VecKeyedVectors

for i,j in Word2VecModel.wv.vocab.items():
    print(i) # 此时 i 代表每个单词
    print(j) # j 代表封装了 词频 等信息的 gensim“Vocab”对象，例子：Vocab(count:1481, index:38, sample_int:3701260191)
    break

<class 'gensim.models.keyedvectors.Word2VecKeyedVectors'>
，
Vocab(count:352217, index:0)


  """Entry point for launching an IPython kernel.
  This is separate from the ipykernel package so we can avoid doing imports until


In [10]:
## 2 构造包含所有词语的 list，以及初始化 “词语-序号”字典 和 “词向量”矩阵
vocab_list = [word for word, Vocab in Word2VecModel.wv.vocab.items()]# 存储 所有的 词语

word_index = {" ": 0}# 初始化 `[word : token]` ，后期 tokenize 语料库就是用该词典。
word_vector = {} # 初始化`[word : vector]`字典

# 初始化存储所有向量的大矩阵，留意其中多一位（首行），词向量全为 0，用于 padding补零。
# 行数 为 所有单词数+1 比如 10000+1 ； 列数为 词向量“维度”比如100。
embeddings_matrix = np.zeros((len(vocab_list) + 1, Word2VecModel.vector_size))

  


In [11]:
## 3 填充 上述 的字典word_index 和 大矩阵
for i in range(len(vocab_list)):
    # print(i)
    word = vocab_list[i]  # 每个词语
    word_index[word] = i + 1 # 词语：序号
    word_vector[word] = Word2VecModel.wv[word] # 词语：词向量
    embeddings_matrix[i + 1] = Word2VecModel.wv[word]  # 词向量矩阵

  
  import sys


In [12]:
from keras.preprocessing import sequence
# 序号化 文本，tokenizer句子，并返回每个句子所对应的词语索引

# 由于将词语转化为索引的word_index需要与词向量模型对齐，故在导入词向量模型后再将X进行处理
def tokenizer(texts, word_index):
    data = []
    for sentence in texts:
        new_sentence = []
        for word in sentence:
            try:
                new_sentence.append(word_index[word])  # 把文本中的 词语转化为index
            except:
                new_sentence.append(0)
            
        data.append(new_sentence)
    # 使用kears的内置函数padding对齐句子,好处是输出numpy数组，不用自己转化了
    data = sequence.pad_sequences(data, maxlen = MAX_SEQUENCE_LENGTH, padding='post', truncating='post')
    
    return data

X_train_tokenized = tokenizer(X_train_sent_split, word_index)
X_test_tokenized = tokenizer(X_test_sent_split, word_index)

print(X_train_tokenized[:1])

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


[[  347 16980   507    10 15537   603  4380     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0]]


## 标引网络构建及训练评估

In [13]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout
import keras

EMBEDDING_DIM = 300 # 词向量维度

model = Sequential()
model.add(Embedding(input_dim = len(embeddings_matrix), # 字典长度
                    output_dim = EMBEDDING_DIM, # 词向量 长度（300）
                    weights=[embeddings_matrix], # 重点：预训练的词向量系数
                    input_length=MAX_SEQUENCE_LENGTH, # 每句话的 最大长度（必须padding） 
                    trainable=False # 是否在 训练的过程中 更新词向量
                   ))
# input shape (Batch_size, Time_step, Input_Sizes)
model.add(LSTM(128, input_shape=(MAX_SEQUENCE_LENGTH, EMBEDDING_DIM), activation='tanh', return_sequences=True)) # 增加bidirectional
model.add(Dropout(0.5))
model.add(Dense(64, input_shape=(MAX_SEQUENCE_LENGTH, 128), activation='relu')) # 128维
model.add(Dropout(0.5))
model.add(Dense(len(labels_types)+1, input_shape=(MAX_SEQUENCE_LENGTH, 64), activation='softmax')) # 64维

model.compile(loss='categorical_crossentropy',  # 损失函数
              optimizer='adam',  # 优化器——更改参数
              metrics=['accuracy']  # 评价指标
             )

Instructions for updating:
Colocations handled automatically by placer.


In [14]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 300)          105665400 
_________________________________________________________________
lstm_1 (LSTM)                (None, 100, 128)          219648    
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 128)          0         
_________________________________________________________________
dense_1 (Dense)              (None, 100, 64)           8256      
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 64)           0         
_________________________________________________________________
dense_2 (Dense)              (None, 100, 37)           2405      
_________________________________________________________________
dropout_3 (Dropout)          (None, 100, 37)          

In [15]:
print(X_train_tokenized.shape, y_train_index_padded.shape) # 18078个样本,每个样本100个词

(18078, 100) (18078, 100, 37)


In [16]:
model.fit(X_train_tokenized, y_train_index_padded,
          epochs=20, # 20轮
          batch_size=128,
          verbose=1  # 展示训练过程
         )
print('evaluation!')
score = model.evaluate(X_test_tokenized, y_test_index_padded, batch_size=128)

Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
evaluation!


In [17]:
print('训练的模型经在测试集上验证获得的loss和accuracy为：')
print(score)

训练的模型经在测试集上验证获得的loss和accuracy为：
[0.05799064429840822, 0.20675286650657654]


In [19]:
print(model.predict(X_test_tokenized[:1]))

[[[5.2290096e-07 2.6394284e-07 1.4585145e-06 ... 5.7066409e-06
   9.5998666e-06 6.9099808e-01]
  [1.9605020e-10 2.8755645e-11 4.7406906e-10 ... 1.4757541e-11
   3.4634517e-10 9.8503667e-01]
  [5.1990853e-11 3.0280665e-12 1.4870039e-09 ... 8.3266883e-08
   7.4978939e-09 5.9678172e-07]
  ...
  [5.2306046e-07 2.9986347e-07 3.8933899e-06 ... 5.3384105e-07
   5.6671212e-07 1.9845031e-02]
  [5.2306046e-07 2.9986347e-07 3.8933899e-06 ... 5.3384105e-07
   5.6671212e-07 1.9845031e-02]
  [5.2306046e-07 2.9986347e-07 3.8933940e-06 ... 5.3384105e-07
   5.6671212e-07 1.9845037e-02]]]


In [21]:
print(y_test_index_padded[:1])

[[[0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]


In [20]:
print(model.predict(X_test_tokenized[2:3]))

[[[3.45361896e-14 4.47106892e-15 4.74992875e-13 ... 1.62147195e-14
   8.04468186e-14 1.36462643e-06]
  [1.76132282e-08 1.17716965e-08 4.95060846e-08 ... 4.80711542e-08
   1.13095993e-07 9.02875662e-01]
  [2.06781068e-11 2.86890932e-12 5.92366711e-11 ... 1.52504159e-12
   4.57937646e-11 9.90172505e-01]
  ...
  [5.23060464e-07 2.99863473e-07 3.89339402e-06 ... 5.33841501e-07
   5.66712117e-07 1.98450424e-02]
  [5.23060464e-07 2.99863473e-07 3.89338993e-06 ... 5.33841046e-07
   5.66712117e-07 1.98450424e-02]
  [5.23059953e-07 2.99862904e-07 3.89338993e-06 ... 5.33841046e-07
   5.66712117e-07 1.98450424e-02]]]


In [22]:
print(y_test_index_padded[2:3])

[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]]
