# 训练char2vec

In [1]:
# 输出每个cell的运行时间
%load_ext autotime
# https://github.com/cpcloud/ipython-autotime

In [3]:
import warnings
warnings.filterwarnings("ignore")

time: 998 µs


In [4]:
root_path = "data/round1/train/"
w2v_input_path = "model_file/char2vec_prepareData.txt"
w2v_output_path = "model_file/char2vec.model"

from Model import Char2VecTrainer
char2vec = Char2VecTrainer(root=root_path,w2v_file_path=w2v_output_path)

Using TensorFlow backend.


time: 4.58 s


In [5]:
char2vec.prepare_data()
char2vec.train(w2v_output_path,emb_size=256)

time: 7.52 s


In [6]:
char2vec_model = char2vec.load()

w2v的模型维度是：256
w2v的模型的词表总长是：2301
time: 48.9 ms


# 数据预处理

## 创建word2idx

In [7]:
from common.Entity import Document
from common.Utils import scan_files
from Data import DataSet

from sklearn.model_selection import ShuffleSplit

file_names = scan_files(root_path)

rs = ShuffleSplit(n_splits=1, test_size=.15, random_state=2019)
train_idx,test_idx = next(rs.split(file_names))

train_file_names = [file_names[idx] for idx in train_idx]
test_file_names = [file_names[idx] for idx in test_idx]

whole_set = DataSet(root_path,file_names,vocab_size=-1)
char2idx = whole_set.char2idx
del whole_set

time: 1.94 s


In [8]:
len(char2idx)

3246

time: 6.96 ms


## 创建emb_matrix

In [9]:
import numpy as np

vec_size = char2vec_model.wv.vector_size
emb_matrix = np.zeros(vec_size)

def random_vec(vec_size):
    vec = np.random.random(size=vec_size)
    vec = vec - vec.mean()
    return vec

for c in char2idx.keys():
    if c is "_padding":
        char2idx[c] = 0
    elif c is "_unk":
        emb = random_vec(vec_size)
        emb_matrix = np.vstack((emb_matrix,emb))
        char2idx[c] = 1
    else:
        if c in [" ","\n"]:
            idx = emb_matrix.shape[0]
            emb = random_vec(vec_size)
            emb_matrix = np.vstack((emb_matrix,emb))
            char2idx[c] = idx
        elif c not in char2vec_model.wv.vocab.keys():
            idx = char2idx["_unk"]
            char2idx[c] = idx
        else:
            idx = emb_matrix.shape[0]
            emb = char2vec_model.wv[c]
            emb_matrix = np.vstack((emb_matrix,emb))
            char2idx[c] = idx

time: 3.56 s


In [10]:
len(char2vec_model.wv.vocab)

2301

time: 2 ms


## 读取并切分数据集

In [11]:
import pickle

rs = ShuffleSplit(n_splits=1, test_size=.20, random_state=2019)
train_idx,val_idx = next(rs.split(train_file_names))

train_file_names = [file_names[idx] for idx in train_idx]
val_file_names = [file_names[idx] for idx in val_idx]

trainset = DataSet(root_path,train_file_names,char2idx)
valset = DataSet(root_path,val_file_names,char2idx)
testset = DataSet(root_path,test_file_names,char2idx)

# 持久化
pickle.dump(testset,open('pickle_file/testset.pkl','wb'))

time: 791 ms


### 向量化 + 滑动窗切分句子

In [12]:
from Data import DataProcessor
import pickle
data_processors = []

for dataset in [trainset,valset,testset]:
    processor = DataProcessor(dataset).data4NER(window=70,pad=10)
    data_processors.append(processor)

# 持久化
pickle.dump(data_processors,open('pickle_file/data_processors.pkl','wb'))

time: 6.95 s


### 创建X-Y

In [13]:
from Data import DataProcessor
from typing import List
import pickle

data_processors = pickle.load(open('pickle_file/data_processors.pkl','rb')) #type:List[DataProcessor]

train_X,train_Y = data_processors[0].get_ner_data()
train_Y = np.expand_dims(train_Y,-1)

val_X,val_Y = data_processors[1].get_ner_data()
val_Y = np.expand_dims(val_Y,-1)

test_X,_ = data_processors[2].get_ner_data()

print(train_X.shape,train_Y.shape)
print(val_X.shape,val_Y.shape)
print(test_X.shape)

(30780, 90) (30780, 90, 1)
(7647, 90) (7647, 90, 1)
(7161, 90)
time: 5.06 s


In [14]:
train_X

array([[   0.,    0.,    0., ...,  754.,    2.,    2.],
       [ 529.,  437.,  511., ...,  529.,  437.,  511.],
       [ 104.,   94.,  309., ...,  437.,  511.,   68.],
       ...,
       [ 996.,   24.,  220., ..., 1045.,   24.,  861.],
       [  98.,  169.,   21., ...,   24.,  454., 1335.],
       [   3., 1841.,   24., ...,    0.,    0.,    0.]])

time: 1.99 ms


# 创建模型

In [16]:
from Model import BiLstmCrfTrainer
from Data import CATEGORY
from keras.callbacks import EarlyStopping

BATCH_SIZE = 64
EPOCH = 1

model = BiLstmCrfTrainer(category_count = len(CATEGORY)+1,
                         seq_len = train_X.shape[1],
                         lstm_units=512,
                         vocab_size = emb_matrix.shape[0],
                         emb_matrix = emb_matrix).build()

early_stopping = EarlyStopping(monitor='val_crf_viterbi_accuracy', patience=2, mode='max')

print('开始训练啦！！')
print(20*"===")
history = model.fit(train_X,train_Y,batch_size=BATCH_SIZE,
                    epochs = EPOCH,
                    class_weight="auto",
                    callbacks = [early_stopping],
                    validation_data = (val_X,val_Y,)
                    )

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 90)                0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 90, 256)           590080    
_________________________________________________________________
dropout_3 (Dropout)          (None, 90, 256)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 90, 1024)          3149824   
_________________________________________________________________
dropout_4 (Dropout)          (None, 90, 1024)          0         
_________________________________________________________________
crf_2 (CRF)                  (None, 90, 16)            16688     
Total params: 3,756,592
Trainable params: 3,166,512
Non-trainable params: 590,080
___________________________________________

In [17]:
import datetime
time = datetime.datetime.now()
model.save(filepath="model_file/bi_lstm_crf_{}_{}_{}_{}.h5".format(str(time.month),str(time.day),str(time.hour),str(time.minute)),overwrite=True)

time: 825 ms


# 预测结果

In [2]:
import keras
import keras_contrib
import pickle

model = keras.models.load_model("model_file/bi_lstm_crf_12_1_20_36.h5",
                                custom_objects={"CRF": keras_contrib.layers.CRF, "crf_loss": keras_contrib.losses.crf_loss,
                                                "crf_viterbi_accuracy": keras_contrib.metrics.crf_viterbi_accuracy})

In [4]:
data_processors = pickle.load(open('pickle_file/data_processors.pkl','rb')) #type:List[DataProcessor]
test_X,_ = data_processors[2].get_ner_data()

preds = model.predict(test_X, batch_size=16, verbose=True)



In [5]:
from Evaluator import *
from common.Entity import Document
from Data import DataSet
from typing import List

testset = pickle.load(open('pickle_file/testset.pkl','rb')) # type:DataSet
pre_docs = merge_preds4ner(testset,data_processors[2],preds) # type:List[Document]
source_docs = testset.docs

f1,prediction,recall = f1_score4ner(pre_docs,source_docs,'all')
print("【严格相交】F1:{:.4f}  -  Predicition:{:.4f}  -  Recall:{:.4f}".format(f1,prediction,recall))

f1,prediction,recall = f1_score4ner(pre_docs,source_docs,'others')
print("【不严格相交】F1:{:.4f}  -  Predicition:{:.4f}  -  Recall:{:.4f}".format(f1,prediction,recall))

【严格相交】F1:0.6186  -  Predicition:0.5890  -  Recall:0.6514
【不严格相交】F1:0.7127  -  Predicition:0.6786  -  Recall:0.7504


## 换biLSTM-LAN模型来实现

In [8]:
from Model import BiLstm_Lan_Trainer
from Prepare_sents import CATEGORY
from keras.callbacks import EarlyStopping

BATCH_SIZE = 16
EPOCH = 50

model = BiLstm_Lan_Trainer(category_count = len(CATEGORY)+1,
                         seq_len = train_X.shape[1],
                         lstm_units=[256,256],
                         vocab_size = emb_matrix.shape[0],
                         emb_matrix = emb_matrix).build()

early_stopping = EarlyStopping(monitor='val_acc', patience=2, mode='max')

print('开始训练啦！！')
print(20*"===")
history = model.fit(train_X,train_Y,batch_size=BATCH_SIZE,
                    epochs = EPOCH,
                    class_weight="auto",
                    callbacks = [early_stopping],
                    validation_data = (val_X,val_Y)
                    )

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 90)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 90, 256)      589312      input_1[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 90, 256)      0           embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 90, 512)      1050624     dropout_1[0][0]                  
____________________________________________________________________________________________

In [9]:
preds = model.predict(test_X, batch_size=16, verbose=True)
from Evaluator import merge_preds,f1_score
from Prepare_sents import Sentences
testset = pickle.load(open('pickle_data/testset.pkl','rb'))
pre_docs = merge_preds(testset,preds,70,10)
source_docs = testset.docs
f1,prediction,recall = f1_score(pre_docs,source_docs,'all')
print("【严格相交】F1:{:.4f}  -  Predicition:{:.4f}  -  Recall:{:.4f}".format(f1,prediction,recall))

f1,prediction,recall = f1_score(pre_docs,source_docs,'others')
print("【不严格相交】F1:{:.4f}  -  Predicition:{:.4f}  -  Recall:{:.4f}".format(f1,prediction,recall))

【严格相交】F1:0.7369  -  Predicition:0.6846  -  Recall:0.7979
【不严格相交】F1:0.8043  -  Predicition:0.7472  -  Recall:0.8710
time: 1min 5s
