# 训练char2vec

In [1]:
# 输出每个cell的运行时间
%load_ext autotime
# https://github.com/cpcloud/ipython-autotime

In [2]:
import warnings
warnings.filterwarnings("ignore")

time: 1.02 ms


In [3]:
root_path = "data/round1/ruijin_round1_train_20181022/ruijin_round1_train2_20181022/"
w2v_input_path = "model/char2vec_prepareData.txt"
w2v_output_path = "model/char2vec.model"

from Model import Char2VecTrainer
char2vec = Char2VecTrainer(root=root_path,w2v_file_path=w2v_output_path)

Using TensorFlow backend.


time: 4.42 s


In [4]:
char2vec.prepare_data()
char2vec.train(w2v_output_path,emb_size=256)

time: 7.41 s


In [5]:
char2vec_model = char2vec.load()

w2v的模型维度是：256
w2v的模型的词表总长是：2301
time: 49.9 ms


# 数据预处理

## 创建word2idx

In [6]:
from Entity import Document
from Data_Set import DataSet
from Utils import scan_files
from sklearn.model_selection import ShuffleSplit

file_names = scan_files(root_path)

rs = ShuffleSplit(n_splits=1, test_size=.15, random_state=2019)
train_idx,test_idx = next(rs.split(file_names))

train_file_names = [file_names[idx] for idx in train_idx]
test_file_names = [file_names[idx] for idx in test_idx]

whole_set = DataSet(train_file_names,root_path,vocab_size=-1)
word2idx = whole_set.word2idx
del whole_set

time: 1.6 s


## 创建emb_matrix

In [7]:
import numpy as np

vec_size = char2vec_model.wv.vector_size
emb_matrix = np.zeros(char2vec_model.wv.vector_size)

def random_vec(vec_size):
    vec = np.random.random(size=vec_size)
    vec = vec - vec.mean()
    return vec

for c in word2idx.keys():
    if c is "_padding":
        word2idx[c] = 0
    elif c is "_unk":
        emb = random_vec(vec_size)
        emb_matrix = np.vstack((emb_matrix,emb))
        word2idx[c] = 1
    else:
        if c in [" ","\n"]:
            idx = emb_matrix.shape[0]
            emb = random_vec(vec_size)
            emb_matrix = np.vstack((emb_matrix,emb))
            word2idx[c] = idx
        elif c not in char2vec_model.wv.vocab.keys():
            idx = word2idx["_unk"]
            word2idx[c] = idx
        else:
            idx = emb_matrix.shape[0]
            emb = char2vec_model.wv[c]
            emb_matrix = np.vstack((emb_matrix,emb))
            word2idx[c] = idx

time: 3.4 s


## 读取并切分数据集

In [8]:
rs = ShuffleSplit(n_splits=1, test_size=.20, random_state=2019)
train_idx,val_idx = next(rs.split(train_file_names))

train_file_names = [file_names[idx] for idx in train_idx]
val_file_names = [file_names[idx] for idx in val_idx]

trainset = DataSet(train_file_names,root_path,word2idx)
valset = DataSet(val_file_names,root_path,word2idx)
testset = DataSet(test_file_names,root_path,word2idx)

time: 663 ms


### 向量化 + 滑动窗切分句子

In [9]:
import pickle
sents_set = []

for dataset in [trainset,valset,testset]:
    sents = dataset.get_sents_set(70,10)
    sents_set.append(sents)

# 持久化
pickle.dump(sents_set,open('pickle_data/sents_set.pkl','wb'))

time: 6.85 s


### 创建X-Y

In [11]:
from Prepare_sents import get_sentsArray as get_array
import pickle

sents_set = pickle.load(open('pickle_data/sents_set.pkl','rb'))

train_X,train_Y = get_array(sents_set[0])
train_Y = np.expand_dims(train_Y,-1)

val_X,val_Y = get_array(sents_set[1])
val_Y = np.expand_dims(val_Y,-1)

test_X,_ = get_array(sents_set[2])

print(train_X.shape,train_Y.shape)
print(val_X.shape,val_Y.shape)
print(test_X.shape)

(30780, 90) (30780, 90, 1)
(7647, 90) (7647, 90, 1)
(7161, 90)
time: 4.69 s


# 创建模型

In [12]:
from Model import BiLstmCrfTrainer
from Prepare_sents import CATEGORY
from keras.callbacks import EarlyStopping

BATCH_SIZE = 32
EPOCH = 50

model = BiLstmCrfTrainer(category_count = len(CATEGORY)+1,
                         seq_len = train_X.shape[1],
                         vocab_size = emb_matrix.shape[0],
                         emb_matrix = emb_matrix).build()

early_stopping = EarlyStopping(monitor='val_crf_viterbi_accuracy', patience=2, mode='max')

print('开始训练啦！！')
print(20*"===")
history = model.fit(train_X,train_Y,batch_size=BATCH_SIZE,
                    epochs = EPOCH,
                    callbacks = [early_stopping],
                    validation_data = (val_X,val_Y)
                    )

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 90)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 90, 256)           589312    
_________________________________________________________________
dropout_1 (Dropout)          (None, 90, 256)           0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 90, 512)           1050624   
_________________________________________________________________
crf_1 (CRF)                  (None, 90, 16)            8496      
Total params: 1,648,432
Trainable params: 1,059,120
Non-trainable params: 589,312
_________________________________________________________________
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
开始训练啦！！

Train 

KeyboardInterrupt: 

time: 7min 46s
