In [176]:
import numpy as np
from sklearn import metrics
import torch
import torch.nn as nn
import torch.utils.data as tdata
from keras import callbacks
from keras import optimizers
from keras.preprocessing.sequence import pad_sequences

from htorch.contrib.data.reader import SKE2019Reader

In [2]:
reader = SKE2019Reader()

In [3]:
train_data = reader.read("/home/heyao/projects/htorch/data/SKE-2019/train_data.json")
dev_data = reader.read("/home/heyao/projects/htorch/data/SKE-2019/dev_data.json")

In [7]:
train_data.head()

Unnamed: 0,text,word_segs,pos_tags,subject,subject_type,object,object_type,predicate
0,内容简介《宜兴紫砂图典》由故宫出版社出版,内容|简介|《|宜兴紫砂图典|》|由|故宫出版社|出版,n|n|w|nw|w|p|nt|v,宜兴紫砂图典,书籍,故宫出版社,出版社,出版社
1,《中国风水十讲》是2007年华夏出版社出版的图书，作者是杨文衡,《|中国风水十讲|》|是|2007年|华夏出版社|出版|的|图书|，|作者|是|杨文衡,w|nw|w|v|t|nt|v|u|n|w|n|v|nr,中国风水十讲,书籍,华夏出版社,出版社,出版社
2,《中国风水十讲》是2007年华夏出版社出版的图书，作者是杨文衡,《|中国风水十讲|》|是|2007年|华夏出版社|出版|的|图书|，|作者|是|杨文衡,w|nw|w|v|t|nt|v|u|n|w|n|v|nr,中国风水十讲,图书作品,杨文衡,人物,作者
3,《空城未央》是夙言以信创作的网络小说，发表于17K小说网,《|空城未央|》|是|夙言|以|信|创作|的|网络|小说|，|发表|于|17K小说网,w|nw|w|v|n|p|n|v|u|n|n|w|v|p|nz,空城未央,图书作品,夙言以信,人物,作者
4,《空城未央》是夙言以信创作的网络小说，发表于17K小说网,《|空城未央|》|是|夙言|以|信|创作|的|网络|小说|，|发表|于|17K小说网,w|nw|w|v|n|p|n|v|u|n|n|w|v|p|nz,空城未央,网络小说,17K小说网,网站,连载网站


In [5]:
train_data.subject_type.value_counts()

影视作品    1486
人物      1379
歌曲       777
图书作品     532
书籍       280
企业       187
网络小说     181
生物       163
历史人物     106
机构        83
电视综艺      61
行政区       29
国家        10
景点        10
地点         7
学科专业       2
Name: subject_type, dtype: int64

In [6]:
train_data.object_type.value_counts()

人物        2771
Date       530
Text       302
地点         292
出版社        280
国家         208
学校         181
网站         180
目          163
音乐专辑       158
企业          93
Number      91
城市          18
气候          14
作品          10
语言           2
Name: object_type, dtype: int64

In [8]:
from itertools import chain

def load_data(df, return_vocab=False):
    tokens = df.word_segs.apply(lambda x: x.split("|"))
    postags = df.pos_tags.apply(lambda x: x.split("|"))
    if not return_vocab:
        return tokens, postags
    token_vocab = list(set(chain.from_iterable(tokens)))
    postag_vocab = list(set(chain.from_iterable(postags)))
    return tokens, postags, token_vocab, postag_vocab

In [62]:
train_tokens, train_pos_tokens, vocabs, pos_vocabs = load_data(train_data, return_vocab=True)
dev_tokens, dev_pos_tokens = load_data(dev_data, return_vocab=False)

print("vocab:", len(vocabs))
print("pos vocab:", len(pos_vocabs))

vocab: 18213
pos vocab: 25


In [63]:
pos_tags = pos_vocabs
subject_types = train_data.subject_type.value_counts().index.to_list()
print(subject_types)
object_types = train_data.object_type.value_counts().index.to_list()
print(object_types)

['影视作品', '人物', '歌曲', '图书作品', '书籍', '企业', '网络小说', '生物', '历史人物', '机构', '电视综艺', '行政区', '国家', '景点', '地点', '学科专业']
['人物', 'Date', 'Text', '地点', '出版社', '国家', '学校', '网站', '目', '音乐专辑', '企业', 'Number', '城市', '气候', '作品', '语言']


In [64]:
def convert_tokens_to_ids(tokens, mapping):
    return [mapping.get(i, 0) for i in tokens]

In [65]:
token_to_id = dict(zip(vocabs, range(1, len(vocabs) + 1)))
pos_to_id = dict(zip(pos_tags, range(1, len(pos_tags) + 1)))
train_tokens = [convert_tokens_to_ids(i, mapping=token_to_id) for i in train_tokens]
dev_tokens = [convert_tokens_to_ids(i, mapping=pos_to_id) for i in dev_tokens]
train_pos_tokens = [convert_tokens_to_ids(i, mapping=token_to_id) for i in train_pos_tokens]
dev_pos_tokens = [convert_tokens_to_ids(i, mapping=pos_to_id) for i in dev_pos_tokens]

In [66]:
len_of_token = [len(i) for i in train_tokens]
len_of_pos = [len(i) for i in train_pos_tokens]
for p in [0, 25, 50, 75, 90, 95, 99, 100]:
    print("TOKEN:", p, np.percentile(len_of_token, p))
    print("POS:", p, np.percentile(len_of_pos, p))
maxlen = 80

TOKEN: 0 1.0
POS: 0 1.0
TOKEN: 25 16.0
POS: 25 16.0
TOKEN: 50 25.0
POS: 50 25.0
TOKEN: 75 36.0
POS: 75 36.0
TOKEN: 90 54.0
POS: 90 54.0
TOKEN: 95 68.0
POS: 95 68.0
TOKEN: 99 110.0
POS: 99 110.0
TOKEN: 100 173.0
POS: 100 173.0


In [67]:
train_tokens = pad_sequences(train_tokens, maxlen=maxlen)
dev_tokens = pad_sequences(dev_tokens, maxlen=maxlen)

train_pos_tokens = pad_sequences(train_pos_tokens, maxlen=maxlen)
dev_pos_tokens = pad_sequences(dev_pos_tokens, maxlen=maxlen)

In [68]:
subject_to_id = dict(zip(subject_types, range(len(subject_types) + 1)))
train_subjects = train_data.subject_type.map(subject_to_id).values
dev_subjects = dev_data.subject_type.map(subject_to_id).values

object_to_id = dict(zip(object_types, range(len(object_types) + 1)))
train_objects = train_data.object_type.map(object_to_id).values
dev_objects = dev_data.object_type.map(object_to_id).values
print(len(subject_to_id), len(object_to_id))

16 16


In [69]:
labels = train_data.predicate.unique()
label_to_id = dict(zip(labels, range(1, len(labels) + 1)))
y_train = train_data.predicate.map(label_to_id).values
y_dev = dev_data.predicate.map(label_to_id).values

## Position encode

In [208]:
text = "内容简介《宜兴紫砂图典》由故宫出版社出版"
tokens = "内容|简介|《|宜兴|紫砂|图典|》|由|故宫出版社|出版".split("|")
subject = "宜兴紫砂图典"

i = 0
positions = []
for token in tokens:
    if token not in subject:
        positions.append(i)
        i += 1
        continue
    
text.index(subject)

5

In [158]:
from keras.layers import Input, Dense, CuDNNLSTM, GlobalMaxPooling1D, Concatenate, Embedding, Dropout, Reshape
from keras.layers import GlobalAveragePooling1D
from keras.models import Model

In [71]:
from sklearn.preprocessing import OneHotEncoder

onehot_encoder = OneHotEncoder(sparse=False)
y_train_onehot = onehot_encoder.fit_transform(y_train.reshape(-1, 1))
y_dev_onehot = onehot_encoder.transform(y_dev.reshape(-1, 1))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [193]:
def build_model(seq_tokens_shape, seq_pos_shape, cat_emb_shapes, lstm_size=40, 
                num_class=train_data.predicate.nunique()):
    """without position embedding
    train: 0.82099
    dev  : 0.75307
    """
    seq_token_input = Input((maxlen, ))
    seq_output = Embedding(*seq_tokens_shape)(seq_token_input)
    seq_output = CuDNNLSTM(lstm_size, return_sequences=True)(seq_output)
    seq_output = GlobalAveragePooling1D()(seq_output)
    
    seq_pos_input = Input((maxlen, ))
    seq_pos_output = Embedding(*seq_pos_shape)(seq_pos_input)
    seq_pos_output = CuDNNLSTM(lstm_size, return_sequences=True)(seq_pos_output)
    seq_pos_output = GlobalAveragePooling1D()(seq_pos_output)
    
    inputs = [seq_token_input, seq_pos_input]
    outputs = [seq_output, seq_pos_output]
    for cat_vocab, cat_dim in cat_emb_shapes:
        cat_input = Input((1, ))
        cat_output = Embedding(cat_vocab, cat_dim)(cat_input)
        cat_output = Reshape((cat_dim, ))(cat_output)
        cat_output = Dense(128, activation="relu")(cat_output)
        inputs.append(cat_input)
        outputs.append(cat_output)
    
    x = Concatenate()(outputs)
    x = Dropout(0.5)(x)
    x = Dense(128, activation="relu")(x)
    x = Dropout(0.5)(x)
    x = Dense(num_class, activation="softmax")(x)
    model = Model(inputs, x)
    return model


def build_model(seq_tokens_shape, seq_pos_shape, cat_emb_shapes, lstm_size=40, 
                num_class=train_data.predicate.nunique()):
    seq_token_input = Input((maxlen, ))
    seq_output = Embedding(*seq_tokens_shape)(seq_token_input)
    seq_output = CuDNNLSTM(lstm_size, return_sequences=True)(seq_output)
    seq_output = GlobalAveragePooling1D()(seq_output)
    
    seq_pos_input = Input((maxlen, ))
    seq_pos_output = Embedding(*seq_pos_shape)(seq_pos_input)
    seq_pos_output = CuDNNLSTM(lstm_size, return_sequences=True)(seq_pos_output)
    seq_pos_output = GlobalAveragePooling1D()(seq_pos_output)
    
    inputs = [seq_token_input, seq_pos_input]
    outputs = [seq_output, seq_pos_output]
    for cat_vocab, cat_dim in cat_emb_shapes:
        cat_input = Input((1, ))
        cat_output = Embedding(cat_vocab, cat_dim)(cat_input)
        cat_output = Reshape((cat_dim, ))(cat_output)
        cat_output = Dense(128, activation="relu")(cat_output)
        inputs.append(cat_input)
        outputs.append(cat_output)
    
    x = Concatenate()(outputs)
    x = Dropout(0.5)(x)
    x = Dense(128, activation="relu")(x)
    x = Dropout(0.5)(x)
    x = Dense(num_class, activation="softmax")(x)
    model = Model(inputs, x)
    return model

In [203]:
model = build_model((len(vocabs) + 1, 100), (len(pos_tags) + 1, 8), [(16 + 1, 4), (16 + 1, 4)], lstm_size=20)
model.compile(optimizers.Adam(), "categorical_crossentropy")
callback_list = [
    callbacks.EarlyStopping(patience=3, restore_best_weights=True)
]
model.fit([train_tokens, train_pos_tokens, train_subjects, train_objects], y_train_onehot, 
          validation_data=([dev_tokens, dev_pos_tokens, dev_subjects, dev_objects], y_dev_onehot), epochs=40, 
          batch_size=64, verbose=1, callbacks=callback_list)

Train on 5293 samples, validate on 2073 samples
Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40


<keras.callbacks.History at 0x7f4e0874b6d8>

In [204]:
train_pred = model.predict([train_tokens, train_pos_tokens, train_subjects, train_objects], batch_size=512)
dev_pred = model.predict([dev_tokens, dev_pos_tokens, dev_subjects, dev_objects], batch_size=512)

In [205]:
print(metrics.f1_score(y_train, train_pred.argmax(axis=1) + 1, average="weighted"))
print(metrics.f1_score(y_dev, dev_pred.argmax(axis=1) + 1, average="weighted"))

0.8260985863104171
0.7470346404439404


In [108]:
# print(metrics.classification_report(y_train, train_pred.argmax(axis=1) + 1))

In [109]:
# print(metrics.classification_report(y_dev, dev_pred.argmax(axis=1) + 1))

|              Title              |      Train     |    Dev     |
|               ---               |       ---      |    ---     |
|    without postion embedding    |     0.82099    |  0.75307   |