In [1]:
import keras

from keras.models import Model
from keras.models import Sequential
from keras.models import load_model

from keras.layers import Activation
from keras.layers import Input, Dense, Embedding, Conv1D, Conv2D, MaxPooling1D, MaxPool2D
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.layers import SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers.recurrent import LSTM,SimpleRNN

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences

from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping
from keras.optimizers import Adam

from keras.utils.vis_utils import plot_model
from keras.utils.np_utils import to_categorical

from keras.models import load_model

import jieba
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

Using TensorFlow backend.


In [2]:
# 导入自定义库
from utils.data_utils import clean_str
from utils.data_utils import build_vocab
from utils.data_utils import get_tokens

In [7]:
# 加载训练集合
df_train_dataset = pd.read_csv('./data/training-inspur.csv', encoding='utf-8')
# 加载测试集
df_test_dataset = pd.read_csv('./data/Preliminary-texting-1.csv', encoding='utf-8')
# 查看数据集合 shape
df_test_dataset.shape

(102024, 2)

In [8]:
# 提取数据集中所需的字段
df_train_dataset = df_train_dataset[['COMMCONTENT', 'COMMLEVEL']]
df_test_dataset = df_test_dataset[['COMMCONTENT']]

In [9]:
# 合并数据集用于构建词汇表
df_all_dataset = pd.concat([df_train_dataset, df_test_dataset], ignore_index=True)
df_all_dataset.shape

(122024, 2)

In [10]:
BASE_DIR = '/Users/tsw/ScenicSpotReviews'

W2V_DIR = BASE_DIR + '/embeddings/'

TEXT_DATA_DIR = BASE_DIR + '/data/'

MAX_NUM_WORDS = 33950

MAX_SEQUENCE_LENGTH = 150 # 每篇文章选取150个词

MAX_NB_WORDS = 80000 # 将字典设置为含有1万个词84480

EMBEDDING_DIM = 300 # 词向量维度，300维

VALIDATION_SPLIT = 0.1 # 测试集大小，全部数据的10%

BATCH_SIZE = 128

NUM_LABELS = 3

In [11]:
def seg_corpus(corpus):
    seg_corpus = []
    for line in corpus:
        line = str(line).strip()
        seg_list = jieba.cut(line, cut_all=False)
        # 过滤空字符
        seg_list = [w for w in seg_list if w != ' ']
        seg_corpus.append(" ".join(seg_list))
    return seg_corpus

In [12]:
# 对所有文本分词
seged_text = seg_corpus(df_all_dataset['COMMCONTENT'])

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/5n/2_by50851fxc4d_snc1d9wf80000gn/T/jieba.cache
Loading model cost 0.859 seconds.
Prefix dict has been built succesfully.


In [13]:
# 将分词后的数据并入 df_all_dataset
df_all_dataset['COMMCONTENT_SEG'] = pd.DataFrame(seged_text,columns=['COMMCONTENT_SEG'])
df_all_dataset.head()

Unnamed: 0,COMMCONTENT,COMMLEVEL,COMMCONTENT_SEG
0,普通公园一个只是多了几个泉而已，人不多，适合老人孩子闲逛，买票的话还是贵了，人家说6.30之...,1.0,普通 公园 一个 只是 多 了 几个 泉 而已 ， 人不多 ， 适合 老人 孩子 闲逛 ， ...
1,跟儿子在里面玩了一天，非常好！跟儿子在里面玩了一天，非常好！真的很不错哦，有空还要去,1.0,跟 儿子 在 里面 玩 了 一天 ， 非常 好 ！ 跟 儿子 在 里面 玩 了 一天 ， 非...
2,这已经是第五次来这里玩了。每次孩子都很喜欢，不愿意从水里出来。有机会还会再来。还有比我更忠诚...,1.0,这 已经 是 第五次 来 这里 玩 了 。 每次 孩子 都 很 喜欢 ， 不 愿意 从水里 ...
3,当天在携程上定的票，打温泉度假村咨询电话和携程客服都说次日生效，但到酒店后，票能用。请客服人...,1.0,当天 在 携程 上定 的 票 ， 打 温泉 度假村 咨询电话 和 携程 客服 都 说 次日 ...
4,烟台历史的一部分，非常值得推荐去看看！海边景色也很漂亮！,1.0,烟台 历史 的 一部分 ， 非常 值得 推荐 去 看看 ！ 海边 景色 也 很漂亮 ！


In [14]:
# text_corpus
text_corpus = df_all_dataset['COMMCONTENT_SEG']
# 传入我们词向量的字典
tokenizer = Tokenizer(num_words=MAX_NB_WORDS) 
# 传入我们的训练数据，得到训练数据中出现的词的字典
tokenizer.fit_on_texts(text_corpus) 

In [15]:
# 根据训练数据中出现的词的字典，将训练数据转换为sequences
dataset_sequences = tokenizer.texts_to_sequences(text_corpus) 

In [16]:
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 100134 unique tokens.


In [17]:
padded_dataset_sequences = pad_sequences(dataset_sequences, maxlen=MAX_SEQUENCE_LENGTH)
padded_dataset_sequences.shape

(122024, 150)

In [18]:
df_test_dataset_seg = df_all_dataset['COMMCONTENT_SEG'][20000:]
test_dataset_sequences = tokenizer.texts_to_sequences(df_test_dataset_seg)
padded_test_dataset_sequences = pad_sequences(test_dataset_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [19]:
# 划分训练集和测试集
from sklearn.model_selection import train_test_split
train_X,valid_X,train_y,valid_y =train_test_split(padded_dataset_sequences[:df_train_dataset.shape[0]], 
                                                  df_all_dataset['COMMLEVEL'][:df_train_dataset.shape[0]], 
                                                  test_size=0.1)

In [20]:
# label one-hot 表示
labels = df_all_dataset['COMMLEVEL'].dropna().map(int)#.values.tolist()
labels = to_categorical(labels-1) 

In [21]:
vocab,vocab_freqs = build_vocab(df_all_dataset['COMMCONTENT_SEG'])
vocab_size = min(MAX_NB_WORDS, len(vocab_freqs)) + 2
word2index = {x[0]: i+2 for i, x in enumerate(vocab_freqs.most_common(MAX_NB_WORDS))}
word2index["PAD"] = 0
word2index["UNK"] = 1
index2word = {v:k for k, v in word2index.items()}
len(word2index)

80002

In [22]:
import codecs
print('Indexing word embeddings.')  
embeddings_index = {}
with codecs.open('./embeddings/sgns.weibo.word','r','utf-8') as f:
    f = f.readlines()
    for i in f[1:]:
        values = i.strip().split(' ')
        word = str(values[0])
        embedding = np.asarray(values[1:],dtype='float')
        embeddings_index[word] = embedding
print('word embedding',len(embeddings_index))

Indexing word embeddings.
word embedding 195201


In [34]:
embeddings_index['中国']

array([-0.186542,  0.153161, -0.092138, -0.409595, -0.277637,  0.32679 ,
        0.460779, -0.290725, -0.11773 , -0.026282,  0.306992, -0.241884,
       -0.131621, -0.072939, -0.353897,  0.325635, -0.245221,  0.192655,
        0.491776, -0.038478,  0.172667, -0.099799, -0.022893,  0.421129,
       -0.021248,  0.113363, -0.240293, -0.269463,  0.262599,  0.059695,
       -0.068543, -0.164919, -0.236679, -0.12863 ,  0.009809,  0.025645,
       -0.272379, -0.154907, -0.161305, -0.176863,  0.377503,  0.223636,
       -0.387001, -0.244671,  0.41847 , -0.04869 ,  0.067996,  0.012222,
       -0.035722, -0.052362, -0.650677,  0.100913, -0.202876, -0.612033,
        0.438661,  0.193497, -0.267914, -0.278571, -0.292877, -0.049786,
        0.236615,  0.059674,  0.245647, -0.156111,  0.307591, -0.11428 ,
       -0.322858, -0.481675, -0.14655 ,  0.519312, -0.155763,  0.156163,
        0.01132 , -0.157673, -0.117068, -0.556283,  0.487568, -0.175978,
        0.105386, -0.092337, -0.262746,  0.513361, 

In [23]:
nb_words = min(MAX_NB_WORDS,len(word2index))
nb_words

80000

In [24]:
word_embedding_matrix = np.zeros((nb_words + 1, EMBEDDING_DIM))

In [25]:
for word, i in word2index.items():
    if i > MAX_NB_WORDS:
        continue
    embedding_vector = embeddings_index.get(str(word).upper())
    if embedding_vector is not None:
        word_embedding_matrix[i] = embedding_vector

In [32]:
# pre_embedding_layer =  Embedding(input_dim = nb_words+1, 
#                              output_dim = EMBEDDING_DIM, 
#                             weights=[word_embedding_matrix], 
#                              input_length=MAX_SEQUENCE_LENGTH, 
#                              mask_zero=True,
#                              trainable=False
#                             )

In [26]:
from keras.layers import Lambda,BatchNormalization
from keras import backend as K

In [27]:
def get_rnn_cnn_model():
    embedding_dim = 300
    inp = Input(shape=(MAX_SEQUENCE_LENGTH,))
    x =  Embedding(input_dim = nb_words+1, 
                             output_dim = EMBEDDING_DIM, 
                             weights=[word_embedding_matrix], 
                             input_length=MAX_SEQUENCE_LENGTH, 
                             mask_zero=False,
                             trainable=True
                            )(inp)
    x = SpatialDropout1D(0.3)(x) # 0.3 - 0.6970
    x = Bidirectional(GRU(80, return_sequences=True))(x) # 80-0.6970
    
    x1 = Conv1D(32, kernel_size=2, padding="valid", kernel_initializer="he_uniform")(x)
    avg_pool = GlobalAveragePooling1D()(x1)
    max_pool = GlobalMaxPooling1D()(x1)
    kmax_pool = Lambda(lambda x: K.max(x, axis=1), output_shape=(32,))(x1)
    conc1 = concatenate([avg_pool, kmax_pool])
    
    x2 = Conv1D(32, kernel_size=3, padding="valid", kernel_initializer="he_uniform")(x)
    avg_pool2 = GlobalAveragePooling1D()(x2)
    max_pool2 = GlobalMaxPooling1D()(x2)
    kmax_pool2 = Lambda(lambda x: K.max(x, axis=1), output_shape=(32,))(x2)
    conc2 = concatenate([avg_pool2, kmax_pool2])
    
    #x3 = Conv1D(64, kernel_size=4, padding="valid", kernel_initializer="he_uniform")(x)
    #avg_pool3 = GlobalAveragePooling1D()(x3)
    #max_pool3 = GlobalMaxPooling1D()(x3)
    #kmax_pool3 = Lambda(lambda x: K.max(x, axis=1), output_shape=(64,))(x3)
    #conc3 = concatenate([avg_pool3, max_pool3, kmax_pool3])
    
    merge = concatenate([conc1, conc2])
    
    drop_merge = Dropout(0.25)(merge)
    
    #drop_merge = BatchNormalization()(drop_merge)
    
    #drop_merge = Dense(300, activation="relu")(drop_merge)
    
    
    #drop_merge = Dropout(0.2)(drop_merge)
    #drop_merge = BatchNormalization()(drop_merge)
    outp = Dense(3, activation="softmax")(drop_merge)

    model = Model(inputs=inp, outputs=outp)
    
    model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
    
    return model

In [28]:
rnn_cnn_model = get_rnn_cnn_model()

In [31]:
plot_model(rnn_cnn_model, to_file='./best_model_layer_name.png', 
show_shapes=False, show_layer_names=True)

!['./best_model.png'](./rnn_cnn_model.png)

In [26]:
filepath = "./checkpoints/rnn-cnn-fasttext/weights-improvement-{epoch:02d}-{val_acc:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True,mode='max')

In [27]:
batch_size = 256
epochs = 15

history = rnn_cnn_model.fit(x=train_X, y=to_categorical(train_y-1, num_classes=3), 
                    validation_data=(valid_X, to_categorical(valid_y-1, num_classes=3)[:]), 
                    batch_size=batch_size, 
                    epochs=epochs,
                    shuffle=True,
                    callbacks = [EarlyStopping(monitor='val_acc', patience=3, mode='auto'),checkpoint],
                    verbose=1)

Train on 18000 samples, validate on 2000 samples
Epoch 1/15

Epoch 00001: val_acc improved from -inf to 0.61550, saving model to ./checkpoints/rnn-cnn-fasttext/weights-improvement-01-0.6155.hdf5
Epoch 2/15

Epoch 00002: val_acc improved from 0.61550 to 0.67450, saving model to ./checkpoints/rnn-cnn-fasttext/weights-improvement-02-0.6745.hdf5
Epoch 3/15

Epoch 00003: val_acc improved from 0.67450 to 0.68750, saving model to ./checkpoints/rnn-cnn-fasttext/weights-improvement-03-0.6875.hdf5
Epoch 4/15

Epoch 00004: val_acc improved from 0.68750 to 0.69500, saving model to ./checkpoints/rnn-cnn-fasttext/weights-improvement-04-0.6950.hdf5
Epoch 5/15

Epoch 00005: val_acc did not improve from 0.69500
Epoch 6/15

Epoch 00006: val_acc did not improve from 0.69500
Epoch 7/15

Epoch 00007: val_acc did not improve from 0.69500


In [28]:
best_rnn_cnn_model = load_model('./checkpoints/rnn-cnn-fasttext/weights-improvement-04-0.6950.hdf5')

y_pred_rnn_cnn = best_rnn_cnn_model.predict(padded_test_dataset_sequences, batch_size=1024)

pooled_gru_conv_model_preds = np.argmax(y_pred_rnn_cnn,axis=1)[:]+1

pd.Series(pooled_gru_conv_model_preds).value_counts(normalize=True)

1    0.452207
3    0.287256
2    0.260537
dtype: float64

In [29]:
np.savetxt("weights-improvement-04-0.6950.txt", pooled_gru_conv_model_preds,fmt="%d")