In [1]:
from keras.layers import Dense,GlobalMaxPool1D,Input,Embedding,Conv1D
from keras.layers import Dropout,BatchNormalization,Flatten,Concatenate
from keras.models import Model
import numpy as np
# 构建一个多卷积窗口的CNN模型来进行情感分析

Using TensorFlow backend.


In [2]:
# Build the CNN with the Keras functional API (Model).
# Layers used: Embedding, Conv1D, GlobalMaxPool1D and Dense.
# Dropout and batch normalization are applied to limit overfitting and speed up
# training; the output activation is softmax, hence two output units.
def buildModel(maxlen, max_features, embedding_size,
               filter_sizes=(3, 4, 5), nb_filter=100,
               l2_weight=0.01, dropout_rate=0.5):
    """Build a multi-window 1D-CNN text classifier with two softmax outputs.

    The token ids are embedded, passed through one Conv1D + GlobalMaxPool1D
    branch per kernel size, and the pooled branch outputs are concatenated
    and fed to a small dense head.

    Args:
        maxlen: Length of every (padded) input sequence.
        max_features: Size of the embedding vocabulary.
        embedding_size: Dimension of each embedding vector.
        filter_sizes: Kernel sizes, one convolution branch per entry.
        nb_filter: Number of filters in every convolution branch.
        l2_weight: L2 penalty applied to the convolution kernels and to the
            kernel of the output layer.
        dropout_rate: Dropout rate applied after the first dense layer.

    Returns:
        An uncompiled keras ``Model`` mapping ``(batch, maxlen)`` int32 inputs
        to ``(batch, 2)`` softmax outputs.

    Raises:
        ValueError: If ``filter_sizes`` is empty or contains a kernel size
            larger than ``maxlen`` ('valid' padding would leave no output).
    """
    from keras.regularizers import l2

    filter_sizes = tuple(filter_sizes)
    if not filter_sizes:
        raise ValueError('filter_sizes must contain at least one kernel size.')
    if max(filter_sizes) > maxlen:
        raise ValueError('Every kernel size in filter_sizes must be <= maxlen '
                         'when padding is "valid".')

    print('build model...')
    main_inputs = Input(shape=(maxlen,), dtype='int32', name='main_input')
    embedded = Embedding(max_features, embedding_size, input_length=maxlen)(main_inputs)

    # One convolution branch per window size; global max pooling collapses the
    # time axis so every branch yields a (batch, nb_filter) tensor.
    pooled = []
    for fsz in filter_sizes:
        conv = Conv1D(filters=nb_filter,
                      kernel_size=fsz,
                      strides=1,
                      padding='valid',
                      activation='relu',
                      kernel_regularizer=l2(l2_weight))(embedded)
        pooled.append(GlobalMaxPool1D()(conv))

    # Join the pooled feature maps of the different filter sizes. A single
    # branch is used as-is because there is nothing to concatenate.
    out = Concatenate(axis=1)(pooled) if len(pooled) > 1 else pooled[0]
    out = BatchNormalization()(out)
    out = Dense(100, activation='relu')(out)
    out = Dropout(dropout_rate)(out)
    out = Dense(50, activation='relu')(out)
    predict = Dense(2, activation='softmax', kernel_regularizer=l2(l2_weight))(out)
    return Model(inputs=main_inputs, outputs=predict)

In [3]:
# Hyperparameter settings
max_features = 5000     # vocabulary size (embedding rows / num_words of the loader)
maxlen = 100            # every sequence is padded or truncated to this length
embedding_size = 100    # dimension of the embedding vectors
batch_size = 100        # samples per gradient update
epochs = 10             # passes over the training set

In [4]:
# Instantiate the network with the hyperparameters defined above.
model = buildModel(maxlen=maxlen,
                   max_features=max_features,
                   embedding_size=embedding_size)

build model...


In [5]:
# Print the model architecture to inspect its layers and output shapes.
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
main_input (InputLayer)          (None, 100)           0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 100, 100)      500000                                       
____________________________________________________________________________________________________
conv1d_1 (Conv1D)                (None, 98, 100)       30100                                        
____________________________________________________________________________________________________
conv1d_2 (Conv1D)                (None, 97, 100)       40100                                        
___________________________________________________________________________________________

In [6]:
# Loader for a local copy of the IMDB data. It is a trimmed version of the
# official Keras implementation with the dataset-download step removed.
# The unmodified upstream loader is available through:
# from keras.datasets import imdb
def load_data(path='imdb.npz', num_words=None, skip_top=0,
              maxlen=None, seed=113,
              start_char=1, oov_char=2, index_from=3, **kwargs):
    """Load an IMDB-style ``.npz`` archive and return shuffled train/test splits.

    The archive must contain the arrays ``x_train``, ``y_train``, ``x_test``
    and ``y_test``; each ``x_*`` entry is a sequence of integer word indices.

    Args:
        path: Location of the ``.npz`` archive.
        num_words: Word indices ``>= num_words`` are treated as out of
            vocabulary. When falsy, the largest index found in the data is
            used (so that largest index itself counts as out of vocabulary).
        skip_top: Word indices ``< skip_top`` are treated as out of vocabulary.
        maxlen: When truthy, sequences whose length (including the start
            marker) is ``>= maxlen`` are dropped.
        seed: Seed for the shuffle; the test split uses ``seed * 2``.
        start_char: Index prepended to every sequence, or ``None`` for none.
        oov_char: Replacement for out-of-vocabulary indices; when ``None``
            those indices are removed instead of replaced.
        index_from: Offset added to every word index.
        **kwargs: Only the legacy ``nb_words`` alias of ``num_words`` is read.

    Returns:
        ``(x_train, y_train), (x_test, y_test)`` as numpy arrays. ``x_*`` is an
        object array of lists when the sequences differ in length.

    Raises:
        ValueError: If the sample and label counts of a split differ, or if
            ``maxlen`` filters out every sequence.
    """
    # Imported here so the legacy branch below cannot hit a NameError: the
    # notebook never imports `warnings` at the top level.
    import warnings

    # Legacy support
    if 'nb_words' in kwargs:
        warnings.warn('The `nb_words` argument in `load_data` '
                      'has been renamed `num_words`.')
        num_words = kwargs.pop('nb_words')

    # NOTE(security): variable-length sequences are stored as pickled object
    # arrays, so loading them needs allow_pickle=True. Unpickling can execute
    # arbitrary code -- only point `path` at archives you trust.
    # The context manager closes the archive even if a key is missing.
    with np.load(path, allow_pickle=True) as f:
        x_train, labels_train = f['x_train'], f['y_train']
        x_test, labels_test = f['x_test'], f['y_test']

    x_train, labels_train = _shuffle_together(x_train, labels_train, seed)
    x_test, labels_test = _shuffle_together(x_test, labels_test, seed * 2)

    def _offset(seq):
        # Shift word indices and prepend the start marker, when configured.
        if start_char is not None:
            return [start_char] + [w + index_from for w in seq]
        if index_from:
            return [w + index_from for w in seq]
        return list(seq)

    train_seqs = [_offset(x) for x in x_train]
    test_seqs = [_offset(x) for x in x_test]
    train_labels, test_labels = labels_train, labels_test

    if maxlen:
        # Filter each split on its own so that dropping training sequences
        # can never pull test samples across the train/test boundary.
        train_seqs, train_labels = _drop_long_sequences(train_seqs, train_labels, maxlen)
        test_seqs, test_labels = _drop_long_sequences(test_seqs, test_labels, maxlen)
        if not train_seqs and not test_seqs:
            raise ValueError('After filtering for sequences shorter than maxlen=' +
                             str(maxlen) + ', no sequence was kept. '
                             'Increase maxlen.')

    if not num_words:
        num_words = max(max(x) for x in train_seqs + test_seqs)

    # by convention, use 2 as OOV word
    # reserve 'index_from' (=3 by default) characters:
    # 0 (padding), 1 (start), 2 (OOV)
    if oov_char is not None:
        def _clean(seq):
            return [oov_char if (w >= num_words or w < skip_top) else w for w in seq]
    else:
        def _clean(seq):
            return [w for w in seq if skip_top <= w < num_words]

    train_seqs = [_clean(x) for x in train_seqs]
    test_seqs = [_clean(x) for x in test_seqs]

    x_train = _as_sequence_array(train_seqs)
    y_train = np.array(train_labels)

    x_test = _as_sequence_array(test_seqs)
    y_test = np.array(test_labels)

    return (x_train, y_train), (x_test, y_test)


def _shuffle_together(samples, labels, seed):
    """Shuffle samples and labels with one seeded permutation so they stay aligned."""
    if len(samples) != len(labels):
        raise ValueError('Found %d samples but %d labels.' % (len(samples), len(labels)))
    order = np.arange(len(samples))
    np.random.seed(seed)
    np.random.shuffle(order)
    return samples[order], labels[order]


def _drop_long_sequences(seqs, labels, maxlen):
    """Keep only the (sequence, label) pairs whose sequence is shorter than maxlen."""
    kept = [(s, y) for s, y in zip(seqs, labels) if len(s) < maxlen]
    return [s for s, _ in kept], [y for _, y in kept]


def _as_sequence_array(seqs):
    """Convert a list of sequences to an array, using dtype=object when they are ragged."""
    if len(set(len(s) for s in seqs)) > 1:
        # Recent NumPy refuses to build an array from ragged lists unless
        # dtype=object is requested explicitly.
        return np.array(seqs, dtype=object)
    return np.array(seqs)

In [7]:
# Load the data and preprocess it: every sentence is padded (or truncated) to
# the same length, maxlen.
from keras.preprocessing import sequence
print('Loading data...')
(x_train, y_train), (x_test, y_test) = load_data(num_words=max_features)
# Format into a single string so the message reads the same on Python 2 and 3
# (print with several arguments shows a tuple under Python 2).
print('%d train sequences' % len(x_train))
print('%d test sequences' % len(x_test))

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

Loading data...
(25000, 'train sequences')
(25000, 'test sequences')
Pad sequences (samples x time)


In [8]:
# The model is trained with one-hot targets, so convert the integer class
# labels to one-hot vectors.
from keras.utils import to_categorical

# Only encode while the labels are still 1-D class ids (assumed to be what
# load_data returns), so that re-running this cell does not encode labels
# that are already one-hot a second time.
if np.ndim(y_train) == 1:
    y_train = to_categorical(y_train, 2)
if np.ndim(y_test) == 1:
    y_test = to_categorical(y_test, 2)

In [9]:
# Compile the model, then train it; the test split is passed as validation data.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.fit(x=x_train,
          y=y_train,
          validation_data=(x_test, y_test),
          epochs=epochs,
          batch_size=batch_size)

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7efeabf71690>

In [11]:
# Render the model graph (with tensor shapes) to an image file.
from keras.utils.vis_utils import plot_model
plot_model(model, to_file='model.png', show_shapes=True)