In [1]:
#首先导入必要的库
#有些库读者朋友可能不知道是做什么的
#没有关系，后面我们在用到的时候，会进行讲解
import gc
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, Dense, Activation, Input
from keras.layers import Convolution1D, Flatten, Dropout, MaxPool1D
from keras.layers import LSTM
from keras.layers.merge import concatenate
from keras.models import Model, Sequential
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint

Using TensorFlow backend.


In [2]:
# Same loading code as in chapters 12 and 13: read the two corpora,
# attach a polarity label to each and merge them into one DataFrame.
# NOTE(review): open() is called without an explicit encoding, so decoding
# relies on the platform default -- confirm it matches the corpus files.
with open('positive.txt', 'r') as f:
    # One sentence per line, with the newline character removed.
    pos_corpus = [sent.replace('\n', '') for sent in f]
with open('negtive.txt', 'r') as f:
    neg_corpus = [sent.replace('\n', '') for sent in f]
# Positive sentences are labelled 1, negative sentences 0.
pos_df = pd.DataFrame(pos_corpus, columns=['text']).assign(polarity=1)
neg_df = pd.DataFrame(neg_corpus, columns=['text']).assign(polarity=0)
# Stack both frames and renumber the rows from 0.
df = pd.concat([pos_df, neg_df], ignore_index=True)
# Inspect the resulting DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9214 entries, 0 to 9213
Data columns (total 2 columns):
text        9214 non-null object
polarity    9214 non-null int64
dtypes: int64(1), object(1)
memory usage: 144.0+ KB


In [3]:
# Features are the sentences; the target is the polarity label as an integer.
X = df['text']
y = df['polarity'].astype('int')
# Configure the tokenizer as in chapter 13: drop the listed punctuation
# characters, lowercase the text and split on single spaces.
tokenizer = Tokenizer(
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
)
# Fit the tokenizer on the full text column.
tokenizer.fit_on_texts(X)
# word_index holds the vocabulary learned by the tokenizer.
vocab = tokenizer.word_index
# Split features and labels into train and test parts (fixed random_state).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=30)
# Convert the sentences of each part into sequences of word ids ...
X_train_word_ids, X_test_word_ids = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)
# ... then turn both parts into padded sequences. maxlen is set to 16
# to keep training time short.
X_train_padded_seqs, X_test_padded_seqs = (
    pad_sequences(word_ids, maxlen=16)
    for word_ids in (X_train_word_ids, X_test_word_ids)
)

In [4]:
# Build the convolutional network.
# The padded sequences have length 16, so the Input takes 16 values per
# sample. Those values are integer word ids that feed an Embedding layer,
# so the input is declared as int32 (the dtype of the padded arrays)
# rather than as a floating-point tensor.
main_input = Input(shape=(16,), dtype='int32')
# Embedding layer: maps each word id to an 8-dimensional vector. The table
# has len(vocab) + 1 rows; the padded sequences use 0 as the fill value.
embedder = Embedding(len(vocab) + 1, 8, input_length=16)
embed = embedder(main_input)
# Three parallel branches over the same embedding output, with kernel
# sizes 3, 4 and 5. Each branch is a 16-filter 1D convolution (same
# padding, stride 1, ReLU) stacked with a max-pooling layer of size 8.
cnn1, cnn2, cnn3 = (
    MaxPool1D(pool_size=8)(
        Convolution1D(16, kernel_size, padding='same', strides=1,
                      activation='relu')(embed)
    )
    for kernel_size in (3, 4, 5)
)
# Join the three branches along the last axis.
cnn = concatenate([cnn1, cnn2, cnn3], axis=-1)
# Flatten the joined feature maps into one vector per sample.
flat = Flatten()(cnn)
# Dropout for regularisation.
drop = Dropout(0.2)(flat)
# Final fully connected layer with a single sigmoid unit for the output.
main_output = Dense(1, activation='sigmoid')(drop)
# Assemble the model with the functional API: from the original input to
# the output of the dense layer.
model = Model(inputs=main_input, outputs=main_output)
# Compile for binary classification.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# Print an overview of the model.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 16)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 16, 8)        125160      input_1[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 16, 16)       400         embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_2 (Conv1D)               (None, 16, 16)       528         embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_3 (

In [11]:
# Checkpoint callback: writes the model to 'model-TextCNN.h5' and, with
# save_best_only=True, keeps only the best one. No monitor argument is
# passed, so the callback's own default metric decides what "best" means.
model_checkpoint = ModelCheckpoint('model-TextCNN.h5', save_best_only=True)
# Early stopping watches validation accuracy ('val_acc') with patience=5.
early_stopping = EarlyStopping(monitor='val_acc', patience=5)
# Start training. To save time the number of epochs is again set to 10.
hist = model.fit(
    X_train_padded_seqs,
    y_train,
    epochs=10,
    batch_size=128,
    callbacks=[early_stopping, model_checkpoint],
    validation_data=(X_test_padded_seqs, y_test),
)

Train on 6910 samples, validate on 2304 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


In [12]:
# Evaluate the CNN on the test data; the result lists the loss followed by
# the accuracy metric requested when the model was compiled.
model.evaluate(X_test_padded_seqs, y_test)



[0.4742104259009163, 0.8567708333333334]

In [13]:
# Use the convolutional network to make a prediction for the first test sample.
model.predict(X_test_padded_seqs[:1])

array([[0.9986297]], dtype=float32)

In [14]:
# Show the first test sentence, i.e. the text behind X_test_padded_seqs[:1].
X_test[:1]

3596    上证指数 创业板 指 任性 机会 不 好好 把握
Name: text, dtype: object

In [15]:
# Run a garbage-collection pass to clean up.
gc.collect()

0

In [18]:
# Build the long short-term memory (LSTM) network as a Sequential stack.
lstm = Sequential([
    # Embedding layer first: 8-dimensional vectors, initial weights given
    # explicitly as an all-zero matrix, and left trainable.
    Embedding(len(vocab) + 1, 8,
              weights=[np.zeros((len(vocab) + 1, 8))],
              input_length=16,
              trainable=True),
    # LSTM layer with 8 units, input dropout 0.5 and recurrent dropout 0.2.
    LSTM(8, dropout=0.5, recurrent_dropout=0.2),
    # Fully connected output layer with a single sigmoid unit.
    Dense(1, activation='sigmoid'),
])
# Compile the model for binary classification.
lstm.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['accuracy'])
# Print an overview of the model.
lstm.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 16, 8)             125160    
_________________________________________________________________
lstm_1 (LSTM)                (None, 8)                 544       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 9         
Total params: 125,713
Trainable params: 125,713
Non-trainable params: 0
_________________________________________________________________


In [20]:
# Callbacks for the LSTM run: a checkpoint written to 'model-LSTM.h5'
# (best model only) and early stopping on validation loss with patience=5.
model_checkpoint = ModelCheckpoint('model-LSTM.h5', save_best_only=True)
early_stopping = EarlyStopping(monitor='val_loss', patience=5)
# Train the LSTM network for up to 10 epochs, validating on the test data.
hist = lstm.fit(
    X_train_padded_seqs,
    y_train,
    epochs=10,
    batch_size=128,
    callbacks=[early_stopping, model_checkpoint],
    validation_data=(X_test_padded_seqs, y_test),
)

Train on 6910 samples, validate on 2304 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
# Evaluate the LSTM on the test data; the result lists the loss followed by
# the accuracy metric requested when the model was compiled.
lstm.evaluate(X_test_padded_seqs, y_test)



[0.33796224743127823, 0.8810763888888888]

In [22]:
# Predict the first test sample with the LSTM; reshape(1, -1) turns the
# single row into a batch containing one sequence.
lstm.predict(X_test_padded_seqs[0].reshape(1,-1))

array([[0.9755633]], dtype=float32)

In [23]:
# Display the first padded test sequence after reshaping it to a single row.
X_test_padded_seqs[0].reshape(1,-1)

array([[0, 0, 0, 0, 0, 0, 0, 0, 276, 60, 229, 2951, 38, 1, 786, 478]],
      dtype=int32)

In [24]:
# Display the first padded test sequence selected with a one-row slice.
X_test_padded_seqs[:1]

array([[0, 0, 0, 0, 0, 0, 0, 0, 276, 60, 229, 2951, 38, 1, 786, 478]],
      dtype=int32)

In [25]:
# Run another garbage-collection pass.
gc.collect()

39

In [26]:
# Saving a Keras model is straightforward:
# calling the save method is all that is needed.
lstm.save('lstm.h5')