In [1]:
# First, check whether the platform already provides TensorFlow
!pip show tensorflow

Name: tensorflow
Version: 1.12.2
Summary: TensorFlow is an open source machine learning framework for everyone.
Home-page: https://www.tensorflow.org/
Author: Google Inc.
Author-email: opensource@google.com
License: Apache 2.0
Location: /opt/conda/lib/python3.6/site-packages
Requires: tensorboard, grpcio, numpy, wheel, absl-py, astor, keras-preprocessing, six, termcolor, protobuf, gast, keras-applications
Required-by: 


In [2]:
# Even better if Keras is installed as well
!pip show keras

Name: Keras
Version: 2.2.4
Summary: Deep Learning for humans
Home-page: https://github.com/keras-team/keras
Author: Francois Chollet
Author-email: francois.chollet@gmail.com
License: MIT
Location: /opt/conda/lib/python3.6/site-packages
Requires: keras-preprocessing, six, keras-applications, numpy, h5py, pyyaml, scipy
Required-by: 


In [3]:
# Import Keras; the message printed on import tells us which backend is in use
import keras

Using TensorFlow backend.


In [1]:
# Import the libraries used in this notebook
import numpy as np
import pandas as pd
# This time the text data is processed with Keras' built-in Tokenizer
from keras.preprocessing.text import Tokenizer
# Utility for padding sequences to a common length
from keras.preprocessing.sequence import pad_sequences
# Fully connected (Dense) layer and Dropout layer
from keras.layers import Dense, Dropout
# The Sequential model class
from keras.models import Sequential

Using TensorFlow backend.


In [2]:
# Load the two corpora, attach a polarity label to each and combine them into
# a single DataFrame. This is the same code used in Chapter 12, so it is not
# annotated line by line here.
#
# The corpus is Chinese text, so the encoding is passed explicitly rather than
# relying on the platform default (which is not UTF-8 on every system).
# NOTE(review): assumes both files are UTF-8 encoded -- confirm.
with open('positive.txt', 'r', encoding='utf-8') as f:
    # One sentence per line; strip the newline characters
    pos_corpus = [sent.replace('\n', '') for sent in f]
# NOTE(review): 'negtive.txt' is the spelling used by the original notebook;
# presumably the file on disk is named this way, so it is left unchanged.
with open('negtive.txt', 'r', encoding='utf-8') as f:
    neg_corpus = [sent.replace('\n', '') for sent in f]
# Positive sentences get polarity 1, negative sentences get polarity 0
pos_df = pd.DataFrame(pos_corpus, columns=['text'])
pos_df['polarity'] = 1
neg_df = pd.DataFrame(neg_corpus, columns=['text'])
neg_df['polarity'] = 0
# Stack both frames and rebuild a clean 0..n-1 index
df = pd.concat([pos_df, neg_df]).reset_index(drop = True)
# Check the DataFrame's summary information
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9214 entries, 0 to 9213
Data columns (total 2 columns):
text        9214 non-null object
polarity    9214 non-null int64
dtypes: int64(1), object(1)
memory usage: 144.0+ KB


In [3]:
# As before, the text column is the sample feature and the polarity label
# (cast to int) is the target.
X, y = df['text'], df['polarity'].astype('int')

In [4]:
# Peek at the text of the first sample
X[0]

'买入 长期 持有 沃森 生物 19条 简短 想法'

In [5]:
# Use Keras' Tokenizer to turn the text into integer ids.
# The `filters` string below can be used as-is: it makes the tokenizer drop
# common punctuation marks and special characters.
tokenizer = Tokenizer(
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
)
# Fit the tokenizer on the text data
tokenizer.fit_on_texts(X)
# The learned vocabulary is stored in word_index
vocab = tokenizer.word_index
# Check how many vocabulary entries (features) were found
len(vocab)

15644

In [6]:
# For the curious: look at the first ten vocabulary entries
slice_dict = dict(list(vocab.items())[:10])
slice_dict

{'不': 1,
 '今天': 2,
 '大': 3,
 '涨停': 4,
 '明天': 5,
 '跌': 6,
 '大盘': 7,
 '都': 8,
 '涨': 9,
 '股': 10}

In [7]:
# Import scikit-learn's dataset splitting utility
from sklearn.model_selection import train_test_split
# Split the data into a training set and a validation set
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=30
)
# texts_to_sequences converts each text into a sequence of integer ids;
# the result can be thought of as an array per sample
X_train_ids = tokenizer.texts_to_sequences(X_train)

In [9]:
# Compare with the original training texts to see what
# texts_to_sequences actually does
X_train.head(5)

3467    目前 走势 良好 继续 持股 观望
8296       伊利 行情 会 不会 一日游
6357        现在 价位 盈利 真 不信
3729    不要 后知后觉 尾盘 大涨 酷 酷
9196    收市 前仓底 应该 呵呵 继续 跌
Name: text, dtype: object

In [8]:
# Inspect the first five converted training samples (lists of word ids)
X_train_ids[:5]

[[110, 70, 1066, 18, 102, 403],
 [1812, 48, 16, 300, 14683],
 [59, 753, 409, 106, 2925],
 [86, 9892, 51, 165, 857, 857],
 [2461, 15632, 99, 314, 18, 6]]

In [10]:
# To give every sample the same number of features after vectorization,
# pad the sequences with pad_sequences.
# maxlen=64 fixes the LENGTH of every sequence at 64 ids: shorter sequences
# are filled with 0 (on the left, as the output below shows). It does NOT
# select the 64 most frequent words as features.
# NOTE(review): with the Keras defaults, sequences longer than maxlen are
# truncated to 64 ids from the front -- confirm against the installed version.
X_train_padded = pad_sequences(X_train_ids,maxlen = 64)
# Inspect the padded sequences
X_train_padded[:5]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 110, 70, 1066, 18, 102,
        403],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1812, 48, 16, 300,
        14683],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 753, 409, 106,
        2925],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 86, 9892, 51, 165, 857,
        857],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [11]:
# Alternatively, sequences_to_matrix keeps the whole vocabulary as features
X_train_matrix = tokenizer.sequences_to_matrix(
    X_train_ids,
    mode='binary',
)
# Inspect the first row of the resulting matrix
X_train_matrix[0]

array([0.0, 0.0, 0.0, ..., 0.0, 0.0, 0.0])

In [12]:
# Number of features per sample after the matrix conversion
X_train_matrix.shape[1]

15645

In [13]:
# Convert the validation texts into id sequences ...
X_test_ids = tokenizer.texts_to_sequences(X_test)
# ... and pad them to the same length as the training sequences
X_test_padded = pad_sequences(
    X_test_ids,
    maxlen=64,
)
# Inspect the result
X_test_padded[:5]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 276, 60, 229, 2951, 38, 1,
        786, 478],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 86, 3347, 959, 5,
        462],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 53, 110, 10369, 670,
        10370, 10371, 45],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 265, 121, 1176,
        3619, 9],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [14]:
# Convert the validation set into a binary matrix as well
X_test_matrix = tokenizer.sequences_to_matrix(
    X_test_ids,
    mode='binary',
)
# Inspect the first row
X_test_matrix[0]

array([0.0, 1.0, 0.0, ..., 0.0, 0.0, 0.0])

In [15]:
# Now the model can be built, using a Sequential model.
# The layers are passed in order:
#   1. a fully connected layer with 16 hidden units and relu activation;
#      input_shape is the number of sample features (len(vocab) + 1)
#   2. a Dropout layer to reduce the risk of overfitting
#   3. a final fully connected layer with sigmoid activation, whose output
#      is the probability that a sample belongs to label "1"
model = Sequential([
    Dense(16, input_shape=(len(vocab) + 1,), activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])
# With the layers stacked, compile the model
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Show a summary of the model
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 16)                250336    
_________________________________________________________________
dropout_1 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 250,353
Trainable params: 250,353
Non-trainable params: 0
_________________________________________________________________


In [16]:
# Train the model:
#   - mini-batches of 128 samples
#   - 10 epochs
#   - the matrix-encoded validation set is used as validation data
hist = model.fit(X_train_matrix, y_train,
              batch_size=128,
              epochs=10,
              validation_data=(X_test_matrix, y_test))
# With metrics=['accuracy'], older Keras releases store the validation
# accuracy under 'val_acc' while newer ones use 'val_accuracy'. Pick whichever
# key is present instead of hard-coding one, so the cell does not raise
# KeyError after a Keras upgrade.
val_acc_key = 'val_acc' if 'val_acc' in hist.history else 'val_accuracy'
# Highest validation accuracy reached during training
best_acc = max(hist.history[val_acc_key])
# Display the best validation accuracy
best_acc

Train on 6910 samples, validate on 2304 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


0.8958333333333334

In [17]:
# The evaluate method can also be used to assess the model on the
# validation data
model.evaluate(X_test_matrix, y_test)



[0.2868647529847092, 0.8958333333333334]

In [18]:
# Predicting with the model works much like it does in scikit-learn.
# Pick one validation sample (kept 2-D by slicing) and try it out.
model.predict(X_test_matrix[:1])

array([[0.6495502]], dtype=float32)

In [19]:
# Compare with the true label to see whether the prediction is correct
y_test.head(1)

3596    1
Name: polarity, dtype: int64

In [None]:
# Import callback utilities that can stop training once the model's
# performance is at its best
from keras.callbacks import Callback, EarlyStopping, ModelCheckpoint