In [1]:
%matplotlib inline 
# 魔法命令，使用后画图不用show了

import numpy as np
import matplotlib.pyplot as plt
import re# 引入正则

import warnings
# Silence every warning for the rest of the session. Be aware that this also hides
# deprecation warnings emitted by the libraries used further down.
warnings.filterwarnings("ignore")

# 1.解压词向量并加载

## 1.1解压词向量

In [2]:
import bz2# 用来解压文件

In [3]:
# Stream the bz2 archive into a plain file 100 KiB at a time, so the compressed
# data never has to be held in memory all at once.
with open("./embeddings/sgns.weibo.bigram", 'wb') as extracted:
    with open("./embeddings/sgns.weibo.bigram.bz2", 'rb') as archive:
        unpacker = bz2.BZ2Decompressor()
        while True:
            chunk = archive.read(100 * 1024)
            if chunk == b'':
                # read() returns an empty bytes object at end of file
                break
            extracted.write(unpacker.decompress(chunk))

## 1.2加载词向量

In [4]:
from gensim.models import KeyedVectors# gensim用来加载预训练词向量

In [5]:
# Load the decompressed vectors in word2vec text format (binary=False).
# unicode_errors="ignore" tells gensim to ignore bytes it cannot decode.
cn_model = KeyedVectors.load_word2vec_format(
    fname='./embeddings/sgns.weibo.bigram',
    binary=False,
    unicode_errors="ignore",
)

# 2.语料预处理

In [6]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## 2.1读取原始文本
* weibo：DataFrame存储的博文及其对应标签
* content：list存储的原始文本字符串
* label：标签，1为非谣言

In [7]:
import pandas as pd

In [8]:
# Read the tab-separated corpus (label, text) and drop rows with missing values.
weibo = pd.read_csv(
    './data/all_data.txt',
    sep='\t',
    names=['is_not_rumor', 'content'],
    encoding='utf-8',
).dropna()
weibo.head()

Unnamed: 0,is_not_rumor,content
0,0,【超市手推车插入式摆放宛若同性爱姿势，专家建议取缔[汗]】教育专家王建立近日指出，超市的手推...
1,0,紧急通知，汕头市出事了！揭阳市出事了！潮州、汕尾、深圳、广州都相继出事了！大家一定要互相转告...
2,1,分手后，不要回想甜蜜往事，因为会让自己更痛苦；不要怀疑TA的决定，因为TA已经决定了；不要尝...
3,0,注意啦！【打针西瓜】入夏，西瓜成为首选的消暑食品，但黑心商贩却把针头对准了尚未成熟的西瓜。“...
4,0,2012年2月3日当地时间下午3点41分有个印度妇女生育了11个小孩！ 瞬间就被震惊了


In [9]:
# (rows, columns) of the frame after rows with missing values were dropped.
weibo.shape

(3387, 2)

In [10]:
# Extract both DataFrame columns as plain Python lists.
content = weibo["content"].tolist()
label = weibo["is_not_rumor"].tolist()

In [11]:
# Show two raw posts (positions 3 and 4) as a quick look at the text.
print(content[3:5])

['注意啦！【打针西瓜】入夏，西瓜成为首选的消暑食品，但黑心商贩却把针头对准了尚未成熟的西瓜。“打针西瓜”所注射的禁用食品添加剂甜蜜素和胭脂红！打过针的西瓜瓜瓤呈红色，汁液也很“丰富”，但没有一点西瓜味。所用添加剂破坏 肝脏、肾脏的功能、影响儿童智力发育等毒性！', '2012年2月3日当地时间下午3点41分有个印度妇女生育了11个小孩！ 瞬间就被震惊了']


## 2.2进行分词和tokenize

https://github.com/lancopku/PKUSeg-python

对每一条微博文本text，
1. 去掉每个样本的标点符号；
2. 用pkuseg分词，得到存放分词结果的cut_list；
3. 去掉cut_list中的停用词得到cut_list_clean；
4. 将分词结果cut_list_clean索引化（使用北京师范大学中文信息处理研究所与中国人民大学 DBIIR 实验室的研究者开源的"chinese-word-vectors"），这样每一条微博文本就变成一段索引数字，对应着预训练词向量模型中的词。

将每个text的结果存到train_tokens中。

In [12]:
import pkuseg

In [13]:
# Load the stop-word file (one word per line, quoting disabled via quoting=3)
# and keep the single column as a Python list.
stopwords = pd.read_csv(
    "./stopwords/stopwords.txt",
    index_col=False,
    sep="\t",
    quoting=3,
    names=['stopword'],
    encoding='utf-8',
)["stopword"].tolist()

In [14]:
seg = pkuseg.pkuseg(model_name='web')  # the matching domain-specific model is downloaded automatically

In [15]:
# Turn every post into a list of indices into the pretrained embedding vocabulary.

# Compiled once instead of once per post. A raw string is used because the pattern
# contains regex escapes such as \s that are not valid Python string escapes.
punctuation_re = re.compile(r"[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）]+")
# A set gives constant-time membership tests; the list had to be scanned per token.
stopword_set = set(stopwords)

train_tokens = []
for text in content:
    # Strip whitespace and ASCII / full-width punctuation.
    text = punctuation_re.sub("", text)
    # Segment the cleaned text with pkuseg.
    cut_list = seg.cut(text)
    # Drop stop words.
    cut_list_clean = [word for word in cut_list if word not in stopword_set]

    # Map each remaining word to its index in the pretrained vocabulary.
    token_ids = []
    for word in cut_list_clean:
        try:
            token_ids.append(cn_model.vocab[word].index)
        except KeyError:
            # Words missing from the vocabulary are encoded as 0.
            # NOTE(review): 0 is also a real vocabulary index and the value
            # pad_sequences pads with, so the three cases are indistinguishable.
            token_ids.append(0)
    train_tokens.append(token_ids)

## 2.3索引长度标准化

因为每条微博的长度是不一样的，如果单纯取最长的一条微博，并把其他微博都填充成同样的长度，会十分浪费计算资源，所以取一个折衷的长度。

In [16]:
# Token count of every post.
num_tokens = np.array([len(tokens) for tokens in train_tokens])
# Mean length plus two standard deviations: if the lengths were normally
# distributed this would cover roughly 95% of the samples.
max_tokens = int(np.mean(num_tokens) + 2 * np.std(num_tokens))
max_tokens

58

### padding（填充）和truncating（修剪）

我们把文本转换为tokens（索引）之后，每一串索引的长度并不相等，所以为了方便模型的训练我们需要把索引的长度标准化，上面我们选择了max_tokens个可以涵盖95%训练样本的长度，接下来我们进行padding和truncating，我们一般采用'pre'的方法，这会在文本索引的前面填充0，因为根据一些研究资料中的实践，如果在文本索引后面填充0的话，会对模型造成一些不良影响。 

进行padding和truncating， 输入的train_tokens是一个list
返回的train_pad是一个numpy array

In [17]:
# Bring every index list to exactly max_tokens entries: shorter ones are padded
# at the front, longer ones lose their leading tokens.
train_pad = pad_sequences(
    train_tokens,
    maxlen=max_tokens,
    padding='pre',
    truncating='pre',
)

## 2.4准备Embedding Matrix

* 作为模型的输入，需要准备一个维度为 $(num\_words,\ embedding\_dim)$ 的embedding矩阵，num_words代表使用的词汇的数量。


* 不进行词向量的训练，而是使用预训练的词向量——北京师范大学中文信息处理研究所与中国人民大学 DBIIR 实验室的研究者开源的"chinese-word-vectors"；https://github.com/Embedding/Chinese-Word-Vectors ；embedding dimension在现在使用的预训练词向量模型中是300，每一个词汇都用一个长度为300的向量表示。


* 注意只选择使用前50k个使用频率最高的词，在这个预训练词向量模型中，一共有260万词汇量，如果全部使用在分类问题上会很浪费计算资源，因为训练样本很小，如果有更多的训练样本时，在分类问题上可以考虑减少使用的词汇量。

In [18]:
num_words = 50000
embedding_dim = 300
# Words at vocabulary positions 0 .. num_words-1, in index order.
top_words = [cn_model.index2word[rank] for rank in range(num_words)]
# Matrix of shape (num_words, embedding_dim) = 50000 x 300 that is later handed
# to the Keras Embedding layers; row k holds the vector of the word with index k.
embedding_matrix = np.zeros((num_words, embedding_dim))
for row, word in enumerate(top_words):
    embedding_matrix[row] = cn_model[word]
embedding_matrix = embedding_matrix.astype('float32')
# Sanity check that rows line up with vocabulary indices: an output of 300 means
# all 300 components of row 333 equal the model's vector for word 333.
np.sum(cn_model[cn_model.index2word[333]] == embedding_matrix[333])

300

In [19]:
# Indices beyond the num_words rows of the embedding matrix are replaced by 0.
oov_mask = train_pad >= num_words
train_pad[oov_mask] = 0

# Target vector: one label per post, taken from the is_not_rumor column.
train_target = np.asarray(label)

# 3.训练语料

In [20]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from utils import Attention,convolution
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau

## 3.1划分训练集和测试集

In [21]:
# Split the padded samples into a training set and a test set.
from sklearn.model_selection import train_test_split
# 90% of the samples are used for training, the remaining 10% for testing;
# random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_pad, train_target, test_size=0.1, random_state=12
)

## 3.2搭建网络结构

## LSTM
* Embedding：使用预训练词向量，参数不可训练
* 双向LSTM，参数64
* 双向LSTM，参数32
* 全连接层，参数64
* 全连接层，参数1

In [22]:
# Two stacked bidirectional LSTMs on top of the frozen pretrained embeddings,
# followed by a dense layer and a sigmoid output for binary classification.
model = Sequential([
    Embedding(num_words,
              embedding_dim,
              weights=[embedding_matrix],
              input_length=max_tokens,
              trainable=False),
    Bidirectional(LSTM(units=64, return_sequences=True)),
    Bidirectional(LSTM(units=32, return_sequences=False)),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])
# `lr` is a deprecated alias; `learning_rate` is the supported argument name.
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=1e-3)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 58, 300)           15000000  
                                                                 
 bidirectional (Bidirectiona  (None, 58, 128)          186880    
 l)                                                              
                                                                 
 bidirectional_1 (Bidirectio  (None, 64)               41216     
 nal)                                                            
                                                                 
 dense (Dense)               (None, 64)                4160      
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 15,232,321
Trainable params: 232,321
Non-t

## GRU
* Embedding：使用预训练词向量，参数不可训练
* 双向GRU，参数32
* 全连接层，参数6
* 全连接层，参数1

In [23]:
# A single bidirectional GRU on top of the frozen pretrained embeddings,
# followed by a small dense layer and a sigmoid output.
model1 = Sequential([
    Embedding(num_words,
              embedding_dim,
              weights=[embedding_matrix],
              input_length=max_tokens,
              trainable=False),
    Bidirectional(GRU(32)),
    Dense(6, activation='relu'),
    Dense(1, activation='sigmoid'),
])
# `lr` is a deprecated alias; `learning_rate` is the supported argument name.
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=1e-3)
model1.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 58, 300)           15000000  
                                                                 
 bidirectional_2 (Bidirectio  (None, 64)               64128     
 nal)                                                            
                                                                 
 dense_2 (Dense)             (None, 6)                 390       
                                                                 
 dense_3 (Dense)             (None, 1)                 7         
                                                                 
Total params: 15,064,525
Trainable params: 64,525
Non-trainable params: 15,000,000
_________________________________________________________________


## CNN+LSTM+ATTENTION
* Embedding：使用预训练词向量，参数不可训练
* 一维卷积层，32个卷积核，卷积核大小为1
* 双向LSTM，参数64
* 注意力层
* 单向LSTM，参数32
* 全连接层，参数64
* 全连接层，参数1

In [24]:
# Conv1D -> bidirectional LSTM -> attention -> LSTM on top of the frozen
# pretrained embeddings, followed by a dense layer and a sigmoid output.
model2 = Sequential([
    Embedding(num_words,
              embedding_dim,
              weights=[embedding_matrix],
              input_length=max_tokens,
              trainable=False),
    Conv1D(32, 1, activation='relu'),
    Bidirectional(LSTM(units=64, return_sequences=True)),
    # Attention is the project layer imported from utils.
    Attention(return_sequences=True),
    LSTM(units=32, return_sequences=False),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])
# `lr` is a deprecated alias; `learning_rate` is the supported argument name.
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=1e-3)
model2.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 58, 300)           15000000  
                                                                 
 conv1d (Conv1D)             (None, 58, 32)            9632      
                                                                 
 bidirectional_3 (Bidirectio  (None, 58, 128)          49664     
 nal)                                                            
                                                                 
 attention (Attention)       (None, 58, 128)           186       
                                                                 
 lstm_3 (LSTM)               (None, 32)                20608     
                                                                 
 dense_4 (Dense)             (None, 64)                2112      
                                                      

## TextCNN
* Embedding：使用预训练词向量，参数不可训练
* Reshape：将Embedding输出变形为 (max_tokens, embedding_dim, 1)，即 (58, 300, 1)
* 卷积模块（utils中的convolution），输出192维特征
* Flatten展平
* 全连接层，参数10
* Dropout，参数0.2
* 全连接层，参数1

In [25]:
# TextCNN-style classifier: the embedded sequence gets a trailing channel axis
# and is passed through the project's convolution block.
model3 = Sequential([
    Embedding(num_words,
              embedding_dim,
              weights=[embedding_matrix],
              input_length=max_tokens,
              trainable=False),
    # Derive the shape from max_tokens / embedding_dim instead of hard-coding
    # 58 x 300, so the layer still matches if the corpus statistics change.
    Reshape((max_tokens, embedding_dim, 1)),
    # Project helper imported from utils.
    # NOTE(review): confirm it does not itself assume a fixed 58 x 300 input.
    convolution(),
    Flatten(),
    Dense(10, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid'),
])
model3.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 58, 300)           15000000  
                                                                 
 reshape (Reshape)           (None, 58, 300, 1)        0         
                                                                 
 model (Functional)          (None, 1, 1, 192)         230592    
                                                                 
 flatten (Flatten)           (None, 192)               0         
                                                                 
 dense_6 (Dense)             (None, 10)                1930      
                                                                 
 dropout (Dropout)           (None, 10)                0         
                                                                 
 dense_7 (Dense)             (None, 1)                

## 3.3模型配置

### 模型保存（断点续训）、early stopping、学习率

In [28]:
import os

In [29]:
# Location of the LSTM model's weight checkpoint. If an index file for that
# checkpoint already exists, restore the saved weights.
checkpoint_save_path = "./checkpoint/rumor_LSTM.ckpt"
if os.path.exists(f"{checkpoint_save_path}.index"):
    print('----------load the model----------')
    model.load_weights(checkpoint_save_path)

In [30]:
# Callback that writes the weights only (save_weights_only) whenever val_loss
# reaches a new best value (save_best_only).
checkpoint = ModelCheckpoint(
    filepath=checkpoint_save_path,
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

In [31]:
# Early stopping: end training once val_loss has gone 5 epochs without improving.
earlystopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

# Automatic learning-rate decay: multiply the rate by 0.1, never going below 1e-8.
# With patience=0 a single epoch without improvement already triggers a reduction.
lr_reduction = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.1,
    patience=0,
    min_lr=1e-8,
    verbose=1,
)

# Callbacks passed to fit(). The `checkpoint` callback is currently not part of
# this list (it was commented out), so no weights are written during training.
callbacks = [earlystopping, lr_reduction]

In [32]:
# Give this model its own optimizer. The shared `optimizer` variable was reused by
# all four models, and ReduceLROnPlateau lowers the learning rate stored on that
# object, so models trained later would start from an already decayed rate.
model.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=1e-3),
              loss='binary_crossentropy',
              metrics=['accuracy'])

========================model1===================

In [33]:
# Location of the GRU model's weight checkpoint. If an index file for that
# checkpoint already exists, restore the saved weights.
checkpoint_save_path1 = "./checkpoint/rumor_GRU.ckpt"
if os.path.exists(f"{checkpoint_save_path1}.index"):
    print('----------load the model----------')
    model1.load_weights(checkpoint_save_path1)

In [34]:
# Callback that writes model1's weights only, whenever val_loss reaches a new best.
# It is not part of the `callbacks` list that the fit() calls use.
checkpoint1 = ModelCheckpoint(
    filepath=checkpoint_save_path1,
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

In [35]:
# Give this model its own optimizer. The shared `optimizer` variable was reused by
# all four models, and ReduceLROnPlateau lowers the learning rate stored on that
# object, so this model would otherwise start from an already decayed rate.
model1.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=1e-3),
               loss='binary_crossentropy',
               metrics=['accuracy'])

========================model2===================

In [36]:
# Location of the CNN+LSTM model's weight checkpoint. If an index file for that
# checkpoint already exists, restore the saved weights.
checkpoint_save_path2 = "./checkpoint/rumor_CNN_LSTM.ckpt"
if os.path.exists(f"{checkpoint_save_path2}.index"):
    print('----------load the model----------')
    model2.load_weights(checkpoint_save_path2)

In [37]:
# Callback that writes model2's weights only, whenever val_loss reaches a new best.
# It is not part of the `callbacks` list that the fit() calls use.
checkpoint2 = ModelCheckpoint(
    filepath=checkpoint_save_path2,
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

In [38]:
# Give this model its own optimizer. The shared `optimizer` variable was reused by
# all four models, and ReduceLROnPlateau lowers the learning rate stored on that
# object, so this model would otherwise start from an already decayed rate.
model2.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=1e-3),
               loss='binary_crossentropy',
               metrics=['accuracy'])

========================model3===================

In [39]:
# Location of the TextCNN model's weight checkpoint. If an index file for that
# checkpoint already exists, restore the saved weights.
checkpoint_save_path3 = "./checkpoint/rumor_TextCNN.ckpt"
if os.path.exists(f"{checkpoint_save_path3}.index"):
    print('----------load the model----------')
    model3.load_weights(checkpoint_save_path3)

In [40]:
# Callback that writes model3's weights only, whenever val_loss reaches a new best.
# It is not part of the `callbacks` list that the fit() calls use.
checkpoint3 = ModelCheckpoint(
    filepath=checkpoint_save_path3,
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)

In [41]:
# Give this model its own optimizer. The model3 cell defines none, so it silently
# picked up the instance created for model2, whose learning rate ReduceLROnPlateau
# may already have lowered during earlier training runs.
model3.compile(optimizer=tf.keras.optimizers.legacy.Adam(learning_rate=1e-3),
               loss='binary_crossentropy',
               metrics=['accuracy'])

## 3.4训练模型

In [35]:
# Train the LSTM model for at most 20 epochs, holding out 10% of the training
# split for validation; the shared callbacks may stop training earlier.
model.fit(
    X_train,
    y_train,
    validation_split=0.1,
    epochs=20,
    batch_size=128,
    callbacks=callbacks,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 7: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 8/20
Epoch 8: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 9/20
Epoch 9: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 10/20
Epoch 10: ReduceLROnPlateau reducing learning rate to 1.0000001111620805e-07.
Epoch 11/20
Epoch 11: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-08.
Epoch 11: early stopping


<keras.callbacks.History at 0x1e0881a39c8>

======================model1===========================

In [45]:
# Train the GRU model for at most 20 epochs, holding out 10% of the training
# split for validation; the shared callbacks may stop training earlier.
model1.fit(
    X_train,
    y_train,
    validation_split=0.1,
    epochs=20,
    batch_size=128,
    callbacks=callbacks,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 8: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 9/20
Epoch 9: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 10/20
Epoch 10: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 11/20
Epoch 11: ReduceLROnPlateau reducing learning rate to 1.0000001111620805e-07.
Epoch 12/20
Epoch 12: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-08.
Epoch 12: early stopping


<keras.callbacks.History at 0x1e0a39cd448>

In [89]:
# Train the CNN+LSTM+attention model for at most 20 epochs, holding out 10% of
# the training split for validation; the shared callbacks may stop training earlier.
model2.fit(
    X_train,
    y_train,
    validation_split=0.1,
    epochs=20,
    batch_size=128,
    callbacks=callbacks,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 6/20
Epoch 7/20
Epoch 7: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 8/20
Epoch 8: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 9/20
Epoch 9: ReduceLROnPlateau reducing learning rate to 1.0000001111620805e-07.
Epoch 10/20
Epoch 10: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-08.
Epoch 11/20
Epoch 11: ReduceLROnPlateau reducing learning rate to 1e-08.
Epoch 11: early stopping


<keras.callbacks.History at 0x1e114446c88>

In [42]:
# Train the TextCNN model for at most 20 epochs, holding out 10% of the training
# split for validation; the shared callbacks may stop training earlier.
model3.fit(
    X_train,
    y_train,
    validation_split=0.1,
    epochs=20,
    batch_size=128,
    callbacks=callbacks,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 7/20
Epoch 7: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 8/20
Epoch 8: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 9/20
Epoch 9: ReduceLROnPlateau reducing learning rate to 1.0000001111620805e-07.
Epoch 10/20
Epoch 10: ReduceLROnPlateau reducing learning rate to 1.000000082740371e-08.
Epoch 10: early stopping


<keras.callbacks.History at 0x22d67f97208>

## 3.5应用于测试集

In [46]:
# Evaluate the LSTM model on the held-out test set; entry 1 of the result is the
# accuracy metric configured in compile().
result = model.evaluate(X_test, y_test)
accuracy = result[1]
print('Accuracy:{0:.2%}'.format(accuracy))

Accuracy:85.25%


In [47]:
# Evaluate the GRU model on the held-out test set; entry 1 of the result is the
# accuracy metric configured in compile().
result = model1.evaluate(X_test, y_test)
accuracy = result[1]
print('Accuracy:{0:.2%}'.format(accuracy))

Accuracy:85.55%


In [90]:
# Evaluate the CNN+LSTM+attention model on the held-out test set; entry 1 of the
# result is the accuracy metric configured in compile().
result = model2.evaluate(X_test, y_test)
accuracy = result[1]
print('Accuracy:{0:.2%}'.format(accuracy))

Accuracy:84.07%


In [43]:
# Evaluate the TextCNN model on the held-out test set; entry 1 of the result is
# the accuracy metric configured in compile().
result = model3.evaluate(X_test, y_test)
accuracy = result[1]
print('Accuracy:{0:.2%}'.format(accuracy))

Accuracy:87.32%
