# 电影评论情感分类：RNN

In [1]:
import collections
import os
import tarfile
import random
import torch
import torch.nn as nn
import torchtext.vocab as Vocab
import torch.utils.data as Data
import time
from tqdm import tqdm
import  torch.nn.functional as F

import sys
# Expose only GPU 0 to this process, then select CUDA when it is available and
# fall back to the CPU otherwise.
os.environ["CUDA_VISIBLE_DEVICES"]="0"
device=torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Root directory that holds the IMDb archive and its extracted contents.
DATA_ROOT="../data"

In [2]:
# Extract the IMDb archive once; the step is skipped when the extracted
# `aclImdb` directory already exists under DATA_ROOT.
fname=os.path.join(DATA_ROOT,"aclImdb_v1.tar.gz")
if not os.path.exists(os.path.join(DATA_ROOT,'aclImdb')):
    # NOTE(review): the message says "compress" although the archive is being extracted.
    print("compress from tarfile.")
    with tarfile.open(fname,'r') as f:
        # NOTE(review): extractall() trusts member paths; only use it on a trusted archive.
        f.extractall(DATA_ROOT)

In [3]:
# Load one split of the IMDb data set
def read_imdb(folder="train",data_root="../data/aclImdb"):
    """Read every review of one data split into memory.

    Args:
        folder: Sub-directory of ``data_root`` to read (e.g. "train" or "test").
        data_root: Directory containing the split sub-directories, each of
            which holds a ``pos`` and a ``neg`` directory of review files.

    Returns:
        A shuffled list of ``[review_text, label]`` pairs. The text is UTF-8
        decoded, stripped of newline characters and lower-cased; the label is
        1 for reviews found under ``pos`` and 0 for those under ``neg``.
    """
    samples = []
    for sentiment, target in (("pos", 1), ("neg", 0)):
        split_dir = os.path.join(data_root, folder, sentiment)
        for file_name in tqdm(os.listdir(split_dir)):
            file_path = os.path.join(split_dir, file_name)
            with open(file_path, 'rb') as handle:
                raw_bytes = handle.read()
            text = raw_bytes.decode('utf-8').replace('\n', '').lower()
            samples.append([text, target])
    random.shuffle(samples)
    return samples
# Read both splits from disk; each call returns a shuffled list of [review, label] pairs.
train_data, test_data = read_imdb('train'), read_imdb('test')

100%|██████████| 12500/12500 [00:04<00:00, 2961.68it/s]
100%|██████████| 12500/12500 [00:04<00:00, 2847.36it/s]
100%|██████████| 12500/12500 [00:04<00:00, 3047.81it/s]
100%|██████████| 12500/12500 [00:04<00:00, 3020.20it/s]


In [4]:
# Inspect the format of one sample
train_data[0]

['i\'ll keep this short; thanks to greg for helping me to put this succinctly: captivity is about a guy who drugs a girl\'s drink, imprisons and tortures her, then poses as a captive to have sex with her. that is the single twist and punchline of the film. it\'s torture as slow motion date rape. and, it\'s not even a good movie. it\'s not so bad it\'s good; it\'s just bad.<br /><br />it should also be mentioned that among critics, there is a "spoiler code" that they dare not break, even though some were tempted to on this one because it is so vile. why no one had the cojones to step up and say, "this is garbage, and this is why," is beyond me.<br /><br />don\'t give your money to these poop-peddlers.',
 0]

In [5]:
# Split reviews into tokens
def get_tokenized_imdb(data):
    """Tokenize every review in ``data``.

    Args:
        data: list of ``[review, label]`` pairs; the labels are ignored.

    Returns:
        One list of lower-cased tokens per review, obtained by splitting the
        review on single space characters (so consecutive spaces yield empty
        tokens).
    """
    tokenized = []
    for review, _ in data:
        tokenized.append([piece.lower() for piece in review.split(" ")])
    return tokenized

# Build the vocabulary from the tokenized reviews
def get_vocab_imdb(data):
    """Build a vocabulary from the tokens of all reviews in ``data``.

    Tokens are counted over the output of ``get_tokenized_imdb`` and the counts
    are handed to ``Vocab.Vocab`` with ``min_freq=5``.
    """
    token_counts = collections.Counter()
    for tokens in get_tokenized_imdb(data):
        token_counts.update(tokens)
    return Vocab.Vocab(token_counts, min_freq=5)

# Build the vocabulary from the training reviews only and report its size.
vocab = get_vocab_imdb(train_data)
print('# words in vocab:', len(vocab))

# words in vocab: 46152


In [6]:
# Truncate or zero-pad every review so that it is exactly max_len tokens long (500 by default).
def precess_imdb(data,vocab,max_len=500):
    """Convert raw reviews into fixed-length index tensors.

    Args:
        data: list of ``[review, label]`` pairs.
        vocab: object whose ``stoi`` mapping turns a token into an integer index.
        max_len: number of token indices kept per review.

    Returns:
        ``(features, labels)``: ``features`` holds one row of ``max_len``
        indices per review (longer reviews are cut, shorter ones are
        right-padded with 0) and ``labels`` holds the label of each pair.
    """
    def fit_length(indices):
        # A negative repeat count yields an empty list, so long inputs are only sliced.
        return (indices + [0] * (max_len - len(indices)))[:max_len]

    rows = []
    for tokens in get_tokenized_imdb(data):
        rows.append(fit_length([vocab.stoi[token] for token in tokens]))
    features = torch.tensor(rows)
    labels = torch.tensor([label for _, label in data])
    return features, labels

In [7]:
# Batch size shared by the training and test loaders
batch_size=64
# Convert each split to (features, labels) tensors and wrap them in a dataset.
train_set=Data.TensorDataset(*precess_imdb(train_data,vocab))
test_set=Data.TensorDataset(*precess_imdb(test_data,vocab))

# Build the data loaders; only the training loader shuffles its samples.
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

In [8]:
# Peek at the first training batch: tensor shapes and the first ten token
# indices of its first sample, then report the number of batches per epoch.
for X, y in train_iter:
    print('X', X.shape, 'y', y.shape)
    print(X[0][:10])
    break
print('#batches:', len(train_iter))

X torch.Size([64, 500]) y torch.Size([64])
tensor([  10,    7,  111, 1430,  649,  289,   34,    2,    0,   17])
#batches: 391


In [9]:
# Bidirectional recurrent classifier
class BiRNN(nn.Module):
    """Bidirectional LSTM sentiment classifier.

    Token indices are embedded, encoded by a (possibly multi-layer)
    bidirectional LSTM, and the encoder outputs at the first and the last time
    step are concatenated and mapped to two class scores.
    """

    def __init__(self,hidden_size,embedding_size,vocab,num_layers):
        """
        Args:
            hidden_size: hidden units per direction of the LSTM.
            embedding_size: dimension of the word embeddings.
            vocab: vocabulary; only ``len(vocab)`` is used, to size the
                embedding table.
            num_layers: number of stacked LSTM layers.
        """
        super().__init__()
        self.embedding = nn.Embedding(len(vocab), embedding_size)
        self.encoder = nn.LSTM(
            input_size=embedding_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Each time step yields 2 * hidden_size features (both directions) and
        # two time steps are concatenated, hence 4 * hidden_size inputs.
        self.fc = nn.Linear(in_features=4 * hidden_size, out_features=2)

    def forward(self,inputs):
        """Score a batch of index sequences.

        Args:
            inputs: integer tensor of shape (batch_size, seq_len).

        Returns:
            Tensor of shape (batch_size, 2) with the unnormalised class scores.
        """
        # (batch_size, seq_len, embedding_size)
        embedded = self.embedding(inputs)
        # (batch_size, seq_len, 2 * hidden_size); the final states are not used
        step_outputs, _ = self.encoder(embedded)
        # Join the encoder outputs of the first and the last time step as the
        # input of the fully connected layer.
        first_step = step_outputs[:, 0, :]
        last_step = step_outputs[:, -1, :]
        return self.fc(torch.cat((first_step, last_step), -1))

In [10]:
# Hyperparameters of the BiRNN: 100-dimensional embeddings, 100 hidden units
# per direction and two stacked LSTM layers.
embed_size, num_hiddens, num_layers = 100, 100, 2
net = BiRNN(num_hiddens,embed_size,vocab,num_layers)

In [11]:
# Load the pretrained 100-dimensional GloVe 6B vectors, cached under ../data/pretrain_models.
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=os.path.join("../data", "pretrain_models"))

In [12]:
# Look up pretrained word vectors
def load_pretrained_embedding(words,pretrained_vocab):
    """Collect the pretrained vector of every word in ``words``.

    Args:
        words: sequence of words; row ``i`` of the result belongs to ``words[i]``.
        pretrained_vocab: object exposing ``stoi`` (word -> row index) and
            ``vectors`` (one row per pretrained word).

    Returns:
        Tensor of shape ``(len(words), vector_dim)``. Rows of words for which
        the ``stoi`` lookup raises ``KeyError`` stay all-zero; their number is
        printed when it is non-zero.
    """
    vector_dim = pretrained_vocab.vectors[0].shape[0]
    table = torch.zeros(len(words), vector_dim)
    missing = 0
    for row, word in enumerate(words):
        try:
            source_row = pretrained_vocab.stoi[word]
            table[row, :] = pretrained_vocab.vectors[source_row]
        except KeyError:
            # Out-of-vocabulary word: leave its row at zero and count it.
            missing += 1
    if missing > 0:
        print("There are %d oov words." % missing)
    return table
# Copy the pretrained vectors into the embedding layer, then mark its weight as
# not requiring gradients so that it stays fixed during training.
net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.embedding.weight.requires_grad = False

There are 21202 oov words.


In [13]:
# Training setup
lr, num_epochs = 0.01, 5
# Hand only the parameters that require gradients to Adam; the frozen
# embedding weight is filtered out.
optimizer=torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)

criterion=nn.CrossEntropyLoss()

In [14]:
def evaluate_accuracy(data_iter, net, device=None):
    """Compute the classification accuracy of ``net`` over ``data_iter``.

    Args:
        data_iter: iterable of ``(X, y)`` batches; ``y`` holds class indices.
        net: model returning one row of class scores per sample.
        device: device the batches are moved to. When ``None`` the device of
            the first model parameter is used (CPU for a parameter-free model).

    Returns:
        Fraction of correctly classified samples as a float; 0.0 when
        ``data_iter`` yields no samples.
    """
    if device is None:
        first_param = next(net.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = net.training
    net.eval()  # evaluation mode, e.g. disables dropout
    correct, total = 0.0, 0
    try:
        with torch.no_grad():
            for X, y in data_iter:
                predictions = net(X.to(device)).argmax(dim=1)
                correct += (predictions == y.to(device)).float().sum().item()
                total += y.shape[0]
    finally:
        # Put the model back into whichever mode the caller had it in.
        net.train(was_training)
    return correct / total if total else 0.0

def train(train_iter,test_iter,net,criterion,optimizer,device,num_epochs):
    """Train ``net`` and print loss/accuracy statistics after every epoch.

    Args:
        train_iter: iterable of ``(X, y)`` training batches.
        test_iter: iterable of ``(X, y)`` batches used for the per-epoch
            test accuracy.
        net: model to train; it is moved to ``device``.
        criterion: loss function called as ``criterion(y_hat, y)``.
        optimizer: optimizer holding the parameters of ``net`` to update.
        device: device on which the model and the batches are placed.
        num_epochs: number of passes over ``train_iter``.

    Returns:
        None. One summary line is printed per epoch.
    """
    net = net.to(device)
    print("training on ", device)
    for epoch in range(num_epochs):
        net.train()
        # All statistics are reset per epoch so that the reported loss is the
        # mean over this epoch's batches only.
        train_l_sum, train_acc_sum, n, batch_count = 0.0, 0.0, 0, 0
        start = time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device).view(-1)
            y_hat = net(X)
            loss = criterion(y_hat, y)

            optimizer.zero_grad()
            # back-propagate the gradients
            loss.backward()
            # update the parameters
            optimizer.step()

            train_l_sum += loss.item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net, device)
        # max(..., 1) keeps the report well-defined for an empty train_iter.
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / max(batch_count, 1), train_acc_sum / max(n, 1),
                 test_acc, time.time() - start))

In [15]:
#train(train_iter, test_iter, net, criterion, optimizer, device, num_epochs)

In [16]:
def predict_sentiment(net,vocab,sentence):
    """Classify a tokenized sentence as 'positive' or 'negative'.

    The model is switched to evaluation mode and gradients are disabled for the
    forward pass, so stochastic layers such as dropout do not affect the
    result; the previous train/eval mode is restored afterwards.

    Args:
        net: classifier returning one row of two class scores for a
            ``(1, seq_len)`` batch of token indices.
        vocab: object whose ``stoi`` mapping turns a token into an index.
        sentence: list-like of tokens.

    Returns:
        'positive' when class 1 has the highest score, otherwise 'negative'.
    """
    # Run on whichever device the model's parameters live on.
    device = next(net.parameters()).device
    indices = torch.tensor([vocab.stoi[word] for word in sentence], device=device)
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            label = torch.argmax(net(indices.view((1, -1))), dim=1)
    finally:
        net.train(was_training)
    return 'positive' if label.item() == 1 else 'negative'

In [17]:
# NOTE: the train(...) call in the cell above is commented out, so unless it is
# re-enabled this prediction comes from a network that has not been trained.
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'great'])

'positive'

# 卷积神经网络textCNN
与 RNN 相比，主要区别在于特征抽取的方式：textCNN 通过一维卷积核在词序列上滑动，来获取相邻词之间的局部信息。

In [18]:
# One-dimensional cross-correlation
def corr1d(X, K):
    """Cross-correlate a 1-D input with a 1-D kernel.

    Args:
        X: 1-D input tensor.
        K: 1-D kernel tensor.

    Returns:
        Tensor created by ``torch.zeros`` with ``len(X) - len(K) + 1``
        elements; element ``i`` is the sum of the elementwise product of
        ``X[i:i + len(K)]`` and ``K``.
    """
    width = K.shape[0]
    num_outputs = X.shape[0] - width + 1
    result = torch.zeros((num_outputs))
    for start in range(num_outputs):
        window = X[start:start + width]
        result[start] = torch.sum(window * K)
    return result

# Example: a length-7 input and a length-2 kernel give 6 outputs.
X, K = torch.tensor([0, 1, 2, 3, 4, 5, 6]), torch.tensor([1, 2])
corr1d(X, K)

tensor([ 2.,  5.,  8., 11., 14., 17.])

In [19]:
# Multi-input-channel 1-D cross-correlation works like its 2-D counterpart:
# each channel is cross-correlated with its own kernel and the per-channel
# results are added up to give the output.
def corr1d_multi_in(X,K):
    """Cross-correlate a multi-channel 1-D input with one kernel per channel.

    Rows of ``X`` and ``K`` are paired up, each pair is passed to ``corr1d``,
    and the per-channel results are summed elementwise.
    """
    channel_results = []
    for channel_input, channel_kernel in zip(X, K):
        channel_results.append(corr1d(channel_input, channel_kernel))
    return torch.stack(channel_results).sum(dim=0)
# Example with three input channels and one length-2 kernel per channel.
X = torch.tensor([[0, 1, 2, 3, 4, 5, 6],
              [1, 2, 3, 4, 5, 6, 7],
              [2, 3, 4, 5, 6, 7, 8]])
K = torch.tensor([[1, 2], [3, 4], [-1, -3]])
corr1d_multi_in(X, K)

tensor([ 2.,  8., 14., 20., 26., 32.])

In [20]:
# Max-over-time pooling layer
class GlobalMaxPool1d(nn.Module):
    """Reduce the last dimension of a 3-D input to its maximum."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # The pooling window spans the whole third dimension, so an input of
        # shape (batch_size, channel, seq_len) becomes (batch_size, channel, 1).
        window = x.shape[2]
        return F.max_pool1d(x, kernel_size=window)

In [21]:
class TextCNN(nn.Module):
    """Convolutional text classifier with two parallel embedding tables.

    Every token is embedded twice (``embedding`` and ``constant_embedding``);
    the two embeddings are concatenated and fed to several 1-D convolutions of
    different kernel widths. Each convolution output is max-pooled over time,
    the pooled features are concatenated, passed through dropout and mapped to
    two class scores.
    """

    def __init__(self,vocab,embedding_size,kernel_sizes,num_channels):
        """
        Args:
            vocab: vocabulary; only ``len(vocab)`` is used, to size the
                embedding tables.
            embedding_size: dimension of each of the two embeddings.
            kernel_sizes: kernel width of each convolution branch.
            num_channels: output channels of each convolution branch, paired
                with ``kernel_sizes`` position by position.
        """
        super(TextCNN,self).__init__()
        self.embedding=nn.Embedding(len(vocab),embedding_size)
        # Second embedding table meant to stay fixed during training; this
        # class does not freeze it itself, the caller has to do so.
        self.constant_embedding=nn.Embedding(len(vocab),embedding_size)
        self.dropout=nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 2)

        # The pooling layer has no weights, so one instance is shared by all branches.
        self.pool = GlobalMaxPool1d()
        self.convs = nn.ModuleList()  # one 1-D convolution per kernel width
        for c, k in zip(num_channels, kernel_sizes):
            # The input channels are the two concatenated embeddings. Use the
            # constructor argument rather than a notebook-level global.
            self.convs.append(nn.Conv1d(in_channels=2*embedding_size, out_channels=c, kernel_size=k))

    def forward(self,inputs):
        """Score a batch of index sequences.

        Args:
            inputs: integer tensor of shape (batch, seq_len); ``seq_len`` must
                be at least the largest kernel width.

        Returns:
            Tensor of shape (batch, 2) with the unnormalised class scores.
        """
        # (batch, seq_len, 2*embedding_size)
        embeddings = torch.cat((self.embedding(inputs), self.constant_embedding(inputs)), dim=2)
        # Conv1d expects the channel (embedding) dimension before the sequence
        # dimension: (batch, 2*embedding_size, seq_len).
        embeddings = embeddings.permute(0, 2, 1)
        # Each branch yields (batch, channels, 1) after max-over-time pooling;
        # drop the trailing dimension and concatenate along the channel dimension.
        encoding = torch.cat([self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs], dim=1)
        # Apply dropout, then the fully connected output layer.
        outputs = self.decoder(self.dropout(encoding))
        return outputs

In [22]:
# Three convolution branches with kernel widths 3, 4 and 5 and 100 output
# channels each. Note that `net` is rebound here and no longer refers to the BiRNN.
embed_size, kernel_sizes, nums_channels = 100, [3, 4, 5], [100, 100, 100]
net = TextCNN(vocab, embed_size, kernel_sizes, nums_channels)

In [23]:
# Load the pretrained GloVe vectors
glove_vocab = Vocab.GloVe(name='6B', dim=100,cache=os.path.join("../data", "pretrain_models"))

# Initialise both embedding layers from GloVe; only constant_embedding is
# marked as not requiring gradients.
net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.requires_grad = False

There are 21202 oov words.
There are 21202 oov words.


In [24]:
# Training setup for TextCNN; parameters that do not require gradients are
# filtered out before they reach the optimizer.
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, net.parameters()), lr=lr)
criterion = nn.CrossEntropyLoss()
#train(train_iter, test_iter, net, criterion, optimizer, device, num_epochs)

In [25]:
# NOTE: the train(...) call in the cell above is commented out, so unless it is
# re-enabled this prediction comes from a network that has not been trained.
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'great'])

'negative'