In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import jieba
import os
from torch.nn import init
from torchtext import data
from torchtext.vocab import Vectors
import time

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# data processing

In [3]:
# 分词
def tokenizer(text): 
    return [word for word in jieba.lcut(text) if word not in stop_words]

In [4]:
# 去停用词
def get_stop_words():
    file_object = open('data/stopwords.txt',encoding='utf-8')
    stop_words = []
    for line in file_object.readlines():
        line = line[:-1]
        line = line.strip()
        stop_words.append(line)
    return stop_words

stop_words = get_stop_words()  # 加载停用词表

In [5]:
text = data.Field(sequential=True,
                  lower=True,
                  tokenize=tokenizer,
                  stop_words=stop_words)
label = data.Field(sequential=False)

In [6]:
train, val = data.TabularDataset.splits(
    path='data/',
    skip_header=True,
    train='train.tsv',
    validation='validation.tsv',
    format='tsv',
    fields=[('index', None), ('label', label), ('text', text)],
)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\ADMINI~1\AppData\Local\Temp\jieba.cache
Loading model cost 2.157 seconds.
Prefix dict has been built successfully.


In [7]:
print(train[2].text)
print(train[5].__dict__.keys())

['油耗', '显示', '13', '升', '多一点', '希望', '慢慢', '下降', '倒车', '雷达', '真', '可恨']
dict_keys(['label', 'text'])


In [8]:
#加载Google训练的词向量
import gensim
model = gensim.models.KeyedVectors.load_word2vec_format('data/myvector.vector', binary=False)

In [9]:
cache = 'data/.vector_cache'
if not os.path.exists(cache):
    os.mkdir(cache)
vectors = Vectors(name='data/myvector.vector', cache=cache)
# 指定Vector缺失值的初始化方式，没有命中的token的初始化方式
#vectors.unk_init = nn.init.xavier_uniform_

text.build_vocab(train, val, vectors=vectors)#加入测试集的vertor

In [10]:
#text.build_vocab(train, val, vectors=Vectors(name='data/myvector.vector'))#加入测试集的vertor
label.build_vocab(train, val)

embedding_dim = text.vocab.vectors.size()[-1]
vectors = text.vocab.vectors

In [12]:
text.vocab.freqs.most_common(10)
print(text.vocab.vectors.shape)

[('空间', 13734),
 ('外观', 10641),
 ('满意', 9777),
 ('车', 8260),
 ('动力', 7856),
 ('油耗', 7540),
 ('高', 6119),
 ('内饰', 6068),
 ('感觉', 5383),
 ('配置', 4596)]

torch.Size([34841, 100])


In [13]:
batch_size=128
train_iter, val_iter = data.Iterator.splits(
            (train, val),
            sort_key=lambda x: len(x.text),
            batch_sizes=(batch_size, len(val)), # 训练集设置batch_size,验证集整个集合用于测试
    )

vocab_size = len(text.vocab)
label_num = len(label.vocab)

In [14]:
batch = next(iter(train_iter))
data = batch.text
print(batch.text.shape)
print(batch.text)

torch.Size([35, 128])
tensor([[  23,  499,   48,  ...,    3,   97,    3],
        [5755, 1179, 5475,  ...,  348,  646,  217],
        [   9, 3741,   10,  ...,   31,  411,   89],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]])


# model

In [35]:
class BiRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, num_hiddens, num_layers):
        super(BiRNN, self).__init__()
        # embedding之后的shape: torch.Size([200, 8, 300])
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.word_embeddings = self.word_embeddings.from_pretrained(
            vectors, freeze=False)
        # bidirectional设为True即得到双向循环神经网络
        self.encoder = nn.LSTM(input_size=embedding_dim,
                               hidden_size=num_hiddens,
                               num_layers=num_layers,
                               batch_first=True,
                               bidirectional=True)
        # 初始时间步和最终时间步的隐藏状态作为全连接层输入
        self.w_omega = Variable(torch.zeros(
            self.hidden_size * self.layer_size, self.attention_size))
        self.u_omega = Variable(torch.zeros(self.attention_size))
        self.decoder = nn.Linear(2*num_hiddens, 2)

    def forward(self, inputs):
        # inputs的形状是(批量大小, 词数)，因为LSTM需要将序列长度(seq_len)作为第一维，所以将输入转置后
        # 再提取词特征，输出形状为(词数, 批量大小, 词向量维度)
        embeddings = self.word_embeddings(inputs)
        # rnn.LSTM只传入输入embeddings，因此只返回最后一层的隐藏层在各时间步的隐藏状态。
        # outputs形状是(词数, 批量大小, 2 * 隐藏单元个数)
        outputs, _ = self.encoder(embeddings)  # output, (h, c)
        # 连结初始时间步和最终时间步的隐藏状态作为全连接层输入。它的形状为
        # (批量大小, 4 * 隐藏单元个数)。
        outs = self.decoder(outputs[:, -1, :])
        return outs

In [39]:
embedding_dim, num_hiddens, num_layers = 100, 64, 1
net = BiRNN(vocab_size, embedding_dim, num_hiddens, num_layers)
print(net)

BiRNN(
  (word_embeddings): Embedding(34841, 100)
  (encoder): LSTM(100, 64, batch_first=True, bidirectional=True)
  (decoder): Linear(in_features=128, out_features=2, bias=True)
)


# train

In [40]:
def evaluate_accuracy(data_iter, net):
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for batch_idx, batch in enumerate(train_iter):
            X, y = batch.text, batch.label
            X = X.permute(1, 0)
            y.data.sub_(1)  #X转置 y为啥要减1
            if isinstance(net, torch.nn.Module):
                net.eval() # 评估模式, 这会关闭dropout
                acc_sum += (net(X).argmax(dim=1) == y).float().sum().item()
                net.train() # 改回训练模式
            else: # 自定义的模型, 3.13节之后不会用到, 不考虑GPU
                if('is_training' in net.__code__.co_varnames): # 如果有is_training这个参数
                    # 将is_training设置成False
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item() 
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item() 
            n += y.shape[0]
    return acc_sum / n

In [41]:
def train(train_iter, test_iter, net, loss, optimizer, num_epochs):
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for batch_idx, batch in enumerate(train_iter):
            X, y = batch.text, batch.label
            X = X.permute(1, 0)
            y.data.sub_(1)  #X转置 y为啥要减1
            y_hat = net(X)
            l = loss(y_hat, y)
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print(
            'epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
            % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n,
               test_acc, time.time() - start))

In [None]:
lr, num_epochs = 0.01, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
loss = nn.CrossEntropyLoss()
train(train_iter, val_iter, net, loss, optimizer, num_epochs)

epoch 1, loss 0.3787, train acc 0.827, test acc 0.935, time 684.1 sec
epoch 2, loss 0.0774, train acc 0.943, test acc 0.964, time 837.2 sec
epoch 3, loss 0.0358, train acc 0.961, test acc 0.971, time 1081.4 sec
