In [1]:
import os
import torch
import collections
from torch import nn
import torchtext.vocab as Vocab
import torch.nn.functional as F
import torch.utils.data as Data
import random
import sys
import time
import tarfile
import d2lzh_pytorch as d2l
sys.path.append("..")
DATA_ROOT = "/Users/air/Datasets"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

读取训练数据集和测试数据集
' 1 ' ：正面
' 0 ' : 负面

In [2]:
from tqdm import tqdm
def read_imdb( folder = 'train', data_root = "/Users/air/Datasets/aclImdb"):
    data = []
    for lable in ['pos', 'neg']:
        #分别读取训练数据集和测试数据集中的正面评论和负面评论，并打上标签
        folder_name = os.path.join(data_root, folder, lable)
        for file in tqdm(os.listdir(folder_name)):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', ' ').lower()
                data.append([review, 1 if lable=='pos' else 0])
    #对所有数据进行随机排序
    random.shuffle(data)
    return data

对每条评论分词，这里使用空格分词

In [3]:
def get_tokenized_imdb(data):
    """
    data : list of [string, lable]
    """
    def tokenizer(text):
        #使用列表推捯式进行分词，并将所有单词小写
        return [tok.lower() for tok in text.split(' ')]
    return [tokenizer(reviwe) for reviwe, _ in data]

根据分好词的训练数据集来创建词典，滤掉初现次数少于5的词

In [4]:
def get_vocab_imdb(data):
    tokenized_data = get_tokenized_imdb(data)
    """
    tokenized_data : [ [word1, word2, ...], [word3, word4, ...], ... ]
    """
    counter = collections.Counter([tk for st in tokenized_data for tk in st])
    """
    counter : dict of [word:num, ......]
    """
    return Vocab.Vocab(counter, min_freq=5)

* 为了使每条评论的长度一致，通过截断或补零来使每条评论长度固定为500
* vocab.stoi[] : [单词，索引]
* vocab.itos[] : [索引，单词]

In [5]:
def preprocess_imdb(data, vocab):
    max_l = 500
    
    def pad(x):
        return x[:max_l] if len(x) > max_l else x + [0]*(max_l - len(x))
    tokenized_data = get_tokenized_imdb(data)
    features =torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data]) 
    labels = torch.tensor([score for _, score in data])
    return features, labels

In [6]:
class GlobalMaxPool1d(nn.Module):
    def __init__(self):
        super(GlobalMaxPool1d, self).__init__()
    def forward(self, x):
        return F.max_pool1d(x, kernel_size = x.shape[2])

In [7]:
batch_size = 64
train_data, test_data = read_imdb('train'), read_imdb('test')
vocab = get_vocab_imdb(train_data)

100%|██████████| 12500/12500 [00:02<00:00, 4472.49it/s]
100%|██████████| 12500/12500 [00:02<00:00, 4905.04it/s]
100%|██████████| 12500/12500 [00:02<00:00, 5663.80it/s]
100%|██████████| 12500/12500 [00:03<00:00, 3988.39it/s]


In [8]:
train_set = Data.TensorDataset(*preprocess_imdb(train_data, vocab))
test_set = Data.TensorDataset(*preprocess_imdb(test_data, vocab))

In [9]:
train_iter = Data.DataLoader(train_set, batch_size, shuffle=True)
test_iter = Data.DataLoader(test_set, batch_size)

In [10]:
class TextCNN(nn.Module):
    def __init__(self, vocab, embed_size, kernel_sizes, num_channels):
        super(TextCNN, self).__init__()
        self.embedding = nn.Embedding(len(vocab), embed_size)
        self.constant_embedding = nn.Embedding(len(vocab), embed_size)
        self.dropout = nn.Dropout(0.5)
        self.decoder = nn.Linear(sum(num_channels), 2)
        self.pool = GlobalMaxPool1d()
        self.convs = nn.ModuleList()
        for c, k in zip(num_channels, kernel_sizes):
            self.convs.append(nn.Conv1d(in_channels=2*embed_size,
                                                       out_channels = c,
                                                       kernel_size = k))
    def forward(self, inputs):
        embeddings = torch.cat((self.embedding(inputs),
                                           self.constant_embedding(inputs)), dim=2)
        embeddings = embeddings.permute(0, 2, 1)
        encoding = torch.cat([self.pool(F.relu(conv(embeddings))).squeeze(-1) for conv in self.convs],
                                        dim=1)
        outputs = self.decoder(self.dropout(encoding))
        return outputs

 从训练好的vocab中提取出words对应的词向量

In [11]:
def load_pretrained_embedding(words, pretrained_vocab):
    #生成与训练词向量维度相同的零矩阵
    embed = torch.zeros(len(words), pretrained_vocab.vectors[0].shape[0])
    oov_count = 0
    for i, word in enumerate(words):
        try:
            idx = pretrained_vocab.stoi[word]
            embed[i, :] = pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count += 0
    if oov_count > 0:
        print("there are %d oov words")
    return embed

In [12]:
glove_vocab = Vocab.GloVe(name='6B', dim=100, cache=os.path.join(DATA_ROOT, "glove"))

In [13]:
embed_size, kernal_sizes, nums_channels = 100, [2, 3, 4], [100, 100, 100]
net = TextCNN(vocab, embed_size, kernal_sizes, nums_channels)

In [14]:
print(net)

TextCNN(
  (embedding): Embedding(46152, 100)
  (constant_embedding): Embedding(46152, 100)
  (dropout): Dropout(p=0.5, inplace=False)
  (decoder): Linear(in_features=300, out_features=2, bias=True)
  (pool): GlobalMaxPool1d()
  (convs): ModuleList(
    (0): Conv1d(200, 100, kernel_size=(2,), stride=(1,))
    (1): Conv1d(200, 100, kernel_size=(3,), stride=(1,))
    (2): Conv1d(200, 100, kernel_size=(4,), stride=(1,))
  )
)


In [15]:
net.embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.data.copy_(load_pretrained_embedding(vocab.itos, glove_vocab))
net.constant_embedding.weight.requires_grad = False

In [16]:
def train(train_iter, test_iter, net, loss, optimizer, device, num_epochs):
    net = net.to(device)
    print("training on ", device)
    batch_count = 0
    for epoch in range(num_epochs):
        train_l_sum, train_acc_sum, n, start = 0.0, 0.0, 0, time.time()
        for X, y in train_iter:
            X = X.to(device)
            y = y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y) 
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            train_l_sum += l.cpu().item()
            train_acc_sum += (y_hat.argmax(dim=1) == y).sum().cpu().item()
            n += y.shape[0]
            batch_count += 1
        test_acc = evaluate_accuracy(test_iter, net)
        print('epoch %d, loss %.4f, train acc %.3f, test acc %.3f, time %.1f sec'
              % (epoch + 1, train_l_sum / batch_count, train_acc_sum / n, test_acc, time.time() - start))

In [17]:
def evaluate_accuracy(data_iter, net, device=None):
    if device is None and isinstance(net, torch.nn.Module):
        # 如果没指定device就使用net的device
        device = list(net.parameters())[0].device 
    acc_sum, n = 0.0, 0
    with torch.no_grad():
        for X, y in data_iter:
            if isinstance(net, torch.nn.Module):
                net.eval() # 评估模式, 这会关闭dropout
                acc_sum += (net(X.to(device)).argmax(dim=1) == y.to(device)).float().sum().cpu().item()
                net.train() # 改回训练模式
            else: 
                if('is_training' in net.__code__.co_varnames): # 如果有is_training这个参数
                    # 将is_training设置成False
                    acc_sum += (net(X, is_training=False).argmax(dim=1) == y).float().sum().item() 
                else:
                    acc_sum += (net(X).argmax(dim=1) == y).float().sum().item() 
            n += y.shape[0]
    return acc_sum / n

In [19]:
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(filter(lambda p : p.requires_grad, net.parameters()), lr = lr)
loss  = nn.CrossEntropyLoss()

In [20]:
train(train_iter, test_iter, net, loss, optimizer, device, num_epochs)

training on  cpu
epoch 1, loss 0.4991, train acc 0.749, test acc 0.843, time 519.7 sec
epoch 2, loss 0.1649, train acc 0.857, test acc 0.869, time 526.1 sec
epoch 3, loss 0.0717, train acc 0.916, test acc 0.874, time 523.3 sec
epoch 4, loss 0.0306, train acc 0.957, test acc 0.848, time 523.7 sec
epoch 5, loss 0.0128, train acc 0.978, test acc 0.861, time 521.9 sec


batch_size=64;

In [46]:
def predict_sentiment(net, vocab, sentence):
    device = list(net.parameters())[0].device
    sentence = torch.tensor([vocab.stoi[word] for word in sentence], device=device)
    label = torch.argmax(net(sentence.view((1, -1))), dim=1 )
    return 'positive' if label.item() == 1 else 'negative'

In [49]:
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'great'])

'positive'

In [50]:
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'bad'])

'negative'