In [6]:
import collections
import os
import random
import tarfile
import torch
from torch import nn
import torchtext.vocab as Vocab
import torch.utils.data as Data

import sys
sys.path.append("D:\CS\MachineLearning\Dive-into-DL-PyTorch-master\code") 
import d2lzh_pytorch as d2l

os.environ["CUDA_VISIBLE_DEVICES"] = "0"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

DATA_ROOT = r"D:\CS\MachineLearning\Dive-into-DL-PyTorch-master\S1\CSCL\tangss\Datasets"

In [8]:
#use stanford's large movie review dateaset including 25000 reviews in IMDB
#download dataset
fname = os.path.join(DATA_ROOT, "aclImdb_v1.tar.gz")
if not os.path.exists(os.path.join(DATA_ROOT, "aclImdb")):
    print("从压缩包解压...")
    with tarfile.open(fname, 'r') as f:
        f.extractall(DATA_ROOT)

In [12]:
from tqdm import tqdm
# 本函数已保存在d2lzh_pytorch包中方便以后使用
def read_imdb(folder='train', data_root=r"D:\CS\MachineLearning\Dive-into-DL-PyTorch-master/S1/CSCL/tangss/Datasets/aclImdb"): 
    data = []
    for label in ['pos', 'neg']:
        folder_name = os.path.join(data_root, folder, label)
        for file in tqdm(os.listdir(folder_name)):
            with open(os.path.join(folder_name, file), 'rb') as f:
                review = f.read().decode('utf-8').replace('\n', '').lower()
                data.append([review, 1 if label == 'pos' else 0])
    random.shuffle(data)
    return data

train_data, test_data = read_imdb('train'), read_imdb('test')

100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:02<00:00, 5734.79it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:02<00:00, 5869.50it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:02<00:00, 5677.48it/s]
100%|██████████████████████████████████████████████████████████████████████████| 12500/12500 [00:02<00:00, 5773.18it/s]


In [15]:
#-Pre_processing_split words based with space
def get_tokenized_imdb(data):
    """
    data:list of [string,label]
    """
    def tokenizer(text):
        return [tok.lower() for tok in text.split(" ")]
    return [tokenizer(review) for review,_ in data]

In [16]:
#split words, filtering the words number<5
def get_vocab_imdb(data):
    tokenized_data=get_tokenized_imdb(data)
    counter=collections.Counter([tk for st in tokenized_data for tk in st])
    return Vocab.Vocab(counter,min_freq=5)

vocab=get_vocab_imdb(train_data)
"words in vocab :",len(vocab)

('words in vocab :', 46152)

In [22]:
#fix the length to 500,using word index
def preprocess_imdb(data,vocab):
    max_l=500
    
    def pad(x):
        return x[:max_l] if len(x)>max_l else x+[0]*(max_l-len(x))
    
    tokenized_data=get_tokenized_imdb(data)
    features=torch.tensor([pad([vocab.stoi[word] for word in words]) for words in tokenized_data])
    labels=torch.tensor([score for _,score in data])
    return features, labels

In [25]:
#estabilish iterator
batch_size=64
train_set=Data.TensorDataset(*preprocess_imdb(train_data,vocab))
test_set=Data.TensorDataset(*preprocess_imdb(test_data,vocab))

train_iter=Data.DataLoader(train_set,batch_size,shuffle=True)
test_iter=Data.DataLoader(test_set,batch_size)

In [26]:
for X,y in train_iter:
    print("X",X.shape,"y",y.shape)
    break
"#batches",len(train_iter)

X torch.Size([64, 500]) y torch.Size([64])


('#batches', 391)

In [45]:
#bulid BiRNN
class BiRNN(nn.Module):
    def __init__(self,vocab,embed_size,num_hiddens,num_layers):
        super(BiRNN,self).__init__()
        self.embedding=nn.Embedding(len(vocab),embed_size)
        #bidirectional 设为True
        self.encoder=nn.LSTM(input_size=embed_size,
                            hidden_size=num_hiddens,
                            num_layers=num_layers,
                            bidirectional=True)
        #初始时间步和最终时间步的隐藏状态作为全连接层的输入
        self.decoder=nn.Linear(4*num_hiddens,2)
        
    def forward(self,inputs):
        #inputs 的形状是（批量大小，词数），因为LSTM需要将序列长度（seq_len)作为第一维，所以将输入转置后
        #在提取词特证，输出形状为（词数，批量大小，词向量维度）
        embedding=self.embedding(inputs.permute(1,0))
        ##rnn.LSTM只传入输入embeddings,因此只返回最后一层的隐藏层在各时间步的隐藏状态。
        #output形状是（词数，批量大小，2*隐藏单元个数）
        outputs,_=self.encoder(embedding) #output,(h,c)
        #连接初始时间步和最终时间步的隐藏状态作为全连接层输入，他的形状为
        #(批量大小，4*隐藏单元个数)
        encoding=torch.cat((outputs[0],outputs[-1]),-1)
        outs=self.decoder(encoding)
        return outs

In [46]:
#initialized BIRNN
embed_size,num_hiddens,num_layers=100,100,2
net=BiRNN(vocab,embed_size,num_hiddens,num_layers)

In [35]:
#load glove word vector ，increase dimension，avoid over-fitting
glove_vocab=Vocab.GloVe(name="6B",dim=100,cache=os.path.join(DATA_ROOT,"glove"))

D:\CS\MachineLearning\Dive-into-DL-PyTorch-master\S1\CSCL\tangss\Datasets\glove\glove.6B.zip: 862MB [07:41, 1.87MB/s]  
100%|██████████████████████████████████████████████████████████████████████▊| 399127/400000 [00:27<00:00, 14320.16it/s]

In [39]:
def load_pretrained_embedding(words,pretrained_vocab):
    """从预训练好的vocab中提取words的词向量"""
    embed=torch.zeros(len(words),pretrained_vocab.vectors[0].shape[0])
    oov_count=0#out of vocabulary
    for i,word in enumerate(words):
        try:
            idx=pretrained_vocab.stoi[word]
            embed[i,:]=pretrained_vocab.vectors[idx]
        except KeyError:
            oov_count+=1
    if oov_count>0:
        print("There are %d oov words:" % oov_count)
    return embed

In [40]:
net.embedding.weight.data.copy_(
    load_pretrained_embedding(vocab.itos,glove_vocab))
net.embedding.weight.required_grad=False#直接加载训练好的，因此这里置false


There are 21202 oov words:


In [47]:
lr,num_epoches=0.01,5
#过滤掉不计算梯度的embedding参数
optimizer=torch.optim.Adam(filter(lambda p:p.requires_grad,net.parameters()),lr=lr)
loss=nn.CrossEntropyLoss()
d2l.train(train_iter,test_iter,net,loss,optimizer,device,num_epoches)

training on  cpu
epoch 1, loss 0.5560, train acc 0.708, test acc 0.840, time 2559.9 sec
epoch 2, loss 0.1383, train acc 0.889, test acc 0.831, time 2514.1 sec
epoch 3, loss 0.0530, train acc 0.943, test acc 0.858, time 2506.0 sec
epoch 4, loss 0.0238, train acc 0.968, test acc 0.859, time 2616.5 sec
epoch 5, loss 0.0128, train acc 0.980, test acc 0.849, time 2618.3 sec


In [49]:
# 本函数已保存在d2lzh_pytorch包中方便以后使用
def predict_sentiment(net, vocab, sentence):
    """sentence是词语的列表"""
    device = list(net.parameters())[0].device
    sentence = torch.tensor([vocab.stoi[word] for word in sentence], device=device)
    label = torch.argmax(net(sentence.view((1, -1))), dim=1)
    return 'positive' if label.item() == 1 else 'negative'

In [50]:
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'great']) 

'positive'

In [51]:
predict_sentiment(net, vocab, ['this', 'movie', 'is', 'so', 'bad']) 

'negative'

In [53]:
predict_sentiment(net, vocab, ['this', 'movie', 'does',"not","contain","actions"]) 

'negative'