# IMDB Sentiment Analysis

这个文件在IMDB上进行情感分类，测试以下模型的表现：

* 一些机器学习方法（not finished）
* LSTM
* BERT（not finished）

目前的进度：

处理数据

问题：

* 迭代器并没有shuffle?
* torchtext怎么使用预训练词向量？
* RNN训练时要使用 `pack_padded_sequence` / `pad_packed_sequence` 吗？还是仅仅pad就行？使用了之后还需要让长度相近的样本放在同一个批次里再pad吗？
* 后续使用BERT可能要用transformers或者allennlp？
* 训练过程中是按batch评估还是按epoch评估找最优？

## Import

In [39]:
import torch
from torchtext import datasets
from torchtext import data
import numpy as np
import random
from torch import nn,optim
from sklearn import metrics

# Prefer the GPU whenever PyTorch can see one; otherwise everything runs on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")

## 供调整的参数

In [40]:
# Tunable settings shared by the data, model and training cells.
lr = 1e-3                # learning rate given to the Adam optimizer
bs = 64                  # batch size used when the iterators are built
d_embed = 100            # embedding dimension of the model
d_hidden = 256           # LSTM hidden size
d_output = 2             # number of output logits (one per class)
dropout = 0.2            # dropout probability
max_epochs = 20          # upper bound on the number of training epochs
require_improvement = 1  # early-stopping patience, measured in epochs

## 数据载入和处理

在载入和处理数据部分采用了torchtext库。

In [3]:
# Reviews are tokenized with spaCy's small English model; batch_first=True
# makes the text tensors come out batch-major.
TEXT = data.Field(
    tokenize='spacy',
    tokenizer_language="en_core_web_sm",
    batch_first=True,
)
# Labels are stored as torch.long values.
LABEL = data.LabelField(dtype=torch.long)

# IMDB ships with a train split and a test split.
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

在这里可能会遇到报错，提示不能载入*en_core_web_sm*。

输入命令（需要翻墙）即可解决
> python -m spacy download en_core_web_sm

**下面展示样本数量和一个样本。**

In [5]:
# Report the size of both splits, then show the tokens of the first training example.
first_example = vars(train_data.examples[0])
print(f'Number of training examples: {len(train_data)}')
print(f'Number of testing examples: {len(test_data)}')
print(first_example['text'])

Number of training examples: 25000
Number of testing examples: 25000
['Bromwell', 'High', 'is', 'a', 'cartoon', 'comedy', '.', 'It', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', ',', 'such', 'as', '"', 'Teachers', '"', '.', 'My', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'Bromwell', 'High', "'s", 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '"', 'Teachers', '"', '.', 'The', 'scramble', 'to', 'survive', 'financially', ',', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', 'teachers', "'", 'pomp', ',', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', ',', 'all', 'remind', 'me', 'of', 'the', 'schools', 'I', 'knew', 'and', 'their', 'students', '.', 'When', 'I', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', ',', 'I', 'immediately', 'recalled', '...

有25000个训练样本和25000个测试样本，尽管这个数量比不太符合要求，但是这个任务比较简单，我们就这么来。

一个样本是一个字典的形式，'text'中含有分词完毕的单词列表，'label'中含其标签（pos或neg）。

**下面我们需要把训练样本中再分一些出来作为验证集。**

In [6]:
# Seed every random number generator so that the split is the same on each run.
SEED = 1234
for seed_fn in (np.random.seed, random.seed, torch.manual_seed):
    seed_fn(SEED)
if use_cuda:
    torch.cuda.manual_seed(SEED)

# Carve a validation set out of the training data (80% train / 20% validation).
train_data, valid_data = train_data.split(split_ratio=0.8)

print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')
print(f'Number of testing examples: {len(test_data)}')

Number of training examples: 20000
Number of validation examples: 5000
Number of testing examples: 25000


**下面我们需要建立字典**

In [13]:
# Both vocabularies are built from the training split.
for field in (TEXT, LABEL):
    field.build_vocab(train_data)

# Vocabulary size; it is later passed to the model as the embedding table size.
d_vocab = len(TEXT.vocab)
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

# Show the 20 most frequent tokens together with their counts.
print('最频繁的20个单词：')
top_words = TEXT.vocab.freqs.most_common(20)
print(top_words)

Unique tokens in TEXT vocabulary: 108284
Unique tokens in LABEL vocabulary: 2
最频繁的20个单词：
[('the', 232320), (',', 219824), ('.', 189598), ('and', 125133), ('a', 124608), ('of', 115066), ('to', 107184), ('is', 87278), ('in', 70017), ('I', 61811), ('it', 61383), ('that', 56358), ('"', 50570), ("'s", 49484), ('this', 48454), ('-', 42677), ('/><br', 40779), ('was', 40082), ('as', 34777), ('with', 34057)]


测试和验证文本中可能出现训练集中没有的单词，另外在训练时为了满足批量输入需要将所有或一个批次的文本长度对齐，因此上述字典的建立中会自动加入特殊标记 `<unk>` 和 `<pad>`，分别用来表示未知单词和填充符号。

字典长度比较大，可能需要去除一些，或者使用预训练词向量初始化。

**下面我们需要建立迭代器**

In [42]:
# One BucketIterator per split, all sharing the batch size and the target device.
all_splits = (train_data, valid_data, test_data)
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    all_splits,
    batch_size=bs,
    device=device,
    shuffle=True,
)

# Sanity check: print the shape and contents of the first test batch, then stop.
for x in test_iterator:
    token_ids = x.text
    print(token_ids.shape)
    print(token_ids)
    break

torch.Size([64, 36])
tensor([[  263,     6,  4886,  ...,     4, 24290,    39],
        [  492,     3,    16,  ..., 11775,     4,    14],
        [   11,   159,   123,  ...,  2405,   403,     4],
        ...,
        [ 6532,     5,   760,  ...,     1,     1,     1],
        [ 6451,     2,   298,  ...,     1,     1,     1],
        [77674,     0,     0,  ...,     1,     1,     1]])


值得注意的是，**迭代器中的文本已经被转换成了序号**，torchtext内部具体怎么实现的不清楚。


## Model

定义一个LSTM模型。

In [24]:
class simple_rnn(nn.Module):
    """Single-layer LSTM classifier over sequences of token indices.

    Pipeline: embedding -> dropout -> LSTM -> linear layer applied to the
    LSTM's final hidden state.

    Args:
        d_vocab: Number of rows in the embedding table.
        d_embed: Embedding dimension.
        d_hidden: LSTM hidden size.
        dropout: Dropout probability applied to the embedded tokens.
        d_output: Number of output logits.
        vectors: Optional tensor of pretrained embeddings. When given,
            ``init_weight`` is run with it; when ``None`` (the default) every
            layer keeps PyTorch's default initialisation.
    """

    def __init__(self, d_vocab: int, d_embed: int, d_hidden: int, dropout: float, d_output: int, vectors=None):
        super().__init__()

        self.d_hidden = d_hidden
        self.embed = nn.Embedding(d_vocab, d_embed)
        self.rnn = nn.LSTM(d_embed, d_hidden, batch_first=True)
        self.fc = nn.Linear(d_hidden, d_output)
        self.dropout = nn.Dropout(dropout)

        # `vectors` used to be accepted and then silently dropped. Honour it
        # when supplied; the default (None) path is unchanged.
        if vectors is not None:
            self.init_weight(vectors)

    def init_weight(self, vectors=None):
        """Load ``vectors`` into the embedding (if given) and re-draw ``fc.weight``.

        Args:
            vectors: Optional tensor copied into ``embed.weight``; it must be
                copy-compatible with the embedding table.
        """
        if vectors is not None:
            self.embed.weight.data.copy_(vectors)

        # The classifier weights are drawn uniformly from [-0.1, 0.1].
        initrange = 0.1
        self.fc.weight.data.uniform_(-initrange, initrange)

    def forward(self, x, hidden=None):
        """Return class logits for a batch of token-index sequences.

        Args:
            x: Token indices of shape (batch, length).
            hidden: Optional ``(h_0, c_0)`` initial state for the LSTM;
                ``None`` lets the LSTM start from zeros.

        Returns:
            Tensor of shape (batch, d_output).
        """
        embedded = self.dropout(self.embed(x))  # (batch, length, d_embed)

        # nn.LSTM returns (output, (h_n, c_n)) whether or not an initial state
        # is passed, so unpack it the same way in both cases. The old code
        # kept the whole tuple when `hidden` was given and then crashed on
        # `.squeeze(0)`.
        _, (h_n, _) = self.rnn(embedded, hidden)

        # h_n is (num_layers, batch, d_hidden) even with batch_first=True; the
        # last layer's state equals output[:, -1, :] for this unidirectional
        # LSTM, so the former per-call assert on that equality is not needed.
        # NOTE(review): batches appear to be right-padded, so this state is
        # taken after the pad positions -- consider packed sequences.
        return self.fc(h_n[-1])  # (batch, d_output)
    
# Build the classifier from the hyperparameters above and print its layers.
model = simple_rnn(
    d_vocab=d_vocab,
    d_embed=d_embed,
    d_hidden=d_hidden,
    dropout=dropout,
    d_output=d_output,
)
print(model)
# Move the parameters to the GPU when one is available.
if use_cuda:
    model = model.cuda()

simple_rnn(
  (embed): Embedding(108284, 100)
  (rnn): LSTM(100, 256, batch_first=True)
  (fc): Linear(in_features=256, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


测试一下能否跑通

In [27]:
# Smoke test: push a single training batch through the model and the loss
# to check that shapes and dtypes line up. No gradients are needed here.
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
if use_cuda:
    criterion.cuda()
with torch.no_grad():
    for batch in train_iterator:
        # Tensor.cuda()/.to() return a new tensor instead of moving in place,
        # so the result has to be kept. The old code discarded it.
        x = batch.text.to(device)
        y = batch.label.to(device)
        preds = model(x)
        print(preds.shape)
        criterion(preds, y.long())
        break

torch.Size([64, 2])


## Training

In [None]:
def train(model, train_iter, dev_iter, test_iter):
    """Train ``model`` with Adam and cross-entropy, early-stopping on validation loss.

    After every epoch the model is scored on ``dev_iter`` with ``evaluate``.
    Training stops once the validation loss has gone more than
    ``require_improvement`` epochs without improving, or after ``max_epochs``
    epochs. ``test`` is then run on ``test_iter``.

    Reads the module-level settings ``lr``, ``max_epochs`` and
    ``require_improvement``.

    Args:
        model: Module mapping ``batch.text`` to class logits.
        train_iter: Iterable of training batches exposing ``.text`` and ``.label``.
        dev_iter: Validation batches, passed to ``evaluate``.
        test_iter: Test batches, passed to ``test``.
    """
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    # Follow the model's own device rather than a global flag.
    model_device = next(model.parameters()).device
    criterion.to(model_device)

    dev_best_loss = float('inf')
    last_improve = 0  # index of the last epoch in which the validation loss improved

    for epoch in range(max_epochs):
        train_loss = 0.0
        train_correct = 0
        n_batches = 0
        n_examples = 0
        for batch in train_iter:
            optimizer.zero_grad()
            # .to() returns the moved tensor; the old `x.cuda()` calls threw
            # their result away and never moved anything.
            x = batch.text.to(model_device)
            y = batch.label.to(model_device)
            outputs = model(x)
            loss = criterion(outputs, y)
            loss.backward()
            optimizer.step()

            # Running training statistics, kept as plain Python numbers so the
            # divisions below are ordinary float divisions.
            preds = outputs.detach().argmax(dim=1)
            train_correct += (preds == y).sum().item()
            train_loss += loss.item()
            n_batches += 1
            n_examples += y.size(0)

        # Average over what was actually iterated (the `train_iter` argument),
        # not over the notebook-level `train_iterator`.
        train_loss /= max(n_batches, 1)
        train_acc = train_correct / max(n_examples, 1)

        # Validation pass.
        dev_acc, dev_loss = evaluate(model, dev_iter)
        if dev_loss < dev_best_loss:
            dev_best_loss = dev_loss
            improve = '*'
            last_improve = epoch
        else:
            improve = ''
        msg = 'Epoch: {0:>6},  Train Loss: {1:>5.2},  Train Acc: {2:>6.2%},  Val Loss: {3:>5.2},  Val Acc: {4:>6.2%} {5}'
        print(msg.format(epoch + 1, train_loss, train_acc, dev_loss, dev_acc, improve))

        if epoch - last_improve > require_improvement:
            # Validation loss has not improved for more than
            # `require_improvement` epochs: stop early.
            print("No optimization for a long time, auto-stopping...")
            break

    test(model, test_iter)

def evaluate(model, data_iter, test=False, class_list=None):
    """Compute accuracy and mean per-batch cross-entropy of ``model`` on ``data_iter``.

    The model is switched to eval mode for the pass and put back into
    whatever mode it was in on entry.

    Args:
        model: Module mapping ``batch.text`` to class logits.
        data_iter: Sized iterable of batches exposing ``.text`` and ``.label``.
        test: When true, also return a classification report and a confusion
            matrix.
        class_list: Optional class names for the report, ordered by label
            index. When ``None`` the report is built from the labels present
            in the data.

    Returns:
        ``(acc, loss)`` or, when ``test`` is true,
        ``(acc, loss, report, confusion)``. ``acc`` and ``loss`` are floats;
        ``report`` is the dict produced by ``classification_report``.

    Raises:
        ValueError: If ``data_iter`` yields no batches.
    """
    was_training = model.training
    model.eval()
    model_device = next(model.parameters()).device

    loss_total = 0.0
    label_chunks = []
    pred_chunks = []
    try:
        with torch.no_grad():
            for batch in data_iter:
                # .to() returns the moved tensor; the old `.cuda()` calls
                # discarded their result.
                x = batch.text.to(model_device)
                labels = batch.label.to(model_device)
                outputs = model(x)
                # `F` was never imported in this file; go through `nn.functional`.
                loss_total += nn.functional.cross_entropy(outputs, labels).item()
                label_chunks.append(labels.cpu().numpy())
                pred_chunks.append(outputs.argmax(dim=1).cpu().numpy())
    finally:
        model.train(was_training)

    if not label_chunks:
        raise ValueError("data_iter yielded no batches")

    # Concatenate once instead of growing an array with np.append per batch.
    labels_all = np.concatenate(label_chunks)
    predict_all = np.concatenate(pred_chunks)
    # Fraction of exact matches, i.e. plain accuracy.
    acc = float(np.mean(labels_all == predict_all))
    mean_loss = loss_total / len(data_iter)

    if test:
        report_kwargs = {"digits": 4, "output_dict": True}
        if class_list is not None:
            report_kwargs["labels"] = list(range(len(class_list)))
            report_kwargs["target_names"] = class_list
        report = metrics.classification_report(labels_all, predict_all, **report_kwargs)
        confusion = metrics.confusion_matrix(labels_all, predict_all)
        return acc, mean_loss, report, confusion

    return acc, mean_loss


def test(model, test_iter):
    """Evaluate ``model`` on ``test_iter`` and print the results.

    Prints the test loss and accuracy, followed by the classification report
    and the confusion matrix returned by ``evaluate(..., test=True)``.
    """
    acc, loss, report, confusion = evaluate(model, test_iter, test=True)
    summary = 'Test Loss: {0:>5.2},  Test Acc: {1:>6.2%}'.format(loss, acc)
    print(summary)
    print("Precision, Recall and F1-Score...")
    print(report)
    print("Confusion Matrix...")
    print(confusion)