# 作业二：文本分类 Part2（2）

* 这部分作业，我们使用Transformers的预训练模型来看看效果

In [1]:
import random
from collections import defaultdict
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchtext
import pytorch_lightning as pl
import transformers
from transformers import (
    DataProcessor,
    InputExample,
    BertForSequenceClassification, 
    BertTokenizer,
    glue_convert_examples_to_features,
)


def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

set_random_seed(2020)
device = torch.device('cuda:0' if torch.cuda.is_available else 'cpu')

**设定计算设备与数据集路径**

In [2]:
device = torch.device('cuda' if torch.cuda.is_available else 'cpu')
data_path = Path('/media/bnu/data/nlp-practice/sentiment-analysis/standford-sentiment-treebank')
model_path = Path('/media/bnu/data/nlp-practice/transformers')

print('PyTorch Version:', torch.__version__)
print('PyTorch Lightning Version:', pl.__version__)
print('Transformers Version:', transformers.__version__)
print('-' * 60)
print('Data Path:', data_path)
print('Model Path:', model_path)
print('-' * 60)

if torch.cuda.is_available():
    print('CUDA Device Count:', torch.cuda.device_count())
    print('CUDA Device Name:')
    for i in range(torch.cuda.device_count()):
        print('\t', torch.cuda.get_device_name(i))
    print('CUDA Current Device Index:', torch.cuda.current_device())
    print('-' * 60)

PyTorch Version: 1.4.0
PyTorch Lightning Version: 0.7.1
Transformers Version: 2.7.0
------------------------------------------------------------
Data Path: /media/bnu/data/nlp-practice/sentiment-analysis/standford-sentiment-treebank
Model Path: /media/bnu/data/nlp-practice/transformers
------------------------------------------------------------
CUDA Device Count: 2
CUDA Device Name:
	 GeForce RTX 2080 Ti
	 GeForce RTX 2080 Ti
CUDA Current Device Index: 0
------------------------------------------------------------


## 数据准备

### 获取数据

* 我们首先通过`pandas`查看一下SST数据集

In [3]:
train_data = pd.read_csv(data_path / 'senti.train.tsv', header=None, delimiter='\t')
valid_data = pd.read_csv(data_path / 'senti.dev.tsv', header=None, delimiter='\t')
test_data = pd.read_csv(data_path / 'senti.test.tsv', header=None, delimiter='\t')

train_data.head()    

Unnamed: 0,0,1
0,hide new secretions from the parental units,0
1,"contains no wit , only labored gags",0
2,that loves its characters and communicates som...,1
3,remains utterly satisfied to remain the same t...,0
4,on the worst revenge-of-the-nerds clichés the ...,0


In [4]:
print('Train Size:', len(train_data))
print('Valid Size:', len(valid_data))
print('Test Size:', len(test_data))

Train Size: 67349
Valid Size: 872
Test Size: 1821


### 数据处理

* 首先我们参考`glue.py`文件中的内容自定义`DataProcessor`

In [5]:
class SstProcessor(DataProcessor):
    
    def get_train_examples(self, data_dir):
        return self._create_examples(
            self._read_tsv(data_dir / 'senti.train.tsv'),
            set_type='train',
        )
    
    def get_dev_examples(self, data_dir):
        return self._create_examples(
            self._read_tsv(data_dir / 'senti.dev.tsv'),
            set_type='valid',
        )
    
    def get_test_examples(self, data_dir):
        return self._create_examples(
            self._read_tsv(data_dir / 'senti.test.tsv'),
            set_type='test'
        )
    
    def get_labels(self):
        return ['0', '1']
    
    def _create_examples(self, lines, set_type):
        examples = []
        for i, line in enumerate(lines):
            guid = f'{set_type}-{i}'  # 样本的唯一编号
            text_a = line[0]  # 预训练模型中的第一句话，因为是分类问题不需要第二句话
            label = line[1]  # 样本标签
            examples.append(InputExample(
                guid=guid,
                text_a=text_a,
                text_b=None,
                label=label,
            ))
        return examples

**简单测试**

In [6]:
processor = SstProcessor()
examples = processor.get_train_examples(data_path)
print('Train:', len(examples))
print(examples[10])
examples = processor.get_dev_examples(data_path)
print('Valid:', len(examples))
print(examples[10])
examples = processor.get_test_examples(data_path)
print('Test:', len(examples))
print(examples[10])

Train: 67349
InputExample(guid='train-10', text_a='goes to absurd lengths', text_b=None, label='0')
Valid: 872
InputExample(guid='valid-10', text_a='The mesmerizing performances of the leads keep the film grounded and keep the audience riveted .', text_b=None, label='1')
Test: 1821
InputExample(guid='test-10', text_a="It 's also heavy-handed and devotes too much time to bigoted views .", text_b=None, label='0')


* 接下来我们要构建模型所需的`DataLoader`

In [7]:
def generate_dataloaders(tokenizer):
    
    def generate_dataloader_inner(examples):
        features = glue_convert_examples_to_features(
            train_examples,
            tokenizer,
            label_list=['0', '1'],
            max_length=128,
            output_mode='classification',
            pad_on_left=False,
            pad_token=tokenizer.pad_token_id,
            pad_token_segment_id=0)
        
        dataset = torch.utils.data.TensorDataset(
            torch.LongTensor([f.input_ids for f in features]),
            torch.LongTensor([f.attention_mask for f in features]),
            torch.LongTensor([f.token_type_ids for f in features]),
            torch.LongTensor([f.label for f in features])
        )
        
        sampler = torch.utils.data.RandomSampler(dataset)
        dataloader = torch.utils.data.DataLoader(
            dataset, sampler=sampler, batch_size=32
        )
        return dataloader
    
    # 训练数据
    train_examples = processor.get_train_examples(data_path)
    train_loader = generate_dataloader_inner(train_examples)
    
    # 验证数据
    valid_examples = processor.get_dev_examples(data_path)
    valid_loader = generate_dataloader_inner(valid_examples)
    
    # 测试数据
    test_examples = processor.get_test_examples(data_path)
    test_loader = generate_dataloader_inner(test_examples)
    
    return train_loader, valid_loader, test_loader

**简单测试**

In [8]:
tokenizer = BertTokenizer.from_pretrained(
    model_path / 'bert-base-uncased' / 'bert-base-uncased-vocab.txt'
)
train_loader, valid_loader, test_loader = generate_dataloaders(tokenizer)

for batch in train_loader:
    input_ids, attention_mask, token_type_ids, label = batch
    print('Input Ids:', input_ids.shape)
    print('Attn Mask:', attention_mask.shape)
    print('Token Type Ids:', token_type_ids.shape)
    print('Label:', label.shape)
    break

Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated


Input Ids: torch.Size([32, 128])
Attn Mask: torch.Size([32, 128])
Token Type Ids: torch.Size([32, 128])
Label: torch.Size([32])


In [9]:
ptm = BertForSequenceClassification.from_pretrained(
    model_path / 'bert-base-uncased' / 'bert-base-uncased-pytorch_model.bin',
    config = model_path / 'bert-base-uncased' / 'bert-base-uncased-config.json'
)

for batch in train_loader:
    input_ids, attention_mask, token_type_ids, label = batch
    outputs = ptm(input_ids, 
                  token_type_ids=token_type_ids, 
                  attention_mask=attention_mask)[0]
    print('Batch Size:', len(label))
    print('Outputs:', outputs.shape)
    break

Batch Size: 32
Outputs: torch.Size([32, 2])


## 定义模型

* 为了减少大量训练代码的编写量，这次使用了Pytorch-Lightning库对结构进行简化

In [10]:
class SstPreTrainedModel(pl.LightningModule):
    
    def __init__(self):
        super(SstPreTrainedModel, self).__init__()
        # 预训练模型
        self.ptm = ptm
        # 损失函数
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, input_ids, attention_mask, token_type_ids):
        return self.ptm(
            input_ids=input_ids, 
            attention_mask=attention_mask, 
            token_type_ids=token_type_ids
        )[0]
    
    def training_step(self, batch, batch_idx):
        input_ids, attention_mask, token_type_ids, label = batch
        out = self(input_ids, attention_mask, token_type_ids)
        
        loss = self.criterion(out, label)
        
        _, pred = torch.max(out, dim=1)
        acc = (pred == label).float().mean()
        
        tensorboard_logs = {'train_loss': loss, 'train_acc': acc}
        return {'loss': loss, 'log': tensorboard_logs}
    
    def validation_step(self, batch, batch_idx):
        input_ids, attention_mask, token_type_ids, label = batch
        out = self(input_ids, attention_mask, token_type_ids)
        
        loss = self.criterion(out, label)
        
        _, pred = torch.max(out, dim=1)
        acc = (pred == label).float().mean()
        
        return {'val_loss': loss, 'val_acc': acc}
        
    def validation_epoch_end(self, outputs):
        val_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        val_acc = torch.stack([x['val_acc'] for x in outputs]).mean()
        
        tensorboard_logs = {'val_loss': val_loss, 'val_acc': val_acc}
        return {'val_loss': val_loss, 'log': tensorboard_logs, 'progress_bar': tensorboard_logs}
        
    def test_step(self, batch, batch_idx):
        input_ids, attention_mask, token_type_ids, label = batch
        out = self(input_ids, attention_mask, token_type_ids)
        
        _, pred = torch.max(out, dim=1)
        acc = (pred == label).float().mean()
        
        return {'test_acc': acc}
        
    def test_epoch_end(self, outputs):
        test_acc = torch.stack([x['test_acc'] for x in outputs]).mean()
        
        tensorboard_logs = {'test_acc': test_acc}
        return {'test_acc': test_acc, 'log': tensorboard_logs, 'progress_bar': tensorboard_logs}
        
    
    def configure_optimizers(self):
        return torch.optim.Adam([p for p in self.parameters() if p.requires_grad], lr=2e-5, eps=1e-8)

    def train_dataloader(self):
        return train_loader
    
    def val_dataloader(self):
        return valid_loader
    
    def test_dataloader(self):
        return test_loader
        

In [11]:
torch.cuda.empty_cache()
model = SstPreTrainedModel()
trainer = pl.Trainer(
    max_epochs=1,
    gpus=1,
#     gpus=2,
#     distributed_backend='dp',
    default_save_path='/media/bnu/data/pytorch-lightning-checkpoints/'
)
trainer.fit(model)

HBox(children=(FloatProgress(value=0.0, description='Validation sanity check', layout=Layout(flex='2'), max=5.…



HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=1.0), HTML(value='')), …

HBox(children=(FloatProgress(value=0.0, description='Validating', layout=Layout(flex='2'), max=2105.0, style=P…




1

In [12]:
trainer.test(model)

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=2105.0, style=Prog…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'test_acc': tensor(0.9723, device='cuda:0')}
----------------------------------------------------------------------------------------------------



可以看到在测试集当中已经能达到97%以上的准确率了

In [13]:
%load_ext tensorboard
%tensorboard --logdir /media/bnu/data/pytorch-lightning-checkpoints/lightning_logs

Reusing TensorBoard on port 6007 (pid 4257), started 0:36:59 ago. (Use '!kill 4257' to kill it.)