A text classification example for BERT by PyTorch.  
The data is from https://github.com/FudanNLP/nlpcc2017_news_headline_categorization.

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [1]:
import os
import warnings
import time
import torch
import numpy as np
import pandas as pd
from sklearn import preprocessing
from torch import utils
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup

os.chdir('./drive/My Drive/Python/Research/bert')
warnings.filterwarnings('ignore')

In [2]:
MAXLEN = 40
CATE = 18
DROP = 0.5
LRATE = 5e-5
BATCH = 64
EPOCH = 3
MODEL = 'bert-base-chinese'
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
path_1 = 'tasks/datasets/nlpcc_2017_news/'
training_1 = pd.read_table(path_1+'train.txt', names=['label', 'text'])
dev_1 = pd.read_table(path_1+'dev.txt', names=['label', 'text'])
test_1 = pd.read_table(path_1+'test.txt', names=['label', 'text'])

encoder_1 = preprocessing.LabelEncoder()
training_1['label'] = encoder_1.fit_transform(training_1['label'])
dev_1['label'] = encoder_1.transform(dev_1['label'])
test_1['label'] = encoder_1.transform(test_1['label'])

print(training_1.head())

   label                                               text
0      3      台 媒 预 测 周 冬 雨 金 马 奖 封 后 ， 大 气 的 倪 妮 却 佳 作 难 出
1      7  农 村 就 是 好 ， 能 吃 到 纯 天 然 无 添 加 的 野 生 蜂 蜜 ， 营 养 ...
2      5        1 4 款 知 性 美 装 ， 时 尚 惊 艳 搁 浅 的 阳 光 轻 熟 的 优 雅
3      9              火 焰 喷 射 器 1 0 0 0 度 火 焰 烧 死 鬼 子 4 连 拍
4     12                            1 8 岁 青 年 砍 死 8 8 岁 老 兵


In [None]:
class DataProcessor(utils.data.Dataset):
  def __init__(self, dataframe, tokenizer, maxlen):
    self.len = len(dataframe)
    self.data = dataframe
    self.tokenizer = tokenizer
    self.maxlen = maxlen

  def __len__(self):
    return self.len

  def __getitem__(self, index):
    text = ' '.join(str(self.data.text[index]).split())
    inputs = self.tokenizer.encode_plus(
      text,
      None,
      add_special_tokens=True,
      max_length=self.maxlen,
      pad_to_max_length=True,
      return_token_type_ids=True,
      truncation=True)
    return {
      'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
      'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
      'targets': torch.tensor(self.data.label[index], dtype=torch.long)}


tokenizer_1 = BertTokenizer.from_pretrained(MODEL)
param_1 = {'batch_size': BATCH, 'shuffle': True, 'num_workers': 0}
training_2 = utils.data.DataLoader(DataProcessor(training_1, tokenizer_1, MAXLEN), **param_1)
dev_2 = utils.data.DataLoader(DataProcessor(dev_1, tokenizer_1, MAXLEN), **param_1)
test_2 = utils.data.DataLoader(DataProcessor(test_1, tokenizer_1, MAXLEN), **param_1)

In [None]:
class ModelCLS(torch.nn.Module):
  def __init__(self, ckpt, drop, cate):
    super(ModelCLS, self).__init__()
    self.bert = BertModel.from_pretrained(ckpt)
    self.drop = torch.nn.Dropout(drop)
    self.dense = torch.nn.Linear(768, cate)
    
  def forward(self, ids, mask):
    _, x = self.bert(ids, attention_mask=mask)
    return self.dense(self.drop(x))


model_1 = ModelCLS(MODEL, DROP, CATE)
model_1.to(DEVICE)
nodecay_1 = ['bias', 'LayerNorm.weight']
var_1 = [
  {'params': [p_1 for n_1, p_1 in model_1.named_parameters() if not any(
    nd_1 in n_1 for nd_1 in nodecay_1)], 'weight_decay': 0.01},
  {'params': [p_1 for n_1, p_1 in model_1.named_parameters() if any(
    nd_1 in n_1 for nd_1 in nodecay_1)], 'weight_decay': 0.00}]

In [6]:
step_1 = EPOCH*(int(len(training_1)/BATCH)+1)
function_1 = torch.nn.CrossEntropyLoss()
optimizer_1 = AdamW(var_1, lr=LRATE)
sch_1 = get_linear_schedule_with_warmup(optimizer_1, int(step_1*0.1), step_1)
record_1 = []


def step_training(data):
  model_1.train()
  total, correct = 0, 0

  for r, data in enumerate(data, 0):
    ids = data['ids'].to(DEVICE, dtype=torch.long)
    mask = data['mask'].to(DEVICE, dtype=torch.long)
    targets = data['targets'].to(DEVICE, dtype=torch.long)
    outputs = model_1(ids, mask).squeeze()
    loss = function_1(outputs, targets)

    total += targets.shape[0]
    correct += (outputs.argmax(-1) == targets).sum().item()
    record_1.append(loss.item())
    
    optimizer_1.zero_grad()
    loss.backward()
    optimizer_1.step()
    sch_1.step()

    if (r+1) % 500 == 0:
      totalloss = round(np.mean(record_1), 4)
      acc = round(correct/total, 4)
      print(f'Training loss is {totalloss}, and accuracy is {acc}.')


def step_evaluating(data):
  model_1.eval()
  total, correct = 0, 0

  for _, data in enumerate(data, 0):
    ids = data['ids'].to(DEVICE, dtype=torch.long)
    mask = data['mask'].to(DEVICE, dtype=torch.long)
    targets = data['targets'].to(DEVICE, dtype=torch.long)
    outputs = model_1(ids, mask).squeeze()

    total += targets.shape[0]
    correct += (outputs.argmax(-1) == targets).sum().item()

  acc = round(correct/total, 4)
  print(f'Test (dev) accuracy is {acc}.')

In [7]:
for epoch_1 in range(EPOCH):
  print('Epoch {} running.'.format(epoch_1+1))
  check_1 = time.time()
  step_training(training_2)
  step_evaluating(dev_2)
  print('Epoch time cost is {}.'.format(round(time.time()-check_1, 4)))
  print('**********')

Epoch 1 running.
Training loss is 1.4872, and accuracy is 0.5916.
Training loss is 1.1076, and accuracy is 0.6926.
Training loss is 0.9638, and accuracy is 0.7296.
Training loss is 0.8837, and accuracy is 0.7504.
Test (dev) accuracy is 0.8255.
Epoch time cost is 748.874.
**********
Epoch 2 running.
Training loss is 0.7747, and accuracy is 0.8613.
Training loss is 0.7304, and accuracy is 0.8607.
Training loss is 0.6976, and accuracy is 0.8601.
Training loss is 0.6707, and accuracy is 0.861.
Test (dev) accuracy is 0.8349.
Epoch time cost is 747.1922.
**********
Epoch 3 running.
Training loss is 0.6184, and accuracy is 0.9085.
Training loss is 0.5913, and accuracy is 0.91.
Training loss is 0.5678, and accuracy is 0.9116.
Training loss is 0.547, and accuracy is 0.9128.
Test (dev) accuracy is 0.8387.
Epoch time cost is 746.1158.
**********


In [8]:
step_evaluating(test_2)

Test (dev) accuracy is 0.8373.
