#                                                                                 BERT

BERT（Bidirectional Encoder Representations from Transformers）是一种全新的预训练的语言模型表示，发布至今也才一个多月，横扫各类NLP任务，效果卓然！实现细节及理论基础看这里：https://arxiv.org/abs/1810.04805.

谷歌官方的Github在这里：https://github.com/google-research/bert

BERT是一种预训练语言表示的方法，这意味着在大型文本语料库（如维基百科）上训练通用“语言理解”模型，然后将该模型用于下游的NLP任务（如问答、情感分析、文本聚类等）。 BERT优于以前的方法，因为它是第一个用于预训练NLP的无监督(Unsupervised)且深度双向（Deeply Bidirectional）的系统。

![](https://www.lyrn.ai/wp-content/uploads/2018/11/transformer.png)

# 下载所有必要的依赖项


In [1]:
import pandas as pd
import os
import numpy as np
import pandas as pd
import zipfile
from matplotlib import pyplot as plt
%matplotlib inline
import sys
import datetime

  'Matplotlib is building the font cache using fc-list. '


In [2]:
#下载该模型的weights和cofiguration文件
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip

--2018-11-26 13:02:17--  https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.160.80, 2404:6800:4012:1::2010
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.160.80|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 407727028 (389M) [application/zip]
Saving to: ‘uncased_L-12_H-768_A-12.zip’


2018-11-26 13:03:04 (9.22 MB/s) - ‘uncased_L-12_H-768_A-12.zip’ saved [407727028/407727028]



In [3]:
#解压缩
repo = 'model_repo'
with zipfile.ZipFile("uncased_L-12_H-768_A-12.zip","r") as zip_ref:
    zip_ref.extractall(repo)

In [4]:
!ls 'model_repo/uncased_L-12_H-768_A-12'

bert_config.json		     bert_model.ckpt.index  vocab.txt
bert_model.ckpt.data-00000-of-00001  bert_model.ckpt.meta


In [5]:
#下载必要的组件（py脚本）
!wget https://raw.githubusercontent.com/google-research/bert/master/modeling.py 
!wget https://raw.githubusercontent.com/google-research/bert/master/optimization.py 
!wget https://raw.githubusercontent.com/google-research/bert/master/run_classifier.py 
!wget https://raw.githubusercontent.com/google-research/bert/master/tokenization.py 

--2018-11-26 13:03:14--  https://raw.githubusercontent.com/google-research/bert/master/modeling.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.228.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.228.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 38086 (37K) [text/plain]
Saving to: ‘modeling.py’


2018-11-26 13:03:18 (19.4 KB/s) - ‘modeling.py’ saved [38086/38086]

--2018-11-26 13:03:18--  https://raw.githubusercontent.com/google-research/bert/master/optimization.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.228.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.228.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6046 (5.9K) [text/plain]
Saving to: ‘optimization.py’


2018-11-26 13:03:21 (78.5 MB/s) - ‘optimization.py’ saved [6046/6046]

--2018-11-26 13:03:22--  https://raw.githubusercontent.com/google-re

下面的示例是在预处理代码上完成的，类似于** CoLa **：

该语料库是旨在完成二进制的单句分类任务，其目标在于预测是否某个英文语句是否可以接受（是不是垃圾信息）

可以将预训练的BERT模型用于各种任务，包括文本分类、情感分析、文本聚类。


In [9]:
#可获取的模型检查点（Checkpoints）:
#   uncased_L-12_H-768_A-12: uncased BERT base model
#   uncased_L-24_H-1024_A-16: uncased BERT large model
#   cased_L-12_H-768_A-12: cased BERT large model
#在这里使用基础模型
BERT_MODEL = 'uncased_L-12_H-768_A-12'
BERT_PRETRAINED_DIR = 'model_repo/uncased_L-12_H-768_A-12'
OUTPUT_DIR = 'model_repo/outputs'
print('***** Model output directory: {OUTPUT_DIR} *****')
print('***** BERT pretrained directory: {BERT_PRETRAINED_DIR} *****')


***** Model output directory: {OUTPUT_DIR} *****
***** BERT pretrained directory: {BERT_PRETRAINED_DIR} *****


In [None]:
from sklearn.model_selection import train_test_split

train_df =  pd.read_csv('../input/train.csv',encoding='utf-8')
train_df = train_df.sample(2000)

train, test = train_test_split(train_df, test_size = 0.1, random_state=42)

train_lines, train_labels = train.question_text.values, train.target.values
test_lines, test_labels = test.question_text.values, test.target.values

In [None]:
import modeling
import optimization
import run_classifier
import tokenization
import tensorflow as tf


def create_examples(lines, set_type, labels=None):
#为BERT模型构建数据
    guid = f'{set_type}'
    examples = []
    if guid == 'train':
        for line, label in zip(lines, labels):
            text_a = line
            label = str(label)
            examples.append(
              run_classifier.InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    else:
        for line in lines:
            text_a = line
            label = '0'
            examples.append(
              run_classifier.InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
    return examples

# 模型的超参数
TRAIN_BATCH_SIZE = 32
EVAL_BATCH_SIZE = 8
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
WARMUP_PROPORTION = 0.1
MAX_SEQ_LENGTH = 128

# 模型配置文件
SAVE_CHECKPOINTS_STEPS = 1000   #如你希望在大型数据集上微调模型，请使用更大的间隔（Interval）

# 每个checpoint的weights大概 1.5GB
ITERATIONS_PER_LOOP = 1000
NUM_TPU_CORES = 8
VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')
CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json')
INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt')
DO_LOWER_CASE = BERT_MODEL.startswith('uncased')

label_list = ['0', '1']
tokenizer = tokenization.FullTokenizer(vocab_file=VOCAB_FILE, do_lower_case=DO_LOWER_CASE)
train_examples = create_examples(train_lines, 'train', labels=train_labels)

tpu_cluster_resolver = None ##既然训练将在GPU上进行，我们不需要群集解析器


#TPUEstimator还支持CPU和GPU训练，你无需定义单独的tf.estimator.Estimator
run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    model_dir=OUTPUT_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=ITERATIONS_PER_LOOP,
        num_shards=NUM_TPU_CORES,
        per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

num_train_steps = int(
    len(train_examples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

model_fn = run_classifier.model_fn_builder(
    bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),
    num_labels=len(label_list),
    init_checkpoint=INIT_CHECKPOINT,
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=False, #如果设定为False，训练将在CPU或GPU上进行，具体取决于可用的设备
    use_one_hot_embeddings=True)

estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=False, #如果设定为False，训练将在CPU或GPU上进行，具体取决于可用的设备
    model_fn=model_fn,
    config=run_config,
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE)

In [None]:
"""
Note：您可能会看到一条消息“在CPU上运行训练（'Running train on CPU'）”。
这实际上只意味着它运行在除了TPU之外的其他设备上，其中包括GPU。
"""

# 训练模型
print('Please wait...')
train_features = run_classifier.convert_examples_to_features(
    train_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
print('***** 训练起始时间 {} *****'.format(datetime.datetime.now()))
print('  Num examples = {}'.format(len(train_examples)))
print('  Batch size = {}'.format(TRAIN_BATCH_SIZE))
tf.logging.info("  Num steps = %d", num_train_steps)
train_input_fn = run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=True)
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print('***** 训练结束于 {} *****'.format(datetime.datetime.now()))

In [None]:
"""
原始代码中有一个比较诡异的错误。
在预测时，估计器返回一个空的dict {}，而不使用batch_size。
重新定义input_fn_builder并硬编码（Hardcode）batch_size，现在正在使用'params'。
"""

def input_fn_builder(features, seq_length, is_training, drop_remainder):
  """创建一个`input_fn`闭包，传递给TPUEstimator。"""

  all_input_ids = []
  all_input_mask = []
  all_segment_ids = []
  all_label_ids = []

  for feature in features:
    all_input_ids.append(feature.input_ids)
    all_input_mask.append(feature.input_mask)
    all_segment_ids.append(feature.segment_ids)
    all_label_ids.append(feature.label_id)

  def input_fn(params):
    """实际的输入函数"""
    print(params)
    batch_size = 32

    num_examples = len(features)

    d = tf.data.Dataset.from_tensor_slices({
        "input_ids":
            tf.constant(
                all_input_ids, shape=[num_examples, seq_length],
                dtype=tf.int32),
        "input_mask":
            tf.constant(
                all_input_mask,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
        "segment_ids":
            tf.constant(
                all_segment_ids,
                shape=[num_examples, seq_length],
                dtype=tf.int32),
        "label_ids":
            tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32),
    })

    if is_training:
      d = d.repeat()
      d = d.shuffle(buffer_size=100)

    d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder)
    return d

  return input_fn

In [None]:
predict_examples = create_examples(test_lines, 'test')

predict_features = run_classifier.convert_examples_to_features(
    predict_examples, label_list, MAX_SEQ_LENGTH, tokenizer)

predict_input_fn = input_fn_builder(
    features=predict_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)

result = estimator.predict(input_fn=predict_input_fn)

In [None]:
from tqdm import tqdm
preds = []
for prediction in tqdm(result):
    for class_probability in prediction:
      preds.append(float(class_probability))

results = []
for i in tqdm(range(0,len(preds),2)):
  if preds[i] < 0.9:
    results.append(1)
  else:
    results.append(0)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

print(accuracy_score(np.array(results), test_labels))
print(f1_score(np.array(results), test_labels))

尽管BERT效果极佳，但它目前目前仍然存在着如下缺点：

- 训练成本昂贵。 本notebook的所有结果都在单个云TPU上进行了微调，后者具有64GB的RAM。 目前无法使用具有12GB-16GB RAM的GPU复现论文中的大多数BERT-Large运行结果，因为适配内存的最大batch size 太小。

