# BERT for sentiment classification and export

In [None]:
import os
# Expose only the GPU with index 1 to CUDA. This is set before TensorFlow is
# imported in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

In [None]:
import random
import re
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import backend as K
from tqdm import tqdm_notebook

In [None]:
# bert basic module
import tokenization
import optimization
import run_classifier

In [None]:
def create_tokenizer_from_hub_module(bert_model_hub):
    """Build a ``FullTokenizer`` from the assets bundled with a BERT Hub module.

    Args:
        bert_model_hub: Handle passed to ``hub.Module`` to load the module.

    Returns:
        A ``tokenization.FullTokenizer`` configured with the vocab file and
        lower-casing flag reported by the module's ``tokenization_info``
        signature.
    """
    # Work in a throwaway graph so the module is not added to the default graph.
    with tf.Graph().as_default():
        module = hub.Module(bert_model_hub)
        info = module(signature="tokenization_info", as_dict=True)
        fetches = [info["vocab_file"], info["do_lower_case"]]
        with tf.Session() as session:
            vocab_path, lower_case = session.run(fetches)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_path,
        do_lower_case=lower_case)
    return tokenizer

In [None]:
def make_features(dataset, label_list, MAX_SEQ_LENGTH, tokenizer, DATA_COLUMN, LABEL_COLUMN):
    """Convert the rows of a DataFrame into BERT input features.

    Args:
        dataset: DataFrame holding the text and label columns.
        label_list: Labels handed to ``convert_examples_to_features``.
        MAX_SEQ_LENGTH: Maximum sequence length handed to the converter.
        tokenizer: Tokenizer handed to the converter.
        DATA_COLUMN: Name of the column containing the text.
        LABEL_COLUMN: Name of the column containing the label.

    Returns:
        Whatever ``run_classifier.convert_examples_to_features`` returns for
        the single-sentence examples built from ``dataset``.
    """
    def _row_to_example(row):
        # Single-sentence task: only text_a is populated.
        return run_classifier.InputExample(
            guid=None,
            text_a=row[DATA_COLUMN],
            text_b=None,
            label=row[LABEL_COLUMN])

    examples = dataset.apply(_row_to_example, axis=1)
    return run_classifier.convert_examples_to_features(
        examples, label_list, MAX_SEQ_LENGTH, tokenizer)

In [None]:
def create_model(bert_model_hub, is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
    """Creates a classification model on top of a BERT Hub module.

    Args:
        bert_model_hub: Handle passed to ``hub.Module`` to load BERT.
        is_predicting: True when the graph is built for prediction. Dropout on
            the pooled output is skipped in that case so that predictions (and
            an exported serving graph) are deterministic. Evaluation still
            goes through dropout because this function cannot distinguish
            TRAIN from EVAL.
        input_ids: Tensor fed to the module's "tokens" signature.
        input_mask: Tensor fed to the module's "tokens" signature.
        segment_ids: Tensor fed to the module's "tokens" signature.
        labels: Integer class ids; one-hot encoded with depth ``num_labels``.
        num_labels: Number of output classes.

    Returns:
        A tuple ``(loss, per_example_loss, logits, probabilities)``.
    """

    bert_module = hub.Module(
        bert_model_hub,
        trainable=True)
    bert_inputs = dict(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids)
    bert_outputs = bert_module(
        inputs=bert_inputs,
        signature="tokens",
        as_dict=True)

    # Use "pooled_output" for classification tasks on an entire sentence.
    # Use "sequence_outputs" for token-level output.
    output_layer = bert_outputs["pooled_output"]

    hidden_size = output_layer.shape[-1].value

    # Classification head fine-tuned on top of BERT. The variable names are
    # kept unchanged so existing checkpoints remain loadable.
    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):

        # Dropout helps prevent overfitting, but it must not be active when
        # predicting: it would randomly perturb the returned probabilities.
        if not is_predicting:
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        probabilities = tf.nn.softmax(logits, axis=-1)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        # Convert labels into one-hot encoding
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

        # Cross-entropy between the predicted distribution and the actual label
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, per_example_loss, logits, probabilities)


In [None]:
# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(bert_model_hub, num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
    """Returns a `model_fn` closure for `tf.estimator.Estimator`.

    Args:
        bert_model_hub: Handle of the BERT Hub module, forwarded to `create_model`.
        num_labels: Number of output classes.
        learning_rate: Learning rate forwarded to `optimization.create_optimizer`.
        num_train_steps: Total training steps, forwarded to the optimizer.
        num_warmup_steps: Warm-up steps, forwarded to the optimizer.

    Returns:
        A function `model_fn(features, labels, mode, params)` returning a
        `tf.estimator.EstimatorSpec` for TRAIN, EVAL or PREDICT.
    """
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for the Estimator.

        Labels are read from `features["label_ids"]` in every mode (including
        PREDICT); the `labels` argument is not used.
        """

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]

        is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)

        (loss, per_example_loss, logits, probabilities) = create_model(
            bert_model_hub, is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

        # PREDICT: only the class probabilities are exposed.
        if is_predicting:
            predictions = {
                'probabilities': probabilities,
            }
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

        # TRAIN: the optimizer is built only here, so the EVAL graph does not
        # carry unused optimizer ops and slot variables.
        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = optimization.create_optimizer(
                loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)
            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op)

        # EVAL: every example gets weight 1.0 in the metrics.
        is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32)
        predicted_ids = tf.argmax(logits, axis=-1, output_type=tf.int32)
        accuracy = tf.metrics.accuracy(
            labels=label_ids, predictions=predicted_ids, weights=is_real_example)
        mean_loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example)
        eval_metrics = {
            "eval_accuracy": accuracy,
            "eval_loss": mean_loss,
        }
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          eval_metric_ops=eval_metrics)

    # Return the actual model function in the closure
    return model_fn

In [None]:
def estimator_builder(bert_model_hub, OUTPUT_DIR, SAVE_SUMMARY_STEPS, SAVE_CHECKPOINTS_STEPS, label_list, LEARNING_RATE, num_train_steps, num_warmup_steps, BATCH_SIZE):
    """Assemble the Estimator together with its model function and run config.

    Args:
        bert_model_hub: Handle of the BERT Hub module.
        OUTPUT_DIR: Used as the Estimator's ``model_dir``.
        SAVE_SUMMARY_STEPS: Forwarded to ``RunConfig.save_summary_steps``.
        SAVE_CHECKPOINTS_STEPS: Forwarded to ``RunConfig.save_checkpoints_steps``.
        label_list: List of labels; only its length is used here.
        LEARNING_RATE: Learning rate for the model function.
        num_train_steps: Total number of training steps.
        num_warmup_steps: Number of warm-up steps.
        BATCH_SIZE: Stored in the Estimator ``params`` under ``"batch_size"``.

    Returns:
        A tuple ``(estimator, model_fn, run_config)``.
    """
    # Output directory plus how often summaries and checkpoints are written.
    config = tf.estimator.RunConfig(
        model_dir=OUTPUT_DIR,
        save_summary_steps=SAVE_SUMMARY_STEPS,
        save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

    classifier_fn = model_fn_builder(
        bert_model_hub=bert_model_hub,
        num_labels=len(label_list),
        learning_rate=LEARNING_RATE,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps)

    hyper_params = {"batch_size": BATCH_SIZE}
    bert_estimator = tf.estimator.Estimator(
        model_fn=classifier_fn,
        config=config,
        params=hyper_params)

    return bert_estimator, classifier_fn, config

In [None]:
def run_on_dfs(train, test, DATA_COLUMN, LABEL_COLUMN, 
               MAX_SEQ_LENGTH = 128,
              BATCH_SIZE = 16,
              LEARNING_RATE = 2e-5,
              NUM_TRAIN_EPOCHS = 3.0,
              WARMUP_PROPORTION = 0.1,
              SAVE_SUMMARY_STEPS = 100,
              SAVE_CHECKPOINTS_STEPS = 10000,
              bert_model_hub = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"):
    """Fine-tune BERT on ``train`` and evaluate it on ``test``.

    Note: the Estimator's model directory is taken from the module-level
    ``OUTPUT_DIR`` variable, which must be defined before this is called.

    Args:
        train: DataFrame with the training examples.
        test: DataFrame with the evaluation examples.
        DATA_COLUMN: Name of the text column in both frames.
        LABEL_COLUMN: Name of the label column in both frames.
        MAX_SEQ_LENGTH: Maximum sequence length used when building features.
        BATCH_SIZE: Used to derive the number of training steps and stored in
            the Estimator params.
        LEARNING_RATE: Learning rate handed to the optimizer.
        NUM_TRAIN_EPOCHS: Number of passes over the training features.
        WARMUP_PROPORTION: Fraction of the training steps used for warm-up.
        SAVE_SUMMARY_STEPS: Summary frequency for the run config.
        SAVE_CHECKPOINTS_STEPS: Checkpoint frequency for the run config.
        bert_model_hub: Handle of the BERT Hub module.

    Returns:
        A tuple ``(result_dict, estimator)``: the dict returned by
        ``estimator.evaluate`` on the test set and the trained Estimator.
    """

    # Sorted so that the label -> index mapping does not depend on row order.
    label_list = sorted(train[LABEL_COLUMN].unique().tolist())
    
    tokenizer = create_tokenizer_from_hub_module(bert_model_hub)

    train_features = make_features(train, label_list, MAX_SEQ_LENGTH, tokenizer, DATA_COLUMN, LABEL_COLUMN)
    test_features = make_features(test, label_list, MAX_SEQ_LENGTH, tokenizer, DATA_COLUMN, LABEL_COLUMN)

    num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
    num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

    # Only the estimator is needed here; model_fn and run_config are ignored.
    estimator, _, _ = estimator_builder(
                                  bert_model_hub, 
                                  OUTPUT_DIR, 
                                  SAVE_SUMMARY_STEPS, 
                                  SAVE_CHECKPOINTS_STEPS, 
                                  label_list, 
                                  LEARNING_RATE, 
                                  num_train_steps, 
                                  num_warmup_steps, 
                                  BATCH_SIZE)

    train_input_fn = run_classifier.input_fn_builder(
        features=train_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=False)

    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

    test_input_fn = run_classifier.input_fn_builder(
        features=test_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=False,
        drop_remainder=False)

    result_dict = estimator.evaluate(input_fn=test_input_fn, steps=None)

    return result_dict, estimator

In [None]:
def transform_data(fn):
    """Load a ``label<TAB>text`` file into a DataFrame.

    Each line is split on its first tab. The text is lower-cased and stripped;
    the label is converted to ``int``. Lines that are blank, have no tab, or
    have an empty label or empty text are skipped.

    Args:
        fn: Path of the tab-separated file to read (UTF-8 encoded).

    Returns:
        A DataFrame with columns ``comment`` (text) and ``sentiment`` (int
        label). The frame is empty, but still has both columns, when no line
        qualifies.

    Raises:
        ValueError: If a non-empty label cannot be parsed as an integer.
    """
    dataset = []
    # Read the file that was asked for (not a module-level path) and decode it
    # explicitly so the result does not depend on the platform default.
    with open(fn, 'r', encoding='utf-8') as fr:
        for raw_line in fr:
            label, sep, text = raw_line.rstrip('\r\n').partition('\t')
            label = label.strip()
            text = text.lower().strip()
            if sep and text and label:
                dataset.append([text, int(label)])

    # Passing the column names up front also works for an empty dataset.
    data_df = pd.DataFrame(data=dataset, columns=['comment', 'sentiment'])
    return data_df

In [None]:
def pretty_print(result):
    """Show a result mapping as a one-column table.

    Args:
        result: Mapping of metric name to value.

    Returns:
        A DataFrame indexed by the keys of ``result`` with a single column
        named ``values``.
    """
    # One row holding all metrics, flipped so that each metric becomes a row;
    # the lone column is labelled 0 after the transpose.
    single_row = pd.DataFrame([result])
    table = single_row.transpose()
    return table.rename(columns={0: "values"})

In [None]:
# Paths of the tab-separated dataset splits (one `label<TAB>text` pair per
# line, the format parsed by transform_data).
train_fn = 'weibo_data/train.tsv'
test_fn = 'weibo_data/test.tsv'
dev_fn = 'weibo_data/dev.tsv'

In [None]:
# Load the training split and shuffle it: sampling len(train) rows is a full
# permutation of the frame (the original index labels are kept).
train = transform_data(train_fn)
train = train.sample(len(train))

# The test split is used in file order.
test = transform_data(test_fn)

In [None]:
# test.head()
# test['sentiment'].unique().tolist()

In [None]:
#---------- dianping dataset --------#
# import pickle
# with open('dianping_train_test.pickle', 'rb') as fr:
#     train, test = pickle.load(fr)

In [None]:
# Handle of the BERT Hub module; passed to hub.Module via myparam below.
# NOTE(review): looks like a local module directory rather than a tfhub.dev URL - confirm.
bert_hub = "hub_bert_v1"

# Keyword arguments for run_on_dfs; the keys must match its parameter names.
myparam = {
        "DATA_COLUMN": "comment",
        "LABEL_COLUMN": "sentiment",
        "BATCH_SIZE": 16,
        "MAX_SEQ_LENGTH": 128,
        "LEARNING_RATE": 2e-5,
        "NUM_TRAIN_EPOCHS": 10,
        "bert_model_hub": bert_hub
    }

# Seeds Python's `random` module only.
# NOTE(review): nothing in this notebook seeds NumPy/pandas or TensorFlow, so
# the shuffle and the training run are presumably not reproducible - confirm.
random.seed(10)
# Read as a global by run_on_dfs (Estimator model_dir) and by the export cell.
OUTPUT_DIR = 'output'

In [None]:
# Fine-tune on `train`, evaluate on `test`; `result` is the evaluation dict and
# `estimator` the trained Estimator reused by the cells below.
result, estimator = run_on_dfs(train, test, **myparam)

In [None]:
# Display the evaluation metrics as a one-column table.
pretty_print(result)

In [None]:
# label_list = train['sentiment'].unique().tolist()

# tokenizer = create_tokenizer_from_hub_module(myparam['bert_model_hub'])

# test_features = make_features(test, label_list, myparam['MAX_SEQ_LENGTH'], tokenizer, 'comment', 'sentiment')

# test_input_fn = run_classifier.input_fn_builder(
#     features=test_features,
#     seq_length=myparam['MAX_SEQ_LENGTH'],
#     is_training=False,
#     drop_remainder=False)
# result_dict = estimator.evaluate(input_fn=test_input_fn, steps=None)
# result_dict

In [None]:
# Rebuild the label list exactly as run_on_dfs does (sorted). `train` was
# shuffled, so the order returned by unique() is arbitrary and, unsorted, could
# produce a label -> index mapping different from the one used for training.
label_list = sorted(train['sentiment'].unique().tolist())

# Tokenizer matching the BERT module used for training.
tokenizer = create_tokenizer_from_hub_module(myparam['bert_model_hub'])


In [None]:
# A few hand-written examples as [text, label] pairs. The labels are required
# to build features, but the predicted probabilities do not depend on them.
# NOTE(review): the label values presumably have to be members of label_list - confirm.
predict_dataset = [['深夜发吃报复社会', 2], ['恨,要报复社会', 1],['尼玛，节操碎了碎了,小姐不可怕，就怕小姐有文化', 1]]

predict_df = pd.DataFrame(data=predict_dataset)
predict_df.columns=['comment', 'sentiment']

# predict_df = transform_data(train_fn)
# predict_df = predict_df[predict_df['sentiment'] == 1]

predict_features = make_features(predict_df, label_list, myparam['MAX_SEQ_LENGTH'], tokenizer, 'comment', 'sentiment')

predict_input_fn = run_classifier.input_fn_builder(
        features=predict_features,
        seq_length=myparam['MAX_SEQ_LENGTH'],
        is_training=False,
        drop_remainder=False)

# Despite its name, result_dict is the lazy iterable returned by
# estimator.predict; with yield_single_examples=False it yields whole batches
# rather than one example at a time. Nothing runs until it is consumed.
result_dict = estimator.predict(input_fn=predict_input_fn, yield_single_examples=False)

In [None]:
# Consuming the iterable is what actually runs the prediction; the resulting
# list is displayed as the cell output.
list(result_dict)

# for (i, prediction) in enumerate(result_dict):
#     probabilities = prediction["probabilities"]
#     print(type(probabilities), probabilities)

In [None]:
# predict_df.count()

## Predict

In [None]:
# Predict on the dev split and dump one line per example:
# the class probabilities, the gold label and the text, tab-separated.
predict_df = transform_data(dev_fn)
num_actual_predict_examples = len(predict_df)

predict_labels = predict_df['sentiment']
predict_text = predict_df['comment']

predict_features = make_features(
    predict_df, label_list, myparam['MAX_SEQ_LENGTH'], tokenizer, 'comment', 'sentiment')

predict_input_fn = run_classifier.input_fn_builder(
    features=predict_features,
    seq_length=myparam['MAX_SEQ_LENGTH'],
    is_training=False,
    drop_remainder=False)

# Lazy iterable: predictions are computed while the loop below consumes it.
result = estimator.predict(input_fn=predict_input_fn)

output_predict_file = "test_results.tsv"
with tf.gfile.GFile(output_predict_file, "w") as writer:
    tf.logging.info("***** Predict results *****")
    for (i, prediction) in enumerate(result):
        probabilities = prediction["probabilities"]
        # Never write more rows than there are input examples.
        if i >= num_actual_predict_examples:
            break
        probability_columns = "\t".join(map(str, probabilities))
        output_line = (probability_columns
                       + '\t' + str(predict_labels[i])
                       + '\t' + predict_text[i]
                       + "\n")
        writer.write(output_line)

## Export Model

In [None]:
def serving_input_fn():
    """Build the serving input receiver used when exporting the model.

    Creates int32 placeholders for ``label_ids``, ``input_ids``,
    ``input_mask`` and ``segment_ids`` (the keys read by the model function)
    and wraps them with ``build_raw_serving_input_receiver_fn``. The sequence
    dimension is taken from the module-level ``myparam['MAX_SEQ_LENGTH']``.

    Returns:
        The object produced by calling the raw serving input receiver
        function built from those placeholders.
    """
    seq_len = myparam['MAX_SEQ_LENGTH']
    receiver_tensors = {
        'label_ids': tf.placeholder(tf.int32, [None], name='label_ids'),
        'input_ids': tf.placeholder(tf.int32, [None, seq_len], name='input_ids'),
        'input_mask': tf.placeholder(tf.int32, [None, seq_len], name='input_mask'),
        'segment_ids': tf.placeholder(tf.int32, [None, seq_len], name='segment_ids'),
    }

    build_receiver = tf.estimator.export.build_raw_serving_input_receiver_fn(receiver_tensors)
    return build_receiver()

In [None]:
# Export the trained model as a SavedModel under <OUTPUT_DIR>/exported, using
# the placeholders declared in serving_input_fn as the serving inputs.
export_dir = os.path.join(OUTPUT_DIR, 'exported')
# NOTE(review): private Estimator attribute; presumably disables the TPU export path - confirm.
estimator._export_to_tpu = False
estimator.export_savedmodel(export_dir, serving_input_fn)