# BERT Regressor

In [None]:
import tensorflow as tf
import modeling
import tokenization
import optimization
import run_regressor

## Defining args

In [None]:
# Root directory of the pre-trained BERT release. Every model artefact below
# is resolved relative to it, so the directory is spelled out exactly once.
BERT_BASE_DIR = '/home/ubuntu/bert_models/uncased_L-12_H-768_A-12'
bert_config_file = BERT_BASE_DIR + '/bert_config.json'
bert_config = modeling.BertConfig.from_json_file(bert_config_file)
vocab_file = BERT_BASE_DIR + '/vocab.txt'
INIT_CHECKPOINT = BERT_BASE_DIR + '/bert_model.ckpt'

# Input data directory and the directory used as the estimator's model_dir.
DATA_DIR = '/home/ubuntu/glue_data/ARD/'
OUTPUT_DIR = '/home/ubuntu/tmp/ard_regressor/'

# Sequence length, batch sizes and training schedule.
max_seq_length = 128
train_batch_size = 32
eval_batch_size = 8
predict_batch_size = 8
num_train_epochs = 3.0
warmup_proportion = 0.1

# Checkpoint frequency and TPU loop size (consumed by RunConfig / TPUConfig).
save_checkpoints_steps = 1000
iterations_per_loop = 1000

## Defining methods

In [None]:
def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, values):
    """Build a BERT encoder with a single-output linear regression head.

    Args:
        bert_config: `modeling.BertConfig` describing the encoder.
        is_training: Python bool; enables dropout in the encoder and on the
            pooled output.
        input_ids, input_mask, segment_ids: token-level inputs forwarded to
            `modeling.BertModel` (the input pipeline in this file supplies
            int32 tensors of shape [batch, seq_length]).
        values: regression targets, one per example.

    Returns:
        A `(loss, logits)` tuple: the mean-squared-error loss and the
        per-example predictions of shape [batch].
    """
    model = modeling.BertModel(
        config=bert_config,
        is_training=is_training,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids)

    # Pooled sequence representation, shape [batch, hidden_size].
    output_layer = model.get_pooled_output()
    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [1, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        "output_bias", [1], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_training:
            # I.e., 0.1 dropout
            output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        # Drop only the trailing unit dimension. An axis-less squeeze would
        # also remove the batch dimension when the batch holds one example,
        # leaving a scalar that no longer lines up with `values`.
        logits = tf.squeeze(logits, axis=-1)
        loss = tf.losses.mean_squared_error(values, logits)

    return (loss, logits)

In [None]:
def model_fn_builder(bert_config, init_checkpoint, learning_rate,
                     num_train_steps, num_warmup_steps, use_tpu=False):
  """Return a `model_fn` closure for `TPUEstimator`.

  Args:
    bert_config: `modeling.BertConfig` for the encoder.
    init_checkpoint: checkpoint path to warm-start from; falsy to skip.
    learning_rate: learning rate handed to `optimization.create_optimizer`.
    num_train_steps: total number of training steps.
    num_warmup_steps: number of warm-up steps.
    use_tpu: whether the graph is being built for a TPU.

  Returns:
    A function `model_fn(features, labels, mode, params)` that returns a
    `TPUEstimatorSpec` for the TRAIN, EVAL or PREDICT mode.
  """

  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""
    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    # Regression targets travel inside `features`; `labels` is unused.
    values = features["values"]

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    (total_loss, logits) = create_model(
        bert_config, is_training, input_ids, input_mask, segment_ids, values)

    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    scaffold_fn = None
    if init_checkpoint:
        (assignment_map, initialized_variable_names
              ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint)
        if use_tpu:

            # On TPU the restore is deferred to the scaffold function.
            def tpu_scaffold():
                tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
                return tf.train.Scaffold()

            scaffold_fn = tpu_scaffold
        else:
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:

        train_op = optimization.create_optimizer(
            total_loss, learning_rate, num_train_steps, num_warmup_steps,
            use_tpu)

        output_spec = tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            train_op=train_op,
            scaffold_fn=scaffold_fn)
    elif mode == tf.estimator.ModeKeys.EVAL:

        def metric_fn(values, logits):
            # Estimator metrics must be (value_op, update_op) pairs; a bare
            # loss tensor is rejected. A streaming MSE over targets and
            # predictions reports the same quantity as the loss, aggregated
            # over every evaluated example.
            eval_loss = tf.metrics.mean_squared_error(
                labels=tf.cast(values, logits.dtype), predictions=logits)
            return {
                "eval_loss": eval_loss,
            }

        eval_metrics = (metric_fn, [values, logits])
        output_spec = tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode,
            loss=total_loss,
            eval_metrics=eval_metrics,
            scaffold_fn=scaffold_fn)
    else:
        output_spec = tf.contrib.tpu.TPUEstimatorSpec(
            mode=mode, predictions=logits, scaffold_fn=scaffold_fn)
    return output_spec

  return model_fn


In [None]:
def input_fn_builder(features, seq_length, is_training, drop_remainder):
    """Build the `input_fn` closure that feeds in-memory features to TPUEstimator.

    Args:
        features: sequence of feature objects exposing `input_ids`,
            `input_mask`, `segment_ids` and `label_id`.
        seq_length: length of every token-level list.
        is_training: when true the dataset is repeated and shuffled.
        drop_remainder: forwarded to `Dataset.batch`.

    Returns:
        `input_fn(params)`, which reads `params["batch_size"]` and returns a
        batched `tf.data.Dataset` of feature dictionaries.
    """
    # Column-wise copies of the per-example fields.
    all_input_ids = [f.input_ids for f in features]
    all_input_mask = [f.input_mask for f in features]
    all_segment_ids = [f.segment_ids for f in features]
    all_label_ids = [f.label_id for f in features]

    def input_fn(params):
        """The actual input function."""
        batch_size = params["batch_size"]
        num_examples = len(features)
        token_shape = [num_examples, seq_length]

        def int_matrix(rows):
            # One int32 constant of shape [num_examples, seq_length].
            return tf.constant(rows, shape=token_shape, dtype=tf.int32)

        # This is for demo purposes and does NOT scale to large data sets. We do
        # not use Dataset.from_generator() because that uses tf.py_func which is
        # not TPU compatible. The right way to load data is with TFRecordReader.
        tensors = {
            "input_ids": int_matrix(all_input_ids),
            "input_mask": int_matrix(all_input_mask),
            "segment_ids": int_matrix(all_segment_ids),
            "values": tf.constant(
                all_label_ids, shape=[num_examples], dtype=tf.float32),
        }
        d = tf.data.Dataset.from_tensor_slices(tensors)

        if is_training:
            d = d.repeat().shuffle(buffer_size=100)

        return d.batch(batch_size=batch_size, drop_remainder=drop_remainder)

    return input_fn

## Construct Estimator

In [None]:
# Data reader for the ARD task and a tokenizer built from the vocabulary
# file; input text is lower-cased before tokenization.
processor = run_regressor.ArdProcessor()
tokenizer = tokenization.FullTokenizer(
    vocab_file=vocab_file,
    do_lower_case=True,
)

In [None]:
train_examples = processor.get_train_examples(DATA_DIR)
# Total optimizer steps: (examples / batch size) * epochs, truncated to int.
num_train_steps = int(
    len(train_examples) / train_batch_size * num_train_epochs)
# Share of those steps given to warm-up, as set by `warmup_proportion`.
num_warmup_steps = int(num_train_steps * warmup_proportion)

In [None]:
# TPU run configuration. No cluster resolver or master address is supplied,
# and the estimator below is created with use_tpu=False.
tpu_cluster_resolver = None
is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
run_config = tf.contrib.tpu.RunConfig(
    cluster=tpu_cluster_resolver,
    master=None,
    model_dir=OUTPUT_DIR,
    save_checkpoints_steps=save_checkpoints_steps,
    tpu_config=tf.contrib.tpu.TPUConfig(
        iterations_per_loop=iterations_per_loop,
        num_shards=None,
        per_host_input_for_training=is_per_host,
    ),
)


In [None]:
# Bind the model configuration and optimisation schedule into a model_fn;
# `use_tpu` keeps its default of False.
model_fn = model_fn_builder(
    bert_config=bert_config,
    init_checkpoint=INIT_CHECKPOINT,
    learning_rate=2e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
)

In [None]:
# TPUEstimator with the TPU path switched off (use_tpu=False).
estimator = tf.contrib.tpu.TPUEstimator(
    use_tpu=False,
    model_fn=model_fn,
    config=run_config,
    train_batch_size=train_batch_size,
    eval_batch_size=eval_batch_size,
    predict_batch_size=predict_batch_size,
)

## Training

In [None]:
train_examples = processor.get_train_examples(DATA_DIR)

In [None]:
train_features = run_regressor.convert_examples_to_features(train_examples,max_seq_length=max_seq_length,tokenizer=tokenizer)

In [None]:
train_input_fn = input_fn_builder(train_features,max_seq_length,is_training=True,drop_remainder=True)

In [None]:
estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)

## Evaluation

In [None]:
eval_examples = processor.get_eval_examples(DATA_DIR)

In [None]:
eval_features = run_regressor.convert_examples_to_features(eval_examples,max_seq_length=max_seq_length,tokenizer=tokenizer)

In [None]:
# This tells the estimator to run through the entire set.
eval_steps = None

In [None]:
eval_drop_remainder = False

## Prediction

In [None]:
# NOTE: this cell previously held only the undefined bare name `pre` (an
# unfinished statement), which raised NameError when run. It is now a no-op.

In [None]:
# Read the test split from the shared DATA_DIR setting rather than repeating
# the literal path, so all three splits follow the same configuration.
test_examples = processor.get_test_examples(DATA_DIR)

In [None]:
# Convert the test examples into model-ready feature objects.
test_features = run_regressor.convert_examples_to_features(
    test_examples,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
)

In [None]:
all_input_ids = []
all_input_mask = []
all_segment_ids = []
all_label_ids = []

In [None]:
for feature in test_features:
    all_input_ids.append(feature.input_ids)
    all_input_mask.append(feature.input_mask)
    all_segment_ids.append(feature.segment_ids)
    all_label_ids.append(feature.label_id)


In [None]:
# Materialise (at most) the first 50 test examples as constant tensors.
input_ids = tf.constant(all_input_ids[:50])
input_mask = tf.constant(all_input_mask[:50])
segment_ids = tf.constant(all_segment_ids[:50])
label_ids = tf.constant(all_label_ids[:50])

In [None]:
# Zip one per-example dataset per field into a dataset of 4-tuples:
# (input_ids, input_mask, segment_ids, label_ids).
datasets = tf.data.Dataset.zip(tuple(
    tf.data.Dataset.from_tensor_slices(tensor)
    for tensor in (input_ids, input_mask, segment_ids, label_ids)
))

In [None]:
# Inspect the element dtypes of the zipped dataset.
datasets.output_types

In [None]:
# Inspect the element shapes of the zipped dataset.
datasets.output_shapes

In [None]:
# Bare reference to the Estimator class; this cell computes nothing.
tf.estimator.Estimator

In [None]:
 = datasets.batch(batch_size=10)

In [None]:
# Show `batch`, which the preceding cell is expected to define.
batch

In [None]:
model = modeling.BertModel(
  config=bert_config,
  is_training=True,
  input_ids=input_ids,
  input_mask=input_mask,
  token_type_ids=segment_ids,
  use_one_hot_embeddings=False)

In [None]:
num_labels = 1
is_training = True

In [None]:
# Hand-built regression head on the pooled output of the model above.
output_layer = model.get_pooled_output()

hidden_size = output_layer.shape[-1].value

# Size the weights from `num_labels`, as the bias already is, so the two
# variables cannot disagree.
output_weights = tf.get_variable(
    "output_weights", [num_labels, hidden_size],
    initializer=tf.truncated_normal_initializer(stddev=0.02))

output_bias = tf.get_variable(
    "output_bias", [num_labels], initializer=tf.zeros_initializer())

with tf.variable_scope("loss"):
    if is_training:
        # I.e., 0.1 dropout
        output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    # Remove only the trailing unit dimension (num_labels == 1). An axis-less
    # squeeze would also drop a batch dimension of size one.
    logits = tf.squeeze(logits, axis=-1)

    loss = tf.losses.mean_squared_error(label_ids, logits)


In [None]:
# Evaluate the loss once. The context manager closes the session even if
# initialisation or evaluation raises. Variables take their initializer
# values here; this cell does not restore the pre-trained checkpoint.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    loss_ = sess.run(loss)

In [None]:
# Show the loss value computed in the previous cell.
loss_