## Install Tensorflow

In [103]:
#https://colab.research.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb#scrollTo=ynDmeatCWLJK
!pip install bert-tensorflow



## Check that TPU is being used

In [104]:
import datetime
import json
import os
import pprint
import random
import string
import sys
import tensorflow as tf
import bert
import numpy as np



assert 'COLAB_TPU_ADDR' in os.environ, 'ERROR: Not connected to a TPU runtime!'
TPU_ADDRESS = 'grpc://' + os.environ['COLAB_TPU_ADDR']
print('TPU address is', TPU_ADDRESS)

from google.colab import auth
auth.authenticate_user()
with tf.Session(TPU_ADDRESS) as session:
  print('TPU devices:')
  pprint.pprint(session.list_devices())

  # Upload credentials to TPU.
  with open('/content/adc.json', 'r') as f:
    auth_info = json.load(f)
  tf.contrib.cloud.configure_gcs(session, credentials=auth_info)
  # Now credentials are set for all future sessions on this TPU.

TPU address is grpc://10.124.108.66:8470
TPU devices:
[_DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:CPU:0, CPU, -1, 16449318307470296195),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 3599827788311202100),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 321353889036299835),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:1, TPU, 17179869184, 9236739624978311139),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:2, TPU, 17179869184, 6577049995452220479),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:3, TPU, 17179869184, 10360541729901226601),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:4, TPU, 17179869184, 3004189358294271403),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:5, TPU, 17179869184, 9307654225121160783),
 _DeviceAttributes(/job:tpu_worker/replica:0/task:0/device:TPU:6, TPU, 17179869184, 18329039505750

## Clone the BERT files

In [0]:
import sys

!test -d bert_repo || git clone https://github.com/google-research/bert bert_repo
if not 'bert_repo' in sys.path:
  sys.path += ['bert_repo']

# import python modules defined by BERT
import modeling
import optimization
import run_classifier
import run_classifier_with_tfhub
import tokenization

# import tfhub 
import tensorflow_hub as hub

## Authenticat Google Cloud Storage locations for saving files

In [106]:
!pip install gcsfs



In [107]:
# Set the output directory for saving model file
# Optionally, set a GCP bucket location

FILE_OUTPUT_DIR = 'bert_large_assess_files'#@param {type:"string"}
#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = True #@param {type:"boolean"}
#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
USE_BUCKET = True #@param {type:"boolean"}
BUCKET = 'dissertation_bucket' #@param {type:"string"}

if USE_BUCKET:
  FILE_OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, FILE_OUTPUT_DIR)
  from google.colab import auth
  auth.authenticate_user()

if DO_DELETE:
  try:
    tf.gfile.DeleteRecursively(FILE_OUTPUT_DIR)
  except:
    # Doesn't matter if the directory didn't exist
    pass
tf.gfile.MakeDirs(FILE_OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(FILE_OUTPUT_DIR))

***** Model output directory: gs://dissertation_bucket/bert_large_assess_files *****


In [109]:
# Set the output directory for saving model file
# Optionally, set a GCP bucket location

OUTPUT_DIR = 'bert_large_assess'#@param {type:"string"}
#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = True #@param {type:"boolean"}
#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
USE_BUCKET = True #@param {type:"boolean"}
BUCKET = 'dissertation_bucket' #@param {type:"string"}

if USE_BUCKET:
  OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR)
  from google.colab import auth
  auth.authenticate_user()

if DO_DELETE:
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except:
    # Doesn't matter if the directory didn't exist
    pass
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))

***** Model output directory: gs://dissertation_bucket/bert_large_assess *****


## Select the BERT model to be used (Base or Large)

In [0]:
# Available pretrained model checkpoints:
#   uncased_L-12_H-768_A-12: uncased BERT base model
#   uncased_L-24_H-1024_A-16: uncased BERT large model
#   cased_L-12_H-768_A-12: cased BERT large model
BERT_MODEL = 'uncased_L-24_H-1024_A-16' #@param {type:"string"}
BERT_MODEL_HUB = 'https://tfhub.dev/google/bert_' + BERT_MODEL + '/1'

## Create a tokenizer

In [111]:
tokenizer = run_classifier_with_tfhub.create_tokenizer_from_hub_module(BERT_MODEL_HUB)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


## Import the Train and Eval datasets from GitHub

In [0]:
! wget -cq https://github.com/ScottJK-20190706/tweet_classifier/blob/master/pickle_files//train_data_formatted.pickle?raw=true
! wget -cq https://github.com/ScottJK-20190706/tweet_classifier/blob/master/pickle_files/eval_data_formatted.pickle?raw=true

import pandas as pd

train_data = pd.read_pickle('train_data_formatted.pickle?raw=true')
eval_data = pd.read_pickle('eval_data_formatted.pickle?raw=true')
all_data = train_data.append(eval_data).reset_index()

DATA_COLUMN = 'Tweet'
LABEL_COLUMN = 'class'
# label_list is the list of labels, i.e. True, False or 0, 1 or 'dog', 'cat'
label_list = [0, 1]

## Set up the configurations

In [0]:
TRAIN_BATCH_SIZE = 32
EVAL_BATCH_SIZE = 8
PREDICT_BATCH_SIZE = 8
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 4.0
MAX_SEQ_LENGTH = 55
# Warmup is a period of time where hte learning rate 
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 1000
SAVE_SUMMARY_STEPS = 500

#processor = processors[TASK.lower()]()
label_list = [0, 1]
#label_list = processor.get_labels()

## 10 stratified splits of the data.  Train model and assess 10 times.

In [113]:
model_run = []
name = []
tn = []
fp = []
fn = []
tp = []
p = []
r = []
f_1 = []
auc_sc = []
acc = []

n = 0

from sklearn.model_selection import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=0)
for train_index, test_index in sss.split(all_data['Tweet'], all_data['class']):

  # Set the output directory for saving model file
  # Optionally, set a GCP bucket location

  OUTPUT_DIR = 'bert_large_assess'#@param {type:"string"}
  #@markdown Whether or not to clear/delete the directory and create a new one
  DO_DELETE = True #@param {type:"boolean"}
  #@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
  USE_BUCKET = True #@param {type:"boolean"}
  BUCKET = 'dissertation_bucket' #@param {type:"string"}

  if USE_BUCKET:
    OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR)
    from google.colab import auth
    auth.authenticate_user()

  if DO_DELETE:
    try:
      tf.gfile.DeleteRecursively(OUTPUT_DIR)
    except:
      # Doesn't matter if the directory didn't exist
      pass
  tf.gfile.MakeDirs(OUTPUT_DIR)
  print('***** Model output directory: {} *****'.format(OUTPUT_DIR))


  td = all_data.loc[train_index,:]
  #td = train_data
  ed = all_data.loc[test_index,:]
  #ed =
  print(td.shape)
  print(ed.shape)

  # Compute number of train and warmup steps from batch size
  # Use the InputExample class from BERT's run_classifier code to create examples from the data
  train_examples = td.apply(lambda x: run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example
                                                                    text_a = x[DATA_COLUMN], 
                                                                    text_b = None, 
                                                                    label = x[LABEL_COLUMN]), axis = 1)
  num_train_steps = int(len(train_examples) / TRAIN_BATCH_SIZE * NUM_TRAIN_EPOCHS)
  num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

  ##### FORMAT THE EVALUATION DATASET #####
  eval_examples = ed.apply(lambda x: run_classifier.InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this example
                                                                    text_a = x[DATA_COLUMN], 
                                                                    text_b = None, 
                                                                    label = x[LABEL_COLUMN]), axis = 1)

  # Setup TPU related config
  tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(TPU_ADDRESS)
  NUM_TPU_CORES = 8
  ITERATIONS_PER_LOOP = 1000

  def get_run_config(output_dir):
    return tf.contrib.tpu.RunConfig(
      cluster=tpu_cluster_resolver,
      model_dir=output_dir,
      save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
      tpu_config=tf.contrib.tpu.TPUConfig(
          iterations_per_loop=ITERATIONS_PER_LOOP,
          num_shards=NUM_TPU_CORES,
          per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2))

  # Force TF Hub writes to the GS bucket we provide.
  os.environ['TFHUB_CACHE_DIR'] = OUTPUT_DIR


  ##### TENSORFLOW ESTIMATOR OBJECT #####
  model_fn = run_classifier_with_tfhub.model_fn_builder(
    num_labels=len(label_list),
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=True,
    bert_hub_module_handle=BERT_MODEL_HUB
  )

  estimator_from_tfhub = tf.contrib.tpu.TPUEstimator(
    use_tpu=True,
    model_fn=model_fn,
    config=get_run_config(OUTPUT_DIR),
    train_batch_size=TRAIN_BATCH_SIZE,
    eval_batch_size=EVAL_BATCH_SIZE,
    predict_batch_size=PREDICT_BATCH_SIZE,
  )


  # FUNCTION FOR TRAINING THE MODEL #####
  def model_train(estimator):
        # We'll set sequences to be at most 128 tokens long.
    train_features = run_classifier.convert_examples_to_features(
        train_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    print('***** Started training at {} *****'.format(datetime.datetime.now()))
    print('  Num examples = {}'.format(len(train_examples)))
    print('  Batch size = {}'.format(TRAIN_BATCH_SIZE))
    tf.logging.info("  Num steps = %d", num_train_steps)
    train_input_fn = run_classifier.input_fn_builder(
        features=train_features,
        seq_length=MAX_SEQ_LENGTH,
        is_training=True,
        drop_remainder=True)
    estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    print('***** Finished training at {} *****'.format(datetime.datetime.now()))


  model_train(estimator_from_tfhub)


  ##### FUNCTION FOR MAKING PRESICTIONS #####
  def model_predict(estimator):
    # Make predictions on a subset of eval examples
    #prediction_examples = processor.get_dev_examples(TASK_DATA_DIR)[:PREDICT_BATCH_SIZE]
    input_features = run_classifier.convert_examples_to_features(eval_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=True)
    predictions = estimator.predict(predict_input_fn)
    
    pred_0 = []
    actual = []

    for example, prediction in zip(eval_examples, predictions):
      print('text_a: %s\ntext_b: %s\nlabel:%s\nprediction:%s\n' % (example.text_a, example.text_b, str(example.label), prediction['probabilities']))
      pred_0 = np.append(pred_0,prediction['probabilities'])
      actual = np.append(actual,example.label)
    
    return(pred_0, actual)

  ##### MAKE THE PREDICTIONS  ##### 
  pred_0, actual = model_predict(estimator_from_tfhub)


  ##### GET THE PREDICTION RESULTS  ##### 
  i=0
  pred = []
  while i <= len(pred_0)-1:
    if pred_0[i]<=pred_0[i+1]:
      pred = np.append(pred,1)
    else:
      pred = np.append(pred,0)
    i = i+2

  from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, auc, roc_curve, accuracy_score #metrics

  conf_matrix = confusion_matrix(actual,pred) #build confusion matrix
  precision = precision_score(actual,pred) #calculate precision
  recall = recall_score(actual,pred) #calculate recall
  f1 = f1_score(actual,pred) #calculate f1
  fpr, tpr, thresholds = roc_curve(actual,pred)
  auc_score = auc(fpr, tpr) #calculate auc
  accuracy = accuracy_score(actual,pred) #calculate accuracy

  model_run = np.append(model_run,n)
  name = np.append(name,'bert_large')
  tn = np.append(tn,conf_matrix[0][0])
  fp = np.append(fp,conf_matrix[0][1])
  fn = np.append(fn,conf_matrix[1][0])
  tp = np.append(tp,conf_matrix[1][1])
  p = np.append(p,precision)
  r = np.append(r,recall)
  f_1 = np.append(f_1,f1)
  auc_sc = np.append(auc_sc,auc_score)
  acc = np.append(acc,accuracy)

  n = n+1



Output hidden; open in https://colab.research.google.com to view.

In [0]:
metrics = pd.DataFrame({
  'model_run': model_run,  
  'name' : name,
  'tn' : tn,
  'fp' : fp,
  'fn' : fn,
  'tp' : tp,
  'p' : p,
  'r' : r,
  'f_1' : f_1,
  'auc_sc' : auc_sc,
  'acc' : acc  
})


In [115]:
metrics

Unnamed: 0,model_run,name,tn,fp,fn,tp,p,r,f_1,auc_sc,acc
0,0.0,bert_large,845.0,8.0,6.0,162.0,0.952941,0.964286,0.95858,0.977454,0.986288
1,1.0,bert_large,838.0,15.0,14.0,154.0,0.911243,0.916667,0.913947,0.949541,0.971596
2,2.0,bert_large,841.0,12.0,24.0,144.0,0.923077,0.857143,0.888889,0.921537,0.96474
3,3.0,bert_large,846.0,7.0,14.0,154.0,0.956522,0.916667,0.93617,0.95423,0.979432
4,4.0,bert_large,843.0,10.0,9.0,159.0,0.940828,0.946429,0.94362,0.967353,0.981391
5,5.0,bert_large,838.0,15.0,9.0,159.0,0.913793,0.946429,0.929825,0.964422,0.976494
6,6.0,bert_large,835.0,18.0,9.0,159.0,0.898305,0.946429,0.921739,0.962663,0.973555
7,7.0,bert_large,844.0,9.0,9.0,159.0,0.946429,0.946429,0.946429,0.967939,0.98237
8,8.0,bert_large,841.0,12.0,9.0,159.0,0.929825,0.946429,0.938053,0.96618,0.979432
9,9.0,bert_large,842.0,11.0,12.0,156.0,0.934132,0.928571,0.931343,0.957838,0.977473


## Save metrics and predictions

In [116]:
metrics.to_csv('gs://dissertation_bucket/bert_large_assess_files/metrics.csv')

!gsutil cp /tmp/prediction_summary.pickle gs://dissertation_bucket/bert_large_assess_files/



CommandException: No URLs matched: /tmp/prediction_summary.pickle
