<a href="https://colab.research.google.com/github/Theonimfi/Bert-based-preprocessing-techniques-acceleration/blob/main/src/Bert_based_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing the necessary modules:
1. TensorFlow is pinned to 1.15.0 because it must be compatible with bert-tensorflow 1.0.1. The BERT modules (e.g. `tokenization`) perform the tokenization of the input sentences.
2. We use the NLTK toolkit for our pre-processing tasks.
3. Libraries such as pandas and numpy are used for basic data operations.

In [None]:
pip install tensorflow==1.15.0

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
from sklearn.model_selection import train_test_split
import os

print("tensorflow version : ", tf.__version__)
print("tensorflow_hub version : ", hub.__version__)

In [None]:
!pip install bert-tensorflow==1.0.1

In [None]:
#Importing BERT modules
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

In [None]:
# Install the remaining dependencies, then import the data-handling and NLP libraries.
# NOTE(review): tensorflow-text and tf-models-official are not imported anywhere in
# the visible notebook, and `-U` may pull in a newer TensorFlow than the 1.15.0
# pinned above -- confirm these two installs are still needed.
!pip install numpy
!pip install pandas
!pip install nltk
!pip install -q -U tensorflow-text
!pip install -q tf-models-official
!pip install contractions
!pip install textblob

import numpy as np
import pandas as pd
import nltk

# Loading the Datasets :
We use a total of 4 datasets to test our preprocessing techniques by fine-tuning the BERT model.
 The datasets are defined as follows:
 1. DATASET_1: The Twitter Airline Dataset https://data.world/crowdflower/airline-twitter-sentiment
 2. DATASET_2: Amazon Consumer Reviews Dataset https://www.kaggle.com/datafiniti/consumer-reviews-of-amazon-products
 3. DATASET_3: Amazon Automotive Reviews Dataset https://jmcauley.ucsd.edu/data/amazon/
 4. DATASET_4: Twitter Sentiment Dataset for Self-driving cars https://data.world/crowdflower/sentiment-self-driving-cars

Set the flag of the dataset under test to "True" and all the others to "False".



In [None]:
DATASET_1 = True
DATASET_2 = False
DATASET_3 = False
DATASET_4 = False


def _drive_download_url(share_url):
  """Build a direct-download URL from a Google Drive share link.

  The file id is taken as the second-to-last '/'-separated segment of
  `share_url`, i.e. links shaped like
  https://drive.google.com/file/d/<file id>/view?usp=sharing.
  """
  # https://www.codegrepper.com/code-examples/python/how+to+read+csv+file+from+google+drive+on+google+colab+
  return 'https://drive.google.com/uc?export=download&id=' + share_url.split('/')[-2]


# The rest of the notebook needs exactly one dataset. Fail here with a clear
# message instead of a NameError on `df` later (no flag set) or a silent
# "last flag wins" mix of settings (several flags set).
if [DATASET_1, DATASET_2, DATASET_3, DATASET_4].count(True) != 1:
  raise ValueError('Set exactly one of DATASET_1, DATASET_2, DATASET_3, DATASET_4 to True.')

# AIRLINE TWITTER DATASET
if DATASET_1:
  URL = 'https://drive.google.com/file/d/10j1dwpPCk7XAHCTX7gczX2woXzMZ5Dva/view?usp=sharing'
  path = _drive_download_url(URL)
  df = pd.read_csv(path)
  df = df[['text','airline_sentiment']]
  DATA_COLUMN = 'text'
  LABEL_COLUMN = 'airline_sentiment'
  # Encode the sentiment strings as the integer class ids listed in label_list.
  cleanup_nums = {"airline_sentiment": {"positive": 1, "negative": 0,"neutral":2}}
  df = df.replace(cleanup_nums)
  # Every class id that can appear in LABEL_COLUMN.
  label_list = [0, 1, 2]
  dataset_name = 'D1_'

# DATAFININITI AMAZON CONSUMER REVIEWS
if DATASET_2:
  URL = 'https://drive.google.com/file/d/1EbWy4GSV_Ano6OlOiXZaNuqmJrnc95p5/view?usp=sharing'
  path = _drive_download_url(URL)
  df = pd.read_csv(path)
  df = df[['reviews.rating','reviews.text']]
  DATA_COLUMN = 'reviews.text'
  LABEL_COLUMN = 'reviews.rating'
  dataset_name = 'D2_'
  # Every class id that can appear in LABEL_COLUMN (star ratings).
  # NOTE(review): rows with a missing rating are not dropped here -- confirm the
  # CSV has none, since label_list only covers 1-5.
  label_list = [1, 2, 3, 4, 5]

# AMAZON AUTOMOTIVE
if DATASET_3:
  URL = 'https://drive.google.com/file/d/1-ST5Wffhx9ky56Qh7KEkbfhMd9MRY97d/view?usp=sharing'
  path = _drive_download_url(URL)
  # The file is JSON Lines: one review object per line.
  df = pd.read_json(path, lines=True)
  df = df[['reviewText','overall']]
  DATA_COLUMN = 'reviewText'
  LABEL_COLUMN = 'overall'
  dataset_name = 'D3_'
  # Every class id that can appear in LABEL_COLUMN (star ratings).
  label_list = [1, 2, 3, 4, 5]

# TWEET SENTIMENT OF SELF-DRIVING CARS
if DATASET_4:
  URL = 'https://drive.google.com/file/d/1XeIrsFJkOnAaly_YxyOQ8pj18b_yilTA/view?usp=sharing'
  path = _drive_download_url(URL)
  df = pd.read_csv(path,encoding = 'unicode_escape')
  df = df[['text','sentiment']]
  DATA_COLUMN = 'text'
  LABEL_COLUMN = 'sentiment'
  # Sentiment is stored as strings; map them to integer class ids, with
  # "not_relevant" becoming class 0.
  cleanup_nums = {"sentiment": {"1":1,"2":2,"3":3,"4":4,"5":5,"not_relevant": 0}}
  df = df.replace(cleanup_nums)
  # Every class id that can appear in LABEL_COLUMN.
  label_list = [0, 1, 2, 3, 4, 5]
  dataset_name = 'D4_'


df.head()



# Data Pre-processing techniques :

We evaluate each preprocessing technique individually and do not combine any of them in our testing. The results are also compared against a baseline with no preprocessing at all; for that baseline, set all of the flags below to "False".

In [None]:
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
import string
import re
from datetime import datetime
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
import contractions

nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Preprocessing switches. Each enabled step below is applied in the order its
# `if` block appears and overwrites `prepr`, so the run label only names the
# last enabled technique -- enable at most one per run.
LOWERCASE = False
REMOVE_NUMBERS = False
REMOVE_HASHTAGS = False
REMOVE_MENTIONS = False
REMOVE_PUNCTUATIONS = False
STEMMING = False

REMOVE_URLS = False
REPLACE_REPEAT_PUNCTUATION = False # repeated punctuation signals intense emotion, so we replace it with a word that captures the intensity of the text
HANDLE_CAPITALIZED_WORDS = False # capitalized words signal intense emotion, so we add a marker word before each all-caps word to capture the intensity of the text
EMOJIS = False # replace emojis with a word which indicates whether it is a positive or negative emoji (e.g. :) -> POS_EMOJI, :(( -> NEG_EMOJI)

# Additional techniques (choose which one to evaluate)
STOP_WORDS_REMOVAL = False
REPEATED_CHARACTERS_REMOVAL = False
LEMMATIZATION = False
EXTRA_SPACE_REMOVAL = False
CONTRACTED_WORDS_EXPANDING = False
print(f'Beginning Pre-processing!')
# Start timestamp; the elapsed preprocessing time is printed at the end of this cell.
current_time = datetime.now()



# Shared stemmer instance used by the STEMMING step.
porter = PorterStemmer()
# Run label: dataset prefix + technique name ('None' = baseline). It is later
# appended to the checkpoint directory path and written as the first value of
# the results row.
prepr = dataset_name + 'None'

# Working copy of the text column: missing values become '' and every entry is
# coerced to str so the string operations below never receive NaN.
df['processed_text'] = df[DATA_COLUMN].fillna('').apply(str)

if REPLACE_REPEAT_PUNCTUATION:
  prepr = dataset_name + 'replace_repeat_punctuation'
  # Raw-string patterns: the previous '\!' / '\?' were invalid *string* escapes
  # that only worked because Python passes unknown escapes through (with a
  # DeprecationWarning). The regexes themselves are unchanged: two or more
  # consecutive '!' (or '?') collapse into a single marker word.
  df['processed_text'] = df['processed_text'].apply(lambda x: re.sub(r'!!+', 'multiExclamationMark ', x))
  df['processed_text'] = df['processed_text'].apply(lambda x: re.sub(r'\?\?+', 'multiQuestionMark ', x))

if LOWERCASE:
  prepr = dataset_name +  'lowercase'
  df['processed_text'] = df['processed_text'].apply(lambda x: x.lower())

if REMOVE_MENTIONS:
  prepr = dataset_name + 'rmv_mentions'
  # Drop "@handle" tokens (ASCII letters, digits and underscores after the '@').
  df['processed_text'] = df['processed_text'].apply(lambda x: re.sub(r"@[A-Za-z0-9_]+", "", x))

if REMOVE_HASHTAGS:
  prepr = dataset_name +  'rmv_hashtags'
  # Drop the whole "#tag" token, not just the '#' sign.
  df['processed_text'] = df['processed_text'].apply(lambda x: re.sub(r"#[A-Za-z0-9_]+", "", x))

if REMOVE_URLS:
  prepr = dataset_name +  'rmv_urls'
  # Anything starting with "http" up to the next whitespace.
  df['processed_text'] = df['processed_text'].apply(lambda x: re.sub(r'http\S+', '', x))

if REMOVE_NUMBERS:
  prepr = dataset_name +  'rmv_numbers'
  df['processed_text'] = df['processed_text'].apply(lambda x: re.sub(r'[0-9]+', '', x))

if REMOVE_PUNCTUATIONS:
  prepr = dataset_name + 'rmv_punctuations'
  # Character-level filter against string.punctuation (ASCII punctuation only).
  df['processed_text'] = df['processed_text'].apply(lambda x: "".join([char for char in x if char not in string.punctuation]))

if HANDLE_CAPITALIZED_WORDS:
  # Label fixed: it used to be misspelled 'habdle_capitalized_words', which
  # ended up in the checkpoint directory name and in the results sheet.
  prepr = dataset_name + 'handle_capitalized_words'
  # Prefix every fully upper-case token longer than two characters with the
  # marker word "ALL_CAPS"; shorter tokens such as "I" or "US" are left alone.
  df['processed_text'] = df['processed_text'].apply(lambda x: " ".join(["ALL_CAPS " + word if (word.isupper() and len(word) > 2) else word for word in word_tokenize(x)]))

# if SPELLING_CORRECTION:
#   prepr = dataset_name + 'spelling correction'
#   df['processed_text'] = df['processed_text'].apply(lambda x: " ".join([str(TextBlob(word).correct()) for word in word_tokenize(x)]))

if EMOJIS:
  prepr = dataset_name +  'emojis'

  def find_emojis(text, neg_emoticons, pos_emoticons, neg_emojis, pos_emojis):
      """Replace emoji/emoticon tokens in `text` with sentiment marker words.

      `text` is split with `word_tokenize`; a token found in either negative
      collection becomes "NEG_EMOJI", otherwise a token found in either
      positive collection becomes "POS_EMOJI". All other tokens are kept and
      the result is re-joined with single spaces.

      The four collections only need to support `in`; passing sets makes each
      lookup constant-time.

      NOTE(review): word_tokenize may split multi-character emoticons such as
      ":)" into separate punctuation tokens, in which case they can never
      match the lexicon -- verify on a sample of tweets.
      """
      def _marker_for(token):
          # Negative lexicons take precedence when a token is in both.
          if token in neg_emojis or token in neg_emoticons:
              return "NEG_EMOJI"
          if token in pos_emojis or token in pos_emoticons:
              return "POS_EMOJI"
          return token

      return " ".join(_marker_for(token) for token in word_tokenize(text))

  def _load_emoticon_lexicon(share_url):
      """Download one tab-separated lexicon from Google Drive and return its first column as a list."""
      download_url = 'https://drive.google.com/uc?export=download&id=' + share_url.split('/')[-2]
      lexicon = pd.read_csv(download_url, sep="\t", header=None, usecols=[0])
      lexicon.columns = ["emoticon"]
      return lexicon["emoticon"].to_list()

  # files from here: https://github.com/modarwish1/sentimentr
  neg_emoticons = _load_emoticon_lexicon('https://drive.google.com/file/d/1IBTu3IgYECIl1SOdbPr4o8LUM1_avAas/view?usp=sharing')
  pos_emoticons = _load_emoticon_lexicon('https://drive.google.com/file/d/148G4WZablFzVQSmEak6DqABarn4EY2do/view?usp=sharing')
  neg_emojis = _load_emoticon_lexicon('https://drive.google.com/file/d/1FTKcH1ackudDaeDi-ZveA9eIUYkJhsRK/view?usp=sharing')
  pos_emojis = _load_emoticon_lexicon('https://drive.google.com/file/d/1QOLDjOjYtkCBWsGwjfJ4bBGGsDqfg3Ck/view?usp=sharing')

  # Build the lookup sets once, outside the per-row lambda, so every token
  # check is a hash lookup instead of a scan over the whole lexicon list.
  neg_emoticon_set = set(neg_emoticons)
  pos_emoticon_set = set(pos_emoticons)
  neg_emoji_set = set(neg_emojis)
  pos_emoji_set = set(pos_emojis)
  df['processed_text'] = df['processed_text'].apply(
      lambda x: find_emojis(x, neg_emoticon_set, pos_emoticon_set, neg_emoji_set, pos_emoji_set))

if STEMMING:
  # Reduce every NLTK token to its Porter stem and re-join with single spaces.
  prepr = dataset_name + 'stemming'
  df['processed_text'] = df['processed_text'].map(
      lambda text: " ".join(map(porter.stem, word_tokenize(text))))




column_0 = 'processed_text'
column_1 = LABEL_COLUMN
text_column = df[column_0]

# some prepard class and method
class RepeatReplacer(object):
  """Collapse repeated characters in a word until WordNet recognizes it.

  Example of the intended use: "looooove" is shortened one character at a
  time until it becomes "love", which has WordNet synsets and is returned.
  """

  def __init__(self):
    # (\w)\2 matches one doubled character; the replacement keeps a single
    # copy of it together with everything before and after the pair.
    self.regex = re.compile(r'(\w*)(\w)\2(\w*)')
    self.repl = r'\1\2\3'

  def replace(self, word):
    """Return `word` with repeated characters removed.

    Stops as soon as the current candidate has at least one WordNet synset,
    or when no doubled character is left to remove.

    Implemented as a loop rather than recursion: each pass can remove as
    little as one character, so a long run of repeated characters used to
    exceed Python's recursion limit and raise RecursionError.
    """
    while True:
      if wordnet.synsets(word):
        return word
      collapsed = self.regex.sub(self.repl, word)
      if collapsed == word:
        return word
      word = collapsed
def get_wordnet_pos(tag):
    """Map a POS tag string to the matching WordNet POS constant.

    Only the tag's prefix matters: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag returns None so the caller
    can choose its own fallback.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    return None



# start processing
# Every technique below transforms df[column_0] in one pass with Series.apply.
# A per-row `Series.replace(..., inplace=True)` inside a loop over the same
# Series rescans the whole column for every row, mutates it while it is being
# iterated, and relies on the in-place write reaching `df`.

def _remove_stop_words(text, stop_words):
  """Lower-case `text` and drop every space-separated word found in `stop_words`."""
  return ' '.join(word for word in text.lower().split(' ') if word not in stop_words)


def _collapse_repeated_characters(text, replacer):
  """Lower-case `text` and run `replacer.replace` on every space-separated word."""
  return ' '.join(replacer.replace(word) for word in text.lower().split(' '))


def _lemmatize_text(text, lemmatizer):
  """POS-tag the NLTK tokens of `text` and lemmatize each token with its tag."""
  lemmas = []
  for token, tag in pos_tag(word_tokenize(text)):
    # Tags without a WordNet counterpart are lemmatized as nouns.
    wordnet_pos = get_wordnet_pos(tag) or wordnet.NOUN
    lemmas.append(lemmatizer.lemmatize(token, pos=wordnet_pos))
  return ' '.join(lemmas)


def _expand_contractions(text):
  """Lower-case `text` and expand contractions word by word."""
  return ' '.join(contractions.fix(word) for word in text.lower().split(' '))


if STOP_WORDS_REMOVAL:
  prepr = dataset_name +  'stop_words_removal'
  stop_words = set(stopwords.words('english'))
  df[column_0] = df[column_0].apply(lambda text: _remove_stop_words(text, stop_words))
if REPEATED_CHARACTERS_REMOVAL:
  prepr = dataset_name + 'repeated_characters_removal'
  replacer = RepeatReplacer()
  df[column_0] = df[column_0].apply(lambda text: _collapse_repeated_characters(text, replacer))
if LEMMATIZATION:
  prepr = dataset_name +  'lemmatization'
  lemmatizer = WordNetLemmatizer()
  df[column_0] = df[column_0].apply(lambda text: _lemmatize_text(text, lemmatizer))
if EXTRA_SPACE_REMOVAL:
  prepr = dataset_name +  'extra_space_removal'
  # Collapse every run of spaces into a single space.
  df[column_0] = df[column_0].apply(lambda text: re.sub(' +', ' ', text))
if CONTRACTED_WORDS_EXPANDING:
  prepr = dataset_name +  'contracted_words_expanding'
  df[column_0] = df[column_0].apply(_expand_contractions)

# Keep the alias pointing at the processed column.
text_column = df[column_0]


df[DATA_COLUMN] = df['processed_text']

print("Pre-processing took time ", datetime.now() - current_time)

# with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
print(df)

In [None]:
df[DATA_COLUMN].head()

In [None]:
filtered_columns=[DATA_COLUMN, LABEL_COLUMN]
df= df.reindex(columns=filtered_columns)
df.head()


In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed: the preprocessing steps above never add, drop or reorder rows, so
# with the same seed every technique is trained and validated on the same
# 80/20 partition and the reported metrics are comparable across runs.
SPLIT_SEED = 42
train, val = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
train.head(5)

# Setting up the input to BERT
The data in our dataframe needs to be transformed into a format BERT understands. 
We create InputExample's using the constructor provided in the BERT library.

1. **text_a** is the text we want to classify, which in this case, is the Data Column field in our Dataframe.
2. **text_b** is used if we're training a model to understand the relationship between sentences (i.e. is **text_b** a translation of **text_a**? Is **text_b** an answer to the question asked by **text_a**?). This doesn't apply to our task of Sentiment Analysis, so we can leave **text_b** blank.
3. For our task, **label** is the sentiment label of the reviews in the dataset.

In [None]:

def _row_to_input_example(row):
  """Wrap one dataframe row as a single-sentence BERT InputExample (no guid, no text_b)."""
  return bert.run_classifier.InputExample(
      guid=None,
      text_a=row[DATA_COLUMN],
      text_b=None,
      label=row[LABEL_COLUMN])

# One InputExample per row, for the training and validation splits alike.
train_InputExamples = train.apply(_row_to_input_example, axis=1)

val_InputExamples = val.apply(_row_to_input_example, axis=1)

In [None]:

train_InputExamples

In [None]:
print("Row 0 - guid of training set : ", train_InputExamples.iloc[0].guid)
print("\n__________\nRow 0 - text_a of training set : ", train_InputExamples.iloc[0].text_a)
print("\n__________\nRow 0 - text_b of training set : ", train_InputExamples.iloc[0].text_b)
print("\n__________\nRow 0 - label of training set : ", train_InputExamples.iloc[0].label)

# Tokenizing the preprocessed text so that it fits the BERT data
We do the following in our case :
1. Tokenize it 
2. Break words into WordPieces 
3. Map the words to indexes using the vocab file that BERT provides (from the BERT TF Hub module)
4. Add special "CLS" and "SEP" tokens
5. Individual inputs are assigned with "index" and "segment" tokens.


In [None]:
# This is a path to an uncased (all lowercase) version of BERT
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
  """Build a FullTokenizer from the vocab file and casing flag of the Hub module.

  The module at BERT_MODEL_HUB is loaded in a throwaway graph, and its
  "tokenization_info" signature is evaluated once in a temporary session.
  """
  with tf.Graph().as_default():
    module = hub.Module(BERT_MODEL_HUB)
    info = module(signature="tokenization_info", as_dict=True)
    fetches = [info["vocab_file"], info["do_lower_case"]]
    with tf.Session() as session:
      vocab_path, lower_case = session.run(fetches)

  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_path, do_lower_case=lower_case)

tokenizer = create_tokenizer_from_hub_module()

In [None]:
print(tokenizer.tokenize(train_InputExamples.iloc[0].text_a))

In [None]:

# We'll set sequences to be at most 128 tokens long.
MAX_SEQ_LENGTH = 128

# Convert our train and validation features to InputFeatures that BERT understands.
train_features = bert.run_classifier.convert_examples_to_features(train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

val_features = bert.run_classifier.convert_examples_to_features(val_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

In [None]:
print("Sentence : ", train_InputExamples.iloc[0].text_a)
print("-"*30)
print("Tokens : ", tokenizer.tokenize(train_InputExamples.iloc[0].text_a))
print("-"*30)
print("Input IDs : ", train_features[0].input_ids)
print("-"*30)
print("Input Masks : ", train_features[0].input_mask)
print("-"*30)
print("Segment IDs : ", train_features[0].segment_ids)

# Fine tuning the model 

The function create_model is used to set up the model.
1. It loads the BERT tf hub module again to extract the computational graph.
2. We create a fully-connected layer on top of it to classify the sentiments labels.
3. This last layer is trained to adapt the BERT to the sentiment analysis task.

In [None]:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels, is_training=None):
  """Build the classification graph: BERT Hub module plus one softmax layer.

  Args:
    is_predicting: when true, only predictions are returned and no loss is built.
    input_ids, input_mask, segment_ids: tensors passed to the Hub module's
      "tokens" signature.
    labels: integer class ids; one-hot encoded with depth `num_labels` for the loss.
    num_labels: number of output classes.
    is_training: whether to apply dropout to the pooled output. Defaults to
      `not is_predicting`, so prediction never uses dropout. Callers that do
      not pass it still get dropout in EVAL mode; pass
      `is_training=(mode == TRAIN)` to disable it there as well.

  Returns:
    `(predicted_labels, log_probs)` when `is_predicting` is true, otherwise
    `(loss, predicted_labels, log_probs)`.
  """
  if is_training is None:
    is_training = not is_predicting

  bert_module = hub.Module(
      BERT_MODEL_HUB,
      trainable=True)
  bert_inputs = dict(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids)
  bert_outputs = bert_module(
      inputs=bert_inputs,
      signature="tokens",
      as_dict=True)

  # Use "pooled_output" for classification tasks on an entire sentence.
  # Use "sequence_outputs" for token-level output.
  output_layer = bert_outputs["pooled_output"]

  hidden_size = output_layer.shape[-1].value

  # Classification layer fine-tuned for the sentiment labels.
  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):

    # Dropout helps prevent overfitting, but it must not randomize the
    # outputs when the model is only being queried.
    if is_training:
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # argmax over the class axis already yields one id per example. No
    # tf.squeeze here: it would turn a batch of one into a scalar and lose the
    # batch dimension that labels and Estimator.predict expect.
    predicted_labels = tf.argmax(log_probs, axis=-1, output_type=tf.int32)
    # If we're predicting, we want predicted labels and the log-probabilities.
    if is_predicting:
      return (predicted_labels, log_probs)

    # If we're train/eval, compute the cross-entropy between the one-hot
    # labels and the predicted log-probabilities.
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, predicted_labels, log_probs)

In [None]:
def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
  """Returns a `model_fn` closure for `tf.estimator.Estimator`.

  Args:
    num_labels: number of output classes, forwarded to `create_model`.
    learning_rate, num_train_steps, num_warmup_steps: forwarded to
      `bert.optimization.create_optimizer` when training.
  """
  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """Build the EstimatorSpec for TRAIN, EVAL or PREDICT.

    The labels are read from `features["label_ids"]`; the `labels` and
    `params` arguments are unused.
    """

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)

    # PREDICT: no loss, only the per-example outputs.
    if is_predicting:
      (predicted_labels, log_probs) = create_model(
        is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)
      predictions = {
          # The key is kept for existing consumers, but the values are
          # log-probabilities (output of log_softmax), not probabilities.
          'probabilities': log_probs,
          'labels': predicted_labels
      }
      return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # TRAIN and EVAL share the loss.
    # NOTE(review): create_model is called without any train/eval distinction,
    # so the dropout it applies by default is also active during EVAL.
    (loss, predicted_labels, log_probs) = create_model(
      is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

    if mode == tf.estimator.ModeKeys.TRAIN:
      # The optimizer is only needed (and only built) when training.
      train_op = bert.optimization.create_optimizer(
          loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)
      return tf.estimator.EstimatorSpec(mode=mode,
        loss=loss,
        train_op=train_op)

    # EVAL: streaming metrics are only built here, where they are consumed.
    def metric_fn(label_ids, predicted_labels):
      """Accuracy plus confusion counts over the evaluation set.

      NOTE(review): the four tf.metrics confusion counts are defined for
      boolean inputs (labels and predictions are cast to bool), so with more
      than two classes they treat class 0 as negative and every other class
      as positive -- confirm that this is the intended reading.
      """
      accuracy = tf.metrics.accuracy(label_ids, predicted_labels)
      true_pos = tf.metrics.true_positives(
          label_ids,
          predicted_labels)
      true_neg = tf.metrics.true_negatives(
          label_ids,
          predicted_labels)
      false_pos = tf.metrics.false_positives(
          label_ids,
          predicted_labels)
      false_neg = tf.metrics.false_negatives(
          label_ids,
          predicted_labels)

      return {
          "eval_accuracy": accuracy,
          "true_positives": true_pos,
          "true_negatives": true_neg,
          "false_positives": false_pos,
          "false_negatives": false_neg
          }

    eval_metrics = metric_fn(label_ids, predicted_labels)
    return tf.estimator.EstimatorSpec(mode=mode,
      loss=loss,
      eval_metric_ops=eval_metrics)

  # Return the actual model function in the closure
  return model_fn

In [None]:
#OUTPUT_DIR = '/drive/MyDrive/DataMining_NLP_original'
import os

path = '/content/gdrive/MyDrive/Data Science and AI/Research in Data Mining/Training_NLP' + prepr
#path = '/drive/MyDrive/DataMining_NLP_rmv_punct'
OUTPUT_DIR = path
#OUTPUT_DIR = '/drive/MyDrive/DataMining_NLP'
#OUTPUT_DIR = '/drive/MyDrive/DataMining_NLP2'

In [None]:
# Fine-tuning hyperparameters and Estimator run configuration.
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where the learning rate is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 300
SAVE_SUMMARY_STEPS = 100

# Compute train and warmup steps from batch size
num_train_steps = int(len(train_features) / BATCH_SIZE * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)

# Specify output directory and number of checkpoint steps to save.
# (This config used to be constructed twice with identical arguments.)
run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS)

In [None]:
#from google.colab import drive
#drive.mount('/content/gdrive')

In [None]:
model_fn = model_fn_builder(
  num_labels=len(label_list),
  learning_rate=LEARNING_RATE,
  num_train_steps=num_train_steps,
  num_warmup_steps=num_warmup_steps)

estimator = tf.estimator.Estimator(
  model_fn=model_fn,
  config=run_config,
  params={"batch_size": BATCH_SIZE})

In [None]:

# Create an input function for training. drop_remainder = True for using TPUs.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False)

# Create an input function for validating. drop_remainder = True for using TPUs.
val_input_fn = run_classifier.input_fn_builder(
    features=val_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False)


In [None]:
#Training the model
print(f'Beginning Training!')
current_time = datetime.now()
training_output = estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)

In [None]:
print(training_output)

In [None]:
output = estimator.evaluate(input_fn=val_input_fn, steps=None)

In [None]:
!pip install --upgrade gspread

# Output
Appends the evaluation results of each individual preprocessing run as a new row to the existing Google Sheet named "Bert results".


In [None]:
from google.colab import auth
auth.authenticate_user()

import gspread
from oauth2client.client import GoogleCredentials

# Authorize gspread with the credentials of the Colab user authenticated above.
gc = gspread.authorize(GoogleCredentials.get_application_default())

# Open the existing results spreadsheet.
spreadsheet = gc.open('Bert results')

# One row per run: the run label followed by every evaluation metric as text,
# in the order the metrics appear in `output`.
values = [prepr] + [str(metric_value) for metric_value in output.values()]

append_params = {'valueInputOption': 'USER_ENTERED'}
append_body = {'values': [values]}
spreadsheet.values_append('Sheet1', append_params, append_body)

In [None]:
import shutil

# Remove only this run's checkpoint directory. Deleting all of /content/gdrive
# would also erase the contents of a mounted Google Drive if the (currently
# commented-out) drive.mount cell is ever enabled.
shutil.rmtree(OUTPUT_DIR, ignore_errors=True)