In [None]:
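# Setup (uncomment on a fresh runtime): the notebook imports the `bert` package from bert-tensorflow.
#%pip install bert-tensorflow==1.0.1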

In [None]:
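# Setup (uncomment on a fresh runtime): TensorFlow 1.x is required (tf.gfile, tf.Session and tf.get_variable are used below).
#%pip install tensorflow==1.15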

In [None]:
import re
import sys
import math
import string
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
# Translation table for str.translate(): every ASCII punctuation character
# maps to None, i.e. it is deleted from the text.
table = str.maketrans('', '', string.punctuation)

In [None]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

#### Appending the legal details to the summarized documents

In [None]:
def legal_data_appending(df):
    '''
    Build the model input text by prefixing each summary with its legal details.

    For every row, punctuation is stripped from 'Summarized_content' (using the
    module-level ``table``) and the result is appended to 'Legal_Details',
    separated by a single space. The combined strings are stored in a new
    'Summarized_Content_LegalDetails' column.

    Note: ``df`` is modified in place and is also returned.
    '''
    combined_text = [
        legal_details + ' ' + summary.translate(table)
        for legal_details, summary in zip(df['Legal_Details'], df['Summarized_content'])
    ]
    df['Summarized_Content_LegalDetails'] = combined_text
    return df

In [None]:
# Input file: the 'binary' variant of the augmented LSA summaries.
# To use another variant, point read_csv at one of:
#   tfidf: Thesis - Dataset and Transformations/transform - post text augmentation/lsa_tfidf_augmentation.csv
#   tf:    Thesis - Dataset and Transformations/transform - post text augmentation/lsa_tf_augmentation.csv
df = pd.read_csv(
    'Thesis - Dataset and Transformations/transform - post text augmentation/lsa_binary_augmentation.csv'
).fillna('No text')  # every missing cell, in any column, becomes the string 'No text'
df = legal_data_appending(df)
df.tail(5)

## Create an output directory 
While fine-tuning the model, we will save the training checkpoints 
and the model in an output directory so that we can use the trained model for our predictions later.

In [None]:
 # Set the output directory for saving model file
OUTPUT_DIR = '/content/drive/My Drive/BERT checkpoints' 

# When True, OUTPUT_DIR is wiped before training so the run starts from scratch.
DO_DELETE = True #@param {type:"boolean"}

if DO_DELETE:
  try:
    tf.gfile.DeleteRecursively(OUTPUT_DIR)
  except tf.errors.NotFoundError:
    # Nothing to delete yet (e.g. first run) - that is fine.
    pass
  except tf.errors.OpError as err:
    # Stay best-effort, but do not hide the failure: checkpoints left behind
    # in OUTPUT_DIR would be reused by an Estimator whose model_dir points here.
    print('***** WARNING: could not clear {}: {} *****'.format(OUTPUT_DIR, err))

tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))

## Train, Validation and Test data splitting

In [None]:
# Two successive 80/20 splits with a fixed seed: 20% of all rows are held out
# for testing, then 20% of the remaining rows are held out for validation.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)

# Renumber the rows of every split from 0, discarding the original index.
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

print("Training Set Shape :", train_data.shape)
print("Validation Set Shape :", val_data.shape)
print("Testing Set Shape :", test_data.shape)

In [None]:
# Column holding the model input text, and column holding the class label.
DATA_COLUMN_A = 'Summarized_Content_LegalDetails'
LABEL_COLUMN = 'Labels'

# Sorted distinct label values found in the full dataset.
label_list = list(np.unique(df[LABEL_COLUMN]))

## Data Preprocessing
The BERT model accepts only a specific type of input, and the datasets are usually structured to have the following four features:
* guid : A unique id that represents an observation.
* text_a : The text we need to classify into given categories
* text_b: It is used when we're training a model to understand the relationship between sentences and it does not apply for classification problems.
* label: It consists of the labels or classes or categories that a given text belongs to.
 
Our dataset provides text_a and label. The following code block uses the InputExample class provided by the BERT library to create one object holding the above-mentioned features for every record in the training and validation sets.

In [None]:

def _row_to_input_example(row):
    """Wrap one DataFrame row as a single-text BERT InputExample."""
    return bert.run_classifier.InputExample(
        guid=None,
        text_a=row[DATA_COLUMN_A],
        text_b=None,  # no second text: plain classification
        label=row[LABEL_COLUMN])


# One InputExample per row, returned as a pandas Series aligned with each split.
train_InputExamples = train_data.apply(_row_to_input_example, axis=1)
val_InputExamples = val_data.apply(_row_to_input_example, axis=1)


In [None]:
# Sanity check: show the fields of the first training InputExample.
first_train_example = train_InputExamples.iloc[0]
print("Row 0 - guid of training set : ", first_train_example.guid)
print("\n__________\nRow 0 - text_a of training set : ", first_train_example.text_a)
print("\n__________\nRow 0 - label of training set : ", first_train_example.label)

## Loading vocab and pretrained BERT model
We will use the ```bert_uncased_L-12_H-768_A-12/1``` model. To check all available versions click [here](https://tfhub.dev/s?network-architecture=transformer&publisher=google).
We will use the vocab.txt file shipped with the model to map the words in the dataset to indices. The loaded BERT model was trained on uncased (lowercase) data, so the text we feed it must be lowercased as well; the tokenizer is configured with the module's `do_lower_case` flag for this purpose.

In [None]:

# TF-Hub handle of the pretrained uncased BERT module used throughout this notebook
# (tokenizer construction and model creation both load it).
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
    """Build a FullTokenizer configured from the pretrained BERT hub module.

    The module's "tokenization_info" signature is evaluated in a throw-away
    graph and session to obtain the vocabulary file and the lower-casing flag,
    so the tokenizer is set up with the values the module itself reports.

    Returns:
        A bert.tokenization.FullTokenizer built from those two values.
    """
    with tf.Graph().as_default():
        # Operation nodes creation.
        bert_module = hub.Module(BERT_MODEL_HUB)
        tokenization_info = bert_module(signature="tokenization_info", as_dict=True)

        # Evaluate the two tensors to get plain Python values.
        with tf.Session() as sess:
            vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"],
                                                  tokenization_info["do_lower_case"]])

    return bert.tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)

# Build the tokenizer once; it is reused for every feature conversion in this notebook.
tokenizer = create_tokenizer_from_hub_module()

## Converting examples to features
We will now format our text into the input features that the BERT model expects. We will also set a maximum sequence length, which determines the length of the input features.

In [None]:
# Maximum sequence length, in tokens, handed to the feature conversion.
MAX_SEQ_LENGTH = 512

# Turn the training and validation InputExamples into the InputFeatures BERT consumes.
train_features = run_classifier.convert_examples_to_features(
    train_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)
val_features = run_classifier.convert_examples_to_features(
    val_InputExamples, label_list, MAX_SEQ_LENGTH, tokenizer)

In [None]:
# Show one training observation next to its tokens and features.
# Position 3509 is used when the training set is large enough; otherwise the
# last observation is shown instead of failing with an IndexError.
i = min(3509, len(train_features) - 1)
print("Sentence : ", train_InputExamples.iloc[i].text_a)
print("-"*30)
print("Tokens : ", tokenizer.tokenize(train_InputExamples.iloc[i].text_a))
print("-"*30)
print("Input IDs : ", train_features[i].input_ids)
print("-"*30)
print("Input Masks : ", train_features[i].input_mask)
print("-"*30)
print("Segment IDs : ", train_features[i].segment_ids)

### **Pre-requisite understanding of create_model function** - reference: [How to use TensorFlow Hub with code examples](https://medium.com/@prasad.pai/how-to-use-tensorflow-hub-with-code-examples-9100edec29af)

**1) Module Instantiation**: The TensorFlow Hub website hosts modules built from different models (Inception, ResNet, ELMo, etc.) that serve different purposes (image classification, text embeddings, etc.). The user browses the catalogue of modules and, once a model has been chosen for the task at hand, copies the URL where it is hosted. The module can then be instantiated like this:



```
import tensorflow_hub as hub
module = hub.Module(<<Module URL as string>>, trainable=True)
```
Apart from the URL parameter, the other most notable parameter is ‘trainable’. If user wishes to fine-tune/modify the weights of the model, this parameter has to be set as True.

**2) Signature**: The signature of a module specifies the purpose for which the module is being used. Every module comes with a 'default' signature, which is used when no signature is explicitly given.



```
module = hub.Module('https://tfhub.dev/google/imagenet/inception_v3/classification/1')
print(module.get_signature_names())
# ['default', 'image_classification', 'image_feature_vector']
```
**3) Expected inputs:** Each module has a set of expected inputs that depends on the signature being used.


```
module = hub.Module('https://tfhub.dev/google/imagenet/inception_v3/classification/1')
print(module.get_input_info_dict())   # When no signature is given, considers it as 'default'
# {'images': <hub.ParsedTensorInfo shape=(?, 299, 299, 3) dtype=float32 is_sparse=False>}

print(module.get_input_info_dict(signature='image_feature_vector'))
# {'images': <hub.ParsedTensorInfo shape=(?, 299, 299, 3) dtype=float32 is_sparse=False>}
```
**4) Expected outputs:** In order to build the remaining part of the graph after the TensorFlow Hub’s model is built, it is necessary to know the expected type of output. get_output_info_dict() function is used for this purpose. 


```
module = hub.Module('https://tfhub.dev/google/imagenet/inception_v3/classification/1')
print(module.get_output_info_dict())  # When no signature is given, considers it as 'default'
# {'default': <hub.ParsedTensorInfo shape=(?, 1001) dtype=float32 is_sparse=False>}

print(module.get_output_info_dict(signature='image_classification'))
```




In [None]:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
    """Build the classification graph: pretrained BERT plus one linear softmax layer.

    Args:
        is_predicting: True when the graph is built for prediction. Selects the
            hub module tags, whether dropout is applied and which tuple is returned.
        input_ids, input_mask, segment_ids: tensors fed to the "tokens"
            signature of the hub module.
        labels: label ids; one-hot encoded with depth ``num_labels``.
        num_labels: number of classes (rows of the output weight matrix).

    Returns:
        ``(predicted_labels, log_probs, pooled_output)`` when ``is_predicting``,
        otherwise ``(loss, predicted_labels, log_probs)``. ``log_probs`` is the
        log-softmax of the logits and ``pooled_output`` is the module's
        "pooled_output" before dropout.
    """
    # The "train" graph variant of the module is requested for TRAIN and EVAL.
    tags = set()
    if not is_predicting:
        tags.add("train")
    bert_module = hub.Module(
        BERT_MODEL_HUB,
        tags=tags,
        trainable=True)
    bert_inputs = dict(
        input_ids=input_ids,
        input_mask=input_mask,
        segment_ids=segment_ids)
    bert_outputs = bert_module(
        inputs=bert_inputs,
        signature="tokens",
        as_dict=True)

    # Kept untouched so it can be returned as-is in prediction mode.
    pooled_output = bert_outputs["pooled_output"]

    hidden_size = pooled_output.shape[-1].value

    # Classification layer: logits = x . W^T + b
    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        "output_bias", [num_labels], initializer=tf.zeros_initializer())

    with tf.variable_scope("loss"):
        if is_predicting:
            # No dropout at prediction time: predictions must be deterministic.
            output_layer = pooled_output
        else:
            # NOTE: is_predicting is also False in EVAL mode, so dropout is
            # still active during evaluation, as it was before.
            output_layer = tf.nn.dropout(pooled_output, keep_prob=0.9)

        logits = tf.matmul(output_layer, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)
        one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)
        predicted_labels = tf.squeeze(tf.argmax(log_probs, axis=-1, output_type=tf.int32))

        if is_predicting:
            return (predicted_labels, log_probs, pooled_output)

        # Cross-entropy: negative log-probability of the true class, averaged over the batch.
        per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, predicted_labels, log_probs)

In [None]:

def model_fn_builder(num_labels, learning_rate, num_train_steps, num_warmup_steps):
    '''
    Create the ``model_fn`` closure handed to ``tf.estimator.Estimator``.

    Args:
        num_labels: number of classes, forwarded to ``create_model``.
        learning_rate, num_train_steps, num_warmup_steps: forwarded to
            ``bert.optimization.create_optimizer`` when training.

    Returns:
        A ``model_fn(features, labels, mode, params)`` that builds the graph
        for training, evaluation or prediction depending on ``mode``.
    '''

    def model_fn(features, labels, mode, params):
        """Build the EstimatorSpec for ``mode``.

        All inputs, including the label ids, are read from ``features``; the
        ``labels`` and ``params`` arguments are not used here.
        """
        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        label_ids = features["label_ids"]

        is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)

        # PREDICT: the only required field is predictions.
        if is_predicting:
            (predicted_labels, log_probs, output_layer) = create_model(
                is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

            predictions = {
                # NOTE: despite the key name these are log-probabilities
                # (third element of create_model's prediction tuple is the pooled output).
                'probabilities': log_probs,
                'labels': predicted_labels,
                'pooled_output': output_layer
            }
            return tf.estimator.EstimatorSpec(mode, predictions=predictions)

        # TRAIN and EVAL share the same graph up to the loss.
        (loss, predicted_labels, log_probs) = create_model(
            is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

        # TRAIN: required fields are loss and train_op. The optimizer is only
        # built here, since evaluation never runs a training step.
        if mode == tf.estimator.ModeKeys.TRAIN:
            train_op = bert.optimization.create_optimizer(
                loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)
            return tf.estimator.EstimatorSpec(mode=mode,
                                              loss=loss,
                                              train_op=train_op)

        # EVAL: required field is loss; metrics are reported in addition.
        # NOTE(review): apart from accuracy, these tf.metrics functions cast
        # their inputs to bool, so with more than two classes they count
        # "label id 0" vs "any other label id" rather than per-class results.
        eval_metrics = {
            "eval_accuracy": tf.metrics.accuracy(label_ids, predicted_labels),
            "true_positives": tf.metrics.true_positives(label_ids, predicted_labels),
            "true_negatives": tf.metrics.true_negatives(label_ids, predicted_labels),
            "false_positives": tf.metrics.false_positives(label_ids, predicted_labels),
            "false_negatives": tf.metrics.false_negatives(label_ids, predicted_labels),
            "Precision": tf.metrics.precision(label_ids, predicted_labels),
            "Recall": tf.metrics.recall(label_ids, predicted_labels),
        }
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          eval_metric_ops=eval_metrics)

    # Return the actual model function in the closure
    return model_fn

In [None]:
# Fine-tuning hyperparameters.
# Batch size; handed to the Estimator as params["batch_size"].
BATCH_SIZE = 8                                        
# Learning rate forwarded to the BERT optimizer.
LEARNING_RATE = 2e-5
# Number of passes over the training features (used to derive num_train_steps).
NUM_TRAIN_EPOCHS = 3.0
# Fraction of num_train_steps used as num_warmup_steps.
WARMUP_PROPORTION = 0.1

In [None]:
# Derive the total number of training steps and the warm-up length
# from the dataset size, batch size and epoch count.
steps_per_epoch = len(train_features) / BATCH_SIZE
num_train_steps = int(steps_per_epoch * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(num_train_steps * WARMUP_PROPORTION)
print('The number of train steps', num_train_steps)

In [None]:
# How often (in steps) checkpoints and summaries are written to OUTPUT_DIR.
SAVE_CHECKPOINTS_STEPS = 600
SAVE_SUMMARY_STEPS = 600

run_config = tf.estimator.RunConfig(
    model_dir=OUTPUT_DIR,
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    save_summary_steps=SAVE_SUMMARY_STEPS,
)

In [None]:
# Bind the hyperparameters into a model_fn, then wrap it in an Estimator that
# checkpoints according to run_config.
model_fn = model_fn_builder(
    num_labels=len(label_list),
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
)

estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    config=run_config,
    params={"batch_size": BATCH_SIZE},
)

In [None]:
# Input functions feeding the precomputed features to the Estimator.
# Only the training one is built with is_training=True.
train_input_fn = run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=False,
)

val_input_fn = run_classifier.input_fn_builder(
    features=val_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=False,
    drop_remainder=False,
)

In [None]:
# Fine-tune up to num_train_steps and report the wall-clock duration.
print('Beginning Training!')
current_time = datetime.now()
train_estimator = estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
print("Training took time ", datetime.now() - current_time)

In [None]:
# Evaluate the fine-tuned model on the validation set.
# steps=None lets the evaluation run until val_input_fn is exhausted.
validation_estimator = estimator.evaluate(input_fn=val_input_fn, steps=None)

# To evaluate a specific checkpoint instead of the latest one, pass this
# additional argument to estimator.evaluate(...):
#checkpoint_path='/content/drive/My Drive/BERT checkpoints/model.ckpt-6000')

## Performance on testing data

In [None]:
def getPrediction(in_sentences):
  """Run the fine-tuned estimator on raw texts.

  Args:
    in_sentences: iterable of input texts.

  Returns:
    A list with one ``(sentence, prediction['probabilities'], prediction['labels'])``
    tuple per input text; an empty list for empty input. As produced by the
    model function, 'probabilities' holds the log-softmax output and 'labels'
    the argmax class index.
  """
  sentences = list(in_sentences)
  if not sentences:
    return []

  # Transforming the test data into BERT accepted form.
  # The label is only a placeholder at prediction time; label_list[0] is used
  # instead of a literal 0 so it is always a member of label_list.
  placeholder_label = label_list[0]
  input_examples = [run_classifier.InputExample(guid="", text_a=x, text_b=None, label=placeholder_label)
                    for x in sentences]

  # Creating input features for Test data
  input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)

  # Predicting the classes
  predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)
  predictions = estimator.predict(predict_input_fn, yield_single_examples=True)
  return [(sentence, prediction['probabilities'], prediction['labels'])
          for sentence, prediction in zip(sentences, predictions)]

In [None]:
# Predict on the held-out test texts.
pred_sentences = test_data[DATA_COLUMN_A].tolist()
predictions = getPrediction(pred_sentences)

In [None]:
# Keep only the predicted label (third element) of every prediction tuple.
predict_labels = [prediction[2] for prediction in predictions]
act_labels = []

In [None]:
from sklearn.metrics import classification_report

# Per-class report on the test set: true labels vs. predicted labels.
test_labels = list(test_data['Labels'])
print(classification_report(test_labels, predict_labels))

In [None]:
from sklearn.metrics import confusion_matrix
def heatconmat(y_true,y_pred):
    """Plot the confusion matrix of ``y_true`` vs ``y_pred`` as an annotated heat map.

    Args:
        y_true: iterable of ground-truth labels (a pandas Series or a plain list).
        y_pred: iterable of predicted labels, aligned with ``y_true``.
    """
    # Fix the label order explicitly so that matrix rows, columns and tick
    # labels always agree, even when a class occurs only in y_pred.
    labels = sorted(set(y_true) | set(y_pred))

    sns.set_context('talk')
    fig, ax = plt.subplots(figsize=(15,12))
    sns.heatmap(confusion_matrix(y_true, y_pred, labels=labels),
                annot=True,
                fmt='d',
                cbar=False,
                cmap='gist_earth_r',
                xticklabels=labels,
                yticklabels=labels,
                ax=ax)
    # confusion_matrix puts true labels on rows and predictions on columns.
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    ax.set_title('Confusion matrix')
    plt.show()

In [None]:
# Confusion matrix of the test set: true labels vs. the labels predicted above.
heatconmat(test_data['Labels'],predict_labels)

### **Intermediate results with parameter details to decide the threshold Sigma value for SVD by fine tuning on a dataset with documents from 5 labels:**


**1.** Thresh_sigma = 0.5



BERT Evaluation results:
* 'Precision': 0.9358491,
* 'Recall': 0.95384616,
* 'eval_accuracy': 0.8985849,
* 'false_negatives': 12.0,
* 'false_positives': 17.0,
* 'global_step': 636,
* 'loss': 0.35691655,
* 'true_negatives': 147.0,
* 'true_positives': 248.0.

-------------------------------------------------------------------------------------------------
**2.** Thresh_sigma = 0.6


BERT Evaluation results:
* 'Precision': 0.9266409,
* 'Recall': 0.9230769,
* 'eval_accuracy': 0.8419811,
* 'false_negatives': 20.0,
* 'false_positives': 19.0,
* 'global_step': 636,
* 'loss': 0.50120294,
* 'true_negatives': 145.0,
* 'true_positives': 240.0.

-------------------------------------------------------------------------------------------------
**3.** Thresh_sigma = 0.7



BERT Evaluation results:
* 'Precision': 0.9348659,
* 'Recall': 0.93846154,
* 'eval_accuracy': 0.8962264,
* 'false_negatives': 16.0,
* 'false_positives': 17.0,
* 'global_step': 600,
* 'loss': 0.3652602,
* 'true_negatives': 147.0,
* 'true_positives': 244.0.

-------------------------------------------------------------------------------------------------
**4.** Thresh_sigma = 0.8



BERT Evaluation results:
* 'Precision': 0.9455253,
* 'Recall': 0.9346154,
* 'eval_accuracy': 0.8985849,
* 'false_negatives': 17.0,
* 'false_positives': 14.0,
* 'global_step': 636,
* 'loss': 0.3684816,
* 'true_negatives': 150.0,
* 'true_positives': 243.0

-------------------------------------------------------------------------------------------------
**5.** Thresh_sigma = 0.9



BERT Evaluation results:
* 'Precision': 0.9157509,
* 'Recall': 0.96153843,
* 'eval_accuracy': 0.8773585,
* 'false_negatives': 10.0,
* 'false_positives': 23.0,
* 'global_step': 600,
* 'loss': 0.41267064,
* 'true_negatives': 141.0,
* 'true_positives': 250.0

-------------------------------------------------------------------------------------------------
**6.** Thresh_sigma = 1.0



BERT Evaluation results:
* Precision: 0.93846154,
* Recall: 0.93846154,
* eval_accuracy: 0.8915094,
* false_negatives: 16.0,
* false_positives: 16.0,
* global_step: 600,
* loss: 0.38839507,
* true_negatives: 148.0,
* true_positives: 244.0.