### Installing the libraries

In [1]:
# A dependency of the preprocessing for BERT inputs
!pip install -q -U tensorflow-text
!pip install -q tf-models-official

import spacy
nlp = spacy.load('en_core_web_sm')

import pandas as pd
import numpy as np
import scipy
from scipy import spatial
import os
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization  # to create AdamW optimizer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

[K     |████████████████████████████████| 4.4 MB 5.3 MB/s 
[K     |████████████████████████████████| 1.8 MB 5.4 MB/s 
[K     |████████████████████████████████| 90 kB 8.4 MB/s 
[K     |████████████████████████████████| 99 kB 4.9 MB/s 
[K     |████████████████████████████████| 213 kB 50.3 MB/s 
[K     |████████████████████████████████| 37.1 MB 52 kB/s 
[K     |████████████████████████████████| 1.2 MB 38.0 MB/s 
[K     |████████████████████████████████| 43 kB 2.0 MB/s 
[K     |████████████████████████████████| 596 kB 36.7 MB/s 
[K     |████████████████████████████████| 352 kB 40.6 MB/s 
[K     |████████████████████████████████| 1.1 MB 36.4 MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone


### Loading the Training Data

In [2]:
df = pd.read_excel("Training_data.xlsx")
X_train, X_test, y_train, y_test = train_test_split(df['title'],df['label'], stratify=df['label'],test_size=0.2)

In [3]:
#BERT
bert_preprocess = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3", name='Preprocessing')
bert_encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4",trainable=True,name='BERT_encoder')

# Bert layers
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Neural network layers
l = tf.keras.layers.Dropout(0.1, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation=None, name="classifier")(l)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs = [l])
print("Model Summary:")
print(model.summary())

#Setting Parameters for training
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()
epochs = 10
steps_per_epoch = 3
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)
init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                            num_train_steps=num_train_steps,
                                            num_warmup_steps=num_warmup_steps,
                                            optimizer_type='adamw')
  
model.compile(optimizer= optimizer,
                loss= loss,
                metrics=metrics)

#Model training
model.fit(X_train, y_train, epochs = epochs)

#Model testing
scores_test = model.predict(X_test)
y_pred = []

#Taking cut-off as 0.7
for each in scores_test:
    if(each>0.7):
      y_pred.append(1)
    else:
      y_pred.append(0)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)

Model Summary:
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text (InputLayer)               [(None,)]            0                                            
__________________________________________________________________________________________________
Preprocessing (KerasLayer)      {'input_mask': (None 0           text[0][0]                       
__________________________________________________________________________________________________
BERT_encoder (KerasLayer)       {'encoder_outputs':  109482241   Preprocessing[0][0]              
                                                                 Preprocessing[0][1]              
                                                                 Preprocessing[0][2]              
_______________________________________________________________________________

### Validation 

In [4]:
df_val = pd.read_excel("Validation_data.xlsx")
X_val = df_val['title']
y_val = df_val['label']


#Model validation
scores_val = model.predict(X_val)


y_pred = []
for each in scores_val:
    if(each>0.7):
      y_pred.append(1)
    else:
      y_pred.append(0)

cm = confusion_matrix(y_val, y_pred)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[46 18]
 [ 0 13]]
