In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the working directory used by this
# notebook lives under "My Drive" on that mount.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

### Setting the working directory
The directory contains the dataset.

In [None]:
# All relative paths used later (vocab.txt, training.json) resolve against this folder.
# NOTE(review): the path is hard-coded to one Drive layout — adjust it when running elsewhere.
os.chdir("/content/drive/My Drive/Thesis")

In [None]:
!pip install -q tf-models-official==2.3.0

[K     |████████████████████████████████| 849kB 10.8MB/s 
[K     |████████████████████████████████| 102kB 7.1MB/s 
[K     |████████████████████████████████| 37.6MB 88kB/s 
[K     |████████████████████████████████| 1.2MB 35.9MB/s 
[K     |████████████████████████████████| 358kB 39.8MB/s 
[K     |████████████████████████████████| 174kB 43.0MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow_hub as hub
import official.nlp.bert.tokenization

## Initializing BERT

In [None]:
# Word-piece tokenizer built from the local vocab.txt. do_lower_case=False keeps
# the original casing, in line with the "cased" encoder checkpoint loaded in this notebook.
tkn = official.nlp.bert.tokenization.FullTokenizer(vocab_file="vocab.txt", 
                                                   do_lower_case=False)

In [None]:
# BERT encoder from TF Hub (whole-word-masking, cased, 24 layers, hidden size 1024).
# trainable=False: the encoder is only used as a fixed feature extractor here.
mybert = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_wwm_cased_L-24_H-1024_A-16/3",
    trainable=False)

## Preparing BERT input

### Loading the dataset

In [None]:
import json


In [None]:
# Load the annotated examples. Each record supplies a 'text' string and a
# 'label' sequence; the two are kept in parallel lists.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('training.json', encoding='utf-8') as f:
  records = json.load(f)

raw_text = [record['text'] for record in records]
raw_label = [record['label'] for record in records]

### Formatting data for BERT
NB: Tune the sequence length here.

In [None]:
# Number of word-piece positions per example (original note: "tune it to max 600").
# NOTE(review): BERT checkpoints usually cap positions at 512 — confirm before going above.
sequence_length = 100
etag = 3  # label written on padding positions
ftag = 2  # label written on every word piece after the first piece of a word


def bert_input(text_list, label_list, max_len=None):
  """Turn sentences and per-word labels into fixed-length BERT inputs.

  Each sentence is split on whitespace and paired word-by-word with its label
  sequence. Every word is broken into word pieces by the module-level
  tokenizer `tkn`: the first piece keeps the word's label and the remaining
  pieces get `ftag`. The sequence is wrapped in the ids 101 ... 102
  (presumably [CLS] / [SEP] in the vocabulary — confirm), both labelled 0,
  and then padded or truncated to exactly `max_len` positions. Padding uses
  id 0, mask 0 and label `etag`; a truncated sequence still ends with 102.

  Args:
    text_list: iterable of sentence strings.
    label_list: iterable of label sequences, one label per whitespace word.
    max_len: target length; defaults to the module-level `sequence_length`.

  Returns:
    (inputs, tag_list) where `inputs` is a dict with the keys
    'input_word_ids', 'input_mask' and 'input_type_ids', each a list holding
    one `tf.constant` per sentence, and `tag_list` is a list holding one
    plain list of labels per sentence.
  """
  if max_len is None:
    max_len = sequence_length

  input_id_list = []
  attention_mask_list = []
  input_type_list = []
  tag_list = []

  for sentence, stripe in zip(text_list, label_list):
    piece_ids = []
    piece_tags = []
    for word, tag in zip(sentence.split(), stripe):
      ids = tkn.convert_tokens_to_ids(tkn.tokenize(word))
      if not ids:
        # A word that yields no word pieces must not contribute a label,
        # otherwise labels and tokens drift out of alignment.
        continue
      piece_ids.extend(ids)
      piece_tags.append(tag)
      piece_tags.extend([ftag] * (len(ids) - 1))

    token_ids = [101] + piece_ids + [102]
    tags = [0] + piece_tags + [0]

    if len(token_ids) > max_len:
      # Keep the closing 102 marker (labelled 0) in the last position.
      token_ids = token_ids[:max_len - 1] + [102]
      tags = tags[:max_len - 1] + [0]

    used = len(token_ids)
    pad = max_len - used
    attention = [1] * used + [0] * pad
    token_ids = token_ids + [0] * pad
    tags = tags + [etag] * pad
    segment_ids = [0] * max_len  # single-segment input: every type id is 0

    input_id_list.append(tf.constant(token_ids))
    input_type_list.append(tf.constant(segment_ids))
    attention_mask_list.append(tf.constant(attention))
    tag_list.append(tags)

  dic = dict(
    input_word_ids=input_id_list,
    input_mask=attention_mask_list,
    input_type_ids=input_type_list)

  return dic, tag_list


In [None]:
# Encode the first 232 examples one at a time: build the BERT inputs, run the
# frozen encoder and keep the per-token embeddings, the labels and the mask.
train_x, train_y, train_f1 = [], [], []
for x, y in zip(raw_text[:232], raw_label[:232]):
  a, b = bert_input([x], [y])
  train_f1.append(a['input_mask'][0])  # attention mask, later passed to fit() as sample weights
  a = mybert(a)['sequence_output'][0]  # encoder output for the single example in this batch
  b = b[0]
  train_x.append(np.array(a))
  train_y.append(np.array(b))
  #break
train_x = np.array(train_x)
train_y = np.array(train_y)
train_f1 = np.array(train_f1)
# Last expression: shown as the cell output for a quick shape check.
train_x.shape, train_y.shape, train_f1.shape

((232, 100, 1024), (232, 100), (232, 100))

In [None]:
# Same per-example encoding as for the first half, applied to examples 232..463.
train_x2, train_y2, train_f2 = [], [], []
for x, y in zip(raw_text[232:464], raw_label[232:464]):
  a, b = bert_input([x], [y])
  train_f2.append(a['input_mask'][0])  # attention mask, later passed to fit() as sample weights
  a = mybert(a)['sequence_output'][0]  # encoder output for the single example in this batch
  b = b[0]
  train_x2.append(np.array(a))
  train_y2.append(np.array(b))
  #break
# Progress marker printed once the encoding loop has finished.
print('I am here')
train_x2 = np.array(train_x2)
train_y2 = np.array(train_y2)
train_f2 = np.array(train_f2)
# Last expression: shown as the cell output for a quick shape check.
train_x2.shape, train_y2.shape, train_f2.shape

I am here


((232, 100, 1024), (232, 100), (232, 100))

In [None]:
# Merge the two separately encoded halves of the training split.
# train_x / train_y are rebound from themselves, so the merge is guarded to keep
# this cell safe to re-run: the labels are 2-D before the merge and 3-D after
# it (a trailing axis is added for the sparse loss), so a 3-D train_y means the
# merge has already happened and must not be applied a second time.
if train_y.ndim == 2:
  train_x = np.concatenate((train_x, train_x2))
  train_y = np.expand_dims(np.concatenate((train_y, train_y2)), axis=-1)
# The mask halves keep their own names, so this line is idempotent as written.
train_f = np.concatenate((train_f1, train_f2))

In [None]:

# Validation split: every example from index 464 onwards, encoded the same way
# as the training examples.
valid_x, valid_y, valid_f = [], [], []
for x, y in zip(raw_text[464:], raw_label[464:]):
  a, b = bert_input([x], [y])
  valid_f.append(a['input_mask'][0])  # attention mask of the example
  a = mybert(a)['sequence_output'][0]  # encoder output for the single example in this batch
  b = b[0]
  valid_x.append(np.array(a))
  valid_y.append(np.array(b))
  
  #break
# Progress marker printed once the encoding loop has finished.
print('I am here')
valid_x = np.array(valid_x)
valid_y = np.array(valid_y)
valid_f = np.array(valid_f)
# Add a trailing axis so the labels have one entry per token position.
valid_y = np.expand_dims(valid_y, axis=-1)
# Last expression: shown as the cell output for a quick shape check.
valid_x.shape, valid_y.shape, valid_f.shape

I am here


((116, 100, 1024), (116, 100, 1), (116, 100))

## Model creation
## Tuning list


1. Number of epochs
2. Number of layers
3. Neuron units per layer
4. Batch size
5. Sequence length



In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [None]:
best = 0
# Width of the model's softmax output.
# NOTE(review): the recorded run prints 4 score columns, which suggests this was
# 4 when the notebook was last executed — confirm the intended value.
num_label = 5


def see(ty, tp, pad_label=None):
  """Print a classification report and confusion matrix, ignoring padding.

  Args:
    ty: array of integer labels; flattened to one label per token position.
    tp: array of per-class scores whose last axis is the class axis; flattened
      to one row per token position and reduced with argmax.
    pad_label: label that marks padding positions, which are excluded from
      both arrays before scoring. Defaults to the module-level `etag`, the
      value `bert_input` writes on padded positions. (Previously this was
      derived as `num_label - 1`, which stops matching `etag` as soon as
      `num_label` changes, so padding was silently scored.)
  """
  if pad_label is None:
    pad_label = etag

  ty = np.asarray(ty).reshape(-1)
  tp = np.asarray(tp)
  # Use the actual score width so the reshape follows the model output.
  tp = tp.reshape(-1, tp.shape[-1])
  print(tp.shape, ty.shape)

  tp = tp.argmax(axis=-1)
  keep = ty != pad_label
  tp = tp[keep]
  ty = ty[keep]
  print(np.shape(tp), ty.shape)

  print('\nClassification Report \n\n')
  f = classification_report(ty, tp)
  print(f)
  print('\nPrint Confusion Matrix \n\n')
  print(confusion_matrix(ty, tp))
  

In [None]:
class mcb(tf.keras.callbacks.Callback):
  """Keras callback that prints evaluation reports at the end of every epoch."""

  def on_epoch_end(self, epoch, log=None):
    """Predict on the training and validation arrays and report each via `see`.

    Reads the notebook-level arrays train_x / train_y and valid_x / valid_y;
    the training split is reported first, then the validation split.
    """
    print('here')
    for features, labels in ((train_x, train_y), (valid_x, valid_y)):
      see(labels, self.model.predict(features))

In [None]:
# Token-level classifier stacked on the precomputed BERT embeddings:
# two sequence-returning LSTMs followed by three dense layers, the last of
# which emits a softmax over num_label classes for every token position.
model = tf.keras.Sequential()
model.add(tf.keras.layers.LSTM(units=1024, return_sequences=True))
model.add(tf.keras.layers.LSTM(units=1024, return_sequences=True))
model.add(
    tf.keras.layers.TimeDistributed(
        tf.keras.layers.Dense(1024, activation='relu')))
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dense(num_label, activation='softmax'))

In [None]:
# Sparse categorical cross-entropy: the targets are integer label ids rather
# than one-hot vectors.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train one example per step. The attention mask (train_f) is supplied as the
# sample weights, and the mcb callback prints per-epoch reports for both splits.
model.fit(
    x=train_x,
    y=train_y,
    batch_size=1,
    epochs=15,
    sample_weight=train_f,
    validation_data=(valid_x, valid_y),
    callbacks=[mcb()],
)

Epoch 1/15




here
(46400, 4) (46400,)
(26047,) (26047,)

Classification Report 


              precision    recall  f1-score   support

           0       0.98      0.97      0.97     20601
           1       0.84      0.85      0.84      3212
           2       0.99      0.99      0.99      2234

    accuracy                           0.96     26047
   macro avg       0.94      0.94      0.94     26047
weighted avg       0.96      0.96      0.96     26047


Print Confusion Matrix 


[[20074   515    12]
 [  486  2722     4]
 [   18     4  2212]]
(11600, 4) (11600,)
(7038,) (7038,)

Classification Report 


              precision    recall  f1-score   support

           0       0.97      0.97      0.97      5465
           1       0.84      0.81      0.82       938
           2       0.98      0.99      0.99       635

    accuracy                           0.95      7038
   macro avg       0.93      0.92      0.93      7038
weighted avg       0.95      0.95      0.95      7038


Print Confusion

<tensorflow.python.keras.callbacks.History at 0x7fa0cdd11a90>

In [None]:
# Scratch check: a small 2 x 3 x 2 array for trying out reshape on a 3-D input.
a = np.array([[[1, 2], [3, 4], [5, 6]], [[1, 2], [3, 4], [5, 6]]])

In [None]:
# Scratch check: shape of the toy array before reshaping.
a.shape

(2, 3, 2)

In [None]:
# Scratch check: reshape(-1, k) merges the leading axes and keeps the last one,
# the same call pattern `see` applies to the prediction scores.
a.reshape(-1, 2).shape

(6, 2)

In [None]:
import numpy as np

In [None]:
# Scratch check: toy vector containing the value 4 several times.
ar = np.array([4, 2, 3, 4, 5, 6, 7, 4, 79, 10])

In [None]:
# Scratch check: np.where + np.delete removes every occurrence of a value,
# the same pattern `see` uses to drop padding positions.
np.delete(ar, np.where(ar == 4)[0])

array([ 2,  3,  5,  6,  7, 79, 10])