## Bert as embedding layer
### We used Bert tokens for training the model
### Now we will use bert as Embedding layer to train model with same text sentiment problem

In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
import re
from bs4 import BeautifulSoup
import random
import pandas as pd

In [2]:
!pip install bert-for-tf2
!pip install sentencepiece

Collecting bert-for-tf2
[?25l  Downloading https://files.pythonhosted.org/packages/87/df/ab6d927d6162657f30eb0ae3c534c723c28c191a9caf6ee68ec935df3d0b/bert-for-tf2-0.14.5.tar.gz (40kB)
[K     |████████                        | 10kB 21.2MB/s eta 0:00:01[K     |████████████████                | 20kB 2.8MB/s eta 0:00:01[K     |████████████████████████▏       | 30kB 3.8MB/s eta 0:00:01[K     |████████████████████████████████| 40kB 2.6MB/s 
[?25hCollecting py-params>=0.9.6
  Downloading https://files.pythonhosted.org/packages/a4/bf/c1c70d5315a8677310ea10a41cfc41c5970d9b37c31f9c90d4ab98021fd1/py-params-0.9.7.tar.gz
Collecting params-flow>=0.8.0
  Downloading https://files.pythonhosted.org/packages/a9/95/ff49f5ebd501f142a6f0aaf42bcfd1c192dc54909d1d9eb84ab031d46056/params-flow-0.8.2.tar.gz
Building wheels for collected packages: bert-for-tf2, py-params, params-flow
  Building wheel for bert-for-tf2 (setup.py) ... [?25l[?25hdone
  Created wheel for bert-for-tf2: filename=bert_for_tf2

In [3]:
## Mount gdrive
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


## Data Preprocessing
### Load training data and preprocess it, by cleaning the tweets and aremoving un-nucessary spaces.

In [4]:
column_names=["sentiment", "id", "date", "query", "user", "text"]
train_data= pd.read_csv('/content/drive/My Drive/NLP/Projects/BERT/Sentimental Data/train.csv',
                        engine='python', encoding='latin1', header=None, names= column_names)



In [5]:
train_data.tail()

Unnamed: 0,sentiment,id,date,query,user,text
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [6]:
## Remove un wanted columns
train_data.drop(["id", "date", "query", "user"], axis=1, inplace=True)

In [7]:
train_data.tail()

Unnamed: 0,sentiment,text
1599995,4,Just woke up. Having no school is the best fee...
1599996,4,TheWDB.com - Very cool to hear old Walt interv...
1599997,4,Are you ready for your MoJo Makeover? Ask me f...
1599998,4,Happy 38th Birthday to my boo of alll time!!! ...
1599999,4,happy #charitytuesday @theNSPCC @SparksCharity...


In [8]:
### Lets refine sentiment values
train_data.sentiment.value_counts()

4    800000
0    800000
Name: sentiment, dtype: int64

In [9]:
### Lets convert value 4 to 1 for proper understanding.
train_data.sentiment= train_data['sentiment'].apply(lambda label: 1 if label==4 else label)

In [10]:
train_data.sentiment.value_counts()

1    800000
0    800000
Name: sentiment, dtype: int64

In [11]:
train_data.tail(20)

Unnamed: 0,sentiment,text
1599980,1,@myheartandmind jo jen by nemuselo zrovna tÃ© ...
1599981,1,Another Commenting Contest! [;: Yay!!! http:/...
1599982,1,@thrillmesoon i figured out how to see my twee...
1599983,1,"@oxhot theri tomorrow, drinking coffee, talkin..."
1599984,1,You heard it here first -- We're having a girl...
1599985,1,"if ur the lead singer in a band, beware fallin..."
1599986,1,@tarayqueen too much ads on my blog.
1599987,1,@La_r_a NEVEER I think that you both will get...
1599988,1,@Roy_Everitt ha- good job. that's right - we g...
1599989,1,@Ms_Hip_Hop im glad ur doing well


In [12]:
## Lets do some preprocessing to the tweets
def preprocess_tweets(tweet):
  ##Convert xml values to text
  tweet= BeautifulSoup(tweet, "lxml").get_text()

  ##Remove @ sybmols before tweets
  tweet= re.sub(r"@[A-Za-z0-9]+", ' ', tweet)

  ##Remove Urls
  tweet= re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)

  ##Remove un-wanted symbols
  tweet= re.sub(r"[^A-Za-z0-9.?!]+", ' ', tweet)

  return tweet

In [13]:
preprocess_tweets(u"https://www.google.com Hello guys! @Morningwalk")

' Hello guys! '

In [14]:
### Apply the encoding to all the tweets
train_data.text= train_data.text.apply(lambda text:preprocess_tweets(text))

In [15]:
train_data.tail(6)

Unnamed: 0,sentiment,text
1599994,1,Forster Yeah that does work better than just ...
1599995,1,Just woke up. Having no school is the best fee...
1599996,1,TheWDB.com Very cool to hear old Walt intervie...
1599997,1,Are you ready for your MoJo Makeover? Ask me f...
1599998,1,Happy 38th Birthday to my boo of alll time!!! ...
1599999,1,happy charitytuesday


# Tokenization
## When bert is used a embedding layer we hve to follow below steps:
### unlike Bert as tokenizer which has only one input for Embeding layer
### Bert layer as embedding need 3 inputs
#### 1. Id: it consists of tokens of sentence along with [CLS] in start and [SEP] at the end for single sentence. for double sentence, for each end of sentence you have to have [SEP] at the end
#### 2. masks: WE will pass the value 0 if there is any masked value in the tokens. Basically Bert contains 15% of its tokens as masked. To identify the masked values we will search for ['PAD'] and make it as 0 and for remining as 1 in the list
#### 3. Segmentation: Segmentation of sentence. We will divide sentences with seq id's, this will be done by searching for [SEP] tokens and incrementing the values

In [16]:
## Impor tenosorflow hub and bert
import tensorflow_hub as hub
import bert

In [17]:
## Load model
FullTokenizer=bert.bert_tokenization.FullTokenizer
bert_layer= hub.KerasLayer('/content/drive/My Drive/NLP/Projects/BERT/Sentimental Data/BERT_pretrain/bert_en_uncased_L-12_H-768_A-12_2',
                           trainable=False)
##Load voacb file
vocab_file= bert_layer.resolved_object.vocab_file.asset_path.numpy()

## Load lower case boolean
lower_case= bert_layer.resolved_object.do_lower_case.numpy()
token= FullTokenizer(vocab_file, lower_case)


In [18]:
len(token.vocab)

30522

In [19]:
## Create funtion to encode the sentence to tokens
def encode_sentence(sent):
  encoded_sent= ["[CLS]"] + token.tokenize(sent) + ["[SEP]"]
  return encoded_sent

In [20]:
encode_sentence(u"Hello i am vikas")

['[CLS]', 'hello', 'i', 'am', 'vi', '##kas', '[SEP]']

In [21]:
token.convert_tokens_to_ids(token.tokenize(u"Hello i am vikas"))

[7592, 1045, 2572, 6819, 13716]

In [22]:
### now comed the inputs for embed layer of bert
## Get id's
def get_id(token):
  return token.convert_tokens_to_ids(token)

## assign id's to masked values
def get_mask(token):
  ### We will assign 0 if token has mask values else 1
  return np.char.not_equal(token, '[PAD]').astype(int)

## token segmentation
def get_segmentid(token):
  seg_id=[]
  segment_id=0
  for id in token:
    seg_id.append(segment_id)
    if id== '[SEP]':
      segment_id= 1- segment_id
  
  return seg_id

## Create Dataset
### for dtaset we have to pass list of 3 values instead of only token id's to the dataset

In [23]:
train_data.text

0           Awww that s a bummer. You shoulda got David C...
1          is upset that he can t update his Facebook by ...
2           I dived many times for the ball. Managed to s...
3            my whole body feels itchy and like its on fire 
4           no it s not behaving at all. i m mad. why am ...
                                 ...                        
1599995    Just woke up. Having no school is the best fee...
1599996    TheWDB.com Very cool to hear old Walt intervie...
1599997    Are you ready for your MoJo Makeover? Ask me f...
1599998    Happy 38th Birthday to my boo of alll time!!! ...
1599999                                happy charitytuesday 
Name: text, Length: 1600000, dtype: object

In [24]:
train_data.text= train_data.text.apply(lambda text: encode_sentence(text))

In [25]:
train_data.head()

Unnamed: 0,sentiment,text
0,0,"[[CLS], aw, ##w, ##w, that, s, a, bum, ##mer, ..."
1,0,"[[CLS], is, upset, that, he, can, t, update, h..."
2,0,"[[CLS], i, dive, ##d, many, times, for, the, b..."
3,0,"[[CLS], my, whole, body, feels, it, ##chy, and..."
4,0,"[[CLS], no, it, s, not, be, ##ha, ##ving, at, ..."


In [26]:
data_list= train_data.text.to_list()
data_label= train_data.sentiment.to_list()

In [27]:
#data_list[256]

In [28]:
## Now create dataset
data_with_len= [[sent, data_label[i], len(sent)] for i, sent in enumerate(data_list)]

In [29]:
for  sent in data_with_len[1355556]:
  #x= [sent, data_label[i], len(sent)]
  print(token.convert_tokens_to_ids(sent))
  break
#print(x)

[101, 6302, 12900, 3504, 2061, 2402, 1012, 2061, 10140, 1012, 1045, 2293, 2023, 102]


In [30]:
#data_with_len[1542584:1542586]

In [31]:
import random
random.shuffle(data_with_len)
data_with_len.sort(key= lambda x:x[2])

In [32]:
data_with_len[125698]

[['[CLS]', 'here', 's', 'my', 'newest', 'toy', '[SEP]'], 1, 7]

In [33]:
sorted_all= [([token.convert_tokens_to_ids(data[0]),
               get_mask(data[0]),
               get_segmentid(data[0])],data[1] ) 
            for data in data_with_len if data[2] > 7 ]

In [34]:
sorted_all[25]

([[101, 8996, 2002, 2018, 2062, 10474, 8771, 102],
  array([1, 1, 1, 1, 1, 1, 1, 1]),
  [0, 0, 0, 0, 0, 0, 0, 0]],
 0)

In [35]:
### Create dataset
train_dataset= tf.data.Dataset.from_generator(lambda : sorted_all, output_types= (tf.int32, tf.int32))

In [36]:
batch_size=32
train_dataset= train_dataset.padded_batch(batch_size=batch_size, padded_shapes=((3, None), ()),
                                          padding_values=(0,0))

In [37]:
for i in train_dataset.take(1):
  print(i)

(<tf.Tensor: shape=(32, 3, 8), dtype=int32, numpy=
array([[[  101,  2160,  6023,  1999,  1996,  3028,  3948,   102],
        [    1,     1,     1,     1,     1,     1,     1,     1],
        [    0,     0,     0,     0,     0,     0,     0,     0]],

       [[  101,  1045,  2481,  2102,  2202,  1037, 18996,   102],
        [    1,     1,     1,     1,     1,     1,     1,     1],
        [    0,     0,     0,     0,     0,     0,     0,     0]],

       [[  101,  1045,  2215,  2000,  2175,  2188,  1012,   102],
        [    1,     1,     1,     1,     1,     1,     1,     1],
        [    0,     0,     0,     0,     0,     0,     0,     0]],

       [[  101,  2054,  2051,  1029,  4751,  3531,  1012,   102],
        [    1,     1,     1,     1,     1,     1,     1,     1],
        [    0,     0,     0,     0,     0,     0,     0,     0]],

       [[  101,  8300,  2205,  2220,  2005,  1037,  4465,   102],
        [    1,     1,     1,     1,     1,     1,     1,     1],
        [    0,  

In [38]:
batch_size=32
num_batches= len(sorted_all)//batch_size
num_test_batches= num_batches//10
train_dataset.shuffle(num_batches)


<ShuffleDataset shapes: ((None, 3, None), (None,)), types: (tf.int32, tf.int32)>

In [39]:
test_dataset= train_dataset.take(num_test_batches)
train_dataset= train_dataset.skip(num_test_batches)

In [40]:
## iter the model
train_dataset

<SkipDataset shapes: ((None, 3, None), (None,)), types: (tf.int32, tf.int32)>

In [41]:
for i in train_dataset.take(1):
  print(i)

(<tf.Tensor: shape=(32, 3, 10), dtype=int32, numpy=
array([[[  101,  3398, 22794,  3022,  1037,  2204,  2518,  8840,  2140,
           102],
        [    1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [    0,     0,     0,     0,     0,     0,     0,     0,     0,
             0]],

       [[  101,  4149,  1996,  3308,  6046,  2064,  1056, 11852,  3892,
           102],
        [    1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [    0,     0,     0,     0,     0,     0,     0,     0,     0,
             0]],

       [[  101,  1999,  6734,  2007, 22246,  1998,  4907,  5983,  4596,
           102],
        [    1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [    0,     0,     0,     0,     0,     0,     0,     0,     0,
             0]],

       [[  101,  3407,  5798,  2000,  2115,  2684,   999,  4569,  2335,
           102],
        [    1,     1,     1,     1,     1,     1,  

In [42]:
next(iter(train_dataset))

(<tf.Tensor: shape=(32, 3, 10), dtype=int32, numpy=
 array([[[  101,  3398, 22794,  3022,  1037,  2204,  2518,  8840,  2140,
            102],
         [    1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [    0,     0,     0,     0,     0,     0,     0,     0,     0,
              0]],
 
        [[  101,  4149,  1996,  3308,  6046,  2064,  1056, 11852,  3892,
            102],
         [    1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [    0,     0,     0,     0,     0,     0,     0,     0,     0,
              0]],
 
        [[  101,  1999,  6734,  2007, 22246,  1998,  4907,  5983,  4596,
            102],
         [    1,     1,     1,     1,     1,     1,     1,     1,     1,
              1],
         [    0,     0,     0,     0,     0,     0,     0,     0,     0,
              0]],
 
        [[  101,  3407,  5798,  2000,  2115,  2684,   999,  4569,  2335,
            102],
         [    1,     1,     1

## Model Training
### This training is same like creating a layers in a class, except we create a embed layer with bert first and set trainable= False

In [53]:
class BertEmbed(tf.keras.Model):
  def __init__(self, num_filters=50, num_units=512,
                dropout_rate=0.1, trainable=False, name='BertEmbed'):    

    super(BertEmbed, self).__init__()

    ##Bert Embed Layer
    self.bert_layer= hub.KerasLayer('/content/drive/My Drive/NLP/Projects/BERT/Sentimental Data/BERT_pretrain/bert_en_uncased_L-12_H-768_A-12_2',
                           trainable=False)

    ## create n-grams
    self.bigram= keras.layers.Conv1D(filters= num_filters, kernel_size=2, padding='VALID', activation=tf.nn.relu)

    ##Trigram
    self.trigram= keras.layers.Conv1D(filters= num_filters, kernel_size=3, padding= 'valid', activation= tf.nn.relu)

    ## Four gram
    self.fourgram= keras.layers.Conv1D(filters= num_filters, kernel_size=4, padding='valid', activation= tf.nn.relu)

    ## GlobalMaxpooling
    self.maxpool= keras.layers.GlobalMaxPool1D()

    ## Dense layer
    self.dense= keras.layers.Dense(units= num_units, activation=tf.nn.relu)

    ##Dropout
    self.dropout= keras.layers.Dropout(rate=dropout_rate)

    ##output
    self.output_layer= keras.layers.Dense(units=1, activation=tf.nn.sigmoid)

  def bert_embed_layer(self, all_tokens):
    ##Bert Embed layer
    ### Shape of the batch is [32,3,number]
    ### We have to consider the axis=1 values as embed layers
    _,embed_output= self.bert_layer([all_tokens[:, 0, :],
                                     all_tokens[:, 1, :],
                                     all_tokens[:, 2, :]])

    return embed_output
  
  def call(self, inputs, training):
    x= self.bert_embed_layer(inputs)

    x_1= self.bigram(x)
    x_1= self.maxpool(x_1)
    x_2= self.trigram(x)
    x_2= self.maxpool(x_2)
    x_3= self.fourgram(x)
    x_3= self.maxpool(x_3)

    ##Concat all the n-grams
    concat= tf.concat([x_1, x_2, x_3], axis=-1)
    x= self.dense(concat)

    x= self.dropout(x, training)

    output= self.output_layer(x)

    return output

In [54]:
num_filters= 100
num_units= 256
dropout_rate=0.2

In [55]:
Embed= BertEmbed(num_filters, num_units, dropout_rate)

In [56]:
Embed.compile(loss= tf.keras.losses.BinaryCrossentropy(from_logits=True), optimizer= tf.keras.optimizers.Adam(),
              metrics=['accuracy'])

### Create Checkpoints

In [57]:
checkpoint_dir= '/content/drive/My Drive/NLP/Projects/BERT/Sentimental Data/bert_embed_checkpoints/ckpt_bert_tok'
checkpoint= tf.train.Checkpoint(Embed= Embed)

##Checkpoint Manager
checkpoint_manager= tf.train.CheckpointManager(checkpoint, checkpoint_dir, max_to_keep=1)


In [58]:
##Call back function for checkpoint storing
class MyCallBack(tf.keras.callbacks.Callback):

  def on_epoch_end(self, epoch, logs=None):
    checkpoint_manager.save()
    print("Checkpoint saved at {}.".format(checkpoint_dir))

## Model Training

In [60]:
Embed.fit(train_dataset, epochs=2,  callbacks=[MyCallBack()])

Epoch 1/2
  40564/Unknown - 2220s 55ms/step - loss: 0.6096 - accuracy: 0.7735Checkpoint saved at /content/drive/My Drive/NLP/Projects/BERT/Sentimental Data/bert_embed_checkpoints/ckpt_bert_tok.
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f8467fa9cc0>

In [None]:
/