# Import and preprocessing Data 

we will Make sentiment analysis model for imdb reviews data that can be found in tensorflow_datasets

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from tensorflow.keras import layers
import string
from nltk.tokenize import sent_tokenize , word_tokenize

  from .autonotebook import tqdm as notebook_tqdm


First we Download the data from the library

In [2]:
imdb_train, ds_info = tfds.load(name="imdb_reviews",split="train",with_info=True, as_supervised=True)
imdb_test = tfds.load(name="imdb_reviews", split="test", as_supervised=True)

we then try to encode and tokenize data so we can input it to the model

In [4]:
train_words = np.array([r.numpy().decode('utf-8') for r, l in imdb_train])
train_lbl = [l for r, l in imdb_train]
train_lbl = np.array(train_lbl)

In [5]:
test_words = np.array([r.numpy().decode('utf-8') for r, l in imdb_test])
test_lbl = [l for r, l in imdb_test]
test_lbl = np.array(test_lbl)

In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_words)
tokenized_train = tokenizer.texts_to_sequences(train_words)
sentences = pad_sequences(tokenized_train,padding="post", maxlen=150)
word2idx = tokenizer.word_index
idx2word = {value:key for key,value in word2idx.items()}

In [7]:
#tokenize and padding test data
tokenized_test = tokenizer.texts_to_sequences(test_words)
sentences_test = pad_sequences(tokenized_test,padding="post", maxlen=150)

In [8]:
train_data = tf.data.Dataset.from_tensor_slices((sentences,train_lbl))
test_data  = tf.data.Dataset.from_tensor_slices((sentences_test,test_lbl))

# Model without pretrained Embedding Layer

In [10]:
# Length of the vocabulary in chars
vocab_size = len(word2idx)
# Number of RNN units
rnn_units = 64
#embedding diminsion
embedding_dim = 64
#batch size
BATCH_SIZE=100

In [9]:
def build_model_bilstm(vocab_size, embedding_dim, rnn_units, batch_size):
    model = tf.keras.Sequential([
    layers.Embedding(vocab_size, embedding_dim),
    layers.Bidirectional(layers.LSTM(rnn_units, return_sequences=True,dropout=0.5)),
    layers.Bidirectional(layers.LSTM(rnn_units, dropout=0.25)),
    tf.keras.layers.Dense(1, activation='sigmoid')])
    return model

In [11]:
bilstm = build_model_bilstm(vocab_size = vocab_size,embedding_dim=embedding_dim,rnn_units=rnn_units,batch_size=BATCH_SIZE)
bilstm.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy', 'Precision', 'Recall'])
bilstm.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          5669248   
_________________________________________________________________
bidirectional (Bidirectional (None, None, 128)         66048     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 5,834,241
Trainable params: 5,834,241
Non-trainable params: 0
_________________________________________________________________


In [12]:
encoded_train_batched = train_data.batch(BATCH_SIZE).prefetch(100)
bilstm.fit(encoded_train_batched, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2c1d7f6d7f0>

In [20]:
bilstm.evaluate(test_data.batch(BATCH_SIZE))



[0.7266671061515808,
 0.8427199721336365,
 0.8409742116928101,
 0.8452799916267395]

# Importing Glove Embedding Weights

In [13]:
dict_w2v = {}
with open('D:\TensorFlow\Trying\glove.6B/glove.6B.50d.txt', "r",encoding="utf8") as file:
    for line in file:
        tokens = line.split()
        word = tokens[0]
        vector = np.array(tokens[1:], dtype=np.float32)
        if vector.shape[0] == 50:
            dict_w2v[word] = vector
        else:
            print("There was an issue with " + word)
        # let's check the vocabulary size
print("Dictionary Size: ", len(dict_w2v))

Dictionary Size:  400000


In [14]:
embedding_dim = 50
embedding_matrix = np.zeros((len(word2idx), embedding_dim))

In [15]:
unk_cnt = 0
unk_set = set()
for word in word2idx.keys():
    embedding_vector = dict_w2v.get(word)
    if embedding_vector is not None:
        tkn_id = word2idx[word]
        embedding_matrix[tkn_id-1] = embedding_vector
    else:
        unk_cnt += 1
        unk_set.add(word)
# Print how many weren't found
print("Total unknown words: ", unk_cnt)

Total unknown words:  28423


# Building Model With PreTrained Embedding Layer 

In [16]:
def build_model_bilstm(vocab_size, embedding_dim,rnn_units, batch_size, train_emb=False):
    model = tf.keras.Sequential([
    layers.Embedding(vocab_size, embedding_dim, mask_zero=True,weights=[embedding_matrix], trainable=train_emb),
    layers.Bidirectional(layers.LSTM(rnn_units, return_sequences=True,dropout=0.5)),
    layers.Bidirectional(layers.LSTM(rnn_units, dropout=0.25)),
    layers.Dense(1, activation='sigmoid')
    ])
    return model

# Feature extraction as we set the weights of embedding Fixed

In [17]:
model_fe = build_model_bilstm(vocab_size = vocab_size,embedding_dim=embedding_dim,rnn_units=rnn_units,batch_size=BATCH_SIZE)
model_fe.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 50)          4429100   
_________________________________________________________________
bidirectional_2 (Bidirection (None, None, 128)         58880     
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 4,586,925
Trainable params: 157,825
Non-trainable params: 4,429,100
_________________________________________________________________


In [18]:
model_fe.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy', 'Precision', 'Recall'])

model_fe.fit(encoded_train_batched, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2c1e0eae460>

In [24]:
model_fe.evaluate(test_data.batch(BATCH_SIZE))



[0.5681567788124084,
 0.6933199763298035,
 0.7870293259620667,
 0.5300800204277039]

we can see that having fixed embedding layer make our model worse

# Fine-tuning model as we fine tune Embedding layer weights

In [21]:
model_ft = build_model_bilstm(vocab_size=vocab_size,embedding_dim=embedding_dim,rnn_units=rnn_units,batch_size=BATCH_SIZE,train_emb=True)
model_ft.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 50)          4429100   
_________________________________________________________________
bidirectional_4 (Bidirection (None, None, 128)         58880     
_________________________________________________________________
bidirectional_5 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 129       
Total params: 4,586,925
Trainable params: 4,586,925
Non-trainable params: 0
_________________________________________________________________


In [22]:
model_ft.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy', 'Precision', 'Recall'])
model_ft.fit(encoded_train_batched, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x2c2b2ee7eb0>

In [23]:
model_ft.evaluate(test_data.batch(BATCH_SIZE))



[0.5538679957389832,
 0.8525199890136719,
 0.8373268246650696,
 0.8750399947166443]

as we can see their are great improvment when we fine tune the embedding layer but the model tends to overfit we can solve this by adding more dropout layers between embeding layer and lstm layer

For now we had a fixed vector for each word that is known as Static Word Embedding

Next we will look at a Dynamic word Embedding in which we will use BERT Transformer

# Using Bert

In [17]:
from transformers import BertTokenizer

In [18]:
bert_name = 'bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(bert_name,
                                          add_special_tokens=True,
                                          do_lower_case=False,
                                          max_length=150,
                                          pad_to_max_length=True)

Downloading: 100%|██████████| 208k/208k [00:00<00:00, 511kB/s] 
Downloading: 100%|██████████| 29.0/29.0 [00:00<00:00, 14.5kB/s]
Downloading: 100%|██████████| 570/570 [00:00<00:00, 569kB/s]


In [19]:
def bert_encoder(review):
    txt = review.numpy().decode('utf-8')
    
    encoded = tokenizer.encode_plus(txt, add_special_tokens=True,
    max_length=150,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_token_type_ids=True)
    
    return encoded['input_ids'], encoded['token_type_ids'], encoded['attention_mask']

In [20]:
bert_train = [bert_encoder(r) for r, l in imdb_train]
bert_lbl = [l for r, l in imdb_train]
bert_train = np.array(bert_train)
bert_lbl = tf.keras.utils.to_categorical(bert_lbl, num_classes=2)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [23]:
# create training and validation splits
from sklearn.model_selection import train_test_split
x_train, x_val, y_train, y_val = train_test_split(bert_train,bert_lbl,test_size=0.2,random_state=42)
print(x_train.shape, y_train.shape)

(20000, 3, 150) (20000, 2)


In [24]:
tr_reviews, tr_segments, tr_masks = np.split(x_train, 3, axis=1)
val_reviews, val_segments, val_masks = np.split(x_val, 3, axis=1)

tr_reviews = tr_reviews.squeeze()
tr_segments = tr_segments.squeeze()
tr_masks = tr_masks.squeeze()

val_reviews = val_reviews.squeeze()
val_segments = val_segments.squeeze()
val_masks = val_masks.squeeze()

In [26]:
def example_to_features(input_ids,attention_masks,token_type_ids,y):
    return {"input_ids": input_ids,"attention_mask": attention_masks,
            "token_type_ids": token_type_ids}, y

In [37]:
train_ds = tf.data.Dataset.from_tensor_slices((tr_reviews,tr_masks, tr_segments, y_train)). \
                                                map(example_to_features).shuffle(100).batch(8)

In [38]:
valid_ds = tf.data.Dataset.from_tensor_slices((val_reviews,val_masks, val_segments, y_val)).\
                                                map(example_to_features).shuffle(100).batch(8)

In [33]:
from transformers import TFBertForSequenceClassification
bert_model = TFBertForSequenceClassification.from_pretrained(bert_name)

Downloading: 100%|██████████| 502M/502M [02:30<00:00, 3.50MB/s] 
All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [39]:
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
bert_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
bert_model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  108310272 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 108,311,810
Trainable params: 108,311,810
Non-trainable params: 0
_________________________________________________________________


In [40]:
print("Fine-tuning BERT on IMDB")
bert_history = bert_model.fit(train_ds, epochs=3,validation_data=valid_ds)

Fine-tuning BERT on IMDB
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [41]:
# prep data for testing
bert_test = [bert_encoder(r) for r,l in imdb_test]
bert_tst_lbl = [l for r, l in imdb_test]
bert_test2 = np.array(bert_test)
bert_tst_lbl2 = tf.keras.utils.to_categorical (bert_tst_lbl,num_classes=2)

ts_reviews, ts_segments, ts_masks = np.split(bert_test2, 3, axis=1)

ts_reviews = ts_reviews.squeeze()
ts_segments = ts_segments.squeeze()
ts_masks = ts_masks.squeeze()

test_ds = tf.data.Dataset.from_tensor_slices((ts_reviews,ts_masks, ts_segments, bert_tst_lbl2)).\
                                                map(example_to_features).shuffle(100).batch(16)



In [42]:
bert_model.evaluate(test_ds)



[0.3387053608894348, 0.8784000277519226]

we can see That Bert has outperformed seq2seq model with only 3 epochs