## The aim of this lab is to build a system for text classification on top of a BERT model

We are going to use the same sentiment analysis dataset as in Lab U5.06.

In [None]:
!wget --no-cache -O init.py -q https://raw.githubusercontent.com/rramosp/2020.deeplearning/master/init.py
from init import init; init(force_download=False)

In [None]:
import sys
if 'google.colab' in sys.modules:
    print ("setting tensorflow version in colab")
    %tensorflow_version 2.x

In [1]:
import tensorflow as tf
import tensorflow_hub as hub

# Record the library versions next to the results so the run can be reproduced.
for label, library in (("TF version: ", tf), ("Hub version: ", hub)):
    print(label, library.__version__)

TF version:  2.2.0
Hub version:  0.8.0


In [None]:
!pip install bert-for-tf2

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

In [3]:
import bert
# Alias the tokenizer class from the `bert` package so later cells can call it directly.
FullTokenizer = bert.bert_tokenization.FullTokenizer
from tensorflow.keras.models import Model       # Keras is the new high level API for TensorFlow
import math

In [4]:
# Load the tweets and keep only the necessary columns: the tweet text and its sentiment label.
columns_to_keep = ['text', 'airline_sentiment']
data = pd.read_csv('local/data/Tweets.csv')[columns_to_keep]

In [5]:
import re

# Raw strings keep `\s` intact for the regex engine (no invalid-escape warnings).
MENTION_PATTERN = re.compile(r'@[^\s]+')
# `A-Z`, not `A-z`: the latter range also matches [ \ ] ^ _ and the backtick,
# which would let those characters survive the clean-up.
NON_ALNUM_PATTERN = re.compile(r'[^a-zA-Z0-9\s]')

#Remove neutral class
# .copy() yields an independent frame, so the column assignment below does not
# write into a slice of the original (avoids SettingWithCopyWarning).
data = data[data.airline_sentiment != "neutral"].copy()

#text normalization
data['text'] = (
    data['text']
    .str.lower()
    .str.replace(MENTION_PATTERN, '', regex=True)      # remove @mentions (e.g. the airline handle)
    .str.replace(NON_ALNUM_PATTERN, '', regex=True)    # keep only letters, digits and whitespace
)

# Class balance check: number of positive, then negative, tweets.
print(np.sum(data['airline_sentiment'].values == 'positive'))
print(np.sum(data['airline_sentiment'].values == 'negative'))

2363
9178


In [6]:
# Remove the retweet marker "rt" from every tweet.
# Assigning to the rows yielded by iterrows() is not guaranteed to write back to
# the frame, so the replacement is applied to the column itself. The word
# boundaries restrict it to the standalone token, leaving words that merely
# contain "rt" (e.g. "airport") intact.
data['text'] = data['text'].str.replace(r'\brt\b', ' ', regex=True)

In [7]:
# Preview the first ten cleaned tweets with their labels.
data.head(10)

Unnamed: 0,text,airline_sentiment
1,plus youve added commercials to the experienc...,positive
3,its really aggressive to blast obnoxious ente...,negative
4,and its a really big bad thing about it,negative
5,seriously would pay 30 a flight for seats tha...,negative
6,yes nearly every time i fly vx this ear worm ...,positive
8,well i didntbut now i do d,positive
9,it was amazing and arrived an hour early your...,positive
11,i lt3 pretty graphics so much better than min...,positive
12,this is such a great deal already thinking ab...,positive
13,im flying your fabulous seductive skies agai...,positive


### Let's take a look at the following example of how to use a BERT model from TensorFlow Hub

In [8]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input

In [10]:
# Fixed length of every model input; shorter token sequences are zero-padded up to this size.
max_seq_length = 128  # Your choice here.

In [11]:
# The BERT layer takes three int32 tensors of length max_seq_length:
# token ids, padding mask and segment ids (created in that order).
input_word_ids, input_mask, segment_ids = (
    tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name=layer_name)
    for layer_name in ("input_word_ids", "input_mask", "segment_ids")
)

# BERT encoder loaded from TensorFlow Hub.
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=True,
)
bert_inputs = [input_word_ids, input_mask, segment_ids]
pooled_output, sequence_output = bert_layer(bert_inputs)

# Expose both BERT outputs so they can be inspected with model.predict().
model = Model(inputs=bert_inputs, outputs=[pooled_output, sequence_output])

In [12]:
# Read the vocabulary path and the lower-casing flag stored with the hub module,
# then build a tokenizer configured with them.
hub_assets = bert_layer.resolved_object
vocab_file = hub_assets.vocab_file.asset_path.numpy()
do_lower_case = hub_assets.do_lower_case.numpy()
tokenizer = FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

In [13]:
def get_masks(tokens, max_seq_length):
    """Build the padding mask: 1 for each real token, 0 for each padding slot.

    Returns a list of length ``max_seq_length``.
    Raises IndexError when there are more tokens than ``max_seq_length``.
    """
    n_real = len(tokens)
    n_pad = max_seq_length - n_real
    if n_pad < 0:
        raise IndexError("Token length more than max seq length!")
    mask = [1] * n_real
    mask.extend([0] * n_pad)
    return mask


def get_segments(tokens, max_seq_length):
    """Build the segment ids: 0 up to and including the first "[SEP]", 1 afterwards.

    The result is padded with 0 up to ``max_seq_length``.
    Raises IndexError when there are more tokens than ``max_seq_length``.
    """
    n_pad = max_seq_length - len(tokens)
    if n_pad < 0:
        raise IndexError("Token length more than max seq length!")

    segment_ids = []
    seen_separator = False
    for tok in tokens:
        # The separator itself still belongs to the segment it closes.
        segment_ids.append(1 if seen_separator else 0)
        seen_separator = seen_separator or tok == "[SEP]"
    return segment_ids + [0] * n_pad


def get_ids(tokens, tokenizer, max_seq_length):
    """Token ids from Tokenizer vocab, zero-padded to ``max_seq_length``.

    Args:
        tokens: sequence of token strings (special tokens already added).
        tokenizer: object exposing ``convert_tokens_to_ids(tokens)``.
        max_seq_length: required length of the returned list.

    Returns:
        A list of exactly ``max_seq_length`` ids; padding slots hold 0.

    Raises:
        IndexError: if there are more ids than ``max_seq_length``. This matches
            get_masks/get_segments; without the check an over-long sequence
            would be returned unpadded and longer than ``max_seq_length``.
    """
    token_ids = list(tokenizer.convert_tokens_to_ids(tokens))
    if len(token_ids) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    return token_ids + [0] * (max_seq_length - len(token_ids))

In [14]:
s = "This is a nice sentence."
# Tokenize the sentence and wrap it in the [CLS] ... [SEP] special tokens.
stokens = ["[CLS]", *tokenizer.tokenize(s), "[SEP]"]

# Build the three fixed-length model inputs from the token list.
input_ids, input_masks, input_segments = (
    get_ids(stokens, tokenizer, max_seq_length),
    get_masks(stokens, max_seq_length),
    get_segments(stokens, max_seq_length),
)


In [15]:
# Show the tokens and the three encoded inputs, one per line.
print(stokens, input_ids, input_masks, input_segments, sep="\n")

['[CLS]', 'this', 'is', 'a', 'nice', 'sentence', '.', '[SEP]']
[101, 2023, 2003, 1037, 3835, 6251, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [16]:
# Length of the raw token list followed by the lengths of the padded inputs, one per line.
print(*map(len, (stokens, input_ids, input_masks, input_segments)), sep="\n")

8
128
128
128


In [17]:
# Turn each input list into a batch of one row, then run the BERT model.
pool_embs, all_embs = model.predict(
    [
        np.array(feature).reshape(1, -1)
        for feature in (input_ids, input_masks, input_segments)
    ]
)

BERT provides a 768-dimensional embedding for each token in the given sentence. Note that it gives you two different outputs: `pool_embs` and `all_embs`. `all_embs` contains the embeddings of all the tokens in the sequence, while `pool_embs` is the embedding of the initial [CLS] token. It is "pooled" from all input tokens in the sense that the multiple attention layers force it to depend on all the other tokens.

In [18]:
# Shape of the pooled output for the single sentence passed to predict().
pool_embs.shape

(1, 768)

In [19]:
# Shape of the per-token output for the single sentence passed to predict().
all_embs.shape

(1, 128, 768)

## Exercise 1:

Define a DL model on top of the BERT embedding to classify the tweets dataset. Include an LSTM layer with 128 cells. Remember to freeze the BERT model during the training phase. Use 80% of the samples for training and train for 3 epochs. In this case, do not remove the stopwords.

You should get the following results:

In [139]:
# Reference training call for this exercise. It is a template: `...` stands for the
# training inputs and labels you have to supply.
# NOTE(review): model2 is not defined in the visible cells; presumably it is the model built for this exercise.
model2.compile(optimizer='adam', loss="binary_crossentropy", metrics=["accuracy"])
# validation_split=0.1 holds out 10% of the data passed to fit() for validation.
model2.fit(...,validation_split=0.1,batch_size=32, epochs=3, verbose=1)

Train on 8308 samples, validate on 924 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fb5280d2bd0>

In [142]:
# Report the evaluation metrics.
# NOTE(review): accuracy_score, recall_score, especi_score, y_te and y_pred are not defined in the
# visible cells. accuracy_score/recall_score are presumably from sklearn.metrics and especi_score
# a user-written specificity helper — confirm and define them before running this cell.
print('Accuracy = {}'.format(accuracy_score(y_te,y_pred)))
print('Sensitivity = {}'.format(recall_score(y_te,y_pred)))
print('Especificity = {}'.format(especi_score(y_te,y_pred.flatten())))

Accuracy = 0.931572109138155
Sensitivity = 0.7459349593495935
Especificity = 0.9818381948266374


## Exercise 2:

Define a DL model on top of the BERT embedding to classify the tweets dataset. Include a Conv1D layer with 12 filters and a 3-gram based kernel. Remember to freeze the BERT model during the training phase. Use 80% of the samples for training and train for 3 epochs. In this case, do not remove the stopwords either.

You should get the following results:

In [144]:
# Reference training call for this exercise. It is a template: `...` stands for the
# training inputs and labels you have to supply.
# NOTE(review): model3 is not defined in the visible cells; presumably it is the model built for this exercise.
model3.compile(optimizer='adam', loss="binary_crossentropy", metrics=["accuracy"])
# validation_split=0.1 holds out 10% of the data passed to fit() for validation.
model3.fit(..., validation_split=0.1,batch_size=32, epochs=3, verbose=1)

Train on 8308 samples, validate on 924 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fb5604456d0>

In [146]:
# Report the evaluation metrics.
# NOTE(review): accuracy_score, recall_score, especi_score, y_te and y_pred are not defined in the
# visible cells. accuracy_score/recall_score are presumably from sklearn.metrics and especi_score
# a user-written specificity helper — confirm and define them before running this cell.
print('Accuracy = {}'.format(accuracy_score(y_te,y_pred)))
print('Sensitivity = {}'.format(recall_score(y_te,y_pred)))
print('Especificity = {}'.format(especi_score(y_te,y_pred.flatten())))

Accuracy = 0.931572109138155
Sensitivity = 0.7926829268292683
Especificity = 0.969179966978536


## Exercise 3

What is the result if you define a Dense network based on BERT's pooled output?

You should get the following results using a hidden Dense layer of 128 neurons:

In [150]:
# Report the evaluation metrics.
# NOTE(review): accuracy_score, recall_score, especi_score, y_te and y_pred are not defined in the
# visible cells. accuracy_score/recall_score are presumably from sklearn.metrics and especi_score
# a user-written specificity helper — confirm and define them before running this cell.
print('Accuracy = {}'.format(accuracy_score(y_te,y_pred)))
print('Sensitivity = {}'.format(recall_score(y_te,y_pred)))
print('Especificity = {}'.format(especi_score(y_te,y_pred.flatten())))

Accuracy = 0.8813339107838891
Sensitivity = 0.6788617886178862
Especificity = 0.9361585030269676
