In [12]:
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

from transformers import pipeline

# Table of Contents

## 1. Multi-Head Attention

The following is the architecture of the MultiHeadAttention layer; we can use it as a reference when implementing the multi-head layer:

![multihead_attention](../images/multi-head_attention.png)

In simple English, this is what is involved in the multi-head layer:

1. Configure the number of heads you need (hyper-parameter) 
2. The inputs to the MH layer are 3 word vectors (Query, Key, Value), and it outputs a context-aware vector
3. The inputs are passed to each attention head, which has 3 Dense layers (learnable)
4. Finally, the outputs of the heads are concatenated to form the layer's output

### Keras has an implementation of multi-head layer:


In [2]:
# Demo of the built-in Keras multi-head attention layer used as self-attention.
num_heads = 2
embedding_vector_dim = 256

# Symbolic input: sequences of 8 vectors, each of size `embedding_vector_dim`.
inputs = tf.keras.Input(shape=(8, embedding_vector_dim))
mha_layer = tf.keras.layers.MultiHeadAttention(
    num_heads=num_heads,
    key_dim=embedding_vector_dim,
)
# Self-attention: the same tensor is used as query, value and key.
outputs = mha_layer(inputs, value=inputs, key=inputs)


Metal device set to: Apple M1

systemMemory: 8.00 GB
maxCacheSize: 2.67 GB



2022-01-07 21:06:57.998932: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2022-01-07 21:06:57.999543: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [3]:
# Show the symbolic output shape produced by the attention layer call above.
print(outputs.shape)

(None, 8, 256)


## 2. Transformer Encoder

The following is the architecture of the Encoder from the original "Attention Is All You Need" paper:

![transformer_encoder](../images/transformer_encoder.png)

In brief, this is what is involved in the Transformer Encoder layer:

1. It begins with a multihead attention (as described above)
2. The original word vectors have a residual connection with the output from multihead attention
3. Then the output goes through a Normalization layer, NL1
4. Next comes a dense projection block (2 Dense layers, possibly configurable);
   the output dimension of this block is equal to the input/output vector dimension
5. Then the NL1 output is added to the output of the dense projection through a residual connection
6. Finally, we have one more Normalization layer, NL2

### A quick note on why we use residual connection and Normalization:

1. Why a residual connection?
 - It is a fix against the vanishing gradient problem
 - It acts as an information shortcut around destructive or noisy blocks (such as blocks that contain relu activations or dropout layers)
 - It enables gradient information to propagate noiselessly through a deep network
 
 2. Why use a normalization layer?
 - It helps gradients flow better during backprop
 - The normalization we use here is the LayerNormalization layer, which normalizes each sequence independently from other sequences in the batch.
 Note: BatchNorm doesn't work that great with sequence data


## 3. The Code for the Transformer Encoder layer

In [4]:
import tensorflow as tf

In [5]:
class TransformerEncoder(tf.keras.layers.Layer):
    """One Transformer encoder block.

    Data flow: multi-head self-attention -> residual add + layer norm ->
    two-layer dense projection -> residual add + layer norm.
    """

    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        """Create the sub-layers of the block.

        Args:
            embed_dim: Embedding vector dimension; also used as the attention
                `key_dim` and as the width of the last dense layer.
            dense_dim: Number of units in the hidden (relu) dense layer.
            num_heads: Number of heads in the multi-head attention layer.
            **kwargs: Forwarded unchanged to `tf.keras.layers.Layer`.
        """
        super().__init__(**kwargs)

        # Stored so that get_config() can report the constructor arguments.
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads

        # Multi-head attention block.
        self.attention = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim=embed_dim,
        )
        # Dense projection block: widen to dense_dim, then back to embed_dim.
        self.dnse_proj = tf.keras.Sequential([
            tf.keras.layers.Dense(dense_dim, activation='relu'),
            tf.keras.layers.Dense(embed_dim),
        ])
        self.layrnorm1 = tf.keras.layers.LayerNormalization()
        self.layrnorm2 = tf.keras.layers.LayerNormalization()

    def call(self, inputs, mask=None):
        """Run the encoder block on `inputs`.

        Args:
            inputs: Tensor fed to the attention layer as both query and value.
            mask: Optional mask. When given, a new axis is inserted at
                position 1 before it is passed on as `attention_mask`
                (assumes a (batch, sequence) mask -- TODO confirm).

        Returns:
            The output of the second layer normalization.
        """
        attn_mask = None if mask is None else mask[:, tf.newaxis, :]

        attended = self.attention(inputs, inputs, attention_mask=attn_mask)

        # First residual connection + normalization; feeds the dense block.
        normed = self.layrnorm1(inputs + attended)

        projected = self.dnse_proj(normed)

        # Second residual connection: the dense block's input is added to its output.
        return self.layrnorm2(normed + projected)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        base_config = super().get_config()
        return {
            **base_config,
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
        }

## Now we are going to build a text classifier using the Transformer Encoder Block

In [6]:
import pandas as pd
import numpy as  np

In [7]:
# Load the review dataset; later cells read its 'Review' and 'Polarity' columns.
df = pd.read_csv('../data/uhack_review_train.csv')

In [8]:
# Pull out the review text column and the polarity column.
# NOTE(review): neither name is referenced by the later cells visible here, and
# `lables` looks like a typo for "labels" -- confirm before renaming or removing.
text = df['Review']
lables = df['Polarity']

In [9]:
## Splitting data: the first 90% of the rows train the model, the rest are held out
train_size = int(0.9 * len(df))
train_data = df.iloc[:train_size]
test_data = df.iloc[train_size:]

train_sentences, test_sentences = (
    part['Review'].values for part in (train_data, test_data)
)
train_labels, test_labels = (
    np.array(part['Polarity'].values) for part in (train_data, test_data)
)


## HYPER-PARAM:-

NUM_WORDS = 1000   # tokenizer vocabulary limit
TRUNCATE = 'post'  # or 'pre'
PADDING = 'post'   # or 'pre'
MAX_LEN = 100      # length every sequence is padded / truncated to
EVD = 16           # embedding vector dimension

## 1. Fit Tokenizer (on the training sentences only)

bbc_tokenizer = tf.keras.preprocessing.text.Tokenizer(
    num_words=NUM_WORDS,
    oov_token='<OOV>',
)
bbc_tokenizer.fit_on_texts(train_sentences)

## 2. Convert text to sequence

train_seq, test_seq = (
    bbc_tokenizer.texts_to_sequences(sentences)
    for sentences in (train_sentences, test_sentences)
)

## 3. Convert the sequence to padded sequences

# Both splits are padded with exactly the same settings.
_pad_options = dict(truncating=TRUNCATE, padding=PADDING, maxlen=MAX_LEN)
train_padded = tf.keras.preprocessing.sequence.pad_sequences(train_seq, **_pad_options)
test_padded = tf.keras.preprocessing.sequence.pad_sequences(test_seq, **_pad_options)

In [10]:
## Classification

## Hyper-params for transformer
num_heads = 2
dense_dim = 32

# Token ids of any sequence length -> embeddings -> encoder block ->
# max over the sequence axis -> dropout -> single sigmoid unit.
inputs = tf.keras.Input(shape=(None,), dtype='int64')
embedd = tf.keras.layers.Embedding(input_dim=NUM_WORDS, output_dim=EVD)(inputs)
transf = TransformerEncoder(
    embed_dim=EVD,
    dense_dim=dense_dim,
    num_heads=num_heads,
)(embedd)
glmaxp = tf.keras.layers.GlobalMaxPool1D()(transf)
droput = tf.keras.layers.Dropout(rate=0.5)(glmaxp)
output = tf.keras.layers.Dense(units=1, activation='sigmoid')(droput)

tmodel = tf.keras.Model(inputs=inputs, outputs=output)
tmodel.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
tmodel.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 16)          16000     
_________________________________________________________________
transformer_encoder (Transfo (None, None, 16)          3296      
_________________________________________________________________
global_max_pooling1d (Global (None, 16)                0         
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 17        
Total params: 19,313
Trainable params: 19,313
Non-trainable params: 0
_________________________________________________________

In [11]:
# Save the model to disk only when the validation loss improves.
# The flag must be a real boolean: a string such as 'True' only works because any
# non-empty string is truthy, which means 'False' would enable it as well.
MC = tf.keras.callbacks.ModelCheckpoint(
    '../model/first_transformer.h5',
    monitor='val_loss',
    save_best_only=True,
    verbose=1
)

# Stop after 5 epochs without a validation-loss improvement and roll the model
# back to the best weights seen (again a real boolean, not a string).
ES = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    verbose=1,
    restore_best_weights=True
)

# Write TensorBoard logs.
TB = tf.keras.callbacks.TensorBoard('../tboard/')

# NOTE(review): the held-out split is used as validation data, so it also drives
# early stopping and checkpoint selection.
tmodel.fit(train_padded,
           train_labels,
           epochs=10,
           validation_data=(test_padded, test_labels),
           callbacks=[ES, MC, TB]
           )

Epoch 1/10


2022-01-07 21:06:58.609593: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2022-01-07 21:06:58.609601: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2022-01-07 21:06:58.609735: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
2022-01-07 21:06:58.649818: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2022-01-07 21:06:58.652122: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2022-01-07 21:06:58.927614: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


  5/173 [..............................] - ETA: 7s - loss: 1.7985 - accuracy: 0.3500

2022-01-07 21:07:00.473874: I tensorflow/core/profiler/lib/profiler_session.cc:131] Profiler session initializing.
2022-01-07 21:07:00.473885: I tensorflow/core/profiler/lib/profiler_session.cc:146] Profiler session started.
2022-01-07 21:07:00.512749: I tensorflow/core/profiler/lib/profiler_session.cc:66] Profiler session collecting data.
2022-01-07 21:07:00.516370: I tensorflow/core/profiler/lib/profiler_session.cc:164] Profiler session tear down.
2022-01-07 21:07:00.521267: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: ../tboard/train/plugins/profile/2022_01_07_21_07_00

2022-01-07 21:07:00.522487: I tensorflow/core/profiler/rpc/client/save_profile.cc:142] Dumped gzipped tool data for trace.json.gz to ../tboard/train/plugins/profile/2022_01_07_21_07_00/Virajdatts-MacBook-Air.local.trace.json.gz
2022-01-07 21:07:00.526632: I tensorflow/core/profiler/rpc/client/save_profile.cc:136] Creating directory: ../tboard/train/plugins/profile/2022_01_07_21_07_00



2022-01-07 21:07:06.232604: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.



Epoch 00001: val_loss improved from inf to 0.38945, saving model to ../model/first_transformer.h5
Epoch 2/10

Epoch 00002: val_loss improved from 0.38945 to 0.33707, saving model to ../model/first_transformer.h5
Epoch 3/10

Epoch 00003: val_loss improved from 0.33707 to 0.32859, saving model to ../model/first_transformer.h5
Epoch 4/10

Epoch 00004: val_loss improved from 0.32859 to 0.30827, saving model to ../model/first_transformer.h5
Epoch 5/10

Epoch 00005: val_loss improved from 0.30827 to 0.29782, saving model to ../model/first_transformer.h5
Epoch 6/10

Epoch 00006: val_loss improved from 0.29782 to 0.27371, saving model to ../model/first_transformer.h5
Epoch 7/10

Epoch 00007: val_loss improved from 0.27371 to 0.27342, saving model to ../model/first_transformer.h5
Epoch 8/10

Epoch 00008: val_loss improved from 0.27342 to 0.24373, saving model to ../model/first_transformer.h5
Epoch 9/10

Epoch 00009: val_loss did not improve from 0.24373
Epoch 10/10

Epoch 00010: val_loss did n

<keras.callbacks.History at 0x16b0fe3d0>

In [13]:
# Pin the checkpoint explicitly instead of relying on the pipeline's implicit
# default (the library reports "No model was supplied" otherwise), so the results
# stay reproducible if that default ever changes. This is the checkpoint the
# default resolved to when the notebook was run.
hf_sentiment_classifier = pipeline(
    'sentiment-analysis',
    model='distilbert-base-uncased-finetuned-sst-2-english',
)

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english)
Downloading: 100%|███████████████████████████████████████████████████████████████████████| 629/629 [00:00<00:00, 273kB/s]
Downloading: 100%|████████████████████████████████████████████████████████████████████| 256M/256M [00:09<00:00, 27.1MB/s]
2022-01-07 21:09:32.275431: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.
Some layers from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing TFDistilBertForSequenceClassification: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraini

In [30]:
%%timeit
# Benchmark: classify every held-out review with a single pipeline call.
# NOTE(review): names assigned inside a %%timeit cell are presumably not kept in
# the notebook namespace afterwards, so `results` should not be relied on by
# later cells -- confirm.
results = hf_sentiment_classifier(list(test_data['Review'].values))

1min 7s ± 975 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [47]:
from datasets import Dataset
from transformers.pipelines.base import KeyDataset

In [41]:
#dataset = load_dataset('csv', data_files='../data/uhack_review_train.csv')

In [44]:
# Wrap only the "Review" column of the held-out split in a `datasets.Dataset`.
# NOTE(review): `dataset` is not used by any later cell visible here -- confirm it is still needed.
dataset = Dataset.from_pandas(test_data[["Review"]])

In [60]:
# Classify each held-out review on its own and keep only the predicted label
# string of the first (and only) result returned for it.
sentiment = [
    hf_sentiment_classifier(review)[0]['label']
    for review in test_data['Review'].values
]

In [63]:
# Encode the predicted labels numerically: 'POSITIVE' -> 1, anything else -> 0.
sent_value = [1 if sen == 'POSITIVE' else 0 for sen in sentiment]

In [66]:

# Calculate accuracy percentage between two lists
def accuracy_metric(actual, predicted):
    """Return the percentage of positions at which two label sequences agree.

    Args:
        actual: Sequence of ground-truth labels.
        predicted: Sequence of predicted labels, the same length as `actual`.

    Returns:
        Accuracy as a float in [0.0, 100.0]. Two empty sequences give 0.0
        instead of raising ZeroDivisionError.

    Raises:
        ValueError: If the two sequences differ in length. A mismatch would
            otherwise be silently ignored or surface as an IndexError.
    """
    total = len(actual)
    if total != len(predicted):
        raise ValueError(
            "actual and predicted must have the same length "
            "(got %d and %d)" % (total, len(predicted))
        )
    if total == 0:
        return 0.0
    # zip() walks both sequences by position, so this also works for inputs whose
    # [] operator is label-based rather than positional.
    correct = sum(1 for truth, guess in zip(actual, predicted) if truth == guess)
    return correct / float(total) * 100.0

In [68]:
# Ground-truth labels go first and model predictions second, matching the
# accuracy_metric(actual, predicted) signature. The score itself is unchanged
# because the metric only counts matching positions.
accuracy_metric(list(test_data['Polarity']), sent_value)

84.85342019543974