<a href="https://colab.research.google.com/github/RayGone/SentimentAnalysis/blob/phase1/Experiments/Transformer/Transformer_4_SA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# !pip install transformers datasets -q

import os
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import datasets
from datasets import load_dataset
from transformers import BertTokenizerFast, TFAutoModel
from tensorflow.keras.preprocessing.sequence import pad_sequences


import gc

def seed_everything(seed=0):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: Integer seed applied to Python's ``random`` module, the
            ``PYTHONHASHSEED`` environment variable, NumPy and TensorFlow.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    # Use the `seed` argument here. The previous code read the notebook-level
    # `rand_seed` global, so the function ignored its own parameter and
    # raised NameError when called before that global was assigned.
    tf.keras.utils.set_random_seed(seed)
    tf.random.set_seed(seed) # tensorflow

# Single notebook-wide seed; applied once, before any model or data code runs.
rand_seed = 999
seed_everything(seed=rand_seed)

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.python.ops.numpy_ops import np_config
# Switch on TensorFlow's NumPy-behaviour mode before any layer is defined.
# NOTE(review): the cells shown here do not make clear which NumPy-style
# tensor feature is relied on - confirm this call is still required.
np_config.enable_numpy_behavior()

def PositionalEncoding(seq_length=2048,feature_depth=512,merge='interleave'):
    """Build a fixed sinusoidal positional-encoding table.

    Args:
        seq_length: Number of positions (rows of the table).
        feature_depth: Width of the table. Must be even, because half of the
            columns hold sine values and the other half cosine values.
        merge: How the sine and cosine halves are laid out:
            'concat'     -> [sin, sin, ..., cos, cos, ...]
            'interleave' -> [sin, cos, sin, cos, ...]
            'add'        -> element-wise mean of the 'concat' and
                            'interleave' layouts
            Any other value (including None) falls back to 'interleave'.

    Returns:
        A ``tf.float32`` tensor of shape ``(seq_length, feature_depth)``.

    Raises:
        ValueError: If ``feature_depth`` is odd. The previous float division
            silently produced a table with ``feature_depth + 1`` columns.
    """
    if feature_depth % 2:
        raise ValueError(
            "feature_depth must be even, got {}".format(feature_depth))

    depth = feature_depth // 2

    positions = np.arange(seq_length)[:, np.newaxis]    # (seq, 1)
    depths = np.arange(depth)[np.newaxis, :] / depth    # (1, depth)

    angle_rates = 1 / (10000**depths)                   # (1, depth)
    # The small constant offset is kept from the original implementation.
    angle_rads = positions * angle_rates + 0.0001       # (seq, depth)

    sin = np.sin(angle_rads)
    cos = np.cos(angle_rads)

    concatenated = np.concatenate([sin, cos], axis=-1)
    interleaved = np.zeros(concatenated.shape)
    interleaved[:, ::2] = sin
    interleaved[:, 1::2] = cos

    if merge == 'concat':
        table = concatenated
    elif merge == 'add':
        table = (concatenated + interleaved) / 2
    else:
        table = interleaved
    return tf.cast(table, dtype=tf.float32)

class PositionalEmbedding(tf.keras.layers.Layer):
  """Token embedding scaled by sqrt(d_model) plus a fixed positional table.

  Args:
    vocab_size: Size of the token vocabulary (rows of the embedding matrix).
    d_model: Embedding width; also the width of the positional table.
    context_length: Number of positions pre-computed in the positional table.
    pos_enc_merge: Layout passed to `PositionalEncoding` as `merge`.
  """

  def __init__(self, vocab_size, d_model, context_length=2048,pos_enc_merge='add'):
    super().__init__()
    self._name='PosEmbd'
    self.d_model = d_model
    # mask_zero=True makes the embedding produce a mask for token id 0.
    self.embedding = tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=d_model,
        mask_zero=True)
    self.pos_encoding = PositionalEncoding(
        seq_length=context_length,
        feature_depth=d_model,
        merge=pos_enc_merge)

  def compute_mask(self, *args, **kwargs):
    # The mask of this layer is exactly the mask of the inner embedding.
    return self.embedding.compute_mask(*args, **kwargs)

  def call(self, x):
    seq_len = tf.shape(x)[1]
    # This factor sets the relative scale of the embedding and the
    # positional encoding.
    scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    embedded = self.embedding(x) * scale
    # Only the first `seq_len` rows of the table are used; the new leading
    # axis lets the table broadcast over the batch.
    return embedded + self.pos_encoding[tf.newaxis, :seq_len, :]


In [3]:
##------------
##------Attention----------------
##----------------------

class BaseAttention(tf.keras.layers.Layer):
  """Holds a multi-head attention layer with a residual add and layer norm.

  All keyword arguments are forwarded unchanged to
  `tf.keras.layers.MultiHeadAttention`. The class defines no `call`.
  """

  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

  def build(self,input_shape):
    # The same shape is passed three times to the attention layer's private
    # builder.
    signature = tf.TensorShape(input_shape)
    self.mha._build_from_signature(signature, signature, signature)


class LocalGlobalSelfAttention(tf.keras.layers.Layer):
  """Self-attention that sums a global pass and a windowed (local) pass.

  Args:
    num_heads: Number of attention heads for both attention layers.
    key_dim: Per-head key size for both attention layers.
    dropout: Dropout rate inside both attention layers.
    num_window: Number of equal windows the sequence axis is split into for
      the local pass.

  After each call the attention scores of the most recent forward pass are
  available as `global_attention_score` (one tensor) and
  `local_attention_score` (a list with one tensor per window).
  """

  def __init__(self,num_heads, key_dim, dropout, num_window=8):
    super().__init__()
    self._name='LocalGlobal_Self_Attention'

    self.local_mha = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=key_dim, dropout=dropout)
    self.global_mha = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=key_dim, dropout=dropout)

    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

    self.num_window = num_window
    self.concat_layer = tf.keras.layers.Concatenate(axis=1)

    self.global_attention_score = None
    self.local_attention_score = None

  def call(self, x):
    # Global pass: every position attends over the whole sequence.
    global_out, self.global_attention_score = self.global_mha(
        query=x, value=x, key=x, return_attention_scores=True)

    # Local pass: the same attention layer is applied to each window
    # independently, then the windows are stitched back along axis 1.
    windows = tf.split(x, num_or_size_splits=self.num_window, axis=1)
    window_outputs = []
    self.local_attention_score = []
    for window in windows:
      window_out, window_score = self.local_mha(
          key=window, query=window, value=window,
          return_attention_scores=True)
      window_outputs.append(window_out)
      self.local_attention_score.append(window_score)
    local_out = self.concat_layer(window_outputs)

    # Residual sum of the input and both attention outputs, then layer norm.
    summed = self.add([x, global_out, local_out])
    return self.layernorm(summed)


In [7]:
class FeedForward(tf.keras.layers.Layer):
  """Position-wise feed-forward sub-layer with residual add and layer norm.

  Args:
    d_model: Output width of the second dense layer.
    dff: Width of the first (GELU-activated) dense layer.
    dropout_rate: Dropout applied after the second dense layer.
  """

  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    expand = tf.keras.layers.Dense(dff, activation='gelu')
    project = tf.keras.layers.Dense(d_model)
    drop = tf.keras.layers.Dropout(dropout_rate)
    self.seq = tf.keras.Sequential([expand, project, drop])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    # Residual connection around the dense stack, followed by layer norm.
    residual = self.add([x, self.seq(x)])
    return self.layer_norm(residual)

In [8]:
##-------
##------Encoder----------------
##----------------------

class EncoderBlock(tf.keras.layers.Layer):
  """One encoder block: local+global self-attention, then feed-forward.

  Args:
    d_model: Model width; also used as the per-head `key_dim` of the
      attention layers.
    num_window: Number of windows used by the local attention pass.
    dff: Hidden width of the feed-forward sub-layer.
    num_heads: Number of attention heads.
    dropout_rate: Dropout rate used in both the attention and the
      feed-forward sub-layers.
  """

  def __init__(self,*, d_model,num_window, dff, num_heads, dropout_rate=0.1):
    super().__init__()

    self._name='Encoder_Block'
    self.self_attention = LocalGlobalSelfAttention(
        num_window=num_window,
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    # Forward the configured rate. Previously it was dropped here, so the
    # feed-forward dropout silently stayed at FeedForward's default of 0.1.
    self.ffn = FeedForward(d_model, dff, dropout_rate=dropout_rate)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x


"""_summary_

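  attn_stack_type: NOT a parameter of the EncoderLayer defined below. Every EncoderBlock uses the
                    'add' arrangement [(LSA+GSA),...]: the local (LSA) and global (GSA) self-attention
                    outputs are summed with the block input. The 'stack' arrangement
                    [GSA,LSA,.....,GSA+LSA] is not implemented in this version.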
"""
class EncoderLayer(tf.keras.layers.Layer):
  """A stack of `EncoderBlock`s followed by dropout.

  Args:
    num_layers: Number of encoder blocks. A falsy value (None or 0) falls
      back to a single block.
    d_model: Model width passed to every block.
    num_heads: Number of attention heads per block.
    num_window: Number of local-attention windows per block.
    dff: Feed-forward hidden width per block.
    dropout_rate: Dropout rate for the blocks and for the final dropout.
  """

  def __init__(self, *, num_layers, d_model, num_heads, num_window, dff, dropout_rate=0.1):
    super().__init__()

    self.num_layers = num_layers if num_layers else 1

    # Build from the normalised count. Using the raw argument raised
    # TypeError for None, and for 0 left the list empty while
    # `self.num_layers` was 1, which failed with IndexError in `call`.
    self.enc_layers = [
      EncoderBlock(d_model=d_model, num_window=num_window, num_heads=num_heads, dff=dff,dropout_rate=dropout_rate)
      for _ in range(self.num_layers)
      ]

    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    for block in self.enc_layers:
      x = block(x)
    return self.dropout(x)



In [6]:
class Transformer(tf.keras.models.Model):
  """Encoder-only classifier: embeddings -> encoder stack -> pooled head.

  Args:
    num_layers: Number of encoder blocks; a falsy value falls back to 1.
    d_model: Model width.
    num_heads: Number of attention heads per block.
    num_window: Number of local-attention windows per block.
    dff: Feed-forward hidden width; a falsy value falls back to
      `2 * d_model`.
    vocab_size: Size of the token vocabulary.
    num_class: Number of output classes.
    dropout_rate: Dropout rate used throughout the model.

  After a forward pass, `last_hidden_state` holds the encoder output and
  `pooled_state` holds the pooled feature vector fed to the head.
  """

  def __init__(self, *, num_layers, d_model, num_heads, num_window,
               dff, vocab_size,num_class=2, dropout_rate=0.1):
    super().__init__()
    self._name = 'Fusion'
    if not dff:
      dff = d_model * 2 ## default value for dff

    self.d_model = d_model
    self.num_layers = num_layers if num_layers else 1

    self.pos_embedding = PositionalEmbedding(vocab_size=vocab_size, d_model=d_model)

    # Pass the normalised layer count so a falsy `num_layers` is handled
    # the same way here and inside the encoder.
    self.encoder = EncoderLayer(num_layers=self.num_layers, d_model=d_model, num_heads=num_heads,
                           num_window=num_window, dff=dff, dropout_rate=dropout_rate)

    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.out = tf.keras.layers.Dense(d_model,activation='tanh',name='feature')
    self.head = tf.keras.layers.Dense(num_class,activation='softmax',name='classification_head')

  def call(self, x):
    # `x` is token-IDs shape: (batch, seq_len)
    x = self.pos_embedding(x)  # Shape `(batch_size, seq_len, d_model)`.
    x = self.encoder(x) # Shape `(batch_size, seq_len, d_model)`.
    self.last_hidden_state = x
    # Pool with the hidden state of the FIRST TOKEN of every sequence.
    # The previous `x[0]` selected the first *batch element* instead, which
    # produced an output of shape `(seq_len, num_class)`.
    # NOTE(review): position 0 is presumably the tokenizer's [CLS] token - confirm.
    first_token = x[:, 0, :]  # Shape `(batch_size, d_model)`.
    self.pooled_state = self.dropout(self.out(first_token))
    # self.pooled_state = tf.reduce_logsumexp(x,axis=1) * 0.1 # Shape `(batch_size, d_model)`.
    return self.head(self.pooled_state)  # Shape `(batch_size, num_class)`.

  def changeHead(self,num_class,activation='softmax'):
    """Replace the classification head with a new, freshly created Dense layer."""
    self.head = tf.keras.layers.Dense(num_class,activation=activation,name='classification_head')

In [6]:
# NOTE(review): `tokenizer` and `max_len` are not defined in any cell shown
# above (the recorded output of this cell is a NameError) - they must be
# created before this cell runs.
model = Transformer(
    num_layers=1,
    d_model=256,
    num_heads=4,
    num_window=4,
    dff=768,
    vocab_size=len(tokenizer),
    num_class=3)

model.compile(
    loss='categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(learning_rate=2e-6),
    metrics=['acc'])

batch_size = 8
# One forward pass on a dummy batch builds the weights so summary() works.
dummy = np.ones(shape=(batch_size, max_len))
model(dummy) ## building the model
model.summary()

NameError: name 'tokenizer' is not defined

In [14]:
# NOTE(review): the test split doubles as the validation set here, so early
# stopping is tuned on the same data that is scored in the cells below.
history = model.fit(tf.constant(train_input),
        tf.constant(train_labels),
        epochs=100,batch_size=batch_size,
        # Keras documents `validation_data` as a tuple (x_val, y_val); pass a
        # tuple rather than a list so it is always unpacked as intended.
        validation_data=(tf.constant(test_input),tf.constant(test_labels)),
        # Stop after one epoch without val_loss improvement and roll back
        # to the best weights seen.
        callbacks=[tf.keras.callbacks.EarlyStopping(
                            monitor='val_loss', patience=1,
                            verbose=1, mode='auto',
                            restore_best_weights=True)
                        ])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100

In [None]:
import seaborn
from matplotlib import pyplot as plt

# 1-based epoch numbers for the x axis; explicit x/y keywords avoid relying
# on how seaborn interprets a positional argument.
epochs_run = list(range(1, len(history.history['loss']) + 1))

# Label both curves so training and validation can be told apart.
seaborn.lineplot(x=epochs_run, y=history.history['loss'], label='train')
seaborn.lineplot(x=epochs_run, y=history.history['val_loss'], label='validation')
plt.title("Loss Graph")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()

seaborn.lineplot(x=epochs_run, y=history.history['acc'], label='train')
seaborn.lineplot(x=epochs_run, y=history.history['val_acc'], label='validation')
plt.title("Accuracy Graph")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.show()

In [None]:
####-----------------------------------------
## ---------------Prediction------------------
####-----------------------------------------

from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

# Predicted class = index of the largest score in each output row.
pred_labels = list(map(np.argmax, model.predict(x=tf.constant(test_input))))

# True class = index of the largest entry in each label row.
eval_labels = list(map(np.argmax, test_labels))



In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# The three class-weighted scores share one call shape, so print them in a loop.
for metric_name, metric_fn in (("F1-Score", f1_score),
                               ("Precision-Score", precision_score),
                               ("Recall-Score", recall_score)):
  print(metric_name, metric_fn(eval_labels, pred_labels, average='weighted'))
print("Accuracy-Score", accuracy_score(eval_labels, pred_labels))

confusion_matrix = tf.math.confusion_matrix(eval_labels, pred_labels, num_classes=3)
print(confusion_matrix)
cmd = ConfusionMatrixDisplay(confusion_matrix.numpy())
cmd.plot()
# plt.show()

# Confusion matrix of the true labels against themselves.
print("True Labels Onlys", tf.math.confusion_matrix(eval_labels, eval_labels, num_classes=3))

## Attention Score Visualization

In [None]:
# Print the index of the first test sample the model classified correctly.
for i in range(len(eval_labels)):
  if eval_labels[i] != pred_labels[i]:
    continue
  print(i)
  break

In [None]:
## processing a text:
# tokens = tokenizer('विश्वमा महामारीको रुपमा फैलिरहेको कोरोना भाइरस(कोभिड–१९) को बारेमा विभिन्न भ्रामक समाचारहरु आइरहेका छन्',padding='max_length',max_length=128)['input_ids']
tokens = train_input[0]

# Count the token ids that are not the tokenizer's padding id.
context_size = sum(1 for token_id in tokens if token_id != tokenizer.pad_token_id)

# Forward pass on a batch of one; the attention layers store their scores
# during the call, which the plotting cells below read.
model(tf.constant([tokens]))
print("Actual Context Length", context_size)
print(tokenizer.convert_ids_to_tokens(tokens[:context_size]))

### 1. Global Attentions

In [None]:
import math
import seaborn as sns

nLayer = model.num_layers
nHead = model.encoder.enc_layers[0].self_attention.global_attention_score.shape[1] ## attention_score[nBatch][nHead]

# Show the real tokens plus half as many padding positions again.
attention_span = context_size + int(context_size/2)

cols = 4
rows= math.ceil(nHead/cols) if nHead > cols else 1
for i in range(nLayer):
  # The encoder keeps its blocks in `enc_layers`, and each block's attention
  # layer stores the global scores in `global_attention_score`. The previous
  # `gsa_enc_layers[...].attention_score` path does not exist on this model.
  global_scores = model.encoder.enc_layers[i].self_attention.global_attention_score
  fig = plt.figure(figsize=[5*cols, 4*rows])
  fig.suptitle('Global Self Attention Score: Layer {}'.format(i+1), fontsize=14)
  for j in range(nHead):
    ax = fig.add_subplot(rows,cols,j+1)
    ax.set_title("Head {}".format(j+1))
    # [0] picks the single sample in the batch, [j] the head.
    sns.heatmap(global_scores[0][j][:attention_span,:attention_span],ax=ax,cmap="RdBu")
  plt.show()



### 2. Local Attentions

In [None]:
# Each block's attention layer stores the local scores as a list with one
# tensor per window in `local_attention_score`. The previous
# `lsa_enc_layers[...].attention_score` path does not exist on this model.
first_layer_local_scores = model.encoder.enc_layers[0].self_attention.local_attention_score
per_window_context = first_layer_local_scores[0].shape[-1]

nWindow = 1 if per_window_context >= context_size else context_size//per_window_context + 2
# Never ask for more windows than the layer actually produced; the "+ 2"
# margin above could otherwise index past the end of the list.
nWindow = min(nWindow, len(first_layer_local_scores))
nHead = first_layer_local_scores[0].shape[1]  ## attention_score[nWindow][nBatch]

print("Per Window Context Length: ",per_window_context)

nAscore = nHead*nWindow
cols = 4 if nAscore > 4 else nAscore
rows = math.ceil(nAscore/cols) if nAscore > cols else 1
for i in range(nLayer):
  local_scores = model.encoder.enc_layers[i].self_attention.local_attention_score
  fig = plt.figure(figsize=[5*cols, 4*rows])
  fig.suptitle('Local Self Attention Score: Layer {}'.format(i+1), fontsize=14)
  index = 1
  for j in range(nWindow):
    for k in range(nHead):
      ax = fig.add_subplot(rows,cols,index)
      ax.set_title("Window {} - Head {}".format(j+1,k+1))
      # [j] picks the window, [0] the single sample in the batch, [k] the head.
      sns.heatmap(local_scores[j][0][k],ax=ax,cmap="RdBu")
      index+=1
  plt.show()