<a href="https://colab.research.google.com/github/Srvand/NLP_Transformers/blob/main/Bert_SequenceClassification_V2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Installing transformers library
!pip install transformers

Collecting transformers
  Downloading transformers-4.15.0-py3-none-any.whl (3.4 MB)
[K     |████████████████████████████████| 3.4 MB 5.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.1-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 463 kB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 70.3 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 43.4 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 51.1 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
 

In [3]:
import numpy as np
import pandas as pd

import tensorflow as tf
import transformers as trfs

from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split

In [4]:
# Importing Data from file
df = pd.read_csv('data.csv').drop(['Unnamed: 0'],axis=1)

In [5]:
df.shape

(13083, 2)

In [6]:
df.head(5)

Unnamed: 0,text,category
0,I am still waiting on my card?,card_arrival
1,What can I do if my card still hasn't arrived ...,card_arrival
2,I have been waiting over a week. Is the card s...,card_arrival
3,Can I track my card while it is in the process...,card_arrival
4,"How do I know if I will get my card, or if it ...",card_arrival


In [9]:
# Finding max length of sentence from the data
df.text.str.len().max()

433

In [7]:
# Converting Categorical column into Categhory & one hot encoding to feed into model
df['category_label']=pd.Categorical(df['category'])
df['category']=df['category_label'].cat.codes

In [8]:
# Unique number of labels
df['category'].nunique()

77

In [10]:
# Setting hyperparameters for the Model
# Max length of encoded string(including special tokens such as [CLS] and [SEP]):
MAX_SEQUENCE_LENGTH = 100
# Standard BERT model with lowercase chars only:
PRETRAINED_MODEL_NAME = 'bert-base-uncased' 
# Batch size for fitting:
BATCH_SIZE = 64
# Number of epochs:
EPOCHS=5
# Setting num of labels
num_labels=df['category'].nunique()

In [11]:
#Load BERT tokenizer
tokenizer = trfs.BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
# Load Bert pre-trained model
bert_model = trfs.TFBertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME,num_labels=num_labels)


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Build model input&attention mask
input_ids = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='attention_mask')
#Model output
output = bert_model([input_ids, attention_mask])[0]
output = tf.keras.layers.Dense(num_labels, activation='softmax')(output)
# Load the Transformers BERT model as a layer in a Keras model(Inputs&Outputs)
model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=output)

In [14]:
#Model Summary
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 100)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 100)]        0           []                               
                                                                                                  
 tf_bert_for_sequence_classific  TFSequenceClassifie  109541453  ['input_ids[0][0]',              
 ation (TFBertForSequenceClassi  rOutput(loss=None,               'attention_mask[0][0]']         
 fication)                      logits=(None, 77),                                                
                                 hidden_states=None                                           

In [15]:
# Set an optimizer
opt = tf.keras.optimizers.Adam(learning_rate=3e-5)
# Compile the model
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [17]:
# Split the model into Train& Test Data
X_train,X_val,Y_train,Y_val = train_test_split(df.text, df.category, test_size=0.2)

In [18]:
print(X_train.shape,X_val.shape,Y_train.shape,Y_val.shape)

(10466,) (2617,) (10466,) (2617,)


In [39]:
#Below step breaks sentences/words into Tokens,add special [CLS]&[SEP]tokens,substitute tokens with ID's
def batch_encode(X,tokenizer):
    return tokenizer.batch_encode_plus(
    X,
    max_length=MAX_SEQUENCE_LENGTH, # set the length of the sequences
    add_special_tokens=True, # add [CLS] and [SEP] tokens
    return_attention_mask=True,
    return_token_type_ids=False, # not needed for this type of ML task
    pad_to_max_length=True, # add 0 pad tokens to the sequences less than max_length
    return_tensors='tf'
)
# There are few methods for Tokenizer available. Tokenizer,Tokenizer.encode,Tokenizer.encode_plus,Tokenizer.batch_encode_plus
#input_ids,toke_type_ids,attention_mask are ouputs of tokenizer
# [CLS] token added at first position and [SEP] token added at end of each sentence
# In this case we are using batch_encode_plus as sequence of sentences list to be encoded    

In [20]:
batch_encode(X_val,tokenizer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'input_ids': <tf.Tensor: shape=(2617, 100), dtype=int32, numpy=
array([[  101,  1996, 10439, ...,     0,     0,     0],
       [  101,  2054,  2003, ...,     0,     0,     0],
       [  101,  1045,  2572, ...,     0,     0,     0],
       ...,
       [  101,  2129,  2079, ...,     0,     0,     0],
       [  101,  2003,  2045, ...,     0,     0,     0],
       [  101,  2054,  2003, ...,     0,     0,     0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2617, 100), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

In [40]:
s1="This is a sample sentence"
s2="This is second one"
s=[["This is a sample sentence"],["This is second one"]]

In [41]:
# Sample tokenizer example
t=tokenizer(s1,s2)
print(t)
decoded = tokenizer.decode(t["input_ids"])
print(decoded)
#input_ids,toke_type_ids,attention_mask are ouputs of tokenizer
# [CLS] token added at first position and [SEP] token added at end of each sentence

{'input_ids': [101, 2023, 2003, 1037, 7099, 6251, 102, 2023, 2003, 2117, 2028, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[CLS] this is a sample sentence [SEP] this is second one [SEP]


In [37]:
# t=tokenizer.encode(s1,s2)
# print(t)
# t=tokenizer.encode_plus(s1)
# print(t)
# decoded = tokenizer.decode(t["input_ids"])
# print(decoded)
# t=tokenizer.batch_encode_plus(['This is a sample sentence'], ['This is second one'])
# print(t["input_ids"])

[101, 2023, 2003, 1037, 7099, 6251, 102, 2023, 2003, 2117, 2028, 102]
{'input_ids': [101, 2023, 2003, 1037, 7099, 6251, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
[CLS] this is a sample sentence [SEP]
[[101, 2023, 2003, 1037, 7099, 6251, 102]]


In [42]:
# Encoding the train and test features
X_train = batch_encode(X_train,tokenizer)
X_val = batch_encode(X_val,tokenizer)



In [43]:
X_val.values()

dict_values([<tf.Tensor: shape=(2617, 100), dtype=int32, numpy=
array([[  101,  1996, 10439, ...,     0,     0,     0],
       [  101,  2054,  2003, ...,     0,     0,     0],
       [  101,  1045,  2572, ...,     0,     0,     0],
       ...,
       [  101,  2129,  2079, ...,     0,     0,     0],
       [  101,  2003,  2045, ...,     0,     0,     0],
       [  101,  2054,  2003, ...,     0,     0,     0]], dtype=int32)>, <tf.Tensor: shape=(2617, 100), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>])

In [None]:
model.fit(
    x=X_train.values(),
    y=Y_train,
    validation_data=(X_val.values(),Y_val),
    epochs=1,
    batch_size=64
)

UnimplementedError: ignored

In [None]:
|