<a href="https://colab.research.google.com/github/Srvand/NLP_Transformers/blob/main/Bert_SequenceClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [83]:
!pip install transformers



In [98]:
import numpy as np
import pandas as pd

import tensorflow as tf
import transformers as trfs

from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split

In [85]:
df = pd.read_csv('data.csv').drop(['Unnamed: 0'],axis=1)

In [86]:
df.shape

(13083, 2)

In [20]:
df.text.str.len().max()

433

In [None]:
df['category_label']=pd.Categorical(df['category'])
df['category']=df['category_label'].cat.codes

In [87]:
df['category'].nunique()

77

In [89]:
# Max length of encoded string(including special tokens such as [CLS] and [SEP]):
MAX_SEQUENCE_LENGTH = 512

# Standard BERT model with lowercase chars only:
PRETRAINED_MODEL_NAME = 'bert-base-uncased' 

# Batch size for fitting:
BATCH_SIZE = 64

# Number of epochs:
EPOCHS=5

num_labels=df['category'].nunique()

bert_model = trfs.TFBertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME)
tokenizer = trfs.BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [90]:
input_ids = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='attention_mask')
outp = bert_model([input_ids, attention_mask])[0]
output = tf.keras.layers.Dense(num_labels, activation='softmax')(outp)
model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=outp)

In [91]:
opt = tf.keras.optimizers.Adam(learning_rate=3e-5)
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [None]:
# def create_model(max_sequence, model_name, num_labels):
#     bert_model = trfs.TFBertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
#     tokenizer = trfs.BertTokenizer.from_pretrained(model_name)
    
#     # This is the input for the tokens themselves(words from the dataset after encoding):
#     input_ids = tf.keras.layers.Input(shape=(max_sequence,), dtype=tf.int32, name='input_ids')

#     # attention_mask - is a binary mask which tells BERT which tokens to attend and which not to attend.
#     # Encoder will add the 0 tokens to the some sequence which smaller than MAX_SEQUENCE_LENGTH, 
#     # and attention_mask, in this case, tells BERT where is the token from the original data and where is 0 pad token:
#     attention_mask = tf.keras.layers.Input((max_sequence,), dtype=tf.int32, name='attention_mask')
    
#     # Use previous inputs as BERT inputs:
#     outp = bert_model([input_ids, attention_mask])[0]

#     # We can also add dropout as regularization technique:
#     # output = tf.keras.layers.Dropout(rate=0.15)(output)

#     # Provide number of classes to the final layer:
#     output = tf.keras.layers.Dense(num_labels, activation='softmax')(outp)

#     # Final model:
#     model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=outp)
#     return model

In [None]:
# model = create_model(MAX_SEQUENCE_LENGTH, PRETRAINED_MODEL_NAME, df.category.nunique())

# opt = tf.keras.optimizers.Adam(learning_rate=3e-5)
# model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [92]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 tf_bert_for_sequence_classific  TFSequenceClassifie  109483778  ['input_ids[0][0]',              
 ation_4 (TFBertForSequenceClas  rOutput(loss=None,               'attention_mask[0][0]']         
 sification)                    logits=(None, 2),                                                 
                                 hidden_states=None                                         

In [105]:
X_train,X_val,Y_train,Y_val = train_test_split(df.text, df.category, test_size=0.2)

In [106]:
print(X_train.shape,X_val.shape,Y_train.shape,Y_val.shape)

(10466,) (2617,) (10466,) (2617,)


In [107]:
def encode(X):
    return tokenizer(
    X.to_list(),
    max_length=MAX_SEQUENCE_LENGTH, # set the length of the sequences
    add_special_tokens=True, # add [CLS] and [SEP] tokens
    return_attention_mask=True,
    return_token_type_ids=False, # not needed for this type of ML task
    pad_to_max_length=True, # add 0 pad tokens to the sequences less than max_length
    return_tensors='tf'
    )

In [108]:
encode(X_train)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


{'input_ids': <tf.Tensor: shape=(10466, 512), dtype=int32, numpy=
array([[ 101, 1045, 1005, ...,    0,    0,    0],
       [ 101, 1045, 2031, ...,    0,    0,    0],
       [ 101, 2673, 2001, ...,    0,    0,    0],
       ...,
       [ 101, 1045, 2001, ...,    0,    0,    0],
       [ 101, 2026, 3042, ...,    0,    0,    0],
       [ 101, 2079, 2017, ...,    0,    0,    0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(10466, 512), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

In [109]:
def batch_encode(X,tokenizer):
    return tokenizer.batch_encode_plus(
    X,
    max_length=MAX_SEQUENCE_LENGTH, # set the length of the sequences
    add_special_tokens=True, # add [CLS] and [SEP] tokens
    return_attention_mask=True,
    return_token_type_ids=False, # not needed for this type of ML task
    pad_to_max_length=True, # add 0 pad tokens to the sequences less than max_length
    return_tensors='tf'
)

In [111]:
batch_encode(X_train,tokenizer)



{'input_ids': <tf.Tensor: shape=(10466, 512), dtype=int32, numpy=
array([[ 101, 1045, 1005, ...,    0,    0,    0],
       [ 101, 1045, 2031, ...,    0,    0,    0],
       [ 101, 2673, 2001, ...,    0,    0,    0],
       ...,
       [ 101, 1045, 2001, ...,    0,    0,    0],
       [ 101, 2026, 3042, ...,    0,    0,    0],
       [ 101, 2079, 2017, ...,    0,    0,    0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(10466, 512), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

In [63]:
s1="This is a sample sentence"
s2="This is second one"
s=[["This is a sample sentence"],["This is second one"]]

In [66]:
print(s)

[['This is a sample sentence'], ['This is second one']]


In [77]:

t=tokenizer(s1,s2)
print(t)
print(t["input_ids"])
decoded = tokenizer.decode(t["input_ids"])
print(decoded)

{'input_ids': [101, 2023, 2003, 1037, 7099, 6251, 102, 2023, 2003, 2117, 2028, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[101, 2023, 2003, 1037, 7099, 6251, 102, 2023, 2003, 2117, 2028, 102]
[CLS] this is a sample sentence [SEP] this is second one [SEP]


In [56]:
t=tokenizer.encode(s1,s2)
print(t)

[101, 2023, 2003, 1037, 7099, 6251, 102, 2023, 2003, 2117, 2028, 102]


In [71]:
t=tokenizer.encode_plus(s1)
print(t)
decoded = tokenizer.decode(t["input_ids"])
print(decoded)

{'input_ids': [101, 2023, 2003, 1037, 7099, 6251, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
[CLS] this is a sample sentence [SEP]


In [69]:

t=tokenizer.batch_encode_plus(['This is a sample sentence'], ['This is second one'])
print(t)

{'input_ids': [[101, 2023, 2003, 1037, 7099, 6251, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}


In [112]:
X_train = batch_encode(X_train,tokenizer)
X_val = batch_encode(X_val,tokenizer)



In [114]:
X_train

{'input_ids': <tf.Tensor: shape=(10466, 512), dtype=int32, numpy=
array([[ 101, 1045, 1005, ...,    0,    0,    0],
       [ 101, 1045, 2031, ...,    0,    0,    0],
       [ 101, 2673, 2001, ...,    0,    0,    0],
       ...,
       [ 101, 1045, 2001, ...,    0,    0,    0],
       [ 101, 2026, 3042, ...,    0,    0,    0],
       [ 101, 2079, 2017, ...,    0,    0,    0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(10466, 512), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

In [None]:
model.fit(
    x=X_train.values(),
    y=Y_train,
    validation_data=(X_val.values(),Y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE
)

Epoch 1/5
Epoch 2/5
 33/164 [=====>........................] - ETA: 2:03:28 - loss: 2.2445 - accuracy: 0.6501

In [None]:
|