<a href="https://colab.research.google.com/github/Srvand/NLP_Transformers/blob/main/Bert_SequenceClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers



In [18]:
import numpy as np
import pandas as pd

import tensorflow as tf
import transformers as trfs

from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split

In [19]:
df = pd.read_csv('data.csv').drop(['Unnamed: 0'],axis=1)

In [20]:
df.shape

(13083, 2)

In [21]:
df.text.str.len().max()

433

In [22]:
df['category_label']=pd.Categorical(df['category'])
df['category']=df['category_label'].cat.codes

In [23]:
df['category'].nunique()

77

In [25]:
# Max length of encoded string(including special tokens such as [CLS] and [SEP]):
MAX_SEQUENCE_LENGTH = 100

# Standard BERT model with lowercase chars only:
PRETRAINED_MODEL_NAME = 'bert-base-uncased' 

# Batch size for fitting:
BATCH_SIZE = 64

# Number of epochs:
EPOCHS=5

num_labels=df['category'].nunique()

bert_model = trfs.TFBertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME,num_labels=num_labels)
tokenizer = trfs.BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
input_ids = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='input_ids')
attention_mask = tf.keras.layers.Input((MAX_SEQUENCE_LENGTH,), dtype=tf.int32, name='attention_mask')
output = bert_model([input_ids, attention_mask])[0]
output = tf.keras.layers.Dense(num_labels, activation='softmax')(output)
model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=output)

In [27]:
opt = tf.keras.optimizers.Adam(learning_rate=3e-5)
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [28]:
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 100)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 100)]        0           []                               
                                                                                                  
 tf_bert_for_sequence_classific  TFSequenceClassifie  109541453  ['input_ids[0][0]',              
 ation_2 (TFBertForSequenceClas  rOutput(loss=None,               'attention_mask[0][0]']         
 sification)                    logits=(None, 77),                                                
                                 hidden_states=None                                         

In [29]:
X_train,X_val,Y_train,Y_val = train_test_split(df.text, df.category, test_size=0.2)

In [30]:
print(X_train.shape,X_val.shape,Y_train.shape,Y_val.shape)

(10466,) (2617,) (10466,) (2617,)


In [107]:
def encode(X):
    return tokenizer(
    X.to_list(),
    max_length=MAX_SEQUENCE_LENGTH, # set the length of the sequences
    add_special_tokens=True, # add [CLS] and [SEP] tokens
    return_attention_mask=True,
    return_token_type_ids=False, # not needed for this type of ML task
    pad_to_max_length=True, # add 0 pad tokens to the sequences less than max_length
    return_tensors='tf'
    )

In [137]:
a=encode(X_train)



In [139]:
print(a['input_ids'])

tf.Tensor(
[[ 101 2129 3522 ...    0    0    0]
 [ 101 1045 4015 ...    0    0    0]
 [ 101 1045 2123 ...    0    0    0]
 ...
 [ 101 2339 2064 ...    0    0    0]
 [ 101 1045 2288 ...    0    0    0]
 [ 101 1045 2342 ...    0    0    0]], shape=(10466, 512), dtype=int32)


In [31]:
def batch_encode(X,tokenizer):
    return tokenizer.batch_encode_plus(
    X,
    max_length=MAX_SEQUENCE_LENGTH, # set the length of the sequences
    add_special_tokens=True, # add [CLS] and [SEP] tokens
    return_attention_mask=True,
    return_token_type_ids=False, # not needed for this type of ML task
    pad_to_max_length=True, # add 0 pad tokens to the sequences less than max_length
    return_tensors='tf'
)

In [162]:
batch_encode(X_val,tokenizer)



{'input_ids': <tf.Tensor: shape=(2617, 100), dtype=int32, numpy=
array([[ 101, 1045, 3641, ...,    0,    0,    0],
       [ 101, 2064, 1045, ...,    0,    0,    0],
       [ 101, 2017, 5763, ...,    0,    0,    0],
       ...,
       [ 101, 2045, 2003, ...,    0,    0,    0],
       [ 101, 1045, 2074, ...,    0,    0,    0],
       [ 101, 1045, 2245, ...,    0,    0,    0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(2617, 100), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

In [63]:
s1="This is a sample sentence"
s2="This is second one"
s=[["This is a sample sentence"],["This is second one"]]

In [66]:
print(s)

[['This is a sample sentence'], ['This is second one']]


In [77]:

t=tokenizer(s1,s2)
print(t)
print(t["input_ids"])
decoded = tokenizer.decode(t["input_ids"])
print(decoded)

{'input_ids': [101, 2023, 2003, 1037, 7099, 6251, 102, 2023, 2003, 2117, 2028, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
[101, 2023, 2003, 1037, 7099, 6251, 102, 2023, 2003, 2117, 2028, 102]
[CLS] this is a sample sentence [SEP] this is second one [SEP]


In [56]:
t=tokenizer.encode(s1,s2)
print(t)

[101, 2023, 2003, 1037, 7099, 6251, 102, 2023, 2003, 2117, 2028, 102]


In [71]:
t=tokenizer.encode_plus(s1)
print(t)
decoded = tokenizer.decode(t["input_ids"])
print(decoded)

{'input_ids': [101, 2023, 2003, 1037, 7099, 6251, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}
[CLS] this is a sample sentence [SEP]


In [69]:

t=tokenizer.batch_encode_plus(['This is a sample sentence'], ['This is second one'])
print(t)

{'input_ids': [[101, 2023, 2003, 1037, 7099, 6251, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1]]}


In [32]:
X_train = batch_encode(X_train,tokenizer)
X_val = batch_encode(X_val,tokenizer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [17]:
X_val.values()

dict_values([<tf.Tensor: shape=(2617, 100), dtype=int32, numpy=
array([[  101,  2079,  2017, ...,     0,     0,     0],
       [  101,  1045,  2363, ...,     0,     0,     0],
       [  101,  2026,  4070, ...,     0,     0,     0],
       ...,
       [  101,  2043,  1045, ...,     0,     0,     0],
       [  101,  1045,  2572, ...,     0,     0,     0],
       [  101, 22286,  2039, ...,     0,     0,     0]], dtype=int32)>, <tf.Tensor: shape=(2617, 100), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>])

In [33]:
model.fit(
    x=X_train.values(),
    y=Y_train,
    validation_data=(X_val.values(),Y_val),
    epochs=1,
    batch_size=64
)

 12/164 [=>............................] - ETA: 2:34:43 - loss: 4.3665 - accuracy: 0.0208

KeyboardInterrupt: ignored

In [None]:
|