In [1]:
!pip install transformers



In [2]:
import numpy as np
import pandas as pd

import sklearn.model_selection as ms
import sklearn.preprocessing as p

import tensorflow as tf
import transformers as trfs

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [16]:
# Max length of encoded string(including special tokens such as [CLS] and [SEP]):
MAX_SEQUENCE_LENGTH = 100 

# Standard BERT model with lowercase chars only:
PRETRAINED_MODEL_NAME = 'bert-base-uncased' 

# Batch size for fitting:
BATCH_SIZE = 64

# Number of epochs:
EPOCHS=5

In [4]:
df = pd.read_csv('data.csv').drop(['Unnamed: 0'],axis=1)

In [5]:
df.shape

(13083, 2)

In [6]:
df['category_label']=pd.Categorical(df['category'])
df['category']=df['category_label'].cat.codes

In [7]:
df['category'].nunique()

77

In [8]:
def create_model(max_sequence, model_name, num_labels):
    bert_model = trfs.TFBertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    
    # This is the input for the tokens themselves(words from the dataset after encoding):
    input_ids = tf.keras.layers.Input(shape=(max_sequence,), dtype=tf.int32, name='input_ids')

    # attention_mask - is a binary mask which tells BERT which tokens to attend and which not to attend.
    # Encoder will add the 0 tokens to the some sequence which smaller than MAX_SEQUENCE_LENGTH, 
    # and attention_mask, in this case, tells BERT where is the token from the original data and where is 0 pad token:
    attention_mask = tf.keras.layers.Input((max_sequence,), dtype=tf.int32, name='attention_mask')
    
    # Use previous inputs as BERT inputs:
    output = bert_model([input_ids, attention_mask])[0]

    # We can also add dropout as regularization technique:
    # output = tf.keras.layers.Dropout(rate=0.15)(output)

    # Provide number of classes to the final layer:
    output = tf.keras.layers.Dense(num_labels, activation='softmax')(output)

    # Final model:
    model = tf.keras.models.Model(inputs=[input_ids, attention_mask], outputs=output)
    return model

In [9]:
model = create_model(MAX_SEQUENCE_LENGTH, PRETRAINED_MODEL_NAME, df.category.nunique())

opt = tf.keras.optimizers.Adam(learning_rate=3e-5)
model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 100)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 100)]        0           []                               
                                                                                                  
 tf_bert_for_sequence_classific  TFSequenceClassifie  109541453  ['input_ids[0][0]',              
 ation (TFBertForSequenceClassi  rOutput(loss=None,               'attention_mask[0][0]']         
 fication)                      logits=(None, 77),                                                
                                 hidden_states=None                                           

In [11]:
def batch_encode(X, tokenizer):
    return tokenizer.batch_encode_plus(
    X,
    max_length=MAX_SEQUENCE_LENGTH, # set the length of the sequences
    add_special_tokens=True, # add [CLS] and [SEP] tokens
    return_attention_mask=True,
    return_token_type_ids=False, # not needed for this type of ML task
    pad_to_max_length=True, # add 0 pad tokens to the sequences less than max_length
    return_tensors='tf'
)

In [12]:
tokenizer = trfs.BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

In [13]:
X_train,X_val,Y_train,Y_val = ms.train_test_split(df.text.values, df.category.values, test_size=0.2)

In [14]:
print(X_train.shape,X_val.shape,Y_train.shape,Y_val.shape)

(10466,) (2617,) (10466,) (2617,)


In [None]:
Y_val

array([23,  9, 27, ..., 59, 39, 21], dtype=int8)

In [15]:
X_train = batch_encode(X_train,tokenizer)
X_val = batch_encode(X_val,tokenizer)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
model.fit(
    x=X_train.values(),
    y=Y_train,
    validation_data=(X_val.values(),Y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE
)

Epoch 1/5
 22/164 [===>..........................] - ETA: 2:15:56 - loss: 4.3315 - accuracy: 0.0227

In [None]:
|