In [1]:
# https://www.analyticsvidhya.com/blog/2022/04/building-state-of-the-art-text-classifier-using-huggingface-and-tensorflow/
# https://www.kaggle.com/code/satyampd/bert-text-classification-w-keras-huggingface/notebook
# https://huggingface.co/course/chapter3/3?fw=tf

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf



In [3]:
# Load the labelled feedback comments and preview the first rows.
csv_path = '../datasets/text_data.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,code,label,subcategory,feedback,criticality,organization,question,row_index
0,xn,Couldn't be improved,Nothing to improve,Nothing.,3,Trust A,Trust A - Q1,0
1,ee,Environment/ facilities,Environment/ facilities,Temperature in theatre a little low.,-1,Trust A,Trust A - Q1,1
2,ap,Access,Provision of services,Same service available at Bingham Health Centre.,-2,Trust A,Trust A - Q1,2
3,mi,Communication,Amount/clarity of information,Appointment details given over phone - no phys...,-1,Trust A,Trust A - Q1,3
4,mm,Communication,Communication,On one occasion I was not made aware that my a...,-3,Trust A,Trust A - Q1,4


In [4]:
# (rows, columns) of the raw frame, before any rows are dropped.
data.shape

(10334, 8)

In [5]:
# Criticality scores (stored as strings) grouped by the sentiment category
# they collapse into: 0 = lowest scores (-5, -4) ... 4 = highest scores (4, 5).
_score_groups = {
    0: (-5, -4),
    1: (-3, -2, -1),
    2: (0,),
    3: (1, 2, 3),
    4: (4, 5),
}

# Flatten to the lookup used by Series.map: {'-5': 0, ..., '5': 4}.
values = {
    str(score): category
    for category, scores in _score_groups.items()
    for score in scores
}

In [6]:
# Collapse each criticality score into its sentiment category via `values`;
# scores with no entry in the mapping come out as NaN.
mapped_scores = data['criticality'].map(values)
data['sentiment_categories'] = mapped_scores

# Class balance of the mapped labels.
data['sentiment_categories'].value_counts()

3.0    5025
2.0    2566
1.0    1630
4.0     882
0.0     195
Name: sentiment_categories, dtype: int64

In [7]:
# Keep only rows that have both a feedback text and a mapped sentiment class.
model_columns = ['feedback', 'sentiment_categories']
text_data = data.loc[:, model_columns].dropna()

# Predictor frame (single 'predictor' column) and label vector.
x = pd.DataFrame({'predictor': text_data['feedback']})
y = text_data['sentiment_categories'].to_numpy()

In [8]:

# x_train, x_test, y_train, y_test, index_training_data, index_test_data  = train_test_split(x, y, pd.DataFrame(x).index,
#                                                                                              test_size=0.3,
#                                                                                              stratify=y,
#                                                                                              shuffle=True
#                                                                                              )

In [9]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Checkpoint shared by the tokenizer and the classification model.
model_ckpt = "bert-base-uncased"

# Truncation and max_length are per-call encoding options, so they are given
# where the text is actually encoded rather than when the tokenizer is loaded
# (the previous load-time max_length=100 also disagreed with the 128 used for
# the real encoding).
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_ckpt)

In [10]:
from tqdm import tqdm

# Fixed length of every encoded comment: shorter texts are padded up to it,
# longer ones are truncated down to it.
MAX_LEN = 128


def tokenize_function(example, max_length=100, padding=False, **tokenizer_kwargs):
    """Encode ``example`` with the module-level ``tokenizer``, truncating long inputs.

    Args:
        example: Text to encode (anything the tokenizer call accepts).
        max_length: Truncation length in tokens. Defaults to 100, the value
            this helper has always used.
        padding: Padding strategy forwarded to the tokenizer. The default
            ``False`` (no padding) matches the helper's previous behaviour.
        **tokenizer_kwargs: Extra keyword arguments forwarded to the tokenizer.

    Returns:
        Whatever the tokenizer call returns for ``example``.
    """
    return tokenizer(
        example,
        truncation=True,
        max_length=max_length,
        padding=padding,
        **tokenizer_kwargs,
    )


# Encode every comment through the single helper above (instead of a second,
# deprecated `encode_plus` code path with its own length), padded/truncated to
# MAX_LEN so the results stack into rectangular arrays.
encodings = [
    tokenize_function(
        sent,
        max_length=MAX_LEN,
        padding='max_length',
        add_special_tokens=True,
        return_attention_mask=True,
    )
    for sent in tqdm(x['predictor'])
]

input_ids = np.asarray([enc['input_ids'] for enc in encodings])
attention_masks = np.asarray([enc['attention_mask'] for enc in encodings])

# One-hot labels as float32. The dtype is forced because pd.get_dummies returns
# bool columns on pandas >= 2.0 and integer columns on older versions.
target = np.asarray(pd.get_dummies(y), dtype=np.float32)


100%|███████████████████████████████████████████████████████████████████████████| 10297/10297 [00:01<00:00, 6300.71it/s]


In [11]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split (and everything trained on it) is reproducible
# after a kernel restart.
SEED = 42

# The classes are heavily imbalanced, so stratify on the class index recovered
# from the one-hot rows to keep the same class proportions in both partitions.
class_ids = target.argmax(axis=1)

X_train, X_test, y_train, y_test, train_mask, test_mask = train_test_split(
    input_ids,
    target,
    attention_masks,
    test_size=0.2,
    random_state=SEED,
    stratify=class_ids,
)

In [12]:
from transformers import TFAutoModelForSequenceClassification

# Load the pretrained encoder with a classification head that emits five
# logits, one per sentiment category.
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=5,
)

# Print the layer / parameter overview.
model.summary()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  3845      
                                                                 
Total params: 109,486,085
Trainable params: 109,486,085
Non-trainable params: 0
_________________________________________________________________


In [13]:
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.callbacks import EarlyStopping


# These three values mirror the arguments given to model.fit(); they are needed
# here to size the learning-rate schedule and must be kept in sync with that call.
num_epochs = 5
batch_size = 32
validation_split = 0.2

# The schedule is advanced once per optimizer step, i.e. once per *batch*, not
# once per sample. Keras holds out the last `validation_split` fraction of the
# arrays before batching, so only the remaining samples produce training steps.
num_fit_samples = int(len(X_train) * (1.0 - validation_split))
steps_per_epoch = int(np.ceil(num_fit_samples / batch_size))
num_train_steps = steps_per_epoch * num_epochs

# Linear decay (PolynomialDecay's default power is 1) from 5e-5 to 0 over the
# whole training run.
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)
opt = Adam(learning_rate=lr_scheduler)

# Stop after one epoch without improvement in the monitored metric (val_loss by
# default) and roll back to the best weights seen.
es = EarlyStopping(patience = 1, restore_best_weights=True)

# The model outputs raw logits and the labels are one-hot, hence
# CategoricalCrossentropy(from_logits=True).
model.compile(
    optimizer=opt,
    loss=CategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"]
)


In [14]:
# Fine-tune on [token ids, attention masks]; the last 20% of the training
# arrays is held out as validation data, which early stopping monitors.
history = model.fit(
    [X_train, train_mask],
    y_train,
    batch_size=32,
    epochs=5,
    validation_split=0.2,
    callbacks=[es],
    verbose=1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5


In [15]:
# Persist the fine-tuned model to a directory relative to the notebook.
export_dir = 'BERT_5_level_sentiment'
model.save(export_dir)



INFO:tensorflow:Assets written to: BERT_5_level_sentiment/assets


INFO:tensorflow:Assets written to: BERT_5_level_sentiment/assets


## No good. Should have used `tf.data.Dataset` to create my dataset (22/11/22).