In [None]:
# bert_variant = 'bert-base-chinese'
bert_variant = 'hfl/chinese-roberta-wwm-ext'
freeze_bert = False

In [9]:
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'


import torch
import numpy as np
import pandas as pd
import tensorflow as tf
from tabulate import tabulate
from keras.optimizers import Adam
from keras import Model, Sequential
from keras.callbacks import Callback
from keras.layers import Dense, Dropout
from transformers import TFBertModel, BertTokenizer
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score
from keras.losses import SparseCategoricalCrossentropy


def seed_everything(seed=42):
    np.random.seed(seed)
    torch.manual_seed(seed)
    tf.random.set_seed(seed)


seed_everything(seed=42)

# Load training and testing dataframes
train_df = pd.read_excel('train.xlsx')
train_df.columns = ['text', 'class']

test_df = pd.read_excel('test.xlsx')
test_df.columns = ['text', 'class']

# Split training set into 5 folds, stratified by class (only fold 0 is used in this notebook)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_df['fold'] = -1
for i, (_, test_index) in enumerate(skf.split(train_df, train_df['class'])):
    train_df.iloc[test_index, -1] = i
table = pd.DataFrame(
    {
        f'fold{i}': train_df[train_df['fold'] == i].groupby('class').size()
        for i in range(5)
    }
    | {'test': test_df.groupby('class').size()}
)
print(tabulate(table, headers='keys'))

# Show random samples from the training set
print(train_df['text'][train_df['class'] == 0].sample(5).values)
print(train_df['text'][train_df['class'] == 1].sample(5).values)

  class    fold0    fold1    fold2    fold3    fold4    test
-------  -------  -------  -------  -------  -------  ------
      0      160      160      160      160      159     199
      1      160      160      160      160      160     202
['高考理科录取人数最多20个主流大众专业解析' '中央民族大学2009年普通本科招生章程' '研究生考试迟到15分钟不得入场'
 '广州中考体育12日报名考生签名确认成绩' '09成考时间17日至18日 11月上旬划定分数线']
['教育部就中小学教师队伍补充等有关工作答问' '安理会半数成员反对巴勒斯坦加入联合国' '美媒称国际刑事法庭决定逮捕苏丹领导人'
 '阿西：无效的G20并非是坏事情' '查韦斯抨击美国解决洪都拉斯危机不力']


In [10]:
max_length = 128

tokenizer: BertTokenizer = BertTokenizer.from_pretrained(bert_variant)
train_tokens = tokenizer(
    train_df['text'][train_df['fold'] != 0].tolist(),
    return_tensors='tf',
    truncation=True,
    padding=True,
    max_length=max_length,
)
val_tokens = tokenizer(
    train_df['text'][train_df['fold'] == 0].tolist(),
    return_tensors='tf',
    truncation=True,
    padding=True,
    max_length=max_length,
)
test_tokens = tokenizer(
    test_df['text'].tolist(),
    return_tensors='tf',
    truncation=True,
    padding=True,
    max_length=max_length,
)

train_labels = train_df['class'][train_df['fold'] != 0].values.astype(np.int32)
val_labels = train_df['class'][train_df['fold'] == 0].values.astype(np.int32)
test_labels = test_df['class'].values.astype(np.int32)

train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_tokens), train_labels))
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_tokens), val_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_tokens), test_labels))

In [11]:
lr = 2e-5
dropout = 0.1

num_epochs = 3
batch_size = 32


class TextClassifier(Model):
    def __init__(self, num_classes, mlp_ratio=2, dropout_rate=0.1, freeze_bert=False):
        super().__init__()
        self.bert = TFBertModel.from_pretrained(bert_variant, name='bert')
        self.bert.trainable = not freeze_bert
        self.dropout = Dropout(dropout_rate, name='dropout')
        embedding_size = self.bert.config.hidden_size
        self.head = Sequential(
            [
                Dense(embedding_size * mlp_ratio, activation='silu', name='hidden'),
                Dropout(dropout_rate, name='dropout'),
                Dense(num_classes, activation='softmax', name='output'),
            ],
            name='head',
        )

    def call(self, inputs):
        pooler_output = self.bert(inputs).pooler_output
        pooler_output = self.dropout(pooler_output)
        return self.head(pooler_output)


class EvalCallback(Callback):
    def __init__(
        self, dataset, labels, prefix='val', output_file=f'{freeze_bert=}.csv'
    ):
        super().__init__()
        self.results = []
        self.dataset = dataset
        self.labels = labels
        self.prefix = prefix
        self.best_acc = 0
        self.output_file = output_file
        self.model: TextClassifier

    def on_epoch_begin(self, epoch, logs=None):
        print(f'Epoch {epoch + 1}/{num_epochs}')

    def on_train_batch_end(self, batch, logs=None):
        loss = logs['loss']
        print(f'\rStep {batch + 1} - loss: {loss:.4f}', end='')
        if (batch + 1) % 10 == 0:
            y_pred = self.model.predict(self.dataset, verbose=0).argmax(axis=-1)
            acc = accuracy_score(self.labels, y_pred)
            f1 = f1_score(self.labels, y_pred, average='macro')
            self.results.append([loss, acc, f1])
            print(f' - {self.prefix}_accuracy: {acc:.4f} - {self.prefix}_f1: {f1:.4f}')
            if acc > self.best_acc:
                self.best_acc = acc
                self.model.save_weights(f'{freeze_bert=}.h5')

    def on_epoch_end(self, epoch, logs=None):
        pd.DataFrame(
            self.results,
            columns=[
                f'{self.prefix}_loss',
                f'{self.prefix}_accuracy',
                f'{self.prefix}_f1',
            ],
        ).to_csv(self.output_file, index=False)


seed_everything(seed=42)

model = TextClassifier(num_classes=2, dropout_rate=dropout, freeze_bert=freeze_bert)
model(train_tokens[:1])
model.summary(expand_nested=True)

model.compile(optimizer=Adam(learning_rate=lr), loss=SparseCategoricalCrossentropy())
model.fit(
    train_dataset.shuffle(1000).batch(batch_size),
    epochs=num_epochs,
    callbacks=[EvalCallback(val_dataset.batch(batch_size), val_labels)],
    verbose=0,
)

All model checkpoint layers were used when initializing TFBertModel.

All the layers of TFBertModel were initialized from the model checkpoint at hfl/chinese-roberta-wwm-ext.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "text_classifier_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertModel)          multiple                  102267648 
|¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯|
| bert (TFBertMainLayer)     multiple                  102267648|
¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
 dropout (Dropout)           multiple                  0         
                                                                 
 head (Sequential)           (1, 2)                    1184258   
|¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯|
| hidden (Dense)             (1, 1536)                 1181184  |
|                                                               |
| dropout (Dropout)          (1, 1536)                 0        |
|                                                               |
| output (Dense)             (1, 2)              



Epoch 1/3
Step 10 - loss: 0.5507 - val_accuracy: 0.9437 - val_f1: 0.9437
Step 20 - loss: 0.3595 - val_accuracy: 0.9656 - val_f1: 0.9656
Step 30 - loss: 0.2615 - val_accuracy: 0.9719 - val_f1: 0.9719
Step 40 - loss: 0.2044 - val_accuracy: 0.9781 - val_f1: 0.9781
Epoch 2/3
Step 10 - loss: 0.0292 - val_accuracy: 0.9719 - val_f1: 0.9719
Step 20 - loss: 0.0329 - val_accuracy: 0.9719 - val_f1: 0.9719
Step 30 - loss: 0.0315 - val_accuracy: 0.9719 - val_f1: 0.9719
Step 40 - loss: 0.0433 - val_accuracy: 0.9781 - val_f1: 0.9781
Epoch 3/3
Step 10 - loss: 0.0220 - val_accuracy: 0.9719 - val_f1: 0.9719
Step 20 - loss: 0.0251 - val_accuracy: 0.9750 - val_f1: 0.9750
Step 30 - loss: 0.0204 - val_accuracy: 0.9750 - val_f1: 0.9750
Step 40 - loss: 0.0170 - val_accuracy: 0.9719 - val_f1: 0.9719


<keras.src.callbacks.History at 0x2980fc550>

In [12]:
model.load_weights(f'{freeze_bert=}.h5')
y_pred = model.predict(test_dataset.batch(batch_size)).argmax(axis=-1)
print(f'Test accuracy: {accuracy_score(test_labels, y_pred):.4f}')
print(f'Test f1: {f1_score(test_labels, y_pred, average="macro"):.4f}')

Test accuracy: 0.9875
Test f1: 0.9875
