# PTT gossip classification

這章節我們使用中文預訓練模型`bert-base-chinese`來進行`finetune`。

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds
import numpy as np
import pandas as pd
import os
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.model_selection import train_test_split
from transformers import *

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [2]:
model = TFBertForSequenceClassification.from_pretrained('bert-base-chinese')

loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--bert-base-chinese/snapshots/8d2a91f91cc38c96bb8b4556ba70c392f8d5ee55/config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.30.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

loading weights file model.safetensors from c

In [3]:
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

loading file vocab.txt from cache at /home/jovyan/.cache/huggingface/hub/models--bert-base-chinese/snapshots/8d2a91f91cc38c96bb8b4556ba70c392f8d5ee55/vocab.txt
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /home/jovyan/.cache/huggingface/hub/models--bert-base-chinese/snapshots/8d2a91f91cc38c96bb8b4556ba70c392f8d5ee55/tokenizer_config.json
loading configuration file config.json from cache at /home/jovyan/.cache/huggingface/hub/models--bert-base-chinese/snapshots/8d2a91f91cc38c96bb8b4556ba70c392f8d5ee55/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,


### Data overview

我們使用從ptt八卦版進行爬蟲整理，$0$表示該留言的推數小於噓數，$1$表示該留言的推數大於噓數，所以這個任務是屬於`Text classification`任務(二元分類)。

In [None]:
# 上傳資料
!wget -q https://github.com/TA-aiacademy/course_3.0/releases/download/v2.5_nlp/NLP_part5.zip
!unzip -q NLP_part5.zip

In [4]:
ptt = pd.read_csv('Data/ptt_gossip.csv')

bert_max_length = 512
ptt['sentence'] = [t[:bert_max_length] for t in ptt.sentence]

In [5]:
ptt.head()

Unnamed: 0,idx,sentence,label
0,0,反核人士最愛靠妖 核電廠蓋你家好不好，我當然說好 核廢料放我家好不好，我也ok 放在地下室就...,1
1,1,如標題， 今天去逛才看到的，如下圖所示: 位置在西屯區漢口路二段118號。 少了一個可以看...,1
2,2,新聞來源 2025非核家園，燃煤電廠30%、再生能源(綠能)20%、 天然氣發電50%的能...,1
3,3,牽了一台新的摩托車 車行老闆跟我說記得汽油要加95 還附帶 我開車行幾十年了 聽我的準沒錯 ...,1
4,4,各位30cm大大、F cup的水水，打給後 胎嘎後 本魯邊緣人，平日臉書沒朋友近日更4鬼怪肆...,1


In [6]:
"""
訓練集80%，測試集20%
"""
train_size = 0.8

mask = np.random.rand(len(ptt)) < train_size
train_dataset = ptt[mask]
valid_dataset = ptt[~mask]

In [7]:
train_size = len(train_dataset)
valid_size = len(valid_dataset)

In [8]:
print('Train size: ', train_size)
print('Valid size: ', valid_size)

Train size:  5616
Valid size:  1459


### Convert to tensor

各種`Transformer`預訓練都支持`tf.tensor`輸入格式，需要將資料集轉為`tf.tensor`格式。

In [9]:
train_dataset = tf.data.Dataset.from_tensor_slices(dict(train_dataset))
valid_dataset = tf.data.Dataset.from_tensor_slices(dict(valid_dataset))

### Traing data format

使用`glue_convert_examples_to_features`將資料集轉為模型可讀取格式，因為是二元分類，所以我們使用的任務為`cola`，`cola`是`bert`在`finetune`時的任務之一，一樣是二元分類任務，我們可以套用他的輸入格式來進行轉換，而在中文部分目前的預訓練模型都是用`chararcter-level`進行斷詞，所以我們將`max_length`提高至$256$，下表為在`Titan X 12G`上`finetune`的參數限制，表示模型以及多少句子長度對應其最大的`batch_size`，需要注意其硬體限制，而`1080ti`為`11G`，可以使用句子長度`256`搭配`batch_size`為16。

<img src="Slides_image/4.png" alt="Drawing" style="width: 250px;"/>

In [10]:
max_length = 512
task = 'cola'

train_dataset = glue_convert_examples_to_features(train_dataset,
                                                  tokenizer,
                                                  max_length,
                                                  task)
valid_dataset = glue_convert_examples_to_features(valid_dataset,
                                                  tokenizer,
                                                  max_length,
                                                  task)

Using label list ['0', '1'] for task cola
Using output mode classification for task cola
*** Example ***
guid: 0
features: InputFeatures(input_ids=[101, 1353, 3417, 782, 1894, 3297, 2695, 7479, 1973, 3417, 7442, 2449, 5901, 872, 2157, 1962, 679, 1962, 8024, 2769, 4534, 4197, 6303, 1962, 3417, 2450, 3160, 3123, 2769, 2157, 1962, 679, 1962, 8024, 2769, 738, 8270, 3123, 1762, 1765, 678, 2147, 2218, 6121, 749, 6929, 1353, 6882, 889, 8024, 4125, 1213, 4634, 7442, 4240, 2130, 4638, 3942, 3942, 738, 3221, 3300, 2523, 1914, 3123, 2198, 2595, 4289, 6549, 3123, 872, 2157, 6206, 1621, 4412, 1762, 2218, 3021, 1343, 7983, 759, 857, 6206, 1621, 6206, 1343, 6275, 4692, 4692, 136, 2865, 1266, 2769, 1377, 809, 1343, 3417, 676, 2449, 7621, 5439, 6291, 6352, 671, 763, 5439, 7531, 1762, 6174, 7481, 6882, 4272, 4272, 1059, 6956, 7442, 5582, 4675, 2971, 8024, 3760, 752, 976, 2994, 4234, 3160, 3472, 1348, 679, 3221, 782, 1343, 2994, 4125, 1213, 4634, 7442, 4240, 7028, 3779, 4240, 4209, 872, 6206, 1343, 2521,

In [11]:
train_temp = next(iter(train_dataset))

In [12]:
train_temp

({'input_ids': <tf.Tensor: shape=(512,), dtype=int32, numpy=
  array([ 101, 1353, 3417,  782, 1894, 3297, 2695, 7479, 1973, 3417, 7442,
         2449, 5901,  872, 2157, 1962,  679, 1962, 8024, 2769, 4534, 4197,
         6303, 1962, 3417, 2450, 3160, 3123, 2769, 2157, 1962,  679, 1962,
         8024, 2769,  738, 8270, 3123, 1762, 1765,  678, 2147, 2218, 6121,
          749, 6929, 1353, 6882,  889, 8024, 4125, 1213, 4634, 7442, 4240,
         2130, 4638, 3942, 3942,  738, 3221, 3300, 2523, 1914, 3123, 2198,
         2595, 4289, 6549, 3123,  872, 2157, 6206, 1621, 4412, 1762, 2218,
         3021, 1343, 7983,  759,  857, 6206, 1621, 6206, 1343, 6275, 4692,
         4692,  136, 2865, 1266, 2769, 1377,  809, 1343, 3417,  676, 2449,
         7621, 5439, 6291, 6352,  671,  763, 5439, 7531, 1762, 6174, 7481,
         6882, 4272, 4272, 1059, 6956, 7442, 5582, 4675, 2971, 8024, 3760,
          752,  976, 2994, 4234, 3160, 3472, 1348,  679, 3221,  782, 1343,
         2994, 4125, 1213, 4634, 7442, 

In [13]:
buffer_size = 100
train_bz = 6
epochs = 3
valid_bz = 6

train_gen = train_dataset.shuffle(buffer_size).batch(train_bz).repeat(epochs)
valid_gen = valid_dataset.batch(valid_bz)

In [14]:
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5,
                                     epsilon=1e-8,
                                     clipnorm=1.0)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                     reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

In [15]:
history = model.fit(train_gen,
                    epochs=epochs,
                    steps_per_epoch=train_size//train_bz, 
                    validation_data=valid_gen,
                    validation_steps=valid_size//valid_bz)

Epoch 1/3
Epoch 2/3
Epoch 3/3


## Save model

In [16]:
save_path = 'save_chinese'
if not os.path.exists(save_path):
    os.mkdir(save_path)

In [17]:
model.save_pretrained('./save_chinese/')

Configuration saved in ./save_chinese/config.json
Model weights saved in ./save_chinese/tf_model.h5


## Evaluation

畫出`precision`, `recall`, `f1-score`以及`confusion matrix`評估模型表現。

In [18]:
valid_pred = model.predict(valid_gen)
valid_pred_ids = np.argmax(valid_pred.logits, axis=-1)

In [19]:
valid_label = list()
for x in valid_dataset:
    valid_label += [x[1].numpy()]

In [20]:
print(classification_report(y_pred=valid_pred_ids, y_true=valid_label))

              precision    recall  f1-score   support

           0       0.47      0.47      0.47        70
           1       0.97      0.97      0.97      1389

    accuracy                           0.95      1459
   macro avg       0.72      0.72      0.72      1459
weighted avg       0.95      0.95      0.95      1459



In [21]:
confm = confusion_matrix(y_pred=valid_pred_ids, y_true=valid_label)

index = ['Actual_0', 'Actual_1']
columns = ['Pred_0', 'Pred_1']
pd.DataFrame(confm, index=index, columns=columns)

Unnamed: 0,Pred_0,Pred_1
Actual_0,33,37
Actual_1,37,1352


## Load model and predict

In [22]:
new_model = TFBertForSequenceClassification.from_pretrained('save_chinese/')
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')

loading configuration file save_chinese/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.30.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

loading weights file save_chinese/tf_model.h5
Some layers from the model checkpoint at save_chinese/ were not used wh

In [23]:
sentence = ["文瑋助教好壯"]

test_dataset = pd.DataFrame(dict(idx=list(range(len(sentence))),
                                 label=[0]*len(sentence),
                                 sentence=sentence))

In [24]:
test_dataset

Unnamed: 0,idx,label,sentence
0,0,0,文瑋助教好壯


In [25]:
test_gen = tf.data.Dataset.from_tensor_slices(dict(test_dataset))

In [26]:
max_length = 512
task = 'cola'
test_gen = glue_convert_examples_to_features(test_gen, tokenizer, max_length, task)

Using label list ['0', '1'] for task cola
Using output mode classification for task cola
*** Example ***
guid: 0
features: InputFeatures(input_ids=[101, 3152, 4441, 1221, 3136, 1962, 1897, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [27]:
test_gen = test_gen.batch(1)

In [28]:
next(iter(test_gen))

({'input_ids': <tf.Tensor: shape=(1, 512), dtype=int32, numpy=
  array([[ 101, 3152, 4441, 1221, 3136, 1962, 1897,  102,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
             0,    0,    

In [29]:
pred = new_model.predict(test_gen)

In [30]:
pred_ids = np.argmax(pred.logits, axis=-1)

In [31]:
print(pred_ids[0])

1
