<a href="https://colab.research.google.com/github/PeterHJY628/tutorial_notebooks/blob/main/Text_Classification_MultiClass_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Installing libraries

In [2]:
! pip -q install transformers

#Dataset
- We are using the Jigsaw toxic data from [Kaggle](https://www.kaggle.com/)
- This is competion provide the souce dataset [Toxic Comment Competition](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)
- If the comment is `toxic` and `obscene`, then for both those headers the value will be `1` and for the others it will be `0`.


Download dataset:

In [1]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
'''
GoogleAuth: 用于处理 Google 认证的类。
GoogleDrive: 用于与 Google Drive API 进行交互的类。
auth: Google Colab 中的认证模块，用于简化认证流程。
GoogleCredentials: 用于获取默认的 Google 认证凭证。
'''

# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

id = '1R3EpofPMq9pK0eCodtVqz_8-c6aCEbpy'
downloaded = drive.CreateFile({'id':id})
downloaded.GetContentFile('train.csv.zip')
!unzip -q train.csv.zip -d data



MessageError: Error: credential propagation was unsuccessful

Dataset Preparation

In [None]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

def update_cat(x):
    return my_dict[x]

def encode_cat(x):
    if x not in encode_dict.keys():
        encode_dict[x]=len(encode_dict)
    return encode_dict[x]

df = pd.read_csv("./data/train.csv")
df['list'] = df[df.columns[2:]].values.tolist()
new_df = df[['comment_text', 'list']].copy()#这行代码的作用是从 df 中提取第 3 列及之后的所有列（df.columns[2:]），并将这些列的值转换为列表形式，然后赋值给新的列 list。
new_df.head()#调用 head() 方法查看 new_df 的前 5 行数据。

Unnamed: 0,comment_text,list
0,Explanation\nWhy the edits made under my usern...,"[0, 0, 0, 0, 0, 0]"
1,D'aww! He matches this background colour I'm s...,"[0, 0, 0, 0, 0, 0]"
2,"Hey man, I'm really not trying to edit war. It...","[0, 0, 0, 0, 0, 0]"
3,"""\nMore\nI can't make any real suggestions on ...","[0, 0, 0, 0, 0, 0]"
4,"You, sir, are my hero. Any chance you remember...","[0, 0, 0, 0, 0, 0]"


Dataloader

In [None]:
from transformers import BertTokenizerBertTokenizer #是在使用 BERT 模型进行自然语言处理任务时不可或缺的一部分。它负责将原始文本数据转换为模型可接受的格式，以便进行训练和推理。
'''
在 NLP 中，tokenization 是将输入文本拆分为更小的单元（通常称为 tokens）的过程。这些 tokens 可以是单词、子词或字符，具体取决于所使用的 tokenizer 和模型。

BertTokenizer
当你使用 BERT 模型时，BertTokenizer 是与模型配套的 tokenizer，它提供了一系列功能，以确保文本能够正确地输入到 BERT 模型中。以下是 BertTokenizer 的主要功能：

标记化：

将文本拆分为 tokens，使用 WordPiece 方法处理未登录词（out-of-vocabulary words）。
添加特殊标记：

在文本的开头添加 [CLS] 标记（表示分类任务）。
在句子之间添加 [SEP] 标记（用于区分句子）。
编码：

将 tokens 转换为相应的 ID。这些 ID 是 BERT 模型词汇表中的索引。
填充和截断：

将文本序列填充到指定的最大长度（max_length），确保所有输入具有相同的长度。
截断超过最大长度的文本，以适应模型输入的限制。
返回多个输出：

返回 input_ids、attention_mask 和 token_type_ids，这些都是模型所需的输入。
'''
class CustomDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        self.targets = self.data.list
        self.max_len = max_len

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self, index):
        comment_text = str(self.comment_text[index])#转换成字符串
        comment_text = " ".join(comment_text.split())#去掉多余空格

        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

MAX_LEN = 200
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_size = 0.8
train_dataset=new_df.sample(frac=train_size,random_state=200)
test_dataset=new_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

FULL Dataset: (159571, 2)
TRAIN Dataset: (127657, 2)
TEST Dataset: (31914, 2)


Architecture of BERT:

In [None]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.l2 = torch.nn.Dropout(0.3)
        self.l3 = torch.nn.Linear(768, 6)

    def forward(self, ids, mask, token_type_ids):
        output_1 = self.l1(ids, attention_mask = mask, token_type_ids = token_type_ids)
        output_2 = self.l2(output_1[1])
        output = self.l3(output_2)
        return output

model = BERTClass()
model.to(device);

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Training Script

In [None]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)


def train(epoch, model, training_loader, optimizer, loss_fn):
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%50==0:
            print(f'Epoch: {epoch}, Step 50\{len(training_loader)}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

def validation(epoch, testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

Training

In [None]:

loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

best_epoch, best_acc = 0.0, 0
for epoch in range(EPOCHS):
    train(epoch, model, training_loader, optimizer, loss_fn)
    outputs, targets = validation(epoch, testing_loader)
    outputs = np.array(outputs) >= 0.5
    acc = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {acc}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

    if acc > best_acc:
        best_acc = acc
        best_epoch = epoch
        torch.save(model.state_dict(), 'best_model_cifar10h.pth.tar')
    print('epoch: {}  acc: {:.4f}  best epoch: {}  best acc: {:.4f}'.format(
            epoch, acc, best_epoch, best_acc, optimizer.param_groups[0]['lr']))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Step 50\3990, Loss:  0.7527999877929688
Epoch: 0, Step 50\3990, Loss:  0.34039562940597534
Epoch: 0, Step 50\3990, Loss:  0.13606807589530945
Epoch: 0, Step 50\3990, Loss:  0.12781018018722534
Epoch: 0, Step 50\3990, Loss:  0.06847912073135376
Accuracy Score = 0.9146455223880597
F1 Score (Micro) = 0.6640625
F1 Score (Macro) = 0.34401975322576045
