# 1.Preparation

https://huggingface.co/docs/transformers/preprocessing

https://leemeng.tw/attack_on_bert_transfer_learning_in_nlp.html

https://clay-atlas.com/blog/2020/06/30/pytorch-%E5%A6%82%E4%BD%95%E4%BD%BF%E7%94%A8-hugging-face-%E6%89%80%E6%8F%90%E4%BE%9B%E7%9A%84-transformers-%E4%BB%A5-bert-%E7%82%BA%E4%BE%8B/

https://skimai.com/fine-tuning-bert-for-sentiment-analysis/

https://www.kaggle.com/code/xhlulu/disaster-nlp-keras-bert-using-tfhub/notebook

https://mccormickml.com/2019/07/22/BERT-fine-tuning/

https://huggingface.co/docs/transformers/main_classes/optimizer_schedules

https://huggingface.co/docs/transformers/main_classes/optimizer_schedules#transformers.get_linear_schedule_with_warmup

https://blog.csdn.net/orangerfun/article/details/120400247

In [1]:
!pip install transformers regex -q
# !pip install transformers tqdm boto3 requests regex -q

[K     |████████████████████████████████| 4.4 MB 28.4 MB/s 
[K     |████████████████████████████████| 596 kB 52.1 MB/s 
[K     |████████████████████████████████| 6.6 MB 10.4 MB/s 
[K     |████████████████████████████████| 101 kB 6.3 MB/s 
[?25h

In [2]:
# Mount Google Drive inside the Colab filesystem at /content/gdrive so the
# project folder stored on Drive can be read and written.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [3]:
# Absolute location of the project folder on the mounted Drive (Drive is
# mounted at /content/gdrive). An absolute path keeps this cell re-runnable:
# a relative "./gdrive/..." path stops resolving once os.chdir() below has
# moved the working directory into the repo.
RepoPATH = "/content/gdrive/MyDrive/Colab Notebooks/DisasterTweets"

import sys
# Make the project's local modules importable; avoid duplicate entries on re-run.
if RepoPATH not in sys.path:
    sys.path.append(RepoPATH)

import os
# Work from inside the repo so the relative data/checkpoint/submission paths resolve.
os.chdir(RepoPATH)

In [4]:
from tqdm import tqdm

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Seed PyTorch's global RNG so that torch-driven randomness (e.g. shuffling in a
# DataLoader created with shuffle=True) is repeatable from run to run.
# NOTE(review): some GPU kernels may still be non-deterministic.
SEED = 1024
torch.manual_seed(SEED)

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
from transformers import AutoTokenizer


# Name of the pretrained checkpoint whose tokenizer is loaded below.
PRETRAINED_MODEL_NAME = "bert-base-cased"

# Load the tokenizer used by this pretrained model
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

# Print the installed PyTorch version (the label text is Chinese for "PyTorch version").
print("PyTorch 版本：", torch.__version__)

# Print the number of entries in the tokenizer vocabulary
# (the label text is Chinese for "vocabulary size").
vocab = tokenizer.vocab
print("字典大小：", len(vocab))

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

PyTorch 版本： 1.12.0+cu113
字典大小： 28996


In [6]:
# from transformers import BertForSequenceClassification
# my_model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME)

In [7]:
import utils
from model import BERTClassifier

# 2.Data

In [40]:
# Load the labelled training set and the unlabelled test set from ./data.
train_df = pd.read_csv("./data/train.csv")
test_df = pd.read_csv("./data/test.csv")

# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split the same on every run.
# Note: `train_df` is rebound here to the remaining 80% training portion.
train_df, val_df = train_test_split(train_df, test_size=0.2, random_state=1024)

In [9]:
# Pull the "text" column (as Python str) and the "target" column (as Python int)
# out of each split as plain lists.
train_text = train_df["text"].to_numpy().astype(str).tolist()
train_target = train_df["target"].to_numpy().astype(int).tolist()

val_text = val_df["text"].to_numpy().astype(str).tolist()
val_target = val_df["target"].to_numpy().astype(int).tolist()

# Only the "text" column is read from the test split.
test_text = test_df["text"].to_numpy().astype(str).tolist()

In [10]:
# Run every text of every split through the project's text_preprocessing helper.
train_text = list(map(utils.text_preprocessing, train_text))
val_text = list(map(utils.text_preprocessing, val_text))
test_text = list(map(utils.text_preprocessing, test_text))

In [11]:
# Sanity check: number of texts in the train / validation / test splits.
len(train_text), len(val_text), len(test_text)

(6090, 1523, 3263)

In [12]:
# Shared tokenizer settings for all three splits. Every sequence is padded to
# exactly 256 tokens; truncation=True additionally cuts longer sequences down to
# 256 so that each split can always be returned as one fixed-size PyTorch tensor.
TOKENIZE_KWARGS = dict(
    max_length=256,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)

train_token_result = tokenizer(train_text, **TOKENIZE_KWARGS)
val_token_result = tokenizer(val_text, **TOKENIZE_KWARGS)
test_token_result = tokenizer(test_text, **TOKENIZE_KWARGS)

In [13]:
# Wrap the tokenizer output (token ids + attention masks) and, where available,
# the labels into the project's TokenDataset.
# Labels are built directly as int64 tensors; going through torch.Tensor(...)
# would first create a float32 tensor and only then cast it to long.
train_dataset = utils.TokenDataset(
    token_tensor=train_token_result["input_ids"],
    mask_tensor=train_token_result["attention_mask"],
    target_tensor=torch.tensor(train_target, dtype=torch.long),
)

val_dataset = utils.TokenDataset(
    token_tensor=val_token_result["input_ids"],
    mask_tensor=val_token_result["attention_mask"],
    target_tensor=torch.tensor(val_target, dtype=torch.long),
)

# The test split has no labels, so no target tensor is passed.
test_dataset = utils.TokenDataset(
    token_tensor=test_token_result["input_ids"],
    mask_tensor=test_token_result["attention_mask"],
)

In [14]:
# Batches of 32 for every split. Only the training loader shuffles.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Validation and test loaders keep the dataset order (shuffle=False).
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 3.Model define

In [15]:
# Instantiate the project's BERTClassifier (imported from model.py) with its
# default arguments and move it to the selected device.
my_model = BERTClassifier().to(device)

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
# AdamW over all parameters of the model, learning rate 1e-5.
optimizer = optim.AdamW(my_model.parameters(), lr=1e-5)

# Two cross-entropy losses: `criterion` uses the default reduction, while
# `criterion_` sums the loss over the samples of a batch (reduction="sum").
criterion = nn.CrossEntropyLoss()
criterion_ = nn.CrossEntropyLoss(reduction="sum")

# Softmax applied along dim 1.
softmax = nn.Softmax(dim=1)

# 4.Training

In [19]:
def TrainProcess(model, data_loader, criterion, optimizer):
    """Run one optimisation pass over every batch of ``data_loader``.

    Args:
        model: called as ``model(tokens, masks)``; put into training mode here.
        data_loader: iterable of batches; elements 0, 1 and 2 of each batch are
            used as the two model inputs and the targets, respectively.
        criterion: called as ``criterion(preds, targets)``; must return a value
            that supports ``.backward()``.
        optimizer: optimizer whose ``zero_grad()`` / ``step()`` are called once
            per batch.

    Returns:
        None. The model (and optimizer state) are updated in place.
    """
    model.train()

    for step_batch in tqdm(data_loader):
        # Move the three batch components onto the module-level `device`.
        input_ids = step_batch[0].to(device)
        attention = step_batch[1].to(device)
        labels = step_batch[2].to(device)

        batch_loss = criterion(model(input_ids, attention), labels)

        # Clear old gradients, back-propagate, then apply the update.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()


def EvalProcess(model, data_loader, criterion):
    """Evaluate ``model`` on ``data_loader`` without computing gradients.

    Args:
        model: called as ``model(tokens, masks)``; put into eval mode here. Its
            output is treated as per-class scores with classes along dim 1.
        data_loader: iterable of batches; elements 0, 1 and 2 of each batch are
            used as the two model inputs and the targets, respectively.
        criterion: called as ``criterion(preds, targets)``; must return a
            scalar. The per-batch values are added up.

    Returns:
        tuple ``(total_loss, pred_result, true_label)``:
            total_loss: Python float, sum of the per-batch criterion values.
            pred_result: 1-D numpy array with the predicted class per sample.
            true_label: 1-D numpy array with the target per sample.

    Raises:
        ValueError: if ``data_loader`` yields no batches (nothing to concatenate).
    """
    model.eval()

    total_loss = 0.0
    batch_preds = []
    batch_targets = []
    with torch.no_grad():
        for batch in tqdm(data_loader):
            tokens, masks, targets = batch[0], batch[1], batch[2]

            preds = model(tokens.to(device), masks.to(device))

            # .item() turns the scalar loss into a plain Python float so the
            # running total is not a tensor living on the compute device.
            total_loss += criterion(preds, targets.to(device)).item()

            # argmax over the raw scores: softmax is monotonic, so applying it
            # first cannot change which class has the highest score.
            batch_preds.append(preds.argmax(dim=1).cpu().numpy())
            batch_targets.append(targets.cpu().numpy())

    pred_result = np.concatenate(batch_preds)
    true_label = np.concatenate(batch_targets)

    return total_loss, pred_result, true_label

In [20]:
# Number of training epochs run by the training loop below.
epochs = 5

In [21]:
# Validation metrics per checkpoint: entry 0 belongs to the model before any
# training in this loop, entry k to the model after k training epochs.
loss_history = []
acc_histiry = []  # NOTE: misspelled name kept so existing references keep working
acc_history = acc_histiry  # correctly spelled alias of the very same list

# torch.save does not create missing directories.
os.makedirs("./model_parm_dict", exist_ok=True)

# One extra iteration (epochs + 1) so that the model is also checkpointed and
# evaluated after the last training epoch; training is skipped in that iteration.
for epoch in range(epochs + 1):
    # epoch_<k>.pt holds the weights after k training epochs.
    torch.save(my_model.state_dict(), "./model_parm_dict/epoch_{}.pt".format(epoch))

    # Evaluating stage: runs on the validation loader with the summed loss, so
    # the value reported here is a validation loss, not a training loss.
    total_loss, pred_result, true_label = EvalProcess(my_model, val_loader, criterion_)
    total_loss = float(total_loss)  # plain float even if a 0-dim tensor is returned
    loss_history.append(total_loss)
    print("Epoch[{}/{}], ValLoss: {:.4f}".format(epoch, epochs, total_loss))

    acc_ = accuracy_score(true_label, pred_result)
    # zero_division=0 yields 0.0 without a warning when a score is undefined
    # (e.g. no positive predictions).
    recall_ = recall_score(true_label, pred_result, zero_division=0)
    precision_ = precision_score(true_label, pred_result, zero_division=0)
    acc_histiry.append(acc_)
    print("Acc: [{:.4f}], Recall: [{:.4f}], Precision: [{:.4f}]".format(acc_, recall_, precision_))

    # Training stage (not run after the final evaluation)
    if epoch < epochs:
        TrainProcess(my_model, train_loader, criterion, optimizer)

100%|██████████| 48/48 [00:25<00:00,  1.89it/s]
  _warn_prf(average, modifier, msg_start, len(result))


Epoch[0/5], TrainLoss: 1101.2990
Acc: [0.5791], Recall: [0.0000], Precision: [0.0000]


100%|██████████| 191/191 [03:59<00:00,  1.25s/it]
100%|██████████| 48/48 [00:22<00:00,  2.14it/s]


Epoch[1/5], TrainLoss: 596.2872
Acc: [0.8286], Recall: [0.7644], Precision: [0.8167]


100%|██████████| 191/191 [03:59<00:00,  1.25s/it]
100%|██████████| 48/48 [00:22<00:00,  2.14it/s]


Epoch[2/5], TrainLoss: 715.2510
Acc: [0.7997], Recall: [0.8206], Precision: [0.7346]


100%|██████████| 191/191 [03:59<00:00,  1.25s/it]
100%|██████████| 48/48 [00:22<00:00,  2.14it/s]


Epoch[3/5], TrainLoss: 684.6671
Acc: [0.8253], Recall: [0.7629], Precision: [0.8109]


100%|██████████| 191/191 [03:59<00:00,  1.25s/it]
100%|██████████| 48/48 [00:22<00:00,  2.14it/s]


Epoch[4/5], TrainLoss: 759.5368
Acc: [0.8122], Recall: [0.7691], Precision: [0.7813]


100%|██████████| 191/191 [03:59<00:00,  1.25s/it]
100%|██████████| 48/48 [00:22<00:00,  2.14it/s]

Epoch[4/5], TrainLoss: 964.2299
Acc: [0.8102], Recall: [0.8003], Precision: [0.7611]





# 5.Prediction

In [57]:
# Collect the model's raw outputs for every test batch.
pred_result = []

# Switch the model to evaluation mode before inference.
my_model.eval()

# No gradients are needed for prediction.
with torch.no_grad():
    for batch in tqdm(test_loader):
      # Test batches carry no targets; only elements 0 and 1 are used.
      tokens, masks = batch[0], batch[1]
      # Run the model on the device, then bring the result back as a numpy array.
      preds = (my_model(tokens.to(device), masks.to(device))).cpu().numpy()
      pred_result.append(preds)

100%|██████████| 102/102 [00:49<00:00,  2.05it/s]


In [58]:
# Stack the per-batch output arrays row-wise into a single array.
# np.vstack is used because its alias np.row_stack is deprecated in NumPy 2.0.
pred_result = np.vstack(pred_result)

In [62]:
# Turn each row of scores into a class label: index of the largest value along axis 1.
pred_result = np.argmax(pred_result, axis=1).astype(int)

In [63]:
# Show the distinct predicted labels and how often each one occurs.
np.unique(pred_result, return_counts=True)

(array([0, 1]), array([1866, 1397]))

# 6.Saving

In [64]:
# Attach the predicted labels as the "target" column and keep only the two
# columns written to the submission file.
test_df = test_df.assign(target=pred_result)[["id", "target"]]

In [65]:
# DataFrame.to_csv does not create missing folders, so make sure the output
# directory exists before writing the submission file (without the index column).
os.makedirs("submission", exist_ok=True)
test_df.to_csv("submission/bert.csv",index=False)

# 7.Test code