In [None]:
# Set the proxy environment variables for this kernel to a local proxy at 127.0.0.1:7890.
# NOTE(review): the address is machine-specific — adjust or skip this cell if no proxy listens there.
%env ALL_PROXY=http://127.0.0.1:7890
%env HTTP_PROXY=http://127.0.0.1:7890
%env HTTPS_PROXY=http://127.0.0.1:7890

In [None]:
# Point HF_HUB_CACHE at a project-local directory; this runs before the Hugging Face imports below.
# NOTE(review): presumably this makes hub downloads land in ./data/hf_cache — confirm.
%env HF_HUB_CACHE=./data/hf_cache

# Text classification example

## Step 01. Import related packages

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from transformers import TrainingArguments, Trainer

## Step 02. Load data

In [None]:
# Read the ChnSentiCorp CSV as a single 'train' split, then discard rows whose
# 'review' field is missing so the tokenizer never receives None.
dataset = load_dataset(
    'csv',
    data_files='./data/train/ChnSentiCorp_htl_all.csv',
    split='train',
).filter(lambda row: row['review'] is not None)
dataset

## Step 03. Split dataset

In [None]:
# Hold out 20% of the rows for evaluation. The fixed seed pins the shuffle so
# that Restart & Run All reproduces the same train/test partition (and thus
# comparable metrics) on every run.
splited_ds_dict = dataset.train_test_split(test_size=0.2, seed=42)
splited_ds_dict

## Step 04. Pre-process Data

In [None]:
import torch

# Load the tokenizer for the 'hfl/chinese-macbert-large' checkpoint (the same checkpoint name the model cell uses).
tokenizer = AutoTokenizer.from_pretrained('hfl/chinese-macbert-large')

def process_function(examples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: batch mapping that provides a 'review' column (the texts)
            and a 'label' column.

    Returns:
        The tokenizer output for the reviews, truncated and padded to
        ``max_length=32``, with the labels stored under the 'labels' key.
    """
    encoded = tokenizer(
        examples['review'],
        truncation=True,
        padding='max_length',
        max_length=32,
    )
    encoded['labels'] = examples['label']
    return encoded

# Tokenize every split in batches. The original columns are removed so that
# only the fields returned by process_function remain.
tokenized_ds = splited_ds_dict.map(
    process_function,
    batched=True,
    remove_columns=splited_ds_dict['train'].column_names,
)
tokenized_ds

## Step 05. Create the model

In [None]:
from torch.optim import Adam

# Load the 'hfl/chinese-macbert-large' checkpoint through the sequence-classification auto class.
# NOTE(review): num_labels is not passed, so the library default applies — presumably 2; confirm it matches the 'label' column.
model = AutoModelForSequenceClassification.from_pretrained('hfl/chinese-macbert-large')

## Step 06. Create the evaluation function

In [None]:
import evaluate

# Load the accuracy and F1 metrics once up front; eval_metrics() uses both on every evaluation pass.
acc_metric = evaluate.load('accuracy')
f1_metric = evaluate.load('f1')

In [None]:
def eval_metrics(eval_preds):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_preds: pair ``(predictions, labels)``. The predictions are
            reduced with ``argmax`` over axis 1 to obtain one class id per
            row (assumes a 2-D score array — TODO confirm).

    Returns:
        One dict holding the accuracy result merged with the F1 result.
    """
    scores_per_class, references = eval_preds
    predicted_ids = scores_per_class.argmax(axis=1)
    results = acc_metric.compute(predictions=predicted_ids, references=references)
    # Fold the F1 entry into the accuracy dict so a single mapping is returned.
    results.update(
        f1_metric.compute(predictions=predicted_ids, references=references, average='macro')
    )
    return results

## Step 07. Create the trainer

In [None]:
# Training configuration. Lines marked *** (gradient accumulation, gradient
# checkpointing, Adafactor) are the settings this notebook highlights;
# presumably they are chosen to reduce GPU memory use — confirm.
train_args = TrainingArguments(output_dir="./outs/checkpoints",      # output directory
                                per_device_train_batch_size=1,   # training batch size per device
                                gradient_accumulation_steps=32,  # *** gradient accumulation ***
                                gradient_checkpointing=True,     # *** gradient checkpointing ***
                                optim="adafactor",               # *** Adafactor optimizer ***
                                per_device_eval_batch_size=1,    # evaluation batch size per device
                                num_train_epochs=1,              # number of training epochs
                                logging_steps=10,                # logging frequency (in steps)
                                eval_strategy="epoch",     # evaluation strategy
                                save_strategy="epoch",           # checkpoint saving strategy
                                save_total_limit=3,              # maximum number of checkpoints to keep
                                learning_rate=2e-5,              # learning rate
                                weight_decay=0.01,               # weight decay
                                metric_for_best_model="f1",      # metric used to select the best model
                                load_best_model_at_end=True)     # load the best model once training finishes

# Display the resolved arguments as the cell output.
train_args

In [None]:
from transformers import DataCollatorWithPadding

# Freeze every parameter of the BERT backbone; parameters outside model.bert
# are left untouched and remain trainable.
model.bert.requires_grad_(False)

# Bundle the model, training arguments, train/test splits, padding collator
# and metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['test'],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=eval_metrics,
)

## Step 08. Train the model

In [None]:
# Run training with the arguments configured above (1 epoch; evaluation and checkpoint saving use the "epoch" strategy).
trainer.train()

## Step 10. Trainer evaluation

In [None]:
# Evaluate without an explicit dataset; the Trainer was built with eval_dataset=tokenized_ds['test'].
trainer.evaluate()

In [None]:
# Evaluate on the training split, for comparison with the held-out scores.
trainer.evaluate(tokenized_ds['train'])

In [None]:
# Evaluate on the test split, passed explicitly.
# NOTE(review): looks equivalent to the argument-less evaluate() call, since eval_dataset is tokenized_ds['test'] — confirm.
trainer.evaluate(tokenized_ds['test'])

## Step 11. Model prediction

In [None]:
# Run inference on the test split; the returned prediction output is shown as the cell result.
trainer.predict(tokenized_ds['test'])

# Other: view the training trace with TensorBoard

In [None]:
!tensorboard --logdir ./outs/checkpoints/runs/

In [None]:
# Alternatively, use the TensorBoard extension in VS Code (press 'Ctrl + Shift + P' and search for 'tensorboard').