In [1]:
# Route outbound HTTP(S) traffic from this kernel through a local proxy
# (presumably so the Hugging Face Hub downloads below can reach the network).
# NOTE(review): 127.0.0.1:7890 is machine-specific — adjust or remove on other hosts.
%env ALL_PROXY=http://127.0.0.1:7890
%env HTTP_PROXY=http://127.0.0.1:7890
%env HTTPS_PROXY=http://127.0.0.1:7890

env: ALL_PROXY=http://127.0.0.1:7890
env: HTTP_PROXY=http://127.0.0.1:7890
env: HTTPS_PROXY=http://127.0.0.1:7890


In [2]:
# Keep Hugging Face Hub downloads in a project-local cache directory
# (relative path, so it resolves against the notebook's working directory).
%env HF_HUB_CACHE=./data/hf_cache

env: HF_HUB_CACHE=./data/hf_cache


# Text classification example

## Step 01. Import related packages

In [3]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from transformers import TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


## Step 02. Load data

In [4]:
# Read the hotel-review CSV as a single 'train' split and drop every row
# whose review text is missing, so tokenization never receives None.
dataset = load_dataset(
    'csv',
    data_files='./data/train/ChnSentiCorp_htl_all.csv',
    split='train',
).filter(lambda example: example['review'] is not None)
dataset

Dataset({
    features: ['label', 'review'],
    num_rows: 7765
})

## Step 03. Split dataset

In [5]:
# Hold out 20% of the examples for evaluation.
# The shuffle seed is pinned so the split — and therefore every metric
# reported below — is reproducible across Restart & Run All.
splited_ds_dict = dataset.train_test_split(test_size=0.2, seed=42)
splited_ds_dict

DatasetDict({
    train: Dataset({
        features: ['label', 'review'],
        num_rows: 6212
    })
    test: Dataset({
        features: ['label', 'review'],
        num_rows: 1553
    })
})

## Step 04. Pre-process Data

In [6]:
import torch

# Tokenizer matching the 'hfl/rbt3' checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained('hfl/rbt3')

def process_function(examples):
    """Tokenize a batch of reviews and attach their labels.

    Args:
        examples: A batch from the dataset exposing a 'review' column (text)
            and a 'label' column.

    Returns:
        The tokenizer output for the reviews, truncated to at most 128
        tokens, with the batch labels stored under the 'labels' key.
    """
    encoded_batch = tokenizer(examples['review'], truncation=True, max_length=128)
    # The label is copied under the key 'labels' alongside the tokenizer output.
    encoded_batch['labels'] = examples['label']
    return encoded_batch

# Tokenize both splits in batches; the original columns are removed so only
# the columns produced by process_function remain.
tokenized_ds = splited_ds_dict.map(
    process_function,
    batched=True,
    remove_columns=splited_ds_dict['train'].column_names,
)
tokenized_ds

Map: 100%|██████████| 6212/6212 [00:00<00:00, 7574.88 examples/s]
Map: 100%|██████████| 1553/1553 [00:00<00:00, 9797.65 examples/s] 


DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 6212
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 1553
    })
})

## Step 05. Create the model

In [7]:
from torch.optim import Adam

# Load the pretrained 'hfl/rbt3' weights into a sequence-classification model.
# The classifier weight/bias are not part of the checkpoint and are newly
# initialized (see the warning below), so the model must be fine-tuned before use.
model = AutoModelForSequenceClassification.from_pretrained('hfl/rbt3')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/rbt3 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Step 06. Create the evaluation function

In [8]:
import evaluate

# Metric objects used by the evaluation function to score predictions.
acc_metric = evaluate.load('accuracy')
f1_metric = evaluate.load('f1')

In [9]:
def eval_metrics(eval_preds):
    """Compute accuracy and macro-averaged F1 for one evaluation pass.

    Args:
        eval_preds: A pair ``(predictions, labels)``; the predicted class of
            each example is the argmax of ``predictions`` along axis 1.

    Returns:
        A dict combining the results of the accuracy metric and the F1 metric.
    """
    scores, references = eval_preds
    predicted_classes = scores.argmax(axis=1)
    accuracy_result = acc_metric.compute(
        predictions=predicted_classes, references=references
    )
    f1_result = f1_metric.compute(
        predictions=predicted_classes, references=references, average='macro'
    )
    # Accuracy entries first, then F1 — same content and order as updating
    # the accuracy dict in place.
    return {**accuracy_result, **f1_result}

## Step 07. Create the trainer

In [10]:
train_args = TrainingArguments(
    # Output location for checkpoints.
    output_dir='./outs/checkpoints',

    # Optimisation schedule.
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=1e-2,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,

    # Log the training loss every 10 optimisation steps.
    logging_steps=10,

    # Evaluate and checkpoint once per epoch. To evaluate every N steps
    # instead, switch eval_strategy to 'steps' and set eval_steps=N.
    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=5,

    # After training, restore the checkpoint with the best accuracy.
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
)

train_args

TrainingArguments(
_n_gpu=4,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_use_gather_object=False

In [11]:
from transformers import DataCollatorWithPadding

# Wire the model, data, batching and metrics together. The collator pads each
# batch using the tokenizer, since the tokenized examples were only truncated.
trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=tokenized_ds['train'],
    eval_dataset=tokenized_ds['test'],
    compute_metrics=eval_metrics,
)

## Step 08. Train the model

In [12]:
# Fine-tune the model; evaluation and checkpointing follow the per-epoch
# strategies configured in the training arguments.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4797,0.387976,0.823567,0.792001
2,0.3017,0.305435,0.872505,0.848974
3,0.2654,0.281988,0.889891,0.872186
4,0.2572,0.275206,0.891822,0.873357
5,0.2338,0.275402,0.883451,0.867956
6,0.1911,0.275259,0.894398,0.875942
7,0.2037,0.273395,0.894398,0.877486
8,0.1758,0.275637,0.897618,0.880348
9,0.1755,0.275423,0.891822,0.874912
10,0.1698,0.276968,0.895686,0.878436




TrainOutput(global_step=250, training_loss=0.25316318607330324, metrics={'train_runtime': 70.7954, 'train_samples_per_second': 877.459, 'train_steps_per_second': 3.531, 'total_flos': 1042770706821120.0, 'train_loss': 0.25316318607330324, 'epoch': 10.0})

## Step 10. Trainer Evaluation

In [13]:
# With no argument, evaluation runs on the eval_dataset given to the Trainer
# (the test split).
trainer.evaluate()



{'eval_loss': 0.2756370007991791,
 'eval_accuracy': 0.8976175144880876,
 'eval_f1': 0.8803479560344991,
 'eval_runtime': 0.805,
 'eval_samples_per_second': 1929.227,
 'eval_steps_per_second': 8.696,
 'epoch': 10.0}

In [14]:
# Score the training split as well, for comparison with the held-out metrics.
trainer.evaluate(tokenized_ds['train'])



{'eval_loss': 0.15963904559612274,
 'eval_accuracy': 0.9425305859626529,
 'eval_f1': 0.9334833081550676,
 'eval_runtime': 3.1519,
 'eval_samples_per_second': 1970.893,
 'eval_steps_per_second': 7.932,
 'epoch': 10.0}

In [15]:
# Explicitly score the test split; this is the same dataset the Trainer uses
# by default, so the metrics match the argument-free call.
trainer.evaluate(tokenized_ds['test'])



{'eval_loss': 0.2756370007991791,
 'eval_accuracy': 0.8976175144880876,
 'eval_f1': 0.8803479560344991,
 'eval_runtime': 0.8698,
 'eval_samples_per_second': 1785.516,
 'eval_steps_per_second': 8.048,
 'epoch': 10.0}

## Step 11. Model prediction

In [16]:
# Returns a PredictionOutput holding the per-class scores (predictions),
# the reference label_ids, and metrics prefixed with 'test_'.
trainer.predict(tokenized_ds['test'])



PredictionOutput(predictions=array([[-1.9880457,  2.1107924],
       [-2.4324806,  2.685149 ],
       [-1.9879626,  2.026359 ],
       ...,
       [-2.9331942,  2.9876163],
       [-2.9660394,  3.0099857],
       [-3.0106335,  3.2024503]], dtype=float32), label_ids=array([1, 1, 1, ..., 1, 1, 1]), metrics={'test_loss': 0.2756370007991791, 'test_accuracy': 0.8976175144880876, 'test_f1': 0.8803479560344991, 'test_runtime': 0.8909, 'test_samples_per_second': 1743.263, 'test_steps_per_second': 7.858})

# Other: view the training trace in TensorBoard

In [17]:
!tensorboard --logdir ./outs/checkpoints/runs/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


TensorFlow installation not found - running with reduced feature set.

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

Serving TensorBoard on localhost; to expose to the network, use a proxy or pass --bind_all
TensorBoard 2.20.0 at http://localhost:6006/ (Press CTRL+C to quit)
^C


In [None]:
# Alternatively, use the TensorBoard extension in VS Code (press 'Ctrl + Shift + P' and search for 'tensorboard').