# 使用完整的 YelpReviewFull 数据集微调 bert-base-cased，对比观察准确率（Acc）最高能达到多少

## 1.加载数据集

In [1]:
from datasets import load_dataset

# Load the Yelp Review Full dataset; later cells index its "train" and "test" splits.
DATASET_ID = "yelp_review_full"
dataset = load_dataset(DATASET_ID)

  from .autonotebook import tqdm as notebook_tqdm


## 2.预处理数据

In [2]:
from transformers import AutoTokenizer

# Tokenizer for the same checkpoint name the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")


def tokenize_function(examples):
    """Tokenize the ``text`` column of a batch of examples.

    Padding is set to ``"max_length"`` and truncation is enabled.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True, padding="max_length")


# Tokenize every split in batched mode, then shuffle the train and test
# splits with a fixed seed.
SHUFFLE_SEED = 46
tokenized_datasets = dataset.map(tokenize_function, batched=True)
full_train_dataset = tokenized_datasets["train"].shuffle(seed=SHUFFLE_SEED)
full_eval_dataset = tokenized_datasets["test"].shuffle(seed=SHUFFLE_SEED)

## 3.加载模型

In [3]:
from transformers import AutoModelForSequenceClassification

# Sequence-classification model with 5 output labels. The classifier
# weights are not in the bert-base-cased checkpoint and are newly initialized.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased",
    num_labels=5,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 4.配置训练超参数

In [6]:
import inspect

from transformers import TrainingArguments

model_dir = "models/bert-base-cased-finetune-yelp"

# Transformers 4.41 renamed `evaluation_strategy` to `eval_strategy`, and later
# releases dropped the old spelling. Pick whichever keyword the installed
# version accepts so this cell runs on both old and new installs.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

full_training_args = TrainingArguments(
    output_dir=model_dir,
    # Caps how many checkpoints are kept in output_dir. It does not control
    # how often checkpoints are written; that follows the library's default
    # save strategy, which is not overridden here.
    save_total_limit=5,
    per_device_train_batch_size=16,
    num_train_epochs=3,
    logging_steps=100,
    **{_eval_strategy_arg: "epoch"},  # evaluate once per epoch
)

## 5.配置指标评估方法

In [7]:
import numpy as np
import evaluate

# Accuracy metric object used by compute_metrics below.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute the accuracy metric for one evaluation pass.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class for
    each example is the arg-max over the last axis of the logits.
    Returns whatever ``metric.compute`` returns.
    """
    scores, gold_labels = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    result = metric.compute(references=gold_labels, predictions=predicted_classes)
    return result

## 6.开始训练

In [8]:
from transformers import Trainer

# Collect the Trainer inputs in one place: model, hyperparameters,
# train/eval datasets and the metric callback.
trainer_kwargs = {
    "model": model,
    "args": full_training_args,
    "train_dataset": full_train_dataset,
    "eval_dataset": full_eval_dataset,
    "compute_metrics": compute_metrics,
}
trainer = Trainer(**trainer_kwargs)

# Kept as the last expression so the cell displays the training output.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.7612,0.760435,0.66582
2,0.6861,0.723348,0.68732
3,0.6024,0.731828,0.69126


TrainOutput(global_step=121875, training_loss=0.7100966492951222, metrics={'train_runtime': 64696.6655, 'train_samples_per_second': 30.141, 'train_steps_per_second': 1.884, 'total_flos': 5.130803778048e+17, 'train_loss': 0.7100966492951222, 'epoch': 3.0})

## 7.验证并保存

In [9]:
# Spot-check on the first 100 examples of the test split after shuffling with seed 30.
shuffled_test = tokenized_datasets["test"].shuffle(seed=30)
small_test_dataset = shuffled_test.select(range(100))
trainer.evaluate(small_test_dataset)

{'eval_loss': 0.7058154344558716,
 'eval_accuracy': 0.71,
 'eval_runtime': 1.3864,
 'eval_samples_per_second': 72.13,
 'eval_steps_per_second': 9.377,
 'epoch': 3.0}

In [10]:
# Write the model to model_dir, then persist the Trainer state.
trainer.save_model(output_dir=model_dir)
trainer.save_state()