In [1]:
import os
os.environ["TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL"] = "1"

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, pipeline
from datasets import load_dataset
import evaluate

In [3]:
datasets = load_dataset("json", data_files="./train_pair_1w.json", split="train").train_test_split(test_size=0.2)

In [None]:
datasets["train"]

In [4]:
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-macbert-base")

In [5]:
def process_function(examples, tokenizer=tokenizer):
    inputs = tokenizer(examples["sentence1"], examples["sentence2"], truncation=True, max_length=256, padding="max_length")
    inputs["labels"] = [float(v) for v in examples["label"]]
    return inputs

In [6]:
tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets["train"].column_names)
tokenized_datasets

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [None]:
tokenized_datasets["train"]

In [7]:
model = AutoModelForSequenceClassification.from_pretrained("hfl/chinese-macbert-base", num_labels=1).to("cuda")

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: hfl/chinese-macbert-base
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.decoder.weight             | UNEXPECTED | 
bert.embeddings.position_ids               | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.decoder.bias               | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; n

In [8]:
accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")

In [9]:
def compute_metric(pred):
    predictions, labels = pred
    predictions = [int((p > 0.5).item()) for p in predictions]
    labels = [int(l) for l in labels]
    accuracy_metric = accuracy.compute(predictions=predictions, references=labels)
    accuracy_metric.update(f1.compute(predictions=predictions, references=labels))
    return accuracy_metric

In [None]:
from transformers import TrainerCallback, TrainerState, TrainerControl

class MyTrainerCallback(TrainerCallback):
    def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        if state.global_step % args.logging_steps == 0 and len(state.log_history) > 0:
            print(state.log_history[len(state.log_history) - 1])

In [11]:
args = TrainingArguments(
    output_dir="./cross_model",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    logging_steps=10,
    eval_strategy="steps",
    save_strategy="steps",
    save_steps=100,
    eval_steps=100,
    learning_rate=2e-5,
    weight_decay=0.01,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    log_level="info",
    log_level_replica="info",
    logging_first_step=True
)

In [12]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=compute_metric,
    # callbacks=[MyTrainerCallback()]
)

In [13]:
trainer.train()

***** Running training *****
  Num examples = 8,000
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 750
  Number of trainable parameters = 102,268,417


Step,Training Loss,Validation Loss,Accuracy,F1
100,0.130455,0.084607,0.8885,0.851827
200,0.120786,0.071307,0.9025,0.870432
300,0.082128,0.087688,0.893,0.844477
400,0.079608,0.068191,0.9145,0.888308
500,0.068918,0.064208,0.9185,0.891694
600,0.056298,0.067615,0.9175,0.890656
700,0.059931,0.066182,0.9175,0.891233



***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
Saving model checkpoint to ./cross_model\checkpoint-100
Configuration saved in ./cross_model\checkpoint-100\config.json


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model weights saved in ./cross_model\checkpoint-100\model.safetensors
Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`
tokenizer config file saved in ./cross_model\checkpoint-100\tokenizer_config.json

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
Saving model checkpoint to ./cross_model\checkpoint-200
Configuration saved in ./cross_model\checkpoint-200\config.json


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model weights saved in ./cross_model\checkpoint-200\model.safetensors
Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`
tokenizer config file saved in ./cross_model\checkpoint-200\tokenizer_config.json

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
Saving model checkpoint to ./cross_model\checkpoint-300
Configuration saved in ./cross_model\checkpoint-300\config.json


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model weights saved in ./cross_model\checkpoint-300\model.safetensors
Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`
tokenizer config file saved in ./cross_model\checkpoint-300\tokenizer_config.json

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
Saving model checkpoint to ./cross_model\checkpoint-400
Configuration saved in ./cross_model\checkpoint-400\config.json


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model weights saved in ./cross_model\checkpoint-400\model.safetensors
Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`
tokenizer config file saved in ./cross_model\checkpoint-400\tokenizer_config.json

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
Saving model checkpoint to ./cross_model\checkpoint-500
Configuration saved in ./cross_model\checkpoint-500\config.json


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model weights saved in ./cross_model\checkpoint-500\model.safetensors
Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`
tokenizer config file saved in ./cross_model\checkpoint-500\tokenizer_config.json

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
Saving model checkpoint to ./cross_model\checkpoint-600
Configuration saved in ./cross_model\checkpoint-600\config.json


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model weights saved in ./cross_model\checkpoint-600\model.safetensors
Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`
tokenizer config file saved in ./cross_model\checkpoint-600\tokenizer_config.json

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
Saving model checkpoint to ./cross_model\checkpoint-700
Configuration saved in ./cross_model\checkpoint-700\config.json


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model weights saved in ./cross_model\checkpoint-700\model.safetensors
Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`
tokenizer config file saved in ./cross_model\checkpoint-700\tokenizer_config.json
Saving model checkpoint to ./cross_model\checkpoint-750
Configuration saved in ./cross_model\checkpoint-750\config.json


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model weights saved in ./cross_model\checkpoint-750\model.safetensors
Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`
tokenizer config file saved in ./cross_model\checkpoint-750\tokenizer_config.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./cross_model\checkpoint-500 (score: 0.8916943521594685).
There were missing keys in the checkpoint model loaded: ['bert.embeddings.LayerNorm.weight', 'bert.embeddings.LayerNorm.bias', 'bert.encoder.layer.0.attention.output.LayerNorm.weight', 'bert.encoder.layer.0.attention.output.LayerNorm.bias', 'bert.encoder.layer.0.output.LayerNorm.weight', 'bert.encoder.layer.0.output.LayerNorm.bias', 'bert.encoder.layer.1.attention.output.LayerNorm.weight', 'bert.encoder.layer.1.attention.output.LayerNorm.bias', 'bert.encoder.layer.1.output.LayerNorm.weight', 'bert.encoder.layer.1.output.LayerNorm.bias', 'bert.encoder.layer.2.attention.output.LayerNorm.

TrainOutput(global_step=750, training_loss=0.08427901224295298, metrics={'train_runtime': 360.3861, 'train_samples_per_second': 66.595, 'train_steps_per_second': 2.081, 'total_flos': 3157304315904000.0, 'train_loss': 0.08427901224295298, 'epoch': 3.0})

In [17]:
trainer.evaluate(tokenized_datasets["test"].select(range(10)))


***** Running Evaluation *****
  Num examples = 10
  Batch size = 64


{'eval_loss': 0.03167562931776047,
 'eval_accuracy': 0.9,
 'eval_f1': 0.8571428571428571,
 'eval_runtime': 0.0678,
 'eval_samples_per_second': 147.541,
 'eval_steps_per_second': 14.754,
 'epoch': 3.0}

In [14]:
model.config.id2label = {0: "不相似", 1: "相似"}

In [15]:
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)

In [16]:
result = pipe({"text": "我喜欢北京", "text_pair": "天气怎样"}, function_to_apply="none")
result["label"] ="相似" if result["score"] > 0.5 else "不相似"
result

{'label': '不相似', 'score': 0.006054386030882597}