In [10]:
import warnings

# Install the "ignore" filter before the heavier libraries are imported, so
# warnings raised from this point on do not clutter the notebook output.
warnings.filterwarnings("ignore")

from datasets import load_dataset

# Load the "mnli" configuration of the "glue" dataset. The repr displayed
# below lists the available splits and their columns.
data = load_dataset("glue", "mnli")
data

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx'],
        num_rows: 9847
    })
})

In [11]:
from transformers import AutoTokenizer

# Name of the pretrained checkpoint. The same variable is reused later when
# the classification model is created, so tokenizer and model stay in sync.
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [12]:
def tokenize_fun(examples):
    """Tokenize premise/hypothesis pairs with the notebook-level tokenizer.

    Args:
        examples: Mapping that provides a "premise" and a "hypothesis" entry.
            It is called through ``data.map(..., batched=True)``, so each
            entry is expected to hold a batch of values.

    Returns:
        Whatever ``tokenizer`` returns for the paired inputs, with
        truncation enabled.
    """
    premises = examples["premise"]
    hypotheses = examples["hypothesis"]
    # Passing both sequences encodes each premise together with its hypothesis.
    encoded = tokenizer(premises, hypotheses, truncation=True)
    return encoded

In [13]:
# Run tokenize_fun over every split in batches. The tokenizer outputs are
# added as new columns next to the original ones (see the repr below).
tokenizer_datasets = data.map(tokenize_fun, batched=True)
tokenizer_datasets

Map:   0%|          | 0/9832 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 392702
    })
    validation_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9815
    })
    validation_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9832
    })
    test_matched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9796
    })
    test_mismatched: Dataset({
        features: ['premise', 'hypothesis', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9847
    })
})

In [14]:
# Padding for the dataset: build a collator from the tokenizer. It is handed
# to the Trainer later through its data_collator argument.
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
# Training arguments
from transformers import TrainingArguments
# 
# training_args = TrainingArguments("out_files",
#                                   per_device_train_batch_size=16,
#                                   num_train_epochs=1,
#                                   logging_steps=50,
#                                   evaluation_strategy='steps')


In [16]:
from transformers import AutoModelForSequenceClassification

# model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

In [17]:

# import numpy as np
# from datasets import load_metric
# 
# metric = load_metric("glue", "mnli")
# 
# 
# def compute_metrics(eval_preds):
#     predictions, labels = eval_preds
#     predictions = np.argmax(predictions, axis=1)
#     return metric.compute(predictions=predictions, references=labels)



In [18]:
from transformers import Trainer
import numpy as np
from datasets import load_metric

# Accuracy is computed directly with NumPy instead of through the deprecated
# `load_metric("accuracy")` script loader, which needs Hub access or a cached
# copy of the metric script before it can run.
def compute_metrics(eval_preds):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_preds: Pair ``(predictions, labels)``. ``predictions`` holds the
            per-class scores with classes along axis 1; if the model returned
            a tuple of outputs, its first element is used. ``labels`` holds
            the reference class ids.

    Returns:
        dict: ``{"accuracy": <float in [0, 1]>}``, the fraction of examples
        whose highest-scoring class equals the reference label.
    """
    predictions, labels = eval_preds
    # Some models return extra outputs next to the scores; keep the scores only.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_labels = np.argmax(predictions, axis=1)
    accuracy = np.mean(predicted_labels == np.asarray(labels))
    # Cast to a plain float so the value logs and serializes cleanly.
    return {"accuracy": float(accuracy)}
# Settings for the fine-tuning run; "out_files" is the output directory.
# With evaluation_strategy='steps' and no explicit evaluation interval, the
# results table below shows one evaluation per logging step (50, 100, 150).
training_args = TrainingArguments(
    output_dir="out_files",
    per_device_train_batch_size=16,
    num_train_epochs=1,
    logging_steps=50,
    evaluation_strategy='steps',
)

# Pretrained encoder with a classification head for three labels. The log
# below reports that the head weights are newly initialized.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=3,
)

# Train on the full training split and evaluate on the matched validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenizer_datasets["train"],
    eval_dataset=tokenizer_datasets["validation_matched"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

Using the latest cached version of the module from C:\Users\admin\.cache\huggingface\modules\datasets_modules\metrics\accuracy\9756d5fa4a0f9da966341741fc3926eafdc604b8276add51d5abbaa8958a25f9 (last modified on Mon May 13 15:29:26 2024) since it couldn't be found locally at accuracy, or remotely on the Hugging Face Hub.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss,Accuracy
50,1.1089,1.036048,0.460927
100,1.0163,0.970233,0.53622
150,0.8817,0.86638,0.612736


KeyboardInterrupt: 

In [None]:
# from transformers import Trainer
# 
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenizer_datasets["train"],  # full training split; use .select(range(20000)) to keep only the first 20000 samples
#     eval_dataset=tokenizer_datasets["validation_matched"],
#     data_collator=data_collator,
#     tokenizer=tokenizer,
#     compute_metrics=compute_metrics
# )
# 
# 


In [None]:
# trainer.train()

In [None]:
# Run the trained model over the matched validation split and print the
# shapes of the returned predictions and reference labels as a sanity check.
predictions = trainer.predict(tokenizer_datasets["validation_matched"])
print(predictions.predictions.shape, predictions.label_ids.shape)

In [None]:
# Accuracy on the matched validation split. It is computed directly with
# NumPy instead of through the deprecated `load_metric("glue", "mnli")` script
# loader, which needs Hub access or a cached copy of the metric script.
preds = np.argmax(predictions.predictions, axis=1)
# The bare dict as the last expression is what the cell displays.
{"accuracy": float(np.mean(preds == predictions.label_ids))}

In [None]:
# Show the raw prediction array returned by trainer.predict; the shape cell
# above prints its dimensions.
print(predictions.predictions)