In [1]:
import os 
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer, TrainingArguments
from transformers import pipeline
from datasets import load_dataset
import pandas as pd

In [2]:
# 我找这个数据集好像有点问题，还有title，我把title和正文内容做了一下合并
train_data = pd.read_csv('data/ag/train.csv')
train_data.rename(columns={'Class Index': 'labels'}, inplace=True)
train_data['text'] = train_data['Title']+' '+train_data['Description']
train_data = train_data.drop(columns=['Description', 'Title'])
train_data = train_data[['text', 'labels']]
class_number = train_data['labels'].nunique()
train_data['labels'] = train_data['labels'] - 1
train_data.to_csv('data/ag/train_drop_title.csv',index=False)

test_data = pd.read_csv('data/ag/train.csv')
test_data.rename(columns={'Class Index': 'labels'}, inplace=True)
test_data['text'] = test_data['Title']+' '+test_data['Description']
test_data = test_data.drop(columns=['Description', 'Title'])
test_data = test_data[['text', 'labels']]
test_data['labels'] = test_data['labels'] - 1
train_data.to_csv('data/ag/test_drop_title.csv',index=False)

In [3]:

train_csv = load_dataset('csv',data_files='data/ag/train_drop_title.csv',split='train')
dataset_size = len(train_csv)
train_size = int(dataset_size*0.9)
train_dataset = train_csv.select(range(train_size))
dev_dataset = train_csv.select(range(train_size,dataset_size))
test_dataset = load_dataset('csv',data_files='data/ag/test_drop_title.csv',split='train')

print(train_dataset)
print(dev_dataset)
print(test_dataset)


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['text', 'labels'],
    num_rows: 108000
})
Dataset({
    features: ['text', 'labels'],
    num_rows: 12000
})
Dataset({
    features: ['text', 'labels'],
    num_rows: 120000
})


国内使用huggingface是有问题的，离线使用预训练模型参数参考
[link]'https://zhuanlan.zhihu.com/p/475260268'

In [4]:
model = AutoModelForSequenceClassification.from_pretrained('pretrained_model/bert/', num_labels=class_number)
tokenizer = AutoTokenizer.from_pretrained('pretrained_model/bert/')
MAX_LENGTH = 256


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at pretrained_model/bert/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# 对数据集做分词预处理，把所有序列补充或截断到256个token
train_dataset = train_dataset.map(lambda e: tokenizer(e['text'], truncation=True, padding='max_length', max_length=MAX_LENGTH), batched=True)
dev_dataset = dev_dataset.map(lambda e: tokenizer(e['text'], truncation=True, padding='max_length', max_length=MAX_LENGTH), batched=True)
test_dataset = test_dataset.map(lambda e: tokenizer(e['text'], truncation=True, padding='max_length', max_length=MAX_LENGTH), batched=True)
# 处理成pytorch的格式
train_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
dev_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'])
print(train_dataset)
print(dev_dataset)
print(test_dataset)


Map:   0%|          | 0/108000 [00:00<?, ? examples/s]

Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 108000
})
Dataset({
    features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 12000
})
Dataset({
    features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 120000
})


In [6]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [9]:
import os
os.environ["CUDA_VISIBLE_DEIVCES"]=""
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    learning_rate=1e-4,
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=32,  # batch size per device during training
    per_device_eval_batch_size=32,   # batch size for evaluation
    logging_dir='./logs',            # directory for storing logs
    logging_steps=100,
    do_train=True,
    do_eval=True,
    no_cuda=False,
    load_best_model_at_end=True,
    # eval_steps=100,
    evaluation_strategy="epoch",
    save_strategy="epoch"
)

trainer = Trainer(
    model=model,                         # the instantiated   Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=dev_dataset,            # evaluation dataset
    compute_metrics=compute_metrics
)

train_out = trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.1243,0.252292,0.919583,0.919811,0.920515,0.91994
2,0.137,0.249654,0.920417,0.920633,0.921302,0.920654
3,0.1045,0.263735,0.91925,0.91954,0.920789,0.919602


