## 1 数据预处理

In [1]:
from datasets import Dataset
from transformers import BertTokenizer
import os

In [3]:
# 载入原始数据
def load_data(base_path):
    """Read every text file directly inside ``base_path``.

    Args:
        base_path: Directory containing UTF-8 encoded text files.

    Returns:
        A list with the full content of each file, ordered by file name.
        Sub-directories are skipped. An empty directory yields ``[]``.
    """
    result = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # result (and any "first N" slice taken by callers) reproducible.
    for name in sorted(os.listdir(base_path)):
        file_path = os.path.join(base_path, name)
        if not os.path.isfile(file_path):
            # Only regular files can be opened for reading.
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            # read() keeps the whole document; readline() would silently
            # drop everything after the first newline.
            result.append(f.read())
    return result

# 读入数据并转化为datasets.Dataset
def get_dataset(base_path, limit=3):
    """Load the ``pos`` and ``neg`` sub-directories of ``base_path`` into a ``datasets.Dataset``.

    Args:
        base_path: Directory that contains a ``pos`` and a ``neg`` sub-directory.
        limit: Maximum number of samples taken from each class. The default
            of 3 keeps the demo small; pass ``None`` to use every sample.

    Returns:
        A ``Dataset`` with a ``texts`` column and a ``labels`` column, where
        positive samples are labelled ``[1., 0.]`` and negative ``[0., 1.]``.
    """
    # Slicing with ``None`` keeps the whole list, so ``limit=None`` disables the cap.
    pos_data = load_data(os.path.join(base_path, 'pos'))[:limit]
    neg_data = load_data(os.path.join(base_path, 'neg'))[:limit]

    # Positive samples first, then negative ones.
    texts = pos_data + neg_data
    # Labels are written as floats ('1.' / '0.'); per the original author's
    # note, model training raises an error otherwise.
    labels = [[1., 0.]] * len(pos_data) + [[0., 1.]] * len(neg_data)
    dataset = Dataset.from_dict({'texts': texts, 'labels': labels})
    return dataset

# Build the train and test splits from their respective directories.
train_dataset, test_dataset = (
    get_dataset(split_dir)
    for split_dir in ('../aclImdb/train/', '../aclImdb/test/')
)

In [4]:
# Show the dataset summary, its label column and its schema, one per line.
print(
    train_dataset,
    train_dataset['labels'],
    train_dataset.features,
    sep='\n',
)

Dataset({
    features: ['texts', 'labels'],
    num_rows: 6
})
[[1.0, 0.0], [1.0, 0.0], [1.0, 0.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]]
{'texts': Value(dtype='string', id=None), 'labels': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None)}


In [6]:
# Load the tokenizer.
# cache_dir is the location of the pretrained model files.
cache_dir="bert-base-uncased1/"
tokenizer = BertTokenizer.from_pretrained(cache_dir)

# Every sequence is truncated / padded to exactly this many tokens.
MAX_LENGTH = 512


def tokenize_batch(batch):
    """Tokenize the ``texts`` column of a batch, truncating and padding to ``MAX_LENGTH``."""
    return tokenizer(batch['texts'], truncation=True, padding='max_length', max_length=MAX_LENGTH)


# Encode both splits with the same function so their preprocessing cannot drift apart.
train_dataset = train_dataset.map(tokenize_batch, batched=True)
test_dataset = test_dataset.map(tokenize_batch, batched=True)

# Persist the encoded datasets to disk.
# NOTE: left disabled on purpose -- with the demo-sized datasets built above,
# running these lines would write only a handful of rows to ./data/.
# Enable them once the full data is loaded.
# train_dataset.save_to_disk('./data/train_dataset')
# test_dataset.save_to_disk('./data/test_dataset')


Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

In [7]:
# Show the dataset schema after the tokenizer columns have been added.
print(train_dataset.features)

{'texts': Value(dtype='string', id=None), 'labels': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


## 2 训练模型

In [1]:
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments, BertConfig
import torch
from datasets import Dataset
import json
import os
# Select which GPU index to use. Optional; per the original author's note,
# the trainer uses multiple GPUs by default when this is not set.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"




In [3]:
# num_labels=2 because the task is binary classification.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased1/', num_labels=2)

# Load the preprocessed datasets.
train_dataset = Dataset.load_from_disk('./data/train_dataset/')
test_dataset = Dataset.load_from_disk('./data/test_dataset/')

# Optional: freeze the BERT encoder and train only the final classification
# layer. This notebook trains all parameters, so the lines stay disabled.
# for param in model.base_model.parameters():
#     param.requires_grad = False

# Training hyper-parameters.
training_args = TrainingArguments(
    output_dir='./my_results',          # output directory
    num_train_epochs=10,                # total number of training epochs
    per_device_train_batch_size=32,     # batch size per device during training
    per_device_eval_batch_size=32,      # batch size per device during evaluation
    logging_dir='./my_logs',            # directory for storing logs
)

# Create the Trainer. The model is passed as-is: Trainer moves it to the
# device chosen by the training arguments, so a hard-coded .to('cuda') is
# unnecessary and would fail on a machine without a GPU.
trainer = Trainer(
    model=model,                         # the instantiated Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
)


  return self.fget.__get__(instance, owner)()
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased1/ and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [4]:
# Run training.
trainer.train()

# Evaluate on the eval dataset. The metrics are kept and printed: a bare
# call in the middle of a cell would discard its return value.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Save the model; it is written to the configured output_dir.
trainer.save_model()
# Additionally store the raw weights in a standalone file.
torch.save(model.state_dict(), 'model_save.bin')

Step,Training Loss
500,0.2843
1000,0.1736
1500,0.1392
2000,0.0837
2500,0.0676
3000,0.0446
3500,0.028
4000,0.0253
4500,0.0178
5000,0.0124


In [5]:
# Evaluate the model; as the last expression of the cell, the returned metrics are displayed.
trainer.evaluate()

{'eval_loss': 0.3857584297657013,
 'eval_runtime': 171.263,
 'eval_samples_per_second': 145.974,
 'eval_steps_per_second': 4.566,
 'epoch': 10.0}

: 

## 3 模型准确率评估

In [1]:
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments, BertConfig
import torch
from datasets import Dataset
import json
import os
import numpy as np
import csv

# Select which GPU index to use. Optional; per the original author's note,
# the trainer uses multiple GPUs by default when this is not set.
# Set it before the first torch.cuda call so the restriction is already in
# place when CUDA devices are enumerated.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model: architecture from the saved config, weights from the state dict.
output_config_file = './my_results/config.json'
output_model_file = 'model_save.bin'

config = BertConfig.from_json_file(output_config_file)
model = BertForSequenceClassification(config).to(device)
# map_location lets a checkpoint saved from GPU tensors load on a CPU-only machine.
state_dict = torch.load(output_model_file, map_location=device)
model.load_state_dict(state_dict)
# A model built directly from a config starts in training mode; switch to
# eval mode so dropout is disabled and predictions are deterministic.
model.eval()

# Load the data and the tokenizer.
test_dataset = Dataset.load_from_disk('./data/test_dataset/')
cache_dir="bert-base-uncased1/"
tokenizer = BertTokenizer.from_pretrained(cache_dir)

# Small batches keep inference memory usage low.
batch_size = 16

data = list(test_dataset['texts'])
data = tokenizer(data, max_length=512, truncation=True, padding='max_length', return_tensors="pt")

# Read the label column once instead of re-reading it for every batch.
# Labels are stored as two-element vectors; argmax recovers the class index.
labels = np.argmax(np.asarray(list(test_dataset['labels'])), axis=-1)

# Run inference batch by batch.
preds = []
for i in range(0, len(data['input_ids']), batch_size):
    batch_data = {k: v[i:i+batch_size].to(device) for k, v in data.items()}
    with torch.no_grad():
        batch_preds = model(**batch_data).logits
    preds.extend(np.argmax(batch_preds.cpu().numpy(), axis=-1))

preds = np.array(preds)

# Accuracy = fraction of predictions that match the labels.
accuracy = float(np.mean(preds == labels))
print(f"Accuracy: {accuracy:.2f}")



Accuracy: 0.94


In [None]:
# Store each 1-D array as a single CSV row: predictions go to output1.csv,
# labels go to output2.csv.
for csv_path, values in (('output1.csv', preds), ('output2.csv', labels)):
    with open(csv_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(values)

In [12]:
import csv
import numpy as np

# Read the predictions back; the values are taken from the first CSV row.
with open('output1.csv', 'r') as csvfile1:
    reader1 = csv.reader(csvfile1)
    rows1 = [row for row in reader1]

rows1 = np.asarray(rows1).astype(np.int32)
preds = rows1[0]

# Same procedure for the labels.
with open('output2.csv', 'r') as csvfile2:
    reader2 = csv.reader(csvfile2)
    rows2 = [row for row in reader2]

rows2 = np.asarray(rows2).astype(np.int32)
labels = rows2[0]


In [23]:
# Accuracy: number of matching predictions divided by the number of labels.
num_correct = (preds == labels).sum()
accuracy = num_correct / len(labels)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.94


#### 调试代码

In [12]:
# Debug: class indices (argmax over the label vectors) of the first three test samples.
labels = np.argmax(test_dataset[:3]['labels'],axis=-1)
labels

array([0, 0, 0])

In [21]:
# Debug: predict the first three test samples and compare with their labels.
cache_dir="bert-base-uncased/"
tokenizer = BertTokenizer.from_pretrained(cache_dir)
data = test_dataset[:3]['texts']
data = tokenizer(data, max_length=512, truncation=True, padding='max_length', return_tensors="pt")
# Inputs must be on the same device as the model, otherwise the forward pass fails.
data = data.to(model.device)
# No gradients are needed for inference.
with torch.no_grad():
    preds = model(**data)
# Move the logits to the CPU first: .numpy() does not work on CUDA tensors.
preds = np.argmax(preds.logits.cpu().numpy(),axis=-1)
preds

array([0, 0, 0])

In [15]:
# Debug: sanity-check the model on two hand-written sentences.
cache_dir="bert-base-uncased/"
tokenizer = BertTokenizer.from_pretrained(cache_dir)
data = tokenizer(['This is a good movie', 'This is a bad movie'], max_length=512, truncation=True, padding='max_length', return_tensors="pt")
# Inputs must be on the same device as the model, otherwise the forward pass fails.
data = data.to(model.device)
print(model(**data))


SequenceClassifierOutput(loss=None, logits=tensor([[ 5.9377, -5.8652],
        [-7.1596,  7.5822]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
