## 1 数据预处理

In [1]:
from datasets import Dataset
from transformers import RobertaTokenizer
import os

In [9]:
# 载入原始数据
def load_data(base_path):
    paths = os.listdir(base_path)
    result = []
    for path in paths:
        with open(os.path.join(base_path, path), 'r', encoding='utf-8') as f:
            result.append(f.readline())
    return result

# 读入数据并转化为datasets.Dataset
def get_dataset(base_path):
		# 为了展示方便，这里只取前3个数据，真实使用需要删掉切片操作
    pos_data = load_data(os.path.join(base_path, 'pos'))
    neg_data = load_data(os.path.join(base_path, 'neg'))
    
		# 列表合并
    texts = pos_data + neg_data
		# 生成标签，其中使用 '1.' 和 '0.' 是因为需要转化为浮点数，要不然模型训练时会报错
    labels = [[1., 0.]]*len(pos_data) + [[0., 1.]] * len(neg_data)
    dataset = Dataset.from_dict({'texts':texts, 'labels':labels})
    return dataset

# 加载数据
train_dataset = get_dataset('../aclImdb/train/')
test_dataset = get_dataset('../aclImdb/test/')

In [10]:
print(train_dataset)


Dataset({
    features: ['texts', 'labels'],
    num_rows: 25000
})


In [11]:
# 载入文本标记器
# cache_dir是预训练模型的地址
cache_dir="roberta-base1/"
tokenizer = RobertaTokenizer.from_pretrained(cache_dir)

# 将数据转化为模型可以接受的格式
# 设置最大长度
MAX_LENGTH = 512

# 使用文本标记器对texts进行编码
train_dataset = train_dataset.map(lambda e: tokenizer(e['texts'], truncation=True, padding='max_length', max_length=MAX_LENGTH), batched=True)
test_dataset = test_dataset.map(lambda e: tokenizer(e['texts'], truncation=True, padding='max_length', max_length=MAX_LENGTH), batched=True)

# 将数据保存到本地
train_dataset.save_to_disk('./data/train_dataset')
test_dataset.save_to_disk('./data/test_dataset')


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/25000 [00:00<?, ? examples/s]

In [12]:
print(train_dataset.features)

{'texts': Value(dtype='string', id=None), 'labels': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None), 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}


## 2 训练模型

In [13]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer, Trainer, TrainingArguments, RobertaConfig
import torch
from datasets import Dataset
import json
import os
# 设定使用的GPU编号，也可以不设置，但trainer会默认使用多GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "0"


In [14]:
# 将num_labels设置为2，因为我们训练的任务为2分类
model = RobertaForSequenceClassification.from_pretrained('roberta-base1/', num_labels=2)

# 加载处理好的数据
train_dataset = Dataset.load_from_disk('./data/train_dataset/')
test_dataset = Dataset.load_from_disk('./data/test_dataset/')

# 冻结BERT参数
for param in model.base_model.parameters():
    param.requires_grad = False

# 训练超参配置
training_args = TrainingArguments(
    output_dir='./my_results',          # output directory 结果输出地址
    num_train_epochs=10,              # total # of training epochs 训练总批次
    per_device_train_batch_size=32,  # batch size per device during training 训练批大小
    per_device_eval_batch_size=32,   # batch size for evaluation 评估批大小
    logging_dir='./my_logs',            # directory for storing logs 日志存储位置
)

# 创建Trainer
trainer = Trainer(
    model=model.to('cuda'),              # the instantiated 🤗 Transformers model to be trained 需要训练的模型
    args=training_args,                  # training arguments, defined above 训练参数
    train_dataset=train_dataset,         # training dataset 训练集
    eval_dataset=test_dataset,           # evaluation dataset 测试集
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base1/ and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [15]:
# 开始训练
trainer.train()

# 开始评估模型
trainer.evaluate()


Step,Training Loss
500,0.6755
1000,0.6197
1500,0.552
2000,0.5027
2500,0.468
3000,0.4482
3500,0.4417
4000,0.4216
4500,0.4209
5000,0.4176


{'eval_loss': 0.3427954614162445,
 'eval_runtime': 162.1998,
 'eval_samples_per_second': 154.131,
 'eval_steps_per_second': 4.821,
 'epoch': 10.0}

In [18]:
# 保存模型 会保存到配置的output_dir处
trainer.save_model()


In [19]:
torch.save(model.state_dict(), 'model_save.bin')

In [16]:
# 将num_labels设置为2，因为我们训练的任务为2分类
model = RobertaForSequenceClassification.from_pretrained('roberta-base1/', num_labels=2)

# 加载处理好的数据
train_dataset = Dataset.load_from_disk('./data/train_dataset/')
test_dataset = Dataset.load_from_disk('./data/test_dataset/')

# 冻结BERT参数
# for param in model.base_model.parameters():
#     param.requires_grad = False

# 训练超参配置
training_args = TrainingArguments(
    output_dir='./my_results1',          # output directory 结果输出地址
    num_train_epochs=10,              # total # of training epochs 训练总批次
    per_device_train_batch_size=32,  # batch size per device during training 训练批大小
    per_device_eval_batch_size=32,   # batch size for evaluation 评估批大小
    logging_dir='./my_logs1',            # directory for storing logs 日志存储位置
)

# 创建Trainer
trainer = Trainer(
    model=model.to('cuda'),              # the instantiated 🤗 Transformers model to be trained 需要训练的模型
    args=training_args,                  # training arguments, defined above 训练参数
    train_dataset=train_dataset,         # training dataset 训练集
    eval_dataset=test_dataset,           # evaluation dataset 测试集
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base1/ and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [20]:
# 开始训练
trainer.train()
# 开始评估模型
trainer.evaluate()

Step,Training Loss
500,0.2583
1000,0.1929
1500,0.1565
2000,0.1156
2500,0.1017
3000,0.0693
3500,0.0527
4000,0.0413
4500,0.0341
5000,0.0239


{'eval_loss': 0.32094162702560425,
 'eval_runtime': 162.4909,
 'eval_samples_per_second': 153.855,
 'eval_steps_per_second': 4.813,
 'epoch': 10.0}

In [21]:
# 保存模型 会保存到配置的output_dir处
trainer.save_model()
torch.save(model.state_dict(), 'model_save1.bin')

In [3]:
output_config_file = './my_results/config.json'
output_model_file = 'model_save.bin'

config = RobertaConfig.from_json_file(output_config_file)
model = RobertaForSequenceClassification(config)
state_dict = torch.load(output_model_file)
model.load_state_dict(state_dict)


<All keys matched successfully>

In [15]:
cache_dir="bert-base-uncased/"
tokenizer = RobertaTokenizer.from_pretrained(cache_dir)
data = tokenizer(['This is a good movie', 'This is a bad movie'], max_length=512, truncation=True, padding='max_length', return_tensors="pt")
print(model(**data))


SequenceClassifierOutput(loss=None, logits=tensor([[ 5.9377, -5.8652],
        [-7.1596,  7.5822]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)


## 模型准确率评估

In [None]:
import torch
from datasets import Dataset
import json
import os
import numpy as np
import csv

# Set the device to use the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 设定使用的GPU编号，也可以不设置，但trainer会默认使用多GPU
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# 加载模型
output_config_file = './my_results/config.json'
output_model_file = 'model_save.bin'

config = BertConfig.from_json_file(output_config_file)
model = BertForSequenceClassification(config).to(device)
state_dict = torch.load(output_model_file)
model.load_state_dict(state_dict)

# 加载数据
test_dataset = Dataset.load_from_disk('./data/test_dataset/')
cache_dir="bert-base-uncased1/"
tokenizer = BertTokenizer.from_pretrained(cache_dir)

# 降低批处理大小和序列长度
batch_size = 16

data = test_dataset['texts']
data = tokenizer(data, max_length=512, truncation=True, padding='max_length', return_tensors="pt")

# 使用较小批次进行推理
preds = []
labels = []
for i in range(0, len(data['input_ids']), batch_size):
    batch_data = {k: v[i:i+batch_size].to(device) for k, v in data.items()}
    with torch.no_grad():
        batch_preds = model(**batch_data).logits
    preds.extend(np.argmax(batch_preds.detach().cpu().numpy(), axis=-1))
    labels.extend(np.argmax(test_dataset['labels'][i:i+batch_size], axis=-1))

preds = np.array(preds)
labels = np.array(labels)


# 将一维数组转换为CSV文件中的单列
with open('output1.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(preds)  # 写入头部

# 将一维数组转换为CSV文件中的单列
with open('output2.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(labels)  # 写入头部


In [24]:
import torch
from datasets import Dataset
import json
import os
import numpy as np
import csv

In [25]:
# Set the device to use the GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# 设定使用的GPU编号，也可以不设置，但trainer会默认使用多GPU
# os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# 降低批处理大小和序列长度
batch_size = 16

data = test_dataset['texts']
data = tokenizer(data, max_length=512, truncation=True, padding='max_length', return_tensors="pt")

# 使用较小批次进行推理
preds = []
labels = []
for i in range(0, len(data['input_ids']), batch_size):
    batch_data = {k: v[i:i+batch_size].to(device) for k, v in data.items()}
    with torch.no_grad():
        batch_preds = model(**batch_data).logits
    preds.extend(np.argmax(batch_preds.detach().cpu().numpy(), axis=-1))
    labels.extend(np.argmax(test_dataset['labels'][i:i+batch_size], axis=-1))

preds = np.array(preds)
labels = np.array(labels)


# 将一维数组转换为CSV文件中的单列
with open('output1.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(preds)  # 写入头部

# 将一维数组转换为CSV文件中的单列
with open('output2.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(labels)  # 写入头部


In [28]:
# 计算准确率
accuracy = sum(preds == labels) / len(labels)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.95


In [26]:
preds

array([0, 1, 0, ..., 1, 1, 1])

In [27]:
labels

array([0, 0, 0, ..., 1, 1, 1])