In [118]:
import torch
from transformers import BertForSequenceClassification
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import os
from tqdm import tqdm
from datetime import timedelta

In [119]:
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ["TF_FORCE_GPU_ALLOW_CROWTH"] = "true"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [120]:
PAD, CLS = '[PAD]', '[CLS]'


def build_dataset():
    def load_dataset(path, pad_size=32):
        contents = []
        with open(path, 'r', encoding='UTF-8') as f:
            for line in tqdm(f):
                lin = line.strip()
                if not lin:
                    continue
                content, label = line.split('\t')
                token = tokenizer.tokenize(content)
                token = [CLS] + token
                seq_len = len(token)
                mask = []
                token_ids = tokenizer.convert_tokens_to_ids(token)

                if pad_size:
                    if len(token) < pad_size:
                        mask = [1] * len(token_ids) + [0] * (pad_size - len(token))
                        token_ids += ([0] * (pad_size - len(token)))
                    else:
                        mask = [1] * pad_size
                        token_ids = token_ids[:pad_size]
                        seq_len = pad_size
                contents.append((token_ids, int(label), seq_len, mask))
        return contents

    train = load_dataset(train_path, pad_size)
    dev = load_dataset(dev_path, pad_size)
    test = load_dataset(test_path, pad_size)

    return train, dev, test

In [121]:
class DatasetIterater(object):
    def __init__(self, batches, batch_size, device):
        self.batch_size = batch_size
        self.batches = batches
        self.n_batches = len(batches) / batch_size
        self.residue = False
        if len(batches) % self.n_batches != 0:
            self.residue = True
        self.index = 0
        self.device = device

    def _to_tensor(self, datas):
        x = torch.LongTensor([_[0] for _ in datas]).to(self.device)
        y = torch.LongTensor([_[1] for _ in datas]).to(self.device)
        seq_len = torch.LongTensor([_[2] for _ in datas]).to(self.device)
        mask = torch.LongTensor([_[3] for _ in datas]).to(self.device)
        return (x, seq_len, mask), y

    def __next__(self):
        if self.residue and self.index == self.n_batches:
            batches = self.batches[self.index * self.batch_size: len(self.batches)]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches
        elif self.index >= self.n_batches:
            self.index = 0
            raise StopIteration
        else:
            batches = self.batches[self.index * self.batch_size: (self.index + 1) * self.batch_size]
            self.index += 1
            batches = self._to_tensor(batches)
            return batches

    def __iter__(self):
        return self

    def __len__(self):
        if self.residue:
            return self.n_batches + 1
        else:
            return int(self.n_batches)

In [122]:
def build_iterator(dataset):
    iter = DatasetIterater(dataset, batch_size, device)
    return iter


def get_time_dif(start_time):
    end_time = time.time()
    time.dif = end_time - start_time
    return timedelta(seconds=int(round(time.dif)))

In [123]:
train_path = './dataset/train.txt'
dev_path = './dataset/dev.txt'
test_path = './dataset/test.txt'

batch_size = 16
model_name = 'bert-base-chinese'
tokenizer = BertTokenizer.from_pretrained(model_name)

train_data, dev_data, test_data = build_dataset()
train_iter = build_iterator(train_data)
dev_iter = build_iterator(dev_data)
test_iter = build_iterator(test_data)

36000it [00:02, 17882.16it/s]
2000it [00:00, 19305.77it/s]
2000it [00:00, 19160.91it/s]


In [132]:
class Model(nn.Module):
    def __init__(self, num_classes, hidden_size, model_name):
        super(Model, self).__init__()
        # 加载预训练的BERT模型
        self.bert = BertForSequenceClassification.from_pretrained(model_name, num_labels=n_cls)

        # 设置BERT参数为可训练
        for param in self.bert.parameters():
            param.requires_grad = True

        # 最后一层全连接层，将hidden_size映射到单输出
        self.fc = nn.Linear(hidden_size, num_classes)

    # forward方法中，dataset包含 input_ids_list, seq_len, attention_mask
    def forward(self, dataset):
        input_ids_list = dataset[0]
        attention_mask = dataset[2]

        # 获取BERT的输出
        outputs = self.bert(input_ids=input_ids_list, attention_mask=attention_mask)

        # 提取池化后的向量
        # pooled_output = outputs.pooler_output

        # 通过全连接层后，使用sigmoid激活函数，得到概率值
        logits = self.fc(outputs)
        probs = torch.sigmoid(logits)
        return probs

In [129]:
import time
from torch.optim import AdamW
from transformers import get_scheduler


# 训练方法
def train(model, train_iter, num_epochs=3, learning_rate=5e-5, accumulate_steps=4):
    """
    训练模型。

    参数:
    - model: 要训练的模型。
    - train_iter: 训练数据迭代器。
    - num_epochs: 训练轮数 (默认: 3)。
    - learning_rate: 学习率 (默认: 5e-5)。
    - accumulate_steps: 梯度累积的步数 (默认: 4)。
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)  # 确保模型在正确的设备上
    model.train()  # 设置模型为训练模式

    # 优化器和学习率调度器
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

    total_steps = len(train_iter) * num_epochs
    warmup_ratio = 0.05
    warmup_steps = int(total_steps * warmup_ratio)
    scheduler = get_scheduler(
        "linear", optimizer=optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps
    )

    # 训练循环
    total_batch = 0  # 记录总的批次
    start_time = time.time()

    for epoch in range(num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, num_epochs))
        optimizer.zero_grad()

        for i, (trains, labels) in enumerate(train_iter):
            # 确保训练数据和标签在正确的设备上
            # trains, labels = trains.to(device), labels.to(device)

            # 前向传播
            outputs = model(trains)

            # 计算损失（支持梯度累积）
            loss = F.binary_cross_entropy(outputs, labels) / accumulate_steps
            loss.backward()

            # 梯度累积更新
            if (i + 1) % accumulate_steps == 0 or (i + 1) == len(train_iter):
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # 梯度裁剪
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            # 打印训练信息（每500批次打印一次）
            if total_batch % 500 == 0:
                time_dif = time.time() - start_time
                msg = 'Iter: {0:>6}, Train Loss: {1:>5.2f}, Time: {2}'
                print(msg.format(total_batch, loss.item() * accumulate_steps, time_dif))
            total_batch += 1

    print("Training complete!")

In [133]:
hidden_size = 768
n_cls = 2
random_seed = 1221
torch.manual_seed(random_seed)
pad_size = 32
model = Model(n_cls, hidden_size, model_name)
learning_rate = 5e-5
num_epochs = 3
model.to(device)
from torch import optim

optimizer = optim.AdamW(model.parameters(), lr=5e-5)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [134]:
train(model, train_iter)

Epoch [1/3]


TypeError: linear(): argument 'input' (position 1) must be Tensor, not SequenceClassifierOutput

In [53]:
def test(n_cls, model, test_iter):
    # model.load_state_dict(torch.load(config.save_path))
    model.eval()
    start_time = time.time()
    test_acc, test_loss, test_report, test_confusion = evaluate(n_cls, model, test_iter, test=True)
    msg = 'Test Loss: {0:>5.2}, Test Acc: {1:>6.2%}'
    print(msg.format(test_loss, test_acc))
    print("Precision, Recall and F1-Score...")
    print(test_report)
    print("Confusion Matrix...")
    print(test_confusion)
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)


In [54]:
def evaluate(n_cls, model, data_iter, test=False):
    model.eval()
    loss_total = 0
    predict_all = []
    label_all = []
    with torch.no_grad():
        for texts, labels in data_iter:
            # # 如果 texts 和 labels 不是张量，先转换为张量
            # if not isinstance(texts, torch.Tensor):
            #     texts = torch.tensor(texts, dtype=torch.float32)
            # if not isinstance(labels, torch.Tensor):

            #     labels = torch.tensor(labels, dtype=torch.long)
            #
            # # 将张量移动到指定设备（如 GPU）
            # texts, labels = texts.to(config.device), labels.to(config.device)

            outputs = model(texts)
            loss = F.cross_entropy(outputs, labels)
            loss_total += loss.item()
            predic = torch.max(outputs, 1)[1]
            predict_all.append(predic)
            label_all.append(labels)

    # 在 GPU 上拼接结果，并在最后转为 NumPy 数组
    predict_all = torch.cat(predict_all).cpu().numpy()
    label_all = torch.cat(label_all).cpu().numpy()

    acc = metrics.accuracy_score(label_all, predict_all)

    if test:
        report = metrics.classification_report(label_all, predict_all, target_names=n_cls, digits=4)
        confusion = metrics.confusion_matrix(label_all, predict_all)
        return acc, loss_total / len(data_iter), report, confusion

    return acc, loss_total / len(data_iter)

In [60]:
import time
from sklearn import metrics

learning_rate = 5e-5
num_epochs = 3


# 优化后的代码
def train(n_cls, model, train_iter, dev_iter, test_iter):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)  # 确保模型在正确的设备上
    model.train()  # 调整为只在外层调用一次

    start_time = time.time()
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate)

    total_steps = len(train_iter) * num_epochs
    warmup_ratio = 0.05
    warmup_steps = int(total_steps * warmup_ratio)

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    total_batch = 0
    dev_best_loss = float('inf')
    accumulate_steps = 4  # 梯度累积的步数

    for epoch in range(num_epochs):
        print('Epoch [{}/{}]'.format(epoch + 1, num_epochs))
        optimizer.zero_grad()

        for i, (trains, labels) in enumerate(train_iter):
            # 确保训练数据和标签在正确的设备上
            # trains, labels = trains.to(device), labels.to(device)

            outputs = model(trains)
            # time.sleep(0.1)  # time sleep, 减小由GPU利用率过高造成的显示器黑屏问题

            loss = F.binary_cross_entropy(outputs, labels) / accumulate_steps  # 累积梯度分摊损失
            loss.backward()

            if (i + 1) % accumulate_steps == 0 or (i + 1) == len(train_iter):
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # 梯度裁剪
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            if total_batch % 500 == 0:
                true = labels.data.cpu()
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_acc = metrics.accuracy_score(true, predic)
                dev_acc, dev_loss = evaluate(n_cls, model, dev_iter)

                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    # torch.save(model.state_dict(), config.save_path)
                    improve = '*'
                else:
                    improve = ''

                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>5.2f}, Train Acc: {2:>6.2%}, Val Loss: {3:>5.2f}, Val Acc: {4:>6.2%}, Time: {5}, {6}'
                print(
                    msg.format(total_batch, loss.item() * accumulate_steps, train_acc, dev_loss, dev_acc, time_dif,
                               improve))
            total_batch += 1

    test(n_cls, model, test_iter)


In [56]:
train(n_cls, model, train_iter, dev_iter, test_iter)

Epoch [1/3]


TypeError: linear(): argument 'input' (position 1) must be Tensor, not SequenceClassifierOutput