# 主题分析

### 1. 导包

In [22]:
from random import random
import os
import random
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup, BertPreTrainedModel
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm
import numpy as np
import torch
import re
from transformers import AutoConfig, AutoTokenizer, BertModel
from torch.utils.tensorboard import SummaryWriter
import time
import sklearn.metrics as metrics

### 2. 设置超参数

In [23]:
max_length = 512                                                     # maximum token length passed to the tokenizer
batch_size = 20                                                      # samples per batch
learning_rate = 1e-5                                                 # optimizer learning rate
epoch_num = 5                                                        # number of training epochs
# Kept as an ordered list (not a set): the names are used as `target_names` for the
# classification report, so their order must match the label columns built by the dataset.
themes = ["动力", "价格", "内饰", "配置", "安全性", "外观", "操控", "油耗", "空间", "舒适性"]

device = 'cuda' if torch.cuda.is_available() else 'cpu'              # use the GPU when one is available
print(f'using device {device}')

checkpoint = "bert-base-chinese"                                     # pretrained model checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)                # tokenizer matching the checkpoint

using device cuda


### 3. 设置随机种子

In [24]:
def seed_everything(seed=1029):
    """Seed every random number generator this notebook relies on.

    Args:
        seed: integer seed applied to Python's `random`, NumPy and PyTorch
            (CPU and CUDA); it is also stored in the PYTHONHASHSEED
            environment variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    # Ask cuDNN for deterministic kernels.
    torch.backends.cudnn.deterministic = True

seed_everything(12)                                                  # fix the random seed for this run

### 4. 定义Accuracy函数

In [25]:
def Ac(y_true, y_pred):
    """Return the accuracy of `y_pred` against `y_true` (correct / total).

    Thin wrapper around `sklearn.metrics.accuracy_score`.
    NOTE(review): per the scikit-learn documentation, for multi-label
    indicator inputs a sample counts as correct only when all of its labels
    match — confirm that subset accuracy is the intended metric.
    """
    accuracy = metrics.accuracy_score(y_true, y_pred)
    return accuracy

In [26]:
# Sanity check for Ac(): 7 of the 10 positions agree, so the expected result is 0.7.
y_true_try = torch.tensor([1,2,3,4,5,6,7,8,9,0])
y_pred_try = torch.tensor([1,2,3,4,5,6,9,9,9,9])
Ac(y_true_try, y_pred_try)

0.7

### 5. 定义predict函数

In [27]:
def predict(output, alpha=0.4):
    pred = torch.sigmoid(output)                                     # 转换成概率
    zero = torch.zeros_like(pred)                                    
    topk = torch.topk(pred, k=2, dim=1, largest=True)[1]             # k个最大概率的index
    for i, x in enumerate(topk):
        for k in x:                                                  # k个最大概率的index
            if pred[i][k] > alpha:
                zero[i][k] = 1
    return zero.cpu()

In [28]:
# Random logits for 4 samples x 6 classes, placed on `device`, to try out predict().
output_try = torch.randn(4, 6).to(device)
output_try

tensor([[-0.1320, -0.1254,  0.3443, -0.4519, -0.8888, -0.3526],
        [-1.3373,  0.5223, -0.6958, -0.0522, -0.0351,  0.5274],
        [-0.8227,  0.5942,  0.6618, -0.0125,  1.4400,  0.7946],
        [ 0.8444,  1.2668, -1.0249,  1.2336,  0.8366, -2.0645]],
       device='cuda:0')

In [29]:
# With the default arguments each row gets at most two 1s (top-2 classes above the 0.4 threshold).
predict(output_try)

tensor([[0., 1., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0., 1.],
        [0., 0., 0., 0., 1., 1.],
        [0., 1., 0., 1., 0., 0.]])

### 6. 定义Dataset

In [30]:
class ChnSentiCorp(Dataset):
    """Dataset of comments annotated with inline `theme#score` tags.

    Every line of the data file is parsed into a dict with three keys:
    `comment` (the line with tags and spaces removed), `themes` (a multi-hot
    list aligned with `self.themes`) and `sentiment` (0, 1 or 2 depending on
    the sign of the summed tag scores).
    """

    def __init__(self, data_file):
        # The position of a theme in this list is its column in the multi-hot vector.
        self.themes = ["动力", "价格", "内饰", "配置", "安全性", "外观", "操控", "油耗", "空间", "舒适性"]
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        """Read `data_file` (UTF-8 text) and return {line_index: sample_dict}."""
        tag_regex = re.compile(r'(\S+?)#(-?\d+)')
        samples = {}

        with open(data_file, 'rt', encoding='utf-8') as handle:
            for row_no, raw_line in enumerate(handle):
                text = raw_line.strip()
                tags = tag_regex.findall(text)            # list of (theme, score) pairs
                mentioned = {name for name, _ in tags}
                score_sum = sum(int(score) for _, score in tags)

                # Sign of the summed scores: negative -> 0, zero -> 1, positive -> 2.
                sentiment_label = 1 + (score_sum > 0) - (score_sum < 0)

                samples[row_no] = {
                    "comment": tag_regex.sub("", text).strip().replace(" ", ""),
                    "themes": [int(theme in mentioned) for theme in self.themes],
                    "sentiment": sentiment_label,
                }

        return samples

    def __len__(self):
        """Number of parsed lines."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample dict stored under line index `idx`."""
        return self.data[idx]

In [31]:
# Parse the training and test files into datasets.
train_data = ChnSentiCorp('data/train.txt')
test_data = ChnSentiCorp('data/test.txt')

# Report both dataset sizes and show the first parsed training sample.
print(f"length of training set: {len(train_data)}")
print(f"length of test set: {len(test_data)}")
print(train_data[0])

length of training set: 8000
length of test set: 2653
{'comment': '因为森林人即将换代，这套系统没必要装在一款即将换代的车型上，因为肯定会影响价格。', 'themes': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], 'sentiment': 1}


### 7. 定义DataLoader

In [32]:
def collate_fn(batch_samples):
    """Collate a list of dataset samples into one model-ready batch.

    Args:
        batch_samples: iterable of sample dicts providing the keys
            'comment' and 'themes'.

    Returns:
        dict with 'batch_inputs' (the tokenizer output for all comments,
        padded to the longest one, truncated to `max_length`, as PyTorch
        tensors) and 'theme_labels' (the samples' multi-hot vectors, kept as
        a plain Python list).
    """
    comments = [sample['comment'] for sample in batch_samples]
    theme_vectors = [sample['themes'] for sample in batch_samples]

    encoded = tokenizer(
        comments,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )

    return {'batch_inputs': encoded, 'theme_labels': theme_vectors}

# Both loaders batch with collate_fn; only the training loader shuffles its samples.
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [33]:
# Fetch a single collated batch to inspect its structure.
next(iter(train_dataloader))
# (len(train_dataloader) would give the number of batches per epoch.)

{'batch_inputs': {'input_ids': tensor([[ 101, 6121, 7724,  ..., 3211, 1168,  102],
         [ 101, 2769, 4638,  ...,    0,    0,    0],
         [ 101,  123, 8026,  ...,    0,    0,    0],
         ...,
         [ 101, 4958, 6444,  ...,    0,    0,    0],
         [ 101,  817, 3419,  ...,    0,    0,    0],
         [ 101, 2769, 4638,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])},
 'theme_labels': [[0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
  [0, 0, 0, 0, 0, 0, 0, 1, 1, 0],
  [0, 1, 0, 0, 0, 0, 0, 0, 0, 0

### 8. 定义Bert类

In [34]:
class BertForMultiTaskLearning(BertPreTrainedModel):
    """BERT encoder followed by a linear multi-label theme classifier.

    Args:
        config: the BERT configuration used to build the encoder.
        num_themes: number of theme classes, i.e. the width of the output layer.
    """

    def __init__(self, config, num_themes):
        super().__init__(config)
        self.bert = BertModel(config)
        self.num_themes = num_themes

        # Multi-label theme head: one logit per theme.
        self.theme_classifier = nn.Linear(self.bert.config.hidden_size, num_themes)
        # Binary cross-entropy on the raw logits (the sigmoid is applied inside the loss).
        self.theme_loss_fn = nn.BCEWithLogitsLoss()

    def forward(self, batch_inputs, theme_labels=None):
        """Run the encoder and the theme head.

        Args:
            batch_inputs: mapping of tokenizer outputs, unpacked into the BERT call.
            theme_labels: optional multi-hot label tensor with the same shape as
                the logits. When omitted (inference) no loss is computed.

        Returns:
            dict with "theme_loss" (a scalar tensor, or None when
            `theme_labels` is None) and "theme_logits".
        """
        encoder_outputs = self.bert(**batch_inputs)

        # Index 1 of the BertModel output is the pooled sequence representation.
        theme_logits = self.theme_classifier(encoder_outputs[1])

        # Labels are optional in the signature, so only compute the loss when they are given.
        theme_loss = None
        if theme_labels is not None:
            theme_loss = self.theme_loss_fn(theme_logits, theme_labels.float())

        return {
            "theme_loss": theme_loss,
            "theme_logits": theme_logits,
        }

In [35]:
# Load the checkpoint's configuration, then build the model from the pretrained weights
# and move it to `device`. `num_themes` is forwarded to the model constructor.
config = AutoConfig.from_pretrained(checkpoint)
model = BertForMultiTaskLearning.from_pretrained(checkpoint, config=config, num_themes=len(themes)).to(device)

Some weights of BertForMultiTaskLearning were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['theme_classifier.bias', 'theme_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
# Display the loaded configuration.
config

BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.49.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

In [37]:
# Display the module structure of the model.
model

BertForMultiTaskLearning(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

### 9. 优化器

In [38]:
# Optimize every model parameter with AdamW at the configured learning rate.
# NOTE(review): this `AdamW` is the one imported from `transformers`; that implementation is
# reportedly deprecated in favor of `torch.optim.AdamW` — confirm before upgrading transformers.
optimizer = AdamW(model.parameters(), lr=learning_rate)
# Linear learning-rate schedule without warmup, sized for one scheduler step per training batch
# over all epochs.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epoch_num * len(train_dataloader),
)

# TensorBoard writer; the log sub-directory is named after the current month-day_hour.minute.
writer = SummaryWriter(log_dir='themes_classification_logs' + '/' + time.strftime('%m-%d_%H.%M', time.localtime()))



### 10. 定义train函数

In [39]:
# Module-level training state, initialised before the first epoch.
total_train_step = 0       # number of optimizer steps taken so far (passed to / returned by train_loop)
total_train_loss = 0.      # running sum of per-step training losses (passed to / returned by train_loop)
best_f1_score = 0.         # best validation score seen so far; used to decide when to save weights
total_test_loss = 0        # NOTE(review): test_loop keeps its own local total_test_loss, so this global looks unused — confirm

In [40]:
def train_loop(dataloader, model, optimizer, lr_scheduler, epoch, total_train_loss, total_train_step):
    """Train `model` for one pass over `dataloader`.

    Args:
        dataloader: yields dict batches whose entries are forwarded to the model
            as keyword arguments; must contain "theme_labels".
        model: callable returning a dict with "theme_loss" and "theme_logits".
        optimizer: optimizer stepped once per batch.
        lr_scheduler: scheduler stepped once per batch, after the optimizer.
        epoch: zero-based epoch index, used to compute the running average loss.
        total_train_loss: sum of the per-step losses from previous epochs.
        total_train_step: number of steps taken in previous epochs.

    Returns:
        tuple: the updated `(total_train_loss, total_train_step)`.
    """
    num_batches = len(dataloader)
    steps_before_epoch = epoch * num_batches
    progress_bar = tqdm(range(num_batches), disable=False)
    progress_bar.set_description(f'loss: {0:>7f}')
    seen_targets, seen_preds = [], []

    model.train()
    for step, raw_batch in enumerate(dataloader, start=1):
        # Move the batch to the target device; list entries are first converted to tensors.
        batch = {}
        for key, value in raw_batch.items():
            if isinstance(value, list):
                value = torch.tensor(value)
            batch[key] = value.to(device)

        result = model(**batch)
        loss = result["theme_loss"]

        optimizer.zero_grad()
        loss.backward()

        optimizer.step()
        lr_scheduler.step()

        step_loss = loss.item()
        total_train_loss += step_loss
        total_train_step += 1

        # Accumulate targets and predictions of this epoch for the periodic accuracy log.
        seen_targets.extend(batch["theme_labels"].cpu().numpy().tolist())
        seen_preds.extend(predict(result["theme_logits"]).cpu().numpy().tolist())

        # Average loss over every step taken since the start of training.
        progress_bar.set_description(f'loss: {total_train_loss / (steps_before_epoch + step):>7f}')
        progress_bar.update(1)

        # Every 100 global steps: print the current loss and log loss / accuracy to TensorBoard.
        if total_train_step % 100 == 0:
            accuracy = Ac(seen_targets, seen_preds)
            print("训练次数:{}，loss:{}".format(total_train_step, step_loss))
            writer.add_scalar("train_loss", step_loss, total_train_step)
            writer.add_scalar("train_accuracy", accuracy, total_train_step)

    return total_train_loss, total_train_step

In [41]:
def test_loop(dataloader, model, epoch):
    """Evaluate `model` on `dataloader`, print and log the results for `epoch`.

    Args:
        dataloader: yields dict batches whose entries are forwarded to the model
            as keyword arguments; must contain "theme_labels".
        model: callable returning a dict with "theme_loss" and "theme_logits".
        epoch: value used as the TensorBoard step for the logged scalars.

    Returns:
        dict: the `classification_report(..., output_dict=True)` result.
    """
    true_labels, predictions = [], []
    total_test_loss = 0
    model.eval()
    with torch.no_grad():
        for batch_data in dataloader:
            # Move the batch to the target device; list entries are first converted to tensors.
            batch_data = {
                k: torch.tensor(v).to(device) if isinstance(v, list) else v.to(device)
                for k, v in batch_data.items()
            }
            theme_outputs = model(**batch_data)

            true_labels += batch_data["theme_labels"].cpu().numpy().tolist()
            predictions += predict(theme_outputs["theme_logits"]).cpu().numpy().tolist()
            total_test_loss += theme_outputs["theme_loss"].item()

    # The report pairs target_names with label columns by position, so prefer the
    # dataset's ordered theme list when the loader exposes one; otherwise fall back
    # to the module-level `themes`.
    dataset = getattr(dataloader, "dataset", None)
    target_names = list(getattr(dataset, "themes", themes))

    # `report` (not `metrics`) so the imported sklearn.metrics alias is not shadowed.
    # zero_division=0 yields the same scores as the default while avoiding the
    # undefined-metric warnings.
    report = classification_report(
        true_labels,
        predictions,
        target_names=target_names,
        output_dict=True,
        zero_division=0,
    )

    macro_precision = report["macro avg"]["precision"]
    macro_recall = report["macro avg"]["recall"]
    macro_f1 = report['macro avg']['f1-score']

    accuracy = Ac(true_labels, predictions)

    # total_test_loss is the sum (not the mean) of the per-batch losses.
    print("整体测试集上的Loss:{}".format(total_test_loss))
    writer.add_scalar("test_loss", total_test_loss, epoch)
    writer.add_scalar("test_accuracy", accuracy, epoch)  # tag spelling fixed (was "test_accuarcy")

    print(f"Accuracy: {accuracy * 100:>0.2f}\n")
    print(f"Recall: {macro_recall * 100:>0.2f}\n")
    print(f"Precision: {macro_precision * 100:>0.2f}\n")
    print(f"Macro-F1: {macro_f1 * 100:>0.2f}\n")

    return report

### 11. 训练

In [42]:
# Train for epoch_num epochs, evaluating on the test loader after each one.
for epoch in range(epoch_num):
    print(f"Epoch {epoch + 1}/{epoch_num}\n" + 30 * "-")
    # The running loss and step counters are passed in and reassigned so they accumulate across epochs.
    total_train_loss, total_train_step= train_loop(train_dataloader, model, optimizer, lr_scheduler, epoch, total_train_loss, total_train_step)
    valid_scores = test_loop(test_dataloader, model, epoch)
    # NOTE(review): `micro_f1` is read from the report's 'weighted avg' entry, not 'micro avg' —
    # confirm which average is intended, since the checkpoint filename labels it "microf1".
    macro_f1, micro_f1 = valid_scores['macro avg']['f1-score'], valid_scores['weighted avg']['f1-score']
    # Model selection score: mean of the two F1 values above.
    f1_score = (macro_f1 + micro_f1) / 2
    if f1_score > best_f1_score:
        best_f1_score = f1_score
        print('saving new weights...\n')
        # Save the state dict to the working directory, with the epoch and both scores in the file name.
        torch.save(
            model.state_dict(),
            f'epoch_{epoch + 1}_valid_macrof1_{(macro_f1 * 100):0.3f}_microf1_{(micro_f1 * 100):0.3f}_model_weights.bin'
        )

# Close the TensorBoard writer once training is finished.
writer.close()
print("Done!")

Epoch 1/5
------------------------------


  0%|          | 0/400 [00:00<?, ?it/s]

训练次数:100，loss:0.34327030181884766
训练次数:200，loss:0.2552517354488373
训练次数:300，loss:0.20409934222698212
训练次数:400，loss:0.15108773112297058


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


整体测试集上的Loss:19.401519365608692
Accuracy: 78.70

Recall: 77.41

Precision: 90.41

Macro-F1: 82.94

saving new weights...

Epoch 2/5
------------------------------


  0%|          | 0/400 [00:00<?, ?it/s]

训练次数:500，loss:0.1235220655798912
训练次数:600，loss:0.14323262870311737
训练次数:700，loss:0.09380602836608887
训练次数:800，loss:0.13543227314949036


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


整体测试集上的Loss:13.763881355524063
Accuracy: 82.40

Recall: 85.24

Precision: 89.59

Macro-F1: 87.22

saving new weights...

Epoch 3/5
------------------------------


  0%|          | 0/400 [00:00<?, ?it/s]

训练次数:900，loss:0.09821362048387527
训练次数:1000，loss:0.1551397144794464
训练次数:1100，loss:0.10701768845319748
训练次数:1200，loss:0.09039012342691422


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


整体测试集上的Loss:11.97926040366292
Accuracy: 83.19

Recall: 87.49

Precision: 88.49

Macro-F1: 87.91

saving new weights...

Epoch 4/5
------------------------------


  0%|          | 0/400 [00:00<?, ?it/s]

训练次数:1300，loss:0.059925761073827744
训练次数:1400，loss:0.10005031526088715
训练次数:1500，loss:0.06047553941607475
训练次数:1600，loss:0.1033034697175026


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


整体测试集上的Loss:11.21520533412695
Accuracy: 83.75

Recall: 88.33

Precision: 88.74

Macro-F1: 88.42

saving new weights...

Epoch 5/5
------------------------------


  0%|          | 0/400 [00:00<?, ?it/s]

训练次数:1700，loss:0.08307182043790817
训练次数:1800，loss:0.05157347396016121
训练次数:1900，loss:0.058793261647224426
训练次数:2000，loss:0.12694351375102997


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


整体测试集上的Loss:11.037318212911487
Accuracy: 83.75

Recall: 88.90

Precision: 88.17

Macro-F1: 88.41

saving new weights...

Done!


### 12. 预测新comment