# 情感分析

### 1. 导包

In [3]:
from random import random
import os
import random
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import classification_report
from tqdm.auto import tqdm
import numpy as np
import torch
import re
from transformers import AutoConfig, AutoTokenizer, BertForSequenceClassification
from torch.utils.tensorboard import SummaryWriter
import time
import sklearn.metrics as metrics

### 2. 设置超参数

In [4]:
# --- Hyperparameters -------------------------------------------------------
# Upper bound on the tokenized sequence length.
max_length = 512
# Number of samples per batch.
batch_size = 20
# Optimizer learning rate.
learning_rate = 1e-5
# Number of passes over the training set.
epoch_num = 2
# Theme vocabulary of the corpus (an unordered set).
themes = {
    "动力", "价格", "内饰", "配置", "安全性",
    "外观", "操控", "油耗", "空间", "舒适性",
}

# --- Device ----------------------------------------------------------------
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'using device {device}')

# --- Pretrained checkpoint and its tokenizer -------------------------------
checkpoint = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

using device cuda


### 3. 设置随机种子

In [5]:
def seed_everything(seed=1029):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy and torch (CPU and all CUDA devices),
    and configures cuDNN for repeatable kernel selection.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Only affects hash randomization of interpreters started after this
    # point (e.g. worker subprocesses); the running kernel is unchanged.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may select a different convolution algorithm on each run,
    # which defeats the `deterministic` flag above, so switch it off.
    torch.backends.cudnn.benchmark = False

seed_everything(12)                                                  # fixed seed for this run

### 4. 定义Dataset

In [6]:
class ChnSentiCorp(Dataset):
    """Comment-level sentiment dataset parsed from a tagged text file.

    Every non-blank line holds a comment plus zero or more
    ``theme#polarity`` tags (for example ``价格#-1``). Each line becomes one
    sample dict with the keys:

    * ``"comment"``   - the line with all tags and spaces removed
    * ``"themes"``    - multi-hot list aligned with ``self.themes``
    * ``"sentiment"`` - 2 if the tag polarities sum to a positive number,
      0 if they sum to a negative number, 1 otherwise
    """

    def __init__(self, data_file):
        """Parse ``data_file`` eagerly and keep all samples in memory."""
        self.themes = ["动力", "价格", "内饰", "配置", "安全性", "外观", "操控", "油耗", "空间", "舒适性"]
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        """Read ``data_file`` and return ``{index: sample}`` with contiguous keys.

        Blank lines are skipped so they do not turn into empty-comment
        samples labelled as neutral. Keys always run from 0 to
        ``len(result) - 1`` so ``__getitem__`` keeps working with the integer
        indices a DataLoader produces.
        """
        # The theme is any run of non-whitespace characters before '#', so a
        # tag must be separated from the comment text by whitespace; otherwise
        # the adjacent comment text is captured (and removed) with the tag.
        theme_sentiment_pattern = re.compile(r'(\S+?)#(-?\d+)')
        Data = {}

        with open(data_file, 'rt', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue

                matches = theme_sentiment_pattern.findall(line)
                comment = theme_sentiment_pattern.sub("", line).strip()
                tagged_themes = {theme for theme, _ in matches}
                multi_hot_vector = [1 if theme in tagged_themes else 0 for theme in self.themes]
                # A line without tags sums to 0 and is therefore labelled 1.
                total_sentiment = sum(int(senti) for _, senti in matches)

                if total_sentiment > 0:
                    sentiment_label = 2
                elif total_sentiment < 0:
                    sentiment_label = 0
                else:
                    sentiment_label = 1

                # Index by the number of stored samples rather than the file
                # line number, so skipped lines leave no gaps in the keys.
                Data[len(Data)] = {
                    "comment": comment.replace(" ", ""),
                    "themes": multi_hot_vector,
                    "sentiment": sentiment_label
                }
        return Data

    def __len__(self):
        """Number of parsed samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample dict stored under integer key ``idx``."""
        return self.data[idx]

In [7]:
# Parse both splits up front; each ChnSentiCorp keeps its samples in memory.
train_data = ChnSentiCorp('data/train.txt')
test_data = ChnSentiCorp('data/test.txt')

# Report the split sizes, then show one parsed sample as a sanity check.
for split_name, split in (("training", train_data), ("test", test_data)):
    print(f"length of {split_name} set: {len(split)}")
print(train_data[0])

length of training set: 8000
length of test set: 2653
{'comment': '因为森林人即将换代，这套系统没必要装在一款即将换代的车型上，因为肯定会影响价格。', 'themes': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0], 'sentiment': 1}


### 5. 定义DataLoader

In [8]:
def collate_fn(batch_samples):
    """Collate dataset samples into one tokenized, padded batch.

    Args:
        batch_samples: Iterable of sample dicts providing ``'comment'``
            (text) and ``'sentiment'`` (integer class id).

    Returns:
        Dict with ``'input_ids'`` and ``'attention_mask'`` as produced by the
        notebook-level ``tokenizer`` (padded to the longest comment in the
        batch, truncated at ``max_length``) and ``'labels'`` as a
        ``torch.long`` tensor of the sentiment ids.
    """
    comments = [sample['comment'] for sample in batch_samples]
    sentiments = [sample['sentiment'] for sample in batch_samples]

    encoded = tokenizer(
        comments,
        max_length=max_length,
        padding=True,
        truncation=True,
        return_tensors="pt",
        return_attention_mask=True,
    )

    return {
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'labels': torch.tensor(sentiments, dtype=torch.long),
    }

# Training batches are reshuffled on every pass; evaluation keeps file order.
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=collate_fn,
)
test_dataloader = DataLoader(
    test_data,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=collate_fn,
)

In [9]:
# Pull one collated batch from the training loader to inspect its tensors.
next(iter(train_dataloader))

{'input_ids': tensor([[ 101, 1094, 6887,  ...,    0,    0,    0],
         [ 101, 4684, 2970,  ...,    0,    0,    0],
         [ 101, 2769, 4500,  ...,    0,    0,    0],
         ...,
         [ 101,  100, 4696,  ...,    0,    0,    0],
         [ 101, 1963, 3362,  ...,    0,    0,    0],
         [ 101, 2990, 6756,  ...,    0,    0,    0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 0, 1, 1])}

### 6. 配置模型

In [10]:
# The config object is loaded separately from the model; it is not passed to
# from_pretrained below.
config = AutoConfig.from_pretrained(checkpoint)

# Pretrained BERT encoder with a 3-way classification head, moved to `device`.
model = BertForSequenceClassification.from_pretrained(checkpoint, num_labels=3)
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Display the loaded checkpoint configuration.
config

BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.49.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

In [12]:
# Display the module tree of the classification model.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

### 7. 优化器

In [13]:
# Use torch's AdamW: the AdamW class exported by transformers is deprecated in
# favour of it. eps and weight_decay are pinned to the values the transformers
# class used by default (1e-6 and 0.0) so the hyperparameters stay the same;
# torch's own defaults are 1e-8 and 0.01.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=learning_rate,
    eps=1e-6,
    weight_decay=0.0,
)

# Linear decay of the learning rate to zero over all training steps, no warmup.
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=epoch_num * len(train_dataloader),
)

# TensorBoard logs go to a per-run directory named after the local start time.
writer = SummaryWriter(log_dir='sentiment_classification_logs' + '/' + time.strftime('%m-%d_%H.%M', time.localtime()))



### 8. 定义train函数

In [14]:
# Running state carried across epochs by the training cell.
total_train_step, total_train_loss = 0, 0.0   # optimizer steps taken / summed batch loss
best_f1_score = 0.0                           # best validation score seen so far
total_test_loss = 0                           # module-level placeholder for the evaluation loss

In [15]:
def train_loop(dataloader, model, optimizer, lr_scheduler, epoch, total_train_loss, total_train_step):
    """Run one training pass over ``dataloader``.

    Args:
        dataloader: Yields dicts of tensors (or lists) that are forwarded to
            ``model`` as keyword arguments; must include ``labels`` so the
            model output carries a loss.
        model: Model whose output exposes ``.loss``.
        optimizer: Optimizer stepped once per batch.
        lr_scheduler: Scheduler stepped once per batch, after the optimizer.
        epoch: Index of the current epoch; ``epoch * len(dataloader)`` is
            used as the number of steps finished before this pass
            (presumably zero-based - confirm against the caller).
        total_train_loss: Sum of batch losses accumulated before this pass.
        total_train_step: Number of optimizer steps taken before this pass.

    Returns:
        Tuple ``(total_train_loss, total_train_step)`` updated with this pass.

    Uses the notebook-level ``device`` and TensorBoard ``writer``.
    """
    finish_step_num = epoch * len(dataloader)

    model.train()
    # The context manager closes the bar even if a step raises.
    with tqdm(range(len(dataloader)), disable=False) as progress_bar:
        progress_bar.set_description(f'loss: {0:>7f}')

        for step, batch_data in enumerate(dataloader, start=1):
            batch_data = {k: torch.tensor(v).to(device) if isinstance(v, list) else v.to(device) for k, v in batch_data.items()}
            loss = model(**batch_data).loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()

            # Read the scalar once; every .item() call syncs with the device.
            batch_loss = loss.item()
            total_train_loss += batch_loss
            total_train_step += 1
            # Mean loss over the steps counted so far (previous passes included).
            progress_bar.set_description(f'loss: {total_train_loss / (finish_step_num + step):>7f}')
            progress_bar.update(1)

            if total_train_step % 100 == 0:
                print("训练次数:{}，loss:{}".format(total_train_step, batch_loss))
                writer.add_scalar("train_loss", batch_loss, total_train_step)

    return total_train_loss, total_train_step

In [16]:
def test_loop(dataloader, model, epoch):
    """Evaluate ``model`` on ``dataloader`` and report 3-class metrics.

    Args:
        dataloader: Yields dicts of tensors (or lists) forwarded to ``model``
            as keyword arguments; must include ``labels``.
        model: Model whose output exposes ``.loss`` and ``.logits``.
        epoch: Step value used for the TensorBoard scalars.

    Returns:
        The dict produced by ``classification_report(..., output_dict=True)``
        with per-class entries ``'0'``, ``'1'``, ``'2'`` plus ``'accuracy'``,
        ``'macro avg'`` and ``'weighted avg'``.

    Prints the summed batch loss and the per-class / aggregate scores, and
    logs ``test_loss`` and accuracy through the notebook-level ``writer``.
    """
    true_labels, predictions = [], []
    total_test_loss = 0

    model.eval()
    with torch.no_grad():
        for batch_data in dataloader:
            batch_data = {k: torch.tensor(v).to(device) if isinstance(v, list) else v.to(device) for k, v in batch_data.items()}
            outputs = model(**batch_data)

            true_labels += batch_data["labels"].cpu().numpy().tolist()
            predictions += outputs.logits.argmax(dim=-1).cpu().numpy().tolist()
            total_test_loss += outputs.loss.item()

    # This is the sum of the per-batch losses, not their mean.
    print("整体测试集上的Loss:{}".format(total_test_loss))
    writer.add_scalar("test_loss", total_test_loss, epoch)

    # Listing the labels guarantees the '0'/'1'/'2' entries exist even when a
    # class occurs in neither the targets nor the predictions.
    report = classification_report(
        true_labels,
        predictions,
        labels=[0, 1, 2],
        zero_division=0,
        output_dict=True,
    )
    # Depending on the scikit-learn version, an explicit label list that does
    # not exactly match the observed labels yields 'micro avg' instead of
    # 'accuracy'; with all observed labels listed the two are the same number.
    if 'accuracy' not in report:
        report['accuracy'] = report['micro avg']['f1-score']

    macro_f1 = report['macro avg']['f1-score']
    # 'weighted avg' is the support-weighted mean of the per-class F1 scores,
    # which is not micro-F1, so it is named and printed as what it is.
    weighted_f1 = report['weighted avg']['f1-score']
    accuracy = report['accuracy']
    # NOTE(review): the tag is misspelt; kept as is so the scalar name does not change.
    writer.add_scalar("test_accuarcy", accuracy, epoch)

    for display_name, class_key in (("Positive", '2'), ("Neutral", '1'), ("Negative", '0')):
        scores = report[class_key]
        print(
            f"{display_name} ({class_key}): Precision: {scores['precision'] * 100:>0.2f}"
            f" / Recall: {scores['recall'] * 100:>0.2f}"
            f" / F1: {scores['f1-score'] * 100:>0.2f}"
        )
    print(f"Accuracy: {accuracy * 100:>0.2f}")
    print(f"Macro-F1: {macro_f1 * 100:>0.2f} / Weighted-F1: {weighted_f1 * 100:>0.2f}\n")

    return report

### 9. 训练

In [17]:
# Train for `epoch_num` epochs, evaluate after each one, and save the weights
# whenever the mean of the two aggregate F1 scores improves.
for epoch in range(epoch_num):
    print(f"Epoch {epoch + 1}/{epoch_num}\n" + 30 * "-")

    total_train_loss, total_train_step = train_loop(
        train_dataloader,
        model,
        optimizer,
        lr_scheduler,
        epoch,
        total_train_loss,
        total_train_step,
    )
    valid_scores = test_loop(test_dataloader, model, epoch)

    macro_f1 = valid_scores['macro avg']['f1-score']
    # NOTE(review): this reads the 'weighted avg' entry although the variable
    # and the checkpoint file name call it micro-F1.
    micro_f1 = valid_scores['weighted avg']['f1-score']
    f1_score = (macro_f1 + micro_f1) / 2

    # No improvement over the best score so far: keep the previous checkpoint.
    if not f1_score > best_f1_score:
        continue

    best_f1_score = f1_score
    print('saving new weights...\n')
    weights_path = f'epoch_{epoch + 1}_valid_macrof1_{(macro_f1 * 100):0.3f}_microf1_{(micro_f1 * 100):0.3f}_model_weights.bin'
    torch.save(model.state_dict(), weights_path)

writer.close()
print("Done!")

Epoch 1/2
------------------------------


  0%|          | 0/400 [00:00<?, ?it/s]

训练次数:100，loss:0.8188006281852722
训练次数:200，loss:0.49138039350509644
训练次数:300，loss:0.4329761564731598
训练次数:400，loss:0.7496187090873718
整体测试集上的Loss:87.31010556221008
Positive (2): Precision: 58.05 / Recall: 31.56 / F1: 40.89
Neutral (1): Precision: 74.02 / Recall: 93.03 / F1: 82.44
Negative (0): Precision: 62.03 / Recall: 21.59 / F1: 32.03
Accuracy: 72.07
Macro-F1: 51.79 / Micro-F1: 67.91

saving new weights...

Epoch 2/2
------------------------------


  0%|          | 0/400 [00:00<?, ?it/s]

训练次数:500，loss:0.15900513529777527
训练次数:600，loss:0.40734371542930603
训练次数:700，loss:0.2783155143260956
训练次数:800，loss:0.28716492652893066
整体测试集上的Loss:88.30534793436527
Positive (2): Precision: 53.65 / Recall: 44.83 / F1: 48.84
Neutral (1): Precision: 77.31 / Recall: 86.22 / F1: 81.53
Negative (0): Precision: 53.92 / Recall: 36.34 / F1: 43.42
Accuracy: 71.81
Macro-F1: 57.93 / Micro-F1: 70.36

saving new weights...

Done!
