In [3]:
# 一、导入数据
# -- coding: utf-8 --
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from torch import nn
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import jieba
import torch
import warnings
import os
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import jieba
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
import os
from sklearn.metrics import classification_report, f1_score

warnings.filterwarnings("ignore")  # 忽略警告信息

# 数据加载和预处理
neg = pd.read_excel('data/neg.xls', header=None)
pos = pd.read_excel('data/pos.xls', header=None)

pos['words'] = pos[0].apply(lambda x: jieba.lcut(str(x)))
neg['words'] = neg[0].apply(lambda x: jieba.lcut(str(x)))

texts = np.concatenate((pos['words'], neg['words']))
labels = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))

# BERT 模型准备
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)

def encode_text(texts):
    return tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=128)

train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)

train_encodings = encode_text([' '.join(text) for text in train_texts])
val_encodings = encode_text([' '.join(text) for text in val_texts])

class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx], dtype=torch.long) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

# 计算多个指标
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    report = classification_report(labels, predictions, output_dict=True)
    accuracy = report['accuracy']
    precision = report['macro avg']['precision']
    recall = report['macro avg']['recall']
    f1 = report['macro avg']['f1-score']

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "loss": float('nan'),  # 这里可能需要根据具体需求调整
    }

# 设置优化器和学习率调整器
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True, min_lr=1e-7)

# 训练参数
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_dir='./logs',
    lr_scheduler_type="reduce_lr_on_plateau",
    metric_for_best_model="loss",
)

# 初始化 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# 开始训练
trainer.train()

# 保存模型
model.save_pretrained('bert_sentiment_model')
tokenizer.save_pretrained('bert_sentiment_model')

# 输出训练指标表格
log_history = trainer.state.log_history

# 初始化一个列表来保存每一epoch的指标
metrics = []

for log_item in log_history:
    # 确保该日志条目包含训练或评估的指标
    if 'loss' in log_item or 'eval_loss' in log_item:
        # 记录 epoch，如果有的话
        epoch = log_item.get('epoch', None)
        # 获取 Training Loss
        train_loss = log_item.get('loss', None)
        # 获取 Validation Loss
        eval_loss = log_item.get('eval_loss', None)
        # 获取其他指标
        accuracy = log_item.get('eval_accuracy', None)
        precision = log_item.get('eval_precision', None)
        recall = log_item.get('eval_recall', None)
        f1 = log_item.get('eval_f1', None)

        # 把当前 Epoch 的指标收集到字典中
        metrics.append({
            'cust-epoch': epoch,
            'train_loss': train_loss,
            'eval_loss': eval_loss,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1
        })

# 把列表转换为 Pandas DataFrame
metrics_df = pd.DataFrame(metrics)

# 使用 pd.set_option 来设置表格显示的列宽和宽度，避免省略号
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

print("Model training metrics table:")
print(metrics_df)

# 情感预测部分
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

def bert_predict(string):
    inputs = tokenizer(string, return_tensors='pt', padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits).item()
    sentiment = '积极' if predicted_class == 1 else '消极'
    print(f"{string} [{sentiment}]")
    return sentiment

string = '还不错，符合需求'
pred_result = bert_predict(string)
print(f"预测结果: {pred_result}")
warnings.filterwarnings("ignore")  # 忽略警告信息

# 加载语料库文件，并导入数据
neg = pd.read_excel('data/neg.xls', header=None)
pos = pd.read_excel('data/pos.xls', header=None)

pos.head()

# 分词处理
word_cut = lambda x: jieba.lcut(str(x))
pos['words'] = pos[0].apply(word_cut)
neg['words'] = neg[0].apply(word_cut)

# 使用 1 表示积极情绪，0 表示消极情绪，并完成数组拼接
texts = np.concatenate((pos['words'], neg['words']))
labels = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))

# 准备训练数据
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, train_size=0.1)

# 二、构建 LSTM + Attention 模型
class LSTMAttention(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, num_classes):
        super(LSTMAttention, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.attention = nn.Linear(hidden_dim, 1)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        # Embedding layer
        embedded = self.embedding(x)  # (batch_size, seq_len, embed_dim)

        # LSTM layer
        lstm_out, _ = self.lstm(embedded)  # (batch_size, seq_len, hidden_dim)

        # Attention layer
        attention_weights = torch.softmax(self.attention(lstm_out).squeeze(-1), dim=1)  # (batch_size, seq_len)
        context_vector = torch.bmm(attention_weights.unsqueeze(1), lstm_out).squeeze(1)  # (batch_size, hidden_dim)

        # Fully connected layer
        output = self.fc(context_vector)  # (batch_size, num_classes)
        return output

# 三、数据预处理
# 构建词汇表
vocab = set()
for text in texts:
    vocab.update(text)
vocab = sorted(vocab)
vocab_size = len(vocab) + 1  # 加1是为了留出一个索引给填充符

# 将文本转换为索引
def text_to_indices(text):
    return [vocab.index(word) + 1 for word in text]  # 加1是为了避免索引为0

train_indices = [text_to_indices(text) for text in train_texts]
val_indices = [text_to_indices(text) for text in val_texts]

# 填充序列
def collate_batch(batch):
    texts, labels = zip(*batch)
    texts = pad_sequence([torch.tensor(text, dtype=torch.long) for text in texts], batch_first=True)
    labels = torch.tensor(labels, dtype=torch.long)
    return texts, labels

# 将数据转换为 PyTorch 数据集
class SentimentDataset(Dataset):
    def __init__(self, indices, labels):
        self.indices = indices
        self.labels = labels

    def __getitem__(self, idx):
        return self.indices[idx], self.labels[idx]

    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_indices, train_labels)
val_dataset = SentimentDataset(val_indices, val_labels)

# 创建数据加载器
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=collate_batch)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=collate_batch)

# 四、训练 LSTM + Attention 模型
# 设置超参数
embed_dim = 64  # 减小嵌入维度
hidden_dim = 128  # 减小隐藏层维度
num_classes = 2
learning_rate = 2e-3

# 初始化模型、优化器和学习率调度器
device = torch.device("cpu")  # 切换到 CPU
model = LSTMAttention(vocab_size, embed_dim, hidden_dim, num_classes).to(device)
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True, min_lr=1e-5)
criterion = nn.CrossEntropyLoss()

# 训练函数
def train(model, train_loader, val_loader, epochs=3):
    best_loss = float('inf')
    history = []  # 用于存储每个 epoch 的指标

    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for texts, labels in train_loader:
            texts, labels = texts.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Training Loss: {total_loss / len(train_loader):.4f}")

        # 验证阶段
        model.eval()
        val_loss = 0
        all_labels = []
        all_predictions = []
        with torch.no_grad():
            for texts, labels in val_loader:
                texts, labels = texts.to(device), labels.to(device)
                outputs = model(texts)
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                _, predictions = torch.max(outputs, 1)
                all_labels.extend(labels.cpu().numpy())
                all_predictions.extend(predictions.cpu().numpy())

        # 计算验证集上的指标
        accuracy = accuracy_score(all_labels, all_predictions)
        precision = precision_score(all_labels, all_predictions, average='binary')
        recall = recall_score(all_labels, all_predictions, average='binary')
        f1 = f1_score(all_labels, all_predictions, average='binary')

        # 保存指标到历史记录
        history.append({
            "Epoch": epoch + 1,
            "Training Loss": total_loss / len(train_loader),
            "Validation Loss": val_loss / len(val_loader),
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1
        })

        print(f"Epoch {epoch+1}, Validation Loss: {val_loss / len(val_loader):.4f}, "
              f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, "
              f"Recall: {recall:.4f}, F1 Score: {f1:.4f}")

        # 保存最佳模型
        if val_loss < best_loss:
            best_loss = val_loss
            torch.save(model.state_dict(), 'lstm_attention_model.pth')
            print("Best model saved!")

        # 调整学习率
        scheduler.step(val_loss)

    # 创建表格并打印
    df = pd.DataFrame(history)
    print("\nTraining History:")
    print(df.to_markdown(index=False))

# 开始训练
train(model, train_loader, val_loader, epochs=3)

# 五、情感预测
# 加载训练好的模型
model.load_state_dict(torch.load('lstm_attention_model.pth'))
model.eval()

# 对电影评论进行情感判断
def lstm_attention_predict(string):
    # 对输入文本进行分词
    words = jieba.lcut(str(string))
    indices = text_to_indices(words)
    indices = torch.tensor(indices).unsqueeze(0).to(device)  # 添加 batch 维度

    with torch.no_grad():
        outputs = model(indices)
        _, predicted_class = torch.max(outputs, 1)

    # 输出结果
    sentiment = '积极' if predicted_class.item() == 1 else '消极'
    print(f"{string} [{sentiment}]")
    return sentiment

# 测试预测
string = '还不错，符合需求'
pred_result = lstm_attention_predict(string)
print(f"预测结果: {pred_result}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2682,,0.918522,0.919037,0.918065,0.918377
2,0.1871,,0.938181,0.93825,0.938596,0.938172
3,0.1612,,0.942918,0.942867,0.942879,0.942873


Model training metrics table:
   cust-epoch  train_loss  eval_loss  Accuracy  Precision    Recall  F1 Score
0    0.473485      0.3174        NaN       NaN        NaN       NaN       NaN
1    0.946970      0.2682        NaN       NaN        NaN       NaN       NaN
2    1.000000         NaN        NaN  0.918522   0.919037  0.918065  0.918377
3    1.420455      0.1934        NaN       NaN        NaN       NaN       NaN
4    1.893939      0.1871        NaN       NaN        NaN       NaN       NaN
5    2.000000         NaN        NaN  0.938181   0.938250  0.938596  0.938172
6    2.367424      0.1501        NaN       NaN        NaN       NaN       NaN
7    2.840909      0.1612        NaN       NaN        NaN       NaN       NaN
8    3.000000         NaN        NaN  0.942918   0.942867  0.942879  0.942873
还不错，符合需求 [积极]
预测结果: 积极
Epoch 1, Training Loss: 0.6277
Epoch 1, Validation Loss: 0.5348, Accuracy: 0.7312, Precision: 0.7183, Recall: 0.7762, F1 Score: 0.7461
Best model saved!
Epoch 2, Train

In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import jieba
import torch
from torch.optim import AdamW
import   warnings
from torch.optim.lr_scheduler import ReduceLROnPlateau
import os
from sklearn.metrics import classification_report, f1_score

warnings.filterwarnings("ignore")  # 忽略警告信息

# 数据加载和预处理
neg = pd.read_excel('data/neg.xls', header=None)
pos = pd.read_excel('data/pos.xls', header=None)

pos['words'] = pos[0].apply(lambda x: jieba.lcut(str(x)))
neg['words'] = neg[0].apply(lambda x: jieba.lcut(str(x)))

texts = np.concatenate((pos['words'], neg['words']))
labels = np.concatenate((np.ones(len(pos)), np.zeros(len(neg))))

# BERT 模型准备
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BertForSequenceClassification.from_pretrained('bert-base-chinese', num_labels=2)

def encode_text(texts):
    return tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=128)

train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)

train_encodings = encode_text([' '.join(text) for text in train_texts])
val_encodings = encode_text([' '.join(text) for text in val_texts])

class SentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx], dtype=torch.long) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SentimentDataset(train_encodings, train_labels)
val_dataset = SentimentDataset(val_encodings, val_labels)

# 计算多个指标
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)

    report = classification_report(labels, predictions, output_dict=True)
    accuracy = report['accuracy']
    precision = report['macro avg']['precision']
    recall = report['macro avg']['recall']
    f1 = report['macro avg']['f1-score']

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "loss": float('nan'),  # 这里可能需要根据具体需求调整
    }

# 设置优化器和学习率调整器
optimizer = AdamW(model.parameters(), lr=2e-5)
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=True, min_lr=1e-7)

# 训练参数
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    logging_dir='./logs',
    lr_scheduler_type="reduce_lr_on_plateau",
    metric_for_best_model="loss",
)

# 初始化 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# 开始训练
trainer.train()

# 保存模型
model.save_pretrained('bert_sentiment_model')
tokenizer.save_pretrained('bert_sentiment_model')

# 输出训练指标表格
log_history = trainer.state.log_history

# 初始化一个列表来保存每一epoch的指标
metrics = []

for log_item in log_history:
    # 确保该日志条目包含训练或评估的指标
    if 'loss' in log_item or 'eval_loss' in log_item:
        # 记录 epoch，如果有的话
        epoch = log_item.get('epoch', None)
        # 获取 Training Loss
        train_loss = log_item.get('loss', None)
        # 获取 Validation Loss
        eval_loss = log_item.get('eval_loss', None)
        # 获取其他指标
        accuracy = log_item.get('eval_accuracy', None)
        precision = log_item.get('eval_precision', None)
        recall = log_item.get('eval_recall', None)
        f1 = log_item.get('eval_f1', None)

        # 把当前 Epoch 的指标收集到字典中
        metrics.append({
            'cust-epoch': epoch,
            'train_loss': train_loss,
            'eval_loss': eval_loss,
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1 Score': f1
        })

# 把列表转换为 Pandas DataFrame
metrics_df = pd.DataFrame(metrics)

# 使用 pd.set_option 来设置表格显示的列宽和宽度，避免省略号
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

print("Model training metrics table:")
print(metrics_df)

# 情感预测部分
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

def bert_predict(string):
    inputs = tokenizer(string, return_tensors='pt', padding=True, truncation=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits).item()
    sentiment = '积极' if predicted_class == 1 else '消极'
    print(f"{string} [{sentiment}]")
    return sentiment

string = '还不错，符合需求'
pred_result = bert_predict(string)
print(f"预测结果: {pred_result}")

Building prefix dict from the default dictionary ...
Dumping model to file cache /var/folders/_w/hfqvgqcd4rj79tq5flh6k91h0000gn/T/jieba.cache
Loading model cost 0.301 seconds.
Prefix dict has been built successfully.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 