In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import jieba
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score, mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm
  re_han_default = re.compile("([\u4E00-\u9FD5a-zA-Z0-9+#&\._%\-]+)", re.U)
  re_skip_default = re.compile("(\r\n|\s)", re.U)
  re_skip = re.compile("([a-zA-Z0-9]+(?:\.\d+)?%?)")


In [2]:
# ===================== 1. 配置参数 =====================
class Config:
    bert_path = 'bert-base-chinese'  # 预训练BERT路径（自动下载）
    max_seq_len = 64  # 文本最大长度
    batch_size = 32
    learning_rate = 1e-5
    bert_lr = 1e-6  # BERT微调学习率
    num_epochs = 10
    weight_decay = 1e-4
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    gender_weight = 1.0  # 性别损失权重
    fans_weight = 0.5    # 粉丝数损失权重
    location_weight = 1.2# 位置损失权重

config = Config()

In [3]:
# ===================== 2. 数据预处理 =====================
def load_data(user_path, weibo_path):
    """加载并关联用户数据和微博数据"""
    # 加载数据
    user_df = pd.read_excel(user_path)
    weibo_df = pd.read_excel(weibo_path)

    # 数据清洗：只保留有微博文本、性别、粉丝数、位置的样本
    user_df = user_df[user_df['性别'].isin(['f', 'm'])].copy()
    user_df = user_df.dropna(subset=['位置', '粉丝数']).copy()
    weibo_df = weibo_df.dropna(subset=['文本内容']).copy()

    # 关联数据（一个用户可能有多条微博，取第一条有效微博）
    weibo_df = weibo_df.groupby('uid').first().reset_index()  # 每个用户保留一条微博
    data_df = pd.merge(weibo_df, user_df, on='uid', how='inner')

    # 处理目标变量
    # 2.1 性别：f→0，m→1
    data_df['gender_label'] = (data_df['性别'] == 'm').astype(int)

    # 2.2 粉丝数：对数变换
    data_df['fans_label'] = np.log1p(data_df['粉丝数'])  # log(粉丝数+1)

    # 2.3 位置：合并细分地区，编码标签
    data_df['location_clean'] = data_df['位置'].str.extract(r'([^ ]+)')[0]  # 提取省份/国家
    data_df['location_clean'] = data_df['location_clean'].replace(['海外', '其他'], ['海外地区', '其他地区'])
    # 过滤低频类别（样本数≥5）
    location_counts = data_df['location_clean'].value_counts()
    valid_locations = location_counts[location_counts >= 5].index.tolist()
    data_df = data_df[data_df['location_clean'].isin(valid_locations)].copy()

    # 位置标签编码
    label_encoder = LabelEncoder()
    data_df['location_label'] = label_encoder.fit_transform(data_df['location_clean'])

    return data_df, label_encoder

def clean_text(text):
    """清洗微博文本"""
    text = str(text)
    # 去除@用户、话题、URL、特殊符号
    text = re.sub(r'@[^ ]+', '', text)
    text = re.sub(r'#.*?#', '', text)
    text = re.sub(r'http[s]?://\S+', '', text)
    text = re.sub(r'[^\u4e00-\u9fa5a-zA-Z0-9\s]', '', text)
    # 分词
    words = jieba.lcut(text.strip())
    return ' '.join(words)


In [4]:
# ===================== 3. 数据集类 =====================
class WeiboDataset(Dataset):
    def __init__(self, texts, gender_labels, fans_labels, location_labels, tokenizer, max_seq_len):
        self.texts = texts
        self.gender_labels = gender_labels
        self.fans_labels = fans_labels
        self.location_labels = location_labels
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # 文本编码
        encoding = self.tokenizer(
            self.texts[idx],
            padding='max_length',
            truncation=True,
            max_length=self.max_seq_len,
            return_tensors='pt'
        )
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        # 标签转换为tensor
        gender = torch.tensor(self.gender_labels[idx], dtype=torch.long)
        fans = torch.tensor(self.fans_labels[idx], dtype=torch.float32)
        location = torch.tensor(self.location_labels[idx], dtype=torch.long)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'gender': gender,
            'fans': fans,
            'location': location
        }

In [5]:
# ===================== 4. 多输出模型定义 =====================
class MultiOutputBERT(nn.Module):
    def __init__(self, bert_path, num_location_classes):
        super().__init__()
        # 共享BERT编码器
        self.bert = BertModel.from_pretrained(bert_path, output_hidden_states=True)
        self.bert_hidden_size = self.bert.config.hidden_size

        # 冻结BERT前6层（可选，根据数据量调整）
        for param in list(self.bert.parameters())[:12]:
            param.requires_grad = False

        # 性别输出头（二分类）
        self.gender_head = nn.Sequential(
            nn.Linear(self.bert_hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 2)
        )

        # 粉丝数输出头（回归）
        self.fans_head = nn.Sequential(
            nn.Linear(self.bert_hidden_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 1)
        )

        # 位置输出头（多分类）
        self.location_head = nn.Sequential(
            nn.Linear(self.bert_hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, num_location_classes)
        )

    def forward(self, input_ids, attention_mask):
        # BERT编码：取[CLS] token的输出
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_embedding = outputs.last_hidden_state[:, 0, :]  # (batch_size, hidden_size)

        # 三个任务输出
        gender_logits = self.gender_head(cls_embedding)
        fans_pred = self.fans_head(cls_embedding).squeeze(-1)  # (batch_size,)
        location_logits = self.location_head(cls_embedding)

        return gender_logits, fans_pred, location_logits


In [6]:
# ===================== 5. 训练与评估函数 =====================
def train_model(model, train_loader, val_loader, optimizer, criterion_gender, criterion_fans, criterion_location, config):
    model.to(config.device)
    best_val_loss = float('inf')

    for epoch in range(config.num_epochs):
        # 训练阶段
        model.train()
        train_loss = 0.0
        for batch in train_loader:
            # 数据移到设备
            input_ids = batch['input_ids'].to(config.device)
            attention_mask = batch['attention_mask'].to(config.device)
            gender_labels = batch['gender'].to(config.device)
            fans_labels = batch['fans'].to(config.device)
            location_labels = batch['location'].to(config.device)

            # 前向传播
            optimizer.zero_grad()
            gender_logits, fans_pred, location_logits = model(input_ids, attention_mask)

            # 计算损失
            loss_gender = criterion_gender(gender_logits, gender_labels)
            loss_fans = criterion_fans(fans_pred, fans_labels)
            loss_location = criterion_location(location_logits, location_labels)
            total_loss = (config.gender_weight * loss_gender +
                          config.fans_weight * loss_fans +
                          config.location_weight * loss_location)

            # 反向传播
            total_loss.backward()
            optimizer.step()

            train_loss += total_loss.item() * input_ids.size(0)

        train_loss /= len(train_loader.dataset)

        # 验证阶段
        val_metrics = evaluate_model(model, val_loader, criterion_gender, criterion_fans, criterion_location, config)
        val_loss = val_metrics['total_loss']

        # 保存最优模型
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'multi_output_bert_best.pth')
            print(f'Epoch {epoch+1} | 最优模型保存，验证总损失：{val_loss:.4f}')

        # 打印日志
        print(f'''Epoch {epoch+1}/{config.num_epochs}
        训练总损失：{train_loss:.4f}
        验证总损失：{val_loss:.4f}
        性别准确率：{val_metrics['gender_acc']:.4f} | 性别F1：{val_metrics['gender_f1']:.4f}
        粉丝数RMSE：{val_metrics['fans_rmse']:.4f} | 粉丝数R²：{val_metrics['fans_r2']:.4f}
        位置准确率：{val_metrics['location_acc']:.4f}''')

def evaluate_model(model, dataloader, criterion_gender, criterion_fans, criterion_location, config):
    model.eval()
    total_loss = 0.0

    # 存储预测结果和真实标签
    gender_preds, gender_trues = [], []
    fans_preds, fans_trues = [], []
    location_preds, location_trues = [], []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(config.device)
            attention_mask = batch['attention_mask'].to(config.device)
            gender_labels = batch['gender'].to(config.device)
            fans_labels = batch['fans'].to(config.device)
            location_labels = batch['location'].to(config.device)

            # 前向传播
            gender_logits, fans_pred, location_logits = model(input_ids, attention_mask)

            # 计算损失
            loss_gender = criterion_gender(gender_logits, gender_labels)
            loss_fans = criterion_fans(fans_pred, fans_labels)
            loss_location = criterion_location(location_logits, location_labels)
            batch_loss = (config.gender_weight * loss_gender +
                          config.fans_weight * loss_fans +
                          config.location_weight * loss_location)
            total_loss += batch_loss.item() * input_ids.size(0)

            # 收集预测结果（还原对数变换）
            gender_pred = torch.argmax(gender_logits, dim=1).cpu().numpy()
            location_pred = torch.argmax(location_logits, dim=1).cpu().numpy()

            gender_preds.extend(gender_pred)
            gender_trues.extend(gender_labels.cpu().numpy())
            fans_preds.extend(np.expm1(fans_pred.cpu().numpy()))  # 还原：exp(x)-1
            fans_trues.extend(np.expm1(fans_labels.cpu().numpy()))
            location_preds.extend(location_pred)
            location_trues.extend(location_labels.cpu().numpy())

    # 计算评估指标
    total_loss /= len(dataloader.dataset)
    gender_acc = accuracy_score(gender_trues, gender_preds)
    gender_f1 = f1_score(gender_trues, gender_preds, average='binary')
    fans_rmse = np.sqrt(mean_squared_error(fans_trues, fans_preds))
    fans_r2 = r2_score(fans_trues, fans_preds)
    location_acc = accuracy_score(location_trues, location_preds)

    return {
        'total_loss': total_loss,
        'gender_acc': gender_acc,
        'gender_f1': gender_f1,
        'fans_rmse': fans_rmse,
        'fans_r2': fans_r2,
        'location_acc': location_acc
    }


In [7]:
# ===================== 6. 推理函数 =====================
def infer(model, tokenizer, text, label_encoder, config):
    """单条微博文本推理"""
    model.eval()
    # 文本预处理和编码
    clean_txt = clean_text(text)
    encoding = tokenizer(
        clean_txt,
        padding='max_length',
        truncation=True,
        max_length=config.max_seq_len,
        return_tensors='pt'
    )
    input_ids = encoding['input_ids'].to(config.device)
    attention_mask = encoding['attention_mask'].to(config.device)

    # 推理
    with torch.no_grad():
        gender_logits, fans_pred, location_logits = model(input_ids, attention_mask)

    # 结果解析
    gender_prob = torch.softmax(gender_logits, dim=1).cpu().numpy()[0]
    gender = '男性' if gender_prob[1] > 0.5 else '女性'
    gender_confidence = max(gender_prob)

    fans = int(np.expm1(fans_pred.cpu().numpy()[0]))  # 还原真实粉丝数

    location_prob = torch.softmax(location_logits, dim=1).cpu().numpy()[0]
    location_idx = np.argmax(location_prob)
    location = label_encoder.inverse_transform([location_idx])[0]
    location_confidence = location_prob[location_idx]

    return {
        '性别': gender,
        '性别置信度': f'{gender_confidence:.2f}',
        '预测粉丝数': fans,
        '位置': location,
        '位置置信度': f'{location_confidence:.2f}'
    }


In [10]:


# ===================== 7. 主函数（执行流程） =====================
if __name__ == '__main__':
    # 1. 加载和预处理数据
    user_path = r'datasets/Weibo/用户信息.xlsx'  # 替换为你的用户数据路径
    weibo_path = r'datasets/Weibo/用户微博总.xlsx'  # 替换为你的微博数据路径
    data_df, label_encoder = load_data(user_path, weibo_path)

    # 清洗文本
    data_df['clean_text'] = data_df['文本内容'].apply(clean_text)

    # 划分数据集（按用户ID划分，避免数据泄露）
    train_df, val_df = train_test_split(data_df, test_size=0.2, random_state=42, stratify=data_df['gender_label'])

    # 2. 初始化Tokenizer
    tokenizer = BertTokenizer.from_pretrained(config.bert_path)

    # 3. 创建数据集和DataLoader
    train_dataset = WeiboDataset(
        texts=train_df['clean_text'].tolist(),
        gender_labels=train_df['gender_label'].tolist(),
        fans_labels=train_df['fans_label'].tolist(),
        location_labels=train_df['location_label'].tolist(),
        tokenizer=tokenizer,
        max_seq_len=config.max_seq_len
    )
    val_dataset = WeiboDataset(
        texts=val_df['clean_text'].tolist(),
        gender_labels=val_df['gender_label'].tolist(),
        fans_labels=val_df['fans_label'].tolist(),
        location_labels=val_df['location_label'].tolist(),
        tokenizer=tokenizer,
        max_seq_len=config.max_seq_len
    )

    train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config.batch_size, shuffle=False)

    # 4. 初始化模型和损失函数
    num_location_classes = len(label_encoder.classes_)
    model = MultiOutputBERT(config.bert_path, num_location_classes)

    # 损失函数
    criterion_gender = nn.CrossEntropyLoss()
    criterion_fans = nn.MSELoss()
    criterion_location = nn.CrossEntropyLoss()

    # 优化器（分层学习率）
    param_groups = [
        {'params': model.bert.parameters(), 'lr': config.bert_lr},
        {'params': model.gender_head.parameters(), 'lr': config.learning_rate},
        {'params': model.fans_head.parameters(), 'lr': config.learning_rate},
        {'params': model.location_head.parameters(), 'lr': config.learning_rate}
    ]
    optimizer = optim.AdamW(param_groups, weight_decay=config.weight_decay)

    # 5. 训练模型
    print(f'开始训练，设备：{config.device}')
    train_model(model, train_loader, val_loader, optimizer, criterion_gender, criterion_fans, criterion_location, config)

    # 6. 加载最优模型进行推理
    model.load_state_dict(torch.load('multi_output_bert_best.pth'))
    model.to(config.device)

    # 测试推理
    test_text = '今天去故宫玩了，人好多呀，风景超美～'  # 测试文本
    result = infer(model, tokenizer, test_text, label_encoder, config)
    print('\n推理结果：')
    for k, v in result.items():
        print(f'{k}：{v}')

开始训练，设备：cuda
Epoch 1 | 最优模型保存，验证总损失：24.3285
Epoch 1/10
        训练总损失：25.2852
        验证总损失：24.3285
        性别准确率：0.6860 | 性别F1：0.2895
        粉丝数RMSE：1817967.1433 | 粉丝数R²：-0.0224
        位置准确率：0.0349
Epoch 2 | 最优模型保存，验证总损失：21.8934
Epoch 2/10
        训练总损失：23.0303
        验证总损失：21.8934
        性别准确率：0.6977 | 性别F1：0.0000
        粉丝数RMSE：1817966.9954 | 粉丝数R²：-0.0224
        位置准确率：0.0407
Epoch 3 | 最优模型保存，验证总损失：19.3802
Epoch 3/10
        训练总损失：20.7798
        验证总损失：19.3802
        性别准确率：0.6977 | 性别F1：0.0000
        粉丝数RMSE：1817966.6986 | 粉丝数R²：-0.0224
        位置准确率：0.1628
Epoch 4 | 最优模型保存，验证总损失：16.8683
Epoch 4/10
        训练总损失：18.4368
        验证总损失：16.8683
        性别准确率：0.6977 | 性别F1：0.0000
        粉丝数RMSE：1817966.0899 | 粉丝数R²：-0.0224
        位置准确率：0.2035
Epoch 5 | 最优模型保存，验证总损失：14.5154
Epoch 5/10
        训练总损失：16.1821
        验证总损失：14.5154
        性别准确率：0.6977 | 性别F1：0.0000
        粉丝数RMSE：1817964.7274 | 粉丝数R²：-0.0224
        位置准确率：0.2035
Epoch 6 | 最优模型保存，验证总损失：12.6225
Epoch 6/10
        训练