<a href="https://colab.research.google.com/github/Thomas-Land/car/blob/main/%E6%B1%BD%E8%BD%A6%E7%94%A8%E6%88%B7%E6%BB%A1%E6%84%8F%E5%BA%A6%E5%B1%82%E6%AC%A1%E5%8C%96%E6%B3%A8%E6%84%8F%E5%8A%9B%E6%A8%A1%E5%9E%8B_(%E4%BB%BB%E5%8A%A11_%E5%8A%A8%E6%80%81%E5%9B%A0%E5%AD%90%E5%88%86%E7%BB%84).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore', category=FutureWarning) # Ignore some pandas warnings

# --- 1. Data loading and structure parsing ---
# NOTE: this is an absolute path on a local Windows drive; point it at the
# workbook's real location when running elsewhere (for example on Colab).
data_file = 'D:/新建文件夹/2/案例game/案例第一题数据.xlsx'

# Read the three sheets with a single call: passing a list of sheet names makes
# pandas open and parse the workbook once and return a {sheet name: DataFrame}
# dict, instead of re-opening the same file for every sheet.
_sheets = pd.read_excel(data_file, sheet_name=['数据', '因子说明', '结构说明'])
data_df = _sheets['数据']              # survey data (features + target)
factor_desc_df = _sheets['因子说明']    # factor description table
structure_df = _sheets['结构说明']      # structure description table

print("数据概览 (前5行):")
print(data_df.head())
print(f"\n数据形状: {data_df.shape}")
print("\n因子说明:")
print(factor_desc_df.head())
print("\n结构说明:")
print(structure_df)

# --- Build the factor groups dynamically ---
print("\n动态构建因子分组:")
high_level_factors_map = {}
try:
    # The factor-description table is assumed to have a '一级指标' column
    # (high-level factor name) and a '变量名' column whose values are assumed to
    # be the column names of the data table. Change the two names below if the
    # table uses different headers.
    required_cols_factor = ['一级指标', '变量名']
    if not set(required_cols_factor).issubset(factor_desc_df.columns):
        print(f"错误: '因子说明.csv' 文件缺少必要的列: {required_cols_factor}。请检查文件。")
        # Abort with a non-zero status. SystemExit is raised directly because
        # the site-provided exit() helper is meant for interactive sessions;
        # SystemExit is not an Exception subclass, so the handler below does
        # not swallow it.
        raise SystemExit(1)

    # Group by high-level factor and collect each group's variable names.
    grouped = factor_desc_df.groupby('一级指标')['变量名'].apply(list)
    high_level_factors_map = grouped.to_dict()

    print("根据 '因子说明.csv' 构建的初步分组:")
    for name, cols in high_level_factors_map.items():
        print(f"- {name}: 找到了 {len(cols)} 个变量名 (例如: {cols[:3]}...)")

except Exception as e:
    # Deliberate best-effort: any failure while grouping falls back to a
    # hard-coded layout rather than stopping the run.
    print(f"错误: 从 '因子说明.csv' 构建分组时出错: {e}")
    print("将尝试使用之前的假设性分组，但这很可能不正确。")
    # Fallback to the earlier assumed grouping (likely incorrect).
    high_level_factors_map = {
        '质量可靠性': [f'X{i}' for i in range(1, 12)],
        '性能设计': [f'X{i}' for i in range(12, 139)],
        '销售服务': [f'S{i}' for i in range(1, 85)],
        '售后服务': [f'A{i}' for i in range(1, 59)]
    }


# Keep only the group members that really exist as columns of the data table
# (works for both the dynamic mapping and the fallback mapping).
valid_factor_groups = {}
all_low_level_columns = []
print("\n检查并过滤因子分组中的列:")

# Column labels are compared as strings. They do not change inside the loop, so
# they are computed once here; the set gives O(1) membership tests.
data_columns_str = [str(col) for col in data_df.columns]
_data_columns_lookup = set(data_columns_str)
# Tracks what is already in all_low_level_columns so a column shared by several
# groups is appended only once without rescanning the list.
_already_added = set()

for group_name, columns in high_level_factors_map.items():
    columns = [str(col) for col in columns]
    valid_cols = [col for col in columns if col in _data_columns_lookup]

    if valid_cols:
        valid_factor_groups[group_name] = valid_cols
        for col in valid_cols:
            if col not in _already_added:
                _already_added.add(col)
                all_low_level_columns.append(col)
        print(f"因子组 '{group_name}': 找到 {len(valid_cols)} 个有效列 (例如: {valid_cols[:3]}...)")
    else:
        print(f"警告: 因子组 '{group_name}' 没有在数据 ('{data_file}') 中找到任何对应的列。")
        print(f"  尝试匹配的列名示例: {columns[:5]}")
        print(f"  '{data_file}' 中的列名示例: {data_columns_str[:5]}")


# At least one low-level factor column is required to build the model input.
if not all_low_level_columns:
    print(f"\n错误: 未能在数据文件 ('{data_file}') 中根据因子说明找到任何有效的低层因子列。")
    # This message was missing its f-prefix and printed the literal
    # "{data_file}" instead of the path.
    print(f"请检查 '因子说明.csv' 中的 '变量名' 列是否与 '{data_file}' 中的列名完全匹配。")
    # Non-zero exit status; exit() is an interactive-session helper.
    raise SystemExit(1)

print(f"\n共找到 {len(all_low_level_columns)} 个有效的低层因子列用于模型输入。")


target_column = '满意度'
# The target must be present; labels are compared as strings, matching how the
# feature columns are matched.
if str(target_column) not in {str(col) for col in data_df.columns}:
    print(f"错误: 目标列 '{target_column}' 不在数据文件 ('{data_file}') 中!")
    # Non-zero exit status; exit() is an interactive-session helper and would
    # report success.
    raise SystemExit(1)

# Independent copies, so later preprocessing never writes into data_df.
X_low_level = data_df[all_low_level_columns].copy()
y = data_df[target_column].copy()

# --- 2. Data preprocessing ---

if X_low_level.empty:
    print("错误：特征数据框为空，无法进行缺失值填充。")
    raise SystemExit(1)

# Every feature column must be numeric. Try the conversion and, on failure,
# report exactly which columns are the problem before aborting.
try:
    X_low_level = X_low_level.astype(float)
except (ValueError, TypeError) as e:
    print(f"错误: 特征列包含无法转换为数值的数据: {e}")
    non_numeric_cols = []
    for col in X_low_level.columns:
        try:
            pd.to_numeric(X_low_level[col])
        except (ValueError, TypeError):
            non_numeric_cols.append(col)
    print(f"无法转换为数值的列: {non_numeric_cols}")
    print("请检查这些列在 data_df 中的原始数据类型和内容。")
    raise SystemExit(1)

# Decide the train/test membership BEFORE fitting any preprocessing step, so
# the imputation medians and the scaling statistics are learned from training
# rows only. Fitting them on all rows lets test-set information leak into
# training and makes the test metrics optimistic. Splitting the row positions
# with the same test_size / random_state is expected to give the same partition
# as splitting the tensors directly, since the shuffle depends only on the
# number of rows and the seed.
row_positions = np.arange(len(X_low_level))
train_idx, test_idx = train_test_split(row_positions, test_size=0.2, random_state=42)

# a. Missing values: medians fitted on the training rows, applied to all rows.
imputer = SimpleImputer(strategy='median')
try:
    imputer.fit(X_low_level.iloc[train_idx])
    X_low_level = pd.DataFrame(imputer.transform(X_low_level), columns=all_low_level_columns)
    print("\n缺失值已用中位数填充。")
except ValueError as e:
    print(f"错误：在填充缺失值时发生错误: {e}")
    print("请再次检查特征列是否只包含数值数据。")
    raise SystemExit(1)

if y.isnull().any():
    print(f"目标变量 '{target_column}' 存在缺失值，用中位数填充。")
    # The median is taken from the training rows only, for the same reason.
    y = y.fillna(y.iloc[train_idx].median())

# b. Standardisation: statistics fitted on the training rows, applied to all.
scaler = StandardScaler()
scaler.fit(X_low_level.iloc[train_idx])
X_scaled = scaler.transform(X_low_level)
# Back to a DataFrame so columns can still be addressed by name.
X_scaled_df = pd.DataFrame(X_scaled, columns=all_low_level_columns)
print("特征数据已标准化。")

# --- 3. Prepare the PyTorch data ---
X_tensor = torch.tensor(X_scaled_df.values.astype(np.float32))
y_tensor = torch.tensor(y.values.astype(np.float32)).unsqueeze(1)

# Materialise the split chosen above. Index tensors are forced to int64 so
# the indexing does not depend on the platform's default integer width.
_train_index = torch.as_tensor(train_idx, dtype=torch.long)
_test_index = torch.as_tensor(test_idx, dtype=torch.long)
X_train, X_test = X_tensor[_train_index], X_tensor[_test_index]
y_train, y_test = y_tensor[_train_index], y_tensor[_test_index]

print(f"\n训练集大小: {X_train.shape[0]}, 测试集大小: {X_test.shape[0]}")
print(f"特征维度: {X_train.shape[1]}")

# DataLoaders: shuffle only the training data.
batch_size = 64
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# --- 4. 构建HAN模型 ---

# 定义简单的注意力层 (与之前版本相同)
class Attention(nn.Module):
    """Attention pooling over dimension 1 of a 3-D input.

    Each position is projected to ``attention_dim``, squashed with tanh and
    scored against a learned context vector. The scores are soft-maxed over
    dimension 1 and used to form a weighted sum of the input positions.

    Args:
        feature_dim: Size of the last dimension of the input.
        attention_dim: Width of the hidden projection used for scoring.
    """

    def __init__(self, feature_dim, attention_dim):
        super().__init__()
        self.attention_fc = nn.Linear(feature_dim, attention_dim)
        # Allocated uninitialised; filled by _reset_parameters() right below.
        self.context_vector = nn.Parameter(torch.empty(attention_dim, 1))
        self.softmax = nn.Softmax(dim=1)
        self._reset_parameters()

    def _reset_parameters(self):
        """Xavier-initialise the projection and context vector; zero the bias."""
        nn.init.xavier_uniform_(self.attention_fc.weight)
        nn.init.zeros_(self.attention_fc.bias)
        nn.init.xavier_uniform_(self.context_vector)

    def forward(self, x):
        """Pool ``x`` along dimension 1.

        Args:
            x: Tensor of shape (batch_size, seq_len, feature_dim).

        Returns:
            A ``(context, weights)`` pair: ``context`` has shape
            (batch_size, feature_dim) and ``weights`` has shape
            (batch_size, seq_len), summing to 1 along dimension 1.
        """
        projected = self.attention_fc(x).tanh()
        # (batch_size, seq_len, attention_dim) x (attention_dim, 1)
        #   -> (batch_size, seq_len, 1)
        raw_scores = torch.matmul(projected, self.context_vector)
        weights = self.softmax(raw_scores)
        # The trailing singleton dimension broadcasts over feature_dim.
        pooled = (weights * x).sum(dim=1)
        return pooled, weights.squeeze(-1)

# 定义层次化注意力网络 (HAN) (与之前版本相同，但依赖于正确的 factor_groups_indices)
class HANRegression(nn.Module):
    """Two-level hierarchical attention network producing one regression value.

    Level 1 (per factor group): every scalar feature is lifted to an embedding
    by one linear layer shared across all groups; the embedded features of a
    group are pooled by that group's own ``Attention`` and then passed through
    the group's own Linear -> ReLU -> Dropout.

    Level 2: the per-group vectors are stacked and pooled by a single
    ``Attention``; a final linear layer maps the pooled vector to one output.

    Args:
        factor_groups_indices: Mapping of group name -> list of column indices
            into the input tensor. Names are used as ``nn.ModuleDict`` keys.
        low_level_feature_dim: Embedding width for each scalar input feature.
            This argument used to be stored but ignored (the width was
            hard-coded to 16); it now really sets the embedding size. Passing
            16 reproduces the previous architecture exactly.
        factor_attention_dim: Hidden width of each group-level attention.
        factor_output_dim: Size of each group's output vector.
        overall_attention_dim: Hidden width of the second-level attention.
        dropout_rate: Dropout probability applied to each group's vector.
    """

    def __init__(self, factor_groups_indices, low_level_feature_dim,
                 factor_attention_dim=64, factor_output_dim=64,
                 overall_attention_dim=64, dropout_rate=0.3):
        super().__init__()
        self.factor_groups_indices = factor_groups_indices
        self.num_high_level_factors = len(factor_groups_indices)
        self.low_level_feature_dim = low_level_feature_dim

        # Layer 1: the inputs are scalars, so a Linear(1, embedding_dim) acts
        # as the embedding that gives attention a vector per feature.
        self.embedding_dim = low_level_feature_dim
        self.embedding_layer = nn.Linear(1, self.embedding_dim)

        # One attention and one projection per factor group.
        self.factor_attentions = nn.ModuleDict({
            group_name: Attention(self.embedding_dim, factor_attention_dim)
            for group_name in factor_groups_indices
        })
        self.factor_linears = nn.ModuleDict({
            group_name: nn.Linear(self.embedding_dim, factor_output_dim)
            for group_name in factor_groups_indices
        })
        self.factor_relu = nn.ReLU()
        self.factor_dropout = nn.Dropout(dropout_rate)

        # Layer 2: attention across the factor groups.
        self.overall_attention = Attention(factor_output_dim, overall_attention_dim)

        # Output layer.
        self.output_layer = nn.Linear(factor_output_dim, 1)

    def forward(self, x):
        """Run the network.

        Args:
            x: Tensor of shape (batch_size, total_low_level_features).

        Returns:
            ``(output, overall_attention_weights)``. ``output`` has shape
            (batch_size, 1). The weights have shape
            (batch_size, number of non-empty groups), ordered like
            ``factor_groups_indices``. If no group contributes any column the
            result is ``(zeros of shape (batch_size, 1), None)``.
        """
        factor_contexts = []

        for group_name, indices in self.factor_groups_indices.items():
            x_group = x[:, indices]  # (batch_size, num_features_in_group)
            if x_group.shape[1] == 0:
                continue

            # Treat each feature as one sequence element of width 1, then embed:
            # (batch, n) -> (batch, n, 1) -> (batch, n, embedding_dim).
            x_group_embedded = self.embedding_layer(x_group.unsqueeze(-1))

            # Pool the group's features; the per-feature weights are not
            # returned by this model, so they are discarded here.
            context, _ = self.factor_attentions[group_name](x_group_embedded)

            projected = self.factor_linears[group_name](context)
            factor_contexts.append(self.factor_dropout(self.factor_relu(projected)))

        if not factor_contexts:
            # Only reachable when every group has an empty index list.
            print("警告: 在前向传播中没有有效的因子上下文生成。")
            return torch.zeros(x.shape[0], 1, device=x.device), None

        # (batch_size, num_groups, factor_output_dim)
        factor_contexts_tensor = torch.stack(factor_contexts, dim=1)

        overall_context, overall_attention_weights = self.overall_attention(factor_contexts_tensor)

        output = self.output_layer(overall_context)  # (batch_size, 1)

        return output, overall_attention_weights


# Translate each factor group's column names into positions within the feature
# tensor, whose column order is that of all_low_level_columns.
factor_groups_indices = {}
current_columns_in_tensor = all_low_level_columns

# Build the name -> position lookup once instead of calling list.index() for
# every column. setdefault keeps the first position of a repeated name, which
# is what list.index() returns.
_column_position = {}
for _pos, _col in enumerate(current_columns_in_tensor):
    _column_position.setdefault(_col, _pos)

for group_name, group_cols in valid_factor_groups.items():
    indices = [_column_position[col] for col in group_cols if col in _column_position]
    if indices:
        factor_groups_indices[group_name] = indices
    else:
        print(f"警告: 因子组 '{group_name}' 在最终特征列表中的索引为空，可能在过滤时出现问题。")


# The model cannot be built without at least one group of indices.
if not factor_groups_indices:
    print("\n错误: 未能为任何因子组生成有效的列索引。无法实例化模型。")
    # Non-zero exit status; exit() is an interactive-session helper.
    raise SystemExit(1)

# Instantiate the model.
# The layer widths are named once so the summary printed below is derived from
# the values actually passed to the constructor and cannot drift from them.
factor_attention_dim = 32
factor_output_dim = 64
overall_attention_dim = 32

# low_level_feature_dim is the per-feature dimension *after* embedding.
model = HANRegression(factor_groups_indices, low_level_feature_dim=16,
                      factor_attention_dim=factor_attention_dim,
                      factor_output_dim=factor_output_dim,
                      overall_attention_dim=overall_attention_dim)

print("\nHAN 模型结构 (概要):")
print(f"Number of high-level factors: {model.num_high_level_factors}")
print(f"Embedding dim for low-level features: {model.embedding_dim}")
print(f"Factor attention dim: {factor_attention_dim}, Factor output dim: {factor_output_dim}")
print(f"Overall attention dim: {overall_attention_dim}")


# Move the model to the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f"模型将在 {device} 上运行。")


# --- 5. Loss function and optimiser ---
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# --- 6. Train the model ---
num_epochs = 30
print(f"\n开始训练模型，共 {num_epochs} 个周期...")

train_losses = []
test_losses = []
best_test_loss = float('inf')
model_save_path = 'best_han_model.pth'

# NOTE(review): the checkpoint is selected on test_loader's loss. If the same
# loader is also used for the final report, those metrics are optimistic; a
# separate validation split would avoid that.
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    # Samples that really contributed to running_loss. Batches skipped for NaN
    # must not be counted in the denominator of the epoch average.
    train_samples_seen = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs, _ = model(inputs)
        if torch.isnan(outputs).any() or torch.isnan(labels).any():
            print(f"警告: 周期 {epoch+1} 检测到 NaN 值。跳过此批次。")
            continue
        loss = criterion(outputs, labels)
        if torch.isnan(loss):
            print(f"警告: 周期 {epoch+1} 损失为 NaN。跳过梯度更新。")
            continue
        loss.backward()
        # Gradient clipping (torch.nn.utils.clip_grad_norm_) could be inserted
        # here if exploding gradients are observed.
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)
        train_samples_seen += inputs.size(0)

    # NaN (not 0.0) when nothing was usable, so a broken epoch is visible.
    epoch_train_loss = running_loss / train_samples_seen if train_samples_seen else float('nan')
    train_losses.append(epoch_train_loss)

    # Evaluate on the test set.
    model.eval()
    running_test_loss = 0.0
    test_samples_seen = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs, _ = model(inputs)
            if torch.isnan(outputs).any() or torch.isnan(labels).any():
                print("警告: 测试时检测到 NaN 值。")
                continue
            loss = criterion(outputs, labels)
            if torch.isnan(loss):
                print("警告: 测试损失为 NaN。")
                continue
            running_test_loss += loss.item() * inputs.size(0)
            test_samples_seen += inputs.size(0)

    # If every batch was skipped the loss is infinite, never 0.0: dividing an
    # untouched 0.0 sum by the dataset size used to yield a "perfect" loss and
    # caused a model that outputs NaN to be saved as the best one.
    epoch_test_loss = running_test_loss / test_samples_seen if test_samples_seen else float('inf')
    test_losses.append(epoch_test_loss)

    if (epoch + 1) % 5 == 0:
        print(f"周期 [{epoch+1}/{num_epochs}], 训练损失: {epoch_train_loss:.4f}, 测试损失: {epoch_test_loss:.4f}")

    # Keep the weights with the lowest (valid) test loss seen so far.
    if not np.isnan(epoch_test_loss) and epoch_test_loss < best_test_loss:
        best_test_loss = epoch_test_loss
        try:
            torch.save(model.state_dict(), model_save_path)
        except Exception as e:
            # Best-effort: a failed save should not abort training.
            print(f"保存模型时出错: {e}")

print("模型训练完成。")

# --- 7. Evaluate the model (using the best checkpoint) ---
if best_test_loss == float('inf'):
    print("\n警告: 未能成功训练模型或保存最佳模型（测试损失始终为无穷大或NaN）。无法进行评估。")
else:
    try:
        # map_location keeps loading independent of the device the checkpoint
        # was written from (e.g. a GPU checkpoint evaluated on a CPU host).
        model.load_state_dict(torch.load(model_save_path, map_location=device))
        print(f"\n已加载最佳模型 (测试损失: {best_test_loss:.4f}) 进行最终评估。")
    except Exception as e:
        # Best-effort: fall back to the weights currently held by the model.
        print(f"加载最佳模型失败: {e}. 使用最终训练的模型进行评估。")

    model.eval()
    all_preds = []
    all_labels = []
    all_attn_weights = []

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs, attn_weights = model(inputs)
            batch_is_usable = (
                not torch.isnan(outputs).any()
                and attn_weights is not None
                and not torch.isnan(attn_weights).any()
            )
            if batch_is_usable:
                # The three lists grow together, row for row.
                all_preds.extend(outputs.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
                all_attn_weights.extend(attn_weights.cpu().numpy())
            else:
                print("警告: 评估时跳过包含 NaN 的批次输出或注意力权重。")

    if not all_preds:
        print("\n错误: 未能生成任何有效的预测结果，无法进行评估。")
    else:
        all_preds = np.array(all_preds).flatten()
        all_labels = np.array(all_labels).flatten()
        # Non-empty whenever all_preds is, because they are extended together;
        # the former empty-array fallbacks could never run and were removed.
        all_attn_weights = np.array(all_attn_weights)

        mse = mean_squared_error(all_labels, all_preds)
        rmse = np.sqrt(mse)
        r2 = r2_score(all_labels, all_preds)

        print("\n模型在测试集上的最终评估结果:")
        print(f"均方误差 (MSE): {mse:.4f}")
        print(f"均方根误差 (RMSE): {rmse:.4f}")
        print(f"R² 分数: {r2:.4f}")

        # --- 8. Interpretability analysis (second-level attention) ---
        avg_attn_weights = np.mean(all_attn_weights, axis=0)
        # The model stacks one vector per non-empty group in the iteration
        # order of factor_groups_indices, and every list in that dict is
        # non-empty by construction, so weight column k belongs to the k-th
        # key. No bounds checks are needed when pairing names with weights.
        high_level_factor_names = list(factor_groups_indices.keys())

        print("\n高层因子（第二层）平均注意力权重:")
        sorted_indices = np.argsort(avg_attn_weights)[::-1]  # descending
        for i in sorted_indices:
            print(f"- {high_level_factor_names[i]}: {avg_attn_weights[i]:.4f}")

        # Bar chart of the weights, most important factor first.
        # NOTE(review): the labels are Chinese; they presumably need a
        # CJK-capable font configured in matplotlib to render - confirm on
        # the target machine.
        sorted_names = [high_level_factor_names[i] for i in sorted_indices]
        sorted_weights = avg_attn_weights[sorted_indices]

        plt.figure(figsize=(10, 6))
        plt.bar(sorted_names, sorted_weights, color='skyblue')
        plt.xlabel("高层因子 (按重要性排序)")
        plt.ylabel("平均注意力权重")
        plt.title("高层因子对满意度预测的平均重要性 (第二层注意力)")
        plt.xticks(rotation=45, ha='right')
        plt.tight_layout()
        try:
            plt.savefig("han_factor_attention_weights.png")
            print("注意力权重图已保存为 han_factor_attention_weights.png")
        except Exception as e:
            print(f"保存注意力权重图时出错: {e}")


# --- 9. Conclusions and follow-up ---
# The closing notes are kept in one tuple and printed one per line, in order.
closing_notes = (
    "\n--- 结论与后续步骤 ---",
    "1. 更新了代码以尝试从 '因子说明.csv' 动态构建因子分组。",
    "2. **重要**: 请确认 '因子说明.csv' 文件中包含名为 '一级指标' 和 '变量名' 的列，并且 '变量名' 列的内容与 'case1_data.csv' 中的实际列名匹配。如果列名不同，请修改代码中读取这些列的部分。",
    "3. 增加了对特征列数据类型的检查和转换，以确保它们是数值类型。",
    "4. 调整了HAN模型结构，明确加入了Embedding层处理输入的1维特征。",
    "5. 增加了训练过程中的NaN值检查，以提高稳定性。",
    "6. 如果动态分组仍然失败或不准确，您可能需要手动检查 `case1_data.csv` 的列名，并在代码中直接定义 `high_level_factors_map`。",
    "7. 模型性能和解释性分析依赖于正确的分组和训练过程的稳定。",
)
for note in closing_notes:
    print(note)