# In [None]:
"""
社会经济事件预测系统 - 完整版
包含：数据预处理、时空图神经网络、因果推理模块、训练与可视化
严格适配用户提供的JSON数据结构
"""

# --------------------- 核心依赖 ---------------------
import os
import re
import json
import logging  
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.data import Data
from torch_geometric.nn import TGCN2, global_mean_pool
from torch_geometric.loader import DataLoader
from sentence_transformers import SentenceTransformer
from geopy.geocoders import Nominatim
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import TimeSeriesSplit
import matplotlib.pyplot as plt
import networkx as nx
from typing import Dict, List, Tuple

# --------------------- 日志配置 ---------------------
# Send log records both to 'processing.log' and to the console.
# The file handler is opened as UTF-8 explicitly: the messages logged in this
# file contain non-ASCII text, and the platform default encoding is not
# guaranteed to be able to represent it.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('processing.log', encoding='utf-8'),
        logging.StreamHandler(),
    ],
)

# --------------------- 配置类 ---------------------
class Config:
    """Class-level settings shared by the data processor, the model and the trainer."""

    # ---- Data ----
    data_path = "output_dataset.jsonl"  # replace with the actual dataset path
    text_embed_model = "all-mpnet-base-v2"  # text encoder model name
    geo_default = (39.8283, -98.5795)  # fallback (lat, lon): geographic center of the US

    # ---- Economic indicators ----
    # One entry per indicator: the unit its raw magnitude is expressed in and
    # a scaler instance associated with it.
    economic_indicators = {
        name: {"unit": unit, "scaler": RobustScaler()}
        for name, unit in (
            ("Retail Sales", "percent"),
            ("Arts Funding", "absolute"),
            ("Local GDP", "percent"),
            ("Tourism Revenue", "percent"),
        )
    }

    # ---- Graph model ----
    # Width of each node-feature group; the model input size is their sum.
    node_feature_dims = dict(
        text_embed=768,
        economic=4,
        temporal=6,
        spatial=2,
        type_embed=8,
    )
    hidden_dim = 128
    temporal_decay = 0.02

    # ---- Training ----
    epochs = 500
    batch_size = 16
    learning_rate = 0.0005
    early_stopping = 25

# --------------------- 数据处理管道 ---------------------
class SocioEconomicProcessor:
    """Build a single PyTorch Geometric graph from the raw event dataset.

    The pipeline loads and validates the JSON input, flattens it into one
    DataFrame row per entry, derives temporal, spatial, text and economic
    features, and finally packs node features, causal edges and targets into
    one ``Data`` object.
    """

    def __init__(self, config: "Config"):
        """Create the geocoder, the sentence encoder and the indicator scalers.

        Args:
            config: Settings object. This class reads ``data_path``,
                ``text_embed_model``, ``geo_default``, ``economic_indicators``,
                ``node_feature_dims`` and ``temporal_decay`` from it.
        """
        self.cfg = config
        self.geolocator = Nominatim(user_agent="socio_geo_v1")
        self.text_encoder = SentenceTransformer(self.cfg.text_embed_model)
        # location string -> (lat, lon); avoids repeating identical lookups.
        self._geo_cache = {}
        self._init_scalers()

    def _init_scalers(self):
        """Create one fresh scaler per economic indicator.

        Each value of ``cfg.economic_indicators`` is a spec dict; the scaler
        prototype lives under its ``"scaler"`` key. A new instance of the same
        class is created so that no fitted state is shared with the config.
        """
        self.scalers = {
            ind: type(spec["scaler"])()
            for ind, spec in self.cfg.economic_indicators.items()
        }

    def full_pipeline(self) -> "Data":
        """Run every processing stage in order and return the final graph."""
        raw_data = self._load_and_validate()
        df = self._parse_to_dataframe(raw_data)
        df = self._process_temporal(df)
        df = self._process_geospatial(df)
        df = self._process_semantic(df)
        df = self._process_economic(df)
        return self._build_pyg_data(df)

    def _load_and_validate(self) -> dict:
        """Load the dataset file and check the structure of every entry.

        The file may be a single JSON document or JSON Lines (one entry per
        non-empty line). A top-level list is wrapped as ``{'data': [...]}`` so
        the rest of the pipeline always sees the same shape.

        Returns:
            A dict whose ``'data'`` key holds the list of entries.

        Raises:
            FileNotFoundError: ``cfg.data_path`` does not exist.
            json.JSONDecodeError: the file is neither valid JSON nor JSON Lines.
            KeyError: the top-level object is a dict without a ``'data'`` key.
            ValueError: an entry lacks a required section, a section is not a
                list, or ``event_table`` is empty.
        """
        path = self.cfg.data_path
        if not os.path.exists(path):
            raise FileNotFoundError(f"数据文件 {path} 不存在")

        # Explicit encoding: do not depend on the platform default.
        with open(path, encoding="utf-8") as f:
            text = f.read()

        try:
            data = json.loads(text)
        except json.JSONDecodeError:
            # Not one JSON document: try JSON Lines before giving up.
            data = [json.loads(line) for line in text.splitlines() if line.strip()]
            if not data:
                raise

        if isinstance(data, list):
            data = {'data': data}

        # Structural validation
        required_sections = ['event_table', 'interpretability_table', 'economic_impact_table']
        for entry in data['data']:
            for sec in required_sections:
                if sec not in entry:
                    raise ValueError(f"条目缺少必要部分: {sec}")
                if not isinstance(entry[sec], list):
                    raise ValueError(f"{sec} 必须为列表")
            # The parser reads event_table[0], so an empty list is invalid.
            if not entry['event_table']:
                raise ValueError("event_table 不能为空")

        logging.info("数据基础结构验证通过")
        return data

    def _parse_to_dataframe(self, raw_data: dict) -> pd.DataFrame:
        """Flatten the nested entries into one DataFrame row per entry.

        Each row holds the fields of the first ``event_table`` record, one
        ``economic_<indicator>`` column per listed impact, and a
        ``causal_paths`` list of dicts with ``source``, ``target``,
        ``strength`` and ``time_window``.

        Raises:
            KeyError: a mandatory field is missing, or an impact names an
                indicator that is not configured.
        """
        records = []

        for entry in raw_data['data']:
            event = entry['event_table'][0]
            # source_meta is not covered by validation, so default it like
            # the other optional text fields.
            source_meta = entry.get('source_meta') or {}

            # Basic event information
            event_info = {
                'event_id': event['event_id'],
                'timestamp': pd.to_datetime(event['timestamp'], errors='coerce'),
                'event_type': event.get('event_type', 'Unknown'),
                'location': event.get('location', 'Unknown'),
                'description': event.get('description', ''),
                'raw_text': source_meta.get('raw_text_snippet', ''),
            }

            # Economic impacts
            economic_impacts = {
                f"economic_{imp['indicator']}": self._parse_magnitude(
                    imp['magnitude'], imp['indicator'])
                for imp in entry['economic_impact_table']
            }

            # Causal paths: only the first and last step of the chain are kept.
            causal_paths = []
            for interp in entry['interpretability_table']:
                steps = interp['reasoning_path'].split(' → ')
                causal_paths.append({
                    'source': steps[0].strip(),
                    'target': steps[-1].strip(),
                    'strength': interp['causal_strength'],
                    'time_window': interp['time_window'],
                })

            records.append({**event_info, **economic_impacts, 'causal_paths': causal_paths})

        return pd.DataFrame(records)

    def _parse_magnitude(self, value: str, indicator: str) -> float:
        """Convert a raw magnitude into a float.

        Strings containing ``%`` are divided by 100 when the indicator's unit
        is ``"percent"``; every other string is parsed as a plain number.
        Numeric inputs are returned as floats unchanged. Anything that cannot
        be parsed is logged and replaced by ``0.0``.

        Raises:
            KeyError: ``indicator`` is not in ``cfg.economic_indicators``.
        """
        config = self.cfg.economic_indicators[indicator]

        # Already numeric (e.g. a JSON number): nothing to parse.
        if isinstance(value, (int, float)):
            return float(value)

        try:
            text = value.strip()
            # Percentage handling
            if config['unit'] == "percent" and '%' in text:
                return float(text.strip('%')) / 100
            # Absolute value handling
            return float(text)
        except (AttributeError, TypeError, ValueError):
            logging.warning(f"无法解析指标值: {value}，已替换为0")
            return 0.0

    def _process_temporal(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add a day counter and sine/cosine encodings of year, month and day.

        ``days`` counts days since 2000-01-01. The input frame is modified in
        place and returned.
        """
        ts = df['timestamp']

        # Base temporal feature
        df['days'] = (ts - pd.Timestamp('2000-01-01')).dt.days

        # Sine/cosine encodings
        # NOTE(review): the year is divided by the constant 2024, so this
        # pair is not a periodic encoding like month/day — confirm intent.
        for name, values, period in (
            ('year', ts.dt.year, 2024),
            ('month', ts.dt.month, 12),
            ('day', ts.dt.day, 31),
        ):
            angle = 2 * np.pi * values / period
            df[f'{name}_sin'] = np.sin(angle)
            df[f'{name}_cos'] = np.cos(angle)

        return df

    def _process_geospatial(self, df: pd.DataFrame) -> pd.DataFrame:
        """Geocode each location and derive a coarse region id from it."""
        # Geocoding (repeated locations are answered from the cache)
        df['coordinates'] = df['location'].apply(self._geocode_location)

        # Region assignment
        df['region'] = df['coordinates'].apply(
            lambda x: self._determine_region(x, self.cfg.geo_default))

        return df

    def _geocode_location(self, loc_str: str) -> Tuple[float, float]:
        """Resolve a location string to ``(latitude, longitude)``.

        The full string is tried first. If nothing is found, the string is
        split on ``/`` and progressively longer suffixes are tried. When every
        lookup comes back empty, or a lookup raises, ``cfg.geo_default`` is
        returned.

        Completed lookups are cached per string so a location that occurs on
        many rows is only requested once. Lookups that raised are not cached,
        since the failure may be transient.
        """
        cacheable = isinstance(loc_str, str)
        if cacheable and loc_str in self._geo_cache:
            return self._geo_cache[loc_str]

        try:
            # Try the complete address
            loc = self.geolocator.geocode(loc_str, timeout=10)
            if not loc:
                # Hierarchical fallback: last part, last two parts, ...
                parts = [p.strip() for p in loc_str.split('/') if p.strip()]
                for i in range(len(parts) - 1, 0, -1):
                    loc = self.geolocator.geocode("/".join(parts[i:]), timeout=5)
                    if loc:
                        break
        except Exception as e:
            logging.error(f"地理编码失败: {loc_str} - {str(e)}")
            return self.cfg.geo_default

        coords = (loc.latitude, loc.longitude) if loc else self.cfg.geo_default
        if cacheable:
            self._geo_cache[loc_str] = coords
        return coords

    def _determine_region(self, coords: Tuple[float, float], default) -> int:
        """Map ``(lat, lon)`` to a region id.

        Returns 0 for latitude above 40, 1 for latitude below 35, otherwise
        2 when longitude is below -100 and 3 when it is not. ``default`` is
        accepted for call compatibility and is not used.
        """
        # Example: a simple latitude/longitude split
        lat, lon = coords
        if lat > 40:
            return 0  # north
        if lat < 35:
            return 1  # south
        return 2 if lon < -100 else 3  # west / east

    def _process_semantic(self, df: pd.DataFrame) -> pd.DataFrame:
        """Encode each row's text and store one embedding vector per row."""
        # Concatenate the text fields
        texts = [
            f"{event_type}: {description} [CONTEXT] {raw_text}"
            for event_type, description, raw_text in zip(
                df['event_type'], df['description'], df['raw_text'])
        ]

        # Batch encoding
        embeddings = self.text_encoder.encode(texts, show_progress_bar=True, batch_size=32)

        # A 2-D matrix cannot be assigned to a single column; store each row
        # vector as one cell instead.
        df['text_embed'] = pd.Series(list(embeddings), index=df.index)
        return df

    def _process_economic(self, df: pd.DataFrame) -> pd.DataFrame:
        """Scale every configured indicator column with its own scaler.

        An indicator that never appears in the data gets a column of zeros,
        and missing values are filled with ``0.0`` before scaling (the same
        fallback ``_parse_magnitude`` uses), so no NaN reaches the tensors.
        """
        # NOTE(review): each scaler is fitted on all rows, including the rows
        # later used for validation — confirm this is acceptable.
        for ind in self.cfg.economic_indicators:
            col = f"economic_{ind}"
            if col not in df.columns:
                logging.warning(f"数据中缺少经济指标列: {col}，已填充为0")
                df[col] = 0.0

            values = df[col].fillna(0.0).to_numpy(dtype=float).reshape(-1, 1)
            df[col] = self.scalers[ind].fit_transform(values).ravel()

        return df

    def _build_pyg_data(self, df: pd.DataFrame) -> "Data":
        """Assemble node features, causal edges and targets into a ``Data``.

        Node features are the horizontal concatenation of the text embedding,
        the economic columns (in config order), the six temporal encodings,
        the coordinates and a fixed-width one-hot of the event type. Edges
        come from ``causal_paths`` entries whose source and target both match
        an ``event_id``; the edge weight is the causal strength multiplied by
        an exponential decay of the day difference.
        """
        # Config order keeps feature and target columns aligned with the
        # indicator names, independent of the order seen in the data.
        econ_cols = [f"economic_{ind}" for ind in self.cfg.economic_indicators]

        # Fixed-width one-hot so the feature width matches
        # node_feature_dims['type_embed'] whatever the number of types.
        type_dim = self.cfg.node_feature_dims['type_embed']
        type_codes, type_names = pd.factorize(df['event_type'], sort=True)
        if len(type_names) > type_dim:
            logging.warning(
                f"事件类型数量 {len(type_names)} 超过 type_embed 维度 {type_dim}，多余类型合并到最后一维")
        type_onehot = np.zeros((len(df), type_dim), dtype=np.float32)
        known = np.flatnonzero(type_codes >= 0)  # code -1 marks a missing type
        if type_dim > 0 and len(known):
            type_onehot[known, np.minimum(type_codes[known], type_dim - 1)] = 1.0

        # Node features
        # NOTE(review): the economic columns are part of x and are also the
        # target y, so the model sees its targets as input — confirm intent.
        x = np.hstack([
            np.stack(df['text_embed'].tolist()),
            df[econ_cols].to_numpy(dtype=float),
            df[['year_sin', 'year_cos', 'month_sin', 'month_cos', 'day_sin', 'day_cos']].values,
            np.array(df['coordinates'].tolist(), dtype=float),
            type_onehot,
        ])

        # Edge construction
        edge_index, edge_attr = [], []
        index_of = {event_id: pos for pos, event_id in enumerate(df['event_id'])}
        days = df['days'].to_numpy(dtype=float)

        for paths in df['causal_paths']:
            for path in paths:
                src = index_of.get(path['source'])
                tgt = index_of.get(path['target'])

                if src is None or tgt is None:
                    logging.warning(f"无效因果路径: {path['source']} → {path['target']}")
                    continue

                # Temporal decay
                decay = np.exp(-self.cfg.temporal_decay * abs(days[tgt] - days[src]))

                edge_index.append([src, tgt])
                edge_attr.append(path['strength'] * decay)

        if edge_index:
            edge_index_t = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
        else:
            # Keep the (2, num_edges) layout even when there are no edges.
            edge_index_t = torch.empty((2, 0), dtype=torch.long)

        return Data(
            x=torch.tensor(x, dtype=torch.float32),
            edge_index=edge_index_t,
            edge_attr=torch.tensor(edge_attr, dtype=torch.float32).unsqueeze(1),
            y=torch.tensor(df[econ_cols].to_numpy(dtype=float), dtype=torch.float32)
        )

# --------------------- 因果增强GNN模型 ---------------------
class CausalTGCN(nn.Module):
    """Graph model that maps a ``Data`` graph to economic-indicator predictions.

    Edge attributes are passed through a small MLP to obtain edge weights, a
    ``TGCN2`` layer produces node embeddings, the embeddings are mean-pooled
    per graph, and a prediction head outputs one value per configured
    indicator.
    """

    def __init__(self, config: Config):
        """Build the layers from ``config``.

        Args:
            config: Settings object; ``node_feature_dims``, ``hidden_dim`` and
                ``economic_indicators`` are read from it.
        """
        super().__init__()
        self.cfg = config

        # Temporal graph convolution
        # NOTE(review): TGCN2 is documented in PyTorch Geometric Temporal
        # (torch_geometric_temporal.nn.recurrent), where its constructor also
        # takes a batch_size argument. Confirm that the import from
        # torch_geometric.nn resolves and that this call signature is valid.
        self.tgcn = TGCN2(
            in_channels=self._calc_input_dim(),
            out_channels=self.cfg.hidden_dim
        )

        # Learned edge weights: one scalar in (0, 1) per edge
        self.edge_weight_layer = nn.Sequential(
            nn.Linear(1, 16),
            nn.ReLU(),
            nn.Linear(16, 1),
            nn.Sigmoid()
        )

        # Prediction head: one output per economic indicator
        self.pred_head = nn.Sequential(
            nn.Linear(self.cfg.hidden_dim, self.cfg.hidden_dim//2),
            nn.GELU(),
            nn.Dropout(0.3),
            nn.Linear(self.cfg.hidden_dim//2, len(self.cfg.economic_indicators))
        )

    def _calc_input_dim(self) -> int:
        """Return the total node-feature width (sum of all feature groups)."""
        return sum(self.cfg.node_feature_dims.values())

    def forward(self, data: Data) -> torch.Tensor:
        """Predict the economic indicators for each graph in ``data``.

        Args:
            data: A graph, or a batch of graphs, providing ``x``,
                ``edge_index`` and ``edge_attr``.

        Returns:
            The prediction head applied to the pooled graph embedding(s).
        """
        # Learned edge weights
        edge_weights = self.edge_weight_layer(data.edge_attr)

        # Graph convolution
        h = self.tgcn(data.x, data.edge_index, edge_weights)

        # Global pooling. Use the batch assignment when the loader provides
        # one so that each graph in a batch gets its own embedding; without
        # one (a single un-batched graph) all nodes are averaged together.
        batch = getattr(data, 'batch', None)
        graph_embed = global_mean_pool(h, batch)

        # NOTE(review): the output is one row per graph, while the processor
        # stores one target row per node in data.y — confirm whether
        # node-level predictions were intended.
        return self.pred_head(graph_embed)

# --------------------- 训练框架 ---------------------
class SocioEconomicTrainer:
    """Run data processing and cross-validated training of ``CausalTGCN``."""

    def __init__(self, config: Config):
        """Create the processor, model, optimizer and LR scheduler.

        Args:
            config: Settings object; ``learning_rate``, ``epochs`` and
                ``early_stopping`` are read here, the rest by the components.
        """
        self.cfg = config
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Components
        self.processor = SocioEconomicProcessor(config)
        self.model = CausalTGCN(config).to(self.device)
        self.optimizer = torch.optim.AdamW(
            self.model.parameters(),
            lr=config.learning_rate,
            weight_decay=1e-5
        )
        self.scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            self.optimizer, 'min', patience=5
        )

    def execute(self):
        """Run the full pipeline: build the dataset, then train on it.

        Any exception is logged and re-raised.
        """
        try:
            dataset = self.processor.full_pipeline()
            self._train(dataset)
        except Exception as e:
            logging.error(f"训练流程异常终止: {str(e)}")
            raise

    def _train(self, dataset: Data):
        """Train with a 5-fold time-series split over the graph's nodes.

        For every fold the train and validation node sets are turned into
        induced subgraphs. The best validation loss and the early-stopping
        counter are tracked per fold, and the best weights of each fold are
        saved to ``best_model_fold<fold>.pth``.
        """
        # NOTE(review): the split is positional, so it is only a chronological
        # split if the nodes are ordered by time — confirm against the
        # processor. The model, optimizer and scheduler are shared by all
        # folds, so each fold continues from the previous one's state.
        tscv = TimeSeriesSplit(n_splits=5)

        for fold, (train_idx, val_idx) in enumerate(tscv.split(dataset.x)):
            logging.info(f"\n=== 开始第 {fold+1}/5 折交叉验证 ===")

            # A Data object cannot be indexed by node ids; take the induced
            # subgraph, which filters node/edge attributes and relabels the
            # edges. Edges crossing the train/validation boundary are dropped.
            train_data = dataset.subgraph(torch.as_tensor(train_idx, dtype=torch.long))
            val_data = dataset.subgraph(torch.as_tensor(val_idx, dtype=torch.long))

            train_loader = DataLoader([train_data], batch_size=1, shuffle=False)
            val_loader = DataLoader([val_data], batch_size=1)

            # Per-fold early-stopping state, so a good loss in one fold can
            # neither block checkpoints nor carry a counter into the next.
            best_loss = float('inf')
            patience = 0

            for epoch in range(self.cfg.epochs):
                train_loss = self._train_one_epoch(train_loader)

                # Validation
                val_loss = self._validate(val_loader)
                self.scheduler.step(val_loss)

                # Early-stopping check
                if val_loss < best_loss:
                    best_loss = val_loss
                    torch.save(self.model.state_dict(), f"best_model_fold{fold}.pth")
                    patience = 0
                else:
                    patience += 1
                    if patience >= self.cfg.early_stopping:
                        # One-based, to match the per-epoch log line below.
                        logging.info(f"早停触发于第 {epoch + 1} 轮")
                        break

                # Progress log
                logging.info(
                    f"Epoch {epoch+1}/{self.cfg.epochs} | "
                    f"Train Loss: {train_loss:.4f} | "
                    f"Val Loss: {val_loss:.4f} | "
                    f"LR: {self.optimizer.param_groups[0]['lr']:.2e}"
                )

    def _train_one_epoch(self, loader: DataLoader) -> float:
        """Run one optimisation pass over ``loader``; return the mean batch loss."""
        self.model.train()
        total_loss = 0.0

        for batch in loader:
            batch = batch.to(self.device)
            self.optimizer.zero_grad()

            pred = self.model(batch)
            loss = F.huber_loss(pred, batch.y, delta=1.0)
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0)

            self.optimizer.step()
            total_loss += loss.item()

        return total_loss / len(loader)

    def _validate(self, loader: DataLoader) -> float:
        """Return the mean Huber loss over ``loader`` with gradients disabled."""
        self.model.eval()
        total_loss = 0.0

        with torch.no_grad():
            for batch in loader:
                batch = batch.to(self.device)
                pred = self.model(batch)
                total_loss += F.huber_loss(pred, batch.y).item()

        return total_loss / len(loader)

# --------------------- 可视化模块 ---------------------
class SocioVisualizer:
    """Static plotting helpers for the causal graph and the predictions."""

    @staticmethod
    def plot_causal_graph(data: Data, top_n=50, type_dim=8):
        """Draw the causal graph restricted to the first ``top_n`` nodes.

        Args:
            data: Graph with ``x``, ``edge_index`` and ``edge_attr``.
            top_n: Number of leading nodes to show; at most ``2 * top_n``
                edges between those nodes are drawn.
            type_dim: Width of the event-type one-hot at the end of each node
                feature vector; its argmax is used as the node color.
        """
        fig, ax = plt.subplots(figsize=(20, 15))
        G = nx.DiGraph()

        # Nodes
        n_nodes = min(len(data.x), top_n)
        for i in range(n_nodes):
            node_type = torch.argmax(data.x[i][-type_dim:]).item()
            G.add_node(i, type=node_type)

        # Edges: filter to the displayed nodes first, then apply the cap, so
        # the cap is not used up by edges that would be discarded anyway.
        max_edges = top_n * 2
        for (s, t), w in zip(data.edge_index.t().tolist(), data.edge_attr):
            if G.number_of_edges() >= max_edges:
                break
            if s < n_nodes and t < n_nodes:
                G.add_edge(s, t, weight=w.item())

        # Drawing parameters
        pos = nx.spring_layout(G, seed=42)
        node_colors = [G.nodes[n]['type'] for n in G.nodes]
        weights = [G.edges[e]['weight'] for e in G.edges]
        widths = [w * 2 for w in weights]

        # One color scale shared by the edges and the colorbar.
        vmin, vmax = (min(weights), max(weights)) if weights else (0.0, 1.0)
        if vmin == vmax:
            vmin, vmax = min(vmin, 0.0), max(vmax, 1.0)
        norm = plt.Normalize(vmin=vmin, vmax=vmax)

        nx.draw_networkx_nodes(G, pos, node_color=node_colors, cmap=plt.cm.tab20,
                               node_size=500, ax=ax)
        nx.draw_networkx_edges(G, pos, width=widths, edge_color=weights,
                               edge_cmap=plt.cm.Blues, edge_vmin=vmin, edge_vmax=vmax,
                               arrows=True, ax=ax)
        nx.draw_networkx_labels(G, pos, ax=ax)

        ax.set_title(f"Top {top_n} 因果事件网络")
        # The mappable is not attached to any Axes, so the Axes to take the
        # colorbar space from has to be named explicitly.
        fig.colorbar(plt.cm.ScalarMappable(norm=norm, cmap=plt.cm.Blues),
                     ax=ax, label="因果强度")
        ax.set_axis_off()
        plt.show()

    @staticmethod
    def plot_economic_trends(predictions: np.ndarray,
                            ground_truth: np.ndarray,
                            indicators: List[str]):
        """Plot predicted against actual values, one panel per indicator.

        Args:
            predictions: Array whose column ``i`` holds the predictions for
                ``indicators[i]``.
            ground_truth: Array with the same column layout for actual values.
            indicators: Panel titles, in column order.
        """
        # Two columns; at least the original 2x2 grid, more rows if needed.
        n_cols = 2
        n_rows = max(2, (len(indicators) + n_cols - 1) // n_cols)

        plt.figure(figsize=(18, 12))
        for i, ind in enumerate(indicators):
            plt.subplot(n_rows, n_cols, i+1)
            plt.plot(ground_truth[:, i], label='实际值', color='blue', alpha=0.6)
            plt.plot(predictions[:, i], label='预测值', color='red', linestyle='--')
            plt.title(ind)
            plt.xlabel("时间步")
            plt.ylabel("标准化值")
            plt.legend()
            plt.grid(True)

        plt.tight_layout()
        plt.show()

# --------------------- 主程序 ---------------------
if __name__ == "__main__":
    # Script entry point: train, then visualize the graph and the predictions.
    cfg = Config()
    trainer = SocioEconomicTrainer(cfg)

    try:
        # Run the full training flow
        trainer.execute()

        # Rebuild the dataset for plotting. The trainer's processor is reused
        # so the text encoder and geocoder are not constructed a second time.
        dataset = trainer.processor.full_pipeline()
        SocioVisualizer.plot_causal_graph(dataset)

        # Load the best checkpoint of the first fold
        model = CausalTGCN(cfg).to(trainer.device)
        model.load_state_dict(
            torch.load("best_model_fold0.pth", map_location=trainer.device))
        # Inference mode: without this the Dropout layer stays active and the
        # predictions are randomly perturbed.
        model.eval()

        with torch.no_grad():
            pred = model(dataset.to(trainer.device)).cpu().numpy()
            # The dataset may now live on the GPU; bring the targets back to
            # the CPU before converting to NumPy.
            true = dataset.y.cpu().numpy()
            SocioVisualizer.plot_economic_trends(pred, true, list(cfg.economic_indicators.keys()))

    except Exception as e:
        logging.error(f"系统运行异常: {str(e)}")
        raise

# In [None]:
# Environment sanity check: report installed versions and CUDA availability.
import torch

torch_version = torch.__version__
print("PyTorch 版本:", torch_version)  # expected output: 2.3.0+cu121
cuda_available = torch.cuda.is_available()
print("CUDA 是否可用:", cuda_available)  # expected output: True

# Imported only after the PyTorch lines so those are printed even if this
# import fails.
import torch_geometric

pyg_version = torch_geometric.__version__
print("PyG 版本:", pyg_version)  # expected output: 2.5.0