In [1]:
import concurrent.futures
import hashlib
import json
import os
import re
import threading
import time
from collections import Counter
from functools import partial
from typing import Dict, List, Union

import pandas as pd
import requests
from pydantic import BaseModel, Field, ValidationError, field_validator  # pydantic v2 API

# ------------------ Configuration ------------------
# The API key is a secret and must not live in source control: it is read
# from the DEEPSEEK_API_KEY environment variable. When the variable is unset
# the key is an empty string, so API calls fail with an authentication error
# instead of the module failing at import time.
# NOTE(review): a key was previously committed in this file; it should be
# treated as leaked and revoked.
DEEPSEEK_API_KEY = os.environ.get("DEEPSEEK_API_KEY", "")
CSV_PATH = "part1.csv"  # input CSV; must provide 'Time' and 'Content' columns
OUTPUT_JSON = "enhanced_structured_events2.json"  # where results are written
API_URL = "https://api.deepseek.com/v1/chat/completions"
MODEL_NAME = "deepseek-chat"
MAX_WORKERS = 3  # number of concurrent worker threads issuing API requests

# ------------------ Pydantic Models ------------------
class EventSchema(BaseModel):
    # One row of the "event_table" section of the model's JSON answer.
    # Every field is required. `examples`/`description` are metadata only:
    # they appear in the generated JSON schema but are not validated against.
    # `examples=[...]` is used throughout because passing `example=` as an
    # extra keyword to Field is deprecated in pydantic v2.
    event_id: str = Field(..., examples=["E20231005_01"])
    timestamp: str = Field(..., description="ISO8601 timestamp")
    event_type: str = Field(..., examples=["Social Protest", "Policy Change", "M&A"])
    description: str = Field(..., description="Subject-Action-Object format")
    source: str = Field(..., examples=["News Media", "Government Report", "Social Media"])
    location: str = Field(..., examples=["China/Beijing", "USA/New York"])
    impact_industries: List[str] = Field(..., examples=["Finance", "Energy"])
    summary: str = Field(..., max_length=500, description="Concise event summary")  # at most 500 characters
    explanation: List[str] = Field(..., description="Step-by-step reasoning process")

class InterpretabilitySchema(BaseModel):
    # One row of the "interpretability_table" section of the model's JSON
    # answer. Every field is required; `causal_strength` accepts either a
    # number or a label.
    # `examples=[...]` replaces the `example=` extra keyword, which is
    # deprecated in pydantic v2.
    event_id: str = Field(...)
    time_window: str = Field(..., examples=["T+1d", "T+7d"])
    causal_strength: Union[float, str] = Field(..., examples=[0.8, "High"])
    reasoning_path: str = Field(..., examples=["Policy Change → Regulatory Impact → Market Response"])
    impact_summary: str = Field(..., max_length=300, description="Impact summary")

class EconomicImpactSchema(BaseModel):
    # One row of the "economic_impact_table" section of the model's JSON
    # answer. Every field is required.
    event_id: str = Field(...)
    # `indicator` must be exactly one of the whitelisted names in `pattern`.
    # The examples are taken from that whitelist so the schema never
    # advertises a value that validation would then reject.
    indicator: str = Field(...,
        examples=["GDP", "Unemployment", "Trade"],
        pattern=r"^(GDP|Unemployment|Inflation|Trade|FDI|Stock Market|Local GDP|Tourism Revenue|Arts Funding|Retail Sales|Crime Rate|Church Donations|Community Engagement|Volunteer Participation|Real Estate Prices)$"  # extended whitelist
    )
    direction: str = Field(..., examples=["Increase", "Decrease"])
    magnitude: str = Field(..., examples=["2%", "0.5B USD"])
    confidence: float = Field(..., ge=0, le=1)
    time_horizon: str = Field(..., examples=["1 year", "3-5 years"])
    affected_areas: List[str] = Field(..., examples=["UK", "EU"])

    @field_validator('magnitude')
    @classmethod
    def validate_magnitude(cls, v: str) -> str:
        # Accepts a number or numeric range, an optional '%', then optional
        # unit words made of letters and '/', e.g. "5%", "1.5-2%", "0.5B USD".
        pattern = r'^[\d\.]+-?[\d\.]*%?(?:\s?[A-Za-z\/]+)*$'
        if not re.match(pattern, v):
            raise ValueError(f'Invalid magnitude format: {v}')
        return v

    @field_validator('time_horizon')
    @classmethod
    def validate_time_horizon(cls, v: str) -> str:
        # Accepts "<n> unit" or "<n>-<m> unit" where unit is year(s),
        # month(s) or quarter(s), e.g. "6 months", "2-3 years".
        if not re.match(r'^\d+(\s?-\s?\d+)?\s(years?|months?|quarters?)$', v):
            raise ValueError(f'Invalid time horizon: {v}')
        return v

class StructuredResponse(BaseModel):
    # Top-level shape of the JSON answer requested from the model.
    # Each of the three tables may be given either as a list of rows or as a
    # single row object.
    event_table: Union[List[EventSchema], EventSchema]
    interpretability_table: Union[List[InterpretabilitySchema], InterpretabilitySchema]
    economic_impact_table: Union[List[EconomicImpactSchema], EconomicImpactSchema]

# ------------------ Enhanced Error Handler ------------------
class EnhancedErrorHandler:
    """Collects error records in a list guarded by a lock."""

    def __init__(self):
        self.lock = threading.Lock()
        self.error_log = []

    def add_error(self, text: str, error_type: str, details: str = ""):
        """Append an error record to ``error_log`` and return it.

        Args:
            text: The input associated with the failure; only its first
                500 characters are stored.
            error_type: Short category label for the error.
            details: Optional free-form description.

        Returns:
            The dict that was appended to ``error_log``.
        """
        record = dict(
            error_type=error_type,
            details=details,
            original_text=text[:500],
        )
        with self.lock:
            self.error_log.append(record)
        return record

# ------------------ Optimized Processor ------------------
class DeepSeekProcessor:
    """Sends text to the chat-completions API and validates the JSON answer.

    Failures are recorded on ``self.error_handler`` rather than raised, so a
    caller can process many texts and inspect the error log afterwards.
    """

    # Keys of the response whose value may arrive as a single object and is
    # wrapped into a one-element list before validation.
    _TABLE_KEYS = ("event_table", "interpretability_table", "economic_impact_table")
    # Upper bound on decode attempts (initial try plus textual repairs).
    _MAX_PARSE_ATTEMPTS = 3

    def __init__(self):
        self.error_handler = EnhancedErrorHandler()
        self.session = requests.Session()

        # System prompt; `{schema_template}` is filled with the JSON schema
        # of StructuredResponse in generate_payload().
        self.system_prompt = """You are an expert analyst. Generate VALID JSON in ENGLISH with:
        {schema_template}
        
        Critical Requirements:
        1. Use EXACTLY these economic indicators: 
           [GDP, Unemployment, Inflation, Trade, FDI, Stock Market, Local GDP, Tourism Revenue, Arts Funding]
        2. Magnitude format examples: "5%", "0.5B USD", "1.5%" 
        3. Time horizon format: "6 months", "2-3 years"
        4. Ensure JSON brackets are properly closed
        5. Never truncate JSON output"""

        # User prompt; `{text}` is filled with the text to analyse.
        self.user_prompt = """Analyze this Chinese text and generate structured data in ENGLISH:
        {text}
        
        Output MUST:
        - Use ONLY the allowed economic indicators
        - Format magnitudes exactly like examples
        - Keep JSON syntax valid
        - Escape special characters"""

    def parse_response(self, response_text: str) -> Dict:
        """Parse and validate the model's answer.

        Markdown code fences and trailing commas are stripped, then the text
        is decoded as JSON with a small number of textual repair attempts and
        validated against StructuredResponse.

        Args:
            response_text: Raw message content returned by the API.

        Returns:
            The validated data as a plain dict, or ``{}`` when the text could
            not be decoded or validated. Every failure is recorded as a
            ``PARSE_ERROR`` on the error handler; nothing is raised.
        """
        # Bound before the try block so the handler below can always report
        # some text, even when the cleaning step itself fails (e.g. on None).
        clean_text = response_text if isinstance(response_text, str) else ""
        try:
            # Strip ```json fences and trailing commas before } or ].
            clean_text = re.sub(r'^```json|```$', '', response_text, flags=re.MULTILINE)
            clean_text = re.sub(r',(\s*?[}\]])', r'\1', clean_text)

            last_error = None
            for _ in range(self._MAX_PARSE_ATTEMPTS):
                try:
                    raw_data = json.loads(clean_text)
                except json.JSONDecodeError as e:
                    last_error = e
                    message = str(e)
                    # Best-effort repairs for the most common truncations.
                    if "Unterminated string" in message:
                        clean_text += '"'
                    elif "Expecting ',' delimiter" in message:
                        clean_text = clean_text.rsplit(',', 1)[0] + '}'
                    else:
                        # No repair applies; decoding the same text again
                        # would fail in exactly the same way.
                        break
                    continue

                # Wrap single-object tables so every table is a list.
                for table in self._TABLE_KEYS:
                    if isinstance(raw_data.get(table), dict):
                        raw_data[table] = [raw_data[table]]

                validated = StructuredResponse(**raw_data)
                return validated.model_dump()

            # All attempts failed: surface it instead of silently returning
            # None with nothing in the error log.
            raise ValueError(f"Could not decode JSON response: {last_error}")
        except Exception as e:
            self.error_handler.add_error(clean_text, "PARSE_ERROR", str(e))
            return {}

    def generate_payload(self, text: str) -> dict:
        """Build the chat-completions request body for ``text``."""
        # model_json_schema() is the pydantic v2 replacement for the
        # deprecated .schema() classmethod.
        schema_template = json.dumps(StructuredResponse.model_json_schema(), indent=2)
        return {
            "model": MODEL_NAME,
            "messages": [
                {"role": "system", "content": self.system_prompt.format(schema_template=schema_template)},
                {"role": "user", "content": self.user_prompt.format(text=text)}
            ],
            "temperature": 0.3,
            "max_tokens": 2500
        }

    def call_api_with_retry(self, text: str, max_retries=3) -> Union[dict, None]:
        """Call the API for ``text``, retrying failed requests with backoff.

        Args:
            text: The text to analyse.
            max_retries: Maximum number of request attempts.

        Returns:
            The result of parse_response() for the first successful request,
            or ``None`` when every attempt failed. Only the failure of the
            final attempt is logged (``API_TIMEOUT`` or ``API_ERROR``), so a
            request that succeeds on a retry leaves no error entry behind.
        """
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {DEEPSEEK_API_KEY}"
        }

        failure = None
        for attempt in range(max_retries):
            try:
                response = self.session.post(
                    API_URL,
                    headers=headers,
                    json=self.generate_payload(text),
                    timeout=60
                )
                response.raise_for_status()
                content = response.json()['choices'][0]['message']['content']
            except requests.exceptions.Timeout:
                failure = ("API_TIMEOUT", f"Max retries {max_retries}")
            except Exception as e:
                failure = ("API_ERROR", str(e))
            else:
                return self.parse_response(content)

            # Exponential backoff between attempts; no sleep after the last.
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)

        if failure is not None:
            self.error_handler.add_error(text, failure[0], failure[1])
        return None

# ------------------ Processing Pipeline ------------------
def process_row(processor, row):
    """Analyse one input row and attach provenance metadata to the result.

    Args:
        processor: Object exposing ``call_api_with_retry(text)`` and an
            ``error_handler`` with ``add_error(text, error_type, details)``.
        row: Mapping-like record with a 'Content' entry and, optionally, a
            'Time' entry.

    Returns:
        The API response dict extended with a ``source_meta`` entry, or
        ``{}`` when the content is blank, the API yields nothing, or an
        unexpected error occurs (which is logged as ``PROCESS_ERROR``).
    """
    try:
        text = str(row['Content']).strip()
        if not text:
            return {}

        # Short content fingerprint so results can be traced to their input.
        text_hash = hashlib.sha256(text.encode()).hexdigest()[:32]

        # Normalise numeric times to four decimals. A value that is not a
        # number is kept verbatim instead of aborting the whole row.
        if 'Time' in row:
            raw_time = row['Time']
            try:
                original_time = f"{float(raw_time):.4f}"
            except (TypeError, ValueError):
                original_time = str(raw_time)
        else:
            original_time = ""

        response = processor.call_api_with_retry(text)
        if not response:
            return {}

        response["source_meta"] = {
            "original_time": original_time,
            "text_hash": text_hash,
            "raw_text_snippet": text[:256]
        }
        return response
    except Exception as e:
        processor.error_handler.add_error(str(row), "PROCESS_ERROR", str(e))
        return {}
def main_process():
    """Load the input CSV, process every row concurrently, save the results.

    Rows with missing or duplicate 'Content' are dropped before processing.
    If loading or column validation fails, a message is printed and the
    function returns without processing or saving anything.
    """
    processor = DeepSeekProcessor()

    try:
        frame = pd.read_csv(CSV_PATH)
        absent = [name for name in ('Time', 'Content') if name not in frame.columns]
        if absent:
            raise ValueError(f"Missing required columns: {absent}")

        frame = frame.dropna(subset=['Content']).drop_duplicates(subset=['Content'])
        print(f"Successfully loaded {len(frame)} records")

    except Exception as e:
        print(f"Initialization failed: {str(e)}")
        return

    collected = []
    total = len(frame)

    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        pending = []
        for _, record in frame.iterrows():
            pending.append(pool.submit(process_row, processor, record))

        # Consume results in completion order, not submission order.
        finished_stream = concurrent.futures.as_completed(pending)
        for done_count, finished in enumerate(finished_stream, start=1):
            try:
                payload = finished.result()
                if payload:
                    collected.append(payload)
                print(f"Processed {done_count}/{total}", end='\r')
            except Exception as e:
                processor.error_handler.add_error("", "THREAD_ERROR", str(e))

    save_results(collected, processor.error_handler)
    
# ------------------ Result Persistence ------------------
def save_results(data: List[Dict], handler: EnhancedErrorHandler):
    """Write results, summary statistics and the error log to OUTPUT_JSON.

    Args:
        data: Successfully processed responses, each holding the three
            tables produced by the processor.
        handler: Error handler whose ``error_log`` is summarised and dumped.
    """
    def row_count(table_name):
        # Total number of rows of one table across all responses.
        return sum(len(entry.get(table_name, [])) for entry in data)

    event_total = row_count("event_table")
    interpretation_total = row_count("interpretability_table")
    prediction_total = row_count("economic_impact_table")

    # Flatten every economic-impact row once; reused for all statistics.
    impacts = [
        impact
        for entry in data
        for impact in entry.get("economic_impact_table", [])
    ]

    def bucket_size(belongs):
        # Number of impact rows whose confidence satisfies `belongs`.
        return sum(1 for impact in impacts if belongs(impact.get("confidence", 0)))

    top_indicators = Counter(impact["indicator"] for impact in impacts).most_common(5)
    confidence_distribution = {
        "0-0.3": bucket_size(lambda c: 0 <= c <= 0.3),
        "0.3-0.7": bucket_size(lambda c: 0.3 < c <= 0.7),
        "0.7-1": bucket_size(lambda c: 0.7 < c <= 1),
    }

    errors = handler.error_log
    attempts = len(data) + len(errors)
    if attempts > 0:
        success_rate = f"{len(data)/attempts:.1%}"
    else:
        success_rate = "0%"

    output = {
        "metadata": {
            "total_events": event_total,
            "total_interpretations": interpretation_total,
            "economic_impact": {
                "total_predictions": prediction_total,
                "common_indicators": top_indicators,
                "confidence_distribution": confidence_distribution,
            },
            "success_rate": success_rate,
            "error_stats": {
                "total_errors": len(errors),
                "error_types": dict(Counter(entry["error_type"] for entry in errors)),
            },
        },
        "data": data,
        "errors": errors,
    }

    with open(OUTPUT_JSON, 'w', encoding='utf-8') as sink:
        json.dump(output, sink, ensure_ascii=False, indent=2)

if __name__ == "__main__":
    # Script entry point: run the pipeline and report wall-clock duration.
    launched_at = time.time()
    main_process()
    elapsed = time.time() - launched_at
    print(f"\nTotal processing time: {elapsed:.2f}s")
    print(f"Results saved to: {OUTPUT_JSON}")

Successfully loaded 298 records
Processed 298/298
Total processing time: 3246.00s
Results saved to: enhanced_structured_events.json
