In [4]:
import concurrent.futures
import hashlib
import json
import os
import threading
import time
from collections import Counter
from functools import partial
from typing import Dict, List, Union

import pandas as pd
import requests
from pydantic import BaseModel, Field, ValidationError

# ------------------ Configuration ------------------
# The API key is taken from the DEEPSEEK_API_KEY environment variable when it
# is set to a non-empty value, so the credential does not need to live in the
# source file.
# NOTE(security): the literal fallback is kept only so existing runs keep
# working unchanged. This key has been stored in plain text and should be
# rotated, after which the fallback should be deleted.
DEEPSEEK_API_KEY = (
    os.environ.get("DEEPSEEK_API_KEY")
    or "sk-02eec7cecae3429facd43a26fc0ab060"
)
CSV_PATH = "Code/cleaned_dataset.csv"  # input CSV; must contain 'Time' and 'Content' columns
OUTPUT_JSON = "enhanced_structured_events.json"  # destination of the combined results/errors report
API_URL = "https://api.deepseek.com/v1/chat/completions"
MODEL_NAME = "deepseek-chat"
MAX_WORKERS = 3  # size of the thread pool used to process rows concurrently
# ---------------------------------------------------

# ------------------ Pydantic Models ------------------
# One extracted event record, validated by pydantic.
#
# Every field is declared with ``Field(...)`` and therefore has no default:
# a response that omits any of them fails validation.
#
# NOTE(review): documented with comments rather than a class docstring on
# purpose. The JSON schema of these models is serialised into the LLM system
# prompt by DeepSeekProcessor.generate_payload, and pydantic is understood to
# copy a model docstring into that schema -- confirm before adding one.
class EventSchema(BaseModel):
    # NOTE(review): singular ``example`` here, plural ``examples`` on the
    # fields below -- confirm which keyword the installed pydantic expects.
    event_id: str = Field(..., example="E20231005_01")
    # Typed as a plain string; the ISO8601 format is only described, not enforced.
    timestamp: str = Field(..., description="ISO8601 timestamp")
    event_type: str = Field(..., examples=["Social Protest", "Policy Change", "M&A"])
    description: str = Field(..., description="Subject-Action-Object format")
    source: str = Field(..., examples=["News Media", "Government Report", "Social Media"])
    location: str = Field(..., examples=["China/Beijing", "USA/New York"])
    impact_industries: List[str] = Field(..., examples=["Finance", "Energy"])
    # Values longer than 150 characters are rejected by validation.
    summary: str = Field(..., max_length=150, description="Concise event summary")
    explanation: List[str] = Field(..., description="Step-by-step reasoning process")

# One interpretability record describing the impact of an event.
#
# All fields are required (declared with ``Field(...)``, no defaults).
# Kept docstring-free on purpose: the generated JSON schema is embedded in the
# LLM prompt, so schema-visible text should only change deliberately.
class InterpretabilitySchema(BaseModel):
    # Identifier of the event this record belongs to; the user prompt asks the
    # model to keep IDs consistent between the two tables, but nothing here
    # checks that it matches an EventSchema.event_id.
    event_id: str = Field(...)
    time_window: str = Field(..., examples=["T+1d", "T+7d"])
    # Accepts either a number or a label such as "High".
    causal_strength: Union[float, str] = Field(..., examples=[0.8, "High"])
    reasoning_path: str = Field(..., example="Policy Change → Regulatory Impact → Market Response")
    # Values longer than 150 characters are rejected by validation.
    impact_summary: str = Field(..., max_length=150, description="Impact summary")

# Top-level shape expected from the model: an event table plus an
# interpretability table. Each table may be either a single record or a list
# of records; DeepSeekProcessor.parse_response wraps a single dict into a
# one-element list before validating against this model.
class StructuredResponse(BaseModel):
    event_table: Union[List[EventSchema], EventSchema]
    interpretability_table: Union[List[InterpretabilitySchema], InterpretabilitySchema]

# ------------------ Enhanced Error Handler ------------------
class EnhancedErrorHandler:
    """Collects error records from concurrent workers into a single list.

    Attributes:
        error_log: List of recorded error dictionaries, in insertion order.
        lock: Lock guarding appends to ``error_log``.
    """

    def __init__(self):
        self.lock = threading.Lock()
        self.error_log = []

    def add_error(self, text: str, error_type: str, details: str = ""):
        """Record one error and return the stored entry.

        Args:
            text: The text being handled when the error occurred; only its
                first 500 characters are kept.
            error_type: Short category label for the error.
            details: Optional free-form description.

        Returns:
            The dictionary that was appended to ``error_log``.
        """
        record = dict(
            error_type=error_type,
            details=details,
            original_text=text[:500],
        )
        with self.lock:
            self.error_log.append(record)
        return record

# ------------------ Optimized Processor ------------------
class DeepSeekProcessor:
    """Sends text to the chat-completions API and validates the structured reply.

    Attributes:
        error_handler: Collector for API and parsing errors.
        session: ``requests.Session`` reused for every API call.
        system_prompt: System message template; ``{schema_template}`` is
            replaced with the JSON schema of ``StructuredResponse``.
        user_prompt: User message template; ``{text}`` is replaced with the
            text to analyse.
    """

    def __init__(self):
        self.error_handler = EnhancedErrorHandler()
        self.session = requests.Session()
        
        # Requirement 1 now states the ID format without an underscore after
        # "E" so that it agrees with its own example and the schema example.
        self.system_prompt = """You are a expert analyst. Generate structured data in ENGLISH with:
        {schema_template}
        
        Requirements:
        1. Event ID format: EYYYYMMDD_XX (e.g. E20231005_01)
        2. Time window format: T+[number][unit] (e.g. T+7d)
        3. Causal strength: 0-1 value preferred
        4. Impact industries: [Finance, Energy, Tech, Manufacturing, Real Estate]
        5. Summary: 1-sentence key points
        6. Explanation: 3-5 stepwise reasoning points"""
        
        self.user_prompt = """Analyze this text in Chinese and generate structured data in ENGLISH:
        {text}
        
        Output Requirements:
        - Event description MUST use Subject-Verb-Object format
        - Each event must have ≥1 interpretability record
        - Maintain ID consistency between tables
        - Summaries should capture core elements
        - Explanations show multi-factor analysis"""

    def generate_payload(self, text: str) -> dict:
        """Build the request body for one chat-completions call.

        Args:
            text: The text to analyse; inserted into the user prompt.

        Returns:
            A dictionary with the model name, the system and user messages,
            the sampling temperature and the token limit.
        """
        schema_template = json.dumps(StructuredResponse.schema(), indent=2)
        return {
            "model": MODEL_NAME,
            "messages": [
                {"role": "system", "content": self.system_prompt.format(schema_template=schema_template)},
                {"role": "user", "content": self.user_prompt.format(text=text)}
            ],
            "temperature": 0.3,
            "max_tokens": 2500
        }

    def call_api_with_retry(self, text: str, max_retries=3) -> Union[dict, None]:
        """Call the API for ``text``, retrying failed attempts with backoff.

        A failed attempt is followed by a ``2 ** attempt`` second pause before
        the next one. An error is recorded only when the final attempt fails,
        so a text that eventually succeeds leaves no entry in the error log
        and a text that fails contributes exactly one entry.

        Args:
            text: The text to analyse.
            max_retries: Maximum number of attempts.

        Returns:
            The validated response dictionary from ``parse_response`` (empty
            if the reply could not be parsed), or ``None`` if every attempt
            failed.
        """
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {DEEPSEEK_API_KEY}"
        }

        for attempt in range(max_retries):
            is_last_attempt = attempt >= max_retries - 1
            try:
                response = self.session.post(
                    API_URL,
                    headers=headers,
                    json=self.generate_payload(text),
                    timeout=60
                )
                response.raise_for_status()
                return self.parse_response(response.json()['choices'][0]['message']['content'])
            except requests.exceptions.Timeout:
                if is_last_attempt:
                    self.error_handler.add_error(text, "API_TIMEOUT", f"Max retries {max_retries}")
            except Exception as e:
                # Earlier failures are not logged: only the outcome of the
                # last attempt decides whether this text counts as an error.
                if is_last_attempt:
                    self.error_handler.add_error(text, "API_ERROR", str(e))

            if not is_last_attempt:
                time.sleep(2 ** attempt)
        return None

    def parse_response(self, response_text: str) -> Dict:
        """Decode and validate the model's reply.

        Markdown code-fence markers are removed, the remainder is decoded as
        JSON, single-record tables are wrapped into lists, and the result is
        validated against ``StructuredResponse``.

        Args:
            response_text: Raw message content returned by the API.

        Returns:
            The validated data as a dictionary, or an empty dictionary when
            the reply is not valid JSON, is not a JSON object, or fails
            validation. Each of those cases is recorded as ``PARSE_ERROR``.
        """
        try:
            clean_text = response_text.replace("```json", "").replace("```", "").strip()
            raw_data = json.loads(clean_text)

            # Valid JSON is not necessarily an object; a list or scalar has no
            # tables to normalise and cannot be expanded into keyword arguments.
            if not isinstance(raw_data, dict):
                self.error_handler.add_error(
                    response_text,
                    "PARSE_ERROR",
                    f"Expected a JSON object, got {type(raw_data).__name__}"
                )
                return {}

            # Normalize data structure: a lone record becomes a one-item list
            for table_name in ("event_table", "interpretability_table"):
                if isinstance(raw_data.get(table_name), dict):
                    raw_data[table_name] = [raw_data[table_name]]

            validated = StructuredResponse(**raw_data)
            return validated.dict()
        except (json.JSONDecodeError, ValidationError) as e:
            self.error_handler.add_error(response_text, "PARSE_ERROR", str(e))
            return {}

# ------------------ Processing Pipeline ------------------
def process_row(processor, row):
    """Analyse one input row and attach provenance metadata to the result.

    Args:
        processor: Object providing ``call_api_with_retry(text)`` and an
            ``error_handler`` with ``add_error``.
        row: Mapping-like row with a ``'Content'`` entry and an optional
            ``'Time'`` entry.

    Returns:
        The processor's response with an added ``"source_meta"`` entry, or an
        empty dictionary when the processor returned nothing or an exception
        occurred (the exception is recorded as ``PROCESS_ERROR``).
    """
    try:
        content = str(row['Content'])
        result = processor.call_api_with_retry(content)
        if result:
            # Add processing metadata
            digest = hashlib.sha256(content.encode()).hexdigest()
            result["source_meta"] = dict(
                original_time=row.get('Time', ''),
                text_hash=digest[:32],
                raw_text_snippet=content[:256],
            )
            return result
        return {}
    except Exception as exc:
        processor.error_handler.add_error(str(row), "PROCESS_ERROR", str(exc))
        return {}

def main_process():
    """Run the whole pipeline: load the CSV, process each row, save the output.

    Reads ``CSV_PATH``, checks that the ``Time`` and ``Content`` columns
    exist, drops rows whose ``Content`` is missing or duplicated, and submits
    one ``process_row`` task per remaining row to a thread pool of
    ``MAX_WORKERS`` workers. Non-empty results are collected in completion
    order and passed to ``save_results`` with the processor's error handler.

    If loading or validating the CSV fails, the error is printed and the
    function returns without processing or saving anything.
    """
    processor = DeepSeekProcessor()

    try:
        frame = pd.read_csv(CSV_PATH)
        absent = [name for name in ('Time', 'Content') if name not in frame.columns]
        if absent:
            raise ValueError(f"Missing required columns: {absent}")

        frame = frame.dropna(subset=['Content'])
        frame = frame.drop_duplicates(subset=['Content'])
        print(f"Successfully loaded {len(frame)} records")

    except Exception as load_error:
        print(f"Initialization failed: {str(load_error)}")
        return

    total = len(frame)
    results = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        pending = []
        for _, row in frame.iterrows():
            pending.append(pool.submit(process_row, processor, row))

        finished = concurrent.futures.as_completed(pending)
        for done_count, task in enumerate(finished, start=1):
            try:
                payload = task.result()
                if payload:
                    results.append(payload)
                # '\r' keeps the progress counter on a single console line
                print(f"Processed {done_count}/{total}", end='\r')
            except Exception as task_error:
                processor.error_handler.add_error("", "THREAD_ERROR", str(task_error))

    save_results(results, processor.error_handler)

def save_results(data: List[Dict], handler: EnhancedErrorHandler):
    """Write the results, error log and summary statistics to ``OUTPUT_JSON``.

    Args:
        data: Successful per-row results; each may contain ``"event_table"``
            and ``"interpretability_table"`` lists.
        handler: Error handler whose ``error_log`` is included in the report.

    The ``success_rate`` is the number of results divided by results plus
    logged errors, or ``"0%"`` when both are zero.
    """
    errors = handler.error_log
    attempted = len(data) + len(errors)
    success_rate = f"{len(data)/attempted:.1%}" if attempted > 0 else "0%"

    # Single pass over the log; keys appear in first-occurrence order.
    error_types = dict(Counter(err["error_type"] for err in errors))

    output = {
        "metadata": {
            "total_events": sum(len(d.get("event_table", [])) for d in data),
            "total_interpretations": sum(len(d.get("interpretability_table", [])) for d in data),
            "success_rate": success_rate,
            "error_stats": {
                "total_errors": len(errors),
                "error_types": error_types
            }
        },
        "data": data,
        "errors": errors
    }

    with open(OUTPUT_JSON, 'w', encoding='utf-8') as f:
        json.dump(output, f, ensure_ascii=False, indent=2)

# Script entry point: run the pipeline and report the wall-clock duration.
if __name__ == "__main__":
    started_at = time.time()
    main_process()
    elapsed = time.time() - started_at
    print(f"\nTotal processing time: {elapsed:.2f}s")
    print(f"Results saved to: {OUTPUT_JSON}")

Successfully loaded 3 records
Processed 3/3
Total processing time: 30.20s
Results saved to: enhanced_structured_events.json
