In [4]:
import pandas as pd
import numpy as np
import os
import time
import warnings
import sys

# =============================================================================
# 1. Configuration
# =============================================================================

# Silence all warnings so the console output stays readable.
warnings.filterwarnings('ignore')

# Angular windows to evaluate, in degrees (1 through 5 inclusive).
THRESHOLDS = list(range(1, 6))

# Root results directory; the layout follows the original script.
BASE_OUTPUT_DIR = '../../results/04_conj_enh_opp_sup/sg'

# Directory holding the cached 'ready_*.parquet' inputs.
CACHE_DIR = os.path.join(BASE_OUTPUT_DIR, 'cache_data')

# Destination of the exported table. The constant name says "EXCEL", but the
# path carries a .csv extension.
OUTPUT_EXCEL_FILE = os.path.join(BASE_OUTPUT_DIR, 'sg_781_raw_count_area.csv')

# =============================================================================
# 2. Worker Function
# =============================================================================

def worker_algo3_excel_export(args):
    """
    Compute raw conjunction/opposition statistics for a single body.

    A record counts as an event when the angular distance between the
    reference longitude (``sun_lons``) and the body longitude is within ``w``
    degrees of the target separation: 0 deg for 'Conjunction', 180 deg for
    'Opposition'. For every window and event type the number of events and
    the total and mean of ``sun_areas`` over those events are reported.

    Args:
        args (tuple): ``(body_name, body_lons, sun_lons, sun_areas, thresholds)``.
            ``body_lons``, ``sun_lons`` and ``sun_areas`` are equal-length
            array-likes (longitudes in degrees; values outside a single
            0-360 turn are wrapped). ``thresholds`` is an iterable of window
            sizes in degrees.

    Returns:
        list: One dict per (window, type) pair, ordered by window and, within
        a window, 'Opposition' before 'Conjunction'. Keys are 'Body',
        'Window', 'Type', 'Raw_Count' (int), 'Total_Area' and 'Avg_Area'
        (Python floats rounded to 2 decimals; 0.0 when there are no events).
    """
    body_name, body_lons, sun_lons, sun_areas, thresholds = args

    # Accept any array-like (lists included) without copying existing arrays.
    body_lons = np.asarray(body_lons)
    sun_lons = np.asarray(sun_lons)
    sun_areas = np.asarray(sun_areas)

    # The angular distances do not depend on the window, so compute them once.
    # Reducing modulo 360 keeps the fold below correct when longitudes are not
    # confined to a single turn; for differences already in [0, 360) it is an
    # exact no-op.
    raw_diff = np.mod(np.abs(sun_lons - body_lons), 360)
    distance_by_type = (
        # Distance from the 180 deg target.
        ('Opposition', np.abs(raw_diff - 180)),
        # Distance from the 0 deg target, folded into [0, 180].
        ('Conjunction', np.where(raw_diff > 180, 360 - raw_diff, raw_diff)),
    )

    results = []
    for w in thresholds:
        for etype, distance in distance_by_type:
            is_event = distance <= w
            k_obs = int(np.count_nonzero(is_event))

            if k_obs > 0:
                # Accumulate in float64: a float32 sum cannot hold two-decimal
                # precision once the total grows large, and a float32 scalar
                # does not round cleanly to 2 decimals.
                total_area = float(np.sum(sun_areas[is_event], dtype=np.float64))
                avg_area = total_area / k_obs
            else:
                total_area = 0.0
                avg_area = 0.0

            results.append({
                'Body': body_name,
                'Window': w,
                'Type': etype,
                'Raw_Count': k_obs,
                'Total_Area': round(total_area, 2),
                'Avg_Area': round(avg_area, 2)
            })

    return results

# =============================================================================
# 3. Main Execution Function
# =============================================================================

def run_extraction_to_excel():
    """
    Export raw event statistics for every cached stage to one CSV file.

    Reads each 'ready_<stage>.parquet' file in ``CACHE_DIR``, runs
    ``worker_algo3_excel_export`` for every body longitude column (columns
    ending in '_lon' that are not known metadata columns), tags each result
    with its stage name and writes the combined table to
    ``OUTPUT_EXCEL_FILE`` via ``DataFrame.to_csv``.

    Files that cannot be read, or that lack the required 'hme_lon' / 'area'
    columns, are reported and skipped. Progress and errors are printed to
    stdout.

    Returns:
        None
    """
    print("\n" + "=" * 60)
    print("Step: Exporting Raw Statistics (Count, Total Area, Avg Area)")
    print("=" * 60)

    # Check if cache directory exists
    if not os.path.exists(CACHE_DIR):
        print(f"[Error] Cache directory not found: {CACHE_DIR}")
        print("Please run Step 1 of the main script first to generate cache files.")
        return

    file_prefix = 'ready_'
    file_suffix = '.parquet'
    lon_suffix = '_lon'

    # os.listdir returns entries in arbitrary order; sort so the stage order
    # (and therefore the row order of the output) is reproducible.
    files = sorted(
        f for f in os.listdir(CACHE_DIR)
        if f.startswith(file_prefix) and f.endswith(file_suffix)
    )

    if not files:
        print("[Error] No 'ready_*.parquet' files found in cache.")
        return

    # Columns that end in '_lon' (or sit next to them) but are not bodies.
    meta_cols = {'date', 'hme_lon', 'area', 'ephem_idx_daily', 'Group', 'lat_lon', 'hg_lon', 'hgc_lon', 'lon', 'lat'}
    required_cols = ('hme_lon', 'area')

    all_results = []
    total_start_time = time.time()

    for f in files:
        # Strip only the leading prefix and trailing suffix
        # (e.g., 'ready_Daily.parquet' -> 'Daily'); str.replace would also
        # remove matching text in the middle of the name.
        stage_name = f[len(file_prefix):-len(file_suffix)]
        print(f"Processing Stage: {stage_name} ...")

        # Load Data
        file_path = os.path.join(CACHE_DIR, f)
        try:
            df = pd.read_parquet(file_path)
        except Exception as e:
            print(f"  [Error] Failed to read {f}: {e}")
            continue

        # Skip malformed files the same way unreadable ones are skipped,
        # instead of aborting the whole export with a KeyError.
        missing = [c for c in required_cols if c not in df.columns]
        if missing:
            print(f"  [Error] {f} is missing required column(s): {', '.join(missing)}")
            continue

        # Body columns end with '_lon' and are not metadata
        body_cols = [c for c in df.columns if c not in meta_cols and c.endswith(lon_suffix)]

        print(f"  - Bodies found: {len(body_cols)} | Total Records: {len(df)}")

        # Prepare numpy arrays (float32 for memory efficiency)
        sun_lons = df['hme_lon'].values.astype(np.float32)
        sun_areas = df['area'].values.astype(np.float32)

        # Single-core loop: avoids multiprocessing overhead for this export task
        for col in body_cols:
            body_data = df[col].values.astype(np.float32)

            # Drop only the trailing '_lon' to recover the body identifier.
            clean_body_id = col[:-len(lon_suffix)]
            args = (clean_body_id, body_data, sun_lons, sun_areas, THRESHOLDS)
            body_stats = worker_algo3_excel_export(args)

            # Append Stage info to results
            for stat in body_stats:
                stat['Stage'] = stage_name
                all_results.append(stat)

    print("-" * 60)
    print("Calculation finished. Generating Excel file...")

    if all_results:
        df_final = pd.DataFrame(all_results)

        # Reorder columns for better readability, keeping only those present
        cols_order = ['Stage', 'Body', 'Window', 'Type', 'Raw_Count', 'Total_Area', 'Avg_Area']
        cols_order = [c for c in cols_order if c in df_final.columns]
        df_final = df_final[cols_order]

        # Save
        try:
            df_final.to_csv(OUTPUT_EXCEL_FILE, index=False)
            print(f"[Success] File saved to: {OUTPUT_EXCEL_FILE}")
            print(f"Total Execution Time: {time.time() - total_start_time:.1f} seconds")

        except Exception as e:
            print(f"[Error] Failed to save file: {e}")

    else:
        print("[Warning] No results were generated. Please check your data.")

# Run the export only when this file is executed directly, not when imported.
if __name__ == '__main__':
    run_extraction_to_excel()


Step: Exporting Raw Statistics (Count, Total Area, Avg Area)
Processing Stage: All ...
  - Bodies found: 781 | Total Records: 256824
Processing Stage: Daily ...
  - Bodies found: 781 | Total Records: 8301
Processing Stage: Dissipation ...
  - Bodies found: 781 | Total Records: 27870
Processing Stage: Duration ...
  - Bodies found: 781 | Total Records: 75678
Processing Stage: Onset ...
  - Bodies found: 781 | Total Records: 33278
------------------------------------------------------------
Calculation finished. Generating Excel file...
[Success] File saved to: ../../results/04_conj_enh_opp_sup/sg\sg_781_raw_count_area.csv
Total Execution Time: 7.1 seconds
