## Step 0: Global Configuration

In [1]:
import pandas as pd
import numpy as np
import os
import pickle
import time
import gc
import warnings
import importlib
from multiprocessing import Pool, cpu_count
import scipy.stats as stats  # for fast P-value calculation

# Optional dependency: algo_workers provides the worker functions used by the
# multi-core simulations. If it cannot be imported the name is bound to None so
# that later cells can test for it with a plain truthiness check.
try:
    algo_workers = importlib.import_module('algo_workers')
    # Reload so that edits made to the module on disk are picked up on re-run
    algo_workers = importlib.reload(algo_workers)
except ImportError:
    algo_workers = None

# =============================================================================
# 1. Global Configuration & Switches
# =============================================================================

# [Core Switch] True = Test Mode (fast, 100 simulations); False = Production Mode (rigorous, 1000/10000 simulations)
TEST_MODE = False

# The constants are assigned first and the banner is derived from them, so the
# printed numbers cannot drift away from the values the later steps really use.
if TEST_MODE:
    N_SIM_ALGO12 = 100
    N_SIM_ALGO3 = 100
    N_WORKERS = 1  # single process while debugging
    print(">>> [Mode] Test Mode")
    print(f"    - Algo 1/2 simulations: {N_SIM_ALGO12}")
    print(f"    - Algo 3 simulations: {N_SIM_ALGO3} (or using binomial distribution)")
    print(f"    - Workers: {N_WORKERS} (avoid debugging deadlocks)")
else:
    N_SIM_ALGO12 = 10000
    N_SIM_ALGO3 = 1000
    # Leave 2 cores for system to prevent freezing
    N_WORKERS = max(1, cpu_count() - 2)
    print(">>> [Mode] Production Mode")
    print(f"    - Algo 1/2 simulations: {N_SIM_ALGO12}")
    print(f"    - Algo 3 simulations: {N_SIM_ALGO3}")
    print(f"    - Workers: {N_WORKERS}")

# --- Locations (relative paths) ---
BASE_INPUT_DIR = '../../data/ready'
BASE_OUTPUT_DIR = '../../results/04_conj_enh_opp_sup/sf'
CACHE_DIR = os.path.join(BASE_OUTPUT_DIR, 'cache_data')

# --- Inputs ---
# Large ephemeris table (781 bodies) and the flare catalogue file name
EPHEMERIS_FILE = os.path.join(BASE_INPUT_DIR, '781_planets_dwarfs_asteroids_lonlat.parquet')
FILE_FLARE = 'flare_1975_2017.csv'

# --- Outputs: one CSV per algorithm, all inside BASE_OUTPUT_DIR ---
OUTPUT_FILE_ALGO1, OUTPUT_FILE_ALGO2, OUTPUT_FILE_ALGO3, OUTPUT_FILE_KUIPER = (
    os.path.join(BASE_OUTPUT_DIR, csv_name)
    for csv_name in (
        'sf_algo1_total_pairs.csv',
        'sf_algo2_at_least_one.csv',
        'sf_algo3_single_body_781.csv',
        'sf_algo_kuiper_test.csv',
    )
)

# --- Analysis parameters ---
# Longitude columns of the 8 major planets (used by Algo 1 & 2)
PLANET_COLS = [
    '199_lon', '299_lon', '399_lon', '499_lon',
    '599_lon', '699_lon', '799_lon', '899_lon',
]
# Window sizes passed as `w` to the event tests
THRESHOLDS = list(range(1, 6))

# Ensure directories exist. exist_ok=True makes re-running the cell a no-op and
# avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs(BASE_OUTPUT_DIR, exist_ok=True)
os.makedirs(CACHE_DIR, exist_ok=True)
# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# =============================================================================
# 2. Common Utility Functions
# =============================================================================

# Intensity conversion function
def convert_to_intensity(row):
    """Convert GOES flare class to relative intensity proxy.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) that may contain an
            'xray_class' entry such as 'M5.2'.

    Returns:
        multiplier(letter) * numeric level, where the letter multipliers are
        A=1, B=10, C=100, M=1000, X=10000. A missing or unparsable numeric
        part counts as 1.0. Returns NaN when the class is absent, null, empty,
        starts with an unknown letter, or the record cannot be read.
    """
    type_multiplier = {'A': 1, 'B': 10, 'C': 100, 'M': 1000, 'X': 10000}

    try:
        if 'xray_class' not in row or pd.isna(row['xray_class']):
            return np.nan
        raw = str(row['xray_class']).strip().upper()
    except Exception:
        # Best-effort contract: an unreadable record yields NaN. Only ordinary
        # errors are caught, so KeyboardInterrupt / SystemExit still propagate.
        return np.nan

    if not raw:
        return np.nan

    flare_type = raw[0]
    if flare_type not in type_multiplier:
        return np.nan

    level_text = raw[1:]
    try:
        level_val = float(level_text) if level_text else 1.0
    except ValueError:
        # Non-numeric suffix: fall back to the base level of the class
        level_val = 1.0

    return type_multiplier[flare_type] * level_val

# Classification function
def categorize_flare(row_val):
    """Group by first letter of xray_class.

    Args:
        row_val: The raw xray_class value.

    Returns:
        '<LETTER>-Class' when the first non-blank character is one of
        A/B/C/M/X (case-insensitive); 'Other' for non-strings, empty or
        blank strings, and any other leading character.
    """
    if not isinstance(row_val, str):
        return 'Other'

    # Strip first so the grouping agrees with convert_to_intensity, which also
    # strips; an empty result must not be indexed.
    label = row_val.strip()
    if not label:
        return 'Other'

    first_char = label[0].upper()
    if first_char in ('A', 'B', 'C', 'M', 'X'):
        return f"{first_char}-Class"
    return 'Other'

# Angle interpolation helper (handles the 0/360 degree wrap)

def interpolate_angle(angle1, angle2, fraction):
    """Interpolate between two angles in degrees along the shortest arc.

    Works element-wise on scalars or NumPy arrays. `fraction` = 0 gives
    `angle1`, `fraction` = 1 gives `angle2` (both modulo 360).

    Returns:
        The interpolated angle(s) in degrees, reduced modulo 360.
    """
    start_rad, end_rad = np.deg2rad(angle1), np.deg2rad(angle2)
    # Signed shortest rotation from start to end, folded into [-pi, pi)
    shortest_arc = (end_rad - start_rad + np.pi) % (2 * np.pi) - np.pi
    return np.degrees(start_rad + fraction * shortest_arc) % 360.0

# Marker line: reaching it means the whole configuration cell executed.
print("--- Configuration Loaded ---")


>>> [Mode] Production Mode
    - Algo 1/2 simulations: 10000
    - Algo 3 simulations: 1000
    - Workers: 30
--- Configuration Loaded ---


## Step 1: Flare Data Pre-processing (Single File Processing)

In [2]:
# =============================================================================
# Step 1: Flare Data Pre-processing (Single File Processing)
# =============================================================================

def step1_data_preparation_optimized():
    """Build the cached inputs used by the later steps.

    Reads the ephemeris (EPHEMERIS_FILE) and the flare catalogue
    (BASE_INPUT_DIR/FILE_FLARE) and writes into CACHE_DIR:

      * ephem_matrix_8p.npy     - daily longitudes of PLANET_COLS (float64)
      * kepler_prob_maps.pkl    - per-planet 360-bin longitude density histograms
      * ready_Flare_All.parquet - cleaned flare records plus every '*_lon'
        ephemeris column interpolated to the flare start time

    Returns:
        None. Prints an error and returns early if an input file is missing.
    """
    print("\n" + "=" * 60)
    print("Step 1: Flare Data Pre-processing")
    print("=" * 60)

    start_time = time.time()

    # --- 1. Load the complete ephemeris ---
    print(f"Loading FULL Ephemeris from: {EPHEMERIS_FILE} ...")
    if not os.path.exists(EPHEMERIS_FILE):
        print("Error: Ephemeris file not found!")
        return

    df_ephem = pd.read_parquet(EPHEMERIS_FILE)
    if 'date' in df_ephem.columns:
        df_ephem['date'] = pd.to_datetime(df_ephem['date'])
        df_ephem = df_ephem.set_index('date')

    # One row per calendar day, in chronological order
    df_ephem.index = df_ephem.index.normalize()
    df_ephem = df_ephem[~df_ephem.index.duplicated(keep='first')].sort_index()

    all_body_cols = [c for c in df_ephem.columns if str(c).endswith('_lon')]
    ephem_matrix_all = df_ephem[all_body_cols].values.astype(np.float32)

    # Save cache (needed for Algo 1/2)
    path_matrix_8p = os.path.join(CACHE_DIR, 'ephem_matrix_8p.npy')
    np.save(path_matrix_8p, df_ephem[PLANET_COLS].values.astype(np.float64))

    print("Calculating Kepler probability maps...")
    # density=True with 1-degree bins gives a probability density per degree
    prob_maps = {
        col: np.histogram(df_ephem[col], bins=360, range=(0, 360), density=True)[0]
        for col in PLANET_COLS
    }
    with open(os.path.join(CACHE_DIR, 'kepler_prob_maps.pkl'), 'wb') as f:
        pickle.dump(prob_maps, f)

    # --- 2. Load and clean the flare catalogue ---
    path_flare = os.path.join(BASE_INPUT_DIR, FILE_FLARE)
    print(f"\nProcessing Flare dataset: {path_flare} ...")

    if not os.path.exists(path_flare):
        print("Error: file not found!")
        return

    df_flare = pd.read_csv(path_flare)

    # [Key Mapping 1] Time
    df_flare['date'] = pd.to_datetime(df_flare['datetime_start'])

    # [Key Mapping 2] Intensity proxy, stored in the 'area' column
    print("  Calculating intensity proxy from xray_class...")
    df_flare['area'] = df_flare.apply(convert_to_intensity, axis=1)

    valid_area_count = df_flare['area'].notna().sum()
    print(f"  Converted {valid_area_count}/{len(df_flare)} intensity records.")

    # Each cleaning stage runs exactly once and returns a new frame instead of
    # mutating a filtered slice in place.
    df_flare = df_flare.dropna(subset=['area', 'hme_lon', 'date'])

    # Keep only flares inside the ephemeris time span
    min_date, max_date = df_ephem.index.min(), df_ephem.index.max()
    in_range = (df_flare['date'] >= min_date) & (df_flare['date'] <= max_date)
    df_flare = df_flare[in_range].copy()

    # Generate Group (using xray_class) and discard unclassifiable rows
    df_flare['Group'] = df_flare['xray_class'].apply(categorize_flare)
    df_flare = df_flare[df_flare['Group'] != 'Other']

    # Deduplication: identical time, class and longitude are treated as the
    # same event; the first occurrence is kept.
    len_before = len(df_flare)
    df_flare = df_flare.drop_duplicates(subset=['date', 'xray_class', 'hme_lon'])
    len_after = len(df_flare)
    n_dropped = len_before - len_after

    print(f"  [Data Clean] Removed {n_dropped} duplicate records.")
    if n_dropped > 0:
        print(f"               (Exact/Logical duplicates dropped from {len_before} -> {len_after})")

    # --- 3. Interpolate every body to the flare start time ---
    print(f"  Interpolating {len(df_flare)} records...")

    day_t = df_flare['date'].dt.normalize()
    fraction = (df_flare['date'] - day_t).dt.total_seconds() / 86400.0
    idx_t = np.asarray(df_ephem.index.searchsorted(day_t))

    # A following ephemeris row must exist to interpolate towards
    valid_mask = idx_t < len(df_ephem) - 1

    # searchsorted returns an insertion point, so a day that is absent from the
    # ephemeris would silently map to the next available day. Require an exact
    # match so the day fraction is applied between the correct two rows.
    on_grid = np.zeros(len(idx_t), dtype=bool)
    on_grid[valid_mask] = (
        df_ephem.index.values[idx_t[valid_mask]] == day_t.values[valid_mask]
    )
    n_off_grid = int((valid_mask & ~on_grid).sum())
    if n_off_grid:
        print(f"  [Data Clean] Dropped {n_off_grid} records whose day is missing from the ephemeris.")
    valid_mask &= on_grid

    if not valid_mask.all():
        df_flare = df_flare[valid_mask]
        idx_t = idx_t[valid_mask]
        fraction = fraction[valid_mask]

    pos_t = ephem_matrix_all[idx_t]
    pos_t_plus_1 = ephem_matrix_all[idx_t + 1]
    fraction_vals = fraction.values[:, np.newaxis].astype(np.float32)

    interpolated_matrix = interpolate_angle(pos_t, pos_t_plus_1, fraction_vals)

    # --- 4. Assemble and save (flare columns + daily index + body longitudes) ---
    df_result = df_flare.copy()
    df_result['ephem_idx_daily'] = idx_t

    df_bodies = pd.DataFrame(interpolated_matrix, columns=all_body_cols, index=df_result.index)
    df_final = pd.concat([df_result, df_bodies], axis=1)

    save_path = os.path.join(CACHE_DIR, 'ready_Flare_All.parquet')
    df_final.to_parquet(save_path)
    print(f"  Saved {len(df_final)} flare records to: {save_path}")
    print(f"Step 1 Completed. Time: {time.time() - start_time:.1f}s")

if __name__ == '__main__':
    step1_data_preparation_optimized()



Step 1: Flare Data Pre-processing
Loading FULL Ephemeris from: ../../data/ready/781_planets_dwarfs_asteroids_lonlat.parquet ...
Calculating Kepler probability maps...

Processing Flare dataset: ../../data/ready/flare_1975_2017.csv ...
  Calculating intensity proxy from xray_class...
  Converted 39039/39039 intensity records.
  [Data Clean] Removed 5 duplicate records.
               (Exact/Logical duplicates dropped from 39039 -> 39034)
  Interpolating 39034 records...
  Saved 39034 flare records to: ../../results/04_conj_enh_opp_sup/sf/cache_data/ready_Flare_All.parquet
Step 1 Completed. Time: 2.8s


## Step 2: Algorithm 1 (Total Pairs Method)

In [3]:
# =============================================================================
# Step 2: Algorithm 1 (Total Pairs Method)
# =============================================================================

def step2_run_algo1(seed=None):
    """Run Algorithm 1 (total Sun-planet pair count) for every cached stage.

    For each 'ready_*.parquet' file in CACHE_DIR, each flare group (plus
    'Total'), each window in THRESHOLDS and each event type, this counts the
    observed events, compares the count with N_SIM_ALGO12 simulated counts from
    algo_workers.cts_worker_algo1, and appends one row to OUTPUT_FILE_ALGO1.

    Args:
        seed: Optional integer. When given, the per-simulation seeds are drawn
            from a dedicated RandomState so the seed sequence is repeatable;
            when None (default) the global NumPy RNG is used.

    Returns:
        None. Prints an error and returns early if the Step 1 cache is missing.
    """
    print("\n" + "=" * 60)
    print(f"Step 2: Running Algo 1 | Sim: {N_SIM_ALGO12} | Workers: {N_WORKERS}")
    print("=" * 60)

    # Load the cache first: if Step 1 has not been run we must bail out
    # before deleting the results of a previous successful run.
    try:
        # Daily position matrix for 8 major planets (for CTS fast lookup)
        ephem_matrix_daily = np.load(os.path.join(CACHE_DIR, 'ephem_matrix_8p.npy'))
        # NOTE: pickle must only be used on trusted files; this one is written by Step 1.
        with open(os.path.join(CACHE_DIR, 'kepler_prob_maps.pkl'), 'rb') as f:
            prob_maps = pickle.load(f)
    except FileNotFoundError:
        print("Error: Cache files not found. Please run Step 1 first.")
        return

    # Prevent duplicate appends from repeated Jupyter runs
    if os.path.exists(OUTPUT_FILE_ALGO1):
        os.remove(OUTPUT_FILE_ALGO1)
        print(f"Warning: Removed existing output file {OUTPUT_FILE_ALGO1} to avoid duplicates.")

    # Sorted so that the stage order (and the seed sequence) is deterministic
    files = sorted(f for f in os.listdir(CACHE_DIR) if f.startswith('ready_') and f.endswith('.parquet'))

    if not algo_workers:
        # Said once; without the worker module the simulated counts are a
        # placeholder equal to the observed count.
        print("Warning: algo_workers.py missing, skipping simulations.")

    rng = np.random if seed is None else np.random.RandomState(seed)

    # One pool for the whole run instead of one per window/type combination
    pool = Pool(N_WORKERS) if (N_WORKERS > 1 and algo_workers) else None

    total_start_time = time.time()
    try:
        for f in files:
            stage_name = f.replace('ready_', '').replace('.parquet', '')
            print(f"Processing Stage: {stage_name} ...")

            # Read data (only need 8 major planet columns)
            cols_to_load = ['hme_lon', 'ephem_idx_daily', 'Group'] + PLANET_COLS
            df = pd.read_parquet(os.path.join(CACHE_DIR, f), columns=cols_to_load)

            groups = sorted(df['Group'].unique(), key=lambda x: ('BCMX'.find(x[0]) if x[0] in 'BCMX' else 99, x))
            groups.append('Total')

            results_buffer = []
            for group in groups:
                subset = df if group == 'Total' else df[df['Group'] == group]
                if subset.empty:
                    continue

                sun_lons = subset['hme_lon'].values.astype(np.float64)
                obs_planets_interp = subset[PLANET_COLS].values.astype(np.float64)  # interpolated positions
                sun_idxs_daily = subset['ephem_idx_daily'].values.astype(int)       # integer day indices
                n_recs = len(subset)
                # 1-degree bin of each record's longitude (same for every window/type)
                lon_indices = np.floor(sun_lons).astype(int) % 360

                print(f"  Group: {group} (N={n_recs})")

                for w in THRESHOLDS:
                    for etype in ['Opposition', 'Conjunction']:

                        # --- 1. Observed value (k_obs) ---
                        if algo_workers:
                            k_obs = algo_workers.count_events_vectorized(sun_lons, obs_planets_interp, w, etype)
                        else:
                            # Inline fallback logic
                            sep = np.abs(sun_lons[:, None] - obs_planets_interp)
                            if etype == 'Conjunction':
                                sep = np.where(sep > 180, 360 - sep, sep)
                            else:
                                sep = np.abs(sep - 180)
                            k_obs = np.sum(sep <= w)

                        # --- 2. Simulated values (k_sims) ---
                        if algo_workers:
                            seeds = rng.randint(0, 1000000000, N_SIM_ALGO12)
                            args = [(s, sun_lons, ephem_matrix_daily, sun_idxs_daily, w, etype) for s in seeds]
                            if pool is not None:
                                k_sims = pool.starmap(algo_workers.cts_worker_algo1, args)
                            else:
                                k_sims = [algo_workers.cts_worker_algo1(*a) for a in args]
                        else:
                            k_sims = [k_obs] * N_SIM_ALGO12
                        k_sims = np.array(k_sims)

                        # --- 3. Statistics: two-tailed p-value + effect direction ---
                        p_left = (np.sum(k_sims <= k_obs) + 1) / (N_SIM_ALGO12 + 1)   # suppression tail
                        p_right = (np.sum(k_sims >= k_obs) + 1) / (N_SIM_ALGO12 + 1)  # enhancement tail
                        p_val = min(2 * min(p_left, p_right), 1.0)

                        sim_mean, sim_std = k_sims.mean(), k_sims.std()
                        effect = 'Suppression' if k_obs < sim_mean else 'Enhancement'
                        z_score = 0 if sim_std == 0 else (k_obs - sim_mean) / sim_std

                        # --- 4. Theoretical baseline from the Kepler probability maps ---
                        target_indices = (lon_indices + 180) % 360 if etype == 'Opposition' else lon_indices
                        k_exp = 0
                        for col in PLANET_COLS:
                            k_exp += np.sum(prob_maps[col][target_indices] * (2 * w))

                        ratio = (k_obs / k_exp * 100) if k_exp > 0 else 0

                        results_buffer.append({
                            'Stage': stage_name, 'Group': group, 'Window': w, 'Type': etype,
                            'N_Records': n_recs, 'k_obs': k_obs, 'k_exp': round(k_exp, 2),
                            'Ratio': round(ratio, 2),
                            'p_val': p_val,
                            'Z_score': round(z_score, 2),
                            'Effect': effect
                        })

            # Stage save
            if results_buffer:
                new_df = pd.DataFrame(results_buffer)
                write_header = not os.path.exists(OUTPUT_FILE_ALGO1)
                new_df.to_csv(OUTPUT_FILE_ALGO1, mode='a', header=write_header, index=False)
    finally:
        if pool is not None:
            pool.terminate()
            pool.join()

    print(f"Step 2 Completed. Time: {time.time()-total_start_time:.1f}s")

if __name__ == '__main__':
    step2_run_algo1()



Step 2: Running Algo 1 | Sim: 10000 | Workers: 30
Processing Stage: Flare_All ...
  Group: B-Class (N=7127)
  Group: C-Class (N=26654)
  Group: M-Class (N=4824)
  Group: X-Class (N=429)
  Group: Total (N=39034)
Step 2 Completed. Time: 106.4s


## Step 3: Algorithm 2 (Trigger Mechanism - At Least One)

In [4]:
# =============================================================================
# Step 3: Algorithm 2 (Trigger Mechanism - At Least One)
# =============================================================================

def step3_run_algo2(seed=None):
    """Run Algorithm 2 (records with at least one planet in the window).

    For each 'ready_*.parquet' file in CACHE_DIR, each flare group (plus
    'Total'), each window in THRESHOLDS and each event type, this counts the
    records with at least one event, compares the count with N_SIM_ALGO12
    simulated counts from algo_workers.cts_worker_algo2, and appends one row to
    OUTPUT_FILE_ALGO2.

    Args:
        seed: Optional integer. When given, the per-simulation seeds are drawn
            from a dedicated RandomState so the seed sequence is repeatable;
            when None (default) the global NumPy RNG is used.

    Returns:
        None. Prints an error and returns early if the Step 1 cache is missing.
    """
    print("\n" + "=" * 60)
    print(f"Step 3: Running Algo 2 | Sim: {N_SIM_ALGO12} | Workers: {N_WORKERS}")
    print("=" * 60)

    # Load the cache first: if Step 1 has not been run we must bail out
    # before deleting the results of a previous successful run.
    try:
        ephem_matrix_daily = np.load(os.path.join(CACHE_DIR, 'ephem_matrix_8p.npy'))
        # NOTE: pickle must only be used on trusted files; this one is written by Step 1.
        with open(os.path.join(CACHE_DIR, 'kepler_prob_maps.pkl'), 'rb') as f:
            prob_maps = pickle.load(f)
    except FileNotFoundError:
        print("Error: Cache files not found. Please run Step 1 first.")
        return

    # Prevent duplicate appends from repeated Jupyter runs (for Algo 2 file)
    if os.path.exists(OUTPUT_FILE_ALGO2):
        os.remove(OUTPUT_FILE_ALGO2)
        print(f"Warning: Removed existing output file {OUTPUT_FILE_ALGO2} to avoid duplicates.")

    # Sorted so that the stage order (and the seed sequence) is deterministic
    files = sorted(f for f in os.listdir(CACHE_DIR) if f.startswith('ready_') and f.endswith('.parquet'))

    if not algo_workers:
        # Without the worker module the simulated counts are a placeholder
        # equal to the observed count.
        print("Warning: algo_workers.py missing, skipping simulations.")

    rng = np.random if seed is None else np.random.RandomState(seed)

    # One pool for the whole run instead of one per window/type combination
    pool = Pool(N_WORKERS) if (N_WORKERS > 1 and algo_workers) else None

    total_start_time = time.time()
    try:
        for f in files:
            stage_name = f.replace('ready_', '').replace('.parquet', '')
            print(f"Processing Stage: {stage_name} ...")

            cols_to_load = ['hme_lon', 'ephem_idx_daily', 'Group'] + PLANET_COLS
            df = pd.read_parquet(os.path.join(CACHE_DIR, f), columns=cols_to_load)

            groups = sorted(df['Group'].unique(), key=lambda x: ('BCMX'.find(x[0]) if x[0] in 'BCMX' else 99, x))
            groups.append('Total')

            results_buffer = []
            for group in groups:
                subset = df if group == 'Total' else df[df['Group'] == group]
                if subset.empty:
                    continue

                sun_lons = subset['hme_lon'].values.astype(np.float64)
                obs_planets_interp = subset[PLANET_COLS].values.astype(np.float64)
                sun_idxs_daily = subset['ephem_idx_daily'].values.astype(int)
                n_recs = len(subset)
                # 1-degree bin of each record's longitude (same for every window/type)
                lon_indices = np.floor(sun_lons).astype(int) % 360

                print(f"  Group: {group} (N={n_recs})")

                for w in THRESHOLDS:
                    for etype in ['Opposition', 'Conjunction']:

                        # --- 1. Observed value (k_obs) ---
                        if algo_workers:
                            k_obs = algo_workers.count_events_at_least_once(sun_lons, obs_planets_interp, w, etype)
                        else:
                            # Inline fallback
                            sep = np.abs(sun_lons[:, None] - obs_planets_interp)
                            if etype == 'Conjunction':
                                sep = np.where(sep > 180, 360 - sep, sep)
                            else:
                                sep = np.abs(sep - 180)
                            k_obs = np.sum(np.any(sep <= w, axis=1))

                        # --- 2. Simulated values (k_sims) ---
                        if algo_workers:
                            seeds = rng.randint(0, 1000000000, N_SIM_ALGO12)
                            args = [(s, sun_lons, ephem_matrix_daily, sun_idxs_daily, w, etype) for s in seeds]
                            if pool is not None:
                                k_sims = pool.starmap(algo_workers.cts_worker_algo2, args)
                            else:
                                k_sims = [algo_workers.cts_worker_algo2(*a) for a in args]
                        else:
                            k_sims = [k_obs] * N_SIM_ALGO12
                        k_sims = np.array(k_sims)

                        # --- 3. Statistics: two-tailed p-value + effect direction ---
                        p_left = (np.sum(k_sims <= k_obs) + 1) / (N_SIM_ALGO12 + 1)   # suppression tail
                        p_right = (np.sum(k_sims >= k_obs) + 1) / (N_SIM_ALGO12 + 1)  # enhancement tail
                        p_val = min(2 * min(p_left, p_right), 1.0)

                        sim_mean, sim_std = k_sims.mean(), k_sims.std()
                        effect = 'Suppression' if k_obs < sim_mean else 'Enhancement'
                        z_score = 0 if sim_std == 0 else (k_obs - sim_mean) / sim_std

                        # --- 4. Theoretical baseline (Prob At Least One) ---
                        target_indices = (lon_indices + 180) % 360 if etype == 'Opposition' else lon_indices
                        p_mat = np.column_stack(
                            [prob_maps[col][target_indices] for col in PLANET_COLS]
                        ) * (2 * w)
                        # density * window width is only an approximation and can
                        # exceed 1 for a narrow distribution; cap it so that
                        # (1 - p) stays a valid probability.
                        p_mat = np.clip(p_mat, 0.0, 1.0)

                        # P(At Least One) = 1 - Prod(1 - P_i)
                        p_at_least_one = 1.0 - np.prod(1.0 - p_mat, axis=1)
                        k_exp = np.sum(p_at_least_one)

                        ratio = (k_obs / k_exp * 100) if k_exp > 0 else 0

                        results_buffer.append({
                            'Stage': stage_name, 'Group': group, 'Window': w, 'Type': etype,
                            'N_Records': n_recs, 'k_obs': k_obs, 'k_exp': round(k_exp, 2),
                            'Ratio': round(ratio, 2),
                            'p_val': p_val,
                            'Z_score': round(z_score, 2),
                            'Effect': effect
                        })

            # Stage save
            if results_buffer:
                new_df = pd.DataFrame(results_buffer)
                write_header = not os.path.exists(OUTPUT_FILE_ALGO2)
                new_df.to_csv(OUTPUT_FILE_ALGO2, mode='a', header=write_header, index=False)
    finally:
        if pool is not None:
            pool.terminate()
            pool.join()

    print(f"Step 3 Completed. Time: {time.time()-total_start_time:.1f}s")

if __name__ == '__main__':
    step3_run_algo2()



Step 3: Running Algo 2 | Sim: 10000 | Workers: 30
Processing Stage: Flare_All ...
  Group: B-Class (N=7127)
  Group: C-Class (N=26654)
  Group: M-Class (N=4824)
  Group: X-Class (N=429)
  Group: Total (N=39034)
Step 3 Completed. Time: 109.8s


## Step 4: Algorithm 3 (Single Body Complete Version) 

In [5]:
# =============================================================================
# Step 4: Algorithm 3 (Single Body Complete Version) - Frequency + Amplitude + P-value
# =============================================================================

def worker_algo3_final(args):
    """Single-body conjunction/opposition statistics for one body.

    Args:
        args: Tuple (body_name, body_lons, sun_lons, sun_areas, thresholds,
            n_sims). body_lons, sun_lons and sun_areas are equal-length arrays
            with one entry per record; thresholds is an iterable of windows.
            n_sims is accepted for interface compatibility but is not used:
            significance comes from a binomial approximation, not a simulation.

    Returns:
        A list of dicts, one per (window, event type) in the order
        Opposition, Conjunction for each window, with keys Body, Window, Type,
        k_obs, k_exp, Ratio_Freq, Avg_Area, Ratio_Amp, p_val and Effect.
    """
    body_name, body_lons, sun_lons, sun_areas, thresholds, n_sims = args
    results = []
    n_recs = len(sun_lons)

    # Global average area (denominator of the amplitude ratio)
    global_avg_area = np.mean(sun_areas) if n_recs > 0 else 0

    # Longitude density of this body per 1-degree bin (density=True, so the
    # bins sum to 1 when every value lies in [0, 360]).
    # NOTE(review): the histogram is built from the body_lons passed in, i.e.
    # the same sample that is being tested - confirm this baseline is intended.
    hist_prob, _ = np.histogram(body_lons, bins=360, range=(0, 360), density=True)

    # 1-degree bin of each record's longitude, and of the opposite point
    sun_idx_conj = np.floor(sun_lons).astype(int) % 360
    sun_idx_opp = (sun_idx_conj + 180) % 360

    # Distances and density lookups do not depend on the window; compute once.
    raw_sep = np.abs(sun_lons - body_lons)
    per_type = (
        ('Opposition', np.abs(raw_sep - 180), hist_prob[sun_idx_opp]),
        ('Conjunction', np.where(raw_sep > 180, 360 - raw_sep, raw_sep), hist_prob[sun_idx_conj]),
    )

    for w in thresholds:
        for etype, dist, density in per_type:
            # --- A. Observed events ---
            is_event = (dist <= w)
            k_obs = np.sum(is_event)

            # --- B. Expected count from the density ---
            # density * (2*w) approximates the probability of falling inside
            # the window. For a body confined to a narrow longitude range it
            # can exceed 1, which would make the binomial p-value NaN, so cap
            # each record's probability at 1.
            window_probs = np.minimum(density * (2 * w), 1.0)
            k_exp = np.sum(window_probs)

            ratio_freq = (k_obs / k_exp * 100) if k_exp > 0 else 0

            # --- C. Amplitude indicator ---
            if k_obs > 0:
                event_avg_area = np.mean(sun_areas[is_event])
                ratio_amp = (event_avg_area / global_avg_area * 100) if global_avg_area > 0 else 0
            else:
                event_avg_area = 0
                ratio_amp = 0

            # --- D. Two-tailed significance ---
            # The per-record probability varies, so the binomial with the
            # average probability is an approximation.
            p_avg = k_exp / n_recs if n_recs > 0 else 0

            p_left = stats.binom.cdf(k_obs, n_recs, p_avg)
            p_right = stats.binom.sf(k_obs - 1, n_recs, p_avg)
            p_val = 2 * min(p_left, p_right)
            if p_val > 1.0:
                p_val = 1.0

            effect = 'Suppression' if k_obs < k_exp else 'Enhancement'

            results.append({
                'Body': body_name,
                'Window': w,
                'Type': etype,
                'k_obs': k_obs,
                'k_exp': round(k_exp, 2),
                'Ratio_Freq': round(ratio_freq, 2),
                'Avg_Area': round(event_avg_area, 2),
                'Ratio_Amp': round(ratio_amp, 2),
                'p_val': p_val,
                'Effect': effect
            })
    return results

def step4_run_algo3_final():
    """Run Algorithm 3 (single-body statistics) for every body in every stage.

    For each 'ready_*.parquet' file in CACHE_DIR, every column ending in
    '_lon' that is not listed as metadata is passed to worker_algo3_final
    together with the record longitudes ('hme_lon') and the intensity proxy
    ('area'). Results are appended to OUTPUT_FILE_ALGO3 with a 'Stage' column.

    Execution is deliberately serial, in the main process.

    Returns:
        None.
    """
    print("\n" + "=" * 60)
    print("Step 4: Running Algo 3 (Final) - FORCE SINGLE CORE MODE")
    print("=" * 60)

    # Remove existing output file to prevent duplicate appending from previous runs
    if os.path.exists(OUTPUT_FILE_ALGO3):
        os.remove(OUTPUT_FILE_ALGO3)
        print(f"Warning: Removed existing output file {OUTPUT_FILE_ALGO3} to avoid duplicates.")

    # Sorted so that the stage order in the output is deterministic
    files = sorted(f for f in os.listdir(CACHE_DIR) if f.startswith('ready_') and f.endswith('.parquet'))

    total_start = time.time()

    for f in files:
        stage_name = f.replace('ready_', '').replace('.parquet', '')
        print(f"\nProcessing Stage: {stage_name} ...")

        df = pd.read_parquet(os.path.join(CACHE_DIR, f))

        # Body columns = every '*_lon' column that is not record metadata
        meta_cols = {'date', 'hme_lon', 'area', 'ephem_idx_daily', 'Group', 'lat_lon', 'hg_lon', 'hgc_lon', 'lon', 'lat'}
        body_cols = [c for c in df.columns if c not in meta_cols and c.endswith('_lon')]

        print(f"  Analyzing {len(body_cols)} bodies against {len(df)} records...")

        sun_lons = df['hme_lon'].values.astype(np.float32)
        sun_areas = df['area'].values.astype(np.float32)

        print("  [Mode] Executing in Serial Mode (No Multiprocessing)...")

        # Build each task just before it is consumed, so only one body's
        # float32 copy is alive at a time instead of one per body.
        results_flat = []
        for col in body_cols:
            task = (col, df[col].values.astype(np.float32), sun_lons, sun_areas, THRESHOLDS, N_SIM_ALGO3)
            results_flat.extend(worker_algo3_final(task))

        print(f"  Processed {len(body_cols)}/{len(body_cols)} bodies. Done.")

        # Save results to CSV
        if results_flat:
            df_res = pd.DataFrame(results_flat)
            df_res['Stage'] = stage_name

            # Write header only if file does not exist
            hdr = not os.path.exists(OUTPUT_FILE_ALGO3)
            df_res.to_csv(OUTPUT_FILE_ALGO3, mode='a', header=hdr, index=False)
            print(f"  Saved {len(df_res)} rows to {OUTPUT_FILE_ALGO3}")

    print(f"Step 4 Completed. Time: {time.time() - total_start:.1f}s")

if __name__ == '__main__':
    step4_run_algo3_final()



Step 4: Running Algo 3 (Final) - FORCE SINGLE CORE MODE

Processing Stage: Flare_All ...
  Analyzing 781 bodies against 39034 records...
  [Mode] Executing in Serial Mode (No Multiprocessing)...
  Processed 781/781 bodies. Done.
  Saved 7810 rows to ../../results/04_conj_enh_opp_sup/sf/sf_algo3_single_body_781.csv
Step 4 Completed. Time: 1.3s


## Step 5: Physical Trend Analysis

In [6]:
import pandas as pd
import numpy as np
import os
from statsmodels.stats.multitest import multipletests

# =============================================================================
# Configuration Paths (reuse the paths defined in Step 0: Global Configuration)
# =============================================================================
# This cell depends on BASE_OUTPUT_DIR and OUTPUT_FILE_ALGO1/2/3 already existing
# in the kernel, i.e. the Step 0 configuration cell must have been run first.
# Aliasing them here (instead of re-typing the paths) keeps paths consistent.
BASE_DIR = BASE_OUTPUT_DIR 

# Note: The actual file names (e.g., prefix 'sf_' or 'sg_') are set by the
# OUTPUT_FILE_ALGO1/2/3 definitions in Step 0; change them there, not here.
FILE_ALGO1 = OUTPUT_FILE_ALGO1
FILE_ALGO2 = OUTPUT_FILE_ALGO2
FILE_ALGO3 = OUTPUT_FILE_ALGO3

def analyze_physics_trend(csv_file, algo_name):
    """
    Print a physical-trend report for an Algo 1 / Algo 2 result CSV.

    For Algo 1 & 2 no FDR correction is applied (treated as an N=1 sensitivity
    analysis). Instead the report checks whether the ratios follow the expected
    physical direction (Conjunction-Enhancement, Opposition-Suppression).

    Each CSV row is rendered as "Ratio% (p)" with '*' for raw p < 0.05 and
    '**' for raw p < 0.01. A significant row whose direction contradicts the
    expectation (Conjunction <= 100 or Opposition >= 100) is rendered as
    "Ratio%(Anomalous*)". The cells are pivoted into Group/Type rows by Window
    columns and printed.

    Parameters
    ----------
    csv_file : str
        Path to a CSV with columns 'Group', 'Type', 'Window', 'Ratio', 'p_val'.
    algo_name : str
        Label shown in the report header.

    Returns
    -------
    None
        The report is printed. A missing file, missing columns or absent group
        labels are reported with a message instead of raising.
    """
    if not os.path.exists(csv_file):
        print(f"File not found: {csv_file}")
        return

    print(f"\n{'='*80}")
    print(f"[{algo_name}] Physical Trend Verification Report (No FDR Penalty)")
    print("Core Logic: Check ratio trends for different flare classes (C/M/X) within 1-5 degree windows.")
    print(f"{'='*80}")

    df = pd.read_csv(csv_file)

    # Fail with a readable message rather than a KeyError deep inside apply/pivot.
    required_cols = ['Group', 'Type', 'Window', 'Ratio', 'p_val']
    missing_cols = [c for c in required_cols if c not in df.columns]
    if missing_cols:
        print(f"Missing required columns in {csv_file}: {missing_cols}")
        return

    # 1. Make sure there is at least one labelled group to report on.
    #    NaN labels are ignored (pivot_table would drop them anyway).
    if df['Group'].dropna().empty:
        print("No group data found.")
        return

    # 2. Construct pivot table display format: "Ratio% (P-value)"
    def format_cell(row):
        # Markers: ** p<0.01, * p<0.05 (Raw P-value)
        p = row['p_val']
        if p < 0.01:
            mark = "**"
        elif p < 0.05:
            mark = "*"
        else:
            mark = ""

        # Physical direction consistency check.
        # Expected: Conjunction > 100, Opposition < 100
        ratio = row['Ratio']
        is_consistent = (
            (row['Type'] == 'Conjunction' and ratio > 100)
            or (row['Type'] == 'Opposition' and ratio < 100)
        )

        # If significant but direction is reversed (e.g., conjunction actually
        # decreases), mark as anomalous.
        if mark and not is_consistent:
            return f"{ratio:.0f}%(Anomalous{mark})"

        return f"{ratio:.0f}% ({p:.3f}{mark})"

    df_target = df.assign(Result=df.apply(format_cell, axis=1))

    # Create pivot table
    # Rows: Group (Flare Class), Type (Phase Type)
    # Columns: Window (Window Size)
    try:
        # aggfunc='first' keeps only the first row per (Group, Type, Window);
        # duplicated combinations in the CSV are silently collapsed.
        pivot = df_target.pivot_table(
            index=['Group', 'Type'],
            columns='Window',
            values='Result',
            aggfunc='first'
        )

        # Adjust column order
        pivot = pivot[sorted(pivot.columns)]

        # Widen the display only while printing, so the kernel-wide pandas
        # options are restored and later cells are not affected.
        with pd.option_context('display.max_rows', None, 'display.width', 1000):
            print(pivot)

        print("\n[Interpretation Guide]")
        print("  1. Look for cells with * or ** in M-Class or X-Class rows.")
        print("  2. Verify trends: Is Conjunction Ratio > 100%? Is Opposition Ratio < 100%?")
        print("  3. If the effect for large flares (M/X) is stronger than for small flares (C), the physical link is established.")

    except Exception as e:
        # Best-effort report: a failed pivot must not abort the remaining steps.
        print(f"Failed to generate pivot table: {e}")


def process_algo3_fair_fdr(csv_file):
    """
    Apply Benjamini-Hochberg FDR correction to the Algo 3 (single body scan) CSV.

    FDR must be retained for Algo 3 because many bodies are tested at once.
    Rows are split into families by whichever of 'Stage', 'Window' and 'Type'
    exist in the file, so bodies are compared under identical conditions. If
    none of those columns exist the whole file is one family. Within each
    family the 'p_val' column is corrected with
    `multipletests(..., alpha=0.05, method='fdr_bh')`.

    Two columns are (re)written and the CSV is overwritten in place:
      * 'p_adj_bh' - BH-adjusted p-value (NaN where 'p_val' is missing)
      * 'sig_fdr'  - True where the hypothesis is rejected at alpha=0.05

    Rows with a missing 'p_val' are excluded from the correction, so they
    neither inflate the family size nor contaminate the adjusted values.

    NOTE(review): Step 4 writes this CSV in append mode (mode='a'). If Step 4
    is re-run without deleting the file, rows are duplicated and every family
    size doubles; appended rows also lack the two columns added here. Confirm
    the file comes from a single fresh run before trusting the result.

    Parameters
    ----------
    csv_file : str
        Path to the Algo 3 result CSV; must contain a 'p_val' column.

    Returns
    -------
    None
    """
    if not os.path.exists(csv_file):
        print(f"File not found: {csv_file}")
        return

    print(f"\n{'='*80}")
    print(f"[Algo 3] Performing Fair FDR Correction (for 781 bodies)")
    print(f"{'='*80}")

    df = pd.read_csv(csv_file)

    if 'p_val' not in df.columns:
        # Leave the file untouched rather than failing with a KeyError.
        print(f"Column 'p_val' not found in {csv_file}; FDR correction skipped.")
        return

    # Must group by environment (Stage + Window + Type)
    # Note: Flare data may not have a 'Stage' concept (only All); adapt automatically if present
    group_cols = ['Stage', 'Window', 'Type']
    actual_cols = [c for c in group_cols if c in df.columns]

    df['p_adj_bh'] = np.nan
    df['sig_fdr'] = False

    if actual_cols:
        family_indices = [grp.index for _, grp in df.groupby(actual_cols)]
    else:
        # groupby([]) raises; with no environment columns everything is one family.
        family_indices = [df.index]

    total_sig = 0

    for idx in family_indices:
        p_series = df.loc[idx, 'p_val']
        valid_idx = p_series.index[p_series.notna()]
        if len(valid_idx) == 0:
            continue

        # BH (Benjamini-Hochberg) Correction; N = number of valid tests in this family
        reject, p_adj, _, _ = multipletests(
            p_series.loc[valid_idx].values, alpha=0.05, method='fdr_bh'
        )

        df.loc[valid_idx, 'p_adj_bh'] = p_adj
        df.loc[valid_idx, 'sig_fdr'] = reject
        total_sig += int(reject.sum())

    # Save results (overwrites the input file)
    df.to_csv(csv_file, index=False)
    print(f"Correction completed. A total of {total_sig} significant body records passing FDR were found across all scans.")
    if total_sig > 0:
        print("Please open the CSV to view rows where 'sig_fdr' is True, focusing on non-Earth (399) asteroids.")

# Step 5 driver: print the trend reports for Algo 1/2, then FDR-correct Algo 3.
if __name__ == '__main__':
    # 1. Run physical trend analysis for Algo 1 (Most important)
    analyze_physics_trend(FILE_ALGO1, "Algo 1: Total Pairs")
    
    # 2. Run physical trend analysis for Algo 2
    analyze_physics_trend(FILE_ALGO2, "Algo 2: At Least One")
    
    # 3. Run FDR correction for Algo 3 (Needle in a haystack mode)
    #    Note: this call rewrites FILE_ALGO3 in place, adding the
    #    'p_adj_bh' and 'sig_fdr' columns.
    process_algo3_fair_fdr(FILE_ALGO3)



[Algo 1: Total Pairs] Physical Trend Verification Report (No FDR Penalty)
Core Logic: Check ratio trends for different flare classes (C/M/X) within 1-5 degree windows.
Window                           1               2               3               4               5
Group   Type                                                                                      
B-Class Conjunction   112% (0.557)    114% (0.435)    115% (0.366)    117% (0.246)    118% (0.188)
        Opposition     78% (0.063)     83% (0.160)     83% (0.128)     83% (0.120)     84% (0.130)
C-Class Conjunction  124% (0.015*)  127% (0.000**)  124% (0.003**)  123% (0.006**)  123% (0.002**)
        Opposition     90% (0.821)     88% (0.513)     88% (0.473)     88% (0.553)     89% (0.733)
M-Class Conjunction   116% (0.334)    117% (0.249)   122% (0.043*)    120% (0.073)    116% (0.236)
        Opposition    103% (0.883)     97% (0.967)     92% (0.845)     92% (0.833)     92% (0.883)
Total   Conjunction   121% (0.060)  123

## Step 6: Kuiper Test

In [7]:

# =============================================================================
# Step 6: Kuiper Test (Verify Morphology/Bimodal Distribution) - Full Coverage
# =============================================================================
# astropy is an optional dependency: when it cannot be imported, `kuiper` is set
# to None and step5_run_kuiper_test_full() returns early instead of failing.
try:
    from astropy.stats import kuiper
except ImportError:
    print("Warning: astropy not installed. Skipping Kuiper test.")
    kuiper = None

# Output path (ensure consistency with above definitions)
# Relies on `os` and BASE_OUTPUT_DIR from the Step 0 configuration cell.
OUTPUT_FILE_KUIPER = os.path.join(BASE_OUTPUT_DIR, 'sf_algo_kuiper_test.csv')

def step5_run_kuiper_test_full(min_samples=10):
    """
    Run the Kuiper uniformity test on planet-minus-event phase angles (Step 6).

    The function name keeps its historical 'step5' prefix so existing callers
    still work; the printed banner uses the notebook's step number (6).

    For every 'ready_*.parquet' file in CACHE_DIR, the whole file ('Total') and
    each non-empty value of its optional 'Group' column are tested. For each
    column in PLANET_COLS the phase (planet_lon - reference_lon) % 360 is
    computed, scaled to [0, 1) and passed to `kuiper`, which tests against a
    uniform distribution. The reference longitude is 'hme_lon' when present,
    otherwise 'hgc_lon'.

    Results are sorted by p-value, written to OUTPUT_FILE_KUIPER, and the ten
    rows with the smallest p-values are printed.

    NOTE(review): in the recorded run, '399_lon' yields V ~ 0.5 with p ~ 0 for
    every group. That pattern looks like an observer-geometry / selection
    effect rather than a physical signal - confirm before interpreting it.

    Parameters
    ----------
    min_samples : int, optional
        Minimum number of rows in a group, and of non-NaN phases per planet,
        required to run the test. Defaults to 10.

    Returns
    -------
    None
    """
    print("\n" + "=" * 60)
    print("Step 6: Kuiper Test for Distribution Morphology (Full Group Scan)")
    print("=" * 60)

    # astropy unavailable: the import cell already printed a warning.
    if kuiper is None:
        return

    if not os.path.isdir(CACHE_DIR):
        print(f"Cache directory not found: {CACHE_DIR}")
        return

    # Get all ready files in the cache directory (sorted for a reproducible order)
    prefix, suffix = 'ready_', '.parquet'
    files = sorted(
        f for f in os.listdir(CACHE_DIR)
        if f.startswith(prefix) and f.endswith(suffix)
    )

    results_kuiper = []

    for f in files:
        # Strip only the leading prefix / trailing suffix, not inner occurrences.
        stage_name = f[len(prefix):-len(suffix)]
        print(f"Processing Stage: {stage_name} ...")

        # Read data
        df = pd.read_parquet(os.path.join(CACHE_DIR, f))

        # Reference longitude column: prefer 'hme_lon', fall back to 'hgc_lon'.
        # The columns are the same for every group, so decide once per file.
        ref_col = next((c for c in ('hme_lon', 'hgc_lon') if c in df.columns), None)
        if ref_col is None:
            print(f"  [Skip] No reference longitude column ('hme_lon'/'hgc_lon') in {f}.")
            continue

        # Only test planet columns that exist; report the rest instead of crashing.
        planets = [p for p in PLANET_COLS if p in df.columns]
        missing_planets = [p for p in PLANET_COLS if p not in df.columns]
        if missing_planets:
            print(f"  [Warning] Planet columns missing in {f}: {missing_planets}")

        # 1. Determine groups to iterate over; always include 'Total'
        groups_dict = {'Total': df}

        # If Group column exists (e.g., C, M, X Class), split and process
        if 'Group' in df.columns:
            for g in df['Group'].unique():
                # Filter out NaN or empty strings
                if pd.isna(g) or str(g).strip() == '':
                    continue
                groups_dict[str(g)] = df[df['Group'] == g]

        # 2. Iterate over each group for testing
        for group_name, subset_df in groups_dict.items():
            # Skip if sample size is too small (avoid mathematical errors)
            if len(subset_df) < min_samples:
                continue

            for planet in planets:
                # Phase angle in [0, 360): (Planet_Lon - Sun_Event_Lon) % 360
                phases = ((subset_df[planet] - subset_df[ref_col]) % 360.0).dropna().values

                # Re-check the sample size after dropping NaN phases.
                if len(phases) < min_samples:
                    continue

                # Normalize to [0, 1) and test for deviation from uniformity.
                try:
                    V, p_val = kuiper(phases / 360.0)
                except Exception as e:
                    # Keep scanning, but make the failure visible.
                    print(f"  [Warning] Kuiper test failed for {stage_name}/{group_name}/{planet}: {e}")
                    continue

                results_kuiper.append({
                    'Stage': stage_name,
                    'Group': group_name,
                    'Planet': planet,
                    'N_Count': len(phases),
                    'V_statistic': round(V, 4),
                    'p_value': p_val,
                    # Simple significance marking
                    'Sig': '**' if p_val < 0.01 else ('*' if p_val < 0.05 else '')
                })

    # Save results
    if results_kuiper:
        # Sort by significance, most significant results first
        df_k = pd.DataFrame(results_kuiper).sort_values(by='p_value')

        df_k.to_csv(OUTPUT_FILE_KUIPER, index=False)
        print(f"Kuiper Test completed. Results saved to: {OUTPUT_FILE_KUIPER}")
        print("\nTop 10 Most Significant Distribution Anomalies (Focus on M/X Class):")
        print(df_k.head(10))
    else:
        print("No Kuiper test results generated (possibly due to mismatched column names or insufficient sample size).")

# Step 6 driver: run the Kuiper test, then print the final banner.
# Note: the banner is printed unconditionally; step5_run_kuiper_test_full()
# returns None even when it skips the test (e.g. astropy missing), so this
# message does not by itself prove that results were produced.
if __name__ == '__main__':
    step5_run_kuiper_test_full()
    print("\n>>> ALL STEPS COMPLETED SUCCESSFULLY. <<<")



Step 5: Kuiper Test for Distribution Morphology (Full Group Scan)
Processing Stage: Flare_All ...
Kuiper Test completed. Results saved to: ../../results/04_conj_enh_opp_sup/sf/sf_algo_kuiper_test.csv

Top 10 Most Significant Distribution Anomalies (Focus on M/X Class):
        Stage    Group   Planet  N_Count  V_statistic       p_value Sig
2   Flare_All    Total  399_lon    39034       0.4996  0.000000e+00  **
10  Flare_All  C-Class  399_lon    26654       0.5126  0.000000e+00  **
18  Flare_All  M-Class  399_lon     4824       0.4893  0.000000e+00  **
34  Flare_All  B-Class  399_lon     7127       0.4933  0.000000e+00  **
26  Flare_All  X-Class  399_lon      429       0.4917  2.368558e-88  **
12  Flare_All  C-Class  599_lon    26654       0.0301  1.974810e-19  **
3   Flare_All    Total  499_lon    39034       0.0207  4.072441e-13  **
4   Flare_All    Total  599_lon    39034       0.0186  2.048680e-10  **
33  Flare_All  B-Class  299_lon     7127       0.0431  3.170678e-10  **
11  Flare