In [2]:
import pandas as pd
import numpy as np

# 1. Configuration
# Suffixes match the filenames in data/ready/sg_1874_2025_{suffix}.csv
stages_config = [
    {'name': 'Onset SG', 'suffix': 'onset'},
    {'name': 'Diss. SG', 'suffix': 'diss'},
    {'name': 'Dur. SG', 'suffix': 'dur'},
    {'name': 'Daily SG', 'suffix': 'daily'},
    {'name': 'Full DB', 'suffix': 'all'}
]

# Gap (in days) beyond which a repeated group_id is counted as a new group.
# 180 days is far beyond sunspot physical lifetime, enough to distinguish ID reuse.
ID_REUSE_GAP_DAYS = 180


def count_groups(df, gap_days=ID_REUSE_GAP_DAYS):
    """Count groups in ``df``, treating a reused ``group_id`` as a separate group.

    A record opens a new group when it is the first record of its
    ``group_id`` (after sorting by date) or when it follows the previous
    record of the same ``group_id`` by more than ``gap_days`` days.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``group_id`` column and a datetime ``date`` column.
        The frame is sorted into a new object; the input is not reordered.
    gap_days : int or float, default ID_REUSE_GAP_DAYS
        Threshold, in days, separating "same group" from "ID reused".

    Returns
    -------
    int
        Number of group starts found; 0 for an empty frame.
    """
    if df.empty:
        return 0

    # Sorting by ID then date makes each row's predecessor the previous
    # observation of the same ID, which is what diff() relies on.
    ordered = df.sort_values(['group_id', 'date'])

    # NaT for the first record of an ID; otherwise the time delta.
    time_diff = ordered.groupby('group_id')['date'].diff()

    is_new_start = time_diff.isna() | (time_diff > pd.Timedelta(days=gap_days))
    return int(is_new_start.sum())


stats_list = []
full_db_groups = 0
daily_df = None  # Store daily data for footnote calculation

print("Processing data from ready directory...")

for config in stages_config:
    file_path = f'../../data/ready/sg_1874_2025_{config["suffix"]}.csv'

    try:
        # Read the final CSV files (headers: date, group_id, area, ...)
        df = pd.read_csv(file_path)

        if df.empty:
            n_groups, n_records, a_max, mean_area = 0, 0, 0, 0
        else:
            # Ensure date column is in datetime format
            df['date'] = pd.to_datetime(df['date'])

            # Backup daily data for footnote
            if config['suffix'] == 'daily':
                daily_df = df.copy()

            # Real group count (handles ID reuse), plus basic area statistics
            n_groups = count_groups(df)
            n_records = len(df)
            a_max = df['area'].max()
            mean_area = df['area'].mean()

        # Record total groups for proportion calculation
        if config['suffix'] == 'all':
            full_db_groups = n_groups

        stats_list.append({
            'Category': config['name'],
            'N Groups': n_groups,
            'N Records': n_records,
            'A_max (MH)': a_max,
            '< A > (MH)': mean_area
        })
        print(f"Processed: {config['name']} (Groups: {n_groups})")

    except FileNotFoundError:
        print(f"Error: File not found {file_path}")
    except Exception as e:
        # Best-effort: report the failure and continue with the remaining stages.
        print(f"Error processing {config['name']}: {e}")

# 2. Generate Main Table
results_df = pd.DataFrame(stats_list)

# Calculate Proportion (%)
# Each category's group count as a percentage of the Full DB group count;
# when no positive Full DB count is available the column is filled with 0.0.
results_df['Prop. (%)'] = (
    results_df['N Groups'].div(full_db_groups).mul(100)
    if full_db_groups > 0
    else 0.0
)

# 3. Format Output (Matching LaTeX table style)
def format_row(row):
    """Convert one row of the statistics table into display strings.

    Reads the keys 'Category', 'N Groups', 'Prop. (%)', 'N Records',
    'A_max (MH)' and '< A > (MH)' from ``row`` and returns a new
    ``pd.Series`` with those same keys, in that order:

    - 'Category'   : passed through unchanged
    - 'N Groups'   : integer with thousands separator
    - 'Prop. (%)'  : two decimal places
    - 'N Records'  : plain integer
    - 'A_max (MH)' : zero decimal places
    - '< A > (MH)' : two decimal places
    """
    rendered = {
        'Category': row['Category'],
        'N Groups': format(int(row['N Groups']), ','),
        'Prop. (%)': format(row['Prop. (%)'], '.2f'),
        'N Records': str(int(row['N Records'])),
        'A_max (MH)': format(row['A_max (MH)'], '.0f'),
        '< A > (MH)': format(row['< A > (MH)'], '.2f'),
    }
    return pd.Series(rendered)

# Apply the per-row string formatter, then print the table framed by rules.
formatted_df = results_df.apply(format_row, axis=1)

heavy_rule = "=" * 80
print(f"\n{heavy_rule}")
print("Table: Statistics of SG Lifecycles (1874–2025)")
print(heavy_rule)
print(formatted_df.to_string(index=False, justify='right'))
print("-" * 80)

# 4. Footnote Calculation
print("\n[Table Notes Calculation]")
print("Note 1: MH: Millionths of a Solar Hemisphere.")

if daily_df is not None and not daily_df.empty:
    # Distinct group_ids seen in each calendar year of the Daily SG data.
    daily_df['Year'] = daily_df['date'].dt.year
    yearly_counts = daily_df.groupby('Year')['group_id'].nunique()

    # groupby only yields years that contain at least one record, so a year
    # with no Daily SG would drop out of the means below and inflate them.
    # Re-insert every year between the first and last observed year as 0.
    # NOTE(review): years outside the observed span are still not represented;
    # confirm whether the averages should cover the full database period.
    if not yearly_counts.empty:
        first_year = int(yearly_counts.index.min())
        last_year = int(yearly_counts.index.max())
        yearly_counts = yearly_counts.reindex(
            pd.RangeIndex(first_year, last_year + 1, name='Year'),
            fill_value=0,
        )

    cutoff_year = 1982
    before_1982 = yearly_counts[yearly_counts.index < cutoff_year]
    after_1982 = yearly_counts[yearly_counts.index >= cutoff_year]

    # An empty side of the split reports 0 rather than NaN.
    avg_before = before_1982.mean() if not before_1982.empty else 0
    avg_after = after_1982.mean() if not after_1982.empty else 0

    print(f"Note 2: The average annual count of Daily SGs was {avg_before:.2f} before {cutoff_year},")
    print(f"        dropping to {avg_after:.2f} thereafter.")
else:
    print("\n[Note] Cannot calculate Note 2: Daily data not loaded or empty.")

Processing data from ready directory...
Processed: Onset SG (Groups: 33166)
Processed: Diss. SG (Groups: 27755)
Processed: Dur. SG (Groups: 22976)
Processed: Daily SG (Groups: 8300)
Processed: Full DB (Groups: 42941)

Table: Statistics of SG Lifecycles (1874–2025)
Category N Groups Prop. (%) N Records A_max (MH) < A > (MH)
Onset SG   33,166     77.24     33278       1855      30.65
Diss. SG   27,755     64.64     27870        919      15.04
 Dur. SG   22,976     53.51     75678       1650      34.75
Daily SG    8,300     19.33      8301        919      11.08
 Full DB   42,941    100.00    256859       6132     157.92
--------------------------------------------------------------------------------

[Table Notes Calculation]
Note 1: MH: Millionths of a Solar Hemisphere.
Note 2: The average annual count of Daily SGs was 73.13 before 1982,
        dropping to 11.31 thereafter.


In [4]:
# Check for group_id duplicates within 180 days in Onset, Diss, and Daily databases
import pandas as pd
import os

files_to_check = [
    ('Onset', '../../data/ready/sg_1874_2025_onset.csv'),
    ('Diss.', '../../data/ready/sg_1874_2025_diss.csv'),
    ('Daily', '../../data/ready/sg_1874_2025_daily.csv')
]

# Largest spacing (in days) between two records of one group_id for the later
# record to be flagged as a near-duplicate; wider gaps count as normal ID reuse.
NEAR_DUP_MAX_GAP_DAYS = 180


def find_near_duplicates(df, max_gap_days=NEAR_DUP_MAX_GAP_DAYS):
    """Flag records that repeat a ``group_id`` within ``max_gap_days`` days.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``group_id`` column and a datetime ``date`` column.
    max_gap_days : int or float, default NEAR_DUP_MAX_GAP_DAYS
        A record is flagged when the previous record with the same
        ``group_id`` lies at most this many days earlier.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        ``df`` sorted by ``group_id`` then ``date``, and a boolean mask
        aligned with that sorted frame. The first record of every
        ``group_id`` is never flagged.
    """
    # Sort so that each row's predecessor is the previous record of the same ID.
    ordered = df.sort_values(['group_id', 'date'])

    # NaT for the first record of an ID; otherwise the delta to the previous one.
    time_diff = ordered.groupby('group_id')['date'].diff()

    is_near_dup = time_diff.notna() & (time_diff <= pd.Timedelta(days=max_gap_days))
    return ordered, is_near_dup


print("Checking for group_id duplicates within 180 days (potential errors)...")

for name, path in files_to_check:
    if os.path.exists(path):
        df = pd.read_csv(path)
        df['date'] = pd.to_datetime(df['date'])

        # Identify duplicates within 180 days (ID reuse > 180 days is considered normal)
        df, is_near_dup = find_near_duplicates(df)

        total_records = len(df)
        dup_records = is_near_dup.sum()
        dup_ratio = (dup_records / total_records) * 100 if total_records > 0 else 0

        # Get all records involved in these near-duplicates for display
        dup_ids = df.loc[is_near_dup, 'group_id'].unique()
        near_duplicates = df[df['group_id'].isin(dup_ids)].sort_values(['group_id', 'date'])

        print(f"\n[{name}] Total Records: {total_records}, Duplicates: {dup_records} ({dup_ratio:.4f}%)")

        if not near_duplicates.empty:
            print(f"Sample of {len(dup_ids)} group_ids with near-duplicates:")
            # Print index and first 5 columns
            print(near_duplicates.iloc[:10, :5]) # Show first 10 rows of such cases
        else:
            print("No group_id duplicates found within 180 days.")
    else:
        print(f"\n[{name}] File not found: {path}")

print("\nNote: These near-duplicates are very rare and will be ignored in the statistical analysis.")


Checking for group_id duplicates within 180 days (potential errors)...

[Onset] Total Records: 33279, Duplicates: 113 (0.3396%)
Sample of 107 group_ids with near-duplicates:
            date  group_id  area  dist_c     pa
24293 1982-01-09      3548     0   0.536  137.4
24294 1982-01-09      3548    10   0.413  132.2
24303 1982-01-17      3554    10   0.212  213.8
24304 1982-01-17      3554     0   0.372  194.6
24487 1982-08-07      3847    10   0.471  125.2
24488 1982-08-07      3847     0   0.384  122.6
24538 1982-09-24      3920     0   0.293  159.4
24539 1982-09-24      3920    10   0.349  223.3
24628 1983-01-02      4044    30   0.209   19.3
24629 1983-01-02      4044    10   0.278  342.1

[Diss.] Total Records: 27870, Duplicates: 115 (0.4126%)
Sample of 106 group_ids with near-duplicates:
            date  group_id  area  dist_c     pa
22825 1982-03-13      3635    10   0.721  274.1
22826 1982-03-13      3635     0   0.738  265.7
22847 1982-04-16      3690    10   0.692  249.5
228